2 **********************************************************************
3 * Copyright (C) 2001-2008 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
26 // don't use Boyer-Moore
29 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
31 // internal definition ---------------------------------------------------
33 #define LAST_BYTE_MASK_ 0xFF
34 #define SECOND_LAST_BYTE_SHIFT_ 8
35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
37 static const uint16_t *FCD_
= NULL
;
39 // internal methods -------------------------------------------------
42 * Fast collation element iterator setOffset.
43 * This function does not check for bounds.
44 * @param coleiter collation element iterator
45 * @param offset to set
48 inline void setColEIterOffset(UCollationElements
*elems
,
51 collIterate
*ci
= &(elems
->iteratordata_
);
52 ci
->pos
= ci
->string
+ offset
;
53 ci
->CEpos
= ci
->toReturn
= ci
->extendCEs
? ci
->extendCEs
: ci
->CEs
;
54 if (ci
->flags
& UCOL_ITER_INNORMBUF
) {
55 ci
->flags
= ci
->origFlags
;
57 ci
->fcdPosition
= NULL
;
59 ci
->offsetReturn
= NULL
;
60 ci
->offsetStore
= ci
->offsetBuffer
;
61 ci
->offsetRepeatCount
= ci
->offsetRepeatValue
= 0;
65 * Getting the mask for collation strength
66 * @param strength collation strength
67 * @return collation element mask
70 inline uint32_t getMask(UCollationStrength strength
)
75 return UCOL_PRIMARYORDERMASK
;
77 return UCOL_SECONDARYORDERMASK
| UCOL_PRIMARYORDERMASK
;
79 return UCOL_TERTIARYORDERMASK
| UCOL_SECONDARYORDERMASK
|
80 UCOL_PRIMARYORDERMASK
;
85 * This is to squeeze the 21bit ces into a 256 table
86 * @param ce collation element
87 * @return collapsed version of the collation element
90 inline int hash(uint32_t ce
)
92 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
93 // well with the new collation where most of the latin 1 characters
94 // are of the value xx000xxx. their hashes will most of the time be 0
95 // to be discussed on the hash algo.
96 return UCOL_PRIMARYORDER(ce
) % MAX_TABLE_SIZE_
;
100 static UBool U_CALLCONV
101 usearch_cleanup(void) {
108 * Initializing the fcd tables.
109 * Internal method, status assumed to be a success.
110 * @param status output error if any, caller to check status before calling
111 * method, status assumed to be success when passed in.
114 inline void initializeFCD(UErrorCode
*status
)
117 FCD_
= unorm_getFCDTrie(status
);
118 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH
, usearch_cleanup
);
123 * Gets the fcd value for a character at the argument index.
124 * This method takes supplementary characters into account.
125 * @param str UTF16 string where character for fcd retrieval resides
126 * @param offset position of the character whose fcd is to be retrieved, to be
127 * overwritten with the next character position, taking
128 * surrogate characters into consideration.
129 * @param strlength length of the argument string
133 uint16_t getFCD(const UChar
*str
, int32_t *offset
,
136 int32_t temp
= *offset
;
138 UChar ch
= str
[temp
];
139 result
= unorm_getFCD16(FCD_
, ch
);
142 if (result
&& temp
!= strlength
&& UTF_IS_FIRST_SURROGATE(ch
)) {
144 if (UTF_IS_SECOND_SURROGATE(ch
)) {
145 result
= unorm_getFCD16FromSurrogatePair(FCD_
, result
, ch
);
156 * Getting the modified collation elements taking into account the collation
158 * @param strsrch string search data
160 * @return the modified collation element
163 inline int32_t getCE(const UStringSearch
*strsrch
, uint32_t sourcece
)
165 // note for tertiary we can't use the collator->tertiaryMask, that
166 // is a preprocessed mask that takes into account case options. since
167 // we are only concerned with exact matches, we don't need that.
168 sourcece
&= strsrch
->ceMask
;
170 if (strsrch
->toShift
) {
171 // alternate handling here: since only the 16 most significant digits
172 // are used, we can safely do a compare without masking
173 // if the ce is a variable, we mask and get only the primary values
174 // no shifting to quaternary is required since all primary values
175 // less than variabletop will need to be masked off anyway.
176 if (strsrch
->variableTop
> sourcece
) {
177 if (strsrch
->strength
== UCOL_QUATERNARY
) {
178 sourcece
&= UCOL_PRIMARYORDERMASK
;
181 sourcece
= UCOL_IGNORABLE
;
190 * Allocates memory and returns NULL if the allocation failed.
191 * Internal method, status assumed to be a success.
192 * @param size to allocate
193 * @param status output error if any, caller to check status before calling
194 * method, status assumed to be success when passed in.
195 * @return newly allocated array, NULL otherwise
198 inline void * allocateMemory(uint32_t size
, UErrorCode
*status
)
200 uint32_t *result
= (uint32_t *)uprv_malloc(size
);
201 if (result
== NULL
) {
202 *status
= U_MEMORY_ALLOCATION_ERROR
;
208 * Adds a uint32_t value to a destination array.
209 * Creates a new array if we run out of space. The caller will have to
210 * manually deallocate the newly allocated array.
211 * Internal method, status assumed to be success, caller has to check status
212 * before calling this method. destination not to be NULL and has at least
213 * size destinationlength.
214 * @param destination target array
215 * @param offset destination offset to add value
216 * @param destinationlength target array size, return value for the new size
217 * @param value to be added
218 * @param increments incremental size expected
219 * @param status output error if any, caller to check status before calling
220 * method, status assumed to be success when passed in.
221 * @return new destination array, destination if there was no new allocation
224 inline int32_t * addTouint32_tArray(int32_t *destination
,
226 uint32_t *destinationlength
,
231 uint32_t newlength
= *destinationlength
;
232 if (offset
+ 1 == newlength
) {
233 newlength
+= increments
;
234 int32_t *temp
= (int32_t *)allocateMemory(
235 sizeof(int32_t) * newlength
, status
);
236 if (U_FAILURE(*status
)) {
239 uprv_memcpy(temp
, destination
, sizeof(int32_t) * offset
);
240 *destinationlength
= newlength
;
243 destination
[offset
] = value
;
248 * Adds a uint64_t value to a destination array.
249 * Creates a new array if we run out of space. The caller will have to
250 * manually deallocate the newly allocated array.
251 * Internal method, status assumed to be success, caller has to check status
252 * before calling this method. destination not to be NULL and has at least
253 * size destinationlength.
254 * @param destination target array
255 * @param offset destination offset to add value
256 * @param destinationlength target array size, return value for the new size
257 * @param value to be added
258 * @param increments incremental size expected
259 * @param status output error if any, caller to check status before calling
260 * method, status assumed to be success when passed in.
261 * @return new destination array, destination if there was no new allocation
264 inline int64_t * addTouint64_tArray(int64_t *destination
,
266 uint32_t *destinationlength
,
271 uint32_t newlength
= *destinationlength
;
272 if (offset
+ 1 == newlength
) {
273 newlength
+= increments
;
274 int64_t *temp
= (int64_t *)allocateMemory(
275 sizeof(int64_t) * newlength
, status
);
277 if (U_FAILURE(*status
)) {
281 uprv_memcpy(temp
, destination
, sizeof(int64_t) * offset
);
282 *destinationlength
= newlength
;
286 destination
[offset
] = value
;
292 * Initializing the ce table for a pattern.
293 * Stores non-ignorable collation keys.
294 * Table size will be estimated by the size of the pattern text. Table
295 * expansion will be performed as we go along. Adding 1 to ensure that the table
296 * size definitely increases.
297 * Internal method, status assumed to be a success.
298 * @param strsrch string search data
299 * @param status output error if any, caller to check status before calling
300 * method, status assumed to be success when passed in.
301 * @return total number of expansions
304 inline uint16_t initializePatternCETable(UStringSearch
*strsrch
,
307 UPattern
*pattern
= &(strsrch
->pattern
);
308 uint32_t cetablesize
= INITIAL_ARRAY_SIZE_
;
309 int32_t *cetable
= pattern
->CEBuffer
;
310 uint32_t patternlength
= pattern
->textLength
;
311 UCollationElements
*coleiter
= strsrch
->utilIter
;
313 if (coleiter
== NULL
) {
314 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
315 patternlength
, status
);
316 // status will be checked in ucol_next(..) later and if it is an
317 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
319 strsrch
->utilIter
= coleiter
;
322 uprv_init_collIterate(strsrch
->collator
, pattern
->text
,
324 &coleiter
->iteratordata_
);
327 if (pattern
->CE
!= cetable
&& pattern
->CE
) {
328 uprv_free(pattern
->CE
);
335 while ((ce
= ucol_next(coleiter
, status
)) != UCOL_NULLORDER
&&
336 U_SUCCESS(*status
)) {
337 uint32_t newce
= getCE(strsrch
, ce
);
339 int32_t *temp
= addTouint32_tArray(cetable
, offset
, &cetablesize
,
341 patternlength
- ucol_getOffset(coleiter
) + 1,
343 if (U_FAILURE(*status
)) {
347 if (cetable
!= temp
&& cetable
!= pattern
->CEBuffer
) {
352 result
+= (uint16_t)(ucol_getMaxExpansion(coleiter
, ce
) - 1);
356 pattern
->CE
= cetable
;
357 pattern
->CELength
= offset
;
363 * Initializing the pce table for a pattern.
364 * Stores non-ignorable collation keys.
365 * Table size will be estimated by the size of the pattern text. Table
366 * expansion will be performed as we go along. Adding 1 to ensure that the table
367 * size definitely increases.
368 * Internal method, status assumed to be a success.
369 * @param strsrch string search data
370 * @param status output error if any, caller to check status before calling
371 * method, status assumed to be success when passed in.
372 * @return total number of expansions
375 inline uint16_t initializePatternPCETable(UStringSearch
*strsrch
,
378 UPattern
*pattern
= &(strsrch
->pattern
);
379 uint32_t pcetablesize
= INITIAL_ARRAY_SIZE_
;
380 int64_t *pcetable
= pattern
->PCEBuffer
;
381 uint32_t patternlength
= pattern
->textLength
;
382 UCollationElements
*coleiter
= strsrch
->utilIter
;
384 if (coleiter
== NULL
) {
385 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
386 patternlength
, status
);
387 // status will be checked in ucol_next(..) later and if it is an
388 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
390 strsrch
->utilIter
= coleiter
;
392 uprv_init_collIterate(strsrch
->collator
, pattern
->text
,
394 &coleiter
->iteratordata_
);
397 if (pattern
->PCE
!= pcetable
&& pattern
->PCE
!= NULL
) {
398 uprv_free(pattern
->PCE
);
405 uprv_init_pce(coleiter
);
407 // ** Should processed CEs be signed or unsigned?
408 // ** (the rest of the code in this file seems to play fast-and-loose with
409 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
410 while ((pce
= ucol_nextProcessed(coleiter
, NULL
, NULL
, status
)) != UCOL_PROCESSED_NULLORDER
&&
411 U_SUCCESS(*status
)) {
412 int64_t *temp
= addTouint64_tArray(pcetable
, offset
, &pcetablesize
,
414 patternlength
- ucol_getOffset(coleiter
) + 1,
417 if (U_FAILURE(*status
)) {
423 if (pcetable
!= temp
&& pcetable
!= pattern
->PCEBuffer
) {
428 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
431 pcetable
[offset
] = 0;
432 pattern
->PCE
= pcetable
;
433 pattern
->PCELength
= offset
;
439 * Initializes the pattern struct.
440 * Internal method, status assumed to be success.
441 * @param strsrch UStringSearch data storage
442 * @param status output error if any, caller to check status before calling
443 * method, status assumed to be success when passed in.
444 * @return expansionsize the total expansion size of the pattern
447 inline int16_t initializePattern(UStringSearch
*strsrch
, UErrorCode
*status
)
449 UPattern
*pattern
= &(strsrch
->pattern
);
450 const UChar
*patterntext
= pattern
->text
;
451 int32_t length
= pattern
->textLength
;
454 // Since the strength is primary, accents are ignored in the pattern.
455 if (strsrch
->strength
== UCOL_PRIMARY
) {
456 pattern
->hasPrefixAccents
= 0;
457 pattern
->hasSuffixAccents
= 0;
459 pattern
->hasPrefixAccents
= getFCD(patterntext
, &index
, length
) >>
460 SECOND_LAST_BYTE_SHIFT_
;
462 UTF_BACK_1(patterntext
, 0, index
);
463 pattern
->hasSuffixAccents
= getFCD(patterntext
, &index
, length
) &
468 if (strsrch
->pattern
.PCE
!= NULL
) {
469 if (strsrch
->pattern
.PCE
!= strsrch
->pattern
.PCEBuffer
) {
470 uprv_free(strsrch
->pattern
.PCE
);
473 strsrch
->pattern
.PCE
= NULL
;
476 // since initializePattern is an internal method, status is a success.
477 return initializePatternCETable(strsrch
, status
);
481 * Initializing shift tables, with the default values.
482 * If a corresponding default value is 0, the shift table is not set.
483 * @param shift table for forwards shift
484 * @param backshift table for backwards shift
485 * @param cetable table containing pattern ce
486 * @param cesize size of the pattern ces
487 * @param expansionsize total size of the expansions
488 * @param defaultforward the default forward value
489 * @param defaultbackward the default backward value
492 inline void setShiftTable(int16_t shift
[], int16_t backshift
[],
493 int32_t *cetable
, int32_t cesize
,
494 int16_t expansionsize
,
495 int16_t defaultforward
,
496 int16_t defaultbackward
)
498 // estimate the value to shift. to do that we estimate the smallest
499 // number of characters to give the relevant ces, ie approximately
500 // the number of ces minus their expansion, since expansions can come
503 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
504 shift
[count
] = defaultforward
;
506 cesize
--; // down to the last index
507 for (count
= 0; count
< cesize
; count
++) {
508 // number of ces from right of array to the count
509 int temp
= defaultforward
- count
- 1;
510 shift
[hash(cetable
[count
])] = temp
> 1 ? temp
: 1;
512 shift
[hash(cetable
[cesize
])] = 1;
513 // for ignorables we just shift by one. see test examples.
516 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
517 backshift
[count
] = defaultbackward
;
519 for (count
= cesize
; count
> 0; count
--) {
520 // the original value count does not seem to work
521 backshift
[hash(cetable
[count
])] = count
> expansionsize
?
522 (int16_t)(count
- expansionsize
) : 1;
524 backshift
[hash(cetable
[0])] = 1;
525 backshift
[hash(0)] = 1;
529 * Building of the pattern collation element list and the boyer moore strsrch
531 * The canonical match will only be performed after the default match fails.
532 * For both cases we need to remember the size of the composed and decomposed
533 * versions of the string. Since the Boyer-Moore shift calculations shifts by
534 * a number of characters in the text and tries to match the pattern from that
535 * offset, the shift value can not be too large in case we miss some
536 * characters. To choose the right shift size, we estimate the NFC form of the
537 * pattern and use its size as a shift guide. The NFC form should be the smallest
538 * possible representation of the pattern. Anyway, we'll err on the smaller
539 * shift size. Hence the calculation for minlength.
540 * Canonical match will be performed slightly differently. We'll split the
541 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
542 * the first and last base character (MS), the ending accents (EA). Matches
543 * will be done on MS first, and only when we match MS then some processing
544 * will be required for the prefix and end accents in order to determine if
545 * they match PA and EA. Hence the default shift values
546 * for the canonical match will take the size of either end's accent into
547 * consideration. Forwards search will take the end accents into consideration
548 * for the default shift values and the backwards search will take the prefix
549 * accents into consideration.
550 * If pattern has no non-ignorable ce, we return an illegal argument error.
551 * Internal method, status assumed to be success.
552 * @param strsrch UStringSearch data storage
553 * @param status for output errors if it occurs, status is assumed to be a
554 * success when it is passed in.
557 inline void initialize(UStringSearch
*strsrch
, UErrorCode
*status
)
559 int16_t expandlength
= initializePattern(strsrch
, status
);
560 if (U_SUCCESS(*status
) && strsrch
->pattern
.CELength
> 0) {
561 UPattern
*pattern
= &strsrch
->pattern
;
562 int32_t cesize
= pattern
->CELength
;
564 int16_t minlength
= cesize
> expandlength
565 ? (int16_t)cesize
- expandlength
: 1;
566 pattern
->defaultShiftSize
= minlength
;
567 setShiftTable(pattern
->shift
, pattern
->backShift
, pattern
->CE
,
568 cesize
, expandlength
, minlength
, minlength
);
571 strsrch
->pattern
.defaultShiftSize
= 0;
576 * Check to make sure that the match length is at the end of the character by
577 * using the breakiterator.
578 * @param strsrch string search data
579 * @param start target text start offset
580 * @param end target text end offset
583 void checkBreakBoundary(const UStringSearch
*strsrch
, int32_t * /*start*/,
586 #if !UCONFIG_NO_BREAK_ITERATION
587 UBreakIterator
*breakiterator
= strsrch
->search
->internalBreakIter
;
589 int32_t matchend
= *end
;
590 //int32_t matchstart = *start;
592 if (!ubrk_isBoundary(breakiterator
, matchend
)) {
593 *end
= ubrk_following(breakiterator
, matchend
);
596 /* Check the start of the matched text to make sure it doesn't have any accents
597 * before it. This code may not be necessary and so it is commented out */
598 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
599 *start = ubrk_preceding(breakiterator, matchstart);
606 * Determine whether the target text in UStringSearch bounded by the offset
607 * start and end is one or more whole units of text as
608 * determined by the breakiterator in UStringSearch.
609 * @param strsrch string search data
610 * @param start target text start offset
611 * @param end target text end offset
614 UBool
isBreakUnit(const UStringSearch
*strsrch
, int32_t start
,
617 #if !UCONFIG_NO_BREAK_ITERATION
618 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
621 int32_t startindex
= ubrk_first(breakiterator
);
622 int32_t endindex
= ubrk_last(breakiterator
);
624 // out-of-range indexes are never boundary positions
625 if (start
< startindex
|| start
> endindex
||
626 end
< startindex
|| end
> endindex
) {
629 // otherwise, we can use following() on the position before the
630 // specified one and return true if the position we get back is the
631 // one the user specified
632 UBool result
= (start
== startindex
||
633 ubrk_following(breakiterator
, start
- 1) == start
) &&
635 ubrk_following(breakiterator
, end
- 1) == end
);
637 // iterates the individual ces
638 UCollationElements
*coleiter
= strsrch
->utilIter
;
639 const UChar
*text
= strsrch
->search
->text
+
641 UErrorCode status
= U_ZERO_ERROR
;
642 ucol_setText(coleiter
, text
, end
- start
, &status
);
643 for (int32_t count
= 0; count
< strsrch
->pattern
.CELength
;
645 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
646 if (ce
== UCOL_IGNORABLE
) {
650 if (U_FAILURE(status
) || ce
!= strsrch
->pattern
.CE
[count
]) {
654 int32_t nextce
= ucol_next(coleiter
, &status
);
655 while (ucol_getOffset(coleiter
) == (end
- start
)
656 && getCE(strsrch
, nextce
) == UCOL_IGNORABLE
) {
657 nextce
= ucol_next(coleiter
, &status
);
659 if (ucol_getOffset(coleiter
) == (end
- start
)
660 && nextce
!= UCOL_NULLORDER
) {
661 // extra collation elements at the end of the match
672 * Getting the next base character offset if current offset is an accent,
673 * or the current offset if the current character contains a base character.
674 * For accents, the offset of the following base character will be returned.
676 * @param textoffset current offset
677 * @param textlength length of text string
678 * @return the next base character or the current offset
679 * if the current character contains a base character.
682 inline int32_t getNextBaseOffset(const UChar
*text
,
686 if (textoffset
< textlength
) {
687 int32_t temp
= textoffset
;
688 if (getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
689 while (temp
< textlength
) {
690 int32_t result
= temp
;
691 if ((getFCD(text
, &temp
, textlength
) >>
692 SECOND_LAST_BYTE_SHIFT_
) == 0) {
703 * Gets the next base character offset depending on the string search pattern
705 * @param strsrch string search data
706 * @param textoffset current offset, one offset away from the last character
708 * @return start index of the next base character or the current offset
709 * if the current character contains a base character.
712 inline int32_t getNextUStringSearchBaseOffset(UStringSearch
*strsrch
,
715 int32_t textlength
= strsrch
->search
->textLength
;
716 if (strsrch
->pattern
.hasSuffixAccents
&&
717 textoffset
< textlength
) {
718 int32_t temp
= textoffset
;
719 const UChar
*text
= strsrch
->search
->text
;
720 UTF_BACK_1(text
, 0, temp
);
721 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
722 return getNextBaseOffset(text
, textoffset
, textlength
);
729 * Shifting the collation element iterator position forward to prepare for
730 * a following match. If the last character is an unsafe character, we'll only
731 * shift by 1 to capture contractions, normalization etc.
732 * Internal method, status assumed to be success.
733 * @param text strsrch string search data
734 * @param textoffset start text position to do search
735 * @param ce the text ce which failed the match.
736 * @param patternceindex index of the ce within the pattern ce buffer which
738 * @return final offset
741 inline int32_t shiftForward(UStringSearch
*strsrch
,
744 int32_t patternceindex
)
746 UPattern
*pattern
= &(strsrch
->pattern
);
747 if (ce
!= UCOL_NULLORDER
) {
748 int32_t shift
= pattern
->shift
[hash(ce
)];
749 // this is to adjust for characters in the middle of the
750 // substring for matching that failed.
751 int32_t adjust
= pattern
->CELength
- patternceindex
;
752 if (adjust
> 1 && shift
>= adjust
) {
758 textoffset
+= pattern
->defaultShiftSize
;
761 textoffset
= getNextUStringSearchBaseOffset(strsrch
, textoffset
);
762 // check for unsafe characters
763 // * if it is the start or middle of a contraction: to be done after
764 // a initial match is found
765 // * thai or lao base consonant character: similar to contraction
766 // * high surrogate character: similar to contraction
767 // * next character is an accent: shift to the next base character
770 #endif // #if BOYER_MOORE
773 * sets match not found
774 * @param strsrch string search data
777 inline void setMatchNotFound(UStringSearch
*strsrch
)
779 // this method resets the match result regardless of the error status.
780 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
781 strsrch
->search
->matchedLength
= 0;
782 if (strsrch
->search
->isForwardSearching
) {
783 setColEIterOffset(strsrch
->textIter
, strsrch
->search
->textLength
);
786 setColEIterOffset(strsrch
->textIter
, 0);
792 * Gets the offset to the next safe point in text.
793 * ie. not the middle of a contraction, swappable characters or supplementary
795 * @param collator collation data
796 * @param text string to work with
797 * @param textoffset offset in string
798 * @param textlength length of text string
799 * @return offset to the next safe character
802 inline int32_t getNextSafeOffset(const UCollator
*collator
,
807 int32_t result
= textoffset
; // first contraction character
808 while (result
!= textlength
&& ucol_unsafeCP(text
[result
], collator
)) {
815 * This checks for accents in the potential match started with a
816 * composite character.
817 * This is really painful... we have to check that composite characters do not
818 * have any extra accents. We have to normalize the potential match and find
819 * the immediate decomposed character before the match.
820 * The first composite character would have been taken care of by the fcd
821 * checks in checkForwardExactMatch.
822 * This is the slow path after the fcd of the first character and
823 * the last character has been checked by checkForwardExactMatch and we
824 * determine that the potential match has extra non-ignorable preceding
826 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
827 * checkExtraMatchAccents should fail since there is a middle ring in \u01FA
828 * Note here that accents checking are slow and cautioned in the API docs.
829 * Internal method, status assumed to be a success, caller should check status
830 * before calling this method
831 * @param strsrch string search data
832 * @param start index of the potential unfriendly composite character
833 * @param end index of the potential unfriendly composite character
834 * @param status output error status if any.
835 * @return TRUE if there are non-ignorable accents at the beginning
836 * of the match, FALSE otherwise.
840 UBool
checkExtraMatchAccents(const UStringSearch
*strsrch
, int32_t start
,
844 UBool result
= FALSE
;
845 if (strsrch
->pattern
.hasPrefixAccents
) {
846 int32_t length
= end
- start
;
848 const UChar
*text
= strsrch
->search
->text
+ start
;
850 UTF_FWD_1(text
, offset
, length
);
851 // we are only concerned with the first composite character
852 if (unorm_quickCheck(text
, offset
, UNORM_NFD
, status
) == UNORM_NO
) {
853 int32_t safeoffset
= getNextSafeOffset(strsrch
->collator
,
855 if (safeoffset
!= length
) {
859 UChar buffer
[INITIAL_ARRAY_SIZE_
];
860 int32_t size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0,
861 buffer
, INITIAL_ARRAY_SIZE_
,
863 if (U_FAILURE(*status
)) {
866 if (size
>= INITIAL_ARRAY_SIZE_
) {
867 norm
= (UChar
*)allocateMemory((size
+ 1) * sizeof(UChar
),
869 // if allocation failed, status will be set to
870 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
872 size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0, norm
,
874 if (U_FAILURE(*status
) && norm
!= NULL
) {
883 UCollationElements
*coleiter
= strsrch
->utilIter
;
884 ucol_setText(coleiter
, norm
, size
, status
);
885 uint32_t firstce
= strsrch
->pattern
.CE
[0];
886 UBool ignorable
= TRUE
;
887 uint32_t ce
= UCOL_IGNORABLE
;
888 while (U_SUCCESS(*status
) && ce
!= firstce
&& ce
!= (uint32_t)UCOL_NULLORDER
) {
889 offset
= ucol_getOffset(coleiter
);
890 if (ce
!= firstce
&& ce
!= UCOL_IGNORABLE
) {
893 ce
= ucol_next(coleiter
, status
);
896 UTF_PREV_CHAR(norm
, 0, offset
, codepoint
);
897 result
= !ignorable
&& (u_getCombiningClass(codepoint
) != 0);
899 if (norm
!= buffer
) {
909 * Used by exact matches, checks if there are accents before the match.
910 * This is really painful... we have to check that composite characters at
911 * the start of the matches do not have any extra accents.
912 * We check the FCD of the character first, if it starts with an accent and
913 * the first pattern ce does not match the first ce of the character, we bail.
914 * Otherwise we try normalizing the first composite
915 * character and find the immediate decomposed character before the match to
916 * see if it is a non-ignorable accent.
917 * Now normalizing the first composite character is enough because we ensure
918 * that when the match is passed in here with extra beginning ces, the
919 * first or last ce that match has to occur within the first character.
920 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
921 * checkExtraMatchAccents should fail since there is a middle ring in \u01FA
922 * Note here that accents checking are slow and cautioned in the API docs.
923 * @param strsrch string search data
924 * @param start offset
926 * @return TRUE if there are accents on either side of the match,
930 UBool
hasAccentsBeforeMatch(const UStringSearch
*strsrch
, int32_t start
,
933 if (strsrch
->pattern
.hasPrefixAccents
) {
934 UCollationElements
*coleiter
= strsrch
->textIter
;
935 UErrorCode status
= U_ZERO_ERROR
;
936 // we have been iterating forwards previously
937 uint32_t ignorable
= TRUE
;
938 int32_t firstce
= strsrch
->pattern
.CE
[0];
940 setColEIterOffset(coleiter
, start
);
941 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
942 if (U_FAILURE(status
)) {
945 while (ce
!= firstce
) {
946 if (ce
!= UCOL_IGNORABLE
) {
949 ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
950 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
954 if (!ignorable
&& inNormBuf(coleiter
)) {
955 // within normalization buffer, discontiguous handled here
960 int32_t temp
= start
;
962 // accent = (getFCD(strsrch->search->text, &temp,
963 // strsrch->search->textLength)
964 // >> SECOND_LAST_BYTE_SHIFT_);
965 // however this code does not work well with VC7 .net in release mode.
966 // maybe the inlines for getFCD combined with shifting has bugs in
967 // VC7. anyways this is a work around.
968 UBool accent
= getFCD(strsrch
->search
->text
, &temp
,
969 strsrch
->search
->textLength
) > 0xFF;
971 return checkExtraMatchAccents(strsrch
, start
, end
, &status
);
978 UTF_BACK_1(strsrch
->search
->text
, 0, temp
);
979 if (getFCD(strsrch
->search
->text
, &temp
,
980 strsrch
->search
->textLength
) & LAST_BYTE_MASK_
) {
981 setColEIterOffset(coleiter
, start
);
982 ce
= ucol_previous(coleiter
, &status
);
983 if (U_FAILURE(status
) ||
984 (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
)) {
995 * Used by exact matches, checks if there are accents bounding the match.
996 * Note this is the initial boundary check. If the potential match
997 * starts or ends with composite characters, the accents in those
998 * characters will be determined later.
999 * Not doing backwards iteration here, since discontiguous contractions for
1000 * the backwards collation element iterator use up too many characters.
1001 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
1002 * should fail since there is an acute at the end of \u01FA
1003 * Note here that accents checking are slow and cautioned in the API docs.
1004 * @param strsrch string search data
1005 * @param start offset of match
1006 * @param end end offset of the match
1007 * @return TRUE if there are accents on either side of the match,
1011 UBool
hasAccentsAfterMatch(const UStringSearch
*strsrch
, int32_t start
,
1014 if (strsrch
->pattern
.hasSuffixAccents
) {
1015 const UChar
*text
= strsrch
->search
->text
;
1017 int32_t textlength
= strsrch
->search
->textLength
;
1018 UTF_BACK_1(text
, 0, temp
);
1019 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
1020 int32_t firstce
= strsrch
->pattern
.CE
[0];
1021 UCollationElements
*coleiter
= strsrch
->textIter
;
1022 UErrorCode status
= U_ZERO_ERROR
;
1024 setColEIterOffset(coleiter
, start
);
1025 while ((ce
= getCE(strsrch
, ucol_next(coleiter
, &status
))) != firstce
) {
1026 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
1031 while (count
< strsrch
->pattern
.CELength
) {
1032 if (getCE(strsrch
, ucol_next(coleiter
, &status
))
1033 == UCOL_IGNORABLE
) {
1034 // Thai can give an ignorable here.
1037 if (U_FAILURE(status
)) {
1043 ce
= ucol_next(coleiter
, &status
);
1044 if (U_FAILURE(status
)) {
1047 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1048 ce
= getCE(strsrch
, ce
);
1050 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1051 if (ucol_getOffset(coleiter
) <= end
) {
1054 if (getFCD(text
, &end
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
1062 #endif // #if BOYER_MOORE
1065 * Checks if the offset runs out of the text string
1067 * @param textlength of the text string
1068 * @return TRUE if offset is out of bounds, FALSE otherwise
1071 inline UBool
isOutOfBounds(int32_t textlength
, int32_t offset
)
1073 return offset
< 0 || offset
> textlength
;
1078 * Checks for identical match
1079 * @param strsrch string search data
1080 * @param start offset of possible match
1081 * @param end offset of possible match
1082 * @return TRUE if identical match is found
1085 inline UBool
checkIdentical(const UStringSearch
*strsrch
, int32_t start
,
1088 UChar t2
[32], p2
[32];
1089 int32_t length
= end
- start
;
1090 if (strsrch
->strength
!= UCOL_IDENTICAL
) {
1094 UErrorCode status
= U_ZERO_ERROR
, status2
= U_ZERO_ERROR
;
1095 int32_t decomplength
= unorm_decompose(t2
, LENGTHOF(t2
),
1096 strsrch
->search
->text
+ start
, length
,
1098 // use separate status2 in case of buffer overflow
1099 if (decomplength
!= unorm_decompose(p2
, LENGTHOF(p2
),
1100 strsrch
->pattern
.text
,
1101 strsrch
->pattern
.textLength
,
1102 FALSE
, 0, &status2
)) {
1103 return FALSE
; // lengths are different
1107 UChar
*text
, *pattern
;
1108 if(U_SUCCESS(status
)) {
1111 } else if(status
==U_BUFFER_OVERFLOW_ERROR
) {
1112 status
= U_ZERO_ERROR
;
1113 // allocate one buffer for both decompositions
1114 text
= (UChar
*)uprv_malloc(decomplength
* 2 * U_SIZEOF_UCHAR
);
1115 // Check for allocation failure.
1119 pattern
= text
+ decomplength
;
1120 unorm_decompose(text
, decomplength
, strsrch
->search
->text
+ start
,
1121 length
, FALSE
, 0, &status
);
1122 unorm_decompose(pattern
, decomplength
, strsrch
->pattern
.text
,
1123 strsrch
->pattern
.textLength
, FALSE
, 0, &status
);
1125 // NFD failed, make sure that u_memcmp() does not overrun t2 & p2
1126 // and that we don't uprv_free() an undefined text pointer
1127 text
= pattern
= t2
;
1130 UBool result
= (UBool
)(u_memcmp(pattern
, text
, decomplength
) == 0);
1134 // return FALSE if NFD failed
1135 return U_SUCCESS(status
) && result
;
1139 * Checks to see if the match is repeated
1140 * @param strsrch string search data
1141 * @param start new match start index
1142 * @param end new match end index
1143 * @return TRUE if the match is repeated, FALSE otherwise
1146 inline UBool
checkRepeatedMatch(UStringSearch
*strsrch
,
1150 int32_t lastmatchindex
= strsrch
->search
->matchedIndex
;
1152 if (lastmatchindex
== USEARCH_DONE
) {
1155 if (strsrch
->search
->isForwardSearching
) {
1156 result
= start
<= lastmatchindex
;
1159 result
= start
>= lastmatchindex
;
1161 if (!result
&& !strsrch
->search
->isOverlap
) {
1162 if (strsrch
->search
->isForwardSearching
) {
1163 result
= start
< lastmatchindex
+ strsrch
->search
->matchedLength
;
1166 result
= end
> lastmatchindex
;
1173 * Gets the collation element iterator's current offset.
1174 * @param coleiter collation element iterator
1175 * @param forwards flag TRUE if we are moving in the forwards direction
1176 * @return current offset
1179 inline int32_t getColElemIterOffset(const UCollationElements
*coleiter
,
1182 int32_t result
= ucol_getOffset(coleiter
);
1183 // intricacies of the backwards collation element iterator
1184 if (FALSE
&& !forwards
&& inNormBuf(coleiter
) && !isFCDPointerNull(coleiter
)) {
1191 * Checks match for contraction.
1192 * If the match ends with a partial contraction we fail.
1193 * If the match starts too far off (because of backwards iteration) we try to
1194 * chip off the extra characters depending on whether a breakiterator has
1196 * Internal method, error assumed to be success, caller has to check status
1197 * before calling this method.
1198 * @param strsrch string search data
1199 * @param start offset of potential match, to be modified if necessary
1200 * @param end offset of potential match, to be modified if necessary
1201 * @param status output error status if any
1202 * @return TRUE if match passes the contraction test, FALSE otherwise
1206 UBool
checkNextExactContractionMatch(UStringSearch
*strsrch
,
1208 int32_t *end
, UErrorCode
*status
)
1210 UCollationElements
*coleiter
= strsrch
->textIter
;
1211 int32_t textlength
= strsrch
->search
->textLength
;
1212 int32_t temp
= *start
;
1213 const UCollator
*collator
= strsrch
->collator
;
1214 const UChar
*text
= strsrch
->search
->text
;
1215 // This part checks if either end of the match contains a potential
1216 // contraction. If so we'll have to iterate through them
1217 // The start contraction needs to be checked since ucol_previous dumps
1218 // all characters till the first safe character into the buffer.
1219 // *start + 1 is used to test for the unsafe characters instead of *start
1220 // because ucol_prev takes all unsafe characters till the first safe
1221 // character ie *start. so by testing *start + 1, we can estimate if
1222 // excess prefix characters have been included in the potential search
1224 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1225 (*start
+ 1 < textlength
1226 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1227 int32_t expansion
= getExpansionPrefix(coleiter
);
1228 UBool expandflag
= expansion
> 0;
1229 setColEIterOffset(coleiter
, *start
);
1230 while (expansion
> 0) {
1231 // getting rid of the redundant ce, caused by setOffset.
1232 // since backward contraction/expansion may have extra ces if we
1233 // are in the normalization buffer, hasAccentsBeforeMatch would
1234 // have taken care of it.
1235 // E.g. the character \u01FA will have an expansion of 3, but if
1236 // we are only looking for acute and ring \u030A and \u0301, we'll
1237 // have to skip the first ce in the expansion buffer.
1238 ucol_next(coleiter
, status
);
1239 if (U_FAILURE(*status
)) {
1242 if (ucol_getOffset(coleiter
) != temp
) {
1244 temp
= ucol_getOffset(coleiter
);
1249 int32_t *patternce
= strsrch
->pattern
.CE
;
1250 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1252 while (count
< patterncelength
) {
1253 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1254 if (ce
== UCOL_IGNORABLE
) {
1257 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1259 temp
= ucol_getOffset(coleiter
);
1261 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1263 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1273 * Checks and sets the match information if found.
1276 * <li> the potential match does not repeat the previous match
1277 * <li> boundaries are correct
1278 * <li> exact matches have no extra accents
1279 * <li> identical matches
1280 * <li> potential match does not end in the middle of a contraction
1282 * Otherwise the offset will be shifted to the next character.
1283 * Internal method, status assumed to be success, caller has to check status
1284 * before calling this method.
1285 * @param strsrch string search data
1286 * @param textoffset offset in the collation element text. the returned value
1287 * will be the truncated end offset of the match or the new start
1289 * @param status output error status if any
1290 * @return TRUE if the match is valid, FALSE otherwise
1293 inline UBool
checkNextExactMatch(UStringSearch
*strsrch
,
1294 int32_t *textoffset
, UErrorCode
*status
)
1296 UCollationElements
*coleiter
= strsrch
->textIter
;
1297 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1299 if (!checkNextExactContractionMatch(strsrch
, &start
, textoffset
, status
)) {
1303 // this totally matches, however we need to check if it is repeating
1304 if (!isBreakUnit(strsrch
, start
, *textoffset
) ||
1305 checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1306 hasAccentsBeforeMatch(strsrch
, start
, *textoffset
) ||
1307 !checkIdentical(strsrch
, start
, *textoffset
) ||
1308 hasAccentsAfterMatch(strsrch
, start
, *textoffset
)) {
1311 *textoffset
= getNextUStringSearchBaseOffset(strsrch
, *textoffset
);
1315 //Add breakiterator boundary check for primary strength search.
1316 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
1317 checkBreakBoundary(strsrch
, &start
, textoffset
);
1320 // totally match, we will get rid of the ending ignorables.
1321 strsrch
->search
->matchedIndex
= start
;
1322 strsrch
->search
->matchedLength
= *textoffset
- start
;
1327 * Getting the previous base character offset, or the current offset if the
1328 * current character is a base character
1329 * @param text string
1330 * @param textoffset one offset after the current character
1331 * @return the offset of the next character after the base character or the first
1332 * composed character with accents
1335 inline int32_t getPreviousBaseOffset(const UChar
*text
,
1338 if (textoffset
> 0) {
1340 int32_t result
= textoffset
;
1341 UTF_BACK_1(text
, 0, textoffset
);
1342 int32_t temp
= textoffset
;
1343 uint16_t fcd
= getFCD(text
, &temp
, result
);
1344 if ((fcd
>> SECOND_LAST_BYTE_SHIFT_
) == 0) {
1345 if (fcd
& LAST_BYTE_MASK_
) {
1350 if (textoffset
== 0) {
1359 * Getting the indexes of the accents that are not blocked in the argument
1361 * @param accents array of accents in nfd terminated by a 0.
1362 * @param accentsindex array of indexes of the accents that are not blocked
1365 inline int getUnblockedAccentIndex(UChar
*accents
, int32_t *accentsindex
)
1368 int32_t length
= u_strlen(accents
);
1369 UChar32 codepoint
= 0;
1373 while (index
< length
) {
1375 UTF_NEXT_CHAR(accents
, index
, length
, codepoint
);
1376 if (u_getCombiningClass(codepoint
) != cclass
) {
1377 cclass
= u_getCombiningClass(codepoint
);
1378 accentsindex
[result
] = temp
;
1382 accentsindex
[result
] = length
;
1387 * Appends 3 UChar arrays to a destination array.
1388 * Creates a new array if we run out of space. The caller will have to
1389 * manually deallocate the newly allocated array.
1390 * Internal method, status assumed to be success, caller has to check status
1391 * before calling this method. destination not to be NULL and has at least
1392 * size destinationlength.
1393 * @param destination target array
1394 * @param destinationlength target array size, returning the appended length
1395 * @param source1 null-terminated first array
1396 * @param source2 second array
1397 * @param source2length length of second array
1398 * @param source3 null-terminated third array
1399 * @param status error status if any
1400 * @return new destination array, destination if there was no new allocation
1403 inline UChar
* addToUCharArray( UChar
*destination
,
1404 int32_t *destinationlength
,
1405 const UChar
*source1
,
1406 const UChar
*source2
,
1407 int32_t source2length
,
1408 const UChar
*source3
,
1411 int32_t source1length
= source1
? u_strlen(source1
) : 0;
1412 int32_t source3length
= source3
? u_strlen(source3
) : 0;
1413 if (*destinationlength
< source1length
+ source2length
+ source3length
+
1416 destination
= (UChar
*)allocateMemory(
1417 (source1length
+ source2length
+ source3length
+ 1) * sizeof(UChar
),
1419 // if error allocating memory, status will be
1420 // U_MEMORY_ALLOCATION_ERROR
1421 if (U_FAILURE(*status
)) {
1422 *destinationlength
= 0;
1426 if (source1length
!= 0) {
1427 uprv_memcpy(destination
, source1
, sizeof(UChar
) * source1length
);
1429 if (source2length
!= 0) {
1430 uprv_memcpy(destination
+ source1length
, source2
,
1431 sizeof(UChar
) * source2length
);
1433 if (source3length
!= 0) {
1434 uprv_memcpy(destination
+ source1length
+ source2length
, source3
,
1435 sizeof(UChar
) * source3length
);
1437 *destinationlength
= source1length
+ source2length
+ source3length
;
1442 * Runs through a collation element iterator to see if its contents match
1443 * the pattern in the string search data.
1444 * @param strsrch string search data
1445 * @param coleiter collation element iterator
1446 * @return TRUE if a match is found, FALSE otherwise
1449 inline UBool
checkCollationMatch(const UStringSearch
*strsrch
,
1450 UCollationElements
*coleiter
)
1452 int patternceindex
= strsrch
->pattern
.CELength
;
1453 int32_t *patternce
= strsrch
->pattern
.CE
;
1454 UErrorCode status
= U_ZERO_ERROR
;
1455 while (patternceindex
> 0) {
1456 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
1457 if (ce
== UCOL_IGNORABLE
) {
1460 if (U_FAILURE(status
) || ce
!= *patternce
) {
1470 * Rearranges the front accents to try matching.
1471 * Prefix accents in the text will be grouped according to their combining
1472 * class and the groups will be mixed and matched to try find the perfect
1473 * match with the pattern.
1474 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1475 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1476 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1478 * step 2: check if any of the generated substrings matches the pattern.
1479 * Internal method, status is assumed to be success, caller has to check status
1480 * before calling this method.
1481 * @param strsrch string search match
1482 * @param start first offset of the accents to start searching
1483 * @param end start of the last accent set
1484 * @param status output error status if any
1485 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1486 * offset of the match. Note this start includes all preceding accents.
1489 int32_t doNextCanonicalPrefixMatch(UStringSearch
*strsrch
,
1494 const UChar
*text
= strsrch
->search
->text
;
1495 int32_t textlength
= strsrch
->search
->textLength
;
1496 int32_t tempstart
= start
;
1498 if ((getFCD(text
, &tempstart
, textlength
) & LAST_BYTE_MASK_
) == 0) {
1499 // die... failed at a base character
1500 return USEARCH_DONE
;
1503 int32_t offset
= getNextBaseOffset(text
, tempstart
, textlength
);
1504 start
= getPreviousBaseOffset(text
, tempstart
);
1506 UChar accents
[INITIAL_ARRAY_SIZE_
];
1507 // normalizing the offensive string
1508 unorm_normalize(text
+ start
, offset
- start
, UNORM_NFD
, 0, accents
,
1509 INITIAL_ARRAY_SIZE_
, status
);
1510 if (U_FAILURE(*status
)) {
1511 return USEARCH_DONE
;
1514 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1515 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1517 int32_t count
= (2 << (accentsize
- 1)) - 1;
1518 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1519 UCollationElements
*coleiter
= strsrch
->utilIter
;
1520 while (U_SUCCESS(*status
) && count
> 0) {
1521 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
1522 // copy the base characters
1523 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1524 *rearrange
++ = accents
[k
];
1526 // forming all possible canonical rearrangement by dropping
1528 for (int i
= 0; i
<= accentsize
- 1; i
++) {
1529 int32_t mask
= 1 << (accentsize
- i
- 1);
1531 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1532 *rearrange
++ = accents
[j
];
1537 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
1538 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
1539 strsrch
->canonicalPrefixAccents
,
1540 strsrch
->search
->text
+ offset
,
1542 strsrch
->canonicalSuffixAccents
,
1545 // if status is a failure, ucol_setText does nothing.
1546 // run the collator iterator through this match
1547 ucol_setText(coleiter
, match
, matchsize
, status
);
1548 if (U_SUCCESS(*status
)) {
1549 if (checkCollationMatch(strsrch
, coleiter
)) {
1550 if (match
!= buffer
) {
1558 return USEARCH_DONE
;
1562 * Gets the offset to the safe point in text before textoffset.
1563 * ie. not the middle of a contraction, swappable characters or supplementary
1565 * @param collator collation data
1566 * @param text string to work with
1567 * @param textoffset offset in string
1568 * @param textlength length of text string
1569 * @return offset to the previous safe character
1572 inline uint32_t getPreviousSafeOffset(const UCollator
*collator
,
1576 int32_t result
= textoffset
; // first contraction character
1577 while (result
!= 0 && ucol_unsafeCP(text
[result
- 1], collator
)) {
1581 // the first contraction character is considered unsafe here
1588 * Cleaning up after we passed the safe zone
1589 * @param strsrch string search data
1590 * @param safetext safe text array
1591 * @param safebuffer safe text buffer
1592 * @param coleiter collation element iterator for safe text
1595 inline void cleanUpSafeText(const UStringSearch
*strsrch
, UChar
*safetext
,
1598 if (safetext
!= safebuffer
&& safetext
!= strsrch
->canonicalSuffixAccents
)
1600 uprv_free(safetext
);
1605 * Takes the rearranged end accents and tries matching. If the match failed at
1606 * a separate preceding set of accents (separated from the rearranged one by
1607 * at least a base character) then we rearrange the preceding accents and
1608 * try matching again.
1609 * We allow skipping of the ends of the accent set if the ces do not match.
1610 * However if the failure is found before the accent set, it fails.
1611 * Internal method, status assumed to be success, caller has to check status
1612 * before calling this method.
1613 * @param strsrch string search data
1614 * @param textoffset of the start of the rearranged accent
1615 * @param status output error status if any
1616 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1617 * offset of the match. Note this start includes all preceding accents.
1620 int32_t doNextCanonicalSuffixMatch(UStringSearch
*strsrch
,
1624 const UChar
*text
= strsrch
->search
->text
;
1625 const UCollator
*collator
= strsrch
->collator
;
1626 int32_t safelength
= 0;
1628 int32_t safetextlength
;
1629 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
1630 UCollationElements
*coleiter
= strsrch
->utilIter
;
1631 int32_t safeoffset
= textoffset
;
1633 if (textoffset
!= 0 && ucol_unsafeCP(strsrch
->canonicalSuffixAccents
[0],
1635 safeoffset
= getPreviousSafeOffset(collator
, text
, textoffset
);
1636 safelength
= textoffset
- safeoffset
;
1637 safetextlength
= INITIAL_ARRAY_SIZE_
;
1638 safetext
= addToUCharArray(safebuffer
, &safetextlength
, NULL
,
1639 text
+ safeoffset
, safelength
,
1640 strsrch
->canonicalSuffixAccents
,
1644 safetextlength
= u_strlen(strsrch
->canonicalSuffixAccents
);
1645 safetext
= strsrch
->canonicalSuffixAccents
;
1648 // if status is a failure, ucol_setText does nothing
1649 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
1650 // status checked in loop below
1652 int32_t *ce
= strsrch
->pattern
.CE
;
1653 int32_t celength
= strsrch
->pattern
.CELength
;
1654 int ceindex
= celength
- 1;
1655 UBool isSafe
= TRUE
; // indication flag for position in safe zone
1657 while (ceindex
>= 0) {
1658 int32_t textce
= ucol_previous(coleiter
, status
);
1659 if (U_FAILURE(*status
)) {
1661 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1663 return USEARCH_DONE
;
1665 if (textce
== UCOL_NULLORDER
) {
1666 // check if we have passed the safe buffer
1667 if (coleiter
== strsrch
->textIter
) {
1668 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1669 return USEARCH_DONE
;
1671 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1672 safetext
= safebuffer
;
1673 coleiter
= strsrch
->textIter
;
1674 setColEIterOffset(coleiter
, safeoffset
);
1675 // status checked at the start of the loop
1679 textce
= getCE(strsrch
, textce
);
1680 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
1681 // do the beginning stuff
1682 int32_t failedoffset
= getColElemIterOffset(coleiter
, FALSE
);
1683 if (isSafe
&& failedoffset
>= safelength
) {
1684 // alas... no hope. failed at rearranged accent set
1685 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1686 return USEARCH_DONE
;
1690 failedoffset
+= safeoffset
;
1691 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1694 // try rearranging the front accents
1695 int32_t result
= doNextCanonicalPrefixMatch(strsrch
,
1696 failedoffset
, textoffset
, status
);
1697 if (result
!= USEARCH_DONE
) {
1698 // if status is a failure, ucol_setOffset does nothing
1699 setColEIterOffset(strsrch
->textIter
, result
);
1701 if (U_FAILURE(*status
)) {
1702 return USEARCH_DONE
;
1707 if (textce
== ce
[ceindex
]) {
1713 int32_t result
= getColElemIterOffset(coleiter
, FALSE
);
1714 // sets the text iterator here with the correct expansion and offset
1715 int32_t leftoverces
= getExpansionPrefix(coleiter
);
1716 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1717 if (result
>= safelength
) {
1718 result
= textoffset
;
1721 result
+= safeoffset
;
1723 setColEIterOffset(strsrch
->textIter
, result
);
1724 strsrch
->textIter
->iteratordata_
.toReturn
=
1725 setExpansionPrefix(strsrch
->textIter
, leftoverces
);
1729 return ucol_getOffset(coleiter
);
1733 * Tries out the substring and sees if it can be a canonical match.
1734 * This will try normalizing the end accents and arranging them into canonical
1735 * equivalents and check their corresponding ces with the pattern ce.
1736 * Suffix accents in the text will be grouped according to their combining
1737 * class and the groups will be mixed and matched to try find the perfect
1738 * match with the pattern.
1739 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1740 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1741 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1743 * step 2: check if any of the generated substrings matches the pattern.
1744 * Internal method, status assumed to be success, caller has to check status
1745 * before calling this method.
1746 * @param strsrch string search data
1747 * @param textoffset end offset in the collation element text that ends with
1748 * the accents to be rearranged
1749 * @param status error status if any
1750 * @return TRUE if the match is valid, FALSE otherwise
1753 UBool
doNextCanonicalMatch(UStringSearch
*strsrch
,
1757 const UChar
*text
= strsrch
->search
->text
;
1758 int32_t temp
= textoffset
;
1759 UTF_BACK_1(text
, 0, temp
);
1760 if ((getFCD(text
, &temp
, textoffset
) & LAST_BYTE_MASK_
) == 0) {
1761 UCollationElements
*coleiter
= strsrch
->textIter
;
1762 int32_t offset
= getColElemIterOffset(coleiter
, FALSE
);
1763 if (strsrch
->pattern
.hasPrefixAccents
) {
1764 offset
= doNextCanonicalPrefixMatch(strsrch
, offset
, textoffset
,
1766 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
1767 setColEIterOffset(coleiter
, offset
);
1774 if (!strsrch
->pattern
.hasSuffixAccents
) {
1778 UChar accents
[INITIAL_ARRAY_SIZE_
];
1779 // offset to the last base character in substring to search
1780 int32_t baseoffset
= getPreviousBaseOffset(text
, textoffset
);
1781 // normalizing the offensive string
1782 unorm_normalize(text
+ baseoffset
, textoffset
- baseoffset
, UNORM_NFD
,
1783 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
1784 // status checked in loop below
1786 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1787 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
1789 // 2 power n - 1 plus the full set of accents
1790 int32_t count
= (2 << (size
- 1)) - 1;
1791 while (U_SUCCESS(*status
) && count
> 0) {
1792 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
1793 // copy the base characters
1794 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1795 *rearrange
++ = accents
[k
];
1797 // forming all possible canonical rearrangement by dropping
1799 for (int i
= 0; i
<= size
- 1; i
++) {
1800 int32_t mask
= 1 << (size
- i
- 1);
1802 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1803 *rearrange
++ = accents
[j
];
1808 int32_t offset
= doNextCanonicalSuffixMatch(strsrch
, baseoffset
,
1810 if (offset
!= USEARCH_DONE
) {
1811 return TRUE
; // match found
1819 * Gets the previous base character offset depending on the string search
1821 * @param strsrch string search data
1822 * @param textoffset current offset, current character
1823 * @return the offset of the next character after this base character or itself
1824 * if it is a composed character with accents
1827 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch
*strsrch
,
1830 if (strsrch
->pattern
.hasPrefixAccents
&& textoffset
> 0) {
1831 const UChar
*text
= strsrch
->search
->text
;
1832 int32_t offset
= textoffset
;
1833 if (getFCD(text
, &offset
, strsrch
->search
->textLength
) >>
1834 SECOND_LAST_BYTE_SHIFT_
) {
1835 return getPreviousBaseOffset(text
, textoffset
);
1842 * Checks match for contraction.
1843 * If the match ends with a partial contraction we fail.
1844 * If the match starts too far off (because of backwards iteration) we try to
1845 * chip off the extra characters
1846 * Internal method, status assumed to be success, caller has to check status
1847 * before calling this method.
1848 * @param strsrch string search data
1849 * @param start offset of potential match, to be modified if necessary
1850 * @param end offset of potential match, to be modified if necessary
1851 * @param status output error status if any
1852 * @return TRUE if match passes the contraction test, FALSE otherwise
1855 UBool
checkNextCanonicalContractionMatch(UStringSearch
*strsrch
,
1860 UCollationElements
*coleiter
= strsrch
->textIter
;
1861 int32_t textlength
= strsrch
->search
->textLength
;
1862 int32_t temp
= *start
;
1863 const UCollator
*collator
= strsrch
->collator
;
1864 const UChar
*text
= strsrch
->search
->text
;
1865 // This part checks if either end of the match contains a potential
1866 // contraction. If so we'll have to iterate through them
1867 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1868 (*start
+ 1 < textlength
1869 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1870 int32_t expansion
= getExpansionPrefix(coleiter
);
1871 UBool expandflag
= expansion
> 0;
1872 setColEIterOffset(coleiter
, *start
);
1873 while (expansion
> 0) {
1874 // getting rid of the redundant ce, caused by setOffset.
1875 // since backward contraction/expansion may have extra ces if we
1876 // are in the normalization buffer, hasAccentsBeforeMatch would
1877 // have taken care of it.
1878 // E.g. the character \u01FA will have an expansion of 3, but if
1879 // we are only looking for acute and ring \u030A and \u0301, we'll
1880 // have to skip the first ce in the expansion buffer.
1881 ucol_next(coleiter
, status
);
1882 if (U_FAILURE(*status
)) {
1885 if (ucol_getOffset(coleiter
) != temp
) {
1887 temp
= ucol_getOffset(coleiter
);
1892 int32_t *patternce
= strsrch
->pattern
.CE
;
1893 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1895 int32_t textlength
= strsrch
->search
->textLength
;
1896 while (count
< patterncelength
) {
1897 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1898 // status checked below, note that if status is a failure
1899 // ucol_next returns UCOL_NULLORDER
1900 if (ce
== UCOL_IGNORABLE
) {
1903 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1905 temp
= ucol_getOffset(coleiter
);
1908 if (count
== 0 && ce
!= patternce
[0]) {
1909 // accents may have extra starting ces, this occurs when a
1910 // pure accent pattern is matched without rearrangement
1911 // text \u0325\u0300 and looking for \u0300
1912 int32_t expected
= patternce
[0];
1913 if (getFCD(text
, start
, textlength
) & LAST_BYTE_MASK_
) {
1914 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1915 while (U_SUCCESS(*status
) && ce
!= expected
&&
1916 ce
!= UCOL_NULLORDER
&&
1917 ucol_getOffset(coleiter
) <= *end
) {
1918 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1922 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1924 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1934 * Checks and sets the match information if found.
1937 * <li> the potential match does not repeat the previous match
1938 * <li> boundaries are correct
1939 * <li> potential match does not end in the middle of a contraction
1940 * <li> identical matches
1942 * Otherwise the offset will be shifted to the next character.
1943 * Internal method, status assumed to be success, caller has to check the
1944 * status before calling this method.
1945 * @param strsrch string search data
1946 * @param textoffset offset in the collation element text. the returned value
1947 * will be the truncated end offset of the match or the new start
1949 * @param status output error status if any
1950 * @return TRUE if the match is valid, FALSE otherwise
1953 inline UBool
checkNextCanonicalMatch(UStringSearch
*strsrch
,
1954 int32_t *textoffset
,
1957 // to ensure that the start and ends are not composite characters
1958 UCollationElements
*coleiter
= strsrch
->textIter
;
1959 // if we have a canonical accent match
1960 if ((strsrch
->pattern
.hasSuffixAccents
&&
1961 strsrch
->canonicalSuffixAccents
[0]) ||
1962 (strsrch
->pattern
.hasPrefixAccents
&&
1963 strsrch
->canonicalPrefixAccents
[0])) {
1964 strsrch
->search
->matchedIndex
= getPreviousUStringSearchBaseOffset(
1966 ucol_getOffset(coleiter
));
1967 strsrch
->search
->matchedLength
= *textoffset
-
1968 strsrch
->search
->matchedIndex
;
1972 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1973 if (!checkNextCanonicalContractionMatch(strsrch
, &start
, textoffset
,
1974 status
) || U_FAILURE(*status
)) {
1978 start
= getPreviousUStringSearchBaseOffset(strsrch
, start
);
1979 // this totally matches, however we need to check if it is repeating
1980 if (checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1981 !isBreakUnit(strsrch
, start
, *textoffset
) ||
1982 !checkIdentical(strsrch
, start
, *textoffset
)) {
1984 *textoffset
= getNextBaseOffset(strsrch
->search
->text
, *textoffset
,
1985 strsrch
->search
->textLength
);
1989 strsrch
->search
->matchedIndex
= start
;
1990 strsrch
->search
->matchedLength
= *textoffset
- start
;
1995 * Shifting the collation element iterator position forward to prepare for
1996 * a preceding match. If the first character is an unsafe character, we'll only
1997 * shift by 1 to capture contractions, normalization etc.
1998 * Internal method, status assumed to be success, caller has to check status
1999 * before calling this method.
2000 * @param strsrch string search data
2001 * @param textoffset start text position to do search
2002 * @param ce the text ce which failed the match.
2003 * @param patternceindex index of the ce within the pattern ce buffer which
2005 * @return final offset
2008 inline int32_t reverseShift(UStringSearch
*strsrch
,
2011 int32_t patternceindex
)
2013 if (strsrch
->search
->isOverlap
) {
2014 if (textoffset
!= strsrch
->search
->textLength
) {
2018 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
2022 if (ce
!= UCOL_NULLORDER
) {
2023 int32_t shift
= strsrch
->pattern
.backShift
[hash(ce
)];
2025 // this is to adjust for characters in the middle of the substring
2026 // for matching that failed.
2027 int32_t adjust
= patternceindex
;
2028 if (adjust
> 1 && shift
> adjust
) {
2029 shift
-= adjust
- 1;
2031 textoffset
-= shift
;
2034 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
2037 textoffset
= getPreviousUStringSearchBaseOffset(strsrch
, textoffset
);
2042 * Checks match for contraction.
2043 * If the match starts with a partial contraction we fail.
2044 * Internal method, status assumed to be success, caller has to check status
2045 * before calling this method.
2046 * @param strsrch string search data
2047 * @param start offset of potential match, to be modified if necessary
2048 * @param end offset of potential match, to be modified if necessary
2049 * @param status output error status if any
2050 * @return TRUE if match passes the contraction test, FALSE otherwise
2053 UBool
checkPreviousExactContractionMatch(UStringSearch
*strsrch
,
2055 int32_t *end
, UErrorCode
*status
)
2057 UCollationElements
*coleiter
= strsrch
->textIter
;
2058 int32_t textlength
= strsrch
->search
->textLength
;
2059 int32_t temp
= *end
;
2060 const UCollator
*collator
= strsrch
->collator
;
2061 const UChar
*text
= strsrch
->search
->text
;
2062 // This part checks if the start of the match contains a potential
2063 // contraction. If so we'll have to iterate through them
2064 // Since we used ucol_next while previously looking for the potential
2065 // match, this guarantees that our end will not be a partial contraction,
2066 // or a partial supplementary character.
2067 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2068 int32_t expansion
= getExpansionSuffix(coleiter
);
2069 UBool expandflag
= expansion
> 0;
2070 setColEIterOffset(coleiter
, *end
);
2071 while (U_SUCCESS(*status
) && expansion
> 0) {
2072 // getting rid of the redundant ce
2073 // since forward contraction/expansion may have extra ces
2074 // if we are in the normalization buffer, hasAccentsBeforeMatch
2075 // would have taken care of it.
2076 // E.g. the character \u01FA will have an expansion of 3, but if
2077 // we are only looking for A ring A\u030A, we'll have to skip the
2078 // last ce in the expansion buffer
2079 ucol_previous(coleiter
, status
);
2080 if (U_FAILURE(*status
)) {
2083 if (ucol_getOffset(coleiter
) != temp
) {
2085 temp
= ucol_getOffset(coleiter
);
2090 int32_t *patternce
= strsrch
->pattern
.CE
;
2091 int32_t patterncelength
= strsrch
->pattern
.CELength
;
2092 int32_t count
= patterncelength
;
2094 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2095 // status checked below, note that if status is a failure
2096 // ucol_previous returns UCOL_NULLORDER
2097 if (ce
== UCOL_IGNORABLE
) {
2100 if (expandflag
&& count
== 0 &&
2101 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2103 temp
= ucol_getOffset(coleiter
);
2105 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2107 *start
= getPreviousBaseOffset(text
, *start
);
2117 * Checks and sets the match information if found.
2120 * <li> the current match does not repeat the last match
2121 * <li> boundaries are correct
2122 * <li> exact matches has no extra accents
2123 * <li> identical matches
2125 * Otherwise the offset will be shifted to the preceding character.
2126 * Internal method, status assumed to be success, caller has to check status
2127 * before calling this method.
2128 * @param strsrch string search data
2130 * @param coleiter collation element iterator
2131 * @param text string
2132 * @param textoffset offset in the collation element text. the returned value
2133 * will be the truncated start offset of the match or the new start
2135 * @param status output error status if any
2136 * @return TRUE if the match is valid, FALSE otherwise
2139 inline UBool
checkPreviousExactMatch(UStringSearch
*strsrch
,
2140 int32_t *textoffset
,
2143 // to ensure that the start and ends are not composite characters
2144 int32_t end
= ucol_getOffset(strsrch
->textIter
);
2145 if (!checkPreviousExactContractionMatch(strsrch
, textoffset
, &end
, status
)
2146 || U_FAILURE(*status
)) {
2150 // this totally matches, however we need to check if it is repeating
2152 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2153 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2154 hasAccentsBeforeMatch(strsrch
, *textoffset
, end
) ||
2155 !checkIdentical(strsrch
, *textoffset
, end
) ||
2156 hasAccentsAfterMatch(strsrch
, *textoffset
, end
)) {
2158 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2163 //Add breakiterator boundary check for primary strength search.
2164 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
2165 checkBreakBoundary(strsrch
, textoffset
, &end
);
2168 strsrch
->search
->matchedIndex
= *textoffset
;
2169 strsrch
->search
->matchedLength
= end
- *textoffset
;
2174 * Rearranges the end accents to try matching.
2175 * Suffix accents in the text will be grouped according to their combining
2176 * class and the groups will be mixed and matched to try find the perfect
2177 * match with the pattern.
2178 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2179 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2180 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2182 * step 2: check if any of the generated substrings matches the pattern.
2183 * Internal method, status assumed to be success, user has to check status
2184 * before calling this method.
2185 * @param strsrch string search match
2186 * @param start offset of the first base character
2187 * @param end start of the last accent set
2188 * @param status only error status if any
2189 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2190 * offset of the match. Note this start includes all following accents.
2193 int32_t doPreviousCanonicalSuffixMatch(UStringSearch
*strsrch
,
2198 const UChar
*text
= strsrch
->search
->text
;
2199 int32_t tempend
= end
;
2201 UTF_BACK_1(text
, 0, tempend
);
2202 if (!(getFCD(text
, &tempend
, strsrch
->search
->textLength
) &
2204 // die... failed at a base character
2205 return USEARCH_DONE
;
2207 end
= getNextBaseOffset(text
, end
, strsrch
->search
->textLength
);
2209 if (U_SUCCESS(*status
)) {
2210 UChar accents
[INITIAL_ARRAY_SIZE_
];
2211 int32_t offset
= getPreviousBaseOffset(text
, end
);
2212 // normalizing the offensive string
2213 unorm_normalize(text
+ offset
, end
- offset
, UNORM_NFD
, 0, accents
,
2214 INITIAL_ARRAY_SIZE_
, status
);
2216 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2217 int32_t accentsize
= getUnblockedAccentIndex(accents
,
2219 int32_t count
= (2 << (accentsize
- 1)) - 1;
2220 UChar buffer
[INITIAL_ARRAY_SIZE_
];
2221 UCollationElements
*coleiter
= strsrch
->utilIter
;
2222 while (U_SUCCESS(*status
) && count
> 0) {
2223 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
2224 // copy the base characters
2225 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2226 *rearrange
++ = accents
[k
];
2228 // forming all possible canonical rearrangement by dropping
2230 for (int i
= 0; i
<= accentsize
- 1; i
++) {
2231 int32_t mask
= 1 << (accentsize
- i
- 1);
2233 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2234 *rearrange
++ = accents
[j
];
2239 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
2240 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
2241 strsrch
->canonicalPrefixAccents
,
2242 strsrch
->search
->text
+ start
,
2244 strsrch
->canonicalSuffixAccents
,
2247 // run the collator iterator through this match
2248 // if status is a failure ucol_setText does nothing
2249 ucol_setText(coleiter
, match
, matchsize
, status
);
2250 if (U_SUCCESS(*status
)) {
2251 if (checkCollationMatch(strsrch
, coleiter
)) {
2252 if (match
!= buffer
) {
2261 return USEARCH_DONE
;
2265 * Take the rearranged start accents and tries matching. If match failed at
2266 * a seperate following set of accents (seperated from the rearranged on by
2267 * at least a base character) then we rearrange the preceding accents and
2268 * tries matching again.
2269 * We allow skipping of the ends of the accent set if the ces do not match.
2270 * However if the failure is found before the accent set, it fails.
2271 * Internal method, status assumed to be success, caller has to check status
2272 * before calling this method.
2273 * @param strsrch string search data
2274 * @param textoffset of the ends of the rearranged accent
2275 * @param status output error status if any
2276 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2277 * offset of the match. Note this start includes all following accents.
2280 int32_t doPreviousCanonicalPrefixMatch(UStringSearch
*strsrch
,
2284 const UChar
*text
= strsrch
->search
->text
;
2285 const UCollator
*collator
= strsrch
->collator
;
2286 int32_t safelength
= 0;
2288 int32_t safetextlength
;
2289 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
2290 int32_t safeoffset
= textoffset
;
2293 ucol_unsafeCP(strsrch
->canonicalPrefixAccents
[
2294 u_strlen(strsrch
->canonicalPrefixAccents
) - 1
2296 safeoffset
= getNextSafeOffset(collator
, text
, textoffset
,
2297 strsrch
->search
->textLength
);
2298 safelength
= safeoffset
- textoffset
;
2299 safetextlength
= INITIAL_ARRAY_SIZE_
;
2300 safetext
= addToUCharArray(safebuffer
, &safetextlength
,
2301 strsrch
->canonicalPrefixAccents
,
2302 text
+ textoffset
, safelength
,
2306 safetextlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2307 safetext
= strsrch
->canonicalPrefixAccents
;
2310 UCollationElements
*coleiter
= strsrch
->utilIter
;
2311 // if status is a failure, ucol_setText does nothing
2312 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
2313 // status checked in loop below
2315 int32_t *ce
= strsrch
->pattern
.CE
;
2316 int32_t celength
= strsrch
->pattern
.CELength
;
2318 UBool isSafe
= TRUE
; // safe zone indication flag for position
2319 int32_t prefixlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2321 while (ceindex
< celength
) {
2322 int32_t textce
= ucol_next(coleiter
, status
);
2323 if (U_FAILURE(*status
)) {
2325 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2327 return USEARCH_DONE
;
2329 if (textce
== UCOL_NULLORDER
) {
2330 // check if we have passed the safe buffer
2331 if (coleiter
== strsrch
->textIter
) {
2332 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2333 return USEARCH_DONE
;
2335 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2336 safetext
= safebuffer
;
2337 coleiter
= strsrch
->textIter
;
2338 setColEIterOffset(coleiter
, safeoffset
);
2339 // status checked at the start of the loop
2343 textce
= getCE(strsrch
, textce
);
2344 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
2345 // do the beginning stuff
2346 int32_t failedoffset
= ucol_getOffset(coleiter
);
2347 if (isSafe
&& failedoffset
<= prefixlength
) {
2348 // alas... no hope. failed at rearranged accent set
2349 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2350 return USEARCH_DONE
;
2354 failedoffset
= safeoffset
- failedoffset
;
2355 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2358 // try rearranging the end accents
2359 int32_t result
= doPreviousCanonicalSuffixMatch(strsrch
,
2360 textoffset
, failedoffset
, status
);
2361 if (result
!= USEARCH_DONE
) {
2362 // if status is a failure, ucol_setOffset does nothing
2363 setColEIterOffset(strsrch
->textIter
, result
);
2365 if (U_FAILURE(*status
)) {
2366 return USEARCH_DONE
;
2371 if (textce
== ce
[ceindex
]) {
2377 int32_t result
= ucol_getOffset(coleiter
);
2378 // sets the text iterator here with the correct expansion and offset
2379 int32_t leftoverces
= getExpansionSuffix(coleiter
);
2380 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2381 if (result
<= prefixlength
) {
2382 result
= textoffset
;
2385 result
= textoffset
+ (safeoffset
- result
);
2387 setColEIterOffset(strsrch
->textIter
, result
);
2388 setExpansionSuffix(strsrch
->textIter
, leftoverces
);
2392 return ucol_getOffset(coleiter
);
2396 * Trying out the substring and sees if it can be a canonical match.
2397 * This will try normalizing the starting accents and arranging them into
2398 * canonical equivalents and check their corresponding ces with the pattern ce.
2399 * Prefix accents in the text will be grouped according to their combining
2400 * class and the groups will be mixed and matched to try find the perfect
2401 * match with the pattern.
2402 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2403 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2404 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2406 * step 2: check if any of the generated substrings matches the pattern.
2407 * Internal method, status assumed to be success, caller has to check status
2408 * before calling this method.
2409 * @param strsrch string search data
2410 * @param textoffset start offset in the collation element text that starts
2411 * with the accents to be rearranged
2412 * @param status output error status if any
2413 * @return TRUE if the match is valid, FALSE otherwise
2416 UBool
doPreviousCanonicalMatch(UStringSearch
*strsrch
,
2420 const UChar
*text
= strsrch
->search
->text
;
2421 int32_t temp
= textoffset
;
2422 int32_t textlength
= strsrch
->search
->textLength
;
2423 if ((getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) == 0) {
2424 UCollationElements
*coleiter
= strsrch
->textIter
;
2425 int32_t offset
= ucol_getOffset(coleiter
);
2426 if (strsrch
->pattern
.hasSuffixAccents
) {
2427 offset
= doPreviousCanonicalSuffixMatch(strsrch
, textoffset
,
2429 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
2430 setColEIterOffset(coleiter
, offset
);
2437 if (!strsrch
->pattern
.hasPrefixAccents
) {
2441 UChar accents
[INITIAL_ARRAY_SIZE_
];
2442 // offset to the last base character in substring to search
2443 int32_t baseoffset
= getNextBaseOffset(text
, textoffset
, textlength
);
2444 // normalizing the offensive string
2445 unorm_normalize(text
+ textoffset
, baseoffset
- textoffset
, UNORM_NFD
,
2446 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
2447 // status checked in loop
2449 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2450 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
2452 // 2 power n - 1 plus the full set of accents
2453 int32_t count
= (2 << (size
- 1)) - 1;
2454 while (U_SUCCESS(*status
) && count
> 0) {
2455 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
2456 // copy the base characters
2457 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2458 *rearrange
++ = accents
[k
];
2460 // forming all possible canonical rearrangement by dropping
2462 for (int i
= 0; i
<= size
- 1; i
++) {
2463 int32_t mask
= 1 << (size
- i
- 1);
2465 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2466 *rearrange
++ = accents
[j
];
2471 int32_t offset
= doPreviousCanonicalPrefixMatch(strsrch
,
2472 baseoffset
, status
);
2473 if (offset
!= USEARCH_DONE
) {
2474 return TRUE
; // match found
2482 * Checks match for contraction.
2483 * If the match starts with a partial contraction we fail.
2484 * Internal method, status assumed to be success, caller has to check status
2485 * before calling this method.
2486 * @param strsrch string search data
2487 * @param start offset of potential match, to be modified if necessary
2488 * @param end offset of potential match, to be modified if necessary
2489 * @param status only error status if any
2490 * @return TRUE if match passes the contraction test, FALSE otherwise
2493 UBool
checkPreviousCanonicalContractionMatch(UStringSearch
*strsrch
,
2495 int32_t *end
, UErrorCode
*status
)
2497 UCollationElements
*coleiter
= strsrch
->textIter
;
2498 int32_t textlength
= strsrch
->search
->textLength
;
2499 int32_t temp
= *end
;
2500 const UCollator
*collator
= strsrch
->collator
;
2501 const UChar
*text
= strsrch
->search
->text
;
2502 // This part checks if either if the start of the match contains potential
2503 // contraction. If so we'll have to iterate through them
2504 // Since we used ucol_next while previously looking for the potential
2505 // match, this guarantees that our end will not be a partial contraction,
2506 // or a partial supplementary character.
2507 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2508 int32_t expansion
= getExpansionSuffix(coleiter
);
2509 UBool expandflag
= expansion
> 0;
2510 setColEIterOffset(coleiter
, *end
);
2511 while (expansion
> 0) {
2512 // getting rid of the redundant ce
2513 // since forward contraction/expansion may have extra ces
2514 // if we are in the normalization buffer, hasAccentsBeforeMatch
2515 // would have taken care of it.
2516 // E.g. the character \u01FA will have an expansion of 3, but if
2517 // we are only looking for A ring A\u030A, we'll have to skip the
2518 // last ce in the expansion buffer
2519 ucol_previous(coleiter
, status
);
2520 if (U_FAILURE(*status
)) {
2523 if (ucol_getOffset(coleiter
) != temp
) {
2525 temp
= ucol_getOffset(coleiter
);
2530 int32_t *patternce
= strsrch
->pattern
.CE
;
2531 int32_t patterncelength
= strsrch
->pattern
.CELength
;
2532 int32_t count
= patterncelength
;
2534 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2535 // status checked below, note that if status is a failure
2536 // ucol_previous returns UCOL_NULLORDER
2537 if (ce
== UCOL_IGNORABLE
) {
2540 if (expandflag
&& count
== 0 &&
2541 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2543 temp
= ucol_getOffset(coleiter
);
2545 if (count
== patterncelength
&&
2546 ce
!= patternce
[patterncelength
- 1]) {
2547 // accents may have extra starting ces, this occurs when a
2548 // pure accent pattern is matched without rearrangement
2549 int32_t expected
= patternce
[patterncelength
- 1];
2550 UTF_BACK_1(text
, 0, *end
);
2551 if (getFCD(text
, end
, textlength
) & LAST_BYTE_MASK_
) {
2552 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2553 while (U_SUCCESS(*status
) && ce
!= expected
&&
2554 ce
!= UCOL_NULLORDER
&&
2555 ucol_getOffset(coleiter
) <= *start
) {
2556 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2560 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2562 *start
= getPreviousBaseOffset(text
, *start
);
2572 * Checks and sets the match information if found.
2575 * <li> the potential match does not repeat the previous match
2576 * <li> boundaries are correct
2577 * <li> potential match does not end in the middle of a contraction
2578 * <li> identical matches
2580 * Otherwise the offset will be shifted to the next character.
2581 * Internal method, status assumed to be success, caller has to check status
2582 * before calling this method.
2583 * @param strsrch string search data
2584 * @param textoffset offset in the collation element text. the returned value
2585 * will be the truncated start offset of the match or the new start
2587 * @param status only error status if any
2588 * @return TRUE if the match is valid, FALSE otherwise
2591 inline UBool
checkPreviousCanonicalMatch(UStringSearch
*strsrch
,
2592 int32_t *textoffset
,
2595 // to ensure that the start and ends are not composite characters
2596 UCollationElements
*coleiter
= strsrch
->textIter
;
2597 // if we have a canonical accent match
2598 if ((strsrch
->pattern
.hasSuffixAccents
&&
2599 strsrch
->canonicalSuffixAccents
[0]) ||
2600 (strsrch
->pattern
.hasPrefixAccents
&&
2601 strsrch
->canonicalPrefixAccents
[0])) {
2602 strsrch
->search
->matchedIndex
= *textoffset
;
2603 strsrch
->search
->matchedLength
=
2604 getNextUStringSearchBaseOffset(strsrch
,
2605 getColElemIterOffset(coleiter
, FALSE
))
2610 int32_t end
= ucol_getOffset(coleiter
);
2611 if (!checkPreviousCanonicalContractionMatch(strsrch
, textoffset
, &end
,
2613 U_FAILURE(*status
)) {
2617 end
= getNextUStringSearchBaseOffset(strsrch
, end
);
2618 // this totally matches, however we need to check if it is repeating
2619 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2620 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2621 !checkIdentical(strsrch
, *textoffset
, end
)) {
2623 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2628 strsrch
->search
->matchedIndex
= *textoffset
;
2629 strsrch
->search
->matchedLength
= end
- *textoffset
;
2632 #endif // #if BOYER_MOORE
2634 // constructors and destructor -------------------------------------------
2636 U_CAPI UStringSearch
* U_EXPORT2
usearch_open(const UChar
*pattern
,
2637 int32_t patternlength
,
2641 UBreakIterator
*breakiter
,
2644 if (U_FAILURE(*status
)) {
2647 #if UCONFIG_NO_BREAK_ITERATION
2648 if (breakiter
!= NULL
) {
2649 *status
= U_UNSUPPORTED_ERROR
;
2654 // ucol_open internally checks for status
2655 UCollator
*collator
= ucol_open(locale
, status
);
2656 // pattern, text checks are done in usearch_openFromCollator
2657 UStringSearch
*result
= usearch_openFromCollator(pattern
,
2658 patternlength
, text
, textlength
,
2659 collator
, breakiter
, status
);
2661 if (result
== NULL
|| U_FAILURE(*status
)) {
2663 ucol_close(collator
);
2668 result
->ownCollator
= TRUE
;
2672 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2676 U_CAPI UStringSearch
* U_EXPORT2
usearch_openFromCollator(
2677 const UChar
*pattern
,
2678 int32_t patternlength
,
2681 const UCollator
*collator
,
2682 UBreakIterator
*breakiter
,
2685 if (U_FAILURE(*status
)) {
2688 #if UCONFIG_NO_BREAK_ITERATION
2689 if (breakiter
!= NULL
) {
2690 *status
= U_UNSUPPORTED_ERROR
;
2694 if (pattern
== NULL
|| text
== NULL
|| collator
== NULL
) {
2695 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2699 // string search does not really work when numeric collation is turned on
2700 if(ucol_getAttribute(collator
, UCOL_NUMERIC_COLLATION
, status
) == UCOL_ON
) {
2701 *status
= U_UNSUPPORTED_ERROR
;
2705 if (U_SUCCESS(*status
)) {
2706 initializeFCD(status
);
2707 if (U_FAILURE(*status
)) {
2711 UStringSearch
*result
;
2712 if (textlength
== -1) {
2713 textlength
= u_strlen(text
);
2715 if (patternlength
== -1) {
2716 patternlength
= u_strlen(pattern
);
2718 if (textlength
<= 0 || patternlength
<= 0) {
2719 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2723 result
= (UStringSearch
*)uprv_malloc(sizeof(UStringSearch
));
2724 if (result
== NULL
) {
2725 *status
= U_MEMORY_ALLOCATION_ERROR
;
2729 result
->collator
= collator
;
2730 result
->strength
= ucol_getStrength(collator
);
2731 result
->ceMask
= getMask(result
->strength
);
2733 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2735 result
->variableTop
= ucol_getVariableTop(collator
, status
);
2737 if (U_FAILURE(*status
)) {
2742 result
->search
= (USearch
*)uprv_malloc(sizeof(USearch
));
2743 if (result
->search
== NULL
) {
2744 *status
= U_MEMORY_ALLOCATION_ERROR
;
2749 result
->search
->text
= text
;
2750 result
->search
->textLength
= textlength
;
2752 result
->pattern
.text
= pattern
;
2753 result
->pattern
.textLength
= patternlength
;
2754 result
->pattern
.CE
= NULL
;
2755 result
->pattern
.PCE
= NULL
;
2757 result
->search
->breakIter
= breakiter
;
2758 #if !UCONFIG_NO_BREAK_ITERATION
2759 result
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocale(result
->collator
, ULOC_VALID_LOCALE
, status
), text
, textlength
, status
);
2761 ubrk_setText(breakiter
, text
, textlength
, status
);
2765 result
->ownCollator
= FALSE
;
2766 result
->search
->matchedLength
= 0;
2767 result
->search
->matchedIndex
= USEARCH_DONE
;
2768 result
->utilIter
= NULL
;
2769 result
->textIter
= ucol_openElements(collator
, text
,
2770 textlength
, status
);
2771 if (U_FAILURE(*status
)) {
2772 usearch_close(result
);
2776 result
->search
->isOverlap
= FALSE
;
2777 result
->search
->isCanonicalMatch
= FALSE
;
2778 result
->search
->isForwardSearching
= TRUE
;
2779 result
->search
->reset
= TRUE
;
2781 initialize(result
, status
);
2783 if (U_FAILURE(*status
)) {
2784 usearch_close(result
);
2793 U_CAPI
void U_EXPORT2
usearch_close(UStringSearch
*strsrch
)
2796 if (strsrch
->pattern
.CE
!= strsrch
->pattern
.CEBuffer
&&
2797 strsrch
->pattern
.CE
) {
2798 uprv_free(strsrch
->pattern
.CE
);
2801 if (strsrch
->pattern
.PCE
!= NULL
&&
2802 strsrch
->pattern
.PCE
!= strsrch
->pattern
.PCEBuffer
) {
2803 uprv_free(strsrch
->pattern
.PCE
);
2806 ucol_closeElements(strsrch
->textIter
);
2807 ucol_closeElements(strsrch
->utilIter
);
2809 if (strsrch
->ownCollator
&& strsrch
->collator
) {
2810 ucol_close((UCollator
*)strsrch
->collator
);
2813 #if !UCONFIG_NO_BREAK_ITERATION
2814 if (strsrch
->search
->internalBreakIter
) {
2815 ubrk_close(strsrch
->search
->internalBreakIter
);
2819 uprv_free(strsrch
->search
);
2824 // set and get methods --------------------------------------------------
2826 U_CAPI
void U_EXPORT2
usearch_setOffset(UStringSearch
*strsrch
,
2830 if (U_SUCCESS(*status
) && strsrch
) {
2831 if (isOutOfBounds(strsrch
->search
->textLength
, position
)) {
2832 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2835 setColEIterOffset(strsrch
->textIter
, position
);
2837 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2838 strsrch
->search
->matchedLength
= 0;
2839 strsrch
->search
->reset
= FALSE
;
2843 U_CAPI
int32_t U_EXPORT2
usearch_getOffset(const UStringSearch
*strsrch
)
2846 int32_t result
= ucol_getOffset(strsrch
->textIter
);
2847 if (isOutOfBounds(strsrch
->search
->textLength
, result
)) {
2848 return USEARCH_DONE
;
2852 return USEARCH_DONE
;
2855 U_CAPI
void U_EXPORT2
usearch_setAttribute(UStringSearch
*strsrch
,
2856 USearchAttribute attribute
,
2857 USearchAttributeValue value
,
2860 if (U_SUCCESS(*status
) && strsrch
) {
2863 case USEARCH_OVERLAP
:
2864 strsrch
->search
->isOverlap
= (value
== USEARCH_ON
? TRUE
: FALSE
);
2866 case USEARCH_CANONICAL_MATCH
:
2867 strsrch
->search
->isCanonicalMatch
= (value
== USEARCH_ON
? TRUE
:
2870 case USEARCH_ATTRIBUTE_COUNT
:
2872 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2875 if (value
== USEARCH_ATTRIBUTE_VALUE_COUNT
) {
2876 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2880 U_CAPI USearchAttributeValue U_EXPORT2
usearch_getAttribute(
2881 const UStringSearch
*strsrch
,
2882 USearchAttribute attribute
)
2885 switch (attribute
) {
2886 case USEARCH_OVERLAP
:
2887 return (strsrch
->search
->isOverlap
== TRUE
? USEARCH_ON
:
2889 case USEARCH_CANONICAL_MATCH
:
2890 return (strsrch
->search
->isCanonicalMatch
== TRUE
? USEARCH_ON
:
2892 case USEARCH_ATTRIBUTE_COUNT
:
2893 return USEARCH_DEFAULT
;
2896 return USEARCH_DEFAULT
;
2899 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedStart(
2900 const UStringSearch
*strsrch
)
2902 if (strsrch
== NULL
) {
2903 return USEARCH_DONE
;
2905 return strsrch
->search
->matchedIndex
;
2909 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedText(const UStringSearch
*strsrch
,
2911 int32_t resultCapacity
,
2914 if (U_FAILURE(*status
)) {
2915 return USEARCH_DONE
;
2917 if (strsrch
== NULL
|| resultCapacity
< 0 || (resultCapacity
> 0 &&
2919 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2920 return USEARCH_DONE
;
2923 int32_t copylength
= strsrch
->search
->matchedLength
;
2924 int32_t copyindex
= strsrch
->search
->matchedIndex
;
2925 if (copyindex
== USEARCH_DONE
) {
2926 u_terminateUChars(result
, resultCapacity
, 0, status
);
2927 return USEARCH_DONE
;
2930 if (resultCapacity
< copylength
) {
2931 copylength
= resultCapacity
;
2933 if (copylength
> 0) {
2934 uprv_memcpy(result
, strsrch
->search
->text
+ copyindex
,
2935 copylength
* sizeof(UChar
));
2937 return u_terminateUChars(result
, resultCapacity
,
2938 strsrch
->search
->matchedLength
, status
);
2941 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedLength(
2942 const UStringSearch
*strsrch
)
2945 return strsrch
->search
->matchedLength
;
2947 return USEARCH_DONE
;
2950 #if !UCONFIG_NO_BREAK_ITERATION
2952 U_CAPI
void U_EXPORT2
usearch_setBreakIterator(UStringSearch
*strsrch
,
2953 UBreakIterator
*breakiter
,
2956 if (U_SUCCESS(*status
) && strsrch
) {
2957 strsrch
->search
->breakIter
= breakiter
;
2959 ubrk_setText(breakiter
, strsrch
->search
->text
,
2960 strsrch
->search
->textLength
, status
);
2965 U_CAPI
const UBreakIterator
* U_EXPORT2
2966 usearch_getBreakIterator(const UStringSearch
*strsrch
)
2969 return strsrch
->search
->breakIter
;
2976 U_CAPI
void U_EXPORT2
usearch_setText( UStringSearch
*strsrch
,
2981 if (U_SUCCESS(*status
)) {
2982 if (strsrch
== NULL
|| text
== NULL
|| textlength
< -1 ||
2984 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2987 if (textlength
== -1) {
2988 textlength
= u_strlen(text
);
2990 strsrch
->search
->text
= text
;
2991 strsrch
->search
->textLength
= textlength
;
2992 ucol_setText(strsrch
->textIter
, text
, textlength
, status
);
2993 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2994 strsrch
->search
->matchedLength
= 0;
2995 strsrch
->search
->reset
= TRUE
;
2996 #if !UCONFIG_NO_BREAK_ITERATION
2997 if (strsrch
->search
->breakIter
!= NULL
) {
2998 ubrk_setText(strsrch
->search
->breakIter
, text
,
2999 textlength
, status
);
3001 ubrk_setText(strsrch
->search
->internalBreakIter
, text
, textlength
, status
);
3007 U_CAPI
const UChar
* U_EXPORT2
usearch_getText(const UStringSearch
*strsrch
,
3011 *length
= strsrch
->search
->textLength
;
3012 return strsrch
->search
->text
;
3017 U_CAPI
void U_EXPORT2
usearch_setCollator( UStringSearch
*strsrch
,
3018 const UCollator
*collator
,
3021 if (U_SUCCESS(*status
)) {
3022 if (collator
== NULL
) {
3023 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3028 if (strsrch
->ownCollator
&& (strsrch
->collator
!= collator
)) {
3029 ucol_close((UCollator
*)strsrch
->collator
);
3030 strsrch
->ownCollator
= FALSE
;
3032 strsrch
->collator
= collator
;
3033 strsrch
->strength
= ucol_getStrength(collator
);
3034 strsrch
->ceMask
= getMask(strsrch
->strength
);
3035 #if !UCONFIG_NO_BREAK_ITERATION
3036 ubrk_close(strsrch
->search
->internalBreakIter
);
3037 strsrch
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocale(collator
, ULOC_VALID_LOCALE
, status
),
3038 strsrch
->search
->text
, strsrch
->search
->textLength
, status
);
3040 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3042 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
3044 // if status is a failure, ucol_getVariableTop returns 0
3045 strsrch
->variableTop
= ucol_getVariableTop(collator
, status
);
3046 if (U_SUCCESS(*status
)) {
3047 initialize(strsrch
, status
);
3048 if (U_SUCCESS(*status
)) {
3049 /* free offset buffer to avoid memory leak before initializing. */
3050 freeOffsetBuffer(&(strsrch
->textIter
->iteratordata_
));
3051 uprv_init_collIterate(collator
, strsrch
->search
->text
,
3052 strsrch
->search
->textLength
,
3053 &(strsrch
->textIter
->iteratordata_
));
3054 strsrch
->utilIter
->iteratordata_
.coll
= collator
;
3059 // **** are these calls needed?
3060 // **** we call uprv_init_pce in initializePatternPCETable
3061 // **** and the CEBuffer constructor...
3063 uprv_init_pce(strsrch
->textIter
);
3064 uprv_init_pce(strsrch
->utilIter
);
3069 U_CAPI UCollator
* U_EXPORT2
usearch_getCollator(const UStringSearch
*strsrch
)
3072 return (UCollator
*)strsrch
->collator
;
3077 U_CAPI
void U_EXPORT2
usearch_setPattern( UStringSearch
*strsrch
,
3078 const UChar
*pattern
,
3079 int32_t patternlength
,
3082 if (U_SUCCESS(*status
)) {
3083 if (strsrch
== NULL
|| pattern
== NULL
) {
3084 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3087 if (patternlength
== -1) {
3088 patternlength
= u_strlen(pattern
);
3090 if (patternlength
== 0) {
3091 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3094 strsrch
->pattern
.text
= pattern
;
3095 strsrch
->pattern
.textLength
= patternlength
;
3096 initialize(strsrch
, status
);
// Accessor for the current pattern: writes the stored pattern length to
// *length and returns the stored pattern text pointer.
3101 U_CAPI
const UChar
* U_EXPORT2
3102 usearch_getPattern(const UStringSearch
*strsrch
,
3106 *length
= strsrch
->pattern
.textLength
;
3107 return strsrch
->pattern
.text
;
3112 // miscellaneous methods -------------------------------------------------
// Finds the first match in the text: switches to forward searching, moves
// the offset to 0 and delegates to usearch_next().
// Returns USEARCH_DONE when strsrch is NULL, when *status is already a
// failure, or when usearch_setOffset() reports a failure.
3114 U_CAPI
int32_t U_EXPORT2
usearch_first(UStringSearch
*strsrch
,
3117 if (strsrch
&& U_SUCCESS(*status
)) {
3118 strsrch
->search
->isForwardSearching
= TRUE
;
3119 usearch_setOffset(strsrch
, 0, status
);
3120 if (U_SUCCESS(*status
)) {
3121 return usearch_next(strsrch
, status
);
3124 return USEARCH_DONE
;
// Finds the next match starting from `position`: switches to forward
// searching, moves the offset to `position` and delegates to usearch_next().
// Returns USEARCH_DONE when strsrch is NULL, when *status is already a
// failure, or when usearch_setOffset() reports a failure.
3127 U_CAPI
int32_t U_EXPORT2
usearch_following(UStringSearch
*strsrch
,
3131 if (strsrch
&& U_SUCCESS(*status
)) {
3132 strsrch
->search
->isForwardSearching
= TRUE
;
3133 // position checked in usearch_setOffset
3134 usearch_setOffset(strsrch
, position
, status
);
3135 if (U_SUCCESS(*status
)) {
3136 return usearch_next(strsrch
, status
);
3139 return USEARCH_DONE
;
// Finds the last match in the text: switches to backward searching, moves
// the offset to the end of the text (textLength) and delegates to
// usearch_previous().
// Returns USEARCH_DONE when strsrch is NULL, when *status is already a
// failure, or when usearch_setOffset() reports a failure.
3142 U_CAPI
int32_t U_EXPORT2
usearch_last(UStringSearch
*strsrch
,
3145 if (strsrch
&& U_SUCCESS(*status
)) {
3146 strsrch
->search
->isForwardSearching
= FALSE
;
3147 usearch_setOffset(strsrch
, strsrch
->search
->textLength
, status
);
3148 if (U_SUCCESS(*status
)) {
3149 return usearch_previous(strsrch
, status
);
3152 return USEARCH_DONE
;
// Finds the match preceding `position`: switches to backward searching,
// moves the offset to `position` and delegates to usearch_previous().
// Returns USEARCH_DONE when strsrch is NULL, when *status is already a
// failure, or when usearch_setOffset() reports a failure.
3155 U_CAPI
int32_t U_EXPORT2
usearch_preceding(UStringSearch
*strsrch
,
3159 if (strsrch
&& U_SUCCESS(*status
)) {
3160 strsrch
->search
->isForwardSearching
= FALSE
;
3161 // position checked in usearch_setOffset
3162 usearch_setOffset(strsrch
, position
, status
);
3163 if (U_SUCCESS(*status
)) {
3164 return usearch_previous(strsrch
, status
);
3167 return USEARCH_DONE
;
3171 * If a direction switch is required, we'll count the number of ces till the
3172 * beginning of the collation element iterator and iterate forwards that
3173 * number of times. This is so that we get to the correct point within the
3174 * string to continue the search in. Imagine when we are in the middle of the
3175 * normalization buffer when the change in direction is requested. arrrgghh....
3176 * After searching the offset within the collation element iterator will be
3177 * shifted to the start of the match. If a match is not found, the offset would
3178 * have been set to the end of the text string in the collation element
3180 * Okay, here's my take on normalization buffer. The only time when there can
3181 * be 2 matches within the same normalization is when the pattern consists
3182 * of all accents. But since the offset returned is from the text string, we
3183 * should not confuse the caller by returning the second match within the
3184 * same normalization buffer. If we do, the 2 results will have the same match
3185 * offsets, and that'll be confusing. I'll return the next match that doesn't
3186 * fall within the same normalization buffer. Note this does not affect the
3187 * results of matches spanning the text and the normalization buffer.
3188 * The position to start searching is taken from the collation element
3189 * iterator. Callers of this API would have to set the offset in the collation
3190 * element iterator before using this method.
3192 U_CAPI
int32_t U_EXPORT2
usearch_next(UStringSearch
*strsrch
,
// Returns the index of the next match, or USEARCH_DONE when strsrch is
// NULL, *status is (or becomes) a failure, or no further match exists.
3195 if (U_SUCCESS(*status
) && strsrch
) {
3196 // note offset is either equivalent to the start of the previous match
3197 // or is set by the user
3198 int32_t offset
= usearch_getOffset(strsrch
);
3199 USearch
*search
= strsrch
->search
;
3200 search
->reset
= FALSE
;
3201 int32_t textlength
= search
->textLength
;
3202 if (search
->isForwardSearching
) {
// NOTE(review): two alternative "not enough characters" checks follow;
// presumably they are selected by a preprocessor conditional that is not
// visible in this excerpt - confirm which one is compiled.
3204 if (offset
== textlength
3205 || (!search
->isOverlap
&&
3206 (offset
+ strsrch
->pattern
.defaultShiftSize
> textlength
||
3207 (search
->matchedIndex
!= USEARCH_DONE
&&
3208 offset
+ search
->matchedLength
>= textlength
)))) {
3209 // not enough characters to match
3210 setMatchNotFound(strsrch
);
3211 return USEARCH_DONE
;
3214 if (offset
== textlength
||
3215 (! search
->isOverlap
&&
3216 (search
->matchedIndex
!= USEARCH_DONE
&&
3217 offset
+ search
->matchedLength
> textlength
))) {
3218 // not enough characters to match
3219 setMatchNotFound(strsrch
);
3220 return USEARCH_DONE
;
3225 // switching direction.
3226 // if matchedIndex == USEARCH_DONE, it means that either a
3227 // setOffset has been called or that previous ran off the text
3228 // string. the iterator would have been set to offset 0 if a
3229 // match is not found.
3230 search
->isForwardSearching
= TRUE
;
3231 if (search
->matchedIndex
!= USEARCH_DONE
) {
3232 // there's no need to set the collation element iterator
3233 // the next call to next will set the offset.
3234 return search
->matchedIndex
;
3238 if (U_SUCCESS(*status
)) {
// A pattern with no collation elements "matches" with length 0 at every
// code point position: the first hit is at `offset`, later hits advance
// one code point at a time until the end of the text is reached.
3239 if (strsrch
->pattern
.CELength
== 0) {
3240 if (search
->matchedIndex
== USEARCH_DONE
) {
3241 search
->matchedIndex
= offset
;
3243 else { // moves by codepoints
3244 UTF_FWD_1(search
->text
, search
->matchedIndex
, textlength
);
3247 search
->matchedLength
= 0;
3248 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3249 // status checked below
3250 if (search
->matchedIndex
== textlength
) {
3251 search
->matchedIndex
= USEARCH_DONE
;
3255 if (search
->matchedLength
> 0) {
3256 // if matchlength is 0 we are at the start of the iteration
// overlapping search resumes one unit past the previous match start;
// non-overlapping search resumes right after the previous match
3257 if (search
->isOverlap
) {
3258 ucol_setOffset(strsrch
->textIter
, offset
+ 1, status
);
3261 ucol_setOffset(strsrch
->textIter
,
3262 offset
+ search
->matchedLength
, status
);
3266 // for boundary check purposes. this will ensure that the
3267 // next match will not precede the current offset
3268 // note search->matchedIndex will always be set to something
3270 search
->matchedIndex
= offset
- 1;
3273 if (search
->isCanonicalMatch
) {
3274 // can't use exact here since extra accents are allowed.
3275 usearch_handleNextCanonical(strsrch
, status
);
3278 usearch_handleNextExact(strsrch
, status
);
3282 if (U_FAILURE(*status
)) {
3283 return USEARCH_DONE
;
// leave the text iterator at the match start, or at the end of the text
// when nothing was found
3287 if (search
->matchedIndex
== USEARCH_DONE
) {
3288 ucol_setOffset(strsrch
->textIter
, search
->textLength
, status
);
3290 ucol_setOffset(strsrch
->textIter
, search
->matchedIndex
, status
);
3294 return search
->matchedIndex
;
3297 return USEARCH_DONE
;
// Backward counterpart of usearch_next(): returns the index of the match
// preceding the current position, or USEARCH_DONE when strsrch is NULL,
// *status is (or becomes) a failure, or no earlier match exists.
3300 U_CAPI
int32_t U_EXPORT2
usearch_previous(UStringSearch
*strsrch
,
3303 if (U_SUCCESS(*status
) && strsrch
) {
3305 USearch
*search
= strsrch
->search
;
// right after a reset, a backward search starts from the end of the text
3306 if (search
->reset
) {
3307 offset
= search
->textLength
;
3308 search
->isForwardSearching
= FALSE
;
3309 search
->reset
= FALSE
;
3310 setColEIterOffset(strsrch
->textIter
, offset
);
3313 offset
= usearch_getOffset(strsrch
);
3316 int32_t matchedindex
= search
->matchedIndex
;
3317 if (search
->isForwardSearching
== TRUE
) {
3318 // switching direction.
3319 // if matchedIndex == USEARCH_DONE, it means that either a
3320 // setOffset has been called or that next ran off the text
3321 // string. the iterator would have been set to offset textLength if
3322 // a match is not found.
3323 search
->isForwardSearching
= FALSE
;
3324 if (matchedindex
!= USEARCH_DONE
) {
3325 return matchedindex
;
// NOTE(review): two alternative "not enough characters" checks follow;
// presumably they are selected by a preprocessor conditional that is not
// visible in this excerpt - confirm which one is compiled.
3330 if (offset
== 0 || matchedindex
== 0 ||
3331 (!search
->isOverlap
&&
3332 (offset
< strsrch
->pattern
.defaultShiftSize
||
3333 (matchedindex
!= USEARCH_DONE
&&
3334 matchedindex
< strsrch
->pattern
.defaultShiftSize
)))) {
3335 // not enough characters to match
3336 setMatchNotFound(strsrch
);
3337 return USEARCH_DONE
;
3340 // Could check pattern length, but the
3341 // linear search will do the right thing
3342 if (offset
== 0 || matchedindex
== 0) {
3343 setMatchNotFound(strsrch
);
3344 return USEARCH_DONE
;
3349 if (U_SUCCESS(*status
)) {
// A pattern with no collation elements "matches" with length 0 at every
// code point position, stepping backwards one code point per call.
3350 if (strsrch
->pattern
.CELength
== 0) {
3351 search
->matchedIndex
=
3352 (matchedindex
== USEARCH_DONE
? offset
: matchedindex
);
3353 if (search
->matchedIndex
== 0) {
3354 setMatchNotFound(strsrch
);
3355 // status checked below
3357 else { // move by codepoints
3358 UTF_BACK_1(search
->text
, 0, search
->matchedIndex
);
3359 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3360 // status checked below
3361 search
->matchedLength
= 0;
3366 if (search
->matchedIndex
!= USEARCH_DONE
) {
3367 if (search
->isOverlap
) {
// NOTE(review): the overlap restart position is matchedIndex +
// matchedLength - 2; the reason for the constant 2 is not evident from
// this excerpt - confirm.
3368 ucol_setOffset(strsrch
->textIter
, search
->matchedIndex
+ search
->matchedLength
- 2, status
);
3373 if (strsrch
->search
->isCanonicalMatch
) {
3374 // can't use exact here since extra accents are allowed.
3375 usearch_handlePreviousCanonical(strsrch
, status
);
3376 // status checked below
3379 usearch_handlePreviousExact(strsrch
, status
);
3380 // status checked below
3384 if (U_FAILURE(*status
)) {
3385 return USEARCH_DONE
;
3388 return search
->matchedIndex
;
3391 return USEARCH_DONE
;
// Resets the search object to its initial search state.
// Re-reads strength, alternate handling and variable top from the collator;
// if any of the cached values differ, the cached value is updated and
// initialize() is run again. The text iterator is then re-initialised and
// the match state is cleared (no match, non-overlapping, non-canonical,
// forward direction, reset flag set).
// Errors are collected in a local UErrorCode; the function itself returns
// nothing.
3396 U_CAPI
void U_EXPORT2
usearch_reset(UStringSearch
*strsrch
)
3399 reset is setting the attributes that are already in
3400 string search, hence all attributes in the collator should
3401 be retrieved without any problems
3404 UErrorCode status
= U_ZERO_ERROR
;
3405 UBool sameCollAttribute
= TRUE
;
3410 strsrch
->strength
= ucol_getStrength(strsrch
->collator
);
3411 ceMask
= getMask(strsrch
->strength
);
3412 if (strsrch
->ceMask
!= ceMask
) {
3413 strsrch
->ceMask
= ceMask
;
3414 sameCollAttribute
= FALSE
;
3416 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3417 shift
= ucol_getAttribute(strsrch
->collator
, UCOL_ALTERNATE_HANDLING
,
3418 &status
) == UCOL_SHIFTED
;
3419 if (strsrch
->toShift
!= shift
) {
3420 strsrch
->toShift
= shift
;
3421 sameCollAttribute
= FALSE
;
3424 // if status is a failure, ucol_getVariableTop returns 0
3425 varTop
= ucol_getVariableTop(strsrch
->collator
, &status
);
3426 if (strsrch
->variableTop
!= varTop
) {
3427 strsrch
->variableTop
= varTop
;
3428 sameCollAttribute
= FALSE
;
// pattern-derived tables depend on the attributes above, so rebuild them
// only when one of those attributes actually changed
3430 if (!sameCollAttribute
) {
3431 initialize(strsrch
, &status
);
3433 /* free offset buffer to avoid memory leak before initializing. */
3434 freeOffsetBuffer(&(strsrch
->textIter
->iteratordata_
));
3435 uprv_init_collIterate(strsrch
->collator
, strsrch
->search
->text
,
3436 strsrch
->search
->textLength
,
3437 &(strsrch
->textIter
->iteratordata_
));
3438 strsrch
->search
->matchedLength
= 0;
3439 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
3440 strsrch
->search
->isOverlap
= FALSE
;
3441 strsrch
->search
->isCanonicalMatch
= FALSE
;
3442 strsrch
->search
->isForwardSearching
= TRUE
;
3443 strsrch
->search
->reset
= TRUE
;
3448 // CEI Collation Element + source text index.
3449 // These structs are kept in the circular buffer.
3461 // CEBuffer A circular buffer of CEs from the text being searched.
// Number of CEI slots available in the built-in defBuf array; the
// constructor only allocates from the heap when more slots are needed.
3463 #define DEFAULT_CEBUFFER_SIZE 50
// built-in storage; the destructor compares buf against this array
3465 CEI defBuf
[DEFAULT_CEBUFFER_SIZE
];
// collation element iterator the buffer pulls CEs from (set from
// ss->textIter in the constructor)
3470 UCollationElements
*ceIter
;
3471 UStringSearch
*strSearch
;
3475 CEBuffer(UStringSearch
*ss
, UErrorCode
*status
);
// fetch entry `index`, filling it via ucol_nextProcessed when it is new
3477 const CEI
*get(int32_t index
);
// fetch entry `index`, filling it via ucol_previousProcessed when it is new
3478 const CEI
*getPrevious(int32_t index
);
// Constructs a CE buffer for the search object `ss`.
// The capacity is the pattern's CE count plus 10; the text iterator of `ss`
// becomes the CE source and is prepared with uprv_init_pce().
// When the capacity exceeds DEFAULT_CEBUFFER_SIZE the storage is obtained
// with uprv_malloc; *status is set to U_MEMORY_ALLOCATION_ERROR on the
// failure path of that allocation.
3482 CEBuffer::CEBuffer(UStringSearch
*ss
, UErrorCode
*status
) {
3485 bufSize
= ss
->pattern
.CELength
+10;
3486 ceIter
= ss
->textIter
;
3490 uprv_init_pce(ceIter
);
3492 if (bufSize
>DEFAULT_CEBUFFER_SIZE
) {
3493 buf
= (CEI
*)uprv_malloc(bufSize
* sizeof(CEI
));
3495 *status
= U_MEMORY_ALLOCATION_ERROR
;
3500 // TODO: add a reset or init function so that allocated
3501 // buffers can be retained & reused.
// Destructor: storage needs releasing only when buf does not point at the
// built-in defBuf array.
3503 CEBuffer::~CEBuffer() {
3504 if (buf
!= defBuf
) {
3510 // Get the CE with the specified index.
3511 // Index must be in the range
3512 // n-history_size < index < n+1
3513 // where n is the largest index to have been fetched by some previous call to this function.
3514 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3516 const CEI
*CEBuffer::get(int32_t index
) {
// slot in the circular buffer that holds (or will hold) this index
3517 int i
= index
% bufSize
;
3519 if (index
>=firstIx
&& index
<limitIx
) {
3520 // The request was for an entry already in our buffer.
3525 // Caller is requesting a new, never accessed before, CE.
3526 // Verify that it is the next one in sequence, which is all
3528 if (index
!= limitIx
) {
3534 // Manage the circular CE buffer indexing
3537 if (limitIx
- firstIx
>= bufSize
) {
3538 // The buffer is full, knock out the lowest-indexed entry.
// the iterator status is kept in a local and is not reported to the caller
3542 UErrorCode status
= U_ZERO_ERROR
;
3544 buf
[i
].ce
= ucol_nextProcessed(ceIter
, &buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3549 // Get the CE with the specified index.
3550 // Index must be in the range
3551 // n-history_size < index < n+1
3552 // where n is the largest index to have been fetched by some previous call to this function.
3553 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
// NOTE(review): the wording above is the same as on CEBuffer::get(); in
// this function new entries come from ucol_previousProcessed(), so "end of
// input" presumably means the start of the text - confirm.
3555 const CEI
*CEBuffer::getPrevious(int32_t index
) {
// slot in the circular buffer that holds (or will hold) this index
3556 int i
= index
% bufSize
;
3558 if (index
>=firstIx
&& index
<limitIx
) {
3559 // The request was for an entry already in our buffer.
3564 // Caller is requesting a new, never accessed before, CE.
3565 // Verify that it is the next one in sequence, which is all
3567 if (index
!= limitIx
) {
3573 // Manage the circular CE buffer indexing
3576 if (limitIx
- firstIx
>= bufSize
) {
3577 // The buffer is full, knock out the lowest-indexed entry.
// the iterator status is kept in a local and is not reported to the caller
3581 UErrorCode status
= U_ZERO_ERROR
;
3583 buf
[i
].ce
= ucol_previousProcessed(ceIter
, &buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3591 // #define USEARCH_DEBUG
3593 #ifdef USEARCH_DEBUG
3599 * Find the next break boundary after startIndex. If the UStringSearch object
3600 * has an external break iterator, use that. Otherwise use the internal character
3603 static int32_t nextBoundaryAfter(UStringSearch
*strsrch
, int32_t startIndex
) {
// First variant (hand-rolled): step over one code point, then over any
// following code points whose grapheme-cluster-break property is EXTEND or
// SPACING_MARK.
// NOTE(review): the preprocessor condition guarding this variant is not
// visible in this excerpt; the #elif below is the break-iterator variant.
3605 const UChar
*text
= strsrch
->search
->text
;
3606 int32_t textLen
= strsrch
->search
->textLength
;
3608 U_ASSERT(startIndex
>=0);
3609 U_ASSERT(startIndex
<=textLen
);
3611 if (startIndex
>= textLen
) {
3616 int32_t i
= startIndex
;
3617 U16_NEXT(text
, i
, textLen
, c
);
3619 // If we are on a control character, stop without looking for combining marks.
3620 // Control characters do not combine.
3621 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3622 if (gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
) {
3626 // The initial character was not a control, and can thus accept trailing
3627 // combining characters. Advance over however many of them there are.
3628 int32_t indexOfLastCharChecked
;
3630 indexOfLastCharChecked
= i
;
3634 U16_NEXT(text
, i
, textLen
, c
);
3635 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3636 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3640 return indexOfLastCharChecked
;
3641 #elif !UCONFIG_NO_BREAK_ITERATION
// Break-iterator variant: prefer the caller-supplied iterator and fall back
// to the internal character iterator when none was supplied.
3642 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3644 if (breakiterator
== NULL
) {
3645 breakiterator
= strsrch
->search
->internalBreakIter
;
3648 if (breakiterator
!= NULL
) {
3649 return ubrk_following(breakiterator
, startIndex
);
3654 // **** or should we use the original code? ****
3661 * Returns TRUE if index is on a break boundary. If the UStringSearch
3662 * has an external break iterator, test using that, otherwise test
3663 * using the internal character break iterator.
3665 static UBool
isBreakBoundary(UStringSearch
*strsrch
, int32_t index
) {
// First variant (hand-rolled): decides from the grapheme-cluster-break
// properties of the code point at `index` and of the one before it.
// NOTE(review): the preprocessor condition guarding this variant is not
// visible in this excerpt; the #elif below is the break-iterator variant.
3667 const UChar
*text
= strsrch
->search
->text
;
3668 int32_t textLen
= strsrch
->search
->textLength
;
3671 U_ASSERT(index
<=textLen
);
3673 if (index
>=textLen
|| index
<=0) {
3677 // If the character at the current index is not a GRAPHEME_EXTEND
3678 // then we can not be within a combining sequence.
3680 U16_GET(text
, 0, index
, textLen
, c
);
3681 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3682 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3686 // We are at a combining mark. If the preceding character is anything
3687 // except a CONTROL, CR or LF, we are in a combining sequence.
3688 U16_PREV(text
, 0, index
, c
);
3689 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3690 UBool combining
= !(gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
);
3692 #elif !UCONFIG_NO_BREAK_ITERATION
3693 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3695 if (breakiterator
== NULL
) {
3696 breakiterator
= strsrch
->search
->internalBreakIter
;
// NOTE(review): this expression is TRUE when an iterator exists and
// ubrk_isBoundary() reports that `index` is NOT a boundary - the opposite
// of what the function name and the header comment say. Callers in this
// file test the result with a plain `if (isBreakBoundary(...))` right
// after comments about rejecting a match, so they appear to rely on the
// inverted sense; confirm before changing either side.
3699 return (breakiterator
!= NULL
&& ! ubrk_isBoundary(breakiterator
, index
));
3701 // **** or use the original code? ****
// Checks whether both `start` and `end` are boundaries of the
// caller-supplied break iterator (search->breakIter). Only that external
// iterator is consulted here; positions outside its [first, last] range
// are treated as non-boundaries.
3707 static UBool
onBreakBoundaries(const UStringSearch
*strsrch
, int32_t start
, int32_t end
)
3709 #if !UCONFIG_NO_BREAK_ITERATION
3710 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3712 if (breakiterator
!= NULL
) {
3713 int32_t startindex
= ubrk_first(breakiterator
);
3714 int32_t endindex
= ubrk_last(breakiterator
);
3716 // out-of-range indexes are never boundary positions
3717 if (start
< startindex
|| start
> endindex
||
3718 end
< startindex
|| end
> endindex
) {
3722 return ubrk_isBoundary(breakiterator
, start
) &&
3723 ubrk_isBoundary(breakiterator
, end
);
// Linear forward search in collation-element space.
// Starting at startIdx, compares the pattern's processed CEs (pattern.PCE)
// against CEs pulled from the text through a CEBuffer, one candidate start
// position after another. A CE-level match is then validated in string
// index space (expansion and break-boundary checks) before it is accepted.
// On return, the match bounds are stored through matchStart / matchLimit
// when those pointers are non-NULL.
// Sets *status to U_ILLEGAL_ARGUMENT_ERROR when the sanity check on the
// pattern and startIdx fails.
3732 U_CAPI UBool U_EXPORT2
usearch_search(UStringSearch
*strsrch
,
3734 int32_t *matchStart
,
3735 int32_t *matchLimit
,
3738 if (U_FAILURE(*status
)) {
3742 // TODO: reject search patterns beginning with a combining char.
3744 #ifdef USEARCH_DEBUG
3745 if (getenv("USEARCH_DEBUG") != NULL
) {
3746 printf("Pattern CEs\n");
3747 for (int ii
=0; ii
<strsrch
->pattern
.CELength
; ii
++) {
3748 printf(" %8x", strsrch
->pattern
.CE
[ii
]);
3754 // Input parameter sanity check.
3755 // TODO: should input indices clip to the text length
3756 // in the same way that UText does.
3757 if(strsrch
->pattern
.CELength
== 0 ||
3759 startIdx
> strsrch
->search
->textLength
||
3760 strsrch
->pattern
.CE
== NULL
) {
3761 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// the processed-CE table for the pattern is built lazily on first use
3765 if (strsrch
->pattern
.PCE
== NULL
) {
3766 initializePatternPCETable(strsrch
, status
);
3769 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
3770 CEBuffer
ceb(strsrch
, status
);
3773 int32_t targetIx
= 0;
3774 const CEI
*targetCEI
;
3778 int32_t mStart
= -1;
3779 int32_t mLimit
= -1;
3785 // Outer loop moves over match starting positions in the
3787 for(targetIx
=0; ; targetIx
++)
3790 // Inner loop checks for a match beginning at each
3791 // position from the outer loop.
3792 for (patIx
=0; patIx
<strsrch
->pattern
.CELength
; patIx
++) {
3793 int64_t patCE
= strsrch
->pattern
.PCE
[patIx
];
3794 targetCEI
= ceb
.get(targetIx
+patIx
);
3795 // Compare CE from target string with CE from the pattern.
3796 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
3797 // which will fail the compare, below.
3798 if (targetCEI
->ce
!= patCE
) {
3804 if (!found
&& targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
3805 // No match at this targetIx. Try again at the next.
3810 // No match at all, we have run off the end of the target text.
3815 // We have found a match in CE space.
3816 // Now determine the bounds in string index space.
3817 // There still is a chance of match failure if the CE range does not correspond to
3818 // an acceptable character range.
3820 const CEI
*firstCEI
= ceb
.get(targetIx
);
3821 const CEI
*lastCEI
= ceb
.get(targetIx
+ strsrch
->pattern
.CELength
- 1);
3822 const CEI
*nextCEI
= ceb
.get(targetIx
+ strsrch
->pattern
.CELength
);
3824 // targetCEI = ceb.get(targetIx+strsrch->pattern.CELength);
3825 // maxLimit = targetCEI->lowIndex;
3826 mStart
= firstCEI
->lowIndex
;
3827 minLimit
= lastCEI
->lowIndex
;
3828 maxLimit
= nextCEI
->lowIndex
;
3830 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3831 // extended to the end of input, and the match is good.
3833 // Look at the high and low indices of the CE following the match. If
3834 // they are the same it means one of two things:
3835 // 1. The match extended to the last CE from the target text, which is OK, or
3836 // 2. The last CE that was part of the match is in an expansion that extends
3837 // to the first CE after the match. In this case, we reject the match.
3838 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
3843 // Check for the start of the match being within a combining sequence.
3844 // This can happen if the pattern itself begins with a combining char, and
3845 // the match found combining marks in the target text that were attached
3846 // to something else.
3847 // This type of match should be rejected for not completely consuming a
3848 // combining sequence.
3849 if (isBreakBoundary(strsrch
, mStart
)) {
3853 // Check for the start of the match being within a Collation Element Expansion,
3854 // meaning that the first char of the match is only partially matched.
3855 // With expansions, the first CE will report the index of the source
3856 // character, and all subsequent (expansions) CEs will report the source index of the
3857 // _following_ character.
3858 int32_t secondIx
= firstCEI
->highIndex
;
3859 if (mStart
== secondIx
) {
3863 // Advance the match end position to the first acceptable match boundary.
3864 // This advances the index over any combining characters.
3866 if (minLimit
< maxLimit
) {
3867 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
3869 if (nba
>= lastCEI
->highIndex
) {
3874 #ifdef USEARCH_DEBUG
3875 if (getenv("USEARCH_DEBUG") != NULL
) {
3876 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
3880 // If advancing to the end of a combining sequence in character indexing space
3881 // advanced us beyond the end of the match in CE space, reject this match.
3882 if (mLimit
> maxLimit
) {
3886 if (isBreakBoundary(strsrch
, mLimit
)) {
3895 #ifdef USEARCH_DEBUG
// NOTE(review): this debug-only dump reads CEI::srcIndex, while the rest of
// this function uses lowIndex/highIndex - confirm it still compiles when
// USEARCH_DEBUG is defined.
3896 if (getenv("USEARCH_DEBUG") != NULL
) {
3897 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
3898 int32_t lastToPrint
= ceb
.limitIx
+2;
3899 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
3900 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
3902 printf("\n%s\n", found
? "match found" : "no match");
3906 // All Done. Store back the match bounds to the caller.
3913 if (matchStart
!= NULL
) {
3914 *matchStart
= mStart
;
3917 if (matchLimit
!= NULL
) {
3918 *matchLimit
= mLimit
;
// Linear backward search in collation-element space; mirror image of
// usearch_search(). CEs are pulled with CEBuffer::getPrevious(), so a
// larger buffer index means an earlier position in the text, and the
// pattern is compared from its last CE to its first.
// A CE-level match is validated in string index space (expansion and
// break-boundary checks) before it is accepted. On return, the match
// bounds are stored through matchStart / matchLimit when those pointers
// are non-NULL.
// Sets *status to U_ILLEGAL_ARGUMENT_ERROR when the sanity check on the
// pattern and startIdx fails.
3925 U_CAPI UBool U_EXPORT2
usearch_searchBackwards(UStringSearch
*strsrch
,
3927 int32_t *matchStart
,
3928 int32_t *matchLimit
,
3931 if (U_FAILURE(*status
)) {
3935 // TODO: reject search patterns beginning with a combining char.
3937 #ifdef USEARCH_DEBUG
3938 if (getenv("USEARCH_DEBUG") != NULL
) {
3939 printf("Pattern CEs\n");
3940 for (int ii
=0; ii
<strsrch
->pattern
.CELength
; ii
++) {
3941 printf(" %8x", strsrch
->pattern
.CE
[ii
]);
3947 // Input parameter sanity check.
3948 // TODO: should input indices clip to the text length
3949 // in the same way that UText does.
3950 if(strsrch
->pattern
.CELength
== 0 ||
3952 startIdx
> strsrch
->search
->textLength
||
3953 strsrch
->pattern
.CE
== NULL
) {
3954 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// the processed-CE table for the pattern is built lazily on first use
3958 if (strsrch
->pattern
.PCE
== NULL
) {
3959 initializePatternPCETable(strsrch
, status
);
3962 CEBuffer
ceb(strsrch
, status
);
3963 int32_t targetIx
= 0;
3966 * Pre-load the buffer with the CE's for the grapheme
3967 * after our starting position so that we're sure that
3968 * we can look at the CE following the match when we
3969 * check the match boundaries.
3971 * This will also pre-fetch the first CE that we'll
3972 * consider for the match.
3974 if (startIdx
< strsrch
->search
->textLength
) {
// NOTE(review): the internal break iterator is used here directly, without
// a NULL check and without preferring search->breakIter as
// nextBoundaryAfter() does - confirm this is intended.
3975 UBreakIterator
*bi
= strsrch
->search
->internalBreakIter
;
3976 int32_t next
= ubrk_following(bi
, startIdx
);
3978 ucol_setOffset(strsrch
->textIter
, next
, status
);
// pull CEs backwards until one that starts before startIdx is reached
3980 for (targetIx
= 0; ; targetIx
+= 1) {
3981 if (ceb
.getPrevious(targetIx
)->lowIndex
< startIdx
) {
3986 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
3990 const CEI
*targetCEI
;
3994 int32_t limitIx
= targetIx
;
3995 int32_t mStart
= -1;
3996 int32_t mLimit
= -1;
4002 // Outer loop moves over match starting positions in the
4004 for(targetIx
= limitIx
; ; targetIx
+= 1)
4007 // Inner loop checks for a match beginning at each
4008 // position from the outer loop.
4009 for (patIx
= strsrch
->pattern
.CELength
- 1; patIx
>= 0; patIx
-= 1) {
4010 int64_t patCE
= strsrch
->pattern
.PCE
[patIx
];
4012 targetCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.CELength
- 1 - patIx
);
4013 // Compare CE from target string with CE from the pattern.
4014 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4015 // which will fail the compare, below.
4016 if (targetCEI
->ce
!= patCE
) {
4022 if (!found
&& targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
4023 // No match at this targetIx. Try again at the next.
4028 // No match at all, we have run off the end of the target text.
4033 // We have found a match in CE space.
4034 // Now determine the bounds in string index space.
4035 // There still is a chance of match failure if the CE range does not correspond to
4036 // an acceptable character range.
4038 const CEI
*firstCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.CELength
- 1);
4039 const CEI
*lastCEI
= ceb
.getPrevious(targetIx
);
// nextCEI is NULL when targetIx is 0, i.e. no CE after the match has been
// buffered; every later use of nextCEI is guarded by targetIx > 0 / >= 1
4040 const CEI
*nextCEI
= targetIx
> 0? ceb
.getPrevious(targetIx
- 1) : NULL
;
4042 mStart
= firstCEI
->lowIndex
;
4043 minLimit
= lastCEI
->lowIndex
;
4044 maxLimit
= targetIx
> 0? nextCEI
->lowIndex
: lastCEI
->highIndex
;
4046 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4047 // extended to the end of input, and the match is good.
4049 // Look at the high and low indices of the CE following the match. If
4050 // they are the same it means one of two things:
4051 // 1. The match extended to the last CE from the target text, which is OK, or
4052 // 2. The last CE that was part of the match is in an expansion that extends
4053 // to the first CE after the match. In this case, we reject the match.
4054 if (targetIx
>= 1) {
4055 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
4061 // Check for the start of the match being within a combining sequence.
4062 // This can happen if the pattern itself begins with a combining char, and
4063 // the match found combining marks in the target text that were attached
4064 // to something else.
4065 // This type of match should be rejected for not completely consuming a
4066 // combining sequence.
4067 if (isBreakBoundary(strsrch
, mStart
)) {
4071 // Look at the high index of the first CE in the match. If it's the same as the
4072 // low index, the first CE in the match is in the middle of an expansion.
4073 if (mStart
== firstCEI
->highIndex
) {
4077 // Advance the match end position to the first acceptable match boundary.
4078 // This advances the index over any combining characters.
4080 if (/*targetIx > 0 &&*/ minLimit
< maxLimit
) {
4081 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4083 if (nba
>= lastCEI
->highIndex
) {
4088 #ifdef USEARCH_DEBUG
4089 if (getenv("USEARCH_DEBUG") != NULL
) {
4090 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4094 // If advancing to the end of a combining sequence in character indexing space
4095 // advanced us beyond the end of the match in CE space, reject this match.
4096 if (mLimit
> maxLimit
) {
4100 // Make sure the end of the match is on a break boundary
4101 if (isBreakBoundary(strsrch
, mLimit
)) {
4110 #ifdef USEARCH_DEBUG
// NOTE(review): this debug-only dump calls ceb.get() although the buffer
// was filled with getPrevious(), and reads CEI::srcIndex while the rest of
// this function uses lowIndex/highIndex - confirm it still compiles and is
// meaningful when USEARCH_DEBUG is defined.
4111 if (getenv("USEARCH_DEBUG") != NULL
) {
4112 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4113 int32_t lastToPrint
= ceb
.limitIx
+2;
4114 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4115 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4117 printf("\n%s\n", found
? "match found" : "no match");
4121 // All Done. Store back the match bounds to the caller.
4128 if (matchStart
!= NULL
) {
4129 *matchStart
= mStart
;
4132 if (matchLimit
!= NULL
) {
4133 *matchLimit
= mLimit
;
4142 // internal use methods declared in usrchimp.h -----------------------------
// Internal worker behind usearch_next() for exact (non-canonical) matching.
// On success the match is recorded in strsrch->search (matchedIndex /
// matchedLength); otherwise setMatchNotFound() is called.
// NOTE(review): two implementations are visible below - a shift-based scan
// that walks CEs backwards from a candidate end offset, and a shorter one
// that delegates to usearch_search(). Presumably a preprocessor conditional
// not visible in this excerpt selects one of them - confirm.
4144 UBool
usearch_handleNextExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4146 if (U_FAILURE(*status
)) {
4147 setMatchNotFound(strsrch
);
4152 UCollationElements
*coleiter
= strsrch
->textIter
;
4153 int32_t textlength
= strsrch
->search
->textLength
;
4154 int32_t *patternce
= strsrch
->pattern
.CE
;
4155 int32_t patterncelength
= strsrch
->pattern
.CELength
;
4156 int32_t textoffset
= ucol_getOffset(coleiter
);
4158 // status used in setting coleiter offset, since offset is checked in
4159 // shiftForward before setting the coleiter offset, status never
4161 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4163 while (textoffset
<= textlength
)
// NOTE(review): this index is unsigned here, whereas the canonical variant
// of this routine declares the same variable as int32_t - confirm the
// difference is intentional.
4165 uint32_t patternceindex
= patterncelength
- 1;
4167 UBool found
= FALSE
;
4168 int32_t lastce
= UCOL_NULLORDER
;
4170 setColEIterOffset(coleiter
, textoffset
);
4173 // finding the last pattern ce match, imagine composite characters
4174 // for example: search for pattern A in text \u00C0
4175 // we'll have to skip \u0300 the grave first before we get to A
4176 targetce
= ucol_previous(coleiter
, status
);
4177 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4181 targetce
= getCE(strsrch
, targetce
);
4182 if (targetce
== UCOL_IGNORABLE
&& inNormBuf(coleiter
)) {
4183 // this is for the text \u0315\u0300 that requires
4184 // normalization and pattern \u0300, where \u0315 is ignorable
4187 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4190 if (targetce
== patternce
[patternceindex
]) {
4191 // the first ce can be a contraction
4195 if (!hasExpansion(coleiter
)) {
4201 //targetce = lastce;
// the last pattern CE matched; now walk the remaining pattern CEs backwards
4203 while (found
&& patternceindex
> 0) {
4205 targetce
= ucol_previous(coleiter
, status
);
4206 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4210 targetce
= getCE(strsrch
, targetce
);
4211 if (targetce
== UCOL_IGNORABLE
) {
4216 found
= found
&& targetce
== patternce
[patternceindex
];
4222 if (U_FAILURE(*status
)) {
4225 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4227 // status checked at loop.
4228 patternceindex
= patterncelength
;
4232 if (checkNextExactMatch(strsrch
, &textoffset
, status
)) {
4233 // status checked in ucol_setOffset
4234 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4238 setMatchNotFound(strsrch
);
// usearch_search()-based implementation: search from the iterator's
// current offset and record the match bounds on success
4241 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4245 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4246 strsrch
->search
->matchedIndex
= start
;
4247 strsrch
->search
->matchedLength
= end
- start
;
4250 setMatchNotFound(strsrch
);
// Internal worker behind usearch_next() for canonical matching (extra
// accents in the text are tolerated). On success the match is recorded in
// strsrch->search (matchedIndex / matchedLength); otherwise
// setMatchNotFound() is called.
// NOTE(review): two implementations are visible below - a shift-based scan
// that walks CEs backwards from a candidate end offset, and a shorter one
// that delegates to usearch_search(). Presumably a preprocessor conditional
// not visible in this excerpt selects one of them - confirm.
4256 UBool
usearch_handleNextCanonical(UStringSearch
*strsrch
, UErrorCode
*status
)
4258 if (U_FAILURE(*status
)) {
4259 setMatchNotFound(strsrch
);
4264 UCollationElements
*coleiter
= strsrch
->textIter
;
4265 int32_t textlength
= strsrch
->search
->textLength
;
4266 int32_t *patternce
= strsrch
->pattern
.CE
;
4267 int32_t patterncelength
= strsrch
->pattern
.CELength
;
4268 int32_t textoffset
= ucol_getOffset(coleiter
);
4269 UBool hasPatternAccents
=
4270 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4272 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
// start with empty rearranged-accent buffers
4274 strsrch
->canonicalPrefixAccents
[0] = 0;
4275 strsrch
->canonicalSuffixAccents
[0] = 0;
4277 while (textoffset
<= textlength
)
4279 int32_t patternceindex
= patterncelength
- 1;
4281 UBool found
= FALSE
;
4282 int32_t lastce
= UCOL_NULLORDER
;
4284 setColEIterOffset(coleiter
, textoffset
);
4287 // finding the last pattern ce match, imagine composite characters
4288 // for example: search for pattern A in text \u00C0
4289 // we'll have to skip \u0300 the grave first before we get to A
4290 targetce
= ucol_previous(coleiter
, status
);
4291 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4295 targetce
= getCE(strsrch
, targetce
);
4296 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4299 if (targetce
== patternce
[patternceindex
]) {
4300 // the first ce can be a contraction
4304 if (!hasExpansion(coleiter
)) {
// the last pattern CE matched; now walk the remaining pattern CEs backwards
4310 while (found
&& patternceindex
> 0) {
4311 targetce
= ucol_previous(coleiter
, status
);
4312 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4316 targetce
= getCE(strsrch
, targetce
);
4317 if (targetce
== UCOL_IGNORABLE
) {
4322 found
= found
&& targetce
== patternce
[patternceindex
];
4325 // initializing the rearranged accent array
// when the plain CE comparison failed but the pattern carries accents,
// retry with the accent-rearranging matcher
4326 if (hasPatternAccents
&& !found
) {
4327 strsrch
->canonicalPrefixAccents
[0] = 0;
4328 strsrch
->canonicalSuffixAccents
[0] = 0;
4329 if (U_FAILURE(*status
)) {
4332 found
= doNextCanonicalMatch(strsrch
, textoffset
, status
);
4336 if (U_FAILURE(*status
)) {
4339 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4341 // status checked at loop
4342 patternceindex
= patterncelength
;
4346 if (checkNextCanonicalMatch(strsrch
, &textoffset
, status
)) {
4347 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4351 setMatchNotFound(strsrch
);
// usearch_search()-based implementation: search from the iterator's
// current offset and record the match bounds on success
4354 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4358 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4359 strsrch
->search
->matchedIndex
= start
;
4360 strsrch
->search
->matchedLength
= end
- start
;
4363 setMatchNotFound(strsrch
);
// Backward search handler for exact matching (per the function name).
// A failing *status on entry marks the search as not found through
// setMatchNotFound() before any text is examined.
// Two strategies are visible in the body: a reverse-shift scan that
// compares pattern collation elements (CEs) forwards from a candidate
// start offset, and a path that delegates to usearch_searchBackwards().
// NOTE(review): braces, return statements, some local declarations and any
// preprocessor conditional selecting between the two strategies are not
// visible in this excerpt; confirm against the full file.
// @param strsrch string search data; its text iterator, pattern CE table,
//                strength and match fields are used here
// @param status  in/out error code, tested on entry and during the scan
// @return UBool; the return statements are not visible in this excerpt
4369 UBool
usearch_handlePreviousExact(UStringSearch
*strsrch
, UErrorCode
*status
)
// Entry guard: a failing status records that there is no match.
4371 if (U_FAILURE(*status
)) {
4372 setMatchNotFound(strsrch
);
// Local copies of the text iterator and pattern CE table.
4377 UCollationElements
*coleiter
= strsrch
->textIter
;
4378 int32_t *patternce
= strsrch
->pattern
.CE
;
4379 int32_t patterncelength
= strsrch
->pattern
.CELength
;
// Default starting point is the current offset of the text iterator.
4380 int32_t textoffset
= ucol_getOffset(coleiter
);
4382 // shifting it check for setting offset
4383 // if setOffset is called previously or there was no previous match, we
4384 // leave the offset as it is.
// When a previous match is recorded (matchedIndex is not USEARCH_DONE),
// the scan restarts from that match index instead.
4385 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4386 textoffset
= strsrch
->search
->matchedIndex
;
// Initial reverse shift, with UCOL_NULLORDER passed as the CE argument.
// NOTE(review): the last argument of this call is not visible here.
4389 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
// Try candidate offsets for as long as they are not negative.
4392 while (textoffset
>= 0)
// Index 1 is where the second comparison loop below begins; patternce[0]
// is compared separately before it.
4394 int32_t patternceindex
= 1;
4396 UBool found
= FALSE
;
4397 int32_t firstce
= UCOL_NULLORDER
;
4399 // if status is a failure, ucol_setOffset does nothing
// NOTE(review): the call below is setColEIterOffset(), to which no status
// argument is passed; the remark above looks stale.
4400 setColEIterOffset(coleiter
, textoffset
);
4403 // finding the first pattern ce match, imagine composite
4404 // characters. for example: search for pattern \u0300 in text
4405 // \u00C0, we'll have to skip A first before we get to
4406 // \u0300 the grave accent
4407 targetce
= ucol_next(coleiter
, status
);
// Stop condition: iterator error or no further CE available.
4408 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
// Pass the raw CE through getCE() before comparing it.
4412 targetce
= getCE(strsrch
, targetce
);
// NOTE(review): the body of this test is not visible; presumably it
// records targetce in firstce while firstce is still unset or ignorable.
4413 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
// Special handling for an ignorable text CE when the strength is not
// UCOL_PRIMARY. NOTE(review): the body of this test is not visible.
4416 if (targetce
== UCOL_IGNORABLE
&& strsrch
->strength
!= UCOL_PRIMARY
) {
// Compare the text CE against the first pattern CE.
4419 if (targetce
== patternce
[0]) {
// NOTE(review): body not visible; this tests whether the iterator has no
// expansion CEs at this point, as reported by hasExpansion().
4423 if (!hasExpansion(coleiter
)) {
4424 // checking for accents in composite character
4430 //targetce = firstce;
// Compare the remaining pattern CEs while walking the text forwards.
4432 while (found
&& (patternceindex
< patterncelength
)) {
4434 targetce
= ucol_next(coleiter
, status
);
4435 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4439 targetce
= getCE(strsrch
, targetce
);
// NOTE(review): body not visible; presumably ignorable text CEs are
// skipped without consuming a pattern CE.
4440 if (targetce
== UCOL_IGNORABLE
) {
// found stays TRUE only while the text CE equals the indexed pattern CE.
// NOTE(review): the statement that changes patternceindex inside this loop
// is not visible in this excerpt.
4444 found
= found
&& targetce
== patternce
[patternceindex
];
// NOTE(review): the enclosing condition for the next statements is not
// visible; presumably they run only when no match was found.
4451 if (U_FAILURE(*status
)) {
// Move the candidate offset with reverseShift(); note that targetce, not
// firstce, is the CE argument passed here.
4455 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
// A CE-level match is confirmed by checkPreviousExactMatch(); on success
// the iterator is repositioned at textoffset.
4461 if (checkPreviousExactMatch(strsrch
, &textoffset
, status
)) {
4462 setColEIterOffset(coleiter
, textoffset
);
// Record that no match was found.
4466 setMatchNotFound(strsrch
);
// Delegating path: search backwards from the current iterator offset with
// usearch_searchBackwards() and store the bounds as index and length.
// NOTE(review): the declarations of start and end are not visible here.
4469 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4473 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4474 strsrch
->search
->matchedIndex
= start
;
4475 strsrch
->search
->matchedLength
= end
- start
;
// NOTE(review): presumably the failure branch of the test above.
4478 setMatchNotFound(strsrch
);
// Backward search handler for canonical matching (per the function name).
// A failing *status on entry marks the search as not found through
// setMatchNotFound() before any text is examined.
// Two strategies are visible in the body: a reverse-shift scan that
// compares pattern collation elements (CEs) forwards from a candidate
// start offset, and a path that delegates to usearch_searchBackwards().
// NOTE(review): the second parameter line of the signature, braces, return
// statements, some local declarations and any preprocessor conditional
// selecting between the two strategies are not visible in this excerpt;
// confirm against the full file.
// @param strsrch string search data; its text iterator, pattern CE table,
//                canonical accent buffers and match fields are used here
// @param status  error code dereferenced throughout the body; its
//                declaration line is not visible in this excerpt
// @return UBool; the return statements are not visible in this excerpt
4484 UBool
usearch_handlePreviousCanonical(UStringSearch
*strsrch
,
// Entry guard: a failing status records that there is no match.
4487 if (U_FAILURE(*status
)) {
4488 setMatchNotFound(strsrch
);
// Local copies of the text iterator and pattern CE table.
4493 UCollationElements
*coleiter
= strsrch
->textIter
;
4494 int32_t *patternce
= strsrch
->pattern
.CE
;
4495 int32_t patterncelength
= strsrch
->pattern
.CELength
;
// Default starting point is the current offset of the text iterator.
4496 int32_t textoffset
= ucol_getOffset(coleiter
);
// TRUE when either pattern accent flag (suffix or prefix) is set.
4497 UBool hasPatternAccents
=
4498 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4500 // shifting it check for setting offset
4501 // if setOffset is called previously or there was no previous match, we
4502 // leave the offset as it is.
// When a previous match is recorded (matchedIndex is not USEARCH_DONE),
// the scan restarts from that match index instead.
4503 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4504 textoffset
= strsrch
->search
->matchedIndex
;
// Initial reverse shift, with UCOL_NULLORDER passed as the CE argument.
// NOTE(review): the last argument of this call is not visible here.
4507 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
// Clear the first element of both canonical accent buffers.
4509 strsrch
->canonicalPrefixAccents
[0] = 0;
4510 strsrch
->canonicalSuffixAccents
[0] = 0;
// Try candidate offsets for as long as they are not negative.
4512 while (textoffset
>= 0)
// Index 1 is where the second comparison loop below begins; patternce[0]
// is compared separately before it.
4514 int32_t patternceindex
= 1;
4516 UBool found
= FALSE
;
4517 int32_t firstce
= UCOL_NULLORDER
;
// Position the iterator at the candidate offset; ucol_next() below then
// reads text CEs forwards from there.
4519 setColEIterOffset(coleiter
, textoffset
);
4521 // finding the first pattern ce match, imagine composite
4522 // characters. for example: search for pattern \u0300 in text
4523 // \u00C0, we'll have to skip A first before we get to
4524 // \u0300 the grave accent
4525 targetce
= ucol_next(coleiter
, status
);
// Stop condition: iterator error or no further CE available.
4526 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
// Pass the raw CE through getCE() before comparing it.
4530 targetce
= getCE(strsrch
, targetce
);
// NOTE(review): the body of this test is not visible; presumably it
// records targetce in firstce while firstce is still unset or ignorable.
4531 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
// Compare the text CE against the first pattern CE.
4535 if (targetce
== patternce
[0]) {
4536 // the first ce can be a contraction
// NOTE(review): body not visible; this tests whether the iterator has no
// expansion CEs at this point, as reported by hasExpansion().
4540 if (!hasExpansion(coleiter
)) {
4541 // checking for accents in composite character
// Compare the remaining pattern CEs while walking the text forwards.
4549 while (found
&& patternceindex
< patterncelength
) {
4550 targetce
= ucol_next(coleiter
, status
);
4551 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4555 targetce
= getCE(strsrch
, targetce
);
// NOTE(review): body not visible; presumably ignorable text CEs are
// skipped without consuming a pattern CE.
4556 if (targetce
== UCOL_IGNORABLE
) {
// found stays TRUE only while the text CE equals the indexed pattern CE.
// NOTE(review): the statement that changes patternceindex inside this loop
// is not visible in this excerpt.
4560 found
= found
&& targetce
== patternce
[patternceindex
];
// The plain CE comparison failed and the pattern has accents: reset the
// accent buffers, test status, then retry with doPreviousCanonicalMatch().
4564 // initializing the rearranged accent array
4565 if (hasPatternAccents
&& !found
) {
4566 strsrch
->canonicalPrefixAccents
[0] = 0;
4567 strsrch
->canonicalSuffixAccents
[0] = 0;
4568 if (U_FAILURE(*status
)) {
4571 found
= doPreviousCanonicalMatch(strsrch
, textoffset
, status
);
// NOTE(review): the enclosing condition for the next statements is not
// visible; presumably they run only when no match was found.
4575 if (U_FAILURE(*status
)) {
// Move the candidate offset with reverseShift(); note that targetce, not
// firstce, is the CE argument passed here.
4578 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
// A CE-level match is confirmed by checkPreviousCanonicalMatch(); on
// success the iterator is repositioned at textoffset.
4584 if (checkPreviousCanonicalMatch(strsrch
, &textoffset
, status
)) {
4585 setColEIterOffset(coleiter
, textoffset
);
// Record that no match was found.
4589 setMatchNotFound(strsrch
);
// Delegating path: search backwards from the current iterator offset with
// usearch_searchBackwards() and store the bounds as index and length.
// NOTE(review): the declarations of start and end are not visible here.
4592 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4596 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4597 strsrch
->search
->matchedIndex
= start
;
4598 strsrch
->search
->matchedLength
= end
- start
;
// NOTE(review): presumably the failure branch of the test above.
4601 setMatchNotFound(strsrch
);
4607 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */