2 **********************************************************************
3 * Copyright (C) 2001-2015 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unicode/utf16.h"
18 #include "normalizer2impl.h"
27 // don't use Boyer-Moore
28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
31 // internal definition ---------------------------------------------------
33 #define LAST_BYTE_MASK_ 0xFF
34 #define SECOND_LAST_BYTE_SHIFT_ 8
35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
37 static const Normalizer2Impl
*g_nfcImpl
= NULL
;
39 // internal methods -------------------------------------------------
42 * Fast collation element iterator setOffset.
43 * This function does not check for bounds.
44 * @param coleiter collation element iterator
45 * @param offset to set
48 inline void setColEIterOffset(UCollationElements
*elems
,
51 // Note: Not "fast" any more after the 2013 collation rewrite.
52 // We do not want to expose more internals than necessary.
53 UErrorCode status
= U_ZERO_ERROR
;
54 ucol_setOffset(elems
, offset
, &status
);
58 * Getting the mask for collation strength
59 * @param strength collation strength
60 * @return collation element mask
63 inline uint32_t getMask(UCollationStrength strength
)
68 return UCOL_PRIMARYORDERMASK
;
70 return UCOL_SECONDARYORDERMASK
| UCOL_PRIMARYORDERMASK
;
72 return UCOL_TERTIARYORDERMASK
| UCOL_SECONDARYORDERMASK
|
73 UCOL_PRIMARYORDERMASK
;
78 * This is to squeeze the 21bit ces into a 256 table
79 * @param ce collation element
80 * @return collapsed version of the collation element
83 inline int hash(uint32_t ce
)
85 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
86 // well with the new collation where most of the latin 1 characters
87 // are of the value xx000xxx. their hashes will most of the time be 0
88 // to be discussed on the hash algo.
89 return UCOL_PRIMARYORDER(ce
) % MAX_TABLE_SIZE_
;
93 static UBool U_CALLCONV
94 usearch_cleanup(void) {
101 * Initializing the fcd tables.
102 * Internal method, status assumed to be a success.
103 * @param status output error if any, caller to check status before calling
104 * method, status assumed to be success when passed in.
107 inline void initializeFCD(UErrorCode
*status
)
109 if (g_nfcImpl
== NULL
) {
110 g_nfcImpl
= Normalizer2Factory::getNFCImpl(*status
);
111 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH
, usearch_cleanup
);
116 * Gets the fcd value for a character at the argument index.
117 * This method takes into accounts of the supplementary characters.
118 * @param str UTF16 string where character for fcd retrieval resides
119 * @param offset position of the character whose fcd is to be retrieved, to be
120 * overwritten with the next character position, taking
121 * surrogate characters into consideration.
122 * @param strlength length of the argument string
126 uint16_t getFCD(const UChar
*str
, int32_t *offset
,
129 const UChar
*temp
= str
+ *offset
;
130 uint16_t result
= g_nfcImpl
->nextFCD16(temp
, str
+ strlength
);
131 *offset
= (int32_t)(temp
- str
);
136 * Getting the modified collation elements taking into account the collation
138 * @param strsrch string search data
140 * @return the modified collation element
143 inline int32_t getCE(const UStringSearch
*strsrch
, uint32_t sourcece
)
145 // note for tertiary we can't use the collator->tertiaryMask, that
146 // is a preprocessed mask that takes into account case options. since
147 // we are only concerned with exact matches, we don't need that.
148 sourcece
&= strsrch
->ceMask
;
150 if (strsrch
->toShift
) {
151 // alternate handling here, since only the 16 most significant digits
152 // is only used, we can safely do a compare without masking
153 // if the ce is a variable, we mask and get only the primary values
154 // no shifting to quartenary is required since all primary values
155 // less than variabletop will need to be masked off anyway.
156 if (strsrch
->variableTop
> sourcece
) {
157 if (strsrch
->strength
>= UCOL_QUATERNARY
) {
158 sourcece
&= UCOL_PRIMARYORDERMASK
;
161 sourcece
= UCOL_IGNORABLE
;
164 } else if (strsrch
->strength
>= UCOL_QUATERNARY
&& sourcece
== UCOL_IGNORABLE
) {
172 * Allocate a memory and returns NULL if it failed.
173 * Internal method, status assumed to be a success.
174 * @param size to allocate
175 * @param status output error if any, caller to check status before calling
176 * method, status assumed to be success when passed in.
177 * @return newly allocated array, NULL otherwise
180 inline void * allocateMemory(uint32_t size
, UErrorCode
*status
)
182 uint32_t *result
= (uint32_t *)uprv_malloc(size
);
183 if (result
== NULL
) {
184 *status
= U_MEMORY_ALLOCATION_ERROR
;
190 * Adds a uint32_t value to a destination array.
191 * Creates a new array if we run out of space. The caller will have to
192 * manually deallocate the newly allocated array.
193 * Internal method, status assumed to be success, caller has to check status
194 * before calling this method. destination not to be NULL and has at least
195 * size destinationlength.
196 * @param destination target array
197 * @param offset destination offset to add value
198 * @param destinationlength target array size, return value for the new size
199 * @param value to be added
200 * @param increments incremental size expected
201 * @param status output error if any, caller to check status before calling
202 * method, status assumed to be success when passed in.
203 * @return new destination array, destination if there was no new allocation
206 inline int32_t * addTouint32_tArray(int32_t *destination
,
208 uint32_t *destinationlength
,
213 uint32_t newlength
= *destinationlength
;
214 if (offset
+ 1 == newlength
) {
215 newlength
+= increments
;
216 int32_t *temp
= (int32_t *)allocateMemory(
217 sizeof(int32_t) * newlength
, status
);
218 if (U_FAILURE(*status
)) {
221 uprv_memcpy(temp
, destination
, sizeof(int32_t) * offset
);
222 *destinationlength
= newlength
;
225 destination
[offset
] = value
;
230 * Adds a uint64_t value to a destination array.
231 * Creates a new array if we run out of space. The caller will have to
232 * manually deallocate the newly allocated array.
233 * Internal method, status assumed to be success, caller has to check status
234 * before calling this method. destination not to be NULL and has at least
235 * size destinationlength.
236 * @param destination target array
237 * @param offset destination offset to add value
238 * @param destinationlength target array size, return value for the new size
239 * @param value to be added
240 * @param increments incremental size expected
241 * @param status output error if any, caller to check status before calling
242 * method, status assumed to be success when passed in.
243 * @return new destination array, destination if there was no new allocation
246 inline int64_t * addTouint64_tArray(int64_t *destination
,
248 uint32_t *destinationlength
,
253 uint32_t newlength
= *destinationlength
;
254 if (offset
+ 1 == newlength
) {
255 newlength
+= increments
;
256 int64_t *temp
= (int64_t *)allocateMemory(
257 sizeof(int64_t) * newlength
, status
);
259 if (U_FAILURE(*status
)) {
263 uprv_memcpy(temp
, destination
, sizeof(int64_t) * offset
);
264 *destinationlength
= newlength
;
268 destination
[offset
] = value
;
274 * Initializing the ce table for a pattern.
275 * Stores non-ignorable collation keys.
276 * Table size will be estimated by the size of the pattern text. Table
277 * expansion will be perform as we go along. Adding 1 to ensure that the table
278 * size definitely increases.
279 * Internal method, status assumed to be a success.
280 * @param strsrch string search data
281 * @param status output error if any, caller to check status before calling
282 * method, status assumed to be success when passed in.
283 * @return total number of expansions
286 inline uint16_t initializePatternCETable(UStringSearch
*strsrch
,
289 UPattern
*pattern
= &(strsrch
->pattern
);
290 uint32_t cetablesize
= INITIAL_ARRAY_SIZE_
;
291 int32_t *cetable
= pattern
->cesBuffer
;
292 uint32_t patternlength
= pattern
->textLength
;
293 UCollationElements
*coleiter
= strsrch
->utilIter
;
295 if (coleiter
== NULL
) {
296 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
297 patternlength
, status
);
298 // status will be checked in ucol_next(..) later and if it is an
299 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
301 strsrch
->utilIter
= coleiter
;
304 ucol_setText(coleiter
, pattern
->text
, pattern
->textLength
, status
);
306 if(U_FAILURE(*status
)) {
310 if (pattern
->ces
!= cetable
&& pattern
->ces
) {
311 uprv_free(pattern
->ces
);
318 while ((ce
= ucol_next(coleiter
, status
)) != UCOL_NULLORDER
&&
319 U_SUCCESS(*status
)) {
320 uint32_t newce
= getCE(strsrch
, ce
);
322 int32_t *temp
= addTouint32_tArray(cetable
, offset
, &cetablesize
,
324 patternlength
- ucol_getOffset(coleiter
) + 1,
326 if (U_FAILURE(*status
)) {
330 if (cetable
!= temp
&& cetable
!= pattern
->cesBuffer
) {
335 result
+= (uint16_t)(ucol_getMaxExpansion(coleiter
, ce
) - 1);
339 pattern
->ces
= cetable
;
340 pattern
->cesLength
= offset
;
346 * Initializing the pce table for a pattern.
347 * Stores non-ignorable collation keys.
348 * Table size will be estimated by the size of the pattern text. Table
349 * expansion will be perform as we go along. Adding 1 to ensure that the table
350 * size definitely increases.
351 * Internal method, status assumed to be a success.
352 * @param strsrch string search data
353 * @param status output error if any, caller to check status before calling
354 * method, status assumed to be success when passed in.
355 * @return total number of expansions
358 inline uint16_t initializePatternPCETable(UStringSearch
*strsrch
,
361 UPattern
*pattern
= &(strsrch
->pattern
);
362 uint32_t pcetablesize
= INITIAL_ARRAY_SIZE_
;
363 int64_t *pcetable
= pattern
->pcesBuffer
;
364 uint32_t patternlength
= pattern
->textLength
;
365 UCollationElements
*coleiter
= strsrch
->utilIter
;
367 if (coleiter
== NULL
) {
368 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
369 patternlength
, status
);
370 // status will be checked in ucol_next(..) later and if it is an
371 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
373 strsrch
->utilIter
= coleiter
;
375 ucol_setText(coleiter
, pattern
->text
, pattern
->textLength
, status
);
377 if(U_FAILURE(*status
)) {
381 if (pattern
->pces
!= pcetable
&& pattern
->pces
!= NULL
) {
382 uprv_free(pattern
->pces
);
389 icu::UCollationPCE
iter(coleiter
);
391 // ** Should processed CEs be signed or unsigned?
392 // ** (the rest of the code in this file seems to play fast-and-loose with
393 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
394 while ((pce
= iter
.nextProcessed(NULL
, NULL
, status
)) != UCOL_PROCESSED_NULLORDER
&&
395 U_SUCCESS(*status
)) {
396 int64_t *temp
= addTouint64_tArray(pcetable
, offset
, &pcetablesize
,
398 patternlength
- ucol_getOffset(coleiter
) + 1,
401 if (U_FAILURE(*status
)) {
407 if (pcetable
!= temp
&& pcetable
!= pattern
->pcesBuffer
) {
412 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
415 pcetable
[offset
] = 0;
416 pattern
->pces
= pcetable
;
417 pattern
->pcesLength
= offset
;
423 * Initializes the pattern struct.
424 * Internal method, status assumed to be success.
425 * @param strsrch UStringSearch data storage
426 * @param status output error if any, caller to check status before calling
427 * method, status assumed to be success when passed in.
428 * @return expansionsize the total expansion size of the pattern
431 inline int16_t initializePattern(UStringSearch
*strsrch
, UErrorCode
*status
)
433 if (U_FAILURE(*status
)) { return 0; }
434 UPattern
*pattern
= &(strsrch
->pattern
);
435 const UChar
*patterntext
= pattern
->text
;
436 int32_t length
= pattern
->textLength
;
439 // Since the strength is primary, accents are ignored in the pattern.
440 if (strsrch
->strength
== UCOL_PRIMARY
) {
441 pattern
->hasPrefixAccents
= 0;
442 pattern
->hasSuffixAccents
= 0;
444 pattern
->hasPrefixAccents
= getFCD(patterntext
, &index
, length
) >>
445 SECOND_LAST_BYTE_SHIFT_
;
447 U16_BACK_1(patterntext
, 0, index
);
448 pattern
->hasSuffixAccents
= getFCD(patterntext
, &index
, length
) &
453 if (strsrch
->pattern
.pces
!= NULL
) {
454 if (strsrch
->pattern
.pces
!= strsrch
->pattern
.pcesBuffer
) {
455 uprv_free(strsrch
->pattern
.pces
);
458 strsrch
->pattern
.pces
= NULL
;
461 // since intializePattern is an internal method status is a success.
462 return initializePatternCETable(strsrch
, status
);
466 * Initializing shift tables, with the default values.
467 * If a corresponding default value is 0, the shift table is not set.
468 * @param shift table for forwards shift
469 * @param backshift table for backwards shift
470 * @param cetable table containing pattern ce
471 * @param cesize size of the pattern ces
472 * @param expansionsize total size of the expansions
473 * @param defaultforward the default forward value
474 * @param defaultbackward the default backward value
477 inline void setShiftTable(int16_t shift
[], int16_t backshift
[],
478 int32_t *cetable
, int32_t cesize
,
479 int16_t expansionsize
,
480 int16_t defaultforward
,
481 int16_t defaultbackward
)
483 // estimate the value to shift. to do that we estimate the smallest
484 // number of characters to give the relevant ces, ie approximately
485 // the number of ces minus their expansion, since expansions can come
488 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
489 shift
[count
] = defaultforward
;
491 cesize
--; // down to the last index
492 for (count
= 0; count
< cesize
; count
++) {
493 // number of ces from right of array to the count
494 int temp
= defaultforward
- count
- 1;
495 shift
[hash(cetable
[count
])] = temp
> 1 ? temp
: 1;
497 shift
[hash(cetable
[cesize
])] = 1;
498 // for ignorables we just shift by one. see test examples.
501 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
502 backshift
[count
] = defaultbackward
;
504 for (count
= cesize
; count
> 0; count
--) {
505 // the original value count does not seem to work
506 backshift
[hash(cetable
[count
])] = count
> expansionsize
?
507 (int16_t)(count
- expansionsize
) : 1;
509 backshift
[hash(cetable
[0])] = 1;
510 backshift
[hash(0)] = 1;
514 * Building of the pattern collation element list and the boyer moore strsrch
516 * The canonical match will only be performed after the default match fails.
517 * For both cases we need to remember the size of the composed and decomposed
518 * versions of the string. Since the Boyer-Moore shift calculations shifts by
519 * a number of characters in the text and tries to match the pattern from that
520 * offset, the shift value can not be too large in case we miss some
521 * characters. To choose a right shift size, we estimate the NFC form of the
522 * and use its size as a shift guide. The NFC form should be the small
523 * possible representation of the pattern. Anyways, we'll err on the smaller
524 * shift size. Hence the calculation for minlength.
525 * Canonical match will be performed slightly differently. We'll split the
526 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
527 * the first and last base character (MS), the ending accents (EA). Matches
528 * will be done on MS first, and only when we match MS then some processing
529 * will be required for the prefix and end accents in order to determine if
530 * they match PA and EA. Hence the default shift values
531 * for the canonical match will take the size of either end's accent into
532 * consideration. Forwards search will take the end accents into consideration
533 * for the default shift values and the backwards search will take the prefix
534 * accents into consideration.
535 * If pattern has no non-ignorable ce, we return a illegal argument error.
536 * Internal method, status assumed to be success.
537 * @param strsrch UStringSearch data storage
538 * @param status for output errors if it occurs, status is assumed to be a
539 * success when it is passed in.
542 inline void initialize(UStringSearch
*strsrch
, UErrorCode
*status
)
544 int16_t expandlength
= initializePattern(strsrch
, status
);
545 if (U_SUCCESS(*status
) && strsrch
->pattern
.cesLength
> 0) {
546 UPattern
*pattern
= &strsrch
->pattern
;
547 int32_t cesize
= pattern
->cesLength
;
549 int16_t minlength
= cesize
> expandlength
550 ? (int16_t)cesize
- expandlength
: 1;
551 pattern
->defaultShiftSize
= minlength
;
552 setShiftTable(pattern
->shift
, pattern
->backShift
, pattern
->ces
,
553 cesize
, expandlength
, minlength
, minlength
);
556 strsrch
->pattern
.defaultShiftSize
= 0;
561 * Check to make sure that the match length is at the end of the character by
562 * using the breakiterator.
563 * @param strsrch string search data
564 * @param start target text start offset
565 * @param end target text end offset
568 void checkBreakBoundary(const UStringSearch
*strsrch
, int32_t * /*start*/,
571 #if !UCONFIG_NO_BREAK_ITERATION
572 UBreakIterator
*breakiterator
= strsrch
->search
->internalBreakIter
;
574 int32_t matchend
= *end
;
575 //int32_t matchstart = *start;
577 if (!ubrk_isBoundary(breakiterator
, matchend
)) {
578 *end
= ubrk_following(breakiterator
, matchend
);
581 /* Check the start of the matched text to make sure it doesn't have any accents
582 * before it. This code may not be necessary and so it is commented out */
583 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
584 *start = ubrk_preceding(breakiterator, matchstart);
591 * Determine whether the target text in UStringSearch bounded by the offset
592 * start and end is one or more whole units of text as
593 * determined by the breakiterator in UStringSearch.
594 * @param strsrch string search data
595 * @param start target text start offset
596 * @param end target text end offset
599 UBool
isBreakUnit(const UStringSearch
*strsrch
, int32_t start
,
602 #if !UCONFIG_NO_BREAK_ITERATION
603 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
606 int32_t startindex
= ubrk_first(breakiterator
);
607 int32_t endindex
= ubrk_last(breakiterator
);
609 // out-of-range indexes are never boundary positions
610 if (start
< startindex
|| start
> endindex
||
611 end
< startindex
|| end
> endindex
) {
614 // otherwise, we can use following() on the position before the
615 // specified one and return true of the position we get back is the
616 // one the user specified
617 UBool result
= (start
== startindex
||
618 ubrk_following(breakiterator
, start
- 1) == start
) &&
620 ubrk_following(breakiterator
, end
- 1) == end
);
622 // iterates the individual ces
623 UCollationElements
*coleiter
= strsrch
->utilIter
;
624 const UChar
*text
= strsrch
->search
->text
+
626 UErrorCode status
= U_ZERO_ERROR
;
627 ucol_setText(coleiter
, text
, end
- start
, &status
);
628 for (int32_t count
= 0; count
< strsrch
->pattern
.cesLength
;
630 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
631 if (ce
== UCOL_IGNORABLE
) {
635 if (U_FAILURE(status
) || ce
!= strsrch
->pattern
.ces
[count
]) {
639 int32_t nextce
= ucol_next(coleiter
, &status
);
640 while (ucol_getOffset(coleiter
) == (end
- start
)
641 && getCE(strsrch
, nextce
) == UCOL_IGNORABLE
) {
642 nextce
= ucol_next(coleiter
, &status
);
644 if (ucol_getOffset(coleiter
) == (end
- start
)
645 && nextce
!= UCOL_NULLORDER
) {
646 // extra collation elements at the end of the match
657 * Getting the next base character offset if current offset is an accent,
658 * or the current offset if the current character contains a base character.
659 * accents the following base character will be returned
661 * @param textoffset current offset
662 * @param textlength length of text string
663 * @return the next base character or the current offset
664 * if the current character is contains a base character.
667 inline int32_t getNextBaseOffset(const UChar
*text
,
671 if (textoffset
< textlength
) {
672 int32_t temp
= textoffset
;
673 if (getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
674 while (temp
< textlength
) {
675 int32_t result
= temp
;
676 if ((getFCD(text
, &temp
, textlength
) >>
677 SECOND_LAST_BYTE_SHIFT_
) == 0) {
688 * Gets the next base character offset depending on the string search pattern
690 * @param strsrch string search data
691 * @param textoffset current offset, one offset away from the last character
693 * @return start index of the next base character or the current offset
694 * if the current character is contains a base character.
697 inline int32_t getNextUStringSearchBaseOffset(UStringSearch
*strsrch
,
700 int32_t textlength
= strsrch
->search
->textLength
;
701 if (strsrch
->pattern
.hasSuffixAccents
&&
702 textoffset
< textlength
) {
703 int32_t temp
= textoffset
;
704 const UChar
*text
= strsrch
->search
->text
;
705 U16_BACK_1(text
, 0, temp
);
706 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
707 return getNextBaseOffset(text
, textoffset
, textlength
);
714 * Shifting the collation element iterator position forward to prepare for
715 * a following match. If the last character is a unsafe character, we'll only
716 * shift by 1 to capture contractions, normalization etc.
717 * Internal method, status assumed to be success.
718 * @param text strsrch string search data
719 * @param textoffset start text position to do search
720 * @param ce the text ce which failed the match.
721 * @param patternceindex index of the ce within the pattern ce buffer which
723 * @return final offset
726 inline int32_t shiftForward(UStringSearch
*strsrch
,
729 int32_t patternceindex
)
731 UPattern
*pattern
= &(strsrch
->pattern
);
732 if (ce
!= UCOL_NULLORDER
) {
733 int32_t shift
= pattern
->shift
[hash(ce
)];
734 // this is to adjust for characters in the middle of the
735 // substring for matching that failed.
736 int32_t adjust
= pattern
->cesLength
- patternceindex
;
737 if (adjust
> 1 && shift
>= adjust
) {
743 textoffset
+= pattern
->defaultShiftSize
;
746 textoffset
= getNextUStringSearchBaseOffset(strsrch
, textoffset
);
747 // check for unsafe characters
748 // * if it is the start or middle of a contraction: to be done after
749 // a initial match is found
750 // * thai or lao base consonant character: similar to contraction
751 // * high surrogate character: similar to contraction
752 // * next character is a accent: shift to the next base character
755 #endif // #if BOYER_MOORE
758 * sets match not found
759 * @param strsrch string search data
762 inline void setMatchNotFound(UStringSearch
*strsrch
)
764 // this method resets the match result regardless of the error status.
765 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
766 strsrch
->search
->matchedLength
= 0;
767 if (strsrch
->search
->isForwardSearching
) {
768 setColEIterOffset(strsrch
->textIter
, strsrch
->search
->textLength
);
771 setColEIterOffset(strsrch
->textIter
, 0);
777 * Gets the offset to the next safe point in text.
778 * ie. not the middle of a contraction, swappable characters or supplementary
780 * @param collator collation sata
781 * @param text string to work with
782 * @param textoffset offset in string
783 * @param textlength length of text string
784 * @return offset to the next safe character
787 inline int32_t getNextSafeOffset(const UCollator
*collator
,
792 int32_t result
= textoffset
; // first contraction character
793 while (result
!= textlength
&& ucol_unsafeCP(text
[result
], collator
)) {
800 * This checks for accents in the potential match started with a .
801 * composite character.
802 * This is really painful... we have to check that composite character do not
803 * have any extra accents. We have to normalize the potential match and find
804 * the immediate decomposed character before the match.
805 * The first composite character would have been taken care of by the fcd
806 * checks in checkForwardExactMatch.
807 * This is the slow path after the fcd of the first character and
808 * the last character has been checked by checkForwardExactMatch and we
809 * determine that the potential match has extra non-ignorable preceding
811 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
812 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
813 * Note here that accents checking are slow and cautioned in the API docs.
814 * Internal method, status assumed to be a success, caller should check status
815 * before calling this method
816 * @param strsrch string search data
817 * @param start index of the potential unfriendly composite character
818 * @param end index of the potential unfriendly composite character
819 * @param status output error status if any.
820 * @return TRUE if there is non-ignorable accents before at the beginning
821 * of the match, FALSE otherwise.
825 UBool
checkExtraMatchAccents(const UStringSearch
*strsrch
, int32_t start
,
829 UBool result
= FALSE
;
830 if (strsrch
->pattern
.hasPrefixAccents
) {
831 int32_t length
= end
- start
;
833 const UChar
*text
= strsrch
->search
->text
+ start
;
835 U16_FWD_1(text
, offset
, length
);
836 // we are only concerned with the first composite character
837 if (unorm_quickCheck(text
, offset
, UNORM_NFD
, status
) == UNORM_NO
) {
838 int32_t safeoffset
= getNextSafeOffset(strsrch
->collator
,
840 if (safeoffset
!= length
) {
844 UChar buffer
[INITIAL_ARRAY_SIZE_
];
845 int32_t size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0,
846 buffer
, INITIAL_ARRAY_SIZE_
,
848 if (U_FAILURE(*status
)) {
851 if (size
>= INITIAL_ARRAY_SIZE_
) {
852 norm
= (UChar
*)allocateMemory((size
+ 1) * sizeof(UChar
),
854 // if allocation failed, status will be set to
855 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
857 size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0, norm
,
859 if (U_FAILURE(*status
) && norm
!= NULL
) {
868 UCollationElements
*coleiter
= strsrch
->utilIter
;
869 ucol_setText(coleiter
, norm
, size
, status
);
870 uint32_t firstce
= strsrch
->pattern
.ces
[0];
871 UBool ignorable
= TRUE
;
872 uint32_t ce
= UCOL_IGNORABLE
;
873 while (U_SUCCESS(*status
) && ce
!= firstce
&& ce
!= (uint32_t)UCOL_NULLORDER
) {
874 offset
= ucol_getOffset(coleiter
);
875 if (ce
!= firstce
&& ce
!= UCOL_IGNORABLE
) {
878 ce
= ucol_next(coleiter
, status
);
881 U16_PREV(norm
, 0, offset
, codepoint
);
882 result
= !ignorable
&& (u_getCombiningClass(codepoint
) != 0);
884 if (norm
!= buffer
) {
894 * Used by exact matches, checks if there are accents before the match.
895 * This is really painful... we have to check that composite characters at
896 * the start of the matches have to not have any extra accents.
897 * We check the FCD of the character first, if it starts with an accent and
898 * the first pattern ce does not match the first ce of the character, we bail.
899 * Otherwise we try normalizing the first composite
900 * character and find the immediate decomposed character before the match to
901 * see if it is an non-ignorable accent.
902 * Now normalizing the first composite character is enough because we ensure
903 * that when the match is passed in here with extra beginning ces, the
904 * first or last ce that match has to occur within the first character.
905 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
906 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
907 * Note here that accents checking are slow and cautioned in the API docs.
908 * @param strsrch string search data
909 * @param start offset
911 * @return TRUE if there are accents on either side of the match,
915 UBool
hasAccentsBeforeMatch(const UStringSearch
*strsrch
, int32_t start
,
918 if (strsrch
->pattern
.hasPrefixAccents
) {
919 UCollationElements
*coleiter
= strsrch
->textIter
;
920 UErrorCode status
= U_ZERO_ERROR
;
921 // we have been iterating forwards previously
922 uint32_t ignorable
= TRUE
;
923 int32_t firstce
= strsrch
->pattern
.ces
[0];
925 setColEIterOffset(coleiter
, start
);
926 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
927 if (U_FAILURE(status
)) {
930 while (ce
!= firstce
) {
931 if (ce
!= UCOL_IGNORABLE
) {
934 ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
935 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
939 if (!ignorable
&& inNormBuf(coleiter
)) {
940 // within normalization buffer, discontiguous handled here
945 int32_t temp
= start
;
947 // accent = (getFCD(strsrch->search->text, &temp,
948 // strsrch->search->textLength)
949 // >> SECOND_LAST_BYTE_SHIFT_);
950 // however this code does not work well with VC7 .net in release mode.
951 // maybe the inlines for getFCD combined with shifting has bugs in
952 // VC7. anyways this is a work around.
953 UBool accent
= getFCD(strsrch
->search
->text
, &temp
,
954 strsrch
->search
->textLength
) > 0xFF;
956 return checkExtraMatchAccents(strsrch
, start
, end
, &status
);
963 U16_BACK_1(strsrch
->search
->text
, 0, temp
);
964 if (getFCD(strsrch
->search
->text
, &temp
,
965 strsrch
->search
->textLength
) & LAST_BYTE_MASK_
) {
966 setColEIterOffset(coleiter
, start
);
967 ce
= ucol_previous(coleiter
, &status
);
968 if (U_FAILURE(status
) ||
969 (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
)) {
980 * Used by exact matches, checks if there are accents bounding the match.
981 * Note this is the initial boundary check. If the potential match
982 * starts or ends with composite characters, the accents in those
983 * characters will be determined later.
984 * Not doing backwards iteration here, since discontiguos contraction for
985 * backwards collation element iterator, use up too many characters.
986 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
987 * should fail since there is a acute at the end of \u01FA
988 * Note here that accents checking are slow and cautioned in the API docs.
989 * @param strsrch string search data
990 * @param start offset of match
991 * @param end end offset of the match
992 * @return TRUE if there are accents on either side of the match,
996 UBool
hasAccentsAfterMatch(const UStringSearch
*strsrch
, int32_t start
,
999 if (strsrch
->pattern
.hasSuffixAccents
) {
1000 const UChar
*text
= strsrch
->search
->text
;
1002 int32_t textlength
= strsrch
->search
->textLength
;
1003 U16_BACK_1(text
, 0, temp
);
1004 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
1005 int32_t firstce
= strsrch
->pattern
.ces
[0];
1006 UCollationElements
*coleiter
= strsrch
->textIter
;
1007 UErrorCode status
= U_ZERO_ERROR
;
1009 setColEIterOffset(coleiter
, start
);
1010 while ((ce
= getCE(strsrch
, ucol_next(coleiter
, &status
))) != firstce
) {
1011 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
1016 while (count
< strsrch
->pattern
.cesLength
) {
1017 if (getCE(strsrch
, ucol_next(coleiter
, &status
))
1018 == UCOL_IGNORABLE
) {
1019 // Thai can give an ignorable here.
1022 if (U_FAILURE(status
)) {
1028 ce
= ucol_next(coleiter
, &status
);
1029 if (U_FAILURE(status
)) {
1032 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1033 ce
= getCE(strsrch
, ce
);
1035 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1036 if (ucol_getOffset(coleiter
) <= end
) {
1039 if (getFCD(text
, &end
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
1047 #endif // #if BOYER_MOORE
1050 * Checks if the offset runs out of the text string
1052 * @param textlength of the text string
1053 * @return TRUE if offset is out of bounds, FALSE otherwise
1056 inline UBool
isOutOfBounds(int32_t textlength
, int32_t offset
)
1058 return offset
< 0 || offset
> textlength
;
1062 * Checks for identical match
1063 * @param strsrch string search data
1064 * @param start offset of possible match
1065 * @param end offset of possible match
1066 * @return TRUE if identical match is found
1069 inline UBool
checkIdentical(const UStringSearch
*strsrch
, int32_t start
,
1072 if (strsrch
->strength
!= UCOL_IDENTICAL
) {
1076 // Note: We could use Normalizer::compare() or similar, but for short strings
1077 // which may not be in FCD it might be faster to just NFD them.
1078 UErrorCode status
= U_ZERO_ERROR
;
1079 UnicodeString t2
, p2
;
1080 strsrch
->nfd
->normalize(
1081 UnicodeString(FALSE
, strsrch
->search
->text
+ start
, end
- start
), t2
, status
);
1082 strsrch
->nfd
->normalize(
1083 UnicodeString(FALSE
, strsrch
->pattern
.text
, strsrch
->pattern
.textLength
), p2
, status
);
1084 // return FALSE if NFD failed
1085 return U_SUCCESS(status
) && t2
== p2
;
1090 * Checks to see if the match is repeated
1091 * @param strsrch string search data
1092 * @param start new match start index
1093 * @param end new match end index
1094 * @return TRUE if the the match is repeated, FALSE otherwise
1097 inline UBool
checkRepeatedMatch(UStringSearch
*strsrch
,
1101 int32_t lastmatchindex
= strsrch
->search
->matchedIndex
;
1103 if (lastmatchindex
== USEARCH_DONE
) {
1106 if (strsrch
->search
->isForwardSearching
) {
1107 result
= start
<= lastmatchindex
;
1110 result
= start
>= lastmatchindex
;
1112 if (!result
&& !strsrch
->search
->isOverlap
) {
1113 if (strsrch
->search
->isForwardSearching
) {
1114 result
= start
< lastmatchindex
+ strsrch
->search
->matchedLength
;
1117 result
= end
> lastmatchindex
;
1124 * Gets the collation element iterator's current offset.
1125 * @param coleiter collation element iterator
1126 * @param forwards flag TRUE if we are moving in th forwards direction
1127 * @return current offset
1130 inline int32_t getColElemIterOffset(const UCollationElements
*coleiter
,
1133 int32_t result
= ucol_getOffset(coleiter
);
1134 // intricacies of the the backwards collation element iterator
1135 if (FALSE
&& !forwards
&& inNormBuf(coleiter
) && !isFCDPointerNull(coleiter
)) {
1142 * Checks match for contraction.
1143 * If the match ends with a partial contraction we fail.
1144 * If the match starts too far off (because of backwards iteration) we try to
1145 * chip off the extra characters depending on whether a breakiterator has
1147 * Internal method, error assumed to be success, caller has to check status
1148 * before calling this method.
1149 * @param strsrch string search data
1150 * @param start offset of potential match, to be modified if necessary
1151 * @param end offset of potential match, to be modified if necessary
1152 * @param status output error status if any
1153 * @return TRUE if match passes the contraction test, FALSE otherwise
1157 UBool
checkNextExactContractionMatch(UStringSearch
*strsrch
,
1159 int32_t *end
, UErrorCode
*status
)
1161 UCollationElements
*coleiter
= strsrch
->textIter
;
1162 int32_t textlength
= strsrch
->search
->textLength
;
1163 int32_t temp
= *start
;
1164 const UCollator
*collator
= strsrch
->collator
;
1165 const UChar
*text
= strsrch
->search
->text
;
1166 // This part checks if either ends of the match contains potential
1167 // contraction. If so we'll have to iterate through them
1168 // The start contraction needs to be checked since ucol_previous dumps
1169 // all characters till the first safe character into the buffer.
1170 // *start + 1 is used to test for the unsafe characters instead of *start
1171 // because ucol_prev takes all unsafe characters till the first safe
1172 // character ie *start. so by testing *start + 1, we can estimate if
1173 // excess prefix characters has been included in the potential search
1175 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1176 (*start
+ 1 < textlength
1177 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1178 int32_t expansion
= getExpansionPrefix(coleiter
);
1179 UBool expandflag
= expansion
> 0;
1180 setColEIterOffset(coleiter
, *start
);
1181 while (expansion
> 0) {
1182 // getting rid of the redundant ce, caused by setOffset.
1183 // since backward contraction/expansion may have extra ces if we
1184 // are in the normalization buffer, hasAccentsBeforeMatch would
1185 // have taken care of it.
1186 // E.g. the character \u01FA will have an expansion of 3, but if
1187 // we are only looking for acute and ring \u030A and \u0301, we'll
1188 // have to skip the first ce in the expansion buffer.
1189 ucol_next(coleiter
, status
);
1190 if (U_FAILURE(*status
)) {
1193 if (ucol_getOffset(coleiter
) != temp
) {
1195 temp
= ucol_getOffset(coleiter
);
1200 int32_t *patternce
= strsrch
->pattern
.ces
;
1201 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
1203 while (count
< patterncelength
) {
1204 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1205 if (ce
== UCOL_IGNORABLE
) {
1208 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1210 temp
= ucol_getOffset(coleiter
);
1212 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1214 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1224 * Checks and sets the match information if found.
1227 * <li> the potential match does not repeat the previous match
1228 * <li> boundaries are correct
1229 * <li> exact matches has no extra accents
1230 * <li> identical matchesb
1231 * <li> potential match does not end in the middle of a contraction
1233 * Otherwise the offset will be shifted to the next character.
1234 * Internal method, status assumed to be success, caller has to check status
1235 * before calling this method.
1236 * @param strsrch string search data
1237 * @param textoffset offset in the collation element text. the returned value
1238 * will be the truncated end offset of the match or the new start
1240 * @param status output error status if any
1241 * @return TRUE if the match is valid, FALSE otherwise
1244 inline UBool
checkNextExactMatch(UStringSearch
*strsrch
,
1245 int32_t *textoffset
, UErrorCode
*status
)
1247 UCollationElements
*coleiter
= strsrch
->textIter
;
1248 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1250 if (!checkNextExactContractionMatch(strsrch
, &start
, textoffset
, status
)) {
1254 // this totally matches, however we need to check if it is repeating
1255 if (!isBreakUnit(strsrch
, start
, *textoffset
) ||
1256 checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1257 hasAccentsBeforeMatch(strsrch
, start
, *textoffset
) ||
1258 !checkIdentical(strsrch
, start
, *textoffset
) ||
1259 hasAccentsAfterMatch(strsrch
, start
, *textoffset
)) {
1262 *textoffset
= getNextUStringSearchBaseOffset(strsrch
, *textoffset
);
1266 //Add breakiterator boundary check for primary strength search.
1267 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
1268 checkBreakBoundary(strsrch
, &start
, textoffset
);
1271 // totally match, we will get rid of the ending ignorables.
1272 strsrch
->search
->matchedIndex
= start
;
1273 strsrch
->search
->matchedLength
= *textoffset
- start
;
1278 * Getting the previous base character offset, or the current offset if the
1279 * current character is a base character
1280 * @param text string
1281 * @param textoffset one offset after the current character
1282 * @return the offset of the next character after the base character or the first
1283 * composed character with accents
1286 inline int32_t getPreviousBaseOffset(const UChar
*text
,
1289 if (textoffset
> 0) {
1291 int32_t result
= textoffset
;
1292 U16_BACK_1(text
, 0, textoffset
);
1293 int32_t temp
= textoffset
;
1294 uint16_t fcd
= getFCD(text
, &temp
, result
);
1295 if ((fcd
>> SECOND_LAST_BYTE_SHIFT_
) == 0) {
1296 if (fcd
& LAST_BYTE_MASK_
) {
1301 if (textoffset
== 0) {
1310 * Getting the indexes of the accents that are not blocked in the argument
1312 * @param accents array of accents in nfd terminated by a 0.
1313 * @param accentsindex array of indexes of the accents that are not blocked
1316 inline int getUnblockedAccentIndex(UChar
*accents
, int32_t *accentsindex
)
1319 int32_t length
= u_strlen(accents
);
1320 UChar32 codepoint
= 0;
1324 while (index
< length
) {
1326 U16_NEXT(accents
, index
, length
, codepoint
);
1327 if (u_getCombiningClass(codepoint
) != cclass
) {
1328 cclass
= u_getCombiningClass(codepoint
);
1329 accentsindex
[result
] = temp
;
1333 accentsindex
[result
] = length
;
1338 * Appends 3 UChar arrays to a destination array.
1339 * Creates a new array if we run out of space. The caller will have to
1340 * manually deallocate the newly allocated array.
1341 * Internal method, status assumed to be success, caller has to check status
1342 * before calling this method. destination not to be NULL and has at least
1343 * size destinationlength.
1344 * @param destination target array
1345 * @param destinationlength target array size, returning the appended length
1346 * @param source1 null-terminated first array
1347 * @param source2 second array
1348 * @param source2length length of seond array
1349 * @param source3 null-terminated third array
1350 * @param status error status if any
1351 * @return new destination array, destination if there was no new allocation
1354 inline UChar
* addToUCharArray( UChar
*destination
,
1355 int32_t *destinationlength
,
1356 const UChar
*source1
,
1357 const UChar
*source2
,
1358 int32_t source2length
,
1359 const UChar
*source3
,
1362 int32_t source1length
= source1
? u_strlen(source1
) : 0;
1363 int32_t source3length
= source3
? u_strlen(source3
) : 0;
1364 if (*destinationlength
< source1length
+ source2length
+ source3length
+
1367 destination
= (UChar
*)allocateMemory(
1368 (source1length
+ source2length
+ source3length
+ 1) * sizeof(UChar
),
1370 // if error allocating memory, status will be
1371 // U_MEMORY_ALLOCATION_ERROR
1372 if (U_FAILURE(*status
)) {
1373 *destinationlength
= 0;
1377 if (source1length
!= 0) {
1378 uprv_memcpy(destination
, source1
, sizeof(UChar
) * source1length
);
1380 if (source2length
!= 0) {
1381 uprv_memcpy(destination
+ source1length
, source2
,
1382 sizeof(UChar
) * source2length
);
1384 if (source3length
!= 0) {
1385 uprv_memcpy(destination
+ source1length
+ source2length
, source3
,
1386 sizeof(UChar
) * source3length
);
1388 *destinationlength
= source1length
+ source2length
+ source3length
;
1393 * Running through a collation element iterator to see if the contents matches
1394 * pattern in string search data
1395 * @param strsrch string search data
1396 * @param coleiter collation element iterator
1397 * @return TRUE if a match if found, FALSE otherwise
1400 inline UBool
checkCollationMatch(const UStringSearch
*strsrch
,
1401 UCollationElements
*coleiter
)
1403 int patternceindex
= strsrch
->pattern
.cesLength
;
1404 int32_t *patternce
= strsrch
->pattern
.ces
;
1405 UErrorCode status
= U_ZERO_ERROR
;
1406 while (patternceindex
> 0) {
1407 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
1408 if (ce
== UCOL_IGNORABLE
) {
1411 if (U_FAILURE(status
) || ce
!= *patternce
) {
1421 * Rearranges the front accents to try matching.
1422 * Prefix accents in the text will be grouped according to their combining
1423 * class and the groups will be mixed and matched to try find the perfect
1424 * match with the pattern.
1425 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1426 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1427 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1429 * step 2: check if any of the generated substrings matches the pattern.
1430 * Internal method, status is assumed to be success, caller has to check status
1431 * before calling this method.
1432 * @param strsrch string search match
1433 * @param start first offset of the accents to start searching
1434 * @param end start of the last accent set
1435 * @param status output error status if any
1436 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1437 * offset of the match. Note this start includes all preceding accents.
1440 int32_t doNextCanonicalPrefixMatch(UStringSearch
*strsrch
,
1445 const UChar
*text
= strsrch
->search
->text
;
1446 int32_t textlength
= strsrch
->search
->textLength
;
1447 int32_t tempstart
= start
;
1449 if ((getFCD(text
, &tempstart
, textlength
) & LAST_BYTE_MASK_
) == 0) {
1450 // die... failed at a base character
1451 return USEARCH_DONE
;
1454 int32_t offset
= getNextBaseOffset(text
, tempstart
, textlength
);
1455 start
= getPreviousBaseOffset(text
, tempstart
);
1457 UChar accents
[INITIAL_ARRAY_SIZE_
];
1458 // normalizing the offensive string
1459 unorm_normalize(text
+ start
, offset
- start
, UNORM_NFD
, 0, accents
,
1460 INITIAL_ARRAY_SIZE_
, status
);
1461 if (U_FAILURE(*status
)) {
1462 return USEARCH_DONE
;
1465 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1466 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1468 int32_t count
= (2 << (accentsize
- 1)) - 1;
1469 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1470 UCollationElements
*coleiter
= strsrch
->utilIter
;
1471 while (U_SUCCESS(*status
) && count
> 0) {
1472 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
1473 // copy the base characters
1474 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1475 *rearrange
++ = accents
[k
];
1477 // forming all possible canonical rearrangement by dropping
1479 for (int i
= 0; i
<= accentsize
- 1; i
++) {
1480 int32_t mask
= 1 << (accentsize
- i
- 1);
1482 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1483 *rearrange
++ = accents
[j
];
1488 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
1489 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
1490 strsrch
->canonicalPrefixAccents
,
1491 strsrch
->search
->text
+ offset
,
1493 strsrch
->canonicalSuffixAccents
,
1496 // if status is a failure, ucol_setText does nothing.
1497 // run the collator iterator through this match
1498 ucol_setText(coleiter
, match
, matchsize
, status
);
1499 if (U_SUCCESS(*status
)) {
1500 if (checkCollationMatch(strsrch
, coleiter
)) {
1501 if (match
!= buffer
) {
1509 return USEARCH_DONE
;
1513 * Gets the offset to the safe point in text before textoffset.
1514 * ie. not the middle of a contraction, swappable characters or supplementary
1516 * @param collator collation sata
1517 * @param text string to work with
1518 * @param textoffset offset in string
1519 * @param textlength length of text string
1520 * @return offset to the previous safe character
1523 inline uint32_t getPreviousSafeOffset(const UCollator
*collator
,
1527 int32_t result
= textoffset
; // first contraction character
1528 while (result
!= 0 && ucol_unsafeCP(text
[result
- 1], collator
)) {
1532 // the first contraction character is consider unsafe here
1539 * Cleaning up after we passed the safe zone
1540 * @param strsrch string search data
1541 * @param safetext safe text array
1542 * @param safebuffer safe text buffer
1543 * @param coleiter collation element iterator for safe text
1546 inline void cleanUpSafeText(const UStringSearch
*strsrch
, UChar
*safetext
,
1549 if (safetext
!= safebuffer
&& safetext
!= strsrch
->canonicalSuffixAccents
)
1551 uprv_free(safetext
);
1556 * Take the rearranged end accents and tries matching. If match failed at
1557 * a seperate preceding set of accents (seperated from the rearranged on by
1558 * at least a base character) then we rearrange the preceding accents and
1559 * tries matching again.
1560 * We allow skipping of the ends of the accent set if the ces do not match.
1561 * However if the failure is found before the accent set, it fails.
1562 * Internal method, status assumed to be success, caller has to check status
1563 * before calling this method.
1564 * @param strsrch string search data
1565 * @param textoffset of the start of the rearranged accent
1566 * @param status output error status if any
1567 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1568 * offset of the match. Note this start includes all preceding accents.
1571 int32_t doNextCanonicalSuffixMatch(UStringSearch
*strsrch
,
1575 const UChar
*text
= strsrch
->search
->text
;
1576 const UCollator
*collator
= strsrch
->collator
;
1577 int32_t safelength
= 0;
1579 int32_t safetextlength
;
1580 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
1581 UCollationElements
*coleiter
= strsrch
->utilIter
;
1582 int32_t safeoffset
= textoffset
;
1584 if (textoffset
!= 0 && ucol_unsafeCP(strsrch
->canonicalSuffixAccents
[0],
1586 safeoffset
= getPreviousSafeOffset(collator
, text
, textoffset
);
1587 safelength
= textoffset
- safeoffset
;
1588 safetextlength
= INITIAL_ARRAY_SIZE_
;
1589 safetext
= addToUCharArray(safebuffer
, &safetextlength
, NULL
,
1590 text
+ safeoffset
, safelength
,
1591 strsrch
->canonicalSuffixAccents
,
1595 safetextlength
= u_strlen(strsrch
->canonicalSuffixAccents
);
1596 safetext
= strsrch
->canonicalSuffixAccents
;
1599 // if status is a failure, ucol_setText does nothing
1600 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
1601 // status checked in loop below
1603 int32_t *ce
= strsrch
->pattern
.ces
;
1604 int32_t celength
= strsrch
->pattern
.cesLength
;
1605 int ceindex
= celength
- 1;
1606 UBool isSafe
= TRUE
; // indication flag for position in safe zone
1608 while (ceindex
>= 0) {
1609 int32_t textce
= ucol_previous(coleiter
, status
);
1610 if (U_FAILURE(*status
)) {
1612 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1614 return USEARCH_DONE
;
1616 if (textce
== UCOL_NULLORDER
) {
1617 // check if we have passed the safe buffer
1618 if (coleiter
== strsrch
->textIter
) {
1619 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1620 return USEARCH_DONE
;
1622 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1623 safetext
= safebuffer
;
1624 coleiter
= strsrch
->textIter
;
1625 setColEIterOffset(coleiter
, safeoffset
);
1626 // status checked at the start of the loop
1630 textce
= getCE(strsrch
, textce
);
1631 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
1632 // do the beginning stuff
1633 int32_t failedoffset
= getColElemIterOffset(coleiter
, FALSE
);
1634 if (isSafe
&& failedoffset
>= safelength
) {
1635 // alas... no hope. failed at rearranged accent set
1636 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1637 return USEARCH_DONE
;
1641 failedoffset
+= safeoffset
;
1642 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1645 // try rearranging the front accents
1646 int32_t result
= doNextCanonicalPrefixMatch(strsrch
,
1647 failedoffset
, textoffset
, status
);
1648 if (result
!= USEARCH_DONE
) {
1649 // if status is a failure, ucol_setOffset does nothing
1650 setColEIterOffset(strsrch
->textIter
, result
);
1652 if (U_FAILURE(*status
)) {
1653 return USEARCH_DONE
;
1658 if (textce
== ce
[ceindex
]) {
1664 int32_t result
= getColElemIterOffset(coleiter
, FALSE
);
1665 // sets the text iterator here with the correct expansion and offset
1666 int32_t leftoverces
= getExpansionPrefix(coleiter
);
1667 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1668 if (result
>= safelength
) {
1669 result
= textoffset
;
1672 result
+= safeoffset
;
1674 setColEIterOffset(strsrch
->textIter
, result
);
1675 strsrch
->textIter
->iteratordata_
.toReturn
=
1676 setExpansionPrefix(strsrch
->textIter
, leftoverces
);
1680 return ucol_getOffset(coleiter
);
1684 * Trying out the substring and sees if it can be a canonical match.
1685 * This will try normalizing the end accents and arranging them into canonical
1686 * equivalents and check their corresponding ces with the pattern ce.
1687 * Suffix accents in the text will be grouped according to their combining
1688 * class and the groups will be mixed and matched to try find the perfect
1689 * match with the pattern.
1690 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1691 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1692 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1694 * step 2: check if any of the generated substrings matches the pattern.
1695 * Internal method, status assumed to be success, caller has to check status
1696 * before calling this method.
1697 * @param strsrch string search data
1698 * @param textoffset end offset in the collation element text that ends with
1699 * the accents to be rearranged
1700 * @param status error status if any
1701 * @return TRUE if the match is valid, FALSE otherwise
1704 UBool
doNextCanonicalMatch(UStringSearch
*strsrch
,
1708 const UChar
*text
= strsrch
->search
->text
;
1709 int32_t temp
= textoffset
;
1710 U16_BACK_1(text
, 0, temp
);
1711 if ((getFCD(text
, &temp
, textoffset
) & LAST_BYTE_MASK_
) == 0) {
1712 UCollationElements
*coleiter
= strsrch
->textIter
;
1713 int32_t offset
= getColElemIterOffset(coleiter
, FALSE
);
1714 if (strsrch
->pattern
.hasPrefixAccents
) {
1715 offset
= doNextCanonicalPrefixMatch(strsrch
, offset
, textoffset
,
1717 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
1718 setColEIterOffset(coleiter
, offset
);
1725 if (!strsrch
->pattern
.hasSuffixAccents
) {
1729 UChar accents
[INITIAL_ARRAY_SIZE_
];
1730 // offset to the last base character in substring to search
1731 int32_t baseoffset
= getPreviousBaseOffset(text
, textoffset
);
1732 // normalizing the offensive string
1733 unorm_normalize(text
+ baseoffset
, textoffset
- baseoffset
, UNORM_NFD
,
1734 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
1735 // status checked in loop below
1737 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1738 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
1740 // 2 power n - 1 plus the full set of accents
1741 int32_t count
= (2 << (size
- 1)) - 1;
1742 while (U_SUCCESS(*status
) && count
> 0) {
1743 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
1744 // copy the base characters
1745 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1746 *rearrange
++ = accents
[k
];
1748 // forming all possible canonical rearrangement by dropping
1750 for (int i
= 0; i
<= size
- 1; i
++) {
1751 int32_t mask
= 1 << (size
- i
- 1);
1753 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1754 *rearrange
++ = accents
[j
];
1759 int32_t offset
= doNextCanonicalSuffixMatch(strsrch
, baseoffset
,
1761 if (offset
!= USEARCH_DONE
) {
1762 return TRUE
; // match found
1770 * Gets the previous base character offset depending on the string search
1772 * @param strsrch string search data
1773 * @param textoffset current offset, current character
1774 * @return the offset of the next character after this base character or itself
1775 * if it is a composed character with accents
1778 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch
*strsrch
,
1781 if (strsrch
->pattern
.hasPrefixAccents
&& textoffset
> 0) {
1782 const UChar
*text
= strsrch
->search
->text
;
1783 int32_t offset
= textoffset
;
1784 if (getFCD(text
, &offset
, strsrch
->search
->textLength
) >>
1785 SECOND_LAST_BYTE_SHIFT_
) {
1786 return getPreviousBaseOffset(text
, textoffset
);
1793 * Checks match for contraction.
1794 * If the match ends with a partial contraction we fail.
1795 * If the match starts too far off (because of backwards iteration) we try to
1796 * chip off the extra characters
1797 * Internal method, status assumed to be success, caller has to check status
1798 * before calling this method.
1799 * @param strsrch string search data
1800 * @param start offset of potential match, to be modified if necessary
1801 * @param end offset of potential match, to be modified if necessary
1802 * @param status output error status if any
1803 * @return TRUE if match passes the contraction test, FALSE otherwise
1806 UBool
checkNextCanonicalContractionMatch(UStringSearch
*strsrch
,
1811 UCollationElements
*coleiter
= strsrch
->textIter
;
1812 int32_t textlength
= strsrch
->search
->textLength
;
1813 int32_t temp
= *start
;
1814 const UCollator
*collator
= strsrch
->collator
;
1815 const UChar
*text
= strsrch
->search
->text
;
1816 // This part checks if either ends of the match contains potential
1817 // contraction. If so we'll have to iterate through them
1818 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1819 (*start
+ 1 < textlength
1820 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1821 int32_t expansion
= getExpansionPrefix(coleiter
);
1822 UBool expandflag
= expansion
> 0;
1823 setColEIterOffset(coleiter
, *start
);
1824 while (expansion
> 0) {
1825 // getting rid of the redundant ce, caused by setOffset.
1826 // since backward contraction/expansion may have extra ces if we
1827 // are in the normalization buffer, hasAccentsBeforeMatch would
1828 // have taken care of it.
1829 // E.g. the character \u01FA will have an expansion of 3, but if
1830 // we are only looking for acute and ring \u030A and \u0301, we'll
1831 // have to skip the first ce in the expansion buffer.
1832 ucol_next(coleiter
, status
);
1833 if (U_FAILURE(*status
)) {
1836 if (ucol_getOffset(coleiter
) != temp
) {
1838 temp
= ucol_getOffset(coleiter
);
1843 int32_t *patternce
= strsrch
->pattern
.ces
;
1844 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
1846 int32_t textlength
= strsrch
->search
->textLength
;
1847 while (count
< patterncelength
) {
1848 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1849 // status checked below, note that if status is a failure
1850 // ucol_next returns UCOL_NULLORDER
1851 if (ce
== UCOL_IGNORABLE
) {
1854 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1856 temp
= ucol_getOffset(coleiter
);
1859 if (count
== 0 && ce
!= patternce
[0]) {
1860 // accents may have extra starting ces, this occurs when a
1861 // pure accent pattern is matched without rearrangement
1862 // text \u0325\u0300 and looking for \u0300
1863 int32_t expected
= patternce
[0];
1864 if (getFCD(text
, start
, textlength
) & LAST_BYTE_MASK_
) {
1865 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1866 while (U_SUCCESS(*status
) && ce
!= expected
&&
1867 ce
!= UCOL_NULLORDER
&&
1868 ucol_getOffset(coleiter
) <= *end
) {
1869 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1873 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1875 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1885 * Checks and sets the match information if found.
1888 * <li> the potential match does not repeat the previous match
1889 * <li> boundaries are correct
1890 * <li> potential match does not end in the middle of a contraction
1891 * <li> identical matches
1893 * Otherwise the offset will be shifted to the next character.
1894 * Internal method, status assumed to be success, caller has to check the
1895 * status before calling this method.
1896 * @param strsrch string search data
1897 * @param textoffset offset in the collation element text. the returned value
1898 * will be the truncated end offset of the match or the new start
1900 * @param status output error status if any
1901 * @return TRUE if the match is valid, FALSE otherwise
1904 inline UBool
checkNextCanonicalMatch(UStringSearch
*strsrch
,
1905 int32_t *textoffset
,
1908 // to ensure that the start and ends are not composite characters
1909 UCollationElements
*coleiter
= strsrch
->textIter
;
1910 // if we have a canonical accent match
1911 if ((strsrch
->pattern
.hasSuffixAccents
&&
1912 strsrch
->canonicalSuffixAccents
[0]) ||
1913 (strsrch
->pattern
.hasPrefixAccents
&&
1914 strsrch
->canonicalPrefixAccents
[0])) {
1915 strsrch
->search
->matchedIndex
= getPreviousUStringSearchBaseOffset(
1917 ucol_getOffset(coleiter
));
1918 strsrch
->search
->matchedLength
= *textoffset
-
1919 strsrch
->search
->matchedIndex
;
1923 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1924 if (!checkNextCanonicalContractionMatch(strsrch
, &start
, textoffset
,
1925 status
) || U_FAILURE(*status
)) {
1929 start
= getPreviousUStringSearchBaseOffset(strsrch
, start
);
1930 // this totally matches, however we need to check if it is repeating
1931 if (checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1932 !isBreakUnit(strsrch
, start
, *textoffset
) ||
1933 !checkIdentical(strsrch
, start
, *textoffset
)) {
1935 *textoffset
= getNextBaseOffset(strsrch
->search
->text
, *textoffset
,
1936 strsrch
->search
->textLength
);
1940 strsrch
->search
->matchedIndex
= start
;
1941 strsrch
->search
->matchedLength
= *textoffset
- start
;
1946 * Shifting the collation element iterator position forward to prepare for
1947 * a preceding match. If the first character is a unsafe character, we'll only
1948 * shift by 1 to capture contractions, normalization etc.
1949 * Internal method, status assumed to be success, caller has to check status
1950 * before calling this method.
1951 * @param text strsrch string search data
1952 * @param textoffset start text position to do search
1953 * @param ce the text ce which failed the match.
1954 * @param patternceindex index of the ce within the pattern ce buffer which
1956 * @return final offset
1959 inline int32_t reverseShift(UStringSearch
*strsrch
,
1962 int32_t patternceindex
)
1964 if (strsrch
->search
->isOverlap
) {
1965 if (textoffset
!= strsrch
->search
->textLength
) {
1969 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1973 if (ce
!= UCOL_NULLORDER
) {
1974 int32_t shift
= strsrch
->pattern
.backShift
[hash(ce
)];
1976 // this is to adjust for characters in the middle of the substring
1977 // for matching that failed.
1978 int32_t adjust
= patternceindex
;
1979 if (adjust
> 1 && shift
> adjust
) {
1980 shift
-= adjust
- 1;
1982 textoffset
-= shift
;
1985 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1988 textoffset
= getPreviousUStringSearchBaseOffset(strsrch
, textoffset
);
1993 * Checks match for contraction.
1994 * If the match starts with a partial contraction we fail.
1995 * Internal method, status assumed to be success, caller has to check status
1996 * before calling this method.
1997 * @param strsrch string search data
1998 * @param start offset of potential match, to be modified if necessary
1999 * @param end offset of potential match, to be modified if necessary
2000 * @param status output error status if any
2001 * @return TRUE if match passes the contraction test, FALSE otherwise
2004 UBool
checkPreviousExactContractionMatch(UStringSearch
*strsrch
,
2006 int32_t *end
, UErrorCode
*status
)
2008 UCollationElements
*coleiter
= strsrch
->textIter
;
2009 int32_t textlength
= strsrch
->search
->textLength
;
2010 int32_t temp
= *end
;
2011 const UCollator
*collator
= strsrch
->collator
;
2012 const UChar
*text
= strsrch
->search
->text
;
2013 // This part checks if either if the start of the match contains potential
2014 // contraction. If so we'll have to iterate through them
2015 // Since we used ucol_next while previously looking for the potential
2016 // match, this guarantees that our end will not be a partial contraction,
2017 // or a partial supplementary character.
2018 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2019 int32_t expansion
= getExpansionSuffix(coleiter
);
2020 UBool expandflag
= expansion
> 0;
2021 setColEIterOffset(coleiter
, *end
);
2022 while (U_SUCCESS(*status
) && expansion
> 0) {
2023 // getting rid of the redundant ce
2024 // since forward contraction/expansion may have extra ces
2025 // if we are in the normalization buffer, hasAccentsBeforeMatch
2026 // would have taken care of it.
2027 // E.g. the character \u01FA will have an expansion of 3, but if
2028 // we are only looking for A ring A\u030A, we'll have to skip the
2029 // last ce in the expansion buffer
2030 ucol_previous(coleiter
, status
);
2031 if (U_FAILURE(*status
)) {
2034 if (ucol_getOffset(coleiter
) != temp
) {
2036 temp
= ucol_getOffset(coleiter
);
2041 int32_t *patternce
= strsrch
->pattern
.ces
;
2042 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
2043 int32_t count
= patterncelength
;
2045 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2046 // status checked below, note that if status is a failure
2047 // ucol_previous returns UCOL_NULLORDER
2048 if (ce
== UCOL_IGNORABLE
) {
2051 if (expandflag
&& count
== 0 &&
2052 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2054 temp
= ucol_getOffset(coleiter
);
2056 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2058 *start
= getPreviousBaseOffset(text
, *start
);
2068 * Checks and sets the match information if found.
2071 * <li> the current match does not repeat the last match
2072 * <li> boundaries are correct
2073 * <li> exact matches has no extra accents
2074 * <li> identical matches
2076 * Otherwise the offset will be shifted to the preceding character.
2077 * Internal method, status assumed to be success, caller has to check status
2078 * before calling this method.
2079 * @param strsrch string search data
2081 * @param coleiter collation element iterator
2082 * @param text string
2083 * @param textoffset offset in the collation element text. the returned value
2084 * will be the truncated start offset of the match or the new start
2086 * @param status output error status if any
2087 * @return TRUE if the match is valid, FALSE otherwise
2090 inline UBool
checkPreviousExactMatch(UStringSearch
*strsrch
,
2091 int32_t *textoffset
,
2094 // to ensure that the start and ends are not composite characters
2095 int32_t end
= ucol_getOffset(strsrch
->textIter
);
2096 if (!checkPreviousExactContractionMatch(strsrch
, textoffset
, &end
, status
)
2097 || U_FAILURE(*status
)) {
2101 // this totally matches, however we need to check if it is repeating
2103 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2104 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2105 hasAccentsBeforeMatch(strsrch
, *textoffset
, end
) ||
2106 !checkIdentical(strsrch
, *textoffset
, end
) ||
2107 hasAccentsAfterMatch(strsrch
, *textoffset
, end
)) {
2109 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2114 //Add breakiterator boundary check for primary strength search.
2115 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
2116 checkBreakBoundary(strsrch
, textoffset
, &end
);
2119 strsrch
->search
->matchedIndex
= *textoffset
;
2120 strsrch
->search
->matchedLength
= end
- *textoffset
;
2125 * Rearranges the end accents to try matching.
2126 * Suffix accents in the text will be grouped according to their combining
2127 * class and the groups will be mixed and matched to try find the perfect
2128 * match with the pattern.
2129 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2130 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2131 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2133 * step 2: check if any of the generated substrings matches the pattern.
2134 * Internal method, status assumed to be success, user has to check status
2135 * before calling this method.
2136 * @param strsrch string search match
2137 * @param start offset of the first base character
2138 * @param end start of the last accent set
2139 * @param status only error status if any
2140 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2141 * offset of the match. Note this start includes all following accents.
2144 int32_t doPreviousCanonicalSuffixMatch(UStringSearch
*strsrch
,
2149 const UChar
*text
= strsrch
->search
->text
;
2150 int32_t tempend
= end
;
2152 U16_BACK_1(text
, 0, tempend
);
2153 if (!(getFCD(text
, &tempend
, strsrch
->search
->textLength
) &
2155 // die... failed at a base character
2156 return USEARCH_DONE
;
2158 end
= getNextBaseOffset(text
, end
, strsrch
->search
->textLength
);
2160 if (U_SUCCESS(*status
)) {
2161 UChar accents
[INITIAL_ARRAY_SIZE_
];
2162 int32_t offset
= getPreviousBaseOffset(text
, end
);
2163 // normalizing the offensive string
2164 unorm_normalize(text
+ offset
, end
- offset
, UNORM_NFD
, 0, accents
,
2165 INITIAL_ARRAY_SIZE_
, status
);
2167 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2168 int32_t accentsize
= getUnblockedAccentIndex(accents
,
2170 int32_t count
= (2 << (accentsize
- 1)) - 1;
2171 UChar buffer
[INITIAL_ARRAY_SIZE_
];
2172 UCollationElements
*coleiter
= strsrch
->utilIter
;
2173 while (U_SUCCESS(*status
) && count
> 0) {
2174 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
2175 // copy the base characters
2176 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2177 *rearrange
++ = accents
[k
];
2179 // forming all possible canonical rearrangement by dropping
2181 for (int i
= 0; i
<= accentsize
- 1; i
++) {
2182 int32_t mask
= 1 << (accentsize
- i
- 1);
2184 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2185 *rearrange
++ = accents
[j
];
2190 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
2191 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
2192 strsrch
->canonicalPrefixAccents
,
2193 strsrch
->search
->text
+ start
,
2195 strsrch
->canonicalSuffixAccents
,
2198 // run the collator iterator through this match
2199 // if status is a failure ucol_setText does nothing
2200 ucol_setText(coleiter
, match
, matchsize
, status
);
2201 if (U_SUCCESS(*status
)) {
2202 if (checkCollationMatch(strsrch
, coleiter
)) {
2203 if (match
!= buffer
) {
2212 return USEARCH_DONE
;
2216 * Take the rearranged start accents and tries matching. If match failed at
2217 * a seperate following set of accents (seperated from the rearranged on by
2218 * at least a base character) then we rearrange the preceding accents and
2219 * tries matching again.
2220 * We allow skipping of the ends of the accent set if the ces do not match.
2221 * However if the failure is found before the accent set, it fails.
2222 * Internal method, status assumed to be success, caller has to check status
2223 * before calling this method.
2224 * @param strsrch string search data
2225 * @param textoffset of the ends of the rearranged accent
2226 * @param status output error status if any
2227 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2228 * offset of the match. Note this start includes all following accents.
2231 int32_t doPreviousCanonicalPrefixMatch(UStringSearch
*strsrch
,
2235 const UChar
*text
= strsrch
->search
->text
;
2236 const UCollator
*collator
= strsrch
->collator
;
2237 int32_t safelength
= 0;
2239 int32_t safetextlength
;
2240 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
2241 int32_t safeoffset
= textoffset
;
2244 ucol_unsafeCP(strsrch
->canonicalPrefixAccents
[
2245 u_strlen(strsrch
->canonicalPrefixAccents
) - 1
2247 safeoffset
= getNextSafeOffset(collator
, text
, textoffset
,
2248 strsrch
->search
->textLength
);
2249 safelength
= safeoffset
- textoffset
;
2250 safetextlength
= INITIAL_ARRAY_SIZE_
;
2251 safetext
= addToUCharArray(safebuffer
, &safetextlength
,
2252 strsrch
->canonicalPrefixAccents
,
2253 text
+ textoffset
, safelength
,
2257 safetextlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2258 safetext
= strsrch
->canonicalPrefixAccents
;
2261 UCollationElements
*coleiter
= strsrch
->utilIter
;
2262 // if status is a failure, ucol_setText does nothing
2263 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
2264 // status checked in loop below
2266 int32_t *ce
= strsrch
->pattern
.ces
;
2267 int32_t celength
= strsrch
->pattern
.cesLength
;
2269 UBool isSafe
= TRUE
; // safe zone indication flag for position
2270 int32_t prefixlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2272 while (ceindex
< celength
) {
2273 int32_t textce
= ucol_next(coleiter
, status
);
2274 if (U_FAILURE(*status
)) {
2276 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2278 return USEARCH_DONE
;
2280 if (textce
== UCOL_NULLORDER
) {
2281 // check if we have passed the safe buffer
2282 if (coleiter
== strsrch
->textIter
) {
2283 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2284 return USEARCH_DONE
;
2286 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2287 safetext
= safebuffer
;
2288 coleiter
= strsrch
->textIter
;
2289 setColEIterOffset(coleiter
, safeoffset
);
2290 // status checked at the start of the loop
2294 textce
= getCE(strsrch
, textce
);
2295 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
2296 // do the beginning stuff
2297 int32_t failedoffset
= ucol_getOffset(coleiter
);
2298 if (isSafe
&& failedoffset
<= prefixlength
) {
2299 // alas... no hope. failed at rearranged accent set
2300 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2301 return USEARCH_DONE
;
2305 failedoffset
= safeoffset
- failedoffset
;
2306 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2309 // try rearranging the end accents
2310 int32_t result
= doPreviousCanonicalSuffixMatch(strsrch
,
2311 textoffset
, failedoffset
, status
);
2312 if (result
!= USEARCH_DONE
) {
2313 // if status is a failure, ucol_setOffset does nothing
2314 setColEIterOffset(strsrch
->textIter
, result
);
2316 if (U_FAILURE(*status
)) {
2317 return USEARCH_DONE
;
2322 if (textce
== ce
[ceindex
]) {
2328 int32_t result
= ucol_getOffset(coleiter
);
2329 // sets the text iterator here with the correct expansion and offset
2330 int32_t leftoverces
= getExpansionSuffix(coleiter
);
2331 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2332 if (result
<= prefixlength
) {
2333 result
= textoffset
;
2336 result
= textoffset
+ (safeoffset
- result
);
2338 setColEIterOffset(strsrch
->textIter
, result
);
2339 setExpansionSuffix(strsrch
->textIter
, leftoverces
);
2343 return ucol_getOffset(coleiter
);
2347 * Trying out the substring and sees if it can be a canonical match.
2348 * This will try normalizing the starting accents and arranging them into
2349 * canonical equivalents and check their corresponding ces with the pattern ce.
2350 * Prefix accents in the text will be grouped according to their combining
2351 * class and the groups will be mixed and matched to try find the perfect
2352 * match with the pattern.
2353 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2354 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2355 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2357 * step 2: check if any of the generated substrings matches the pattern.
2358 * Internal method, status assumed to be success, caller has to check status
2359 * before calling this method.
2360 * @param strsrch string search data
2361 * @param textoffset start offset in the collation element text that starts
2362 * with the accents to be rearranged
2363 * @param status output error status if any
2364 * @return TRUE if the match is valid, FALSE otherwise
2367 UBool
doPreviousCanonicalMatch(UStringSearch
*strsrch
,
2371 const UChar
*text
= strsrch
->search
->text
;
2372 int32_t temp
= textoffset
;
2373 int32_t textlength
= strsrch
->search
->textLength
;
2374 if ((getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) == 0) {
2375 UCollationElements
*coleiter
= strsrch
->textIter
;
2376 int32_t offset
= ucol_getOffset(coleiter
);
2377 if (strsrch
->pattern
.hasSuffixAccents
) {
2378 offset
= doPreviousCanonicalSuffixMatch(strsrch
, textoffset
,
2380 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
2381 setColEIterOffset(coleiter
, offset
);
2388 if (!strsrch
->pattern
.hasPrefixAccents
) {
2392 UChar accents
[INITIAL_ARRAY_SIZE_
];
2393 // offset to the last base character in substring to search
2394 int32_t baseoffset
= getNextBaseOffset(text
, textoffset
, textlength
);
2395 // normalizing the offensive string
2396 unorm_normalize(text
+ textoffset
, baseoffset
- textoffset
, UNORM_NFD
,
2397 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
2398 // status checked in loop
2400 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2401 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
2403 // 2 power n - 1 plus the full set of accents
2404 int32_t count
= (2 << (size
- 1)) - 1;
2405 while (U_SUCCESS(*status
) && count
> 0) {
2406 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
2407 // copy the base characters
2408 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2409 *rearrange
++ = accents
[k
];
2411 // forming all possible canonical rearrangement by dropping
2413 for (int i
= 0; i
<= size
- 1; i
++) {
2414 int32_t mask
= 1 << (size
- i
- 1);
2416 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2417 *rearrange
++ = accents
[j
];
2422 int32_t offset
= doPreviousCanonicalPrefixMatch(strsrch
,
2423 baseoffset
, status
);
2424 if (offset
!= USEARCH_DONE
) {
2425 return TRUE
; // match found
2433 * Checks match for contraction.
2434 * If the match starts with a partial contraction we fail.
2435 * Internal method, status assumed to be success, caller has to check status
2436 * before calling this method.
2437 * @param strsrch string search data
2438 * @param start offset of potential match, to be modified if necessary
2439 * @param end offset of potential match, to be modified if necessary
2440 * @param status only error status if any
2441 * @return TRUE if match passes the contraction test, FALSE otherwise
2444 UBool
checkPreviousCanonicalContractionMatch(UStringSearch
*strsrch
,
2446 int32_t *end
, UErrorCode
*status
)
2448 UCollationElements
*coleiter
= strsrch
->textIter
;
2449 int32_t textlength
= strsrch
->search
->textLength
;
2450 int32_t temp
= *end
;
2451 const UCollator
*collator
= strsrch
->collator
;
2452 const UChar
*text
= strsrch
->search
->text
;
2453 // This part checks if either if the start of the match contains potential
2454 // contraction. If so we'll have to iterate through them
2455 // Since we used ucol_next while previously looking for the potential
2456 // match, this guarantees that our end will not be a partial contraction,
2457 // or a partial supplementary character.
2458 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2459 int32_t expansion
= getExpansionSuffix(coleiter
);
2460 UBool expandflag
= expansion
> 0;
2461 setColEIterOffset(coleiter
, *end
);
2462 while (expansion
> 0) {
2463 // getting rid of the redundant ce
2464 // since forward contraction/expansion may have extra ces
2465 // if we are in the normalization buffer, hasAccentsBeforeMatch
2466 // would have taken care of it.
2467 // E.g. the character \u01FA will have an expansion of 3, but if
2468 // we are only looking for A ring A\u030A, we'll have to skip the
2469 // last ce in the expansion buffer
2470 ucol_previous(coleiter
, status
);
2471 if (U_FAILURE(*status
)) {
2474 if (ucol_getOffset(coleiter
) != temp
) {
2476 temp
= ucol_getOffset(coleiter
);
2481 int32_t *patternce
= strsrch
->pattern
.ces
;
2482 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
2483 int32_t count
= patterncelength
;
2485 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2486 // status checked below, note that if status is a failure
2487 // ucol_previous returns UCOL_NULLORDER
2488 if (ce
== UCOL_IGNORABLE
) {
2491 if (expandflag
&& count
== 0 &&
2492 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2494 temp
= ucol_getOffset(coleiter
);
2496 if (count
== patterncelength
&&
2497 ce
!= patternce
[patterncelength
- 1]) {
2498 // accents may have extra starting ces, this occurs when a
2499 // pure accent pattern is matched without rearrangement
2500 int32_t expected
= patternce
[patterncelength
- 1];
2501 U16_BACK_1(text
, 0, *end
);
2502 if (getFCD(text
, end
, textlength
) & LAST_BYTE_MASK_
) {
2503 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2504 while (U_SUCCESS(*status
) && ce
!= expected
&&
2505 ce
!= UCOL_NULLORDER
&&
2506 ucol_getOffset(coleiter
) <= *start
) {
2507 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2511 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2513 *start
= getPreviousBaseOffset(text
, *start
);
2523 * Checks and sets the match information if found.
2526 * <li> the potential match does not repeat the previous match
2527 * <li> boundaries are correct
2528 * <li> potential match does not end in the middle of a contraction
2529 * <li> identical matches
2531 * Otherwise the offset will be shifted to the next character.
2532 * Internal method, status assumed to be success, caller has to check status
2533 * before calling this method.
2534 * @param strsrch string search data
2535 * @param textoffset offset in the collation element text. the returned value
2536 * will be the truncated start offset of the match or the new start
2538 * @param status only error status if any
2539 * @return TRUE if the match is valid, FALSE otherwise
2542 inline UBool
checkPreviousCanonicalMatch(UStringSearch
*strsrch
,
2543 int32_t *textoffset
,
2546 // to ensure that the start and ends are not composite characters
2547 UCollationElements
*coleiter
= strsrch
->textIter
;
2548 // if we have a canonical accent match
2549 if ((strsrch
->pattern
.hasSuffixAccents
&&
2550 strsrch
->canonicalSuffixAccents
[0]) ||
2551 (strsrch
->pattern
.hasPrefixAccents
&&
2552 strsrch
->canonicalPrefixAccents
[0])) {
2553 strsrch
->search
->matchedIndex
= *textoffset
;
2554 strsrch
->search
->matchedLength
=
2555 getNextUStringSearchBaseOffset(strsrch
,
2556 getColElemIterOffset(coleiter
, FALSE
))
2561 int32_t end
= ucol_getOffset(coleiter
);
2562 if (!checkPreviousCanonicalContractionMatch(strsrch
, textoffset
, &end
,
2564 U_FAILURE(*status
)) {
2568 end
= getNextUStringSearchBaseOffset(strsrch
, end
);
2569 // this totally matches, however we need to check if it is repeating
2570 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2571 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2572 !checkIdentical(strsrch
, *textoffset
, end
)) {
2574 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2579 strsrch
->search
->matchedIndex
= *textoffset
;
2580 strsrch
->search
->matchedLength
= end
- *textoffset
;
2583 #endif // #if BOYER_MOORE
2585 // constructors and destructor -------------------------------------------
2587 U_CAPI UStringSearch
* U_EXPORT2
usearch_open(const UChar
*pattern
,
2588 int32_t patternlength
,
2592 UBreakIterator
*breakiter
,
2595 if (U_FAILURE(*status
)) {
2598 #if UCONFIG_NO_BREAK_ITERATION
2599 if (breakiter
!= NULL
) {
2600 *status
= U_UNSUPPORTED_ERROR
;
2605 // ucol_open internally checks for status
2606 UCollator
*collator
= ucol_open(locale
, status
);
2607 // pattern, text checks are done in usearch_openFromCollator
2608 UStringSearch
*result
= usearch_openFromCollator(pattern
,
2609 patternlength
, text
, textlength
,
2610 collator
, breakiter
, status
);
2612 if (result
== NULL
|| U_FAILURE(*status
)) {
2614 ucol_close(collator
);
2619 result
->ownCollator
= TRUE
;
2623 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2627 U_CAPI UStringSearch
* U_EXPORT2
usearch_openFromCollator(
2628 const UChar
*pattern
,
2629 int32_t patternlength
,
2632 const UCollator
*collator
,
2633 UBreakIterator
*breakiter
,
2636 if (U_FAILURE(*status
)) {
2639 #if UCONFIG_NO_BREAK_ITERATION
2640 if (breakiter
!= NULL
) {
2641 *status
= U_UNSUPPORTED_ERROR
;
2645 if (pattern
== NULL
|| text
== NULL
|| collator
== NULL
) {
2646 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2650 // string search does not really work when numeric collation is turned on
2651 if(ucol_getAttribute(collator
, UCOL_NUMERIC_COLLATION
, status
) == UCOL_ON
) {
2652 *status
= U_UNSUPPORTED_ERROR
;
2656 if (U_SUCCESS(*status
)) {
2657 initializeFCD(status
);
2658 if (U_FAILURE(*status
)) {
2662 UStringSearch
*result
;
2663 if (textlength
== -1) {
2664 textlength
= u_strlen(text
);
2666 if (patternlength
== -1) {
2667 patternlength
= u_strlen(pattern
);
2669 if (textlength
<= 0 || patternlength
<= 0) {
2670 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2674 result
= (UStringSearch
*)uprv_malloc(sizeof(UStringSearch
));
2675 if (result
== NULL
) {
2676 *status
= U_MEMORY_ALLOCATION_ERROR
;
2680 result
->collator
= collator
;
2681 result
->strength
= ucol_getStrength(collator
);
2682 result
->ceMask
= getMask(result
->strength
);
2684 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2686 result
->variableTop
= ucol_getVariableTop(collator
, status
);
2688 result
->nfd
= Normalizer2::getNFDInstance(*status
);
2690 if (U_FAILURE(*status
)) {
2695 result
->search
= (USearch
*)uprv_malloc(sizeof(USearch
));
2696 if (result
->search
== NULL
) {
2697 *status
= U_MEMORY_ALLOCATION_ERROR
;
2702 result
->search
->text
= text
;
2703 result
->search
->textLength
= textlength
;
2705 result
->pattern
.text
= pattern
;
2706 result
->pattern
.textLength
= patternlength
;
2707 result
->pattern
.ces
= NULL
;
2708 result
->pattern
.pces
= NULL
;
2710 result
->search
->breakIter
= breakiter
;
2711 #if !UCONFIG_NO_BREAK_ITERATION
2712 result
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(result
->collator
, ULOC_VALID_LOCALE
, status
), text
, textlength
, status
);
2714 ubrk_setText(breakiter
, text
, textlength
, status
);
2718 result
->ownCollator
= FALSE
;
2719 result
->search
->matchedLength
= 0;
2720 result
->search
->matchedIndex
= USEARCH_DONE
;
2721 result
->utilIter
= NULL
;
2722 result
->textIter
= ucol_openElements(collator
, text
,
2723 textlength
, status
);
2724 result
->textProcessedIter
= NULL
;
2725 if (U_FAILURE(*status
)) {
2726 usearch_close(result
);
2730 result
->search
->isOverlap
= FALSE
;
2731 result
->search
->isCanonicalMatch
= FALSE
;
2732 result
->search
->elementComparisonType
= 0;
2733 result
->search
->isForwardSearching
= TRUE
;
2734 result
->search
->reset
= TRUE
;
2736 initialize(result
, status
);
2738 if (U_FAILURE(*status
)) {
2739 usearch_close(result
);
2748 U_CAPI
void U_EXPORT2
usearch_close(UStringSearch
*strsrch
)
2751 if (strsrch
->pattern
.ces
!= strsrch
->pattern
.cesBuffer
&&
2752 strsrch
->pattern
.ces
) {
2753 uprv_free(strsrch
->pattern
.ces
);
2756 if (strsrch
->pattern
.pces
!= NULL
&&
2757 strsrch
->pattern
.pces
!= strsrch
->pattern
.pcesBuffer
) {
2758 uprv_free(strsrch
->pattern
.pces
);
2761 delete strsrch
->textProcessedIter
;
2762 ucol_closeElements(strsrch
->textIter
);
2763 ucol_closeElements(strsrch
->utilIter
);
2765 if (strsrch
->ownCollator
&& strsrch
->collator
) {
2766 ucol_close((UCollator
*)strsrch
->collator
);
2769 #if !UCONFIG_NO_BREAK_ITERATION
2770 if (strsrch
->search
->internalBreakIter
) {
2771 ubrk_close(strsrch
->search
->internalBreakIter
);
2775 uprv_free(strsrch
->search
);
2782 UBool
initTextProcessedIter(UStringSearch
*strsrch
, UErrorCode
*status
) {
2783 if (U_FAILURE(*status
)) { return FALSE
; }
2784 if (strsrch
->textProcessedIter
== NULL
) {
2785 strsrch
->textProcessedIter
= new icu::UCollationPCE(strsrch
->textIter
);
2786 if (strsrch
->textProcessedIter
== NULL
) {
2787 *status
= U_MEMORY_ALLOCATION_ERROR
;
2791 strsrch
->textProcessedIter
->init(strsrch
->textIter
);
2798 // set and get methods --------------------------------------------------
2800 U_CAPI
void U_EXPORT2
usearch_setOffset(UStringSearch
*strsrch
,
2804 if (U_SUCCESS(*status
) && strsrch
) {
2805 if (isOutOfBounds(strsrch
->search
->textLength
, position
)) {
2806 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2809 setColEIterOffset(strsrch
->textIter
, position
);
2811 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2812 strsrch
->search
->matchedLength
= 0;
2813 strsrch
->search
->reset
= FALSE
;
2817 U_CAPI
int32_t U_EXPORT2
usearch_getOffset(const UStringSearch
*strsrch
)
2820 int32_t result
= ucol_getOffset(strsrch
->textIter
);
2821 if (isOutOfBounds(strsrch
->search
->textLength
, result
)) {
2822 return USEARCH_DONE
;
2826 return USEARCH_DONE
;
2829 U_CAPI
void U_EXPORT2
usearch_setAttribute(UStringSearch
*strsrch
,
2830 USearchAttribute attribute
,
2831 USearchAttributeValue value
,
2834 if (U_SUCCESS(*status
) && strsrch
) {
2837 case USEARCH_OVERLAP
:
2838 strsrch
->search
->isOverlap
= (value
== USEARCH_ON
? TRUE
: FALSE
);
2840 case USEARCH_CANONICAL_MATCH
:
2841 strsrch
->search
->isCanonicalMatch
= (value
== USEARCH_ON
? TRUE
:
2844 case USEARCH_ELEMENT_COMPARISON
:
2845 if (value
== USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD
|| value
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
2846 strsrch
->search
->elementComparisonType
= (int16_t)value
;
2848 strsrch
->search
->elementComparisonType
= 0;
2851 case USEARCH_ATTRIBUTE_COUNT
:
2853 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2856 if (value
== USEARCH_ATTRIBUTE_VALUE_COUNT
) {
2857 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2861 U_CAPI USearchAttributeValue U_EXPORT2
usearch_getAttribute(
2862 const UStringSearch
*strsrch
,
2863 USearchAttribute attribute
)
2866 switch (attribute
) {
2867 case USEARCH_OVERLAP
:
2868 return (strsrch
->search
->isOverlap
== TRUE
? USEARCH_ON
:
2870 case USEARCH_CANONICAL_MATCH
:
2871 return (strsrch
->search
->isCanonicalMatch
== TRUE
? USEARCH_ON
:
2873 case USEARCH_ELEMENT_COMPARISON
:
2875 int16_t value
= strsrch
->search
->elementComparisonType
;
2876 if (value
== USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD
|| value
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
2877 return (USearchAttributeValue
)value
;
2879 return USEARCH_STANDARD_ELEMENT_COMPARISON
;
2882 case USEARCH_ATTRIBUTE_COUNT
:
2883 return USEARCH_DEFAULT
;
2886 return USEARCH_DEFAULT
;
2889 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedStart(
2890 const UStringSearch
*strsrch
)
2892 if (strsrch
== NULL
) {
2893 return USEARCH_DONE
;
2895 return strsrch
->search
->matchedIndex
;
2899 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedText(const UStringSearch
*strsrch
,
2901 int32_t resultCapacity
,
2904 if (U_FAILURE(*status
)) {
2905 return USEARCH_DONE
;
2907 if (strsrch
== NULL
|| resultCapacity
< 0 || (resultCapacity
> 0 &&
2909 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2910 return USEARCH_DONE
;
2913 int32_t copylength
= strsrch
->search
->matchedLength
;
2914 int32_t copyindex
= strsrch
->search
->matchedIndex
;
2915 if (copyindex
== USEARCH_DONE
) {
2916 u_terminateUChars(result
, resultCapacity
, 0, status
);
2917 return USEARCH_DONE
;
2920 if (resultCapacity
< copylength
) {
2921 copylength
= resultCapacity
;
2923 if (copylength
> 0) {
2924 uprv_memcpy(result
, strsrch
->search
->text
+ copyindex
,
2925 copylength
* sizeof(UChar
));
2927 return u_terminateUChars(result
, resultCapacity
,
2928 strsrch
->search
->matchedLength
, status
);
2931 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedLength(
2932 const UStringSearch
*strsrch
)
2935 return strsrch
->search
->matchedLength
;
2937 return USEARCH_DONE
;
2940 #if !UCONFIG_NO_BREAK_ITERATION
2942 U_CAPI
void U_EXPORT2
usearch_setBreakIterator(UStringSearch
*strsrch
,
2943 UBreakIterator
*breakiter
,
2946 if (U_SUCCESS(*status
) && strsrch
) {
2947 strsrch
->search
->breakIter
= breakiter
;
2949 ubrk_setText(breakiter
, strsrch
->search
->text
,
2950 strsrch
->search
->textLength
, status
);
2955 U_CAPI
const UBreakIterator
* U_EXPORT2
2956 usearch_getBreakIterator(const UStringSearch
*strsrch
)
2959 return strsrch
->search
->breakIter
;
2966 U_CAPI
void U_EXPORT2
usearch_setText( UStringSearch
*strsrch
,
2971 if (U_SUCCESS(*status
)) {
2972 if (strsrch
== NULL
|| text
== NULL
|| textlength
< -1 ||
2974 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2977 if (textlength
== -1) {
2978 textlength
= u_strlen(text
);
2980 strsrch
->search
->text
= text
;
2981 strsrch
->search
->textLength
= textlength
;
2982 ucol_setText(strsrch
->textIter
, text
, textlength
, status
);
2983 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2984 strsrch
->search
->matchedLength
= 0;
2985 strsrch
->search
->reset
= TRUE
;
2986 #if !UCONFIG_NO_BREAK_ITERATION
2987 if (strsrch
->search
->breakIter
!= NULL
) {
2988 ubrk_setText(strsrch
->search
->breakIter
, text
,
2989 textlength
, status
);
2991 ubrk_setText(strsrch
->search
->internalBreakIter
, text
, textlength
, status
);
2997 U_CAPI
const UChar
* U_EXPORT2
usearch_getText(const UStringSearch
*strsrch
,
3001 *length
= strsrch
->search
->textLength
;
3002 return strsrch
->search
->text
;
3007 U_CAPI
void U_EXPORT2
usearch_setCollator( UStringSearch
*strsrch
,
3008 const UCollator
*collator
,
3011 if (U_SUCCESS(*status
)) {
3012 if (collator
== NULL
) {
3013 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3018 delete strsrch
->textProcessedIter
;
3019 strsrch
->textProcessedIter
= NULL
;
3020 ucol_closeElements(strsrch
->textIter
);
3021 ucol_closeElements(strsrch
->utilIter
);
3022 strsrch
->textIter
= strsrch
->utilIter
= NULL
;
3023 if (strsrch
->ownCollator
&& (strsrch
->collator
!= collator
)) {
3024 ucol_close((UCollator
*)strsrch
->collator
);
3025 strsrch
->ownCollator
= FALSE
;
3027 strsrch
->collator
= collator
;
3028 strsrch
->strength
= ucol_getStrength(collator
);
3029 strsrch
->ceMask
= getMask(strsrch
->strength
);
3030 #if !UCONFIG_NO_BREAK_ITERATION
3031 ubrk_close(strsrch
->search
->internalBreakIter
);
3032 strsrch
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(collator
, ULOC_VALID_LOCALE
, status
),
3033 strsrch
->search
->text
, strsrch
->search
->textLength
, status
);
3035 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3037 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
3039 // if status is a failure, ucol_getVariableTop returns 0
3040 strsrch
->variableTop
= ucol_getVariableTop(collator
, status
);
3041 strsrch
->textIter
= ucol_openElements(collator
,
3042 strsrch
->search
->text
,
3043 strsrch
->search
->textLength
,
3045 strsrch
->utilIter
= ucol_openElements(
3046 collator
, strsrch
->pattern
.text
, strsrch
->pattern
.textLength
, status
);
3047 // initialize() _after_ setting the iterators for the new collator.
3048 initialize(strsrch
, status
);
3051 // **** are these calls needed?
3052 // **** we call uprv_init_pce in initializePatternPCETable
3053 // **** and the CEIBuffer constructor...
3055 uprv_init_pce(strsrch
->textIter
);
3056 uprv_init_pce(strsrch
->utilIter
);
3061 U_CAPI UCollator
* U_EXPORT2
usearch_getCollator(const UStringSearch
*strsrch
)
3064 return (UCollator
*)strsrch
->collator
;
3069 U_CAPI
void U_EXPORT2
usearch_setPattern( UStringSearch
*strsrch
,
3070 const UChar
*pattern
,
3071 int32_t patternlength
,
3074 if (U_SUCCESS(*status
)) {
3075 if (strsrch
== NULL
|| pattern
== NULL
) {
3076 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3079 if (patternlength
== -1) {
3080 patternlength
= u_strlen(pattern
);
3082 if (patternlength
== 0) {
3083 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3086 strsrch
->pattern
.text
= pattern
;
3087 strsrch
->pattern
.textLength
= patternlength
;
3088 initialize(strsrch
, status
);
3093 U_CAPI
const UChar
* U_EXPORT2
3094 usearch_getPattern(const UStringSearch
*strsrch
,
3098 *length
= strsrch
->pattern
.textLength
;
3099 return strsrch
->pattern
.text
;
3104 // miscellanous methods --------------------------------------------------
3106 U_CAPI
int32_t U_EXPORT2
usearch_first(UStringSearch
*strsrch
,
3109 if (strsrch
&& U_SUCCESS(*status
)) {
3110 strsrch
->search
->isForwardSearching
= TRUE
;
3111 usearch_setOffset(strsrch
, 0, status
);
3112 if (U_SUCCESS(*status
)) {
3113 return usearch_next(strsrch
, status
);
3116 return USEARCH_DONE
;
3119 U_CAPI
int32_t U_EXPORT2
usearch_following(UStringSearch
*strsrch
,
3123 if (strsrch
&& U_SUCCESS(*status
)) {
3124 strsrch
->search
->isForwardSearching
= TRUE
;
3125 // position checked in usearch_setOffset
3126 usearch_setOffset(strsrch
, position
, status
);
3127 if (U_SUCCESS(*status
)) {
3128 return usearch_next(strsrch
, status
);
3131 return USEARCH_DONE
;
3134 U_CAPI
int32_t U_EXPORT2
usearch_last(UStringSearch
*strsrch
,
3137 if (strsrch
&& U_SUCCESS(*status
)) {
3138 strsrch
->search
->isForwardSearching
= FALSE
;
3139 usearch_setOffset(strsrch
, strsrch
->search
->textLength
, status
);
3140 if (U_SUCCESS(*status
)) {
3141 return usearch_previous(strsrch
, status
);
3144 return USEARCH_DONE
;
3147 U_CAPI
int32_t U_EXPORT2
usearch_preceding(UStringSearch
*strsrch
,
3151 if (strsrch
&& U_SUCCESS(*status
)) {
3152 strsrch
->search
->isForwardSearching
= FALSE
;
3153 // position checked in usearch_setOffset
3154 usearch_setOffset(strsrch
, position
, status
);
3155 if (U_SUCCESS(*status
)) {
3156 return usearch_previous(strsrch
, status
);
3159 return USEARCH_DONE
;
3163 * If a direction switch is required, we'll count the number of ces till the
3164 * beginning of the collation element iterator and iterate forwards that
3165 * number of times. This is so that we get to the correct point within the
3166 * string to continue the search in. Imagine when we are in the middle of the
3167 * normalization buffer when the change in direction is request. arrrgghh....
3168 * After searching the offset within the collation element iterator will be
3169 * shifted to the start of the match. If a match is not found, the offset would
3170 * have been set to the end of the text string in the collation element
3172 * Okay, here's my take on normalization buffer. The only time when there can
3173 * be 2 matches within the same normalization is when the pattern is consists
3174 * of all accents. But since the offset returned is from the text string, we
3175 * should not confuse the caller by returning the second match within the
3176 * same normalization buffer. If we do, the 2 results will have the same match
3177 * offsets, and that'll be confusing. I'll return the next match that doesn't
3178 * fall within the same normalization buffer. Note this does not affect the
3179 * results of matches spanning the text and the normalization buffer.
3180 * The position to start searching is taken from the collation element
3181 * iterator. Callers of this API would have to set the offset in the collation
3182 * element iterator before using this method.
3184 U_CAPI
int32_t U_EXPORT2
usearch_next(UStringSearch
*strsrch
,
3187 if (U_SUCCESS(*status
) && strsrch
) {
3188 // note offset is either equivalent to the start of the previous match
3189 // or is set by the user
3190 int32_t offset
= usearch_getOffset(strsrch
);
3191 USearch
*search
= strsrch
->search
;
3192 search
->reset
= FALSE
;
3193 int32_t textlength
= search
->textLength
;
3194 if (search
->isForwardSearching
) {
3196 if (offset
== textlength
3197 || (!search
->isOverlap
&&
3198 (offset
+ strsrch
->pattern
.defaultShiftSize
> textlength
||
3199 (search
->matchedIndex
!= USEARCH_DONE
&&
3200 offset
+ search
->matchedLength
>= textlength
)))) {
3201 // not enough characters to match
3202 setMatchNotFound(strsrch
);
3203 return USEARCH_DONE
;
3206 if (offset
== textlength
||
3207 (! search
->isOverlap
&&
3208 (search
->matchedIndex
!= USEARCH_DONE
&&
3209 offset
+ search
->matchedLength
> textlength
))) {
3210 // not enough characters to match
3211 setMatchNotFound(strsrch
);
3212 return USEARCH_DONE
;
3217 // switching direction.
3218 // if matchedIndex == USEARCH_DONE, it means that either a
3219 // setOffset has been called or that previous ran off the text
3220 // string. the iterator would have been set to offset 0 if a
3221 // match is not found.
3222 search
->isForwardSearching
= TRUE
;
3223 if (search
->matchedIndex
!= USEARCH_DONE
) {
3224 // there's no need to set the collation element iterator
3225 // the next call to next will set the offset.
3226 return search
->matchedIndex
;
3230 if (U_SUCCESS(*status
)) {
3231 if (strsrch
->pattern
.cesLength
== 0) {
3232 if (search
->matchedIndex
== USEARCH_DONE
) {
3233 search
->matchedIndex
= offset
;
3235 else { // moves by codepoints
3236 U16_FWD_1(search
->text
, search
->matchedIndex
, textlength
);
3239 search
->matchedLength
= 0;
3240 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3241 // status checked below
3242 if (search
->matchedIndex
== textlength
) {
3243 search
->matchedIndex
= USEARCH_DONE
;
3247 if (search
->matchedLength
> 0) {
3248 // if matchlength is 0 we are at the start of the iteration
3249 if (search
->isOverlap
) {
3250 ucol_setOffset(strsrch
->textIter
, offset
+ 1, status
);
3253 ucol_setOffset(strsrch
->textIter
,
3254 offset
+ search
->matchedLength
, status
);
3258 // for boundary check purposes. this will ensure that the
3259 // next match will not preceed the current offset
3260 // note search->matchedIndex will always be set to something
3262 search
->matchedIndex
= offset
- 1;
3265 if (search
->isCanonicalMatch
) {
3266 // can't use exact here since extra accents are allowed.
3267 usearch_handleNextCanonical(strsrch
, status
);
3270 usearch_handleNextExact(strsrch
, status
);
3274 if (U_FAILURE(*status
)) {
3275 return USEARCH_DONE
;
3279 if (search
->matchedIndex
== USEARCH_DONE
) {
3280 ucol_setOffset(strsrch
->textIter
, search
->textLength
, status
);
3282 ucol_setOffset(strsrch
->textIter
, search
->matchedIndex
, status
);
3286 return search
->matchedIndex
;
3289 return USEARCH_DONE
;
3292 U_CAPI
int32_t U_EXPORT2
usearch_previous(UStringSearch
*strsrch
,
3295 if (U_SUCCESS(*status
) && strsrch
) {
3297 USearch
*search
= strsrch
->search
;
3298 if (search
->reset
) {
3299 offset
= search
->textLength
;
3300 search
->isForwardSearching
= FALSE
;
3301 search
->reset
= FALSE
;
3302 setColEIterOffset(strsrch
->textIter
, offset
);
3305 offset
= usearch_getOffset(strsrch
);
3308 int32_t matchedindex
= search
->matchedIndex
;
3309 if (search
->isForwardSearching
== TRUE
) {
3310 // switching direction.
3311 // if matchedIndex == USEARCH_DONE, it means that either a
3312 // setOffset has been called or that next ran off the text
3313 // string. the iterator would have been set to offset textLength if
3314 // a match is not found.
3315 search
->isForwardSearching
= FALSE
;
3316 if (matchedindex
!= USEARCH_DONE
) {
3317 return matchedindex
;
3322 if (offset
== 0 || matchedindex
== 0 ||
3323 (!search
->isOverlap
&&
3324 (offset
< strsrch
->pattern
.defaultShiftSize
||
3325 (matchedindex
!= USEARCH_DONE
&&
3326 matchedindex
< strsrch
->pattern
.defaultShiftSize
)))) {
3327 // not enough characters to match
3328 setMatchNotFound(strsrch
);
3329 return USEARCH_DONE
;
3332 // Could check pattern length, but the
3333 // linear search will do the right thing
3334 if (offset
== 0 || matchedindex
== 0) {
3335 setMatchNotFound(strsrch
);
3336 return USEARCH_DONE
;
3341 if (U_SUCCESS(*status
)) {
3342 if (strsrch
->pattern
.cesLength
== 0) {
3343 search
->matchedIndex
=
3344 (matchedindex
== USEARCH_DONE
? offset
: matchedindex
);
3345 if (search
->matchedIndex
== 0) {
3346 setMatchNotFound(strsrch
);
3347 // status checked below
3349 else { // move by codepoints
3350 U16_BACK_1(search
->text
, 0, search
->matchedIndex
);
3351 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3352 // status checked below
3353 search
->matchedLength
= 0;
3357 if (strsrch
->search
->isCanonicalMatch
) {
3358 // can't use exact here since extra accents are allowed.
3359 usearch_handlePreviousCanonical(strsrch
, status
);
3360 // status checked below
3363 usearch_handlePreviousExact(strsrch
, status
);
3364 // status checked below
3368 if (U_FAILURE(*status
)) {
3369 return USEARCH_DONE
;
3372 return search
->matchedIndex
;
3375 return USEARCH_DONE
;
3380 U_CAPI
void U_EXPORT2
usearch_reset(UStringSearch
*strsrch
)
3383 reset is setting the attributes that are already in
3384 string search, hence all attributes in the collator should
3385 be retrieved without any problems
3388 UErrorCode status
= U_ZERO_ERROR
;
3389 UBool sameCollAttribute
= TRUE
;
3394 // **** hack to deal w/ how processed CEs encode quaternary ****
3395 UCollationStrength newStrength
= ucol_getStrength(strsrch
->collator
);
3396 if ((strsrch
->strength
< UCOL_QUATERNARY
&& newStrength
>= UCOL_QUATERNARY
) ||
3397 (strsrch
->strength
>= UCOL_QUATERNARY
&& newStrength
< UCOL_QUATERNARY
)) {
3398 sameCollAttribute
= FALSE
;
3401 strsrch
->strength
= ucol_getStrength(strsrch
->collator
);
3402 ceMask
= getMask(strsrch
->strength
);
3403 if (strsrch
->ceMask
!= ceMask
) {
3404 strsrch
->ceMask
= ceMask
;
3405 sameCollAttribute
= FALSE
;
3408 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3409 shift
= ucol_getAttribute(strsrch
->collator
, UCOL_ALTERNATE_HANDLING
,
3410 &status
) == UCOL_SHIFTED
;
3411 if (strsrch
->toShift
!= shift
) {
3412 strsrch
->toShift
= shift
;
3413 sameCollAttribute
= FALSE
;
3416 // if status is a failure, ucol_getVariableTop returns 0
3417 varTop
= ucol_getVariableTop(strsrch
->collator
, &status
);
3418 if (strsrch
->variableTop
!= varTop
) {
3419 strsrch
->variableTop
= varTop
;
3420 sameCollAttribute
= FALSE
;
3422 if (!sameCollAttribute
) {
3423 initialize(strsrch
, &status
);
3425 ucol_setText(strsrch
->textIter
, strsrch
->search
->text
,
3426 strsrch
->search
->textLength
,
3428 strsrch
->search
->matchedLength
= 0;
3429 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
3430 strsrch
->search
->isOverlap
= FALSE
;
3431 strsrch
->search
->isCanonicalMatch
= FALSE
;
3432 strsrch
->search
->elementComparisonType
= 0;
3433 strsrch
->search
->isForwardSearching
= TRUE
;
3434 strsrch
->search
->reset
= TRUE
;
3439 // CEI Collation Element + source text index.
3440 // These structs are kept in the circular buffer.
3452 // CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3454 #define DEFAULT_CEBUFFER_SIZE 96
3455 #define CEBUFFER_EXTRA 32
3456 // Some typical max values to make buffer size more reasonable for asymmetric search.
3457 // #8694 is for a better long-term solution to allocation of this buffer.
3458 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3459 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3460 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3462 CEI defBuf
[DEFAULT_CEBUFFER_SIZE
];
3467 UCollationElements
*ceIter
;
3468 UStringSearch
*strSearch
;
3472 CEIBuffer(UStringSearch
*ss
, UErrorCode
*status
);
3474 const CEI
*get(int32_t index
);
3475 const CEI
*getPrevious(int32_t index
);
3479 CEIBuffer::CEIBuffer(UStringSearch
*ss
, UErrorCode
*status
) {
3482 bufSize
= ss
->pattern
.pcesLength
+ CEBUFFER_EXTRA
;
3483 if (ss
->search
->elementComparisonType
!= 0) {
3484 const UChar
* patText
= ss
->pattern
.text
;
3486 const UChar
* patTextLimit
= patText
+ ss
->pattern
.textLength
;
3487 while ( patText
< patTextLimit
) {
3488 UChar c
= *patText
++;
3489 if (MIGHT_BE_JAMO_L(c
)) {
3490 bufSize
+= MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L
;
3492 // No check for surrogates, we might allocate slightly more buffer than necessary.
3493 bufSize
+= MAX_TARGET_IGNORABLES_PER_PAT_OTHER
;
3498 ceIter
= ss
->textIter
;
3502 if (!initTextProcessedIter(ss
, status
)) { return; }
3504 if (bufSize
>DEFAULT_CEBUFFER_SIZE
) {
3505 buf
= (CEI
*)uprv_malloc(bufSize
* sizeof(CEI
));
3507 *status
= U_MEMORY_ALLOCATION_ERROR
;
3512 // TODO: add a reset or init function so that allocated
3513 // buffers can be retained & reused.
3515 CEIBuffer::~CEIBuffer() {
3516 if (buf
!= defBuf
) {
3522 // Get the CE with the specified index.
3523 // Index must be in the range
3524 // n-history_size < index < n+1
3525 // where n is the largest index to have been fetched by some previous call to this function.
3526 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3528 const CEI
*CEIBuffer::get(int32_t index
) {
3529 int i
= index
% bufSize
;
3531 if (index
>=firstIx
&& index
<limitIx
) {
3532 // The request was for an entry already in our buffer.
3537 // Caller is requesting a new, never accessed before, CE.
3538 // Verify that it is the next one in sequence, which is all
3540 if (index
!= limitIx
) {
3546 // Manage the circular CE buffer indexing
3549 if (limitIx
- firstIx
>= bufSize
) {
3550 // The buffer is full, knock out the lowest-indexed entry.
3554 UErrorCode status
= U_ZERO_ERROR
;
3556 buf
[i
].ce
= strSearch
->textProcessedIter
->nextProcessed(&buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3561 // Get the CE with the specified index.
3562 // Index must be in the range
3563 // n-history_size < index < n+1
3564 // where n is the largest index to have been fetched by some previous call to this function.
3565 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3567 const CEI
*CEIBuffer::getPrevious(int32_t index
) {
3568 int i
= index
% bufSize
;
3570 if (index
>=firstIx
&& index
<limitIx
) {
3571 // The request was for an entry already in our buffer.
3576 // Caller is requesting a new, never accessed before, CE.
3577 // Verify that it is the next one in sequence, which is all
3579 if (index
!= limitIx
) {
3585 // Manage the circular CE buffer indexing
3588 if (limitIx
- firstIx
>= bufSize
) {
3589 // The buffer is full, knock out the lowest-indexed entry.
3593 UErrorCode status
= U_ZERO_ERROR
;
3595 buf
[i
].ce
= strSearch
->textProcessedIter
->previousProcessed(&buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3605 // #define USEARCH_DEBUG
3607 #ifdef USEARCH_DEBUG
3613 * Find the next break boundary after startIndex. If the UStringSearch object
3614 * has an external break iterator, use that. Otherwise use the internal character
3617 static int32_t nextBoundaryAfter(UStringSearch
*strsrch
, int32_t startIndex
) {
3619 const UChar
*text
= strsrch
->search
->text
;
3620 int32_t textLen
= strsrch
->search
->textLength
;
3622 U_ASSERT(startIndex
>=0);
3623 U_ASSERT(startIndex
<=textLen
);
3625 if (startIndex
>= textLen
) {
3630 int32_t i
= startIndex
;
3631 U16_NEXT(text
, i
, textLen
, c
);
3633 // If we are on a control character, stop without looking for combining marks.
3634 // Control characters do not combine.
3635 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3636 if (gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
) {
3640 // The initial character was not a control, and can thus accept trailing
3641 // combining characters. Advance over however many of them there are.
3642 int32_t indexOfLastCharChecked
;
3644 indexOfLastCharChecked
= i
;
3648 U16_NEXT(text
, i
, textLen
, c
);
3649 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3650 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3654 return indexOfLastCharChecked
;
3655 #elif !UCONFIG_NO_BREAK_ITERATION
3656 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3658 if (breakiterator
== NULL
) {
3659 breakiterator
= strsrch
->search
->internalBreakIter
;
3662 if (breakiterator
!= NULL
) {
3663 return ubrk_following(breakiterator
, startIndex
);
3668 // **** or should we use the original code? ****
3675 * Returns TRUE if index is on a break boundary. If the UStringSearch
3676 * has an external break iterator, test using that, otherwise test
3677 * using the internal character break iterator.
3679 static UBool
isBreakBoundary(UStringSearch
*strsrch
, int32_t index
) {
3681 const UChar
*text
= strsrch
->search
->text
;
3682 int32_t textLen
= strsrch
->search
->textLength
;
3685 U_ASSERT(index
<=textLen
);
3687 if (index
>=textLen
|| index
<=0) {
3691 // If the character at the current index is not a GRAPHEME_EXTEND
3692 // then we can not be within a combining sequence.
3694 U16_GET(text
, 0, index
, textLen
, c
);
3695 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3696 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3700 // We are at a combining mark. If the preceding character is anything
3701 // except a CONTROL, CR or LF, we are in a combining sequence.
3702 U16_PREV(text
, 0, index
, c
);
3703 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3704 UBool combining
= !(gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
);
3706 #elif !UCONFIG_NO_BREAK_ITERATION
3707 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3709 if (breakiterator
== NULL
) {
3710 breakiterator
= strsrch
->search
->internalBreakIter
;
3713 return (breakiterator
!= NULL
&& ubrk_isBoundary(breakiterator
, index
));
3715 // **** or use the original code? ****
3721 static UBool
onBreakBoundaries(const UStringSearch
*strsrch
, int32_t start
, int32_t end
)
3723 #if !UCONFIG_NO_BREAK_ITERATION
3724 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3726 if (breakiterator
!= NULL
) {
3727 int32_t startindex
= ubrk_first(breakiterator
);
3728 int32_t endindex
= ubrk_last(breakiterator
);
3730 // out-of-range indexes are never boundary positions
3731 if (start
< startindex
|| start
> endindex
||
3732 end
< startindex
|| end
> endindex
) {
3736 return ubrk_isBoundary(breakiterator
, start
) &&
3737 ubrk_isBoundary(breakiterator
, end
);
3750 } UCompareCEsResult
;
3751 #define U_CE_LEVEL2_BASE 0x00000005
3752 #define U_CE_LEVEL3_BASE 0x00050000
3754 static UCompareCEsResult
compareCE64s(int64_t targCE
, int64_t patCE
, int16_t compareType
) {
3755 if (targCE
== patCE
) {
3758 if (compareType
== 0) {
3759 return U_CE_NO_MATCH
;
3762 int64_t targCEshifted
= targCE
>> 32;
3763 int64_t patCEshifted
= patCE
>> 32;
3767 int32_t targLev1
= (int32_t)(targCEshifted
& mask
);
3768 int32_t patLev1
= (int32_t)(patCEshifted
& mask
);
3769 if ( targLev1
!= patLev1
) {
3770 if ( targLev1
== 0 ) {
3771 return U_CE_SKIP_TARG
;
3773 if ( patLev1
== 0 && compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
3774 return U_CE_SKIP_PATN
;
3776 return U_CE_NO_MATCH
;
3780 int32_t targLev2
= (int32_t)(targCEshifted
& mask
);
3781 int32_t patLev2
= (int32_t)(patCEshifted
& mask
);
3782 if ( targLev2
!= patLev2
) {
3783 if ( targLev2
== 0 ) {
3784 return U_CE_SKIP_TARG
;
3786 if ( patLev2
== 0 && compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
3787 return U_CE_SKIP_PATN
;
3789 return (patLev2
== U_CE_LEVEL2_BASE
|| (compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
&& targLev2
== U_CE_LEVEL2_BASE
) )?
3790 U_CE_MATCH
: U_CE_NO_MATCH
;
3794 int32_t targLev3
= (int32_t)(targCE
& mask
);
3795 int32_t patLev3
= (int32_t)(patCE
& mask
);
3796 if ( targLev3
!= patLev3
) {
3797 return (patLev3
== U_CE_LEVEL3_BASE
|| (compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
&& targLev3
== U_CE_LEVEL3_BASE
) )?
3798 U_CE_MATCH
: U_CE_NO_MATCH
;
3805 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3808 U_CAPI UBool U_EXPORT2
usearch_search(UStringSearch
*strsrch
,
3810 int32_t *matchStart
,
3811 int32_t *matchLimit
,
3814 if (U_FAILURE(*status
)) {
3818 // TODO: reject search patterns beginning with a combining char.
3820 #ifdef USEARCH_DEBUG
3821 if (getenv("USEARCH_DEBUG") != NULL
) {
3822 printf("Pattern CEs\n");
3823 for (int ii
=0; ii
<strsrch
->pattern
.cesLength
; ii
++) {
3824 printf(" %8x", strsrch
->pattern
.ces
[ii
]);
3830 // Input parameter sanity check.
3831 // TODO: should input indicies clip to the text length
3832 // in the same way that UText does.
3833 if(strsrch
->pattern
.cesLength
== 0 ||
3835 startIdx
> strsrch
->search
->textLength
||
3836 strsrch
->pattern
.ces
== NULL
) {
3837 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3841 if (strsrch
->pattern
.pces
== NULL
) {
3842 initializePatternPCETable(strsrch
, status
);
3845 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
3846 CEIBuffer
ceb(strsrch
, status
);
3849 int32_t targetIx
= 0;
3850 const CEI
*targetCEI
= NULL
;
3854 int32_t mStart
= -1;
3855 int32_t mLimit
= -1;
3861 // Outer loop moves over match starting positions in the
3863 // Here we see the target as a sequence of collation elements, resulting from the following:
3864 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3865 // (for example, digraphs such as IJ may be broken into two characters).
3866 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3867 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3868 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3869 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3870 // then the CE is deleted, so the following code sees only CEs that are relevant.
3871 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3872 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3873 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3875 for(targetIx
=0; ; targetIx
++)
3878 // Inner loop checks for a match beginning at each
3879 // position from the outer loop.
3880 int32_t targetIxOffset
= 0;
3882 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3883 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3884 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3885 const CEI
*firstCEI
= ceb
.get(targetIx
);
3886 if (firstCEI
== NULL
) {
3887 *status
= U_INTERNAL_PROGRAM_ERROR
;
3892 for (patIx
=0; patIx
<strsrch
->pattern
.pcesLength
; patIx
++) {
3893 patCE
= strsrch
->pattern
.pces
[patIx
];
3894 targetCEI
= ceb
.get(targetIx
+patIx
+targetIxOffset
);
3895 // Compare CE from target string with CE from the pattern.
3896 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3897 // which will fail the compare, below.
3898 UCompareCEsResult ceMatch
= compareCE64s(targetCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
3899 if ( ceMatch
== U_CE_NO_MATCH
) {
3902 } else if ( ceMatch
> U_CE_NO_MATCH
) {
3903 if ( ceMatch
== U_CE_SKIP_TARG
) {
3904 // redo with same patCE, next targCE
3907 } else { // ceMatch == U_CE_SKIP_PATN
3908 // redo with same targCE, next patCE
3913 targetIxOffset
+= strsrch
->pattern
.pcesLength
; // this is now the offset in target CE space to end of the match so far
3915 if (!found
&& ((targetCEI
== NULL
) || (targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
))) {
3916 // No match at this targetIx. Try again at the next.
3921 // No match at all, we have run off the end of the target text.
3926 // We have found a match in CE space.
3927 // Now determine the bounds in string index space.
3928 // There still is a chance of match failure if the CE range not correspond to
3929 // an acceptable character range.
3931 const CEI
*lastCEI
= ceb
.get(targetIx
+ targetIxOffset
- 1);
3933 mStart
= firstCEI
->lowIndex
;
3934 minLimit
= lastCEI
->lowIndex
;
3936 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3937 // extended to the end of input, and the match is good.
3939 // Look at the high and low indices of the CE following the match. If
3940 // they are the same it means one of two things:
3941 // 1. The match extended to the last CE from the target text, which is OK, or
3942 // 2. The last CE that was part of the match is in an expansion that extends
3943 // to the first CE after the match. In this case, we reject the match.
3944 const CEI
*nextCEI
= 0;
3945 if (strsrch
->search
->elementComparisonType
== 0) {
3946 nextCEI
= ceb
.get(targetIx
+ targetIxOffset
);
3947 maxLimit
= nextCEI
->lowIndex
;
3948 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
3952 for ( ; ; ++targetIxOffset
) {
3953 nextCEI
= ceb
.get(targetIx
+ targetIxOffset
);
3954 maxLimit
= nextCEI
->lowIndex
;
3955 // If we are at the end of the target too, match succeeds
3956 if ( nextCEI
->ce
== UCOL_PROCESSED_NULLORDER
) {
3959 // As long as the next CE has primary weight of 0,
3960 // it is part of the last target element matched by the pattern;
3961 // make sure it can be part of a match with the last patCE
3962 if ( (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) == 0 ) {
3963 UCompareCEsResult ceMatch
= compareCE64s(nextCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
3964 if ( ceMatch
== U_CE_NO_MATCH
|| ceMatch
== U_CE_SKIP_PATN
) {
3968 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3969 // target element, but it has non-zero primary weight => match fails
3970 } else if ( nextCEI
->lowIndex
== nextCEI
->highIndex
) {
3973 // Else the target CE is not part of an expansion of the last matched element, match succeeds
3981 // Check for the start of the match being within a combining sequence.
3982 // This can happen if the pattern itself begins with a combining char, and
3983 // the match found combining marks in the target text that were attached
3984 // to something else.
3985 // This type of match should be rejected for not completely consuming a
3986 // combining sequence.
3987 if (!isBreakBoundary(strsrch
, mStart
)) {
3991 // Check for the start of the match being within an Collation Element Expansion,
3992 // meaning that the first char of the match is only partially matched.
3993 // With exapnsions, the first CE will report the index of the source
3994 // character, and all subsequent (expansions) CEs will report the source index of the
3995 // _following_ character.
3996 int32_t secondIx
= firstCEI
->highIndex
;
3997 if (mStart
== secondIx
) {
4001 // Advance the match end position to the first acceptable match boundary.
4002 // This advances the index over any combining charcters.
4004 if (minLimit
< maxLimit
) {
4005 // When the last CE's low index is same with its high index, the CE is likely
4006 // a part of expansion. In this case, the index is located just after the
4007 // character corresponding to the CEs compared above. If the index is right
4008 // at the break boundary, move the position to the next boundary will result
4009 // incorrect match length when there are ignorable characters exist between
4010 // the position and the next character produces CE(s). See ticket#8482.
4011 if (minLimit
== lastCEI
->highIndex
&& isBreakBoundary(strsrch
, minLimit
)) {
4014 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4015 if (nba
>= lastCEI
->highIndex
) {
4021 #ifdef USEARCH_DEBUG
4022 if (getenv("USEARCH_DEBUG") != NULL
) {
4023 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4027 // If default breakIter is being used, and next collation element belonging to this
4028 // combining sequence has non-zero primary weight and corresponds to a separate
4029 // character following the one at end of the current match, then do NOT require
4030 // that match end position be on a breakIter boundary, or that end of the
4031 // combining sequence not extend beyond the match in CE space. Only do those
4032 // tests if the conditions above are not met. Added this to make prefix search
4033 // work in Indic scripts per <rdar://problem/18063262>.
4034 UBool doLimitTests
= !(strsrch
->search
->breakIter
== NULL
&&
4035 nextCEI
!= NULL
&& (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) != 0 &&
4036 nextCEI
->lowIndex
>= lastCEI
->highIndex
&& nextCEI
->highIndex
> nextCEI
->lowIndex
);
4038 if (doLimitTests
) { // <rdar://problem/18063262>
4039 // If advancing to the end of a combining sequence in character indexing space
4040 // advanced us beyond the end of the match in CE space, reject this match.
4041 if (mLimit
> maxLimit
) {
4045 if (!isBreakBoundary(strsrch
, mLimit
)) {
4050 if (! checkIdentical(strsrch
, mStart
, mLimit
)) {
4059 #ifdef USEARCH_DEBUG
4060 if (getenv("USEARCH_DEBUG") != NULL
) {
4061 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4062 int32_t lastToPrint
= ceb
.limitIx
+2;
4063 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4064 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4066 printf("\n%s\n", found
? "match found" : "no match");
4070 // All Done. Store back the match bounds to the caller.
4077 if (matchStart
!= NULL
) {
4078 *matchStart
= mStart
;
4081 if (matchLimit
!= NULL
) {
4082 *matchLimit
= mLimit
;
4088 U_CAPI UBool U_EXPORT2
usearch_searchBackwards(UStringSearch
*strsrch
,
4090 int32_t *matchStart
,
4091 int32_t *matchLimit
,
4094 if (U_FAILURE(*status
)) {
4098 // TODO: reject search patterns beginning with a combining char.
4100 #ifdef USEARCH_DEBUG
4101 if (getenv("USEARCH_DEBUG") != NULL
) {
4102 printf("Pattern CEs\n");
4103 for (int ii
=0; ii
<strsrch
->pattern
.cesLength
; ii
++) {
4104 printf(" %8x", strsrch
->pattern
.ces
[ii
]);
4110 // Input parameter sanity check.
4111 // TODO: should input indicies clip to the text length
4112 // in the same way that UText does.
4113 if(strsrch
->pattern
.cesLength
== 0 ||
4115 startIdx
> strsrch
->search
->textLength
||
4116 strsrch
->pattern
.ces
== NULL
) {
4117 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
4121 if (strsrch
->pattern
.pces
== NULL
) {
4122 initializePatternPCETable(strsrch
, status
);
4125 CEIBuffer
ceb(strsrch
, status
);
4126 int32_t targetIx
= 0;
4129 * Pre-load the buffer with the CE's for the grapheme
4130 * after our starting position so that we're sure that
4131 * we can look at the CE following the match when we
4132 * check the match boundaries.
4134 * This will also pre-fetch the first CE that we'll
4135 * consider for the match.
4137 if (startIdx
< strsrch
->search
->textLength
) {
4138 UBreakIterator
*bi
= strsrch
->search
->internalBreakIter
;
4139 int32_t next
= ubrk_following(bi
, startIdx
);
4141 ucol_setOffset(strsrch
->textIter
, next
, status
);
4143 for (targetIx
= 0; ; targetIx
+= 1) {
4144 if (ceb
.getPrevious(targetIx
)->lowIndex
< startIdx
) {
4149 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
4153 const CEI
*targetCEI
= NULL
;
4157 int32_t limitIx
= targetIx
;
4158 int32_t mStart
= -1;
4159 int32_t mLimit
= -1;
4165 // Outer loop moves over match starting positions in the
4167 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4168 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4169 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4170 // and the beginning of the base text.
4171 for(targetIx
= limitIx
; ; targetIx
+= 1)
4174 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4175 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4176 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4177 const CEI
*lastCEI
= ceb
.getPrevious(targetIx
);
4178 if (lastCEI
== NULL
) {
4179 *status
= U_INTERNAL_PROGRAM_ERROR
;
4183 // Inner loop checks for a match beginning at each
4184 // position from the outer loop.
4185 int32_t targetIxOffset
= 0;
4186 for (patIx
= strsrch
->pattern
.pcesLength
- 1; patIx
>= 0; patIx
-= 1) {
4187 int64_t patCE
= strsrch
->pattern
.pces
[patIx
];
4189 targetCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.pcesLength
- 1 - patIx
+ targetIxOffset
);
4190 // Compare CE from target string with CE from the pattern.
4191 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4192 // which will fail the compare, below.
4193 UCompareCEsResult ceMatch
= compareCE64s(targetCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
4194 if ( ceMatch
== U_CE_NO_MATCH
) {
4197 } else if ( ceMatch
> U_CE_NO_MATCH
) {
4198 if ( ceMatch
== U_CE_SKIP_TARG
) {
4199 // redo with same patCE, next targCE
4202 } else { // ceMatch == U_CE_SKIP_PATN
4203 // redo with same targCE, next patCE
4209 if (!found
&& ((targetCEI
== NULL
) || (targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
))) {
4210 // No match at this targetIx. Try again at the next.
4215 // No match at all, we have run off the end of the target text.
4220 // We have found a match in CE space.
4221 // Now determine the bounds in string index space.
4222 // There still is a chance of match failure if the CE range not correspond to
4223 // an acceptable character range.
4225 const CEI
*firstCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.pcesLength
- 1 + targetIxOffset
);
4226 mStart
= firstCEI
->lowIndex
;
4228 // Check for the start of the match being within a combining sequence.
4229 // This can happen if the pattern itself begins with a combining char, and
4230 // the match found combining marks in the target text that were attached
4231 // to something else.
4232 // This type of match should be rejected for not completely consuming a
4233 // combining sequence.
4234 if (!isBreakBoundary(strsrch
, mStart
)) {
4238 // Look at the high index of the first CE in the match. If it's the same as the
4239 // low index, the first CE in the match is in the middle of an expansion.
4240 if (mStart
== firstCEI
->highIndex
) {
4245 minLimit
= lastCEI
->lowIndex
;
4248 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4249 // extended to the end of input, and the match is good.
4251 // Look at the high and low indices of the CE following the match. If
4252 // they are the same it means one of two things:
4253 // 1. The match extended to the last CE from the target text, which is OK, or
4254 // 2. The last CE that was part of the match is in an expansion that extends
4255 // to the first CE after the match. In this case, we reject the match.
4256 const CEI
*nextCEI
= ceb
.getPrevious(targetIx
- 1);
4258 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
4262 mLimit
= maxLimit
= nextCEI
->lowIndex
;
4264 // Advance the match end position to the first acceptable match boundary.
4265 // This advances the index over any combining characters.
4266 if (minLimit
< maxLimit
) {
4267 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4269 if (nba
>= lastCEI
->highIndex
) {
4274 // If default breakIter is being used, and next collation element belonging to this
4275 // combining sequence has non-zero primary weight and corresponds to a separate
4276 // character following the one at end of the current match, then do NOT require
4277 // that match end position be on a breakIter boundary, or that end of the
4278 // combining sequence not extend beyond the match in CE space. Only do those
4279 // tests if the conditions above are not met. Added this to make prefix search
4280 // work in Indic scripts per <rdar://problem/18063262>.
4281 UBool doLimitTests
= !(strsrch
->search
->breakIter
== NULL
&&
4282 nextCEI
!= NULL
&& (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) != 0 &&
4283 nextCEI
->lowIndex
>= lastCEI
->highIndex
&& nextCEI
->highIndex
> nextCEI
->lowIndex
);
4285 if (doLimitTests
) { // <rdar://problem/18063262>
4286 // If advancing to the end of a combining sequence in character indexing space
4287 // advanced us beyond the end of the match in CE space, reject this match.
4288 if (mLimit
> maxLimit
) {
4292 // Make sure the end of the match is on a break boundary
4293 if (!isBreakBoundary(strsrch
, mLimit
)) {
4299 // No non-ignorable CEs after this point.
4300 // The maximum position is detected by boundary after
4301 // the last non-ignorable CE. Combining sequence
4302 // across the start index will be truncated.
4303 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4304 mLimit
= maxLimit
= (nba
> 0) && (startIdx
> nba
) ? nba
: startIdx
;
4307 #ifdef USEARCH_DEBUG
4308 if (getenv("USEARCH_DEBUG") != NULL
) {
4309 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4314 if (! checkIdentical(strsrch
, mStart
, mLimit
)) {
4323 #ifdef USEARCH_DEBUG
4324 if (getenv("USEARCH_DEBUG") != NULL
) {
4325 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4326 int32_t lastToPrint
= ceb
.limitIx
+2;
4327 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4328 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4330 printf("\n%s\n", found
? "match found" : "no match");
4334 // All Done. Store back the match bounds to the caller.
4341 if (matchStart
!= NULL
) {
4342 *matchStart
= mStart
;
4345 if (matchLimit
!= NULL
) {
4346 *matchLimit
= mLimit
;
4352 // internal use methods declared in usrchimp.h -----------------------------
4354 UBool
usearch_handleNextExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4356 if (U_FAILURE(*status
)) {
4357 setMatchNotFound(strsrch
);
4362 UCollationElements
*coleiter
= strsrch
->textIter
;
4363 int32_t textlength
= strsrch
->search
->textLength
;
4364 int32_t *patternce
= strsrch
->pattern
.ces
;
4365 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4366 int32_t textoffset
= ucol_getOffset(coleiter
);
4368 // status used in setting coleiter offset, since offset is checked in
4369 // shiftForward before setting the coleiter offset, status never
4371 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4373 while (textoffset
<= textlength
)
4375 uint32_t patternceindex
= patterncelength
- 1;
4377 UBool found
= FALSE
;
4378 int32_t lastce
= UCOL_NULLORDER
;
4380 setColEIterOffset(coleiter
, textoffset
);
4383 // finding the last pattern ce match, imagine composite characters
4384 // for example: search for pattern A in text \u00C0
4385 // we'll have to skip \u0300 the grave first before we get to A
4386 targetce
= ucol_previous(coleiter
, status
);
4387 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4391 targetce
= getCE(strsrch
, targetce
);
4392 if (targetce
== UCOL_IGNORABLE
&& inNormBuf(coleiter
)) {
4393 // this is for the text \u0315\u0300 that requires
4394 // normalization and pattern \u0300, where \u0315 is ignorable
4397 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4400 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4401 if (targetce
== patternce
[patternceindex
]) {
4402 // the first ce can be a contraction
4406 if (!hasExpansion(coleiter
)) {
4412 //targetce = lastce;
4414 while (found
&& patternceindex
> 0) {
4416 targetce
= ucol_previous(coleiter
, status
);
4417 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4421 targetce
= getCE(strsrch
, targetce
);
4422 if (targetce
== UCOL_IGNORABLE
) {
4427 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4428 found
= found
&& targetce
== patternce
[patternceindex
];
4434 if (U_FAILURE(*status
)) {
4437 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4439 // status checked at loop.
4440 patternceindex
= patterncelength
;
4444 if (checkNextExactMatch(strsrch
, &textoffset
, status
)) {
4445 // status checked in ucol_setOffset
4446 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4450 setMatchNotFound(strsrch
);
4453 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4457 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4458 strsrch
->search
->matchedIndex
= start
;
4459 strsrch
->search
->matchedLength
= end
- start
;
4462 setMatchNotFound(strsrch
);
4468 UBool
usearch_handleNextCanonical(UStringSearch
*strsrch
, UErrorCode
*status
)
4470 if (U_FAILURE(*status
)) {
4471 setMatchNotFound(strsrch
);
4476 UCollationElements
*coleiter
= strsrch
->textIter
;
4477 int32_t textlength
= strsrch
->search
->textLength
;
4478 int32_t *patternce
= strsrch
->pattern
.ces
;
4479 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4480 int32_t textoffset
= ucol_getOffset(coleiter
);
4481 UBool hasPatternAccents
=
4482 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4484 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4486 strsrch
->canonicalPrefixAccents
[0] = 0;
4487 strsrch
->canonicalSuffixAccents
[0] = 0;
4489 while (textoffset
<= textlength
)
4491 int32_t patternceindex
= patterncelength
- 1;
4493 UBool found
= FALSE
;
4494 int32_t lastce
= UCOL_NULLORDER
;
4496 setColEIterOffset(coleiter
, textoffset
);
4499 // finding the last pattern ce match, imagine composite characters
4500 // for example: search for pattern A in text \u00C0
4501 // we'll have to skip \u0300 the grave first before we get to A
4502 targetce
= ucol_previous(coleiter
, status
);
4503 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4507 targetce
= getCE(strsrch
, targetce
);
4508 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4511 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4512 if (targetce
== patternce
[patternceindex
]) {
4513 // the first ce can be a contraction
4517 if (!hasExpansion(coleiter
)) {
4523 while (found
&& patternceindex
> 0) {
4524 targetce
= ucol_previous(coleiter
, status
);
4525 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4529 targetce
= getCE(strsrch
, targetce
);
4530 if (targetce
== UCOL_IGNORABLE
) {
4535 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4536 found
= found
&& targetce
== patternce
[patternceindex
];
4539 // initializing the rearranged accent array
4540 if (hasPatternAccents
&& !found
) {
4541 strsrch
->canonicalPrefixAccents
[0] = 0;
4542 strsrch
->canonicalSuffixAccents
[0] = 0;
4543 if (U_FAILURE(*status
)) {
4546 found
= doNextCanonicalMatch(strsrch
, textoffset
, status
);
4550 if (U_FAILURE(*status
)) {
4553 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4555 // status checked at loop
4556 patternceindex
= patterncelength
;
4560 if (checkNextCanonicalMatch(strsrch
, &textoffset
, status
)) {
4561 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4565 setMatchNotFound(strsrch
);
4568 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4572 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4573 strsrch
->search
->matchedIndex
= start
;
4574 strsrch
->search
->matchedLength
= end
- start
;
4577 setMatchNotFound(strsrch
);
4583 UBool
usearch_handlePreviousExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4585 if (U_FAILURE(*status
)) {
4586 setMatchNotFound(strsrch
);
4591 UCollationElements
*coleiter
= strsrch
->textIter
;
4592 int32_t *patternce
= strsrch
->pattern
.ces
;
4593 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4594 int32_t textoffset
= ucol_getOffset(coleiter
);
4596 // shifting it check for setting offset
4597 // if setOffset is called previously or there was no previous match, we
4598 // leave the offset as it is.
4599 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4600 textoffset
= strsrch
->search
->matchedIndex
;
4603 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
4606 while (textoffset
>= 0)
4608 int32_t patternceindex
= 1;
4610 UBool found
= FALSE
;
4611 int32_t firstce
= UCOL_NULLORDER
;
4613 // if status is a failure, ucol_setOffset does nothing
4614 setColEIterOffset(coleiter
, textoffset
);
4617 // finding the first pattern ce match, imagine composite
4618 // characters. for example: search for pattern \u0300 in text
4619 // \u00C0, we'll have to skip A first before we get to
4620 // \u0300 the grave accent
4621 targetce
= ucol_next(coleiter
, status
);
4622 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4626 targetce
= getCE(strsrch
, targetce
);
4627 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
4630 if (targetce
== UCOL_IGNORABLE
&& strsrch
->strength
!= UCOL_PRIMARY
) {
4633 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4634 if (targetce
== patternce
[0]) {
4638 if (!hasExpansion(coleiter
)) {
4639 // checking for accents in composite character
4645 //targetce = firstce;
4647 while (found
&& (patternceindex
< patterncelength
)) {
4649 targetce
= ucol_next(coleiter
, status
);
4650 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4654 targetce
= getCE(strsrch
, targetce
);
4655 if (targetce
== UCOL_IGNORABLE
) {
4659 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4660 found
= found
&& targetce
== patternce
[patternceindex
];
4667 if (U_FAILURE(*status
)) {
4671 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
4677 if (checkPreviousExactMatch(strsrch
, &textoffset
, status
)) {
4678 setColEIterOffset(coleiter
, textoffset
);
4682 setMatchNotFound(strsrch
);
4687 if (strsrch
->search
->isOverlap
) {
4688 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4689 textOffset
= strsrch
->search
->matchedIndex
+ strsrch
->search
->matchedLength
- 1;
4691 // move the start position at the end of possible match
4692 initializePatternPCETable(strsrch
, status
);
4693 if (!initTextProcessedIter(strsrch
, status
)) {
4694 setMatchNotFound(strsrch
);
4697 for (int32_t nPCEs
= 0; nPCEs
< strsrch
->pattern
.pcesLength
- 1; nPCEs
++) {
4698 int64_t pce
= strsrch
->textProcessedIter
->nextProcessed(NULL
, NULL
, status
);
4699 if (pce
== UCOL_PROCESSED_NULLORDER
) {
4700 // at the end of the text
4704 if (U_FAILURE(*status
)) {
4705 setMatchNotFound(strsrch
);
4708 textOffset
= ucol_getOffset(strsrch
->textIter
);
4711 textOffset
= ucol_getOffset(strsrch
->textIter
);
4717 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4718 strsrch
->search
->matchedIndex
= start
;
4719 strsrch
->search
->matchedLength
= end
- start
;
4722 setMatchNotFound(strsrch
);
4728 UBool
usearch_handlePreviousCanonical(UStringSearch
*strsrch
,
4731 if (U_FAILURE(*status
)) {
4732 setMatchNotFound(strsrch
);
4737 UCollationElements
*coleiter
= strsrch
->textIter
;
4738 int32_t *patternce
= strsrch
->pattern
.ces
;
4739 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4740 int32_t textoffset
= ucol_getOffset(coleiter
);
4741 UBool hasPatternAccents
=
4742 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4744 // shifting it check for setting offset
4745 // if setOffset is called previously or there was no previous match, we
4746 // leave the offset as it is.
4747 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4748 textoffset
= strsrch
->search
->matchedIndex
;
4751 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
4753 strsrch
->canonicalPrefixAccents
[0] = 0;
4754 strsrch
->canonicalSuffixAccents
[0] = 0;
4756 while (textoffset
>= 0)
4758 int32_t patternceindex
= 1;
4760 UBool found
= FALSE
;
4761 int32_t firstce
= UCOL_NULLORDER
;
4763 setColEIterOffset(coleiter
, textoffset
);
4765 // finding the first pattern ce match, imagine composite
4766 // characters. for example: search for pattern \u0300 in text
4767 // \u00C0, we'll have to skip A first before we get to
4768 // \u0300 the grave accent
4769 targetce
= ucol_next(coleiter
, status
);
4770 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4774 targetce
= getCE(strsrch
, targetce
);
4775 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
4779 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4780 if (targetce
== patternce
[0]) {
4781 // the first ce can be a contraction
4785 if (!hasExpansion(coleiter
)) {
4786 // checking for accents in composite character
4794 while (found
&& patternceindex
< patterncelength
) {
4795 targetce
= ucol_next(coleiter
, status
);
4796 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4800 targetce
= getCE(strsrch
, targetce
);
4801 if (targetce
== UCOL_IGNORABLE
) {
4805 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4806 found
= found
&& targetce
== patternce
[patternceindex
];
4810 // initializing the rearranged accent array
4811 if (hasPatternAccents
&& !found
) {
4812 strsrch
->canonicalPrefixAccents
[0] = 0;
4813 strsrch
->canonicalSuffixAccents
[0] = 0;
4814 if (U_FAILURE(*status
)) {
4817 found
= doPreviousCanonicalMatch(strsrch
, textoffset
, status
);
4821 if (U_FAILURE(*status
)) {
4824 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
4830 if (checkPreviousCanonicalMatch(strsrch
, &textoffset
, status
)) {
4831 setColEIterOffset(coleiter
, textoffset
);
4835 setMatchNotFound(strsrch
);
4840 if (strsrch
->search
->isOverlap
) {
4841 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4842 textOffset
= strsrch
->search
->matchedIndex
+ strsrch
->search
->matchedLength
- 1;
4844 // move the start position at the end of possible match
4845 initializePatternPCETable(strsrch
, status
);
4846 if (!initTextProcessedIter(strsrch
, status
)) {
4847 setMatchNotFound(strsrch
);
4850 for (int32_t nPCEs
= 0; nPCEs
< strsrch
->pattern
.pcesLength
- 1; nPCEs
++) {
4851 int64_t pce
= strsrch
->textProcessedIter
->nextProcessed(NULL
, NULL
, status
);
4852 if (pce
== UCOL_PROCESSED_NULLORDER
) {
4853 // at the end of the text
4857 if (U_FAILURE(*status
)) {
4858 setMatchNotFound(strsrch
);
4861 textOffset
= ucol_getOffset(strsrch
->textIter
);
4864 textOffset
= ucol_getOffset(strsrch
->textIter
);
4870 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4871 strsrch
->search
->matchedIndex
= start
;
4872 strsrch
->search
->matchedLength
= end
- start
;
4875 setMatchNotFound(strsrch
);
4881 #endif /* #if !UCONFIG_NO_COLLATION */