1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2001-2015 IBM and others. All rights reserved.
6 **********************************************************************
7 * Date Name Description
8 * 07/02/2001 synwee Creation.
9 **********************************************************************
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
16 #include "unicode/usearch.h"
17 #include "unicode/ustring.h"
18 #include "unicode/uchar.h"
19 #include "unicode/utf16.h"
20 #include "normalizer2impl.h"
29 // don't use Boyer-Moore
30 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
33 // internal definition ---------------------------------------------------
35 #define LAST_BYTE_MASK_ 0xFF
36 #define SECOND_LAST_BYTE_SHIFT_ 8
37 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
39 static const Normalizer2Impl
*g_nfcImpl
= NULL
;
41 // internal methods -------------------------------------------------
44 * Fast collation element iterator setOffset.
45 * This function does not check for bounds.
46 * @param coleiter collation element iterator
47 * @param offset to set
50 inline void setColEIterOffset(UCollationElements
*elems
,
53 // Note: Not "fast" any more after the 2013 collation rewrite.
54 // We do not want to expose more internals than necessary.
55 UErrorCode status
= U_ZERO_ERROR
;
56 ucol_setOffset(elems
, offset
, &status
);
60 * Getting the mask for collation strength
61 * @param strength collation strength
62 * @return collation element mask
65 inline uint32_t getMask(UCollationStrength strength
)
70 return UCOL_PRIMARYORDERMASK
;
72 return UCOL_SECONDARYORDERMASK
| UCOL_PRIMARYORDERMASK
;
74 return UCOL_TERTIARYORDERMASK
| UCOL_SECONDARYORDERMASK
|
75 UCOL_PRIMARYORDERMASK
;
80 * @param ce 32-bit collation element
84 inline int hashFromCE32(uint32_t ce
)
87 ((((((ce
>> 24) * 37) +
91 hc
%= MAX_TABLE_SIZE_
;
93 hc
+= MAX_TABLE_SIZE_
;
99 static UBool U_CALLCONV
100 usearch_cleanup(void) {
107 * Initializing the fcd tables.
108 * Internal method, status assumed to be a success.
109 * @param status output error if any, caller to check status before calling
110 * method, status assumed to be success when passed in.
113 inline void initializeFCD(UErrorCode
*status
)
115 if (g_nfcImpl
== NULL
) {
116 g_nfcImpl
= Normalizer2Factory::getNFCImpl(*status
);
117 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH
, usearch_cleanup
);
122 * Gets the fcd value for a character at the argument index.
123 * This method takes into accounts of the supplementary characters.
124 * @param str UTF16 string where character for fcd retrieval resides
125 * @param offset position of the character whose fcd is to be retrieved, to be
126 * overwritten with the next character position, taking
127 * surrogate characters into consideration.
128 * @param strlength length of the argument string
132 uint16_t getFCD(const UChar
*str
, int32_t *offset
,
135 const UChar
*temp
= str
+ *offset
;
136 uint16_t result
= g_nfcImpl
->nextFCD16(temp
, str
+ strlength
);
137 *offset
= (int32_t)(temp
- str
);
142 * Getting the modified collation elements taking into account the collation
144 * @param strsrch string search data
146 * @return the modified collation element
149 inline int32_t getCE(const UStringSearch
*strsrch
, uint32_t sourcece
)
151 // note for tertiary we can't use the collator->tertiaryMask, that
152 // is a preprocessed mask that takes into account case options. since
153 // we are only concerned with exact matches, we don't need that.
154 sourcece
&= strsrch
->ceMask
;
156 if (strsrch
->toShift
) {
157 // alternate handling here, since only the 16 most significant digits
158 // is only used, we can safely do a compare without masking
159 // if the ce is a variable, we mask and get only the primary values
160 // no shifting to quartenary is required since all primary values
161 // less than variabletop will need to be masked off anyway.
162 if (strsrch
->variableTop
> sourcece
) {
163 if (strsrch
->strength
>= UCOL_QUATERNARY
) {
164 sourcece
&= UCOL_PRIMARYORDERMASK
;
167 sourcece
= UCOL_IGNORABLE
;
170 } else if (strsrch
->strength
>= UCOL_QUATERNARY
&& sourcece
== UCOL_IGNORABLE
) {
178 * Allocate a memory and returns NULL if it failed.
179 * Internal method, status assumed to be a success.
180 * @param size to allocate
181 * @param status output error if any, caller to check status before calling
182 * method, status assumed to be success when passed in.
183 * @return newly allocated array, NULL otherwise
186 inline void * allocateMemory(uint32_t size
, UErrorCode
*status
)
188 uint32_t *result
= (uint32_t *)uprv_malloc(size
);
189 if (result
== NULL
) {
190 *status
= U_MEMORY_ALLOCATION_ERROR
;
196 * Adds a uint32_t value to a destination array.
197 * Creates a new array if we run out of space. The caller will have to
198 * manually deallocate the newly allocated array.
199 * Internal method, status assumed to be success, caller has to check status
200 * before calling this method. destination not to be NULL and has at least
201 * size destinationlength.
202 * @param destination target array
203 * @param offset destination offset to add value
204 * @param destinationlength target array size, return value for the new size
205 * @param value to be added
206 * @param increments incremental size expected
207 * @param status output error if any, caller to check status before calling
208 * method, status assumed to be success when passed in.
209 * @return new destination array, destination if there was no new allocation
212 inline int32_t * addTouint32_tArray(int32_t *destination
,
214 uint32_t *destinationlength
,
219 uint32_t newlength
= *destinationlength
;
220 if (offset
+ 1 == newlength
) {
221 newlength
+= increments
;
222 int32_t *temp
= (int32_t *)allocateMemory(
223 sizeof(int32_t) * newlength
, status
);
224 if (U_FAILURE(*status
)) {
227 uprv_memcpy(temp
, destination
, sizeof(int32_t) * (size_t)offset
);
228 *destinationlength
= newlength
;
231 destination
[offset
] = value
;
236 * Adds a uint64_t value to a destination array.
237 * Creates a new array if we run out of space. The caller will have to
238 * manually deallocate the newly allocated array.
239 * Internal method, status assumed to be success, caller has to check status
240 * before calling this method. destination not to be NULL and has at least
241 * size destinationlength.
242 * @param destination target array
243 * @param offset destination offset to add value
244 * @param destinationlength target array size, return value for the new size
245 * @param value to be added
246 * @param increments incremental size expected
247 * @param status output error if any, caller to check status before calling
248 * method, status assumed to be success when passed in.
249 * @return new destination array, destination if there was no new allocation
252 inline int64_t * addTouint64_tArray(int64_t *destination
,
254 uint32_t *destinationlength
,
259 uint32_t newlength
= *destinationlength
;
260 if (offset
+ 1 == newlength
) {
261 newlength
+= increments
;
262 int64_t *temp
= (int64_t *)allocateMemory(
263 sizeof(int64_t) * newlength
, status
);
265 if (U_FAILURE(*status
)) {
269 uprv_memcpy(temp
, destination
, sizeof(int64_t) * (size_t)offset
);
270 *destinationlength
= newlength
;
274 destination
[offset
] = value
;
280 * Initializing the ce table for a pattern.
281 * Stores non-ignorable collation keys.
282 * Table size will be estimated by the size of the pattern text. Table
283 * expansion will be perform as we go along. Adding 1 to ensure that the table
284 * size definitely increases.
285 * Internal method, status assumed to be a success.
286 * @param strsrch string search data
287 * @param status output error if any, caller to check status before calling
288 * method, status assumed to be success when passed in.
289 * @return total number of expansions
292 inline uint16_t initializePatternCETable(UStringSearch
*strsrch
,
295 UPattern
*pattern
= &(strsrch
->pattern
);
296 uint32_t cetablesize
= INITIAL_ARRAY_SIZE_
;
297 int32_t *cetable
= pattern
->cesBuffer
;
298 uint32_t patternlength
= pattern
->textLength
;
299 UCollationElements
*coleiter
= strsrch
->utilIter
;
301 if (coleiter
== NULL
) {
302 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
303 patternlength
, status
);
304 // status will be checked in ucol_next(..) later and if it is an
305 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
307 strsrch
->utilIter
= coleiter
;
310 ucol_setText(coleiter
, pattern
->text
, pattern
->textLength
, status
);
312 if(U_FAILURE(*status
)) {
316 if (pattern
->ces
!= cetable
&& pattern
->ces
) {
317 uprv_free(pattern
->ces
);
324 while ((ce
= ucol_next(coleiter
, status
)) != UCOL_NULLORDER
&&
325 U_SUCCESS(*status
)) {
326 uint32_t newce
= getCE(strsrch
, ce
);
328 int32_t *temp
= addTouint32_tArray(cetable
, offset
, &cetablesize
,
330 patternlength
- ucol_getOffset(coleiter
) + 1,
332 if (U_FAILURE(*status
)) {
336 if (cetable
!= temp
&& cetable
!= pattern
->cesBuffer
) {
341 result
+= (uint16_t)(ucol_getMaxExpansion(coleiter
, ce
) - 1);
345 pattern
->ces
= cetable
;
346 pattern
->cesLength
= offset
;
352 * Initializing the pce table for a pattern.
353 * Stores non-ignorable collation keys.
354 * Table size will be estimated by the size of the pattern text. Table
355 * expansion will be perform as we go along. Adding 1 to ensure that the table
356 * size definitely increases.
357 * Internal method, status assumed to be a success.
358 * @param strsrch string search data
359 * @param status output error if any, caller to check status before calling
360 * method, status assumed to be success when passed in.
361 * @return total number of expansions
364 inline uint16_t initializePatternPCETable(UStringSearch
*strsrch
,
367 UPattern
*pattern
= &(strsrch
->pattern
);
368 uint32_t pcetablesize
= INITIAL_ARRAY_SIZE_
;
369 int64_t *pcetable
= pattern
->pcesBuffer
;
370 uint32_t patternlength
= pattern
->textLength
;
371 UCollationElements
*coleiter
= strsrch
->utilIter
;
373 if (coleiter
== NULL
) {
374 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
375 patternlength
, status
);
376 // status will be checked in ucol_next(..) later and if it is an
377 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
379 strsrch
->utilIter
= coleiter
;
381 ucol_setText(coleiter
, pattern
->text
, pattern
->textLength
, status
);
383 if(U_FAILURE(*status
)) {
387 if (pattern
->pces
!= pcetable
&& pattern
->pces
!= NULL
) {
388 uprv_free(pattern
->pces
);
395 icu::UCollationPCE
iter(coleiter
);
397 // ** Should processed CEs be signed or unsigned?
398 // ** (the rest of the code in this file seems to play fast-and-loose with
399 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
400 while ((pce
= iter
.nextProcessed(NULL
, NULL
, status
)) != UCOL_PROCESSED_NULLORDER
&&
401 U_SUCCESS(*status
)) {
402 int64_t *temp
= addTouint64_tArray(pcetable
, offset
, &pcetablesize
,
404 patternlength
- ucol_getOffset(coleiter
) + 1,
407 if (U_FAILURE(*status
)) {
413 if (pcetable
!= temp
&& pcetable
!= pattern
->pcesBuffer
) {
418 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
421 pcetable
[offset
] = 0;
422 pattern
->pces
= pcetable
;
423 pattern
->pcesLength
= offset
;
429 * Initializes the pattern struct.
430 * Internal method, status assumed to be success.
431 * @param strsrch UStringSearch data storage
432 * @param status output error if any, caller to check status before calling
433 * method, status assumed to be success when passed in.
434 * @return expansionsize the total expansion size of the pattern
437 inline int16_t initializePattern(UStringSearch
*strsrch
, UErrorCode
*status
)
439 if (U_FAILURE(*status
)) { return 0; }
440 UPattern
*pattern
= &(strsrch
->pattern
);
441 const UChar
*patterntext
= pattern
->text
;
442 int32_t length
= pattern
->textLength
;
445 // Since the strength is primary, accents are ignored in the pattern.
446 if (strsrch
->strength
== UCOL_PRIMARY
) {
447 pattern
->hasPrefixAccents
= 0;
448 pattern
->hasSuffixAccents
= 0;
450 pattern
->hasPrefixAccents
= getFCD(patterntext
, &index
, length
) >>
451 SECOND_LAST_BYTE_SHIFT_
;
453 U16_BACK_1(patterntext
, 0, index
);
454 pattern
->hasSuffixAccents
= getFCD(patterntext
, &index
, length
) &
459 if (strsrch
->pattern
.pces
!= NULL
) {
460 if (strsrch
->pattern
.pces
!= strsrch
->pattern
.pcesBuffer
) {
461 uprv_free(strsrch
->pattern
.pces
);
464 strsrch
->pattern
.pces
= NULL
;
467 // since intializePattern is an internal method status is a success.
468 return initializePatternCETable(strsrch
, status
);
472 * Initializing shift tables, with the default values.
473 * If a corresponding default value is 0, the shift table is not set.
474 * @param shift table for forwards shift
475 * @param backshift table for backwards shift
476 * @param cetable table containing pattern ce
477 * @param cesize size of the pattern ces
478 * @param expansionsize total size of the expansions
479 * @param defaultforward the default forward value
480 * @param defaultbackward the default backward value
483 inline void setShiftTable(int16_t shift
[], int16_t backshift
[],
484 int32_t *cetable
, int32_t cesize
,
485 int16_t expansionsize
,
486 int16_t defaultforward
,
487 int16_t defaultbackward
)
489 // estimate the value to shift. to do that we estimate the smallest
490 // number of characters to give the relevant ces, ie approximately
491 // the number of ces minus their expansion, since expansions can come
494 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
495 shift
[count
] = defaultforward
;
497 cesize
--; // down to the last index
498 for (count
= 0; count
< cesize
; count
++) {
499 // number of ces from right of array to the count
500 int temp
= defaultforward
- count
- 1;
501 shift
[hashFromCE32(cetable
[count
])] = temp
> 1 ? temp
: 1;
503 shift
[hashFromCE32(cetable
[cesize
])] = 1;
504 // for ignorables we just shift by one. see test examples.
505 shift
[hashFromCE32(0)] = 1;
507 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
508 backshift
[count
] = defaultbackward
;
510 for (count
= cesize
; count
> 0; count
--) {
511 // the original value count does not seem to work
512 backshift
[hashFromCE32(cetable
[count
])] = count
> expansionsize
?
513 (int16_t)(count
- expansionsize
) : 1;
515 backshift
[hashFromCE32(cetable
[0])] = 1;
516 backshift
[hashFromCE32(0)] = 1;
520 * Building of the pattern collation element list and the boyer moore strsrch
522 * The canonical match will only be performed after the default match fails.
523 * For both cases we need to remember the size of the composed and decomposed
524 * versions of the string. Since the Boyer-Moore shift calculations shifts by
525 * a number of characters in the text and tries to match the pattern from that
526 * offset, the shift value can not be too large in case we miss some
527 * characters. To choose a right shift size, we estimate the NFC form of the
528 * and use its size as a shift guide. The NFC form should be the small
529 * possible representation of the pattern. Anyways, we'll err on the smaller
530 * shift size. Hence the calculation for minlength.
531 * Canonical match will be performed slightly differently. We'll split the
532 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
533 * the first and last base character (MS), the ending accents (EA). Matches
534 * will be done on MS first, and only when we match MS then some processing
535 * will be required for the prefix and end accents in order to determine if
536 * they match PA and EA. Hence the default shift values
537 * for the canonical match will take the size of either end's accent into
538 * consideration. Forwards search will take the end accents into consideration
539 * for the default shift values and the backwards search will take the prefix
540 * accents into consideration.
541 * If pattern has no non-ignorable ce, we return a illegal argument error.
542 * Internal method, status assumed to be success.
543 * @param strsrch UStringSearch data storage
544 * @param status for output errors if it occurs, status is assumed to be a
545 * success when it is passed in.
548 inline void initialize(UStringSearch
*strsrch
, UErrorCode
*status
)
550 int16_t expandlength
= initializePattern(strsrch
, status
);
551 if (U_SUCCESS(*status
) && strsrch
->pattern
.cesLength
> 0) {
552 UPattern
*pattern
= &strsrch
->pattern
;
553 int32_t cesize
= pattern
->cesLength
;
555 int16_t minlength
= cesize
> expandlength
556 ? (int16_t)cesize
- expandlength
: 1;
557 pattern
->defaultShiftSize
= minlength
;
558 setShiftTable(pattern
->shift
, pattern
->backShift
, pattern
->ces
,
559 cesize
, expandlength
, minlength
, minlength
);
562 strsrch
->pattern
.defaultShiftSize
= 0;
567 * Check to make sure that the match length is at the end of the character by
568 * using the breakiterator.
569 * @param strsrch string search data
570 * @param start target text start offset
571 * @param end target text end offset
574 void checkBreakBoundary(const UStringSearch
*strsrch
, int32_t * /*start*/,
577 #if !UCONFIG_NO_BREAK_ITERATION
578 UBreakIterator
*breakiterator
= strsrch
->search
->internalBreakIter
;
580 int32_t matchend
= *end
;
581 //int32_t matchstart = *start;
583 if (!ubrk_isBoundary(breakiterator
, matchend
)) {
584 *end
= ubrk_following(breakiterator
, matchend
);
587 /* Check the start of the matched text to make sure it doesn't have any accents
588 * before it. This code may not be necessary and so it is commented out */
589 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
590 *start = ubrk_preceding(breakiterator, matchstart);
597 * Determine whether the target text in UStringSearch bounded by the offset
598 * start and end is one or more whole units of text as
599 * determined by the breakiterator in UStringSearch.
600 * @param strsrch string search data
601 * @param start target text start offset
602 * @param end target text end offset
605 UBool
isBreakUnit(const UStringSearch
*strsrch
, int32_t start
,
608 #if !UCONFIG_NO_BREAK_ITERATION
609 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
612 int32_t startindex
= ubrk_first(breakiterator
);
613 int32_t endindex
= ubrk_last(breakiterator
);
615 // out-of-range indexes are never boundary positions
616 if (start
< startindex
|| start
> endindex
||
617 end
< startindex
|| end
> endindex
) {
620 // otherwise, we can use following() on the position before the
621 // specified one and return true of the position we get back is the
622 // one the user specified
623 UBool result
= (start
== startindex
||
624 ubrk_following(breakiterator
, start
- 1) == start
) &&
626 ubrk_following(breakiterator
, end
- 1) == end
);
628 // iterates the individual ces
629 UCollationElements
*coleiter
= strsrch
->utilIter
;
630 const UChar
*text
= strsrch
->search
->text
+
632 UErrorCode status
= U_ZERO_ERROR
;
633 ucol_setText(coleiter
, text
, end
- start
, &status
);
634 for (int32_t count
= 0; count
< strsrch
->pattern
.cesLength
;
636 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
637 if (ce
== UCOL_IGNORABLE
) {
641 if (U_FAILURE(status
) || ce
!= strsrch
->pattern
.ces
[count
]) {
645 int32_t nextce
= ucol_next(coleiter
, &status
);
646 while (ucol_getOffset(coleiter
) == (end
- start
)
647 && getCE(strsrch
, nextce
) == UCOL_IGNORABLE
) {
648 nextce
= ucol_next(coleiter
, &status
);
650 if (ucol_getOffset(coleiter
) == (end
- start
)
651 && nextce
!= UCOL_NULLORDER
) {
652 // extra collation elements at the end of the match
663 * Getting the next base character offset if current offset is an accent,
664 * or the current offset if the current character contains a base character.
665 * accents the following base character will be returned
667 * @param textoffset current offset
668 * @param textlength length of text string
669 * @return the next base character or the current offset
670 * if the current character is contains a base character.
673 inline int32_t getNextBaseOffset(const UChar
*text
,
677 if (textoffset
< textlength
) {
678 int32_t temp
= textoffset
;
679 if (getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
680 while (temp
< textlength
) {
681 int32_t result
= temp
;
682 if ((getFCD(text
, &temp
, textlength
) >>
683 SECOND_LAST_BYTE_SHIFT_
) == 0) {
694 * Gets the next base character offset depending on the string search pattern
696 * @param strsrch string search data
697 * @param textoffset current offset, one offset away from the last character
699 * @return start index of the next base character or the current offset
700 * if the current character is contains a base character.
703 inline int32_t getNextUStringSearchBaseOffset(UStringSearch
*strsrch
,
706 int32_t textlength
= strsrch
->search
->textLength
;
707 if (strsrch
->pattern
.hasSuffixAccents
&&
708 textoffset
< textlength
) {
709 int32_t temp
= textoffset
;
710 const UChar
*text
= strsrch
->search
->text
;
711 U16_BACK_1(text
, 0, temp
);
712 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
713 return getNextBaseOffset(text
, textoffset
, textlength
);
720 * Shifting the collation element iterator position forward to prepare for
721 * a following match. If the last character is a unsafe character, we'll only
722 * shift by 1 to capture contractions, normalization etc.
723 * Internal method, status assumed to be success.
724 * @param text strsrch string search data
725 * @param textoffset start text position to do search
726 * @param ce the text ce which failed the match.
727 * @param patternceindex index of the ce within the pattern ce buffer which
729 * @return final offset
732 inline int32_t shiftForward(UStringSearch
*strsrch
,
735 int32_t patternceindex
)
737 UPattern
*pattern
= &(strsrch
->pattern
);
738 if (ce
!= UCOL_NULLORDER
) {
739 int32_t shift
= pattern
->shift
[hashFromCE32(ce
)];
740 // this is to adjust for characters in the middle of the
741 // substring for matching that failed.
742 int32_t adjust
= pattern
->cesLength
- patternceindex
;
743 if (adjust
> 1 && shift
>= adjust
) {
749 textoffset
+= pattern
->defaultShiftSize
;
752 textoffset
= getNextUStringSearchBaseOffset(strsrch
, textoffset
);
753 // check for unsafe characters
754 // * if it is the start or middle of a contraction: to be done after
755 // a initial match is found
756 // * thai or lao base consonant character: similar to contraction
757 // * high surrogate character: similar to contraction
758 // * next character is a accent: shift to the next base character
761 #endif // #if BOYER_MOORE
764 * sets match not found
765 * @param strsrch string search data
768 inline void setMatchNotFound(UStringSearch
*strsrch
)
770 // this method resets the match result regardless of the error status.
771 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
772 strsrch
->search
->matchedLength
= 0;
773 if (strsrch
->search
->isForwardSearching
) {
774 setColEIterOffset(strsrch
->textIter
, strsrch
->search
->textLength
);
777 setColEIterOffset(strsrch
->textIter
, 0);
783 * Gets the offset to the next safe point in text.
784 * ie. not the middle of a contraction, swappable characters or supplementary
786 * @param collator collation sata
787 * @param text string to work with
788 * @param textoffset offset in string
789 * @param textlength length of text string
790 * @return offset to the next safe character
793 inline int32_t getNextSafeOffset(const UCollator
*collator
,
798 int32_t result
= textoffset
; // first contraction character
799 while (result
!= textlength
&& ucol_unsafeCP(text
[result
], collator
)) {
806 * This checks for accents in the potential match started with a .
807 * composite character.
808 * This is really painful... we have to check that composite character do not
809 * have any extra accents. We have to normalize the potential match and find
810 * the immediate decomposed character before the match.
811 * The first composite character would have been taken care of by the fcd
812 * checks in checkForwardExactMatch.
813 * This is the slow path after the fcd of the first character and
814 * the last character has been checked by checkForwardExactMatch and we
815 * determine that the potential match has extra non-ignorable preceding
817 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
818 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
819 * Note here that accents checking are slow and cautioned in the API docs.
820 * Internal method, status assumed to be a success, caller should check status
821 * before calling this method
822 * @param strsrch string search data
823 * @param start index of the potential unfriendly composite character
824 * @param end index of the potential unfriendly composite character
825 * @param status output error status if any.
826 * @return TRUE if there is non-ignorable accents before at the beginning
827 * of the match, FALSE otherwise.
831 UBool
checkExtraMatchAccents(const UStringSearch
*strsrch
, int32_t start
,
835 UBool result
= FALSE
;
836 if (strsrch
->pattern
.hasPrefixAccents
) {
837 int32_t length
= end
- start
;
839 const UChar
*text
= strsrch
->search
->text
+ start
;
841 U16_FWD_1(text
, offset
, length
);
842 // we are only concerned with the first composite character
843 if (unorm_quickCheck(text
, offset
, UNORM_NFD
, status
) == UNORM_NO
) {
844 int32_t safeoffset
= getNextSafeOffset(strsrch
->collator
,
846 if (safeoffset
!= length
) {
850 UChar buffer
[INITIAL_ARRAY_SIZE_
];
851 int32_t size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0,
852 buffer
, INITIAL_ARRAY_SIZE_
,
854 if (U_FAILURE(*status
)) {
857 if (size
>= INITIAL_ARRAY_SIZE_
) {
858 norm
= (UChar
*)allocateMemory((size
+ 1) * sizeof(UChar
),
860 // if allocation failed, status will be set to
861 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
863 size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0, norm
,
865 if (U_FAILURE(*status
) && norm
!= NULL
) {
874 UCollationElements
*coleiter
= strsrch
->utilIter
;
875 ucol_setText(coleiter
, norm
, size
, status
);
876 uint32_t firstce
= strsrch
->pattern
.ces
[0];
877 UBool ignorable
= TRUE
;
878 uint32_t ce
= UCOL_IGNORABLE
;
879 while (U_SUCCESS(*status
) && ce
!= firstce
&& ce
!= (uint32_t)UCOL_NULLORDER
) {
880 offset
= ucol_getOffset(coleiter
);
881 if (ce
!= firstce
&& ce
!= UCOL_IGNORABLE
) {
884 ce
= ucol_next(coleiter
, status
);
887 U16_PREV(norm
, 0, offset
, codepoint
);
888 result
= !ignorable
&& (u_getCombiningClass(codepoint
) != 0);
890 if (norm
!= buffer
) {
900 * Used by exact matches, checks if there are accents before the match.
901 * This is really painful... we have to check that composite characters at
902 * the start of the matches have to not have any extra accents.
903 * We check the FCD of the character first, if it starts with an accent and
904 * the first pattern ce does not match the first ce of the character, we bail.
905 * Otherwise we try normalizing the first composite
906 * character and find the immediate decomposed character before the match to
907 * see if it is an non-ignorable accent.
908 * Now normalizing the first composite character is enough because we ensure
909 * that when the match is passed in here with extra beginning ces, the
910 * first or last ce that match has to occur within the first character.
911 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
912 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
913 * Note here that accents checking are slow and cautioned in the API docs.
914 * @param strsrch string search data
915 * @param start offset
917 * @return TRUE if there are accents on either side of the match,
921 UBool
hasAccentsBeforeMatch(const UStringSearch
*strsrch
, int32_t start
,
924 if (strsrch
->pattern
.hasPrefixAccents
) {
925 UCollationElements
*coleiter
= strsrch
->textIter
;
926 UErrorCode status
= U_ZERO_ERROR
;
927 // we have been iterating forwards previously
928 uint32_t ignorable
= TRUE
;
929 int32_t firstce
= strsrch
->pattern
.ces
[0];
931 setColEIterOffset(coleiter
, start
);
932 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
933 if (U_FAILURE(status
)) {
936 while (ce
!= firstce
) {
937 if (ce
!= UCOL_IGNORABLE
) {
940 ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
941 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
945 if (!ignorable
&& inNormBuf(coleiter
)) {
946 // within normalization buffer, discontiguous handled here
951 int32_t temp
= start
;
953 // accent = (getFCD(strsrch->search->text, &temp,
954 // strsrch->search->textLength)
955 // >> SECOND_LAST_BYTE_SHIFT_);
956 // however this code does not work well with VC7 .net in release mode.
957 // maybe the inlines for getFCD combined with shifting has bugs in
958 // VC7. anyways this is a work around.
959 UBool accent
= getFCD(strsrch
->search
->text
, &temp
,
960 strsrch
->search
->textLength
) > 0xFF;
962 return checkExtraMatchAccents(strsrch
, start
, end
, &status
);
969 U16_BACK_1(strsrch
->search
->text
, 0, temp
);
970 if (getFCD(strsrch
->search
->text
, &temp
,
971 strsrch
->search
->textLength
) & LAST_BYTE_MASK_
) {
972 setColEIterOffset(coleiter
, start
);
973 ce
= ucol_previous(coleiter
, &status
);
974 if (U_FAILURE(status
) ||
975 (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
)) {
986 * Used by exact matches, checks if there are accents bounding the match.
987 * Note this is the initial boundary check. If the potential match
988 * starts or ends with composite characters, the accents in those
989 * characters will be determined later.
990 * Not doing backwards iteration here, since discontiguos contraction for
991 * backwards collation element iterator, use up too many characters.
992 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
993 * should fail since there is a acute at the end of \u01FA
994 * Note here that accents checking are slow and cautioned in the API docs.
995 * @param strsrch string search data
996 * @param start offset of match
997 * @param end end offset of the match
998 * @return TRUE if there are accents on either side of the match,
1002 UBool
hasAccentsAfterMatch(const UStringSearch
*strsrch
, int32_t start
,
1005 if (strsrch
->pattern
.hasSuffixAccents
) {
1006 const UChar
*text
= strsrch
->search
->text
;
1008 int32_t textlength
= strsrch
->search
->textLength
;
1009 U16_BACK_1(text
, 0, temp
);
1010 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
1011 int32_t firstce
= strsrch
->pattern
.ces
[0];
1012 UCollationElements
*coleiter
= strsrch
->textIter
;
1013 UErrorCode status
= U_ZERO_ERROR
;
1015 setColEIterOffset(coleiter
, start
);
1016 while ((ce
= getCE(strsrch
, ucol_next(coleiter
, &status
))) != firstce
) {
1017 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
1022 while (count
< strsrch
->pattern
.cesLength
) {
1023 if (getCE(strsrch
, ucol_next(coleiter
, &status
))
1024 == UCOL_IGNORABLE
) {
1025 // Thai can give an ignorable here.
1028 if (U_FAILURE(status
)) {
1034 ce
= ucol_next(coleiter
, &status
);
1035 if (U_FAILURE(status
)) {
1038 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1039 ce
= getCE(strsrch
, ce
);
1041 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1042 if (ucol_getOffset(coleiter
) <= end
) {
1045 if (getFCD(text
, &end
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
1053 #endif // #if BOYER_MOORE
1056 * Checks if the offset runs out of the text string
1058 * @param textlength of the text string
1059 * @return TRUE if offset is out of bounds, FALSE otherwise
1062 inline UBool
isOutOfBounds(int32_t textlength
, int32_t offset
)
1064 return offset
< 0 || offset
> textlength
;
1068 * Checks for identical match
1069 * @param strsrch string search data
1070 * @param start offset of possible match
1071 * @param end offset of possible match
1072 * @return TRUE if identical match is found
1075 inline UBool
checkIdentical(const UStringSearch
*strsrch
, int32_t start
,
1078 if (strsrch
->strength
!= UCOL_IDENTICAL
) {
1082 // Note: We could use Normalizer::compare() or similar, but for short strings
1083 // which may not be in FCD it might be faster to just NFD them.
1084 UErrorCode status
= U_ZERO_ERROR
;
1085 UnicodeString t2
, p2
;
1086 strsrch
->nfd
->normalize(
1087 UnicodeString(FALSE
, strsrch
->search
->text
+ start
, end
- start
), t2
, status
);
1088 strsrch
->nfd
->normalize(
1089 UnicodeString(FALSE
, strsrch
->pattern
.text
, strsrch
->pattern
.textLength
), p2
, status
);
1090 // return FALSE if NFD failed
1091 return U_SUCCESS(status
) && t2
== p2
;
1096 * Checks to see if the match is repeated
1097 * @param strsrch string search data
1098 * @param start new match start index
1099 * @param end new match end index
1100 * @return TRUE if the the match is repeated, FALSE otherwise
1103 inline UBool
checkRepeatedMatch(UStringSearch
*strsrch
,
1107 int32_t lastmatchindex
= strsrch
->search
->matchedIndex
;
1109 if (lastmatchindex
== USEARCH_DONE
) {
1112 if (strsrch
->search
->isForwardSearching
) {
1113 result
= start
<= lastmatchindex
;
1116 result
= start
>= lastmatchindex
;
1118 if (!result
&& !strsrch
->search
->isOverlap
) {
1119 if (strsrch
->search
->isForwardSearching
) {
1120 result
= start
< lastmatchindex
+ strsrch
->search
->matchedLength
;
1123 result
= end
> lastmatchindex
;
1130 * Gets the collation element iterator's current offset.
1131 * @param coleiter collation element iterator
1132 * @param forwards flag TRUE if we are moving in th forwards direction
1133 * @return current offset
1136 inline int32_t getColElemIterOffset(const UCollationElements
*coleiter
,
1139 int32_t result
= ucol_getOffset(coleiter
);
1140 // intricacies of the the backwards collation element iterator
1141 if (FALSE
&& !forwards
&& inNormBuf(coleiter
) && !isFCDPointerNull(coleiter
)) {
1148 * Checks match for contraction.
1149 * If the match ends with a partial contraction we fail.
1150 * If the match starts too far off (because of backwards iteration) we try to
1151 * chip off the extra characters depending on whether a breakiterator has
1153 * Internal method, error assumed to be success, caller has to check status
1154 * before calling this method.
1155 * @param strsrch string search data
1156 * @param start offset of potential match, to be modified if necessary
1157 * @param end offset of potential match, to be modified if necessary
1158 * @param status output error status if any
1159 * @return TRUE if match passes the contraction test, FALSE otherwise
1163 UBool
checkNextExactContractionMatch(UStringSearch
*strsrch
,
1165 int32_t *end
, UErrorCode
*status
)
1167 UCollationElements
*coleiter
= strsrch
->textIter
;
1168 int32_t textlength
= strsrch
->search
->textLength
;
1169 int32_t temp
= *start
;
1170 const UCollator
*collator
= strsrch
->collator
;
1171 const UChar
*text
= strsrch
->search
->text
;
1172 // This part checks if either ends of the match contains potential
1173 // contraction. If so we'll have to iterate through them
1174 // The start contraction needs to be checked since ucol_previous dumps
1175 // all characters till the first safe character into the buffer.
1176 // *start + 1 is used to test for the unsafe characters instead of *start
1177 // because ucol_prev takes all unsafe characters till the first safe
1178 // character ie *start. so by testing *start + 1, we can estimate if
1179 // excess prefix characters has been included in the potential search
1181 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1182 (*start
+ 1 < textlength
1183 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1184 int32_t expansion
= getExpansionPrefix(coleiter
);
1185 UBool expandflag
= expansion
> 0;
1186 setColEIterOffset(coleiter
, *start
);
1187 while (expansion
> 0) {
1188 // getting rid of the redundant ce, caused by setOffset.
1189 // since backward contraction/expansion may have extra ces if we
1190 // are in the normalization buffer, hasAccentsBeforeMatch would
1191 // have taken care of it.
1192 // E.g. the character \u01FA will have an expansion of 3, but if
1193 // we are only looking for acute and ring \u030A and \u0301, we'll
1194 // have to skip the first ce in the expansion buffer.
1195 ucol_next(coleiter
, status
);
1196 if (U_FAILURE(*status
)) {
1199 if (ucol_getOffset(coleiter
) != temp
) {
1201 temp
= ucol_getOffset(coleiter
);
1206 int32_t *patternce
= strsrch
->pattern
.ces
;
1207 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
1209 while (count
< patterncelength
) {
1210 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1211 if (ce
== UCOL_IGNORABLE
) {
1214 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1216 temp
= ucol_getOffset(coleiter
);
1218 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1220 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1230 * Checks and sets the match information if found.
1233 * <li> the potential match does not repeat the previous match
1234 * <li> boundaries are correct
1235 * <li> exact matches has no extra accents
1236 * <li> identical matchesb
1237 * <li> potential match does not end in the middle of a contraction
1239 * Otherwise the offset will be shifted to the next character.
1240 * Internal method, status assumed to be success, caller has to check status
1241 * before calling this method.
1242 * @param strsrch string search data
1243 * @param textoffset offset in the collation element text. the returned value
1244 * will be the truncated end offset of the match or the new start
1246 * @param status output error status if any
1247 * @return TRUE if the match is valid, FALSE otherwise
1250 inline UBool
checkNextExactMatch(UStringSearch
*strsrch
,
1251 int32_t *textoffset
, UErrorCode
*status
)
1253 UCollationElements
*coleiter
= strsrch
->textIter
;
1254 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1256 if (!checkNextExactContractionMatch(strsrch
, &start
, textoffset
, status
)) {
1260 // this totally matches, however we need to check if it is repeating
1261 if (!isBreakUnit(strsrch
, start
, *textoffset
) ||
1262 checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1263 hasAccentsBeforeMatch(strsrch
, start
, *textoffset
) ||
1264 !checkIdentical(strsrch
, start
, *textoffset
) ||
1265 hasAccentsAfterMatch(strsrch
, start
, *textoffset
)) {
1268 *textoffset
= getNextUStringSearchBaseOffset(strsrch
, *textoffset
);
1272 //Add breakiterator boundary check for primary strength search.
1273 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
1274 checkBreakBoundary(strsrch
, &start
, textoffset
);
1277 // totally match, we will get rid of the ending ignorables.
1278 strsrch
->search
->matchedIndex
= start
;
1279 strsrch
->search
->matchedLength
= *textoffset
- start
;
1284 * Getting the previous base character offset, or the current offset if the
1285 * current character is a base character
1286 * @param text string
1287 * @param textoffset one offset after the current character
1288 * @return the offset of the next character after the base character or the first
1289 * composed character with accents
1292 inline int32_t getPreviousBaseOffset(const UChar
*text
,
1295 if (textoffset
> 0) {
1297 int32_t result
= textoffset
;
1298 U16_BACK_1(text
, 0, textoffset
);
1299 int32_t temp
= textoffset
;
1300 uint16_t fcd
= getFCD(text
, &temp
, result
);
1301 if ((fcd
>> SECOND_LAST_BYTE_SHIFT_
) == 0) {
1302 if (fcd
& LAST_BYTE_MASK_
) {
1307 if (textoffset
== 0) {
1316 * Getting the indexes of the accents that are not blocked in the argument
1318 * @param accents array of accents in nfd terminated by a 0.
1319 * @param accentsindex array of indexes of the accents that are not blocked
1322 inline int getUnblockedAccentIndex(UChar
*accents
, int32_t *accentsindex
)
1325 int32_t length
= u_strlen(accents
);
1326 UChar32 codepoint
= 0;
1330 while (index
< length
) {
1332 U16_NEXT(accents
, index
, length
, codepoint
);
1333 if (u_getCombiningClass(codepoint
) != cclass
) {
1334 cclass
= u_getCombiningClass(codepoint
);
1335 accentsindex
[result
] = temp
;
1339 accentsindex
[result
] = length
;
1344 * Appends 3 UChar arrays to a destination array.
1345 * Creates a new array if we run out of space. The caller will have to
1346 * manually deallocate the newly allocated array.
1347 * Internal method, status assumed to be success, caller has to check status
1348 * before calling this method. destination not to be NULL and has at least
1349 * size destinationlength.
1350 * @param destination target array
1351 * @param destinationlength target array size, returning the appended length
1352 * @param source1 null-terminated first array
1353 * @param source2 second array
1354 * @param source2length length of seond array
1355 * @param source3 null-terminated third array
1356 * @param status error status if any
1357 * @return new destination array, destination if there was no new allocation
1360 inline UChar
* addToUCharArray( UChar
*destination
,
1361 int32_t *destinationlength
,
1362 const UChar
*source1
,
1363 const UChar
*source2
,
1364 int32_t source2length
,
1365 const UChar
*source3
,
1368 int32_t source1length
= source1
? u_strlen(source1
) : 0;
1369 int32_t source3length
= source3
? u_strlen(source3
) : 0;
1370 if (*destinationlength
< source1length
+ source2length
+ source3length
+
1373 destination
= (UChar
*)allocateMemory(
1374 (source1length
+ source2length
+ source3length
+ 1) * sizeof(UChar
),
1376 // if error allocating memory, status will be
1377 // U_MEMORY_ALLOCATION_ERROR
1378 if (U_FAILURE(*status
)) {
1379 *destinationlength
= 0;
1383 if (source1length
!= 0) {
1384 u_memcpy(destination
, source1
, source1length
);
1386 if (source2length
!= 0) {
1387 uprv_memcpy(destination
+ source1length
, source2
,
1388 sizeof(UChar
) * source2length
);
1390 if (source3length
!= 0) {
1391 uprv_memcpy(destination
+ source1length
+ source2length
, source3
,
1392 sizeof(UChar
) * source3length
);
1394 *destinationlength
= source1length
+ source2length
+ source3length
;
1399 * Running through a collation element iterator to see if the contents matches
1400 * pattern in string search data
1401 * @param strsrch string search data
1402 * @param coleiter collation element iterator
1403 * @return TRUE if a match if found, FALSE otherwise
1406 inline UBool
checkCollationMatch(const UStringSearch
*strsrch
,
1407 UCollationElements
*coleiter
)
1409 int patternceindex
= strsrch
->pattern
.cesLength
;
1410 int32_t *patternce
= strsrch
->pattern
.ces
;
1411 UErrorCode status
= U_ZERO_ERROR
;
1412 while (patternceindex
> 0) {
1413 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
1414 if (ce
== UCOL_IGNORABLE
) {
1417 if (U_FAILURE(status
) || ce
!= *patternce
) {
1427 * Rearranges the front accents to try matching.
1428 * Prefix accents in the text will be grouped according to their combining
1429 * class and the groups will be mixed and matched to try find the perfect
1430 * match with the pattern.
1431 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1432 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1433 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1435 * step 2: check if any of the generated substrings matches the pattern.
1436 * Internal method, status is assumed to be success, caller has to check status
1437 * before calling this method.
1438 * @param strsrch string search match
1439 * @param start first offset of the accents to start searching
1440 * @param end start of the last accent set
1441 * @param status output error status if any
1442 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1443 * offset of the match. Note this start includes all preceding accents.
1446 int32_t doNextCanonicalPrefixMatch(UStringSearch
*strsrch
,
1451 const UChar
*text
= strsrch
->search
->text
;
1452 int32_t textlength
= strsrch
->search
->textLength
;
1453 int32_t tempstart
= start
;
1455 if ((getFCD(text
, &tempstart
, textlength
) & LAST_BYTE_MASK_
) == 0) {
1456 // die... failed at a base character
1457 return USEARCH_DONE
;
1460 int32_t offset
= getNextBaseOffset(text
, tempstart
, textlength
);
1461 start
= getPreviousBaseOffset(text
, tempstart
);
1463 UChar accents
[INITIAL_ARRAY_SIZE_
];
1464 // normalizing the offensive string
1465 unorm_normalize(text
+ start
, offset
- start
, UNORM_NFD
, 0, accents
,
1466 INITIAL_ARRAY_SIZE_
, status
);
1467 if (U_FAILURE(*status
)) {
1468 return USEARCH_DONE
;
1471 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1472 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1474 int32_t count
= (2 << (accentsize
- 1)) - 1;
1475 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1476 UCollationElements
*coleiter
= strsrch
->utilIter
;
1477 while (U_SUCCESS(*status
) && count
> 0) {
1478 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
1479 // copy the base characters
1480 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1481 *rearrange
++ = accents
[k
];
1483 // forming all possible canonical rearrangement by dropping
1485 for (int i
= 0; i
<= accentsize
- 1; i
++) {
1486 int32_t mask
= 1 << (accentsize
- i
- 1);
1488 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1489 *rearrange
++ = accents
[j
];
1494 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
1495 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
1496 strsrch
->canonicalPrefixAccents
,
1497 strsrch
->search
->text
+ offset
,
1499 strsrch
->canonicalSuffixAccents
,
1502 // if status is a failure, ucol_setText does nothing.
1503 // run the collator iterator through this match
1504 ucol_setText(coleiter
, match
, matchsize
, status
);
1505 if (U_SUCCESS(*status
)) {
1506 if (checkCollationMatch(strsrch
, coleiter
)) {
1507 if (match
!= buffer
) {
1515 return USEARCH_DONE
;
1519 * Gets the offset to the safe point in text before textoffset.
1520 * ie. not the middle of a contraction, swappable characters or supplementary
1522 * @param collator collation sata
1523 * @param text string to work with
1524 * @param textoffset offset in string
1525 * @param textlength length of text string
1526 * @return offset to the previous safe character
1529 inline uint32_t getPreviousSafeOffset(const UCollator
*collator
,
1533 int32_t result
= textoffset
; // first contraction character
1534 while (result
!= 0 && ucol_unsafeCP(text
[result
- 1], collator
)) {
1538 // the first contraction character is consider unsafe here
1545 * Cleaning up after we passed the safe zone
1546 * @param strsrch string search data
1547 * @param safetext safe text array
1548 * @param safebuffer safe text buffer
1549 * @param coleiter collation element iterator for safe text
1552 inline void cleanUpSafeText(const UStringSearch
*strsrch
, UChar
*safetext
,
1555 if (safetext
!= safebuffer
&& safetext
!= strsrch
->canonicalSuffixAccents
)
1557 uprv_free(safetext
);
1562 * Take the rearranged end accents and tries matching. If match failed at
1563 * a seperate preceding set of accents (seperated from the rearranged on by
1564 * at least a base character) then we rearrange the preceding accents and
1565 * tries matching again.
1566 * We allow skipping of the ends of the accent set if the ces do not match.
1567 * However if the failure is found before the accent set, it fails.
1568 * Internal method, status assumed to be success, caller has to check status
1569 * before calling this method.
1570 * @param strsrch string search data
1571 * @param textoffset of the start of the rearranged accent
1572 * @param status output error status if any
1573 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1574 * offset of the match. Note this start includes all preceding accents.
1577 int32_t doNextCanonicalSuffixMatch(UStringSearch
*strsrch
,
1581 const UChar
*text
= strsrch
->search
->text
;
1582 const UCollator
*collator
= strsrch
->collator
;
1583 int32_t safelength
= 0;
1585 int32_t safetextlength
;
1586 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
1587 UCollationElements
*coleiter
= strsrch
->utilIter
;
1588 int32_t safeoffset
= textoffset
;
1590 if (textoffset
!= 0 && ucol_unsafeCP(strsrch
->canonicalSuffixAccents
[0],
1592 safeoffset
= getPreviousSafeOffset(collator
, text
, textoffset
);
1593 safelength
= textoffset
- safeoffset
;
1594 safetextlength
= INITIAL_ARRAY_SIZE_
;
1595 safetext
= addToUCharArray(safebuffer
, &safetextlength
, NULL
,
1596 text
+ safeoffset
, safelength
,
1597 strsrch
->canonicalSuffixAccents
,
1601 safetextlength
= u_strlen(strsrch
->canonicalSuffixAccents
);
1602 safetext
= strsrch
->canonicalSuffixAccents
;
1605 // if status is a failure, ucol_setText does nothing
1606 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
1607 // status checked in loop below
1609 int32_t *ce
= strsrch
->pattern
.ces
;
1610 int32_t celength
= strsrch
->pattern
.cesLength
;
1611 int ceindex
= celength
- 1;
1612 UBool isSafe
= TRUE
; // indication flag for position in safe zone
1614 while (ceindex
>= 0) {
1615 int32_t textce
= ucol_previous(coleiter
, status
);
1616 if (U_FAILURE(*status
)) {
1618 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1620 return USEARCH_DONE
;
1622 if (textce
== UCOL_NULLORDER
) {
1623 // check if we have passed the safe buffer
1624 if (coleiter
== strsrch
->textIter
) {
1625 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1626 return USEARCH_DONE
;
1628 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1629 safetext
= safebuffer
;
1630 coleiter
= strsrch
->textIter
;
1631 setColEIterOffset(coleiter
, safeoffset
);
1632 // status checked at the start of the loop
1636 textce
= getCE(strsrch
, textce
);
1637 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
1638 // do the beginning stuff
1639 int32_t failedoffset
= getColElemIterOffset(coleiter
, FALSE
);
1640 if (isSafe
&& failedoffset
>= safelength
) {
1641 // alas... no hope. failed at rearranged accent set
1642 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1643 return USEARCH_DONE
;
1647 failedoffset
+= safeoffset
;
1648 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1651 // try rearranging the front accents
1652 int32_t result
= doNextCanonicalPrefixMatch(strsrch
,
1653 failedoffset
, textoffset
, status
);
1654 if (result
!= USEARCH_DONE
) {
1655 // if status is a failure, ucol_setOffset does nothing
1656 setColEIterOffset(strsrch
->textIter
, result
);
1658 if (U_FAILURE(*status
)) {
1659 return USEARCH_DONE
;
1664 if (textce
== ce
[ceindex
]) {
1670 int32_t result
= getColElemIterOffset(coleiter
, FALSE
);
1671 // sets the text iterator here with the correct expansion and offset
1672 int32_t leftoverces
= getExpansionPrefix(coleiter
);
1673 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1674 if (result
>= safelength
) {
1675 result
= textoffset
;
1678 result
+= safeoffset
;
1680 setColEIterOffset(strsrch
->textIter
, result
);
1681 strsrch
->textIter
->iteratordata_
.toReturn
=
1682 setExpansionPrefix(strsrch
->textIter
, leftoverces
);
1686 return ucol_getOffset(coleiter
);
1690 * Trying out the substring and sees if it can be a canonical match.
1691 * This will try normalizing the end accents and arranging them into canonical
1692 * equivalents and check their corresponding ces with the pattern ce.
1693 * Suffix accents in the text will be grouped according to their combining
1694 * class and the groups will be mixed and matched to try find the perfect
1695 * match with the pattern.
1696 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1697 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1698 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1700 * step 2: check if any of the generated substrings matches the pattern.
1701 * Internal method, status assumed to be success, caller has to check status
1702 * before calling this method.
1703 * @param strsrch string search data
1704 * @param textoffset end offset in the collation element text that ends with
1705 * the accents to be rearranged
1706 * @param status error status if any
1707 * @return TRUE if the match is valid, FALSE otherwise
1710 UBool
doNextCanonicalMatch(UStringSearch
*strsrch
,
1714 const UChar
*text
= strsrch
->search
->text
;
1715 int32_t temp
= textoffset
;
1716 U16_BACK_1(text
, 0, temp
);
1717 if ((getFCD(text
, &temp
, textoffset
) & LAST_BYTE_MASK_
) == 0) {
1718 UCollationElements
*coleiter
= strsrch
->textIter
;
1719 int32_t offset
= getColElemIterOffset(coleiter
, FALSE
);
1720 if (strsrch
->pattern
.hasPrefixAccents
) {
1721 offset
= doNextCanonicalPrefixMatch(strsrch
, offset
, textoffset
,
1723 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
1724 setColEIterOffset(coleiter
, offset
);
1731 if (!strsrch
->pattern
.hasSuffixAccents
) {
1735 UChar accents
[INITIAL_ARRAY_SIZE_
];
1736 // offset to the last base character in substring to search
1737 int32_t baseoffset
= getPreviousBaseOffset(text
, textoffset
);
1738 // normalizing the offensive string
1739 unorm_normalize(text
+ baseoffset
, textoffset
- baseoffset
, UNORM_NFD
,
1740 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
1741 // status checked in loop below
1743 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1744 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
1746 // 2 power n - 1 plus the full set of accents
1747 int32_t count
= (2 << (size
- 1)) - 1;
1748 while (U_SUCCESS(*status
) && count
> 0) {
1749 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
1750 // copy the base characters
1751 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1752 *rearrange
++ = accents
[k
];
1754 // forming all possible canonical rearrangement by dropping
1756 for (int i
= 0; i
<= size
- 1; i
++) {
1757 int32_t mask
= 1 << (size
- i
- 1);
1759 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1760 *rearrange
++ = accents
[j
];
1765 int32_t offset
= doNextCanonicalSuffixMatch(strsrch
, baseoffset
,
1767 if (offset
!= USEARCH_DONE
) {
1768 return TRUE
; // match found
1776 * Gets the previous base character offset depending on the string search
1778 * @param strsrch string search data
1779 * @param textoffset current offset, current character
1780 * @return the offset of the next character after this base character or itself
1781 * if it is a composed character with accents
1784 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch
*strsrch
,
1787 if (strsrch
->pattern
.hasPrefixAccents
&& textoffset
> 0) {
1788 const UChar
*text
= strsrch
->search
->text
;
1789 int32_t offset
= textoffset
;
1790 if (getFCD(text
, &offset
, strsrch
->search
->textLength
) >>
1791 SECOND_LAST_BYTE_SHIFT_
) {
1792 return getPreviousBaseOffset(text
, textoffset
);
1799 * Checks match for contraction.
1800 * If the match ends with a partial contraction we fail.
1801 * If the match starts too far off (because of backwards iteration) we try to
1802 * chip off the extra characters
1803 * Internal method, status assumed to be success, caller has to check status
1804 * before calling this method.
1805 * @param strsrch string search data
1806 * @param start offset of potential match, to be modified if necessary
1807 * @param end offset of potential match, to be modified if necessary
1808 * @param status output error status if any
1809 * @return TRUE if match passes the contraction test, FALSE otherwise
1812 UBool
checkNextCanonicalContractionMatch(UStringSearch
*strsrch
,
1817 UCollationElements
*coleiter
= strsrch
->textIter
;
1818 int32_t textlength
= strsrch
->search
->textLength
;
1819 int32_t temp
= *start
;
1820 const UCollator
*collator
= strsrch
->collator
;
1821 const UChar
*text
= strsrch
->search
->text
;
1822 // This part checks if either ends of the match contains potential
1823 // contraction. If so we'll have to iterate through them
1824 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1825 (*start
+ 1 < textlength
1826 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1827 int32_t expansion
= getExpansionPrefix(coleiter
);
1828 UBool expandflag
= expansion
> 0;
1829 setColEIterOffset(coleiter
, *start
);
1830 while (expansion
> 0) {
1831 // getting rid of the redundant ce, caused by setOffset.
1832 // since backward contraction/expansion may have extra ces if we
1833 // are in the normalization buffer, hasAccentsBeforeMatch would
1834 // have taken care of it.
1835 // E.g. the character \u01FA will have an expansion of 3, but if
1836 // we are only looking for acute and ring \u030A and \u0301, we'll
1837 // have to skip the first ce in the expansion buffer.
1838 ucol_next(coleiter
, status
);
1839 if (U_FAILURE(*status
)) {
1842 if (ucol_getOffset(coleiter
) != temp
) {
1844 temp
= ucol_getOffset(coleiter
);
1849 int32_t *patternce
= strsrch
->pattern
.ces
;
1850 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
1852 int32_t textlength
= strsrch
->search
->textLength
;
1853 while (count
< patterncelength
) {
1854 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1855 // status checked below, note that if status is a failure
1856 // ucol_next returns UCOL_NULLORDER
1857 if (ce
== UCOL_IGNORABLE
) {
1860 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1862 temp
= ucol_getOffset(coleiter
);
1865 if (count
== 0 && ce
!= patternce
[0]) {
1866 // accents may have extra starting ces, this occurs when a
1867 // pure accent pattern is matched without rearrangement
1868 // text \u0325\u0300 and looking for \u0300
1869 int32_t expected
= patternce
[0];
1870 if (getFCD(text
, start
, textlength
) & LAST_BYTE_MASK_
) {
1871 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1872 while (U_SUCCESS(*status
) && ce
!= expected
&&
1873 ce
!= UCOL_NULLORDER
&&
1874 ucol_getOffset(coleiter
) <= *end
) {
1875 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1879 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1881 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1891 * Checks and sets the match information if found.
1894 * <li> the potential match does not repeat the previous match
1895 * <li> boundaries are correct
1896 * <li> potential match does not end in the middle of a contraction
1897 * <li> identical matches
1899 * Otherwise the offset will be shifted to the next character.
1900 * Internal method, status assumed to be success, caller has to check the
1901 * status before calling this method.
1902 * @param strsrch string search data
1903 * @param textoffset offset in the collation element text. the returned value
1904 * will be the truncated end offset of the match or the new start
1906 * @param status output error status if any
1907 * @return TRUE if the match is valid, FALSE otherwise
1910 inline UBool
checkNextCanonicalMatch(UStringSearch
*strsrch
,
1911 int32_t *textoffset
,
1914 // to ensure that the start and ends are not composite characters
1915 UCollationElements
*coleiter
= strsrch
->textIter
;
1916 // if we have a canonical accent match
1917 if ((strsrch
->pattern
.hasSuffixAccents
&&
1918 strsrch
->canonicalSuffixAccents
[0]) ||
1919 (strsrch
->pattern
.hasPrefixAccents
&&
1920 strsrch
->canonicalPrefixAccents
[0])) {
1921 strsrch
->search
->matchedIndex
= getPreviousUStringSearchBaseOffset(
1923 ucol_getOffset(coleiter
));
1924 strsrch
->search
->matchedLength
= *textoffset
-
1925 strsrch
->search
->matchedIndex
;
1929 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1930 if (!checkNextCanonicalContractionMatch(strsrch
, &start
, textoffset
,
1931 status
) || U_FAILURE(*status
)) {
1935 start
= getPreviousUStringSearchBaseOffset(strsrch
, start
);
1936 // this totally matches, however we need to check if it is repeating
1937 if (checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1938 !isBreakUnit(strsrch
, start
, *textoffset
) ||
1939 !checkIdentical(strsrch
, start
, *textoffset
)) {
1941 *textoffset
= getNextBaseOffset(strsrch
->search
->text
, *textoffset
,
1942 strsrch
->search
->textLength
);
1946 strsrch
->search
->matchedIndex
= start
;
1947 strsrch
->search
->matchedLength
= *textoffset
- start
;
1952 * Shifting the collation element iterator position forward to prepare for
1953 * a preceding match. If the first character is a unsafe character, we'll only
1954 * shift by 1 to capture contractions, normalization etc.
1955 * Internal method, status assumed to be success, caller has to check status
1956 * before calling this method.
1957 * @param text strsrch string search data
1958 * @param textoffset start text position to do search
1959 * @param ce the text ce which failed the match.
1960 * @param patternceindex index of the ce within the pattern ce buffer which
1962 * @return final offset
1965 inline int32_t reverseShift(UStringSearch
*strsrch
,
1968 int32_t patternceindex
)
1970 if (strsrch
->search
->isOverlap
) {
1971 if (textoffset
!= strsrch
->search
->textLength
) {
1975 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1979 if (ce
!= UCOL_NULLORDER
) {
1980 int32_t shift
= strsrch
->pattern
.backShift
[hashFromCE32(ce
)];
1982 // this is to adjust for characters in the middle of the substring
1983 // for matching that failed.
1984 int32_t adjust
= patternceindex
;
1985 if (adjust
> 1 && shift
> adjust
) {
1986 shift
-= adjust
- 1;
1988 textoffset
-= shift
;
1991 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1994 textoffset
= getPreviousUStringSearchBaseOffset(strsrch
, textoffset
);
1999 * Checks match for contraction.
2000 * If the match starts with a partial contraction we fail.
2001 * Internal method, status assumed to be success, caller has to check status
2002 * before calling this method.
2003 * @param strsrch string search data
2004 * @param start offset of potential match, to be modified if necessary
2005 * @param end offset of potential match, to be modified if necessary
2006 * @param status output error status if any
2007 * @return TRUE if match passes the contraction test, FALSE otherwise
2010 UBool
checkPreviousExactContractionMatch(UStringSearch
*strsrch
,
2012 int32_t *end
, UErrorCode
*status
)
2014 UCollationElements
*coleiter
= strsrch
->textIter
;
2015 int32_t textlength
= strsrch
->search
->textLength
;
2016 int32_t temp
= *end
;
2017 const UCollator
*collator
= strsrch
->collator
;
2018 const UChar
*text
= strsrch
->search
->text
;
2019 // This part checks if either if the start of the match contains potential
2020 // contraction. If so we'll have to iterate through them
2021 // Since we used ucol_next while previously looking for the potential
2022 // match, this guarantees that our end will not be a partial contraction,
2023 // or a partial supplementary character.
2024 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2025 int32_t expansion
= getExpansionSuffix(coleiter
);
2026 UBool expandflag
= expansion
> 0;
2027 setColEIterOffset(coleiter
, *end
);
2028 while (U_SUCCESS(*status
) && expansion
> 0) {
2029 // getting rid of the redundant ce
2030 // since forward contraction/expansion may have extra ces
2031 // if we are in the normalization buffer, hasAccentsBeforeMatch
2032 // would have taken care of it.
2033 // E.g. the character \u01FA will have an expansion of 3, but if
2034 // we are only looking for A ring A\u030A, we'll have to skip the
2035 // last ce in the expansion buffer
2036 ucol_previous(coleiter
, status
);
2037 if (U_FAILURE(*status
)) {
2040 if (ucol_getOffset(coleiter
) != temp
) {
2042 temp
= ucol_getOffset(coleiter
);
2047 int32_t *patternce
= strsrch
->pattern
.ces
;
2048 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
2049 int32_t count
= patterncelength
;
2051 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2052 // status checked below, note that if status is a failure
2053 // ucol_previous returns UCOL_NULLORDER
2054 if (ce
== UCOL_IGNORABLE
) {
2057 if (expandflag
&& count
== 0 &&
2058 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2060 temp
= ucol_getOffset(coleiter
);
2062 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2064 *start
= getPreviousBaseOffset(text
, *start
);
2074 * Checks and sets the match information if found.
2077 * <li> the current match does not repeat the last match
2078 * <li> boundaries are correct
2079 * <li> exact matches has no extra accents
2080 * <li> identical matches
2082 * Otherwise the offset will be shifted to the preceding character.
2083 * Internal method, status assumed to be success, caller has to check status
2084 * before calling this method.
2085 * @param strsrch string search data
2087 * @param coleiter collation element iterator
2088 * @param text string
2089 * @param textoffset offset in the collation element text. the returned value
2090 * will be the truncated start offset of the match or the new start
2092 * @param status output error status if any
2093 * @return TRUE if the match is valid, FALSE otherwise
2096 inline UBool
checkPreviousExactMatch(UStringSearch
*strsrch
,
2097 int32_t *textoffset
,
2100 // to ensure that the start and ends are not composite characters
2101 int32_t end
= ucol_getOffset(strsrch
->textIter
);
2102 if (!checkPreviousExactContractionMatch(strsrch
, textoffset
, &end
, status
)
2103 || U_FAILURE(*status
)) {
2107 // this totally matches, however we need to check if it is repeating
2109 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2110 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2111 hasAccentsBeforeMatch(strsrch
, *textoffset
, end
) ||
2112 !checkIdentical(strsrch
, *textoffset
, end
) ||
2113 hasAccentsAfterMatch(strsrch
, *textoffset
, end
)) {
2115 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2120 //Add breakiterator boundary check for primary strength search.
2121 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
2122 checkBreakBoundary(strsrch
, textoffset
, &end
);
2125 strsrch
->search
->matchedIndex
= *textoffset
;
2126 strsrch
->search
->matchedLength
= end
- *textoffset
;
2131 * Rearranges the end accents to try matching.
2132 * Suffix accents in the text will be grouped according to their combining
2133 * class and the groups will be mixed and matched to try find the perfect
2134 * match with the pattern.
2135 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2136 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2137 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2139 * step 2: check if any of the generated substrings matches the pattern.
2140 * Internal method, status assumed to be success, user has to check status
2141 * before calling this method.
2142 * @param strsrch string search match
2143 * @param start offset of the first base character
2144 * @param end start of the last accent set
2145 * @param status only error status if any
2146 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2147 * offset of the match. Note this start includes all following accents.
2150 int32_t doPreviousCanonicalSuffixMatch(UStringSearch
*strsrch
,
2155 const UChar
*text
= strsrch
->search
->text
;
2156 int32_t tempend
= end
;
2158 U16_BACK_1(text
, 0, tempend
);
2159 if (!(getFCD(text
, &tempend
, strsrch
->search
->textLength
) &
2161 // die... failed at a base character
2162 return USEARCH_DONE
;
2164 end
= getNextBaseOffset(text
, end
, strsrch
->search
->textLength
);
2166 if (U_SUCCESS(*status
)) {
2167 UChar accents
[INITIAL_ARRAY_SIZE_
];
2168 int32_t offset
= getPreviousBaseOffset(text
, end
);
2169 // normalizing the offensive string
2170 unorm_normalize(text
+ offset
, end
- offset
, UNORM_NFD
, 0, accents
,
2171 INITIAL_ARRAY_SIZE_
, status
);
2173 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2174 int32_t accentsize
= getUnblockedAccentIndex(accents
,
2176 int32_t count
= (2 << (accentsize
- 1)) - 1;
2177 UChar buffer
[INITIAL_ARRAY_SIZE_
];
2178 UCollationElements
*coleiter
= strsrch
->utilIter
;
2179 while (U_SUCCESS(*status
) && count
> 0) {
2180 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
2181 // copy the base characters
2182 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2183 *rearrange
++ = accents
[k
];
2185 // forming all possible canonical rearrangement by dropping
2187 for (int i
= 0; i
<= accentsize
- 1; i
++) {
2188 int32_t mask
= 1 << (accentsize
- i
- 1);
2190 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2191 *rearrange
++ = accents
[j
];
2196 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
2197 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
2198 strsrch
->canonicalPrefixAccents
,
2199 strsrch
->search
->text
+ start
,
2201 strsrch
->canonicalSuffixAccents
,
2204 // run the collator iterator through this match
2205 // if status is a failure ucol_setText does nothing
2206 ucol_setText(coleiter
, match
, matchsize
, status
);
2207 if (U_SUCCESS(*status
)) {
2208 if (checkCollationMatch(strsrch
, coleiter
)) {
2209 if (match
!= buffer
) {
2218 return USEARCH_DONE
;
2222 * Take the rearranged start accents and tries matching. If match failed at
2223 * a seperate following set of accents (seperated from the rearranged on by
2224 * at least a base character) then we rearrange the preceding accents and
2225 * tries matching again.
2226 * We allow skipping of the ends of the accent set if the ces do not match.
2227 * However if the failure is found before the accent set, it fails.
2228 * Internal method, status assumed to be success, caller has to check status
2229 * before calling this method.
2230 * @param strsrch string search data
2231 * @param textoffset of the ends of the rearranged accent
2232 * @param status output error status if any
2233 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2234 * offset of the match. Note this start includes all following accents.
2237 int32_t doPreviousCanonicalPrefixMatch(UStringSearch
*strsrch
,
2241 const UChar
*text
= strsrch
->search
->text
;
2242 const UCollator
*collator
= strsrch
->collator
;
2243 int32_t safelength
= 0;
2245 int32_t safetextlength
;
2246 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
2247 int32_t safeoffset
= textoffset
;
2250 ucol_unsafeCP(strsrch
->canonicalPrefixAccents
[
2251 u_strlen(strsrch
->canonicalPrefixAccents
) - 1
2253 safeoffset
= getNextSafeOffset(collator
, text
, textoffset
,
2254 strsrch
->search
->textLength
);
2255 safelength
= safeoffset
- textoffset
;
2256 safetextlength
= INITIAL_ARRAY_SIZE_
;
2257 safetext
= addToUCharArray(safebuffer
, &safetextlength
,
2258 strsrch
->canonicalPrefixAccents
,
2259 text
+ textoffset
, safelength
,
2263 safetextlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2264 safetext
= strsrch
->canonicalPrefixAccents
;
2267 UCollationElements
*coleiter
= strsrch
->utilIter
;
2268 // if status is a failure, ucol_setText does nothing
2269 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
2270 // status checked in loop below
2272 int32_t *ce
= strsrch
->pattern
.ces
;
2273 int32_t celength
= strsrch
->pattern
.cesLength
;
2275 UBool isSafe
= TRUE
; // safe zone indication flag for position
2276 int32_t prefixlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2278 while (ceindex
< celength
) {
2279 int32_t textce
= ucol_next(coleiter
, status
);
2280 if (U_FAILURE(*status
)) {
2282 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2284 return USEARCH_DONE
;
2286 if (textce
== UCOL_NULLORDER
) {
2287 // check if we have passed the safe buffer
2288 if (coleiter
== strsrch
->textIter
) {
2289 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2290 return USEARCH_DONE
;
2292 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2293 safetext
= safebuffer
;
2294 coleiter
= strsrch
->textIter
;
2295 setColEIterOffset(coleiter
, safeoffset
);
2296 // status checked at the start of the loop
2300 textce
= getCE(strsrch
, textce
);
2301 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
2302 // do the beginning stuff
2303 int32_t failedoffset
= ucol_getOffset(coleiter
);
2304 if (isSafe
&& failedoffset
<= prefixlength
) {
2305 // alas... no hope. failed at rearranged accent set
2306 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2307 return USEARCH_DONE
;
2311 failedoffset
= safeoffset
- failedoffset
;
2312 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2315 // try rearranging the end accents
2316 int32_t result
= doPreviousCanonicalSuffixMatch(strsrch
,
2317 textoffset
, failedoffset
, status
);
2318 if (result
!= USEARCH_DONE
) {
2319 // if status is a failure, ucol_setOffset does nothing
2320 setColEIterOffset(strsrch
->textIter
, result
);
2322 if (U_FAILURE(*status
)) {
2323 return USEARCH_DONE
;
2328 if (textce
== ce
[ceindex
]) {
2334 int32_t result
= ucol_getOffset(coleiter
);
2335 // sets the text iterator here with the correct expansion and offset
2336 int32_t leftoverces
= getExpansionSuffix(coleiter
);
2337 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2338 if (result
<= prefixlength
) {
2339 result
= textoffset
;
2342 result
= textoffset
+ (safeoffset
- result
);
2344 setColEIterOffset(strsrch
->textIter
, result
);
2345 setExpansionSuffix(strsrch
->textIter
, leftoverces
);
2349 return ucol_getOffset(coleiter
);
2353 * Trying out the substring and sees if it can be a canonical match.
2354 * This will try normalizing the starting accents and arranging them into
2355 * canonical equivalents and check their corresponding ces with the pattern ce.
2356 * Prefix accents in the text will be grouped according to their combining
2357 * class and the groups will be mixed and matched to try find the perfect
2358 * match with the pattern.
2359 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2360 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2361 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2363 * step 2: check if any of the generated substrings matches the pattern.
2364 * Internal method, status assumed to be success, caller has to check status
2365 * before calling this method.
2366 * @param strsrch string search data
2367 * @param textoffset start offset in the collation element text that starts
2368 * with the accents to be rearranged
2369 * @param status output error status if any
2370 * @return TRUE if the match is valid, FALSE otherwise
2373 UBool
doPreviousCanonicalMatch(UStringSearch
*strsrch
,
2377 const UChar
*text
= strsrch
->search
->text
;
2378 int32_t temp
= textoffset
;
2379 int32_t textlength
= strsrch
->search
->textLength
;
2380 if ((getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) == 0) {
2381 UCollationElements
*coleiter
= strsrch
->textIter
;
2382 int32_t offset
= ucol_getOffset(coleiter
);
2383 if (strsrch
->pattern
.hasSuffixAccents
) {
2384 offset
= doPreviousCanonicalSuffixMatch(strsrch
, textoffset
,
2386 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
2387 setColEIterOffset(coleiter
, offset
);
2394 if (!strsrch
->pattern
.hasPrefixAccents
) {
2398 UChar accents
[INITIAL_ARRAY_SIZE_
];
2399 // offset to the last base character in substring to search
2400 int32_t baseoffset
= getNextBaseOffset(text
, textoffset
, textlength
);
2401 // normalizing the offensive string
2402 unorm_normalize(text
+ textoffset
, baseoffset
- textoffset
, UNORM_NFD
,
2403 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
2404 // status checked in loop
2406 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2407 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
2409 // 2 power n - 1 plus the full set of accents
2410 int32_t count
= (2 << (size
- 1)) - 1;
2411 while (U_SUCCESS(*status
) && count
> 0) {
2412 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
2413 // copy the base characters
2414 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2415 *rearrange
++ = accents
[k
];
2417 // forming all possible canonical rearrangement by dropping
2419 for (int i
= 0; i
<= size
- 1; i
++) {
2420 int32_t mask
= 1 << (size
- i
- 1);
2422 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2423 *rearrange
++ = accents
[j
];
2428 int32_t offset
= doPreviousCanonicalPrefixMatch(strsrch
,
2429 baseoffset
, status
);
2430 if (offset
!= USEARCH_DONE
) {
2431 return TRUE
; // match found
2439 * Checks match for contraction.
2440 * If the match starts with a partial contraction we fail.
2441 * Internal method, status assumed to be success, caller has to check status
2442 * before calling this method.
2443 * @param strsrch string search data
2444 * @param start offset of potential match, to be modified if necessary
2445 * @param end offset of potential match, to be modified if necessary
2446 * @param status only error status if any
2447 * @return TRUE if match passes the contraction test, FALSE otherwise
2450 UBool
checkPreviousCanonicalContractionMatch(UStringSearch
*strsrch
,
2452 int32_t *end
, UErrorCode
*status
)
2454 UCollationElements
*coleiter
= strsrch
->textIter
;
2455 int32_t textlength
= strsrch
->search
->textLength
;
2456 int32_t temp
= *end
;
2457 const UCollator
*collator
= strsrch
->collator
;
2458 const UChar
*text
= strsrch
->search
->text
;
2459 // This part checks if either if the start of the match contains potential
2460 // contraction. If so we'll have to iterate through them
2461 // Since we used ucol_next while previously looking for the potential
2462 // match, this guarantees that our end will not be a partial contraction,
2463 // or a partial supplementary character.
2464 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2465 int32_t expansion
= getExpansionSuffix(coleiter
);
2466 UBool expandflag
= expansion
> 0;
2467 setColEIterOffset(coleiter
, *end
);
2468 while (expansion
> 0) {
2469 // getting rid of the redundant ce
2470 // since forward contraction/expansion may have extra ces
2471 // if we are in the normalization buffer, hasAccentsBeforeMatch
2472 // would have taken care of it.
2473 // E.g. the character \u01FA will have an expansion of 3, but if
2474 // we are only looking for A ring A\u030A, we'll have to skip the
2475 // last ce in the expansion buffer
2476 ucol_previous(coleiter
, status
);
2477 if (U_FAILURE(*status
)) {
2480 if (ucol_getOffset(coleiter
) != temp
) {
2482 temp
= ucol_getOffset(coleiter
);
2487 int32_t *patternce
= strsrch
->pattern
.ces
;
2488 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
2489 int32_t count
= patterncelength
;
2491 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2492 // status checked below, note that if status is a failure
2493 // ucol_previous returns UCOL_NULLORDER
2494 if (ce
== UCOL_IGNORABLE
) {
2497 if (expandflag
&& count
== 0 &&
2498 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2500 temp
= ucol_getOffset(coleiter
);
2502 if (count
== patterncelength
&&
2503 ce
!= patternce
[patterncelength
- 1]) {
2504 // accents may have extra starting ces, this occurs when a
2505 // pure accent pattern is matched without rearrangement
2506 int32_t expected
= patternce
[patterncelength
- 1];
2507 U16_BACK_1(text
, 0, *end
);
2508 if (getFCD(text
, end
, textlength
) & LAST_BYTE_MASK_
) {
2509 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2510 while (U_SUCCESS(*status
) && ce
!= expected
&&
2511 ce
!= UCOL_NULLORDER
&&
2512 ucol_getOffset(coleiter
) <= *start
) {
2513 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2517 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2519 *start
= getPreviousBaseOffset(text
, *start
);
2529 * Checks and sets the match information if found.
2532 * <li> the potential match does not repeat the previous match
2533 * <li> boundaries are correct
2534 * <li> potential match does not end in the middle of a contraction
2535 * <li> identical matches
2537 * Otherwise the offset will be shifted to the next character.
2538 * Internal method, status assumed to be success, caller has to check status
2539 * before calling this method.
2540 * @param strsrch string search data
2541 * @param textoffset offset in the collation element text. the returned value
2542 * will be the truncated start offset of the match or the new start
2544 * @param status only error status if any
2545 * @return TRUE if the match is valid, FALSE otherwise
2548 inline UBool
checkPreviousCanonicalMatch(UStringSearch
*strsrch
,
2549 int32_t *textoffset
,
2552 // to ensure that the start and ends are not composite characters
2553 UCollationElements
*coleiter
= strsrch
->textIter
;
2554 // if we have a canonical accent match
2555 if ((strsrch
->pattern
.hasSuffixAccents
&&
2556 strsrch
->canonicalSuffixAccents
[0]) ||
2557 (strsrch
->pattern
.hasPrefixAccents
&&
2558 strsrch
->canonicalPrefixAccents
[0])) {
2559 strsrch
->search
->matchedIndex
= *textoffset
;
2560 strsrch
->search
->matchedLength
=
2561 getNextUStringSearchBaseOffset(strsrch
,
2562 getColElemIterOffset(coleiter
, FALSE
))
2567 int32_t end
= ucol_getOffset(coleiter
);
2568 if (!checkPreviousCanonicalContractionMatch(strsrch
, textoffset
, &end
,
2570 U_FAILURE(*status
)) {
2574 end
= getNextUStringSearchBaseOffset(strsrch
, end
);
2575 // this totally matches, however we need to check if it is repeating
2576 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2577 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2578 !checkIdentical(strsrch
, *textoffset
, end
)) {
2580 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2585 strsrch
->search
->matchedIndex
= *textoffset
;
2586 strsrch
->search
->matchedLength
= end
- *textoffset
;
2589 #endif // #if BOYER_MOORE
2591 // constructors and destructor -------------------------------------------
2593 U_CAPI UStringSearch
* U_EXPORT2
usearch_open(const UChar
*pattern
,
2594 int32_t patternlength
,
2598 UBreakIterator
*breakiter
,
2601 if (U_FAILURE(*status
)) {
2604 #if UCONFIG_NO_BREAK_ITERATION
2605 if (breakiter
!= NULL
) {
2606 *status
= U_UNSUPPORTED_ERROR
;
2611 // ucol_open internally checks for status
2612 UCollator
*collator
= ucol_open(locale
, status
);
2613 // pattern, text checks are done in usearch_openFromCollator
2614 UStringSearch
*result
= usearch_openFromCollator(pattern
,
2615 patternlength
, text
, textlength
,
2616 collator
, breakiter
, status
);
2618 if (result
== NULL
|| U_FAILURE(*status
)) {
2620 ucol_close(collator
);
2625 result
->ownCollator
= TRUE
;
2629 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2633 U_CAPI UStringSearch
* U_EXPORT2
usearch_openFromCollator(
2634 const UChar
*pattern
,
2635 int32_t patternlength
,
2638 const UCollator
*collator
,
2639 UBreakIterator
*breakiter
,
2642 if (U_FAILURE(*status
)) {
2645 #if UCONFIG_NO_BREAK_ITERATION
2646 if (breakiter
!= NULL
) {
2647 *status
= U_UNSUPPORTED_ERROR
;
2651 if (pattern
== NULL
|| text
== NULL
|| collator
== NULL
) {
2652 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2656 // string search does not really work when numeric collation is turned on
2657 if(ucol_getAttribute(collator
, UCOL_NUMERIC_COLLATION
, status
) == UCOL_ON
) {
2658 *status
= U_UNSUPPORTED_ERROR
;
2662 if (U_SUCCESS(*status
)) {
2663 initializeFCD(status
);
2664 if (U_FAILURE(*status
)) {
2668 UStringSearch
*result
;
2669 if (textlength
== -1) {
2670 textlength
= u_strlen(text
);
2672 if (patternlength
== -1) {
2673 patternlength
= u_strlen(pattern
);
2675 if (textlength
<= 0 || patternlength
<= 0) {
2676 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2680 result
= (UStringSearch
*)uprv_malloc(sizeof(UStringSearch
));
2681 if (result
== NULL
) {
2682 *status
= U_MEMORY_ALLOCATION_ERROR
;
2686 result
->collator
= collator
;
2687 result
->strength
= ucol_getStrength(collator
);
2688 result
->ceMask
= getMask(result
->strength
);
2690 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2692 result
->variableTop
= ucol_getVariableTop(collator
, status
);
2694 result
->nfd
= Normalizer2::getNFDInstance(*status
);
2696 if (U_FAILURE(*status
)) {
2701 result
->search
= (USearch
*)uprv_malloc(sizeof(USearch
));
2702 if (result
->search
== NULL
) {
2703 *status
= U_MEMORY_ALLOCATION_ERROR
;
2708 result
->search
->text
= text
;
2709 result
->search
->textLength
= textlength
;
2711 result
->pattern
.text
= pattern
;
2712 result
->pattern
.textLength
= patternlength
;
2713 result
->pattern
.ces
= NULL
;
2714 result
->pattern
.pces
= NULL
;
2716 result
->search
->breakIter
= breakiter
;
2717 #if !UCONFIG_NO_BREAK_ITERATION
2718 result
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(result
->collator
, ULOC_VALID_LOCALE
, status
), text
, textlength
, status
);
2720 ubrk_setText(breakiter
, text
, textlength
, status
);
2724 result
->ownCollator
= FALSE
;
2725 result
->search
->matchedLength
= 0;
2726 result
->search
->matchedIndex
= USEARCH_DONE
;
2727 result
->utilIter
= NULL
;
2728 result
->textIter
= ucol_openElements(collator
, text
,
2729 textlength
, status
);
2730 result
->textProcessedIter
= NULL
;
2731 if (U_FAILURE(*status
)) {
2732 usearch_close(result
);
2736 result
->search
->isOverlap
= FALSE
;
2737 result
->search
->isCanonicalMatch
= FALSE
;
2738 result
->search
->elementComparisonType
= 0;
2739 result
->search
->isForwardSearching
= TRUE
;
2740 result
->search
->reset
= TRUE
;
2742 initialize(result
, status
);
2744 if (U_FAILURE(*status
)) {
2745 usearch_close(result
);
2754 U_CAPI
void U_EXPORT2
usearch_close(UStringSearch
*strsrch
)
2757 if (strsrch
->pattern
.ces
!= strsrch
->pattern
.cesBuffer
&&
2758 strsrch
->pattern
.ces
) {
2759 uprv_free(strsrch
->pattern
.ces
);
2762 if (strsrch
->pattern
.pces
!= NULL
&&
2763 strsrch
->pattern
.pces
!= strsrch
->pattern
.pcesBuffer
) {
2764 uprv_free(strsrch
->pattern
.pces
);
2767 delete strsrch
->textProcessedIter
;
2768 ucol_closeElements(strsrch
->textIter
);
2769 ucol_closeElements(strsrch
->utilIter
);
2771 if (strsrch
->ownCollator
&& strsrch
->collator
) {
2772 ucol_close((UCollator
*)strsrch
->collator
);
2775 #if !UCONFIG_NO_BREAK_ITERATION
2776 if (strsrch
->search
->internalBreakIter
) {
2777 ubrk_close(strsrch
->search
->internalBreakIter
);
2781 uprv_free(strsrch
->search
);
2788 UBool
initTextProcessedIter(UStringSearch
*strsrch
, UErrorCode
*status
) {
2789 if (U_FAILURE(*status
)) { return FALSE
; }
2790 if (strsrch
->textProcessedIter
== NULL
) {
2791 strsrch
->textProcessedIter
= new icu::UCollationPCE(strsrch
->textIter
);
2792 if (strsrch
->textProcessedIter
== NULL
) {
2793 *status
= U_MEMORY_ALLOCATION_ERROR
;
2797 strsrch
->textProcessedIter
->init(strsrch
->textIter
);
2804 // set and get methods --------------------------------------------------
2806 U_CAPI
void U_EXPORT2
usearch_setOffset(UStringSearch
*strsrch
,
2810 if (U_SUCCESS(*status
) && strsrch
) {
2811 if (isOutOfBounds(strsrch
->search
->textLength
, position
)) {
2812 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2815 setColEIterOffset(strsrch
->textIter
, position
);
2817 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2818 strsrch
->search
->matchedLength
= 0;
2819 strsrch
->search
->reset
= FALSE
;
2823 U_CAPI
int32_t U_EXPORT2
usearch_getOffset(const UStringSearch
*strsrch
)
2826 int32_t result
= ucol_getOffset(strsrch
->textIter
);
2827 if (isOutOfBounds(strsrch
->search
->textLength
, result
)) {
2828 return USEARCH_DONE
;
2832 return USEARCH_DONE
;
2835 U_CAPI
void U_EXPORT2
usearch_setAttribute(UStringSearch
*strsrch
,
2836 USearchAttribute attribute
,
2837 USearchAttributeValue value
,
2840 if (U_SUCCESS(*status
) && strsrch
) {
2843 case USEARCH_OVERLAP
:
2844 strsrch
->search
->isOverlap
= (value
== USEARCH_ON
? TRUE
: FALSE
);
2846 case USEARCH_CANONICAL_MATCH
:
2847 strsrch
->search
->isCanonicalMatch
= (value
== USEARCH_ON
? TRUE
:
2850 case USEARCH_ELEMENT_COMPARISON
:
2851 if (value
== USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD
|| value
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
2852 strsrch
->search
->elementComparisonType
= (int16_t)value
;
2854 strsrch
->search
->elementComparisonType
= 0;
2857 case USEARCH_ATTRIBUTE_COUNT
:
2859 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2862 if (value
== USEARCH_ATTRIBUTE_VALUE_COUNT
) {
2863 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2867 U_CAPI USearchAttributeValue U_EXPORT2
usearch_getAttribute(
2868 const UStringSearch
*strsrch
,
2869 USearchAttribute attribute
)
2872 switch (attribute
) {
2873 case USEARCH_OVERLAP
:
2874 return (strsrch
->search
->isOverlap
== TRUE
? USEARCH_ON
:
2876 case USEARCH_CANONICAL_MATCH
:
2877 return (strsrch
->search
->isCanonicalMatch
== TRUE
? USEARCH_ON
:
2879 case USEARCH_ELEMENT_COMPARISON
:
2881 int16_t value
= strsrch
->search
->elementComparisonType
;
2882 if (value
== USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD
|| value
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
2883 return (USearchAttributeValue
)value
;
2885 return USEARCH_STANDARD_ELEMENT_COMPARISON
;
2888 case USEARCH_ATTRIBUTE_COUNT
:
2889 return USEARCH_DEFAULT
;
2892 return USEARCH_DEFAULT
;
2895 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedStart(
2896 const UStringSearch
*strsrch
)
2898 if (strsrch
== NULL
) {
2899 return USEARCH_DONE
;
2901 return strsrch
->search
->matchedIndex
;
2905 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedText(const UStringSearch
*strsrch
,
2907 int32_t resultCapacity
,
2910 if (U_FAILURE(*status
)) {
2911 return USEARCH_DONE
;
2913 if (strsrch
== NULL
|| resultCapacity
< 0 || (resultCapacity
> 0 &&
2915 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2916 return USEARCH_DONE
;
2919 int32_t copylength
= strsrch
->search
->matchedLength
;
2920 int32_t copyindex
= strsrch
->search
->matchedIndex
;
2921 if (copyindex
== USEARCH_DONE
) {
2922 u_terminateUChars(result
, resultCapacity
, 0, status
);
2923 return USEARCH_DONE
;
2926 if (resultCapacity
< copylength
) {
2927 copylength
= resultCapacity
;
2929 if (copylength
> 0) {
2930 uprv_memcpy(result
, strsrch
->search
->text
+ copyindex
,
2931 copylength
* sizeof(UChar
));
2933 return u_terminateUChars(result
, resultCapacity
,
2934 strsrch
->search
->matchedLength
, status
);
2937 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedLength(
2938 const UStringSearch
*strsrch
)
2941 return strsrch
->search
->matchedLength
;
2943 return USEARCH_DONE
;
2946 #if !UCONFIG_NO_BREAK_ITERATION
2948 U_CAPI
void U_EXPORT2
usearch_setBreakIterator(UStringSearch
*strsrch
,
2949 UBreakIterator
*breakiter
,
2952 if (U_SUCCESS(*status
) && strsrch
) {
2953 strsrch
->search
->breakIter
= breakiter
;
2955 ubrk_setText(breakiter
, strsrch
->search
->text
,
2956 strsrch
->search
->textLength
, status
);
2961 U_CAPI
const UBreakIterator
* U_EXPORT2
2962 usearch_getBreakIterator(const UStringSearch
*strsrch
)
2965 return strsrch
->search
->breakIter
;
2972 U_CAPI
void U_EXPORT2
usearch_setText( UStringSearch
*strsrch
,
2977 if (U_SUCCESS(*status
)) {
2978 if (strsrch
== NULL
|| text
== NULL
|| textlength
< -1 ||
2980 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2983 if (textlength
== -1) {
2984 textlength
= u_strlen(text
);
2986 strsrch
->search
->text
= text
;
2987 strsrch
->search
->textLength
= textlength
;
2988 ucol_setText(strsrch
->textIter
, text
, textlength
, status
);
2989 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2990 strsrch
->search
->matchedLength
= 0;
2991 strsrch
->search
->reset
= TRUE
;
2992 #if !UCONFIG_NO_BREAK_ITERATION
2993 if (strsrch
->search
->breakIter
!= NULL
) {
2994 ubrk_setText(strsrch
->search
->breakIter
, text
,
2995 textlength
, status
);
2997 ubrk_setText(strsrch
->search
->internalBreakIter
, text
, textlength
, status
);
3003 U_CAPI
const UChar
* U_EXPORT2
usearch_getText(const UStringSearch
*strsrch
,
3007 *length
= strsrch
->search
->textLength
;
3008 return strsrch
->search
->text
;
3013 U_CAPI
void U_EXPORT2
usearch_setCollator( UStringSearch
*strsrch
,
3014 const UCollator
*collator
,
3017 if (U_SUCCESS(*status
)) {
3018 if (collator
== NULL
) {
3019 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3024 delete strsrch
->textProcessedIter
;
3025 strsrch
->textProcessedIter
= NULL
;
3026 ucol_closeElements(strsrch
->textIter
);
3027 ucol_closeElements(strsrch
->utilIter
);
3028 strsrch
->textIter
= strsrch
->utilIter
= NULL
;
3029 if (strsrch
->ownCollator
&& (strsrch
->collator
!= collator
)) {
3030 ucol_close((UCollator
*)strsrch
->collator
);
3031 strsrch
->ownCollator
= FALSE
;
3033 strsrch
->collator
= collator
;
3034 strsrch
->strength
= ucol_getStrength(collator
);
3035 strsrch
->ceMask
= getMask(strsrch
->strength
);
3036 #if !UCONFIG_NO_BREAK_ITERATION
3037 ubrk_close(strsrch
->search
->internalBreakIter
);
3038 strsrch
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(collator
, ULOC_VALID_LOCALE
, status
),
3039 strsrch
->search
->text
, strsrch
->search
->textLength
, status
);
3041 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3043 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
3045 // if status is a failure, ucol_getVariableTop returns 0
3046 strsrch
->variableTop
= ucol_getVariableTop(collator
, status
);
3047 strsrch
->textIter
= ucol_openElements(collator
,
3048 strsrch
->search
->text
,
3049 strsrch
->search
->textLength
,
3051 strsrch
->utilIter
= ucol_openElements(
3052 collator
, strsrch
->pattern
.text
, strsrch
->pattern
.textLength
, status
);
3053 // initialize() _after_ setting the iterators for the new collator.
3054 initialize(strsrch
, status
);
3057 // **** are these calls needed?
3058 // **** we call uprv_init_pce in initializePatternPCETable
3059 // **** and the CEIBuffer constructor...
3061 uprv_init_pce(strsrch
->textIter
);
3062 uprv_init_pce(strsrch
->utilIter
);
3067 U_CAPI UCollator
* U_EXPORT2
usearch_getCollator(const UStringSearch
*strsrch
)
3070 return (UCollator
*)strsrch
->collator
;
3075 U_CAPI
void U_EXPORT2
usearch_setPattern( UStringSearch
*strsrch
,
3076 const UChar
*pattern
,
3077 int32_t patternlength
,
3080 if (U_SUCCESS(*status
)) {
3081 if (strsrch
== NULL
|| pattern
== NULL
) {
3082 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3085 if (patternlength
== -1) {
3086 patternlength
= u_strlen(pattern
);
3088 if (patternlength
== 0) {
3089 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3092 strsrch
->pattern
.text
= pattern
;
3093 strsrch
->pattern
.textLength
= patternlength
;
3094 initialize(strsrch
, status
);
3099 U_CAPI
const UChar
* U_EXPORT2
3100 usearch_getPattern(const UStringSearch
*strsrch
,
3104 *length
= strsrch
->pattern
.textLength
;
3105 return strsrch
->pattern
.text
;
3110 // miscellanous methods --------------------------------------------------
3112 U_CAPI
int32_t U_EXPORT2
usearch_first(UStringSearch
*strsrch
,
3115 if (strsrch
&& U_SUCCESS(*status
)) {
3116 strsrch
->search
->isForwardSearching
= TRUE
;
3117 usearch_setOffset(strsrch
, 0, status
);
3118 if (U_SUCCESS(*status
)) {
3119 return usearch_next(strsrch
, status
);
3122 return USEARCH_DONE
;
3125 U_CAPI
int32_t U_EXPORT2
usearch_following(UStringSearch
*strsrch
,
3129 if (strsrch
&& U_SUCCESS(*status
)) {
3130 strsrch
->search
->isForwardSearching
= TRUE
;
3131 // position checked in usearch_setOffset
3132 usearch_setOffset(strsrch
, position
, status
);
3133 if (U_SUCCESS(*status
)) {
3134 return usearch_next(strsrch
, status
);
3137 return USEARCH_DONE
;
3140 U_CAPI
int32_t U_EXPORT2
usearch_last(UStringSearch
*strsrch
,
3143 if (strsrch
&& U_SUCCESS(*status
)) {
3144 strsrch
->search
->isForwardSearching
= FALSE
;
3145 usearch_setOffset(strsrch
, strsrch
->search
->textLength
, status
);
3146 if (U_SUCCESS(*status
)) {
3147 return usearch_previous(strsrch
, status
);
3150 return USEARCH_DONE
;
3153 U_CAPI
int32_t U_EXPORT2
usearch_preceding(UStringSearch
*strsrch
,
3157 if (strsrch
&& U_SUCCESS(*status
)) {
3158 strsrch
->search
->isForwardSearching
= FALSE
;
3159 // position checked in usearch_setOffset
3160 usearch_setOffset(strsrch
, position
, status
);
3161 if (U_SUCCESS(*status
)) {
3162 return usearch_previous(strsrch
, status
);
3165 return USEARCH_DONE
;
3169 * If a direction switch is required, we'll count the number of ces till the
3170 * beginning of the collation element iterator and iterate forwards that
3171 * number of times. This is so that we get to the correct point within the
3172 * string to continue the search in. Imagine when we are in the middle of the
3173 * normalization buffer when the change in direction is request. arrrgghh....
3174 * After searching the offset within the collation element iterator will be
3175 * shifted to the start of the match. If a match is not found, the offset would
3176 * have been set to the end of the text string in the collation element
3178 * Okay, here's my take on normalization buffer. The only time when there can
3179 * be 2 matches within the same normalization is when the pattern is consists
3180 * of all accents. But since the offset returned is from the text string, we
3181 * should not confuse the caller by returning the second match within the
3182 * same normalization buffer. If we do, the 2 results will have the same match
3183 * offsets, and that'll be confusing. I'll return the next match that doesn't
3184 * fall within the same normalization buffer. Note this does not affect the
3185 * results of matches spanning the text and the normalization buffer.
3186 * The position to start searching is taken from the collation element
3187 * iterator. Callers of this API would have to set the offset in the collation
3188 * element iterator before using this method.
3190 U_CAPI
int32_t U_EXPORT2
usearch_next(UStringSearch
*strsrch
,
3193 if (U_SUCCESS(*status
) && strsrch
) {
3194 // note offset is either equivalent to the start of the previous match
3195 // or is set by the user
3196 int32_t offset
= usearch_getOffset(strsrch
);
3197 USearch
*search
= strsrch
->search
;
3198 search
->reset
= FALSE
;
3199 int32_t textlength
= search
->textLength
;
3200 if (search
->isForwardSearching
) {
3202 if (offset
== textlength
3203 || (!search
->isOverlap
&&
3204 (offset
+ strsrch
->pattern
.defaultShiftSize
> textlength
||
3205 (search
->matchedIndex
!= USEARCH_DONE
&&
3206 offset
+ search
->matchedLength
>= textlength
)))) {
3207 // not enough characters to match
3208 setMatchNotFound(strsrch
);
3209 return USEARCH_DONE
;
3212 if (offset
== textlength
||
3213 (! search
->isOverlap
&&
3214 (search
->matchedIndex
!= USEARCH_DONE
&&
3215 offset
+ search
->matchedLength
> textlength
))) {
3216 // not enough characters to match
3217 setMatchNotFound(strsrch
);
3218 return USEARCH_DONE
;
3223 // switching direction.
3224 // if matchedIndex == USEARCH_DONE, it means that either a
3225 // setOffset has been called or that previous ran off the text
3226 // string. the iterator would have been set to offset 0 if a
3227 // match is not found.
3228 search
->isForwardSearching
= TRUE
;
3229 if (search
->matchedIndex
!= USEARCH_DONE
) {
3230 // there's no need to set the collation element iterator
3231 // the next call to next will set the offset.
3232 return search
->matchedIndex
;
3236 if (U_SUCCESS(*status
)) {
3237 if (strsrch
->pattern
.cesLength
== 0) {
3238 if (search
->matchedIndex
== USEARCH_DONE
) {
3239 search
->matchedIndex
= offset
;
3241 else { // moves by codepoints
3242 U16_FWD_1(search
->text
, search
->matchedIndex
, textlength
);
3245 search
->matchedLength
= 0;
3246 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3247 // status checked below
3248 if (search
->matchedIndex
== textlength
) {
3249 search
->matchedIndex
= USEARCH_DONE
;
3253 if (search
->matchedLength
> 0) {
3254 // if matchlength is 0 we are at the start of the iteration
3255 if (search
->isOverlap
) {
3256 ucol_setOffset(strsrch
->textIter
, offset
+ 1, status
);
3259 ucol_setOffset(strsrch
->textIter
,
3260 offset
+ search
->matchedLength
, status
);
3264 // for boundary check purposes. this will ensure that the
3265 // next match will not preceed the current offset
3266 // note search->matchedIndex will always be set to something
3268 search
->matchedIndex
= offset
- 1;
3271 if (search
->isCanonicalMatch
) {
3272 // can't use exact here since extra accents are allowed.
3273 usearch_handleNextCanonical(strsrch
, status
);
3276 usearch_handleNextExact(strsrch
, status
);
3280 if (U_FAILURE(*status
)) {
3281 return USEARCH_DONE
;
3285 if (search
->matchedIndex
== USEARCH_DONE
) {
3286 ucol_setOffset(strsrch
->textIter
, search
->textLength
, status
);
3288 ucol_setOffset(strsrch
->textIter
, search
->matchedIndex
, status
);
3292 return search
->matchedIndex
;
3295 return USEARCH_DONE
;
3298 U_CAPI
int32_t U_EXPORT2
usearch_previous(UStringSearch
*strsrch
,
3301 if (U_SUCCESS(*status
) && strsrch
) {
3303 USearch
*search
= strsrch
->search
;
3304 if (search
->reset
) {
3305 offset
= search
->textLength
;
3306 search
->isForwardSearching
= FALSE
;
3307 search
->reset
= FALSE
;
3308 setColEIterOffset(strsrch
->textIter
, offset
);
3311 offset
= usearch_getOffset(strsrch
);
3314 int32_t matchedindex
= search
->matchedIndex
;
3315 if (search
->isForwardSearching
== TRUE
) {
3316 // switching direction.
3317 // if matchedIndex == USEARCH_DONE, it means that either a
3318 // setOffset has been called or that next ran off the text
3319 // string. the iterator would have been set to offset textLength if
3320 // a match is not found.
3321 search
->isForwardSearching
= FALSE
;
3322 if (matchedindex
!= USEARCH_DONE
) {
3323 return matchedindex
;
3328 if (offset
== 0 || matchedindex
== 0 ||
3329 (!search
->isOverlap
&&
3330 (offset
< strsrch
->pattern
.defaultShiftSize
||
3331 (matchedindex
!= USEARCH_DONE
&&
3332 matchedindex
< strsrch
->pattern
.defaultShiftSize
)))) {
3333 // not enough characters to match
3334 setMatchNotFound(strsrch
);
3335 return USEARCH_DONE
;
3338 // Could check pattern length, but the
3339 // linear search will do the right thing
3340 if (offset
== 0 || matchedindex
== 0) {
3341 setMatchNotFound(strsrch
);
3342 return USEARCH_DONE
;
3347 if (U_SUCCESS(*status
)) {
3348 if (strsrch
->pattern
.cesLength
== 0) {
3349 search
->matchedIndex
=
3350 (matchedindex
== USEARCH_DONE
? offset
: matchedindex
);
3351 if (search
->matchedIndex
== 0) {
3352 setMatchNotFound(strsrch
);
3353 // status checked below
3355 else { // move by codepoints
3356 U16_BACK_1(search
->text
, 0, search
->matchedIndex
);
3357 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3358 // status checked below
3359 search
->matchedLength
= 0;
3363 if (strsrch
->search
->isCanonicalMatch
) {
3364 // can't use exact here since extra accents are allowed.
3365 usearch_handlePreviousCanonical(strsrch
, status
);
3366 // status checked below
3369 usearch_handlePreviousExact(strsrch
, status
);
3370 // status checked below
3374 if (U_FAILURE(*status
)) {
3375 return USEARCH_DONE
;
3378 return search
->matchedIndex
;
3381 return USEARCH_DONE
;
3386 U_CAPI
void U_EXPORT2
usearch_reset(UStringSearch
*strsrch
)
3389 reset is setting the attributes that are already in
3390 string search, hence all attributes in the collator should
3391 be retrieved without any problems
3394 UErrorCode status
= U_ZERO_ERROR
;
3395 UBool sameCollAttribute
= TRUE
;
3400 // **** hack to deal w/ how processed CEs encode quaternary ****
3401 UCollationStrength newStrength
= ucol_getStrength(strsrch
->collator
);
3402 if ((strsrch
->strength
< UCOL_QUATERNARY
&& newStrength
>= UCOL_QUATERNARY
) ||
3403 (strsrch
->strength
>= UCOL_QUATERNARY
&& newStrength
< UCOL_QUATERNARY
)) {
3404 sameCollAttribute
= FALSE
;
3407 strsrch
->strength
= ucol_getStrength(strsrch
->collator
);
3408 ceMask
= getMask(strsrch
->strength
);
3409 if (strsrch
->ceMask
!= ceMask
) {
3410 strsrch
->ceMask
= ceMask
;
3411 sameCollAttribute
= FALSE
;
3414 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3415 shift
= ucol_getAttribute(strsrch
->collator
, UCOL_ALTERNATE_HANDLING
,
3416 &status
) == UCOL_SHIFTED
;
3417 if (strsrch
->toShift
!= shift
) {
3418 strsrch
->toShift
= shift
;
3419 sameCollAttribute
= FALSE
;
3422 // if status is a failure, ucol_getVariableTop returns 0
3423 varTop
= ucol_getVariableTop(strsrch
->collator
, &status
);
3424 if (strsrch
->variableTop
!= varTop
) {
3425 strsrch
->variableTop
= varTop
;
3426 sameCollAttribute
= FALSE
;
3428 if (!sameCollAttribute
) {
3429 initialize(strsrch
, &status
);
3431 ucol_setText(strsrch
->textIter
, strsrch
->search
->text
,
3432 strsrch
->search
->textLength
,
3434 strsrch
->search
->matchedLength
= 0;
3435 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
3436 strsrch
->search
->isOverlap
= FALSE
;
3437 strsrch
->search
->isCanonicalMatch
= FALSE
;
3438 strsrch
->search
->elementComparisonType
= 0;
3439 strsrch
->search
->isForwardSearching
= TRUE
;
3440 strsrch
->search
->reset
= TRUE
;
3445 // CEI Collation Element + source text index.
3446 // These structs are kept in the circular buffer.
3458 // CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3460 #define DEFAULT_CEBUFFER_SIZE 96
3461 #define CEBUFFER_EXTRA 32
3462 // Some typical max values to make buffer size more reasonable for asymmetric search.
3463 // #8694 is for a better long-term solution to allocation of this buffer.
3464 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3465 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3466 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3468 CEI defBuf
[DEFAULT_CEBUFFER_SIZE
];
3473 UCollationElements
*ceIter
;
3474 UStringSearch
*strSearch
;
3478 CEIBuffer(UStringSearch
*ss
, UErrorCode
*status
);
3480 const CEI
*get(int32_t index
);
3481 const CEI
*getPrevious(int32_t index
);
3485 CEIBuffer::CEIBuffer(UStringSearch
*ss
, UErrorCode
*status
) {
3488 bufSize
= ss
->pattern
.pcesLength
+ CEBUFFER_EXTRA
;
3489 if (ss
->search
->elementComparisonType
!= 0) {
3490 const UChar
* patText
= ss
->pattern
.text
;
3492 const UChar
* patTextLimit
= patText
+ ss
->pattern
.textLength
;
3493 while ( patText
< patTextLimit
) {
3494 UChar c
= *patText
++;
3495 if (MIGHT_BE_JAMO_L(c
)) {
3496 bufSize
+= MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L
;
3498 // No check for surrogates, we might allocate slightly more buffer than necessary.
3499 bufSize
+= MAX_TARGET_IGNORABLES_PER_PAT_OTHER
;
3504 ceIter
= ss
->textIter
;
3508 if (!initTextProcessedIter(ss
, status
)) { return; }
3510 if (bufSize
>DEFAULT_CEBUFFER_SIZE
) {
3511 buf
= (CEI
*)uprv_malloc(bufSize
* sizeof(CEI
));
3513 *status
= U_MEMORY_ALLOCATION_ERROR
;
3518 // TODO: add a reset or init function so that allocated
3519 // buffers can be retained & reused.
3521 CEIBuffer::~CEIBuffer() {
3522 if (buf
!= defBuf
) {
3528 // Get the CE with the specified index.
3529 // Index must be in the range
3530 // n-history_size < index < n+1
3531 // where n is the largest index to have been fetched by some previous call to this function.
3532 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3534 const CEI
*CEIBuffer::get(int32_t index
) {
3535 int i
= index
% bufSize
;
3537 if (index
>=firstIx
&& index
<limitIx
) {
3538 // The request was for an entry already in our buffer.
3543 // Caller is requesting a new, never accessed before, CE.
3544 // Verify that it is the next one in sequence, which is all
3546 if (index
!= limitIx
) {
3552 // Manage the circular CE buffer indexing
3555 if (limitIx
- firstIx
>= bufSize
) {
3556 // The buffer is full, knock out the lowest-indexed entry.
3560 UErrorCode status
= U_ZERO_ERROR
;
3562 buf
[i
].ce
= strSearch
->textProcessedIter
->nextProcessed(&buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3567 // Get the CE with the specified index.
3568 // Index must be in the range
3569 // n-history_size < index < n+1
3570 // where n is the largest index to have been fetched by some previous call to this function.
3571 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3573 const CEI
*CEIBuffer::getPrevious(int32_t index
) {
3574 int i
= index
% bufSize
;
3576 if (index
>=firstIx
&& index
<limitIx
) {
3577 // The request was for an entry already in our buffer.
3582 // Caller is requesting a new, never accessed before, CE.
3583 // Verify that it is the next one in sequence, which is all
3585 if (index
!= limitIx
) {
3591 // Manage the circular CE buffer indexing
3594 if (limitIx
- firstIx
>= bufSize
) {
3595 // The buffer is full, knock out the lowest-indexed entry.
3599 UErrorCode status
= U_ZERO_ERROR
;
3601 buf
[i
].ce
= strSearch
->textProcessedIter
->previousProcessed(&buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3611 // #define USEARCH_DEBUG
3613 #ifdef USEARCH_DEBUG
3619 * Find the next break boundary after startIndex. If the UStringSearch object
3620 * has an external break iterator, use that. Otherwise use the internal character
3623 static int32_t nextBoundaryAfter(UStringSearch
*strsrch
, int32_t startIndex
) {
3625 const UChar
*text
= strsrch
->search
->text
;
3626 int32_t textLen
= strsrch
->search
->textLength
;
3628 U_ASSERT(startIndex
>=0);
3629 U_ASSERT(startIndex
<=textLen
);
3631 if (startIndex
>= textLen
) {
3636 int32_t i
= startIndex
;
3637 U16_NEXT(text
, i
, textLen
, c
);
3639 // If we are on a control character, stop without looking for combining marks.
3640 // Control characters do not combine.
3641 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3642 if (gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
) {
3646 // The initial character was not a control, and can thus accept trailing
3647 // combining characters. Advance over however many of them there are.
3648 int32_t indexOfLastCharChecked
;
3650 indexOfLastCharChecked
= i
;
3654 U16_NEXT(text
, i
, textLen
, c
);
3655 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3656 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3660 return indexOfLastCharChecked
;
3661 #elif !UCONFIG_NO_BREAK_ITERATION
3662 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3664 if (breakiterator
== NULL
) {
3665 breakiterator
= strsrch
->search
->internalBreakIter
;
3668 if (breakiterator
!= NULL
) {
3669 return ubrk_following(breakiterator
, startIndex
);
3674 // **** or should we use the original code? ****
3681 * Returns TRUE if index is on a break boundary. If the UStringSearch
3682 * has an external break iterator, test using that, otherwise test
3683 * using the internal character break iterator.
3685 static UBool
isBreakBoundary(UStringSearch
*strsrch
, int32_t index
) {
3687 const UChar
*text
= strsrch
->search
->text
;
3688 int32_t textLen
= strsrch
->search
->textLength
;
3691 U_ASSERT(index
<=textLen
);
3693 if (index
>=textLen
|| index
<=0) {
3697 // If the character at the current index is not a GRAPHEME_EXTEND
3698 // then we can not be within a combining sequence.
3700 U16_GET(text
, 0, index
, textLen
, c
);
3701 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3702 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3706 // We are at a combining mark. If the preceding character is anything
3707 // except a CONTROL, CR or LF, we are in a combining sequence.
3708 U16_PREV(text
, 0, index
, c
);
3709 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3710 UBool combining
= !(gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
);
3712 #elif !UCONFIG_NO_BREAK_ITERATION
3713 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3715 if (breakiterator
== NULL
) {
3716 breakiterator
= strsrch
->search
->internalBreakIter
;
3719 return (breakiterator
!= NULL
&& ubrk_isBoundary(breakiterator
, index
));
3721 // **** or use the original code? ****
3727 static UBool
onBreakBoundaries(const UStringSearch
*strsrch
, int32_t start
, int32_t end
)
3729 #if !UCONFIG_NO_BREAK_ITERATION
3730 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3732 if (breakiterator
!= NULL
) {
3733 int32_t startindex
= ubrk_first(breakiterator
);
3734 int32_t endindex
= ubrk_last(breakiterator
);
3736 // out-of-range indexes are never boundary positions
3737 if (start
< startindex
|| start
> endindex
||
3738 end
< startindex
|| end
> endindex
) {
3742 return ubrk_isBoundary(breakiterator
, start
) &&
3743 ubrk_isBoundary(breakiterator
, end
);
3756 } UCompareCEsResult
;
3757 #define U_CE_LEVEL2_BASE 0x00000005
3758 #define U_CE_LEVEL3_BASE 0x00050000
3760 static UCompareCEsResult
compareCE64s(int64_t targCE
, int64_t patCE
, int16_t compareType
) {
3761 if (targCE
== patCE
) {
3764 if (compareType
== 0) {
3765 return U_CE_NO_MATCH
;
3768 int64_t targCEshifted
= targCE
>> 32;
3769 int64_t patCEshifted
= patCE
>> 32;
3773 int32_t targLev1
= (int32_t)(targCEshifted
& mask
);
3774 int32_t patLev1
= (int32_t)(patCEshifted
& mask
);
3775 if ( targLev1
!= patLev1
) {
3776 if ( targLev1
== 0 ) {
3777 return U_CE_SKIP_TARG
;
3779 if ( patLev1
== 0 && compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
3780 return U_CE_SKIP_PATN
;
3782 return U_CE_NO_MATCH
;
3786 int32_t targLev2
= (int32_t)(targCEshifted
& mask
);
3787 int32_t patLev2
= (int32_t)(patCEshifted
& mask
);
3788 if ( targLev2
!= patLev2
) {
3789 if ( targLev2
== 0 ) {
3790 return U_CE_SKIP_TARG
;
3792 if ( patLev2
== 0 && compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
3793 return U_CE_SKIP_PATN
;
3795 return (patLev2
== U_CE_LEVEL2_BASE
|| (compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
&& targLev2
== U_CE_LEVEL2_BASE
) )?
3796 U_CE_MATCH
: U_CE_NO_MATCH
;
3800 int32_t targLev3
= (int32_t)(targCE
& mask
);
3801 int32_t patLev3
= (int32_t)(patCE
& mask
);
3802 if ( targLev3
!= patLev3
) {
3803 return (patLev3
== U_CE_LEVEL3_BASE
|| (compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
&& targLev3
== U_CE_LEVEL3_BASE
) )?
3804 U_CE_MATCH
: U_CE_NO_MATCH
;
3811 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3816 UChar32
codePointAt(const USearch
&search
, int32_t index
) {
3817 if (index
< search
.textLength
) {
3819 U16_NEXT(search
.text
, index
, search
.textLength
, c
);
3825 UChar32
codePointBefore(const USearch
&search
, int32_t index
) {
3828 U16_PREV(search
.text
, 0, index
, c
);
3836 U_CAPI UBool U_EXPORT2
usearch_search(UStringSearch
*strsrch
,
3838 int32_t *matchStart
,
3839 int32_t *matchLimit
,
3842 if (U_FAILURE(*status
)) {
3846 // TODO: reject search patterns beginning with a combining char.
3848 #ifdef USEARCH_DEBUG
3849 if (getenv("USEARCH_DEBUG") != NULL
) {
3850 printf("Pattern CEs\n");
3851 for (int ii
=0; ii
<strsrch
->pattern
.cesLength
; ii
++) {
3852 printf(" %8x", strsrch
->pattern
.ces
[ii
]);
3858 // Input parameter sanity check.
3859 // TODO: should input indicies clip to the text length
3860 // in the same way that UText does.
3861 if(strsrch
->pattern
.cesLength
== 0 ||
3863 startIdx
> strsrch
->search
->textLength
||
3864 strsrch
->pattern
.ces
== NULL
) {
3865 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3869 if (strsrch
->pattern
.pces
== NULL
) {
3870 initializePatternPCETable(strsrch
, status
);
3873 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
3874 CEIBuffer
ceb(strsrch
, status
);
3877 int32_t targetIx
= 0;
3878 const CEI
*targetCEI
= NULL
;
3882 int32_t mStart
= -1;
3883 int32_t mLimit
= -1;
3889 // Outer loop moves over match starting positions in the
3891 // Here we see the target as a sequence of collation elements, resulting from the following:
3892 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3893 // (for example, digraphs such as IJ may be broken into two characters).
3894 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3895 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3896 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3897 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3898 // then the CE is deleted, so the following code sees only CEs that are relevant.
3899 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3900 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3901 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3903 for(targetIx
=0; ; targetIx
++)
3906 // Inner loop checks for a match beginning at each
3907 // position from the outer loop.
3908 int32_t targetIxOffset
= 0;
3910 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3911 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3912 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3913 const CEI
*firstCEI
= ceb
.get(targetIx
);
3914 if (firstCEI
== NULL
) {
3915 *status
= U_INTERNAL_PROGRAM_ERROR
;
3920 for (patIx
=0; patIx
<strsrch
->pattern
.pcesLength
; patIx
++) {
3921 patCE
= strsrch
->pattern
.pces
[patIx
];
3922 targetCEI
= ceb
.get(targetIx
+patIx
+targetIxOffset
);
3923 // Compare CE from target string with CE from the pattern.
3924 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3925 // which will fail the compare, below.
3926 UCompareCEsResult ceMatch
= compareCE64s(targetCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
3927 if ( ceMatch
== U_CE_NO_MATCH
) {
3930 } else if ( ceMatch
> U_CE_NO_MATCH
) {
3931 if ( ceMatch
== U_CE_SKIP_TARG
) {
3932 // redo with same patCE, next targCE
3935 } else { // ceMatch == U_CE_SKIP_PATN
3936 // redo with same targCE, next patCE
3941 targetIxOffset
+= strsrch
->pattern
.pcesLength
; // this is now the offset in target CE space to end of the match so far
3943 if (!found
&& ((targetCEI
== NULL
) || (targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
))) {
3944 // No match at this targetIx. Try again at the next.
3949 // No match at all, we have run off the end of the target text.
3954 // We have found a match in CE space.
3955 // Now determine the bounds in string index space.
3956 // There still is a chance of match failure if the CE range not correspond to
3957 // an acceptable character range.
3959 const CEI
*lastCEI
= ceb
.get(targetIx
+ targetIxOffset
- 1);
3961 mStart
= firstCEI
->lowIndex
;
3962 minLimit
= lastCEI
->lowIndex
;
3964 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3965 // extended to the end of input, and the match is good.
3967 // Look at the high and low indices of the CE following the match. If
3968 // they are the same it means one of two things:
3969 // 1. The match extended to the last CE from the target text, which is OK, or
3970 // 2. The last CE that was part of the match is in an expansion that extends
3971 // to the first CE after the match. In this case, we reject the match.
3972 const CEI
*nextCEI
= 0;
3973 if (strsrch
->search
->elementComparisonType
== 0) {
3974 nextCEI
= ceb
.get(targetIx
+ targetIxOffset
);
3975 maxLimit
= nextCEI
->lowIndex
;
3976 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
3980 for ( ; ; ++targetIxOffset
) {
3981 nextCEI
= ceb
.get(targetIx
+ targetIxOffset
);
3982 maxLimit
= nextCEI
->lowIndex
;
3983 // If we are at the end of the target too, match succeeds
3984 if ( nextCEI
->ce
== UCOL_PROCESSED_NULLORDER
) {
3987 // As long as the next CE has primary weight of 0,
3988 // it is part of the last target element matched by the pattern;
3989 // make sure it can be part of a match with the last patCE
3990 if ( (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) == 0 ) {
3991 UCompareCEsResult ceMatch
= compareCE64s(nextCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
3992 if ( ceMatch
== U_CE_NO_MATCH
|| ceMatch
== U_CE_SKIP_PATN
) {
3996 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3997 // target element, but it has non-zero primary weight => match fails
3998 } else if ( nextCEI
->lowIndex
== nextCEI
->highIndex
) {
4001 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4009 // Check for the start of the match being within a combining sequence.
4010 // This can happen if the pattern itself begins with a combining char, and
4011 // the match found combining marks in the target text that were attached
4012 // to something else.
4013 // This type of match should be rejected for not completely consuming a
4014 // combining sequence.
4015 if (!isBreakBoundary(strsrch
, mStart
)) {
4019 // Check for the start of the match being within an Collation Element Expansion,
4020 // meaning that the first char of the match is only partially matched.
4021 // With exapnsions, the first CE will report the index of the source
4022 // character, and all subsequent (expansions) CEs will report the source index of the
4023 // _following_ character.
4024 int32_t secondIx
= firstCEI
->highIndex
;
4025 if (mStart
== secondIx
) {
4029 // Allow matches to end in the middle of a grapheme cluster if the following
4030 // conditions are met; this is needed to make prefix search work properly in
4031 // Indic, see #11750
4032 // * the default breakIter is being used
4033 // * the next collation element after this combining sequence
4034 // - has non-zero primary weight
4035 // - corresponds to a separate character following the one at end of the current match
4036 // (the second of these conditions, and perhaps both, may be redundant given the
4037 // subsequent check for normalization boundary; however they are likely much faster
4038 // tests in any case)
4039 // * the match limit is a normalization boundary
4040 UBool allowMidclusterMatch
= FALSE
;
4041 if (strsrch
->search
->text
!= NULL
&& strsrch
->search
->textLength
> maxLimit
) {
4042 allowMidclusterMatch
=
4043 strsrch
->search
->breakIter
== NULL
&&
4044 nextCEI
!= NULL
&& (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) != 0 &&
4045 maxLimit
>= lastCEI
->highIndex
&& nextCEI
->highIndex
> maxLimit
&&
4046 (strsrch
->nfd
->hasBoundaryBefore(codePointAt(*strsrch
->search
, maxLimit
)) ||
4047 strsrch
->nfd
->hasBoundaryAfter(codePointBefore(*strsrch
->search
, maxLimit
)));
4049 // If those conditions are met, then:
4050 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4051 // the match limit may be backed off to a previous break boundary. This handles
4052 // cases in which mLimit includes target characters that are ignorable with current
4053 // settings (such as space) and which extend beyond the pattern match.
4054 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4055 // * do NOT require that match limit be on a breakIter boundary
4057 // Advance the match end position to the first acceptable match boundary.
4058 // This advances the index over any combining charcters.
4060 if (minLimit
< maxLimit
) {
4061 // When the last CE's low index is same with its high index, the CE is likely
4062 // a part of expansion. In this case, the index is located just after the
4063 // character corresponding to the CEs compared above. If the index is right
4064 // at the break boundary, move the position to the next boundary will result
4065 // incorrect match length when there are ignorable characters exist between
4066 // the position and the next character produces CE(s). See ticket#8482.
4067 if (minLimit
== lastCEI
->highIndex
&& isBreakBoundary(strsrch
, minLimit
)) {
4070 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4071 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4072 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4073 // (i.e. we back off mLimit to the previous breakIterator boundary).
4074 if (nba
>= lastCEI
->highIndex
&& (!allowMidclusterMatch
|| nba
< maxLimit
)) {
4080 #ifdef USEARCH_DEBUG
4081 if (getenv("USEARCH_DEBUG") != NULL
) {
4082 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4086 if (!allowMidclusterMatch
) {
4087 // If advancing to the end of a combining sequence in character indexing space
4088 // advanced us beyond the end of the match in CE space, reject this match.
4089 if (mLimit
> maxLimit
) {
4093 if (!isBreakBoundary(strsrch
, mLimit
)) {
4098 if (! checkIdentical(strsrch
, mStart
, mLimit
)) {
4107 #ifdef USEARCH_DEBUG
4108 if (getenv("USEARCH_DEBUG") != NULL
) {
4109 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4110 int32_t lastToPrint
= ceb
.limitIx
+2;
4111 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4112 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4114 printf("\n%s\n", found
? "match found" : "no match");
4118 // All Done. Store back the match bounds to the caller.
4125 if (matchStart
!= NULL
) {
4126 *matchStart
= mStart
;
4129 if (matchLimit
!= NULL
) {
4130 *matchLimit
= mLimit
;
4136 U_CAPI UBool U_EXPORT2
usearch_searchBackwards(UStringSearch
*strsrch
,
4138 int32_t *matchStart
,
4139 int32_t *matchLimit
,
4142 if (U_FAILURE(*status
)) {
4146 // TODO: reject search patterns beginning with a combining char.
4148 #ifdef USEARCH_DEBUG
4149 if (getenv("USEARCH_DEBUG") != NULL
) {
4150 printf("Pattern CEs\n");
4151 for (int ii
=0; ii
<strsrch
->pattern
.cesLength
; ii
++) {
4152 printf(" %8x", strsrch
->pattern
.ces
[ii
]);
4158 // Input parameter sanity check.
4159 // TODO: should input indicies clip to the text length
4160 // in the same way that UText does.
4161 if(strsrch
->pattern
.cesLength
== 0 ||
4163 startIdx
> strsrch
->search
->textLength
||
4164 strsrch
->pattern
.ces
== NULL
) {
4165 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
4169 if (strsrch
->pattern
.pces
== NULL
) {
4170 initializePatternPCETable(strsrch
, status
);
4173 CEIBuffer
ceb(strsrch
, status
);
4174 int32_t targetIx
= 0;
4177 * Pre-load the buffer with the CE's for the grapheme
4178 * after our starting position so that we're sure that
4179 * we can look at the CE following the match when we
4180 * check the match boundaries.
4182 * This will also pre-fetch the first CE that we'll
4183 * consider for the match.
4185 if (startIdx
< strsrch
->search
->textLength
) {
4186 UBreakIterator
*bi
= strsrch
->search
->internalBreakIter
;
4187 int32_t next
= ubrk_following(bi
, startIdx
);
4189 ucol_setOffset(strsrch
->textIter
, next
, status
);
4191 for (targetIx
= 0; ; targetIx
+= 1) {
4192 if (ceb
.getPrevious(targetIx
)->lowIndex
< startIdx
) {
4197 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
4201 const CEI
*targetCEI
= NULL
;
4205 int32_t limitIx
= targetIx
;
4206 int32_t mStart
= -1;
4207 int32_t mLimit
= -1;
4213 // Outer loop moves over match starting positions in the
4215 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4216 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4217 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4218 // and the beginning of the base text.
4219 for(targetIx
= limitIx
; ; targetIx
+= 1)
4222 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4223 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4224 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4225 const CEI
*lastCEI
= ceb
.getPrevious(targetIx
);
4226 if (lastCEI
== NULL
) {
4227 *status
= U_INTERNAL_PROGRAM_ERROR
;
4231 // Inner loop checks for a match beginning at each
4232 // position from the outer loop.
4233 int32_t targetIxOffset
= 0;
4234 for (patIx
= strsrch
->pattern
.pcesLength
- 1; patIx
>= 0; patIx
-= 1) {
4235 int64_t patCE
= strsrch
->pattern
.pces
[patIx
];
4237 targetCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.pcesLength
- 1 - patIx
+ targetIxOffset
);
4238 // Compare CE from target string with CE from the pattern.
4239 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4240 // which will fail the compare, below.
4241 UCompareCEsResult ceMatch
= compareCE64s(targetCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
4242 if ( ceMatch
== U_CE_NO_MATCH
) {
4245 } else if ( ceMatch
> U_CE_NO_MATCH
) {
4246 if ( ceMatch
== U_CE_SKIP_TARG
) {
4247 // redo with same patCE, next targCE
4250 } else { // ceMatch == U_CE_SKIP_PATN
4251 // redo with same targCE, next patCE
4257 if (!found
&& ((targetCEI
== NULL
) || (targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
))) {
4258 // No match at this targetIx. Try again at the next.
4263 // No match at all, we have run off the end of the target text.
4268 // We have found a match in CE space.
4269 // Now determine the bounds in string index space.
4270 // There still is a chance of match failure if the CE range not correspond to
4271 // an acceptable character range.
4273 const CEI
*firstCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.pcesLength
- 1 + targetIxOffset
);
4274 mStart
= firstCEI
->lowIndex
;
4276 // Check for the start of the match being within a combining sequence.
4277 // This can happen if the pattern itself begins with a combining char, and
4278 // the match found combining marks in the target text that were attached
4279 // to something else.
4280 // This type of match should be rejected for not completely consuming a
4281 // combining sequence.
4282 if (!isBreakBoundary(strsrch
, mStart
)) {
4286 // Look at the high index of the first CE in the match. If it's the same as the
4287 // low index, the first CE in the match is in the middle of an expansion.
4288 if (mStart
== firstCEI
->highIndex
) {
4293 minLimit
= lastCEI
->lowIndex
;
4296 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4297 // extended to the end of input, and the match is good.
4299 // Look at the high and low indices of the CE following the match. If
4300 // they are the same it means one of two things:
4301 // 1. The match extended to the last CE from the target text, which is OK, or
4302 // 2. The last CE that was part of the match is in an expansion that extends
4303 // to the first CE after the match. In this case, we reject the match.
4304 const CEI
*nextCEI
= ceb
.getPrevious(targetIx
- 1);
4306 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
4310 mLimit
= maxLimit
= nextCEI
->lowIndex
;
4312 // Allow matches to end in the middle of a grapheme cluster if the following
4313 // conditions are met; this is needed to make prefix search work properly in
4314 // Indic, see #11750
4315 // * the default breakIter is being used
4316 // * the next collation element after this combining sequence
4317 // - has non-zero primary weight
4318 // - corresponds to a separate character following the one at end of the current match
4319 // (the second of these conditions, and perhaps both, may be redundant given the
4320 // subsequent check for normalization boundary; however they are likely much faster
4321 // tests in any case)
4322 // * the match limit is a normalization boundary
4323 UBool allowMidclusterMatch
= FALSE
;
4324 if (strsrch
->search
->text
!= NULL
&& strsrch
->search
->textLength
> maxLimit
) {
4325 allowMidclusterMatch
=
4326 strsrch
->search
->breakIter
== NULL
&&
4327 nextCEI
!= NULL
&& (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) != 0 &&
4328 maxLimit
>= lastCEI
->highIndex
&& nextCEI
->highIndex
> maxLimit
&&
4329 (strsrch
->nfd
->hasBoundaryBefore(codePointAt(*strsrch
->search
, maxLimit
)) ||
4330 strsrch
->nfd
->hasBoundaryAfter(codePointBefore(*strsrch
->search
, maxLimit
)));
4332 // If those conditions are met, then:
4333 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4334 // the match limit may be backed off to a previous break boundary. This handles
4335 // cases in which mLimit includes target characters that are ignorable with current
4336 // settings (such as space) and which extend beyond the pattern match.
4337 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4338 // * do NOT require that match limit be on a breakIter boundary
4340 // Advance the match end position to the first acceptable match boundary.
4341 // This advances the index over any combining characters.
4342 if (minLimit
< maxLimit
) {
4343 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4344 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4345 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4346 // (i.e. we back off mLimit to the previous breakIterator boundary).
4347 if (nba
>= lastCEI
->highIndex
&& (!allowMidclusterMatch
|| nba
< maxLimit
)) {
4352 if (!allowMidclusterMatch
) {
4353 // If advancing to the end of a combining sequence in character indexing space
4354 // advanced us beyond the end of the match in CE space, reject this match.
4355 if (mLimit
> maxLimit
) {
4359 // Make sure the end of the match is on a break boundary
4360 if (!isBreakBoundary(strsrch
, mLimit
)) {
4366 // No non-ignorable CEs after this point.
4367 // The maximum position is detected by boundary after
4368 // the last non-ignorable CE. Combining sequence
4369 // across the start index will be truncated.
4370 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4371 mLimit
= maxLimit
= (nba
> 0) && (startIdx
> nba
) ? nba
: startIdx
;
4374 #ifdef USEARCH_DEBUG
4375 if (getenv("USEARCH_DEBUG") != NULL
) {
4376 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4381 if (! checkIdentical(strsrch
, mStart
, mLimit
)) {
4390 #ifdef USEARCH_DEBUG
4391 if (getenv("USEARCH_DEBUG") != NULL
) {
4392 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4393 int32_t lastToPrint
= ceb
.limitIx
+2;
4394 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4395 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4397 printf("\n%s\n", found
? "match found" : "no match");
4401 // All Done. Store back the match bounds to the caller.
4408 if (matchStart
!= NULL
) {
4409 *matchStart
= mStart
;
4412 if (matchLimit
!= NULL
) {
4413 *matchLimit
= mLimit
;
4419 // internal use methods declared in usrchimp.h -----------------------------
4421 UBool
usearch_handleNextExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4423 if (U_FAILURE(*status
)) {
4424 setMatchNotFound(strsrch
);
4429 UCollationElements
*coleiter
= strsrch
->textIter
;
4430 int32_t textlength
= strsrch
->search
->textLength
;
4431 int32_t *patternce
= strsrch
->pattern
.ces
;
4432 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4433 int32_t textoffset
= ucol_getOffset(coleiter
);
4435 // status used in setting coleiter offset, since offset is checked in
4436 // shiftForward before setting the coleiter offset, status never
4438 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4440 while (textoffset
<= textlength
)
4442 uint32_t patternceindex
= patterncelength
- 1;
4444 UBool found
= FALSE
;
4445 int32_t lastce
= UCOL_NULLORDER
;
4447 setColEIterOffset(coleiter
, textoffset
);
4450 // finding the last pattern ce match, imagine composite characters
4451 // for example: search for pattern A in text \u00C0
4452 // we'll have to skip \u0300 the grave first before we get to A
4453 targetce
= ucol_previous(coleiter
, status
);
4454 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4458 targetce
= getCE(strsrch
, targetce
);
4459 if (targetce
== UCOL_IGNORABLE
&& inNormBuf(coleiter
)) {
4460 // this is for the text \u0315\u0300 that requires
4461 // normalization and pattern \u0300, where \u0315 is ignorable
4464 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4467 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4468 if (targetce
== patternce
[patternceindex
]) {
4469 // the first ce can be a contraction
4473 if (!hasExpansion(coleiter
)) {
4479 //targetce = lastce;
4481 while (found
&& patternceindex
> 0) {
4483 targetce
= ucol_previous(coleiter
, status
);
4484 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4488 targetce
= getCE(strsrch
, targetce
);
4489 if (targetce
== UCOL_IGNORABLE
) {
4494 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4495 found
= found
&& targetce
== patternce
[patternceindex
];
4501 if (U_FAILURE(*status
)) {
4504 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4506 // status checked at loop.
4507 patternceindex
= patterncelength
;
4511 if (checkNextExactMatch(strsrch
, &textoffset
, status
)) {
4512 // status checked in ucol_setOffset
4513 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4517 setMatchNotFound(strsrch
);
4520 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4524 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4525 strsrch
->search
->matchedIndex
= start
;
4526 strsrch
->search
->matchedLength
= end
- start
;
4529 setMatchNotFound(strsrch
);
4535 UBool
usearch_handleNextCanonical(UStringSearch
*strsrch
, UErrorCode
*status
)
4537 if (U_FAILURE(*status
)) {
4538 setMatchNotFound(strsrch
);
4543 UCollationElements
*coleiter
= strsrch
->textIter
;
4544 int32_t textlength
= strsrch
->search
->textLength
;
4545 int32_t *patternce
= strsrch
->pattern
.ces
;
4546 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4547 int32_t textoffset
= ucol_getOffset(coleiter
);
4548 UBool hasPatternAccents
=
4549 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4551 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4553 strsrch
->canonicalPrefixAccents
[0] = 0;
4554 strsrch
->canonicalSuffixAccents
[0] = 0;
4556 while (textoffset
<= textlength
)
4558 int32_t patternceindex
= patterncelength
- 1;
4560 UBool found
= FALSE
;
4561 int32_t lastce
= UCOL_NULLORDER
;
4563 setColEIterOffset(coleiter
, textoffset
);
4566 // finding the last pattern ce match, imagine composite characters
4567 // for example: search for pattern A in text \u00C0
4568 // we'll have to skip \u0300 the grave first before we get to A
4569 targetce
= ucol_previous(coleiter
, status
);
4570 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4574 targetce
= getCE(strsrch
, targetce
);
4575 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4578 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4579 if (targetce
== patternce
[patternceindex
]) {
4580 // the first ce can be a contraction
4584 if (!hasExpansion(coleiter
)) {
4590 while (found
&& patternceindex
> 0) {
4591 targetce
= ucol_previous(coleiter
, status
);
4592 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4596 targetce
= getCE(strsrch
, targetce
);
4597 if (targetce
== UCOL_IGNORABLE
) {
4602 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4603 found
= found
&& targetce
== patternce
[patternceindex
];
4606 // initializing the rearranged accent array
4607 if (hasPatternAccents
&& !found
) {
4608 strsrch
->canonicalPrefixAccents
[0] = 0;
4609 strsrch
->canonicalSuffixAccents
[0] = 0;
4610 if (U_FAILURE(*status
)) {
4613 found
= doNextCanonicalMatch(strsrch
, textoffset
, status
);
4617 if (U_FAILURE(*status
)) {
4620 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4622 // status checked at loop
4623 patternceindex
= patterncelength
;
4627 if (checkNextCanonicalMatch(strsrch
, &textoffset
, status
)) {
4628 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4632 setMatchNotFound(strsrch
);
4635 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4639 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4640 strsrch
->search
->matchedIndex
= start
;
4641 strsrch
->search
->matchedLength
= end
- start
;
4644 setMatchNotFound(strsrch
);
4650 UBool
usearch_handlePreviousExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4652 if (U_FAILURE(*status
)) {
4653 setMatchNotFound(strsrch
);
4658 UCollationElements
*coleiter
= strsrch
->textIter
;
4659 int32_t *patternce
= strsrch
->pattern
.ces
;
4660 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4661 int32_t textoffset
= ucol_getOffset(coleiter
);
4663 // shifting it check for setting offset
4664 // if setOffset is called previously or there was no previous match, we
4665 // leave the offset as it is.
4666 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4667 textoffset
= strsrch
->search
->matchedIndex
;
4670 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
4673 while (textoffset
>= 0)
4675 int32_t patternceindex
= 1;
4677 UBool found
= FALSE
;
4678 int32_t firstce
= UCOL_NULLORDER
;
4680 // if status is a failure, ucol_setOffset does nothing
4681 setColEIterOffset(coleiter
, textoffset
);
4684 // finding the first pattern ce match, imagine composite
4685 // characters. for example: search for pattern \u0300 in text
4686 // \u00C0, we'll have to skip A first before we get to
4687 // \u0300 the grave accent
4688 targetce
= ucol_next(coleiter
, status
);
4689 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4693 targetce
= getCE(strsrch
, targetce
);
4694 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
4697 if (targetce
== UCOL_IGNORABLE
&& strsrch
->strength
!= UCOL_PRIMARY
) {
4700 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4701 if (targetce
== patternce
[0]) {
4705 if (!hasExpansion(coleiter
)) {
4706 // checking for accents in composite character
4712 //targetce = firstce;
4714 while (found
&& (patternceindex
< patterncelength
)) {
4716 targetce
= ucol_next(coleiter
, status
);
4717 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4721 targetce
= getCE(strsrch
, targetce
);
4722 if (targetce
== UCOL_IGNORABLE
) {
4726 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4727 found
= found
&& targetce
== patternce
[patternceindex
];
4734 if (U_FAILURE(*status
)) {
4738 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
4744 if (checkPreviousExactMatch(strsrch
, &textoffset
, status
)) {
4745 setColEIterOffset(coleiter
, textoffset
);
4749 setMatchNotFound(strsrch
);
4754 if (strsrch
->search
->isOverlap
) {
4755 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4756 textOffset
= strsrch
->search
->matchedIndex
+ strsrch
->search
->matchedLength
- 1;
4758 // move the start position at the end of possible match
4759 initializePatternPCETable(strsrch
, status
);
4760 if (!initTextProcessedIter(strsrch
, status
)) {
4761 setMatchNotFound(strsrch
);
4764 for (int32_t nPCEs
= 0; nPCEs
< strsrch
->pattern
.pcesLength
- 1; nPCEs
++) {
4765 int64_t pce
= strsrch
->textProcessedIter
->nextProcessed(NULL
, NULL
, status
);
4766 if (pce
== UCOL_PROCESSED_NULLORDER
) {
4767 // at the end of the text
4771 if (U_FAILURE(*status
)) {
4772 setMatchNotFound(strsrch
);
4775 textOffset
= ucol_getOffset(strsrch
->textIter
);
4778 textOffset
= ucol_getOffset(strsrch
->textIter
);
4784 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4785 strsrch
->search
->matchedIndex
= start
;
4786 strsrch
->search
->matchedLength
= end
- start
;
4789 setMatchNotFound(strsrch
);
4795 UBool
usearch_handlePreviousCanonical(UStringSearch
*strsrch
,
4798 if (U_FAILURE(*status
)) {
4799 setMatchNotFound(strsrch
);
4804 UCollationElements
*coleiter
= strsrch
->textIter
;
4805 int32_t *patternce
= strsrch
->pattern
.ces
;
4806 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4807 int32_t textoffset
= ucol_getOffset(coleiter
);
4808 UBool hasPatternAccents
=
4809 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4811 // shifting it check for setting offset
4812 // if setOffset is called previously or there was no previous match, we
4813 // leave the offset as it is.
4814 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4815 textoffset
= strsrch
->search
->matchedIndex
;
4818 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
4820 strsrch
->canonicalPrefixAccents
[0] = 0;
4821 strsrch
->canonicalSuffixAccents
[0] = 0;
4823 while (textoffset
>= 0)
4825 int32_t patternceindex
= 1;
4827 UBool found
= FALSE
;
4828 int32_t firstce
= UCOL_NULLORDER
;
4830 setColEIterOffset(coleiter
, textoffset
);
4832 // finding the first pattern ce match, imagine composite
4833 // characters. for example: search for pattern \u0300 in text
4834 // \u00C0, we'll have to skip A first before we get to
4835 // \u0300 the grave accent
4836 targetce
= ucol_next(coleiter
, status
);
4837 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4841 targetce
= getCE(strsrch
, targetce
);
4842 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
4846 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4847 if (targetce
== patternce
[0]) {
4848 // the first ce can be a contraction
4852 if (!hasExpansion(coleiter
)) {
4853 // checking for accents in composite character
4861 while (found
&& patternceindex
< patterncelength
) {
4862 targetce
= ucol_next(coleiter
, status
);
4863 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4867 targetce
= getCE(strsrch
, targetce
);
4868 if (targetce
== UCOL_IGNORABLE
) {
4872 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4873 found
= found
&& targetce
== patternce
[patternceindex
];
4877 // initializing the rearranged accent array
4878 if (hasPatternAccents
&& !found
) {
4879 strsrch
->canonicalPrefixAccents
[0] = 0;
4880 strsrch
->canonicalSuffixAccents
[0] = 0;
4881 if (U_FAILURE(*status
)) {
4884 found
= doPreviousCanonicalMatch(strsrch
, textoffset
, status
);
4888 if (U_FAILURE(*status
)) {
4891 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
4897 if (checkPreviousCanonicalMatch(strsrch
, &textoffset
, status
)) {
4898 setColEIterOffset(coleiter
, textoffset
);
4902 setMatchNotFound(strsrch
);
4907 if (strsrch
->search
->isOverlap
) {
4908 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4909 textOffset
= strsrch
->search
->matchedIndex
+ strsrch
->search
->matchedLength
- 1;
4911 // move the start position at the end of possible match
4912 initializePatternPCETable(strsrch
, status
);
4913 if (!initTextProcessedIter(strsrch
, status
)) {
4914 setMatchNotFound(strsrch
);
4917 for (int32_t nPCEs
= 0; nPCEs
< strsrch
->pattern
.pcesLength
- 1; nPCEs
++) {
4918 int64_t pce
= strsrch
->textProcessedIter
->nextProcessed(NULL
, NULL
, status
);
4919 if (pce
== UCOL_PROCESSED_NULLORDER
) {
4920 // at the end of the text
4924 if (U_FAILURE(*status
)) {
4925 setMatchNotFound(strsrch
);
4928 textOffset
= ucol_getOffset(strsrch
->textIter
);
4931 textOffset
= ucol_getOffset(strsrch
->textIter
);
4937 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4938 strsrch
->search
->matchedIndex
= start
;
4939 strsrch
->search
->matchedLength
= end
- start
;
4942 setMatchNotFound(strsrch
);
4948 #endif /* #if !UCONFIG_NO_COLLATION */