1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2001-2015 IBM and others. All rights reserved.
6 **********************************************************************
7 * Date Name Description
8 * 07/02/2001 synwee Creation.
9 **********************************************************************
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
16 #include "unicode/usearch.h"
17 #include "unicode/ustring.h"
18 #include "unicode/uchar.h"
19 #include "unicode/utf16.h"
20 #include "normalizer2impl.h"
26 #if U_PLATFORM_IS_DARWIN_BASED
27 // Add logging for error conditions as in <rdar://51554495>
33 // don't use Boyer-Moore
34 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
37 // internal definition ---------------------------------------------------
39 #define LAST_BYTE_MASK_ 0xFF
40 #define SECOND_LAST_BYTE_SHIFT_ 8
41 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
43 static const Normalizer2Impl
*g_nfcImpl
= NULL
;
45 // internal methods -------------------------------------------------
48 * Fast collation element iterator setOffset.
49 * This function does not check for bounds.
50 * @param coleiter collation element iterator
51 * @param offset to set
54 inline void setColEIterOffset(UCollationElements
*elems
,
57 // Note: Not "fast" any more after the 2013 collation rewrite.
58 // We do not want to expose more internals than necessary.
59 UErrorCode status
= U_ZERO_ERROR
;
60 ucol_setOffset(elems
, offset
, &status
);
64 * Getting the mask for collation strength
65 * @param strength collation strength
66 * @return collation element mask
69 inline uint32_t getMask(UCollationStrength strength
)
74 return UCOL_PRIMARYORDERMASK
;
76 return UCOL_SECONDARYORDERMASK
| UCOL_PRIMARYORDERMASK
;
78 return UCOL_TERTIARYORDERMASK
| UCOL_SECONDARYORDERMASK
|
79 UCOL_PRIMARYORDERMASK
;
84 * @param ce 32-bit collation element
88 inline int hashFromCE32(uint32_t ce
)
91 ((((((ce
>> 24) * 37) +
95 hc
%= MAX_TABLE_SIZE_
;
97 hc
+= MAX_TABLE_SIZE_
;
103 static UBool U_CALLCONV
104 usearch_cleanup(void) {
111 * Initializing the fcd tables.
112 * Internal method, status assumed to be a success.
113 * @param status output error if any, caller to check status before calling
114 * method, status assumed to be success when passed in.
117 inline void initializeFCD(UErrorCode
*status
)
119 if (g_nfcImpl
== NULL
) {
120 g_nfcImpl
= Normalizer2Factory::getNFCImpl(*status
);
121 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH
, usearch_cleanup
);
126 * Gets the fcd value for a character at the argument index.
127 * This method takes into accounts of the supplementary characters.
128 * @param str UTF16 string where character for fcd retrieval resides
129 * @param offset position of the character whose fcd is to be retrieved, to be
130 * overwritten with the next character position, taking
131 * surrogate characters into consideration.
132 * @param strlength length of the argument string
136 uint16_t getFCD(const UChar
*str
, int32_t *offset
,
139 const UChar
*temp
= str
+ *offset
;
140 uint16_t result
= g_nfcImpl
->nextFCD16(temp
, str
+ strlength
);
141 *offset
= (int32_t)(temp
- str
);
146 * Getting the modified collation elements taking into account the collation
148 * @param strsrch string search data
150 * @return the modified collation element
153 inline int32_t getCE(const UStringSearch
*strsrch
, uint32_t sourcece
)
155 // note for tertiary we can't use the collator->tertiaryMask, that
156 // is a preprocessed mask that takes into account case options. since
157 // we are only concerned with exact matches, we don't need that.
158 sourcece
&= strsrch
->ceMask
;
160 if (strsrch
->toShift
) {
161 // alternate handling here, since only the 16 most significant digits
162 // is only used, we can safely do a compare without masking
163 // if the ce is a variable, we mask and get only the primary values
164 // no shifting to quartenary is required since all primary values
165 // less than variabletop will need to be masked off anyway.
166 if (strsrch
->variableTop
> sourcece
) {
167 if (strsrch
->strength
>= UCOL_QUATERNARY
) {
168 sourcece
&= UCOL_PRIMARYORDERMASK
;
171 sourcece
= UCOL_IGNORABLE
;
174 } else if (strsrch
->strength
>= UCOL_QUATERNARY
&& sourcece
== UCOL_IGNORABLE
) {
182 * Allocate a memory and returns NULL if it failed.
183 * Internal method, status assumed to be a success.
184 * @param size to allocate
185 * @param status output error if any, caller to check status before calling
186 * method, status assumed to be success when passed in.
187 * @return newly allocated array, NULL otherwise
190 inline void * allocateMemory(uint32_t size
, UErrorCode
*status
)
192 uint32_t *result
= (uint32_t *)uprv_malloc(size
);
193 if (result
== NULL
) {
194 *status
= U_MEMORY_ALLOCATION_ERROR
;
200 * Adds a uint32_t value to a destination array.
201 * Creates a new array if we run out of space. The caller will have to
202 * manually deallocate the newly allocated array.
203 * Internal method, status assumed to be success, caller has to check status
204 * before calling this method. destination not to be NULL and has at least
205 * size destinationlength.
206 * @param destination target array
207 * @param offset destination offset to add value
208 * @param destinationlength target array size, return value for the new size
209 * @param value to be added
210 * @param increments incremental size expected
211 * @param status output error if any, caller to check status before calling
212 * method, status assumed to be success when passed in.
213 * @return new destination array, destination if there was no new allocation
216 inline int32_t * addTouint32_tArray(int32_t *destination
,
218 uint32_t *destinationlength
,
223 uint32_t newlength
= *destinationlength
;
224 if (offset
+ 1 == newlength
) {
225 newlength
+= increments
;
226 int32_t *temp
= (int32_t *)allocateMemory(
227 sizeof(int32_t) * newlength
, status
);
228 if (U_FAILURE(*status
)) {
231 uprv_memcpy(temp
, destination
, sizeof(int32_t) * (size_t)offset
);
232 *destinationlength
= newlength
;
235 destination
[offset
] = value
;
240 * Adds a uint64_t value to a destination array.
241 * Creates a new array if we run out of space. The caller will have to
242 * manually deallocate the newly allocated array.
243 * Internal method, status assumed to be success, caller has to check status
244 * before calling this method. destination not to be NULL and has at least
245 * size destinationlength.
246 * @param destination target array
247 * @param offset destination offset to add value
248 * @param destinationlength target array size, return value for the new size
249 * @param value to be added
250 * @param increments incremental size expected
251 * @param status output error if any, caller to check status before calling
252 * method, status assumed to be success when passed in.
253 * @return new destination array, destination if there was no new allocation
256 inline int64_t * addTouint64_tArray(int64_t *destination
,
258 uint32_t *destinationlength
,
263 uint32_t newlength
= *destinationlength
;
264 if (offset
+ 1 == newlength
) {
265 newlength
+= increments
;
266 int64_t *temp
= (int64_t *)allocateMemory(
267 sizeof(int64_t) * newlength
, status
);
269 if (U_FAILURE(*status
)) {
273 uprv_memcpy(temp
, destination
, sizeof(int64_t) * (size_t)offset
);
274 *destinationlength
= newlength
;
278 destination
[offset
] = value
;
284 * Initializing the ce table for a pattern.
285 * Stores non-ignorable collation keys.
286 * Table size will be estimated by the size of the pattern text. Table
287 * expansion will be perform as we go along. Adding 1 to ensure that the table
288 * size definitely increases.
289 * Internal method, status assumed to be a success.
290 * @param strsrch string search data
291 * @param status output error if any, caller to check status before calling
292 * method, status assumed to be success when passed in.
293 * @return total number of expansions
296 inline uint16_t initializePatternCETable(UStringSearch
*strsrch
,
299 UPattern
*pattern
= &(strsrch
->pattern
);
300 uint32_t cetablesize
= INITIAL_ARRAY_SIZE_
;
301 int32_t *cetable
= pattern
->cesBuffer
;
302 uint32_t patternlength
= pattern
->textLength
;
303 UCollationElements
*coleiter
= strsrch
->utilIter
;
305 if (coleiter
== NULL
) {
306 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
307 patternlength
, status
);
308 // status will be checked in ucol_next(..) later and if it is an
309 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
311 strsrch
->utilIter
= coleiter
;
314 ucol_setText(coleiter
, pattern
->text
, pattern
->textLength
, status
);
316 if(U_FAILURE(*status
)) {
320 if (pattern
->ces
!= cetable
&& pattern
->ces
) {
321 uprv_free(pattern
->ces
);
328 while ((ce
= ucol_next(coleiter
, status
)) != UCOL_NULLORDER
&&
329 U_SUCCESS(*status
)) {
330 uint32_t newce
= getCE(strsrch
, ce
);
332 int32_t *temp
= addTouint32_tArray(cetable
, offset
, &cetablesize
,
334 patternlength
- ucol_getOffset(coleiter
) + 1,
336 if (U_FAILURE(*status
)) {
340 if (cetable
!= temp
&& cetable
!= pattern
->cesBuffer
) {
345 result
+= (uint16_t)(ucol_getMaxExpansion(coleiter
, ce
) - 1);
349 pattern
->ces
= cetable
;
350 pattern
->cesLength
= offset
;
356 * Initializing the pce table for a pattern.
357 * Stores non-ignorable collation keys.
358 * Table size will be estimated by the size of the pattern text. Table
359 * expansion will be perform as we go along. Adding 1 to ensure that the table
360 * size definitely increases.
361 * Internal method, status assumed to be a success.
362 * @param strsrch string search data
363 * @param status output error if any, caller to check status before calling
364 * method, status assumed to be success when passed in.
365 * @return total number of expansions
368 inline uint16_t initializePatternPCETable(UStringSearch
*strsrch
,
371 UPattern
*pattern
= &(strsrch
->pattern
);
372 uint32_t pcetablesize
= INITIAL_ARRAY_SIZE_
;
373 int64_t *pcetable
= pattern
->pcesBuffer
;
374 uint32_t patternlength
= pattern
->textLength
;
375 UCollationElements
*coleiter
= strsrch
->utilIter
;
377 if (coleiter
== NULL
) {
378 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
379 patternlength
, status
);
380 // status will be checked in ucol_next(..) later and if it is an
381 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
383 strsrch
->utilIter
= coleiter
;
385 ucol_setText(coleiter
, pattern
->text
, pattern
->textLength
, status
);
387 if(U_FAILURE(*status
)) {
391 if (pattern
->pces
!= pcetable
&& pattern
->pces
!= NULL
) {
392 uprv_free(pattern
->pces
);
399 icu::UCollationPCE
iter(coleiter
);
401 // ** Should processed CEs be signed or unsigned?
402 // ** (the rest of the code in this file seems to play fast-and-loose with
403 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
404 while ((pce
= iter
.nextProcessed(NULL
, NULL
, status
)) != UCOL_PROCESSED_NULLORDER
&&
405 U_SUCCESS(*status
)) {
406 int64_t *temp
= addTouint64_tArray(pcetable
, offset
, &pcetablesize
,
408 patternlength
- ucol_getOffset(coleiter
) + 1,
411 if (U_FAILURE(*status
)) {
417 if (pcetable
!= temp
&& pcetable
!= pattern
->pcesBuffer
) {
422 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
425 pcetable
[offset
] = 0;
426 pattern
->pces
= pcetable
;
427 pattern
->pcesLength
= offset
;
433 * Initializes the pattern struct.
434 * Internal method, status assumed to be success.
435 * @param strsrch UStringSearch data storage
436 * @param status output error if any, caller to check status before calling
437 * method, status assumed to be success when passed in.
438 * @return expansionsize the total expansion size of the pattern
441 inline int16_t initializePattern(UStringSearch
*strsrch
, UErrorCode
*status
)
443 if (U_FAILURE(*status
)) { return 0; }
444 UPattern
*pattern
= &(strsrch
->pattern
);
445 const UChar
*patterntext
= pattern
->text
;
446 int32_t length
= pattern
->textLength
;
449 // Since the strength is primary, accents are ignored in the pattern.
450 if (strsrch
->strength
== UCOL_PRIMARY
) {
451 pattern
->hasPrefixAccents
= 0;
452 pattern
->hasSuffixAccents
= 0;
454 pattern
->hasPrefixAccents
= getFCD(patterntext
, &index
, length
) >>
455 SECOND_LAST_BYTE_SHIFT_
;
457 U16_BACK_1(patterntext
, 0, index
);
458 pattern
->hasSuffixAccents
= getFCD(patterntext
, &index
, length
) &
463 if (strsrch
->pattern
.pces
!= NULL
) {
464 if (strsrch
->pattern
.pces
!= strsrch
->pattern
.pcesBuffer
) {
465 uprv_free(strsrch
->pattern
.pces
);
468 strsrch
->pattern
.pces
= NULL
;
471 // since intializePattern is an internal method status is a success.
472 return initializePatternCETable(strsrch
, status
);
476 * Initializing shift tables, with the default values.
477 * If a corresponding default value is 0, the shift table is not set.
478 * @param shift table for forwards shift
479 * @param backshift table for backwards shift
480 * @param cetable table containing pattern ce
481 * @param cesize size of the pattern ces
482 * @param expansionsize total size of the expansions
483 * @param defaultforward the default forward value
484 * @param defaultbackward the default backward value
487 inline void setShiftTable(int16_t shift
[], int16_t backshift
[],
488 int32_t *cetable
, int32_t cesize
,
489 int16_t expansionsize
,
490 int16_t defaultforward
,
491 int16_t defaultbackward
)
493 // estimate the value to shift. to do that we estimate the smallest
494 // number of characters to give the relevant ces, ie approximately
495 // the number of ces minus their expansion, since expansions can come
498 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
499 shift
[count
] = defaultforward
;
501 cesize
--; // down to the last index
502 for (count
= 0; count
< cesize
; count
++) {
503 // number of ces from right of array to the count
504 int temp
= defaultforward
- count
- 1;
505 shift
[hashFromCE32(cetable
[count
])] = temp
> 1 ? static_cast<int16_t>(temp
) : 1;
507 shift
[hashFromCE32(cetable
[cesize
])] = 1;
508 // for ignorables we just shift by one. see test examples.
509 shift
[hashFromCE32(0)] = 1;
511 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
512 backshift
[count
] = defaultbackward
;
514 for (count
= cesize
; count
> 0; count
--) {
515 // the original value count does not seem to work
516 backshift
[hashFromCE32(cetable
[count
])] = count
> expansionsize
?
517 (int16_t)(count
- expansionsize
) : 1;
519 backshift
[hashFromCE32(cetable
[0])] = 1;
520 backshift
[hashFromCE32(0)] = 1;
524 * Building of the pattern collation element list and the boyer moore strsrch
526 * The canonical match will only be performed after the default match fails.
527 * For both cases we need to remember the size of the composed and decomposed
528 * versions of the string. Since the Boyer-Moore shift calculations shifts by
529 * a number of characters in the text and tries to match the pattern from that
530 * offset, the shift value can not be too large in case we miss some
531 * characters. To choose a right shift size, we estimate the NFC form of the
532 * and use its size as a shift guide. The NFC form should be the small
533 * possible representation of the pattern. Anyways, we'll err on the smaller
534 * shift size. Hence the calculation for minlength.
535 * Canonical match will be performed slightly differently. We'll split the
536 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
537 * the first and last base character (MS), the ending accents (EA). Matches
538 * will be done on MS first, and only when we match MS then some processing
539 * will be required for the prefix and end accents in order to determine if
540 * they match PA and EA. Hence the default shift values
541 * for the canonical match will take the size of either end's accent into
542 * consideration. Forwards search will take the end accents into consideration
543 * for the default shift values and the backwards search will take the prefix
544 * accents into consideration.
545 * If pattern has no non-ignorable ce, we return a illegal argument error.
546 * Internal method, status assumed to be success.
547 * @param strsrch UStringSearch data storage
548 * @param status for output errors if it occurs, status is assumed to be a
549 * success when it is passed in.
552 inline void initialize(UStringSearch
*strsrch
, UErrorCode
*status
)
554 int16_t expandlength
= initializePattern(strsrch
, status
);
555 if (U_SUCCESS(*status
) && strsrch
->pattern
.cesLength
> 0) {
556 UPattern
*pattern
= &strsrch
->pattern
;
557 int32_t cesize
= pattern
->cesLength
;
559 int16_t minlength
= cesize
> expandlength
560 ? (int16_t)cesize
- expandlength
: 1;
561 pattern
->defaultShiftSize
= minlength
;
562 setShiftTable(pattern
->shift
, pattern
->backShift
, pattern
->ces
,
563 cesize
, expandlength
, minlength
, minlength
);
566 strsrch
->pattern
.defaultShiftSize
= 0;
571 * Check to make sure that the match length is at the end of the character by
572 * using the breakiterator.
573 * @param strsrch string search data
574 * @param start target text start offset
575 * @param end target text end offset
578 void checkBreakBoundary(const UStringSearch
*strsrch
, int32_t * /*start*/,
581 #if !UCONFIG_NO_BREAK_ITERATION
582 UBreakIterator
*breakiterator
= strsrch
->search
->internalBreakIter
;
584 int32_t matchend
= *end
;
585 //int32_t matchstart = *start;
587 if (!ubrk_isBoundary(breakiterator
, matchend
)) {
588 *end
= ubrk_following(breakiterator
, matchend
);
591 /* Check the start of the matched text to make sure it doesn't have any accents
592 * before it. This code may not be necessary and so it is commented out */
593 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
594 *start = ubrk_preceding(breakiterator, matchstart);
601 * Determine whether the target text in UStringSearch bounded by the offset
602 * start and end is one or more whole units of text as
603 * determined by the breakiterator in UStringSearch.
604 * @param strsrch string search data
605 * @param start target text start offset
606 * @param end target text end offset
609 UBool
isBreakUnit(const UStringSearch
*strsrch
, int32_t start
,
612 #if !UCONFIG_NO_BREAK_ITERATION
613 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
616 int32_t startindex
= ubrk_first(breakiterator
);
617 int32_t endindex
= ubrk_last(breakiterator
);
619 // out-of-range indexes are never boundary positions
620 if (start
< startindex
|| start
> endindex
||
621 end
< startindex
|| end
> endindex
) {
624 // otherwise, we can use following() on the position before the
625 // specified one and return true of the position we get back is the
626 // one the user specified
627 UBool result
= (start
== startindex
||
628 ubrk_following(breakiterator
, start
- 1) == start
) &&
630 ubrk_following(breakiterator
, end
- 1) == end
);
632 // iterates the individual ces
633 UCollationElements
*coleiter
= strsrch
->utilIter
;
634 const UChar
*text
= strsrch
->search
->text
+
636 UErrorCode status
= U_ZERO_ERROR
;
637 ucol_setText(coleiter
, text
, end
- start
, &status
);
638 for (int32_t count
= 0; count
< strsrch
->pattern
.cesLength
;
640 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
641 if (ce
== UCOL_IGNORABLE
) {
645 if (U_FAILURE(status
) || ce
!= strsrch
->pattern
.ces
[count
]) {
649 int32_t nextce
= ucol_next(coleiter
, &status
);
650 while (ucol_getOffset(coleiter
) == (end
- start
)
651 && getCE(strsrch
, nextce
) == UCOL_IGNORABLE
) {
652 nextce
= ucol_next(coleiter
, &status
);
654 if (ucol_getOffset(coleiter
) == (end
- start
)
655 && nextce
!= UCOL_NULLORDER
) {
656 // extra collation elements at the end of the match
667 * Getting the next base character offset if current offset is an accent,
668 * or the current offset if the current character contains a base character.
669 * accents the following base character will be returned
671 * @param textoffset current offset
672 * @param textlength length of text string
673 * @return the next base character or the current offset
674 * if the current character is contains a base character.
677 inline int32_t getNextBaseOffset(const UChar
*text
,
681 if (textoffset
< textlength
) {
682 int32_t temp
= textoffset
;
683 if (getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
684 while (temp
< textlength
) {
685 int32_t result
= temp
;
686 if ((getFCD(text
, &temp
, textlength
) >>
687 SECOND_LAST_BYTE_SHIFT_
) == 0) {
698 * Gets the next base character offset depending on the string search pattern
700 * @param strsrch string search data
701 * @param textoffset current offset, one offset away from the last character
703 * @return start index of the next base character or the current offset
704 * if the current character is contains a base character.
707 inline int32_t getNextUStringSearchBaseOffset(UStringSearch
*strsrch
,
710 int32_t textlength
= strsrch
->search
->textLength
;
711 if (strsrch
->pattern
.hasSuffixAccents
&&
712 textoffset
< textlength
) {
713 int32_t temp
= textoffset
;
714 const UChar
*text
= strsrch
->search
->text
;
715 U16_BACK_1(text
, 0, temp
);
716 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
717 return getNextBaseOffset(text
, textoffset
, textlength
);
724 * Shifting the collation element iterator position forward to prepare for
725 * a following match. If the last character is a unsafe character, we'll only
726 * shift by 1 to capture contractions, normalization etc.
727 * Internal method, status assumed to be success.
728 * @param text strsrch string search data
729 * @param textoffset start text position to do search
730 * @param ce the text ce which failed the match.
731 * @param patternceindex index of the ce within the pattern ce buffer which
733 * @return final offset
736 inline int32_t shiftForward(UStringSearch
*strsrch
,
739 int32_t patternceindex
)
741 UPattern
*pattern
= &(strsrch
->pattern
);
742 if (ce
!= UCOL_NULLORDER
) {
743 int32_t shift
= pattern
->shift
[hashFromCE32(ce
)];
744 // this is to adjust for characters in the middle of the
745 // substring for matching that failed.
746 int32_t adjust
= pattern
->cesLength
- patternceindex
;
747 if (adjust
> 1 && shift
>= adjust
) {
753 textoffset
+= pattern
->defaultShiftSize
;
756 textoffset
= getNextUStringSearchBaseOffset(strsrch
, textoffset
);
757 // check for unsafe characters
758 // * if it is the start or middle of a contraction: to be done after
759 // a initial match is found
760 // * thai or lao base consonant character: similar to contraction
761 // * high surrogate character: similar to contraction
762 // * next character is a accent: shift to the next base character
765 #endif // #if BOYER_MOORE
768 * sets match not found
769 * @param strsrch string search data
772 inline void setMatchNotFound(UStringSearch
*strsrch
)
774 // this method resets the match result regardless of the error status.
775 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
776 strsrch
->search
->matchedLength
= 0;
777 if (strsrch
->search
->isForwardSearching
) {
778 setColEIterOffset(strsrch
->textIter
, strsrch
->search
->textLength
);
781 setColEIterOffset(strsrch
->textIter
, 0);
787 * Gets the offset to the next safe point in text.
788 * ie. not the middle of a contraction, swappable characters or supplementary
790 * @param collator collation sata
791 * @param text string to work with
792 * @param textoffset offset in string
793 * @param textlength length of text string
794 * @return offset to the next safe character
797 inline int32_t getNextSafeOffset(const UCollator
*collator
,
802 int32_t result
= textoffset
; // first contraction character
803 while (result
!= textlength
&& ucol_unsafeCP(text
[result
], collator
)) {
810 * This checks for accents in the potential match started with a .
811 * composite character.
812 * This is really painful... we have to check that composite character do not
813 * have any extra accents. We have to normalize the potential match and find
814 * the immediate decomposed character before the match.
815 * The first composite character would have been taken care of by the fcd
816 * checks in checkForwardExactMatch.
817 * This is the slow path after the fcd of the first character and
818 * the last character has been checked by checkForwardExactMatch and we
819 * determine that the potential match has extra non-ignorable preceding
821 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
822 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
823 * Note here that accents checking are slow and cautioned in the API docs.
824 * Internal method, status assumed to be a success, caller should check status
825 * before calling this method
826 * @param strsrch string search data
827 * @param start index of the potential unfriendly composite character
828 * @param end index of the potential unfriendly composite character
829 * @param status output error status if any.
830 * @return TRUE if there is non-ignorable accents before at the beginning
831 * of the match, FALSE otherwise.
835 UBool
checkExtraMatchAccents(const UStringSearch
*strsrch
, int32_t start
,
839 UBool result
= FALSE
;
840 if (strsrch
->pattern
.hasPrefixAccents
) {
841 int32_t length
= end
- start
;
843 const UChar
*text
= strsrch
->search
->text
+ start
;
845 U16_FWD_1(text
, offset
, length
);
846 // we are only concerned with the first composite character
847 if (unorm_quickCheck(text
, offset
, UNORM_NFD
, status
) == UNORM_NO
) {
848 int32_t safeoffset
= getNextSafeOffset(strsrch
->collator
,
850 if (safeoffset
!= length
) {
854 UChar buffer
[INITIAL_ARRAY_SIZE_
];
855 int32_t size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0,
856 buffer
, INITIAL_ARRAY_SIZE_
,
858 if (U_FAILURE(*status
)) {
861 if (size
>= INITIAL_ARRAY_SIZE_
) {
862 norm
= (UChar
*)allocateMemory((size
+ 1) * sizeof(UChar
),
864 // if allocation failed, status will be set to
865 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
867 size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0, norm
,
869 if (U_FAILURE(*status
) && norm
!= NULL
) {
878 UCollationElements
*coleiter
= strsrch
->utilIter
;
879 ucol_setText(coleiter
, norm
, size
, status
);
880 uint32_t firstce
= strsrch
->pattern
.ces
[0];
881 UBool ignorable
= TRUE
;
882 uint32_t ce
= UCOL_IGNORABLE
;
883 while (U_SUCCESS(*status
) && ce
!= firstce
&& ce
!= (uint32_t)UCOL_NULLORDER
) {
884 offset
= ucol_getOffset(coleiter
);
885 if (ce
!= firstce
&& ce
!= UCOL_IGNORABLE
) {
888 ce
= ucol_next(coleiter
, status
);
891 U16_PREV(norm
, 0, offset
, codepoint
);
892 result
= !ignorable
&& (u_getCombiningClass(codepoint
) != 0);
894 if (norm
!= buffer
) {
904 * Used by exact matches, checks if there are accents before the match.
905 * This is really painful... we have to check that composite characters at
906 * the start of the matches have to not have any extra accents.
907 * We check the FCD of the character first, if it starts with an accent and
908 * the first pattern ce does not match the first ce of the character, we bail.
909 * Otherwise we try normalizing the first composite
910 * character and find the immediate decomposed character before the match to
911 * see if it is an non-ignorable accent.
912 * Now normalizing the first composite character is enough because we ensure
913 * that when the match is passed in here with extra beginning ces, the
914 * first or last ce that match has to occur within the first character.
915 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
916 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
917 * Note here that accents checking are slow and cautioned in the API docs.
918 * @param strsrch string search data
919 * @param start offset
921 * @return TRUE if there are accents on either side of the match,
925 UBool
hasAccentsBeforeMatch(const UStringSearch
*strsrch
, int32_t start
,
928 if (strsrch
->pattern
.hasPrefixAccents
) {
929 UCollationElements
*coleiter
= strsrch
->textIter
;
930 UErrorCode status
= U_ZERO_ERROR
;
931 // we have been iterating forwards previously
932 uint32_t ignorable
= TRUE
;
933 int32_t firstce
= strsrch
->pattern
.ces
[0];
935 setColEIterOffset(coleiter
, start
);
936 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
937 if (U_FAILURE(status
)) {
940 while (ce
!= firstce
) {
941 if (ce
!= UCOL_IGNORABLE
) {
944 ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
945 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
949 if (!ignorable
&& inNormBuf(coleiter
)) {
950 // within normalization buffer, discontiguous handled here
955 int32_t temp
= start
;
957 // accent = (getFCD(strsrch->search->text, &temp,
958 // strsrch->search->textLength)
959 // >> SECOND_LAST_BYTE_SHIFT_);
960 // however this code does not work well with VC7 .net in release mode.
961 // maybe the inlines for getFCD combined with shifting has bugs in
962 // VC7. anyways this is a work around.
963 UBool accent
= getFCD(strsrch
->search
->text
, &temp
,
964 strsrch
->search
->textLength
) > 0xFF;
966 return checkExtraMatchAccents(strsrch
, start
, end
, &status
);
973 U16_BACK_1(strsrch
->search
->text
, 0, temp
);
974 if (getFCD(strsrch
->search
->text
, &temp
,
975 strsrch
->search
->textLength
) & LAST_BYTE_MASK_
) {
976 setColEIterOffset(coleiter
, start
);
977 ce
= ucol_previous(coleiter
, &status
);
978 if (U_FAILURE(status
) ||
979 (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
)) {
990 * Used by exact matches, checks if there are accents bounding the match.
991 * Note this is the initial boundary check. If the potential match
992 * starts or ends with composite characters, the accents in those
993 * characters will be determined later.
994 * Not doing backwards iteration here, since discontiguos contraction for
995 * backwards collation element iterator, use up too many characters.
996 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
997 * should fail since there is a acute at the end of \u01FA
998 * Note here that accents checking are slow and cautioned in the API docs.
999 * @param strsrch string search data
1000 * @param start offset of match
1001 * @param end end offset of the match
1002 * @return TRUE if there are accents on either side of the match,
1006 UBool
hasAccentsAfterMatch(const UStringSearch
*strsrch
, int32_t start
,
1009 if (strsrch
->pattern
.hasSuffixAccents
) {
1010 const UChar
*text
= strsrch
->search
->text
;
1012 int32_t textlength
= strsrch
->search
->textLength
;
1013 U16_BACK_1(text
, 0, temp
);
1014 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
1015 int32_t firstce
= strsrch
->pattern
.ces
[0];
1016 UCollationElements
*coleiter
= strsrch
->textIter
;
1017 UErrorCode status
= U_ZERO_ERROR
;
1019 setColEIterOffset(coleiter
, start
);
1020 while ((ce
= getCE(strsrch
, ucol_next(coleiter
, &status
))) != firstce
) {
1021 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
1026 while (count
< strsrch
->pattern
.cesLength
) {
1027 if (getCE(strsrch
, ucol_next(coleiter
, &status
))
1028 == UCOL_IGNORABLE
) {
1029 // Thai can give an ignorable here.
1032 if (U_FAILURE(status
)) {
1038 ce
= ucol_next(coleiter
, &status
);
1039 if (U_FAILURE(status
)) {
1042 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1043 ce
= getCE(strsrch
, ce
);
1045 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1046 if (ucol_getOffset(coleiter
) <= end
) {
1049 if (getFCD(text
, &end
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
1057 #endif // #if BOYER_MOORE
1060 * Checks if the offset runs out of the text string
1062 * @param textlength of the text string
1063 * @return TRUE if offset is out of bounds, FALSE otherwise
1066 inline UBool
isOutOfBounds(int32_t textlength
, int32_t offset
)
1068 return offset
< 0 || offset
> textlength
;
1072 * Checks for identical match
1073 * @param strsrch string search data
1074 * @param start offset of possible match
1075 * @param end offset of possible match
1076 * @return TRUE if identical match is found
1079 inline UBool
checkIdentical(const UStringSearch
*strsrch
, int32_t start
,
1082 if (strsrch
->strength
!= UCOL_IDENTICAL
) {
1086 // Note: We could use Normalizer::compare() or similar, but for short strings
1087 // which may not be in FCD it might be faster to just NFD them.
1088 UErrorCode status
= U_ZERO_ERROR
;
1089 UnicodeString t2
, p2
;
1090 strsrch
->nfd
->normalize(
1091 UnicodeString(FALSE
, strsrch
->search
->text
+ start
, end
- start
), t2
, status
);
1092 strsrch
->nfd
->normalize(
1093 UnicodeString(FALSE
, strsrch
->pattern
.text
, strsrch
->pattern
.textLength
), p2
, status
);
1094 // return FALSE if NFD failed
1095 return U_SUCCESS(status
) && t2
== p2
;
1100 * Checks to see if the match is repeated
1101 * @param strsrch string search data
1102 * @param start new match start index
1103 * @param end new match end index
1104 * @return TRUE if the the match is repeated, FALSE otherwise
1107 inline UBool
checkRepeatedMatch(UStringSearch
*strsrch
,
1111 int32_t lastmatchindex
= strsrch
->search
->matchedIndex
;
1113 if (lastmatchindex
== USEARCH_DONE
) {
1116 if (strsrch
->search
->isForwardSearching
) {
1117 result
= start
<= lastmatchindex
;
1120 result
= start
>= lastmatchindex
;
1122 if (!result
&& !strsrch
->search
->isOverlap
) {
1123 if (strsrch
->search
->isForwardSearching
) {
1124 result
= start
< lastmatchindex
+ strsrch
->search
->matchedLength
;
1127 result
= end
> lastmatchindex
;
1134 * Gets the collation element iterator's current offset.
1135 * @param coleiter collation element iterator
1136 * @param forwards flag TRUE if we are moving in th forwards direction
1137 * @return current offset
1140 inline int32_t getColElemIterOffset(const UCollationElements
*coleiter
,
1143 int32_t result
= ucol_getOffset(coleiter
);
1144 // intricacies of the the backwards collation element iterator
1145 if (FALSE
&& !forwards
&& inNormBuf(coleiter
) && !isFCDPointerNull(coleiter
)) {
1152 * Checks match for contraction.
1153 * If the match ends with a partial contraction we fail.
1154 * If the match starts too far off (because of backwards iteration) we try to
1155 * chip off the extra characters depending on whether a breakiterator has
1157 * Internal method, error assumed to be success, caller has to check status
1158 * before calling this method.
1159 * @param strsrch string search data
1160 * @param start offset of potential match, to be modified if necessary
1161 * @param end offset of potential match, to be modified if necessary
1162 * @param status output error status if any
1163 * @return TRUE if match passes the contraction test, FALSE otherwise
1167 UBool
checkNextExactContractionMatch(UStringSearch
*strsrch
,
1169 int32_t *end
, UErrorCode
*status
)
1171 UCollationElements
*coleiter
= strsrch
->textIter
;
1172 int32_t textlength
= strsrch
->search
->textLength
;
1173 int32_t temp
= *start
;
1174 const UCollator
*collator
= strsrch
->collator
;
1175 const UChar
*text
= strsrch
->search
->text
;
1176 // This part checks if either ends of the match contains potential
1177 // contraction. If so we'll have to iterate through them
1178 // The start contraction needs to be checked since ucol_previous dumps
1179 // all characters till the first safe character into the buffer.
1180 // *start + 1 is used to test for the unsafe characters instead of *start
1181 // because ucol_prev takes all unsafe characters till the first safe
1182 // character ie *start. so by testing *start + 1, we can estimate if
1183 // excess prefix characters has been included in the potential search
1185 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1186 (*start
+ 1 < textlength
1187 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1188 int32_t expansion
= getExpansionPrefix(coleiter
);
1189 UBool expandflag
= expansion
> 0;
1190 setColEIterOffset(coleiter
, *start
);
1191 while (expansion
> 0) {
1192 // getting rid of the redundant ce, caused by setOffset.
1193 // since backward contraction/expansion may have extra ces if we
1194 // are in the normalization buffer, hasAccentsBeforeMatch would
1195 // have taken care of it.
1196 // E.g. the character \u01FA will have an expansion of 3, but if
1197 // we are only looking for acute and ring \u030A and \u0301, we'll
1198 // have to skip the first ce in the expansion buffer.
1199 ucol_next(coleiter
, status
);
1200 if (U_FAILURE(*status
)) {
1203 if (ucol_getOffset(coleiter
) != temp
) {
1205 temp
= ucol_getOffset(coleiter
);
1210 int32_t *patternce
= strsrch
->pattern
.ces
;
1211 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
1213 while (count
< patterncelength
) {
1214 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1215 if (ce
== UCOL_IGNORABLE
) {
1218 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1220 temp
= ucol_getOffset(coleiter
);
1222 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1224 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1234 * Checks and sets the match information if found.
1237 * <li> the potential match does not repeat the previous match
1238 * <li> boundaries are correct
1239 * <li> exact matches has no extra accents
1240 * <li> identical matchesb
1241 * <li> potential match does not end in the middle of a contraction
1243 * Otherwise the offset will be shifted to the next character.
1244 * Internal method, status assumed to be success, caller has to check status
1245 * before calling this method.
1246 * @param strsrch string search data
1247 * @param textoffset offset in the collation element text. the returned value
1248 * will be the truncated end offset of the match or the new start
1250 * @param status output error status if any
1251 * @return TRUE if the match is valid, FALSE otherwise
1254 inline UBool
checkNextExactMatch(UStringSearch
*strsrch
,
1255 int32_t *textoffset
, UErrorCode
*status
)
1257 UCollationElements
*coleiter
= strsrch
->textIter
;
1258 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1260 if (!checkNextExactContractionMatch(strsrch
, &start
, textoffset
, status
)) {
1264 // this totally matches, however we need to check if it is repeating
1265 if (!isBreakUnit(strsrch
, start
, *textoffset
) ||
1266 checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1267 hasAccentsBeforeMatch(strsrch
, start
, *textoffset
) ||
1268 !checkIdentical(strsrch
, start
, *textoffset
) ||
1269 hasAccentsAfterMatch(strsrch
, start
, *textoffset
)) {
1272 *textoffset
= getNextUStringSearchBaseOffset(strsrch
, *textoffset
);
1276 //Add breakiterator boundary check for primary strength search.
1277 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
1278 checkBreakBoundary(strsrch
, &start
, textoffset
);
1281 // totally match, we will get rid of the ending ignorables.
1282 strsrch
->search
->matchedIndex
= start
;
1283 strsrch
->search
->matchedLength
= *textoffset
- start
;
1288 * Getting the previous base character offset, or the current offset if the
1289 * current character is a base character
1290 * @param text string
1291 * @param textoffset one offset after the current character
1292 * @return the offset of the next character after the base character or the first
1293 * composed character with accents
1296 inline int32_t getPreviousBaseOffset(const UChar
*text
,
1299 if (textoffset
> 0) {
1301 int32_t result
= textoffset
;
1302 U16_BACK_1(text
, 0, textoffset
);
1303 int32_t temp
= textoffset
;
1304 uint16_t fcd
= getFCD(text
, &temp
, result
);
1305 if ((fcd
>> SECOND_LAST_BYTE_SHIFT_
) == 0) {
1306 if (fcd
& LAST_BYTE_MASK_
) {
1311 if (textoffset
== 0) {
1320 * Getting the indexes of the accents that are not blocked in the argument
1322 * @param accents array of accents in nfd terminated by a 0.
1323 * @param accentsindex array of indexes of the accents that are not blocked
1326 inline int getUnblockedAccentIndex(UChar
*accents
, int32_t *accentsindex
)
1329 int32_t length
= u_strlen(accents
);
1330 UChar32 codepoint
= 0;
1334 while (index
< length
) {
1336 U16_NEXT(accents
, index
, length
, codepoint
);
1337 if (u_getCombiningClass(codepoint
) != cclass
) {
1338 cclass
= u_getCombiningClass(codepoint
);
1339 accentsindex
[result
] = temp
;
1343 accentsindex
[result
] = length
;
1348 * Appends 3 UChar arrays to a destination array.
1349 * Creates a new array if we run out of space. The caller will have to
1350 * manually deallocate the newly allocated array.
1351 * Internal method, status assumed to be success, caller has to check status
1352 * before calling this method. destination not to be NULL and has at least
1353 * size destinationlength.
1354 * @param destination target array
1355 * @param destinationlength target array size, returning the appended length
1356 * @param source1 null-terminated first array
1357 * @param source2 second array
1358 * @param source2length length of seond array
1359 * @param source3 null-terminated third array
1360 * @param status error status if any
1361 * @return new destination array, destination if there was no new allocation
1364 inline UChar
* addToUCharArray( UChar
*destination
,
1365 int32_t *destinationlength
,
1366 const UChar
*source1
,
1367 const UChar
*source2
,
1368 int32_t source2length
,
1369 const UChar
*source3
,
1372 int32_t source1length
= source1
? u_strlen(source1
) : 0;
1373 int32_t source3length
= source3
? u_strlen(source3
) : 0;
1374 if (*destinationlength
< source1length
+ source2length
+ source3length
+
1377 destination
= (UChar
*)allocateMemory(
1378 (source1length
+ source2length
+ source3length
+ 1) * sizeof(UChar
),
1380 // if error allocating memory, status will be
1381 // U_MEMORY_ALLOCATION_ERROR
1382 if (U_FAILURE(*status
)) {
1383 *destinationlength
= 0;
1387 if (source1length
!= 0) {
1388 u_memcpy(destination
, source1
, source1length
);
1390 if (source2length
!= 0) {
1391 uprv_memcpy(destination
+ source1length
, source2
,
1392 sizeof(UChar
) * source2length
);
1394 if (source3length
!= 0) {
1395 uprv_memcpy(destination
+ source1length
+ source2length
, source3
,
1396 sizeof(UChar
) * source3length
);
1398 *destinationlength
= source1length
+ source2length
+ source3length
;
1403 * Running through a collation element iterator to see if the contents matches
1404 * pattern in string search data
1405 * @param strsrch string search data
1406 * @param coleiter collation element iterator
1407 * @return TRUE if a match if found, FALSE otherwise
1410 inline UBool
checkCollationMatch(const UStringSearch
*strsrch
,
1411 UCollationElements
*coleiter
)
1413 int patternceindex
= strsrch
->pattern
.cesLength
;
1414 int32_t *patternce
= strsrch
->pattern
.ces
;
1415 UErrorCode status
= U_ZERO_ERROR
;
1416 while (patternceindex
> 0) {
1417 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
1418 if (ce
== UCOL_IGNORABLE
) {
1421 if (U_FAILURE(status
) || ce
!= *patternce
) {
1431 * Rearranges the front accents to try matching.
1432 * Prefix accents in the text will be grouped according to their combining
1433 * class and the groups will be mixed and matched to try find the perfect
1434 * match with the pattern.
1435 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1436 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1437 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1439 * step 2: check if any of the generated substrings matches the pattern.
1440 * Internal method, status is assumed to be success, caller has to check status
1441 * before calling this method.
1442 * @param strsrch string search match
1443 * @param start first offset of the accents to start searching
1444 * @param end start of the last accent set
1445 * @param status output error status if any
1446 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1447 * offset of the match. Note this start includes all preceding accents.
1450 int32_t doNextCanonicalPrefixMatch(UStringSearch
*strsrch
,
1455 const UChar
*text
= strsrch
->search
->text
;
1456 int32_t textlength
= strsrch
->search
->textLength
;
1457 int32_t tempstart
= start
;
1459 if ((getFCD(text
, &tempstart
, textlength
) & LAST_BYTE_MASK_
) == 0) {
1460 // die... failed at a base character
1461 return USEARCH_DONE
;
1464 int32_t offset
= getNextBaseOffset(text
, tempstart
, textlength
);
1465 start
= getPreviousBaseOffset(text
, tempstart
);
1467 UChar accents
[INITIAL_ARRAY_SIZE_
];
1468 // normalizing the offensive string
1469 unorm_normalize(text
+ start
, offset
- start
, UNORM_NFD
, 0, accents
,
1470 INITIAL_ARRAY_SIZE_
, status
);
1471 if (U_FAILURE(*status
)) {
1472 return USEARCH_DONE
;
1475 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1476 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1478 int32_t count
= (2 << (accentsize
- 1)) - 1;
1479 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1480 UCollationElements
*coleiter
= strsrch
->utilIter
;
1481 while (U_SUCCESS(*status
) && count
> 0) {
1482 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
1483 // copy the base characters
1484 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1485 *rearrange
++ = accents
[k
];
1487 // forming all possible canonical rearrangement by dropping
1489 for (int i
= 0; i
<= accentsize
- 1; i
++) {
1490 int32_t mask
= 1 << (accentsize
- i
- 1);
1492 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1493 *rearrange
++ = accents
[j
];
1498 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
1499 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
1500 strsrch
->canonicalPrefixAccents
,
1501 strsrch
->search
->text
+ offset
,
1503 strsrch
->canonicalSuffixAccents
,
1506 // if status is a failure, ucol_setText does nothing.
1507 // run the collator iterator through this match
1508 ucol_setText(coleiter
, match
, matchsize
, status
);
1509 if (U_SUCCESS(*status
)) {
1510 if (checkCollationMatch(strsrch
, coleiter
)) {
1511 if (match
!= buffer
) {
1519 return USEARCH_DONE
;
1523 * Gets the offset to the safe point in text before textoffset.
1524 * ie. not the middle of a contraction, swappable characters or supplementary
1526 * @param collator collation sata
1527 * @param text string to work with
1528 * @param textoffset offset in string
1529 * @param textlength length of text string
1530 * @return offset to the previous safe character
1533 inline uint32_t getPreviousSafeOffset(const UCollator
*collator
,
1537 int32_t result
= textoffset
; // first contraction character
1538 while (result
!= 0 && ucol_unsafeCP(text
[result
- 1], collator
)) {
1542 // the first contraction character is consider unsafe here
1549 * Cleaning up after we passed the safe zone
1550 * @param strsrch string search data
1551 * @param safetext safe text array
1552 * @param safebuffer safe text buffer
1553 * @param coleiter collation element iterator for safe text
1556 inline void cleanUpSafeText(const UStringSearch
*strsrch
, UChar
*safetext
,
1559 if (safetext
!= safebuffer
&& safetext
!= strsrch
->canonicalSuffixAccents
)
1561 uprv_free(safetext
);
1566 * Take the rearranged end accents and tries matching. If match failed at
1567 * a seperate preceding set of accents (seperated from the rearranged on by
1568 * at least a base character) then we rearrange the preceding accents and
1569 * tries matching again.
1570 * We allow skipping of the ends of the accent set if the ces do not match.
1571 * However if the failure is found before the accent set, it fails.
1572 * Internal method, status assumed to be success, caller has to check status
1573 * before calling this method.
1574 * @param strsrch string search data
1575 * @param textoffset of the start of the rearranged accent
1576 * @param status output error status if any
1577 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1578 * offset of the match. Note this start includes all preceding accents.
1581 int32_t doNextCanonicalSuffixMatch(UStringSearch
*strsrch
,
1585 const UChar
*text
= strsrch
->search
->text
;
1586 const UCollator
*collator
= strsrch
->collator
;
1587 int32_t safelength
= 0;
1589 int32_t safetextlength
;
1590 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
1591 UCollationElements
*coleiter
= strsrch
->utilIter
;
1592 int32_t safeoffset
= textoffset
;
1594 if (textoffset
!= 0 && ucol_unsafeCP(strsrch
->canonicalSuffixAccents
[0],
1596 safeoffset
= getPreviousSafeOffset(collator
, text
, textoffset
);
1597 safelength
= textoffset
- safeoffset
;
1598 safetextlength
= INITIAL_ARRAY_SIZE_
;
1599 safetext
= addToUCharArray(safebuffer
, &safetextlength
, NULL
,
1600 text
+ safeoffset
, safelength
,
1601 strsrch
->canonicalSuffixAccents
,
1605 safetextlength
= u_strlen(strsrch
->canonicalSuffixAccents
);
1606 safetext
= strsrch
->canonicalSuffixAccents
;
1609 // if status is a failure, ucol_setText does nothing
1610 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
1611 // status checked in loop below
1613 int32_t *ce
= strsrch
->pattern
.ces
;
1614 int32_t celength
= strsrch
->pattern
.cesLength
;
1615 int ceindex
= celength
- 1;
1616 UBool isSafe
= TRUE
; // indication flag for position in safe zone
1618 while (ceindex
>= 0) {
1619 int32_t textce
= ucol_previous(coleiter
, status
);
1620 if (U_FAILURE(*status
)) {
1622 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1624 return USEARCH_DONE
;
1626 if (textce
== UCOL_NULLORDER
) {
1627 // check if we have passed the safe buffer
1628 if (coleiter
== strsrch
->textIter
) {
1629 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1630 return USEARCH_DONE
;
1632 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1633 safetext
= safebuffer
;
1634 coleiter
= strsrch
->textIter
;
1635 setColEIterOffset(coleiter
, safeoffset
);
1636 // status checked at the start of the loop
1640 textce
= getCE(strsrch
, textce
);
1641 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
1642 // do the beginning stuff
1643 int32_t failedoffset
= getColElemIterOffset(coleiter
, FALSE
);
1644 if (isSafe
&& failedoffset
>= safelength
) {
1645 // alas... no hope. failed at rearranged accent set
1646 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1647 return USEARCH_DONE
;
1651 failedoffset
+= safeoffset
;
1652 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1655 // try rearranging the front accents
1656 int32_t result
= doNextCanonicalPrefixMatch(strsrch
,
1657 failedoffset
, textoffset
, status
);
1658 if (result
!= USEARCH_DONE
) {
1659 // if status is a failure, ucol_setOffset does nothing
1660 setColEIterOffset(strsrch
->textIter
, result
);
1662 if (U_FAILURE(*status
)) {
1663 return USEARCH_DONE
;
1668 if (textce
== ce
[ceindex
]) {
1674 int32_t result
= getColElemIterOffset(coleiter
, FALSE
);
1675 // sets the text iterator here with the correct expansion and offset
1676 int32_t leftoverces
= getExpansionPrefix(coleiter
);
1677 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1678 if (result
>= safelength
) {
1679 result
= textoffset
;
1682 result
+= safeoffset
;
1684 setColEIterOffset(strsrch
->textIter
, result
);
1685 strsrch
->textIter
->iteratordata_
.toReturn
=
1686 setExpansionPrefix(strsrch
->textIter
, leftoverces
);
1690 return ucol_getOffset(coleiter
);
1694 * Trying out the substring and sees if it can be a canonical match.
1695 * This will try normalizing the end accents and arranging them into canonical
1696 * equivalents and check their corresponding ces with the pattern ce.
1697 * Suffix accents in the text will be grouped according to their combining
1698 * class and the groups will be mixed and matched to try find the perfect
1699 * match with the pattern.
1700 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1701 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1702 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1704 * step 2: check if any of the generated substrings matches the pattern.
1705 * Internal method, status assumed to be success, caller has to check status
1706 * before calling this method.
1707 * @param strsrch string search data
1708 * @param textoffset end offset in the collation element text that ends with
1709 * the accents to be rearranged
1710 * @param status error status if any
1711 * @return TRUE if the match is valid, FALSE otherwise
1714 UBool
doNextCanonicalMatch(UStringSearch
*strsrch
,
1718 const UChar
*text
= strsrch
->search
->text
;
1719 int32_t temp
= textoffset
;
1720 U16_BACK_1(text
, 0, temp
);
1721 if ((getFCD(text
, &temp
, textoffset
) & LAST_BYTE_MASK_
) == 0) {
1722 UCollationElements
*coleiter
= strsrch
->textIter
;
1723 int32_t offset
= getColElemIterOffset(coleiter
, FALSE
);
1724 if (strsrch
->pattern
.hasPrefixAccents
) {
1725 offset
= doNextCanonicalPrefixMatch(strsrch
, offset
, textoffset
,
1727 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
1728 setColEIterOffset(coleiter
, offset
);
1735 if (!strsrch
->pattern
.hasSuffixAccents
) {
1739 UChar accents
[INITIAL_ARRAY_SIZE_
];
1740 // offset to the last base character in substring to search
1741 int32_t baseoffset
= getPreviousBaseOffset(text
, textoffset
);
1742 // normalizing the offensive string
1743 unorm_normalize(text
+ baseoffset
, textoffset
- baseoffset
, UNORM_NFD
,
1744 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
1745 // status checked in loop below
1747 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1748 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
1750 // 2 power n - 1 plus the full set of accents
1751 int32_t count
= (2 << (size
- 1)) - 1;
1752 while (U_SUCCESS(*status
) && count
> 0) {
1753 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
1754 // copy the base characters
1755 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1756 *rearrange
++ = accents
[k
];
1758 // forming all possible canonical rearrangement by dropping
1760 for (int i
= 0; i
<= size
- 1; i
++) {
1761 int32_t mask
= 1 << (size
- i
- 1);
1763 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1764 *rearrange
++ = accents
[j
];
1769 int32_t offset
= doNextCanonicalSuffixMatch(strsrch
, baseoffset
,
1771 if (offset
!= USEARCH_DONE
) {
1772 return TRUE
; // match found
1780 * Gets the previous base character offset depending on the string search
1782 * @param strsrch string search data
1783 * @param textoffset current offset, current character
1784 * @return the offset of the next character after this base character or itself
1785 * if it is a composed character with accents
1788 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch
*strsrch
,
1791 if (strsrch
->pattern
.hasPrefixAccents
&& textoffset
> 0) {
1792 const UChar
*text
= strsrch
->search
->text
;
1793 int32_t offset
= textoffset
;
1794 if (getFCD(text
, &offset
, strsrch
->search
->textLength
) >>
1795 SECOND_LAST_BYTE_SHIFT_
) {
1796 return getPreviousBaseOffset(text
, textoffset
);
1803 * Checks match for contraction.
1804 * If the match ends with a partial contraction we fail.
1805 * If the match starts too far off (because of backwards iteration) we try to
1806 * chip off the extra characters
1807 * Internal method, status assumed to be success, caller has to check status
1808 * before calling this method.
1809 * @param strsrch string search data
1810 * @param start offset of potential match, to be modified if necessary
1811 * @param end offset of potential match, to be modified if necessary
1812 * @param status output error status if any
1813 * @return TRUE if match passes the contraction test, FALSE otherwise
1816 UBool
checkNextCanonicalContractionMatch(UStringSearch
*strsrch
,
1821 UCollationElements
*coleiter
= strsrch
->textIter
;
1822 int32_t textlength
= strsrch
->search
->textLength
;
1823 int32_t temp
= *start
;
1824 const UCollator
*collator
= strsrch
->collator
;
1825 const UChar
*text
= strsrch
->search
->text
;
1826 // This part checks if either ends of the match contains potential
1827 // contraction. If so we'll have to iterate through them
1828 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1829 (*start
+ 1 < textlength
1830 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1831 int32_t expansion
= getExpansionPrefix(coleiter
);
1832 UBool expandflag
= expansion
> 0;
1833 setColEIterOffset(coleiter
, *start
);
1834 while (expansion
> 0) {
1835 // getting rid of the redundant ce, caused by setOffset.
1836 // since backward contraction/expansion may have extra ces if we
1837 // are in the normalization buffer, hasAccentsBeforeMatch would
1838 // have taken care of it.
1839 // E.g. the character \u01FA will have an expansion of 3, but if
1840 // we are only looking for acute and ring \u030A and \u0301, we'll
1841 // have to skip the first ce in the expansion buffer.
1842 ucol_next(coleiter
, status
);
1843 if (U_FAILURE(*status
)) {
1846 if (ucol_getOffset(coleiter
) != temp
) {
1848 temp
= ucol_getOffset(coleiter
);
1853 int32_t *patternce
= strsrch
->pattern
.ces
;
1854 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
1856 int32_t textlength
= strsrch
->search
->textLength
;
1857 while (count
< patterncelength
) {
1858 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1859 // status checked below, note that if status is a failure
1860 // ucol_next returns UCOL_NULLORDER
1861 if (ce
== UCOL_IGNORABLE
) {
1864 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1866 temp
= ucol_getOffset(coleiter
);
1869 if (count
== 0 && ce
!= patternce
[0]) {
1870 // accents may have extra starting ces, this occurs when a
1871 // pure accent pattern is matched without rearrangement
1872 // text \u0325\u0300 and looking for \u0300
1873 int32_t expected
= patternce
[0];
1874 if (getFCD(text
, start
, textlength
) & LAST_BYTE_MASK_
) {
1875 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1876 while (U_SUCCESS(*status
) && ce
!= expected
&&
1877 ce
!= UCOL_NULLORDER
&&
1878 ucol_getOffset(coleiter
) <= *end
) {
1879 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1883 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1885 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1895 * Checks and sets the match information if found.
1898 * <li> the potential match does not repeat the previous match
1899 * <li> boundaries are correct
1900 * <li> potential match does not end in the middle of a contraction
1901 * <li> identical matches
1903 * Otherwise the offset will be shifted to the next character.
1904 * Internal method, status assumed to be success, caller has to check the
1905 * status before calling this method.
1906 * @param strsrch string search data
1907 * @param textoffset offset in the collation element text. the returned value
1908 * will be the truncated end offset of the match or the new start
1910 * @param status output error status if any
1911 * @return TRUE if the match is valid, FALSE otherwise
1914 inline UBool
checkNextCanonicalMatch(UStringSearch
*strsrch
,
1915 int32_t *textoffset
,
1918 // to ensure that the start and ends are not composite characters
1919 UCollationElements
*coleiter
= strsrch
->textIter
;
1920 // if we have a canonical accent match
1921 if ((strsrch
->pattern
.hasSuffixAccents
&&
1922 strsrch
->canonicalSuffixAccents
[0]) ||
1923 (strsrch
->pattern
.hasPrefixAccents
&&
1924 strsrch
->canonicalPrefixAccents
[0])) {
1925 strsrch
->search
->matchedIndex
= getPreviousUStringSearchBaseOffset(
1927 ucol_getOffset(coleiter
));
1928 strsrch
->search
->matchedLength
= *textoffset
-
1929 strsrch
->search
->matchedIndex
;
1933 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1934 if (!checkNextCanonicalContractionMatch(strsrch
, &start
, textoffset
,
1935 status
) || U_FAILURE(*status
)) {
1939 start
= getPreviousUStringSearchBaseOffset(strsrch
, start
);
1940 // this totally matches, however we need to check if it is repeating
1941 if (checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1942 !isBreakUnit(strsrch
, start
, *textoffset
) ||
1943 !checkIdentical(strsrch
, start
, *textoffset
)) {
1945 *textoffset
= getNextBaseOffset(strsrch
->search
->text
, *textoffset
,
1946 strsrch
->search
->textLength
);
1950 strsrch
->search
->matchedIndex
= start
;
1951 strsrch
->search
->matchedLength
= *textoffset
- start
;
1956 * Shifting the collation element iterator position forward to prepare for
1957 * a preceding match. If the first character is a unsafe character, we'll only
1958 * shift by 1 to capture contractions, normalization etc.
1959 * Internal method, status assumed to be success, caller has to check status
1960 * before calling this method.
1961 * @param text strsrch string search data
1962 * @param textoffset start text position to do search
1963 * @param ce the text ce which failed the match.
1964 * @param patternceindex index of the ce within the pattern ce buffer which
1966 * @return final offset
1969 inline int32_t reverseShift(UStringSearch
*strsrch
,
1972 int32_t patternceindex
)
1974 if (strsrch
->search
->isOverlap
) {
1975 if (textoffset
!= strsrch
->search
->textLength
) {
1979 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1983 if (ce
!= UCOL_NULLORDER
) {
1984 int32_t shift
= strsrch
->pattern
.backShift
[hashFromCE32(ce
)];
1986 // this is to adjust for characters in the middle of the substring
1987 // for matching that failed.
1988 int32_t adjust
= patternceindex
;
1989 if (adjust
> 1 && shift
> adjust
) {
1990 shift
-= adjust
- 1;
1992 textoffset
-= shift
;
1995 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1998 textoffset
= getPreviousUStringSearchBaseOffset(strsrch
, textoffset
);
2003 * Checks match for contraction.
2004 * If the match starts with a partial contraction we fail.
2005 * Internal method, status assumed to be success, caller has to check status
2006 * before calling this method.
2007 * @param strsrch string search data
2008 * @param start offset of potential match, to be modified if necessary
2009 * @param end offset of potential match, to be modified if necessary
2010 * @param status output error status if any
2011 * @return TRUE if match passes the contraction test, FALSE otherwise
2014 UBool
checkPreviousExactContractionMatch(UStringSearch
*strsrch
,
2016 int32_t *end
, UErrorCode
*status
)
2018 UCollationElements
*coleiter
= strsrch
->textIter
;
2019 int32_t textlength
= strsrch
->search
->textLength
;
2020 int32_t temp
= *end
;
2021 const UCollator
*collator
= strsrch
->collator
;
2022 const UChar
*text
= strsrch
->search
->text
;
2023 // This part checks if either if the start of the match contains potential
2024 // contraction. If so we'll have to iterate through them
2025 // Since we used ucol_next while previously looking for the potential
2026 // match, this guarantees that our end will not be a partial contraction,
2027 // or a partial supplementary character.
2028 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2029 int32_t expansion
= getExpansionSuffix(coleiter
);
2030 UBool expandflag
= expansion
> 0;
2031 setColEIterOffset(coleiter
, *end
);
2032 while (U_SUCCESS(*status
) && expansion
> 0) {
2033 // getting rid of the redundant ce
2034 // since forward contraction/expansion may have extra ces
2035 // if we are in the normalization buffer, hasAccentsBeforeMatch
2036 // would have taken care of it.
2037 // E.g. the character \u01FA will have an expansion of 3, but if
2038 // we are only looking for A ring A\u030A, we'll have to skip the
2039 // last ce in the expansion buffer
2040 ucol_previous(coleiter
, status
);
2041 if (U_FAILURE(*status
)) {
2044 if (ucol_getOffset(coleiter
) != temp
) {
2046 temp
= ucol_getOffset(coleiter
);
2051 int32_t *patternce
= strsrch
->pattern
.ces
;
2052 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
2053 int32_t count
= patterncelength
;
2055 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2056 // status checked below, note that if status is a failure
2057 // ucol_previous returns UCOL_NULLORDER
2058 if (ce
== UCOL_IGNORABLE
) {
2061 if (expandflag
&& count
== 0 &&
2062 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2064 temp
= ucol_getOffset(coleiter
);
2066 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2068 *start
= getPreviousBaseOffset(text
, *start
);
2078 * Checks and sets the match information if found.
2081 * <li> the current match does not repeat the last match
2082 * <li> boundaries are correct
2083 * <li> exact matches has no extra accents
2084 * <li> identical matches
2086 * Otherwise the offset will be shifted to the preceding character.
2087 * Internal method, status assumed to be success, caller has to check status
2088 * before calling this method.
2089 * @param strsrch string search data
2091 * @param coleiter collation element iterator
2092 * @param text string
2093 * @param textoffset offset in the collation element text. the returned value
2094 * will be the truncated start offset of the match or the new start
2096 * @param status output error status if any
2097 * @return TRUE if the match is valid, FALSE otherwise
2100 inline UBool
checkPreviousExactMatch(UStringSearch
*strsrch
,
2101 int32_t *textoffset
,
2104 // to ensure that the start and ends are not composite characters
2105 int32_t end
= ucol_getOffset(strsrch
->textIter
);
2106 if (!checkPreviousExactContractionMatch(strsrch
, textoffset
, &end
, status
)
2107 || U_FAILURE(*status
)) {
2111 // this totally matches, however we need to check if it is repeating
2113 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2114 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2115 hasAccentsBeforeMatch(strsrch
, *textoffset
, end
) ||
2116 !checkIdentical(strsrch
, *textoffset
, end
) ||
2117 hasAccentsAfterMatch(strsrch
, *textoffset
, end
)) {
2119 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2124 //Add breakiterator boundary check for primary strength search.
2125 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
2126 checkBreakBoundary(strsrch
, textoffset
, &end
);
2129 strsrch
->search
->matchedIndex
= *textoffset
;
2130 strsrch
->search
->matchedLength
= end
- *textoffset
;
2135 * Rearranges the end accents to try matching.
2136 * Suffix accents in the text will be grouped according to their combining
2137 * class and the groups will be mixed and matched to try find the perfect
2138 * match with the pattern.
2139 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2140 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2141 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2143 * step 2: check if any of the generated substrings matches the pattern.
2144 * Internal method, status assumed to be success, user has to check status
2145 * before calling this method.
2146 * @param strsrch string search match
2147 * @param start offset of the first base character
2148 * @param end start of the last accent set
2149 * @param status only error status if any
2150 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2151 * offset of the match. Note this start includes all following accents.
2154 int32_t doPreviousCanonicalSuffixMatch(UStringSearch
*strsrch
,
2159 const UChar
*text
= strsrch
->search
->text
;
2160 int32_t tempend
= end
;
2162 U16_BACK_1(text
, 0, tempend
);
2163 if (!(getFCD(text
, &tempend
, strsrch
->search
->textLength
) &
2165 // die... failed at a base character
2166 return USEARCH_DONE
;
2168 end
= getNextBaseOffset(text
, end
, strsrch
->search
->textLength
);
2170 if (U_SUCCESS(*status
)) {
2171 UChar accents
[INITIAL_ARRAY_SIZE_
];
2172 int32_t offset
= getPreviousBaseOffset(text
, end
);
2173 // normalizing the offensive string
2174 unorm_normalize(text
+ offset
, end
- offset
, UNORM_NFD
, 0, accents
,
2175 INITIAL_ARRAY_SIZE_
, status
);
2177 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2178 int32_t accentsize
= getUnblockedAccentIndex(accents
,
2180 int32_t count
= (2 << (accentsize
- 1)) - 1;
2181 UChar buffer
[INITIAL_ARRAY_SIZE_
];
2182 UCollationElements
*coleiter
= strsrch
->utilIter
;
2183 while (U_SUCCESS(*status
) && count
> 0) {
2184 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
2185 // copy the base characters
2186 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2187 *rearrange
++ = accents
[k
];
2189 // forming all possible canonical rearrangement by dropping
2191 for (int i
= 0; i
<= accentsize
- 1; i
++) {
2192 int32_t mask
= 1 << (accentsize
- i
- 1);
2194 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2195 *rearrange
++ = accents
[j
];
2200 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
2201 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
2202 strsrch
->canonicalPrefixAccents
,
2203 strsrch
->search
->text
+ start
,
2205 strsrch
->canonicalSuffixAccents
,
2208 // run the collator iterator through this match
2209 // if status is a failure ucol_setText does nothing
2210 ucol_setText(coleiter
, match
, matchsize
, status
);
2211 if (U_SUCCESS(*status
)) {
2212 if (checkCollationMatch(strsrch
, coleiter
)) {
2213 if (match
!= buffer
) {
2222 return USEARCH_DONE
;
2226 * Take the rearranged start accents and tries matching. If match failed at
2227 * a seperate following set of accents (seperated from the rearranged on by
2228 * at least a base character) then we rearrange the preceding accents and
2229 * tries matching again.
2230 * We allow skipping of the ends of the accent set if the ces do not match.
2231 * However if the failure is found before the accent set, it fails.
2232 * Internal method, status assumed to be success, caller has to check status
2233 * before calling this method.
2234 * @param strsrch string search data
2235 * @param textoffset of the ends of the rearranged accent
2236 * @param status output error status if any
2237 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2238 * offset of the match. Note this start includes all following accents.
2241 int32_t doPreviousCanonicalPrefixMatch(UStringSearch
*strsrch
,
2245 const UChar
*text
= strsrch
->search
->text
;
2246 const UCollator
*collator
= strsrch
->collator
;
2247 int32_t safelength
= 0;
2249 int32_t safetextlength
;
2250 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
2251 int32_t safeoffset
= textoffset
;
2254 ucol_unsafeCP(strsrch
->canonicalPrefixAccents
[
2255 u_strlen(strsrch
->canonicalPrefixAccents
) - 1
2257 safeoffset
= getNextSafeOffset(collator
, text
, textoffset
,
2258 strsrch
->search
->textLength
);
2259 safelength
= safeoffset
- textoffset
;
2260 safetextlength
= INITIAL_ARRAY_SIZE_
;
2261 safetext
= addToUCharArray(safebuffer
, &safetextlength
,
2262 strsrch
->canonicalPrefixAccents
,
2263 text
+ textoffset
, safelength
,
2267 safetextlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2268 safetext
= strsrch
->canonicalPrefixAccents
;
2271 UCollationElements
*coleiter
= strsrch
->utilIter
;
2272 // if status is a failure, ucol_setText does nothing
2273 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
2274 // status checked in loop below
2276 int32_t *ce
= strsrch
->pattern
.ces
;
2277 int32_t celength
= strsrch
->pattern
.cesLength
;
2279 UBool isSafe
= TRUE
; // safe zone indication flag for position
2280 int32_t prefixlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2282 while (ceindex
< celength
) {
2283 int32_t textce
= ucol_next(coleiter
, status
);
2284 if (U_FAILURE(*status
)) {
2286 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2288 return USEARCH_DONE
;
2290 if (textce
== UCOL_NULLORDER
) {
2291 // check if we have passed the safe buffer
2292 if (coleiter
== strsrch
->textIter
) {
2293 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2294 return USEARCH_DONE
;
2296 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2297 safetext
= safebuffer
;
2298 coleiter
= strsrch
->textIter
;
2299 setColEIterOffset(coleiter
, safeoffset
);
2300 // status checked at the start of the loop
2304 textce
= getCE(strsrch
, textce
);
2305 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
2306 // do the beginning stuff
2307 int32_t failedoffset
= ucol_getOffset(coleiter
);
2308 if (isSafe
&& failedoffset
<= prefixlength
) {
2309 // alas... no hope. failed at rearranged accent set
2310 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2311 return USEARCH_DONE
;
2315 failedoffset
= safeoffset
- failedoffset
;
2316 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2319 // try rearranging the end accents
2320 int32_t result
= doPreviousCanonicalSuffixMatch(strsrch
,
2321 textoffset
, failedoffset
, status
);
2322 if (result
!= USEARCH_DONE
) {
2323 // if status is a failure, ucol_setOffset does nothing
2324 setColEIterOffset(strsrch
->textIter
, result
);
2326 if (U_FAILURE(*status
)) {
2327 return USEARCH_DONE
;
2332 if (textce
== ce
[ceindex
]) {
2338 int32_t result
= ucol_getOffset(coleiter
);
2339 // sets the text iterator here with the correct expansion and offset
2340 int32_t leftoverces
= getExpansionSuffix(coleiter
);
2341 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2342 if (result
<= prefixlength
) {
2343 result
= textoffset
;
2346 result
= textoffset
+ (safeoffset
- result
);
2348 setColEIterOffset(strsrch
->textIter
, result
);
2349 setExpansionSuffix(strsrch
->textIter
, leftoverces
);
2353 return ucol_getOffset(coleiter
);
2357 * Trying out the substring and sees if it can be a canonical match.
2358 * This will try normalizing the starting accents and arranging them into
2359 * canonical equivalents and check their corresponding ces with the pattern ce.
2360 * Prefix accents in the text will be grouped according to their combining
2361 * class and the groups will be mixed and matched to try find the perfect
2362 * match with the pattern.
2363 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2364 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2365 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2367 * step 2: check if any of the generated substrings matches the pattern.
2368 * Internal method, status assumed to be success, caller has to check status
2369 * before calling this method.
2370 * @param strsrch string search data
2371 * @param textoffset start offset in the collation element text that starts
2372 * with the accents to be rearranged
2373 * @param status output error status if any
2374 * @return TRUE if the match is valid, FALSE otherwise
2377 UBool
doPreviousCanonicalMatch(UStringSearch
*strsrch
,
2381 const UChar
*text
= strsrch
->search
->text
;
2382 int32_t temp
= textoffset
;
2383 int32_t textlength
= strsrch
->search
->textLength
;
2384 if ((getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) == 0) {
2385 UCollationElements
*coleiter
= strsrch
->textIter
;
2386 int32_t offset
= ucol_getOffset(coleiter
);
2387 if (strsrch
->pattern
.hasSuffixAccents
) {
2388 offset
= doPreviousCanonicalSuffixMatch(strsrch
, textoffset
,
2390 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
2391 setColEIterOffset(coleiter
, offset
);
2398 if (!strsrch
->pattern
.hasPrefixAccents
) {
2402 UChar accents
[INITIAL_ARRAY_SIZE_
];
2403 // offset to the last base character in substring to search
2404 int32_t baseoffset
= getNextBaseOffset(text
, textoffset
, textlength
);
2405 // normalizing the offensive string
2406 unorm_normalize(text
+ textoffset
, baseoffset
- textoffset
, UNORM_NFD
,
2407 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
2408 // status checked in loop
2410 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2411 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
2413 // 2 power n - 1 plus the full set of accents
2414 int32_t count
= (2 << (size
- 1)) - 1;
2415 while (U_SUCCESS(*status
) && count
> 0) {
2416 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
2417 // copy the base characters
2418 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2419 *rearrange
++ = accents
[k
];
2421 // forming all possible canonical rearrangement by dropping
2423 for (int i
= 0; i
<= size
- 1; i
++) {
2424 int32_t mask
= 1 << (size
- i
- 1);
2426 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2427 *rearrange
++ = accents
[j
];
2432 int32_t offset
= doPreviousCanonicalPrefixMatch(strsrch
,
2433 baseoffset
, status
);
2434 if (offset
!= USEARCH_DONE
) {
2435 return TRUE
; // match found
2443 * Checks match for contraction.
2444 * If the match starts with a partial contraction we fail.
2445 * Internal method, status assumed to be success, caller has to check status
2446 * before calling this method.
2447 * @param strsrch string search data
2448 * @param start offset of potential match, to be modified if necessary
2449 * @param end offset of potential match, to be modified if necessary
2450 * @param status only error status if any
2451 * @return TRUE if match passes the contraction test, FALSE otherwise
2454 UBool
checkPreviousCanonicalContractionMatch(UStringSearch
*strsrch
,
2456 int32_t *end
, UErrorCode
*status
)
2458 UCollationElements
*coleiter
= strsrch
->textIter
;
2459 int32_t textlength
= strsrch
->search
->textLength
;
2460 int32_t temp
= *end
;
2461 const UCollator
*collator
= strsrch
->collator
;
2462 const UChar
*text
= strsrch
->search
->text
;
2463 // This part checks if either if the start of the match contains potential
2464 // contraction. If so we'll have to iterate through them
2465 // Since we used ucol_next while previously looking for the potential
2466 // match, this guarantees that our end will not be a partial contraction,
2467 // or a partial supplementary character.
2468 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2469 int32_t expansion
= getExpansionSuffix(coleiter
);
2470 UBool expandflag
= expansion
> 0;
2471 setColEIterOffset(coleiter
, *end
);
2472 while (expansion
> 0) {
2473 // getting rid of the redundant ce
2474 // since forward contraction/expansion may have extra ces
2475 // if we are in the normalization buffer, hasAccentsBeforeMatch
2476 // would have taken care of it.
2477 // E.g. the character \u01FA will have an expansion of 3, but if
2478 // we are only looking for A ring A\u030A, we'll have to skip the
2479 // last ce in the expansion buffer
2480 ucol_previous(coleiter
, status
);
2481 if (U_FAILURE(*status
)) {
2484 if (ucol_getOffset(coleiter
) != temp
) {
2486 temp
= ucol_getOffset(coleiter
);
2491 int32_t *patternce
= strsrch
->pattern
.ces
;
2492 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
2493 int32_t count
= patterncelength
;
2495 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2496 // status checked below, note that if status is a failure
2497 // ucol_previous returns UCOL_NULLORDER
2498 if (ce
== UCOL_IGNORABLE
) {
2501 if (expandflag
&& count
== 0 &&
2502 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2504 temp
= ucol_getOffset(coleiter
);
2506 if (count
== patterncelength
&&
2507 ce
!= patternce
[patterncelength
- 1]) {
2508 // accents may have extra starting ces, this occurs when a
2509 // pure accent pattern is matched without rearrangement
2510 int32_t expected
= patternce
[patterncelength
- 1];
2511 U16_BACK_1(text
, 0, *end
);
2512 if (getFCD(text
, end
, textlength
) & LAST_BYTE_MASK_
) {
2513 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2514 while (U_SUCCESS(*status
) && ce
!= expected
&&
2515 ce
!= UCOL_NULLORDER
&&
2516 ucol_getOffset(coleiter
) <= *start
) {
2517 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2521 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2523 *start
= getPreviousBaseOffset(text
, *start
);
2533 * Checks and sets the match information if found.
2536 * <li> the potential match does not repeat the previous match
2537 * <li> boundaries are correct
2538 * <li> potential match does not end in the middle of a contraction
2539 * <li> identical matches
2541 * Otherwise the offset will be shifted to the next character.
2542 * Internal method, status assumed to be success, caller has to check status
2543 * before calling this method.
2544 * @param strsrch string search data
2545 * @param textoffset offset in the collation element text. the returned value
2546 * will be the truncated start offset of the match or the new start
2548 * @param status only error status if any
2549 * @return TRUE if the match is valid, FALSE otherwise
2552 inline UBool
checkPreviousCanonicalMatch(UStringSearch
*strsrch
,
2553 int32_t *textoffset
,
2556 // to ensure that the start and ends are not composite characters
2557 UCollationElements
*coleiter
= strsrch
->textIter
;
2558 // if we have a canonical accent match
2559 if ((strsrch
->pattern
.hasSuffixAccents
&&
2560 strsrch
->canonicalSuffixAccents
[0]) ||
2561 (strsrch
->pattern
.hasPrefixAccents
&&
2562 strsrch
->canonicalPrefixAccents
[0])) {
2563 strsrch
->search
->matchedIndex
= *textoffset
;
2564 strsrch
->search
->matchedLength
=
2565 getNextUStringSearchBaseOffset(strsrch
,
2566 getColElemIterOffset(coleiter
, FALSE
))
2571 int32_t end
= ucol_getOffset(coleiter
);
2572 if (!checkPreviousCanonicalContractionMatch(strsrch
, textoffset
, &end
,
2574 U_FAILURE(*status
)) {
2578 end
= getNextUStringSearchBaseOffset(strsrch
, end
);
2579 // this totally matches, however we need to check if it is repeating
2580 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2581 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2582 !checkIdentical(strsrch
, *textoffset
, end
)) {
2584 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2589 strsrch
->search
->matchedIndex
= *textoffset
;
2590 strsrch
->search
->matchedLength
= end
- *textoffset
;
2593 #endif // #if BOYER_MOORE
2595 // constructors and destructor -------------------------------------------
2597 U_CAPI UStringSearch
* U_EXPORT2
usearch_open(const UChar
*pattern
,
2598 int32_t patternlength
,
2602 UBreakIterator
*breakiter
,
2605 if (U_FAILURE(*status
)) {
2608 #if UCONFIG_NO_BREAK_ITERATION
2609 if (breakiter
!= NULL
) {
2610 *status
= U_UNSUPPORTED_ERROR
;
2615 // ucol_open internally checks for status
2616 UCollator
*collator
= ucol_open(locale
, status
);
2617 // pattern, text checks are done in usearch_openFromCollator
2618 UStringSearch
*result
= usearch_openFromCollator(pattern
,
2619 patternlength
, text
, textlength
,
2620 collator
, breakiter
, status
);
2622 if (result
== NULL
|| U_FAILURE(*status
)) {
2624 ucol_close(collator
);
2629 result
->ownCollator
= TRUE
;
2633 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2637 U_CAPI UStringSearch
* U_EXPORT2
usearch_openFromCollator(
2638 const UChar
*pattern
,
2639 int32_t patternlength
,
2642 const UCollator
*collator
,
2643 UBreakIterator
*breakiter
,
2646 if (U_FAILURE(*status
)) {
2649 #if UCONFIG_NO_BREAK_ITERATION
2650 if (breakiter
!= NULL
) {
2651 *status
= U_UNSUPPORTED_ERROR
;
2655 if (pattern
== NULL
|| text
== NULL
|| collator
== NULL
) {
2656 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2660 // string search does not really work when numeric collation is turned on
2661 if(ucol_getAttribute(collator
, UCOL_NUMERIC_COLLATION
, status
) == UCOL_ON
) {
2662 *status
= U_UNSUPPORTED_ERROR
;
2666 if (U_SUCCESS(*status
)) {
2667 initializeFCD(status
);
2668 if (U_FAILURE(*status
)) {
2672 UStringSearch
*result
;
2673 if (textlength
== -1) {
2674 textlength
= u_strlen(text
);
2676 if (patternlength
== -1) {
2677 patternlength
= u_strlen(pattern
);
2679 if (textlength
<= 0 || patternlength
<= 0) {
2680 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2684 result
= (UStringSearch
*)uprv_malloc(sizeof(UStringSearch
));
2685 if (result
== NULL
) {
2686 *status
= U_MEMORY_ALLOCATION_ERROR
;
2690 result
->collator
= collator
;
2691 result
->strength
= ucol_getStrength(collator
);
2692 result
->ceMask
= getMask(result
->strength
);
2694 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2696 result
->variableTop
= ucol_getVariableTop(collator
, status
);
2698 result
->nfd
= Normalizer2::getNFDInstance(*status
);
2700 if (U_FAILURE(*status
)) {
2705 result
->search
= (USearch
*)uprv_malloc(sizeof(USearch
));
2706 if (result
->search
== NULL
) {
2707 *status
= U_MEMORY_ALLOCATION_ERROR
;
2712 result
->search
->text
= text
;
2713 result
->search
->textLength
= textlength
;
2715 result
->pattern
.text
= pattern
;
2716 result
->pattern
.textLength
= patternlength
;
2717 result
->pattern
.ces
= NULL
;
2718 result
->pattern
.pces
= NULL
;
2720 result
->search
->breakIter
= breakiter
;
2721 #if !UCONFIG_NO_BREAK_ITERATION
2722 result
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(result
->collator
, ULOC_VALID_LOCALE
, status
), text
, textlength
, status
);
2724 ubrk_setText(breakiter
, text
, textlength
, status
);
2728 result
->ownCollator
= FALSE
;
2729 result
->search
->matchedLength
= 0;
2730 result
->search
->matchedIndex
= USEARCH_DONE
;
2731 result
->utilIter
= NULL
;
2732 result
->textIter
= ucol_openElements(collator
, text
,
2733 textlength
, status
);
2734 result
->textProcessedIter
= NULL
;
2735 if (U_FAILURE(*status
)) {
2736 usearch_close(result
);
2740 result
->search
->isOverlap
= FALSE
;
2741 result
->search
->isCanonicalMatch
= FALSE
;
2742 result
->search
->elementComparisonType
= 0;
2743 result
->search
->isForwardSearching
= TRUE
;
2744 result
->search
->reset
= TRUE
;
2746 initialize(result
, status
);
2748 if (U_FAILURE(*status
)) {
2749 usearch_close(result
);
2758 U_CAPI
void U_EXPORT2
usearch_close(UStringSearch
*strsrch
)
2761 if (strsrch
->pattern
.ces
!= strsrch
->pattern
.cesBuffer
&&
2762 strsrch
->pattern
.ces
) {
2763 uprv_free(strsrch
->pattern
.ces
);
2766 if (strsrch
->pattern
.pces
!= NULL
&&
2767 strsrch
->pattern
.pces
!= strsrch
->pattern
.pcesBuffer
) {
2768 uprv_free(strsrch
->pattern
.pces
);
2771 delete strsrch
->textProcessedIter
;
2772 ucol_closeElements(strsrch
->textIter
);
2773 ucol_closeElements(strsrch
->utilIter
);
2775 if (strsrch
->ownCollator
&& strsrch
->collator
) {
2776 ucol_close((UCollator
*)strsrch
->collator
);
2779 #if !UCONFIG_NO_BREAK_ITERATION
2780 if (strsrch
->search
->internalBreakIter
) {
2781 ubrk_close(strsrch
->search
->internalBreakIter
);
2785 uprv_free(strsrch
->search
);
2792 UBool
initTextProcessedIter(UStringSearch
*strsrch
, UErrorCode
*status
) {
2793 if (U_FAILURE(*status
)) { return FALSE
; }
2794 if (strsrch
->textProcessedIter
== NULL
) {
2795 strsrch
->textProcessedIter
= new icu::UCollationPCE(strsrch
->textIter
);
2796 if (strsrch
->textProcessedIter
== NULL
) {
2797 *status
= U_MEMORY_ALLOCATION_ERROR
;
2801 strsrch
->textProcessedIter
->init(strsrch
->textIter
);
2808 // set and get methods --------------------------------------------------
2810 U_CAPI
void U_EXPORT2
usearch_setOffset(UStringSearch
*strsrch
,
2814 if (U_SUCCESS(*status
) && strsrch
) {
2815 if (isOutOfBounds(strsrch
->search
->textLength
, position
)) {
2816 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2819 setColEIterOffset(strsrch
->textIter
, position
);
2821 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2822 strsrch
->search
->matchedLength
= 0;
2823 strsrch
->search
->reset
= FALSE
;
2827 U_CAPI
int32_t U_EXPORT2
usearch_getOffset(const UStringSearch
*strsrch
)
2830 int32_t result
= ucol_getOffset(strsrch
->textIter
);
2831 if (isOutOfBounds(strsrch
->search
->textLength
, result
)) {
2832 return USEARCH_DONE
;
2836 return USEARCH_DONE
;
2839 U_CAPI
void U_EXPORT2
usearch_setAttribute(UStringSearch
*strsrch
,
2840 USearchAttribute attribute
,
2841 USearchAttributeValue value
,
2844 if (U_SUCCESS(*status
) && strsrch
) {
2847 case USEARCH_OVERLAP
:
2848 strsrch
->search
->isOverlap
= (value
== USEARCH_ON
? TRUE
: FALSE
);
2850 case USEARCH_CANONICAL_MATCH
:
2851 strsrch
->search
->isCanonicalMatch
= (value
== USEARCH_ON
? TRUE
:
2854 case USEARCH_ELEMENT_COMPARISON
:
2855 if (value
== USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD
|| value
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
2856 strsrch
->search
->elementComparisonType
= (int16_t)value
;
2858 strsrch
->search
->elementComparisonType
= 0;
2861 case USEARCH_ATTRIBUTE_COUNT
:
2863 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2866 if (value
== USEARCH_ATTRIBUTE_VALUE_COUNT
) {
2867 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2871 U_CAPI USearchAttributeValue U_EXPORT2
usearch_getAttribute(
2872 const UStringSearch
*strsrch
,
2873 USearchAttribute attribute
)
2876 switch (attribute
) {
2877 case USEARCH_OVERLAP
:
2878 return (strsrch
->search
->isOverlap
== TRUE
? USEARCH_ON
:
2880 case USEARCH_CANONICAL_MATCH
:
2881 return (strsrch
->search
->isCanonicalMatch
== TRUE
? USEARCH_ON
:
2883 case USEARCH_ELEMENT_COMPARISON
:
2885 int16_t value
= strsrch
->search
->elementComparisonType
;
2886 if (value
== USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD
|| value
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
2887 return (USearchAttributeValue
)value
;
2889 return USEARCH_STANDARD_ELEMENT_COMPARISON
;
2892 case USEARCH_ATTRIBUTE_COUNT
:
2893 return USEARCH_DEFAULT
;
2896 return USEARCH_DEFAULT
;
2899 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedStart(
2900 const UStringSearch
*strsrch
)
2902 if (strsrch
== NULL
) {
2903 return USEARCH_DONE
;
2905 return strsrch
->search
->matchedIndex
;
2909 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedText(const UStringSearch
*strsrch
,
2911 int32_t resultCapacity
,
2914 if (U_FAILURE(*status
)) {
2915 return USEARCH_DONE
;
2917 if (strsrch
== NULL
|| resultCapacity
< 0 || (resultCapacity
> 0 &&
2919 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2920 return USEARCH_DONE
;
2923 int32_t copylength
= strsrch
->search
->matchedLength
;
2924 int32_t copyindex
= strsrch
->search
->matchedIndex
;
2925 if (copyindex
== USEARCH_DONE
) {
2926 u_terminateUChars(result
, resultCapacity
, 0, status
);
2927 return USEARCH_DONE
;
2930 if (resultCapacity
< copylength
) {
2931 copylength
= resultCapacity
;
2933 if (copylength
> 0) {
2934 uprv_memcpy(result
, strsrch
->search
->text
+ copyindex
,
2935 copylength
* sizeof(UChar
));
2937 return u_terminateUChars(result
, resultCapacity
,
2938 strsrch
->search
->matchedLength
, status
);
2941 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedLength(
2942 const UStringSearch
*strsrch
)
2945 return strsrch
->search
->matchedLength
;
2947 return USEARCH_DONE
;
2950 #if !UCONFIG_NO_BREAK_ITERATION
2952 U_CAPI
void U_EXPORT2
usearch_setBreakIterator(UStringSearch
*strsrch
,
2953 UBreakIterator
*breakiter
,
2956 if (U_SUCCESS(*status
) && strsrch
) {
2957 strsrch
->search
->breakIter
= breakiter
;
2959 ubrk_setText(breakiter
, strsrch
->search
->text
,
2960 strsrch
->search
->textLength
, status
);
2965 U_CAPI
const UBreakIterator
* U_EXPORT2
2966 usearch_getBreakIterator(const UStringSearch
*strsrch
)
2969 return strsrch
->search
->breakIter
;
2976 U_CAPI
void U_EXPORT2
usearch_setText( UStringSearch
*strsrch
,
2981 if (U_SUCCESS(*status
)) {
2982 if (strsrch
== NULL
|| text
== NULL
|| textlength
< -1 ||
2984 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2987 if (textlength
== -1) {
2988 textlength
= u_strlen(text
);
2990 strsrch
->search
->text
= text
;
2991 strsrch
->search
->textLength
= textlength
;
2992 ucol_setText(strsrch
->textIter
, text
, textlength
, status
);
2993 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2994 strsrch
->search
->matchedLength
= 0;
2995 strsrch
->search
->reset
= TRUE
;
2996 #if !UCONFIG_NO_BREAK_ITERATION
2997 if (strsrch
->search
->breakIter
!= NULL
) {
2998 ubrk_setText(strsrch
->search
->breakIter
, text
,
2999 textlength
, status
);
3001 ubrk_setText(strsrch
->search
->internalBreakIter
, text
, textlength
, status
);
3007 U_CAPI
const UChar
* U_EXPORT2
usearch_getText(const UStringSearch
*strsrch
,
3011 *length
= strsrch
->search
->textLength
;
3012 return strsrch
->search
->text
;
3017 U_CAPI
void U_EXPORT2
usearch_setCollator( UStringSearch
*strsrch
,
3018 const UCollator
*collator
,
3021 if (U_SUCCESS(*status
)) {
3022 if (collator
== NULL
) {
3023 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3028 delete strsrch
->textProcessedIter
;
3029 strsrch
->textProcessedIter
= NULL
;
3030 ucol_closeElements(strsrch
->textIter
);
3031 ucol_closeElements(strsrch
->utilIter
);
3032 strsrch
->textIter
= strsrch
->utilIter
= NULL
;
3033 if (strsrch
->ownCollator
&& (strsrch
->collator
!= collator
)) {
3034 ucol_close((UCollator
*)strsrch
->collator
);
3035 strsrch
->ownCollator
= FALSE
;
3037 strsrch
->collator
= collator
;
3038 strsrch
->strength
= ucol_getStrength(collator
);
3039 strsrch
->ceMask
= getMask(strsrch
->strength
);
3040 #if !UCONFIG_NO_BREAK_ITERATION
3041 ubrk_close(strsrch
->search
->internalBreakIter
);
3042 strsrch
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(collator
, ULOC_VALID_LOCALE
, status
),
3043 strsrch
->search
->text
, strsrch
->search
->textLength
, status
);
3045 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3047 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
3049 // if status is a failure, ucol_getVariableTop returns 0
3050 strsrch
->variableTop
= ucol_getVariableTop(collator
, status
);
3051 strsrch
->textIter
= ucol_openElements(collator
,
3052 strsrch
->search
->text
,
3053 strsrch
->search
->textLength
,
3055 strsrch
->utilIter
= ucol_openElements(
3056 collator
, strsrch
->pattern
.text
, strsrch
->pattern
.textLength
, status
);
3057 // initialize() _after_ setting the iterators for the new collator.
3058 initialize(strsrch
, status
);
3061 // **** are these calls needed?
3062 // **** we call uprv_init_pce in initializePatternPCETable
3063 // **** and the CEIBuffer constructor...
3065 uprv_init_pce(strsrch
->textIter
);
3066 uprv_init_pce(strsrch
->utilIter
);
3071 U_CAPI UCollator
* U_EXPORT2
usearch_getCollator(const UStringSearch
*strsrch
)
3074 return (UCollator
*)strsrch
->collator
;
3079 U_CAPI
void U_EXPORT2
usearch_setPattern( UStringSearch
*strsrch
,
3080 const UChar
*pattern
,
3081 int32_t patternlength
,
3084 if (U_SUCCESS(*status
)) {
3085 if (strsrch
== NULL
|| pattern
== NULL
) {
3086 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3089 if (patternlength
== -1) {
3090 patternlength
= u_strlen(pattern
);
3092 if (patternlength
== 0) {
3093 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3096 strsrch
->pattern
.text
= pattern
;
3097 strsrch
->pattern
.textLength
= patternlength
;
3098 initialize(strsrch
, status
);
3103 U_CAPI
const UChar
* U_EXPORT2
3104 usearch_getPattern(const UStringSearch
*strsrch
,
3108 *length
= strsrch
->pattern
.textLength
;
3109 return strsrch
->pattern
.text
;
3114 // miscellanous methods --------------------------------------------------
3116 U_CAPI
int32_t U_EXPORT2
usearch_first(UStringSearch
*strsrch
,
3119 if (strsrch
&& U_SUCCESS(*status
)) {
3120 strsrch
->search
->isForwardSearching
= TRUE
;
3121 usearch_setOffset(strsrch
, 0, status
);
3122 if (U_SUCCESS(*status
)) {
3123 return usearch_next(strsrch
, status
);
3126 return USEARCH_DONE
;
3129 U_CAPI
int32_t U_EXPORT2
usearch_following(UStringSearch
*strsrch
,
3133 if (strsrch
&& U_SUCCESS(*status
)) {
3134 strsrch
->search
->isForwardSearching
= TRUE
;
3135 // position checked in usearch_setOffset
3136 usearch_setOffset(strsrch
, position
, status
);
3137 if (U_SUCCESS(*status
)) {
3138 return usearch_next(strsrch
, status
);
3141 return USEARCH_DONE
;
3144 U_CAPI
int32_t U_EXPORT2
usearch_last(UStringSearch
*strsrch
,
3147 if (strsrch
&& U_SUCCESS(*status
)) {
3148 strsrch
->search
->isForwardSearching
= FALSE
;
3149 usearch_setOffset(strsrch
, strsrch
->search
->textLength
, status
);
3150 if (U_SUCCESS(*status
)) {
3151 return usearch_previous(strsrch
, status
);
3154 return USEARCH_DONE
;
3157 U_CAPI
int32_t U_EXPORT2
usearch_preceding(UStringSearch
*strsrch
,
3161 if (strsrch
&& U_SUCCESS(*status
)) {
3162 strsrch
->search
->isForwardSearching
= FALSE
;
3163 // position checked in usearch_setOffset
3164 usearch_setOffset(strsrch
, position
, status
);
3165 if (U_SUCCESS(*status
)) {
3166 return usearch_previous(strsrch
, status
);
3169 return USEARCH_DONE
;
3173 * If a direction switch is required, we'll count the number of ces till the
3174 * beginning of the collation element iterator and iterate forwards that
3175 * number of times. This is so that we get to the correct point within the
3176 * string to continue the search in. Imagine when we are in the middle of the
3177 * normalization buffer when the change in direction is request. arrrgghh....
3178 * After searching the offset within the collation element iterator will be
3179 * shifted to the start of the match. If a match is not found, the offset would
3180 * have been set to the end of the text string in the collation element
3182 * Okay, here's my take on normalization buffer. The only time when there can
3183 * be 2 matches within the same normalization is when the pattern is consists
3184 * of all accents. But since the offset returned is from the text string, we
3185 * should not confuse the caller by returning the second match within the
3186 * same normalization buffer. If we do, the 2 results will have the same match
3187 * offsets, and that'll be confusing. I'll return the next match that doesn't
3188 * fall within the same normalization buffer. Note this does not affect the
3189 * results of matches spanning the text and the normalization buffer.
3190 * The position to start searching is taken from the collation element
3191 * iterator. Callers of this API would have to set the offset in the collation
3192 * element iterator before using this method.
3194 U_CAPI
int32_t U_EXPORT2
usearch_next(UStringSearch
*strsrch
,
3197 if (U_SUCCESS(*status
) && strsrch
) {
3198 // note offset is either equivalent to the start of the previous match
3199 // or is set by the user
3200 int32_t offset
= usearch_getOffset(strsrch
);
3201 USearch
*search
= strsrch
->search
;
3202 search
->reset
= FALSE
;
3203 int32_t textlength
= search
->textLength
;
3204 if (search
->isForwardSearching
) {
3206 if (offset
== textlength
3207 || (!search
->isOverlap
&&
3208 (offset
+ strsrch
->pattern
.defaultShiftSize
> textlength
||
3209 (search
->matchedIndex
!= USEARCH_DONE
&&
3210 offset
+ search
->matchedLength
>= textlength
)))) {
3211 // not enough characters to match
3212 setMatchNotFound(strsrch
);
3213 return USEARCH_DONE
;
3216 if (offset
== textlength
||
3217 (! search
->isOverlap
&&
3218 (search
->matchedIndex
!= USEARCH_DONE
&&
3219 offset
+ search
->matchedLength
> textlength
))) {
3220 // not enough characters to match
3221 setMatchNotFound(strsrch
);
3222 return USEARCH_DONE
;
3227 // switching direction.
3228 // if matchedIndex == USEARCH_DONE, it means that either a
3229 // setOffset has been called or that previous ran off the text
3230 // string. the iterator would have been set to offset 0 if a
3231 // match is not found.
3232 search
->isForwardSearching
= TRUE
;
3233 if (search
->matchedIndex
!= USEARCH_DONE
) {
3234 // there's no need to set the collation element iterator
3235 // the next call to next will set the offset.
3236 return search
->matchedIndex
;
3240 if (U_SUCCESS(*status
)) {
3241 if (strsrch
->pattern
.cesLength
== 0) {
3242 if (search
->matchedIndex
== USEARCH_DONE
) {
3243 search
->matchedIndex
= offset
;
3245 else { // moves by codepoints
3246 U16_FWD_1(search
->text
, search
->matchedIndex
, textlength
);
3249 search
->matchedLength
= 0;
3250 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3251 // status checked below
3252 if (search
->matchedIndex
== textlength
) {
3253 search
->matchedIndex
= USEARCH_DONE
;
3257 if (search
->matchedLength
> 0) {
3258 // if matchlength is 0 we are at the start of the iteration
3259 if (search
->isOverlap
) {
3260 ucol_setOffset(strsrch
->textIter
, offset
+ 1, status
);
3263 ucol_setOffset(strsrch
->textIter
,
3264 offset
+ search
->matchedLength
, status
);
3268 // for boundary check purposes. this will ensure that the
3269 // next match will not preceed the current offset
3270 // note search->matchedIndex will always be set to something
3272 search
->matchedIndex
= offset
- 1;
3275 if (search
->isCanonicalMatch
) {
3276 // can't use exact here since extra accents are allowed.
3277 usearch_handleNextCanonical(strsrch
, status
);
3280 usearch_handleNextExact(strsrch
, status
);
3284 if (U_FAILURE(*status
)) {
3285 return USEARCH_DONE
;
3289 if (search
->matchedIndex
== USEARCH_DONE
) {
3290 ucol_setOffset(strsrch
->textIter
, search
->textLength
, status
);
3292 ucol_setOffset(strsrch
->textIter
, search
->matchedIndex
, status
);
3296 return search
->matchedIndex
;
3299 return USEARCH_DONE
;
3302 U_CAPI
int32_t U_EXPORT2
usearch_previous(UStringSearch
*strsrch
,
3305 if (U_SUCCESS(*status
) && strsrch
) {
3307 USearch
*search
= strsrch
->search
;
3308 if (search
->reset
) {
3309 offset
= search
->textLength
;
3310 search
->isForwardSearching
= FALSE
;
3311 search
->reset
= FALSE
;
3312 setColEIterOffset(strsrch
->textIter
, offset
);
3315 offset
= usearch_getOffset(strsrch
);
3318 int32_t matchedindex
= search
->matchedIndex
;
3319 if (search
->isForwardSearching
== TRUE
) {
3320 // switching direction.
3321 // if matchedIndex == USEARCH_DONE, it means that either a
3322 // setOffset has been called or that next ran off the text
3323 // string. the iterator would have been set to offset textLength if
3324 // a match is not found.
3325 search
->isForwardSearching
= FALSE
;
3326 if (matchedindex
!= USEARCH_DONE
) {
3327 return matchedindex
;
3332 if (offset
== 0 || matchedindex
== 0 ||
3333 (!search
->isOverlap
&&
3334 (offset
< strsrch
->pattern
.defaultShiftSize
||
3335 (matchedindex
!= USEARCH_DONE
&&
3336 matchedindex
< strsrch
->pattern
.defaultShiftSize
)))) {
3337 // not enough characters to match
3338 setMatchNotFound(strsrch
);
3339 return USEARCH_DONE
;
3342 // Could check pattern length, but the
3343 // linear search will do the right thing
3344 if (offset
== 0 || matchedindex
== 0) {
3345 setMatchNotFound(strsrch
);
3346 return USEARCH_DONE
;
3351 if (U_SUCCESS(*status
)) {
3352 if (strsrch
->pattern
.cesLength
== 0) {
3353 search
->matchedIndex
=
3354 (matchedindex
== USEARCH_DONE
? offset
: matchedindex
);
3355 if (search
->matchedIndex
== 0) {
3356 setMatchNotFound(strsrch
);
3357 // status checked below
3359 else { // move by codepoints
3360 U16_BACK_1(search
->text
, 0, search
->matchedIndex
);
3361 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3362 // status checked below
3363 search
->matchedLength
= 0;
3367 if (strsrch
->search
->isCanonicalMatch
) {
3368 // can't use exact here since extra accents are allowed.
3369 usearch_handlePreviousCanonical(strsrch
, status
);
3370 // status checked below
3373 usearch_handlePreviousExact(strsrch
, status
);
3374 // status checked below
3378 if (U_FAILURE(*status
)) {
3379 return USEARCH_DONE
;
3382 return search
->matchedIndex
;
3385 return USEARCH_DONE
;
3390 U_CAPI
void U_EXPORT2
usearch_reset(UStringSearch
*strsrch
)
3393 reset is setting the attributes that are already in
3394 string search, hence all attributes in the collator should
3395 be retrieved without any problems
3398 UErrorCode status
= U_ZERO_ERROR
;
3399 UBool sameCollAttribute
= TRUE
;
3404 // **** hack to deal w/ how processed CEs encode quaternary ****
3405 UCollationStrength newStrength
= ucol_getStrength(strsrch
->collator
);
3406 if ((strsrch
->strength
< UCOL_QUATERNARY
&& newStrength
>= UCOL_QUATERNARY
) ||
3407 (strsrch
->strength
>= UCOL_QUATERNARY
&& newStrength
< UCOL_QUATERNARY
)) {
3408 sameCollAttribute
= FALSE
;
3411 strsrch
->strength
= ucol_getStrength(strsrch
->collator
);
3412 ceMask
= getMask(strsrch
->strength
);
3413 if (strsrch
->ceMask
!= ceMask
) {
3414 strsrch
->ceMask
= ceMask
;
3415 sameCollAttribute
= FALSE
;
3418 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3419 shift
= ucol_getAttribute(strsrch
->collator
, UCOL_ALTERNATE_HANDLING
,
3420 &status
) == UCOL_SHIFTED
;
3421 if (strsrch
->toShift
!= shift
) {
3422 strsrch
->toShift
= shift
;
3423 sameCollAttribute
= FALSE
;
3426 // if status is a failure, ucol_getVariableTop returns 0
3427 varTop
= ucol_getVariableTop(strsrch
->collator
, &status
);
3428 if (strsrch
->variableTop
!= varTop
) {
3429 strsrch
->variableTop
= varTop
;
3430 sameCollAttribute
= FALSE
;
3432 if (!sameCollAttribute
) {
3433 initialize(strsrch
, &status
);
3435 ucol_setText(strsrch
->textIter
, strsrch
->search
->text
,
3436 strsrch
->search
->textLength
,
3438 strsrch
->search
->matchedLength
= 0;
3439 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
3440 strsrch
->search
->isOverlap
= FALSE
;
3441 strsrch
->search
->isCanonicalMatch
= FALSE
;
3442 strsrch
->search
->elementComparisonType
= 0;
3443 strsrch
->search
->isForwardSearching
= TRUE
;
3444 strsrch
->search
->reset
= TRUE
;
3449 // CEI Collation Element + source text index.
3450 // These structs are kept in the circular buffer.
3462 // CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3464 #define DEFAULT_CEBUFFER_SIZE 96
3465 #define CEBUFFER_EXTRA 32
3466 // Some typical max values to make buffer size more reasonable for asymmetric search.
3467 // #8694 is for a better long-term solution to allocation of this buffer.
3468 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3469 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3470 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3472 CEI defBuf
[DEFAULT_CEBUFFER_SIZE
];
3477 UCollationElements
*ceIter
;
3478 UStringSearch
*strSearch
;
3482 CEIBuffer(UStringSearch
*ss
, UErrorCode
*status
);
3484 const CEI
*get(int32_t index
);
3485 const CEI
*getPrevious(int32_t index
);
3489 CEIBuffer::CEIBuffer(UStringSearch
*ss
, UErrorCode
*status
) {
3492 bufSize
= ss
->pattern
.pcesLength
+ CEBUFFER_EXTRA
;
3493 if (ss
->search
->elementComparisonType
!= 0) {
3494 const UChar
* patText
= ss
->pattern
.text
;
3496 const UChar
* patTextLimit
= patText
+ ss
->pattern
.textLength
;
3497 while ( patText
< patTextLimit
) {
3498 UChar c
= *patText
++;
3499 if (MIGHT_BE_JAMO_L(c
)) {
3500 bufSize
+= MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L
;
3502 // No check for surrogates, we might allocate slightly more buffer than necessary.
3503 bufSize
+= MAX_TARGET_IGNORABLES_PER_PAT_OTHER
;
3508 ceIter
= ss
->textIter
;
3512 if (!initTextProcessedIter(ss
, status
)) { return; }
3514 if (bufSize
>DEFAULT_CEBUFFER_SIZE
) {
3515 buf
= (CEI
*)uprv_malloc(bufSize
* sizeof(CEI
));
3517 *status
= U_MEMORY_ALLOCATION_ERROR
;
3522 // TODO: add a reset or init function so that allocated
3523 // buffers can be retained & reused.
3525 CEIBuffer::~CEIBuffer() {
3526 if (buf
!= defBuf
) {
3532 // Get the CE with the specified index.
3533 // Index must be in the range
3534 // n-history_size < index < n+1
3535 // where n is the largest index to have been fetched by some previous call to this function.
3536 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3538 const CEI
*CEIBuffer::get(int32_t index
) {
3539 int i
= index
% bufSize
;
3541 if (index
>=firstIx
&& index
<limitIx
) {
3542 // The request was for an entry already in our buffer.
3547 // Caller is requesting a new, never accessed before, CE.
3548 // Verify that it is the next one in sequence, which is all
3550 if (index
!= limitIx
) {
3551 #if U_PLATFORM_IS_DARWIN_BASED
3552 os_log(OS_LOG_DEFAULT
, "# CEIBuffer::get param err, index %d limitIx %d", index
, limitIx
); // <rdar://51554495>
3559 // Manage the circular CE buffer indexing
3562 if (limitIx
- firstIx
>= bufSize
) {
3563 // The buffer is full, knock out the lowest-indexed entry.
3567 UErrorCode status
= U_ZERO_ERROR
;
3569 buf
[i
].ce
= strSearch
->textProcessedIter
->nextProcessed(&buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3574 // Get the CE with the specified index.
3575 // Index must be in the range
3576 // n-history_size < index < n+1
3577 // where n is the largest index to have been fetched by some previous call to this function.
3578 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3580 const CEI
*CEIBuffer::getPrevious(int32_t index
) {
3581 int i
= index
% bufSize
;
3583 if (index
>=firstIx
&& index
<limitIx
) {
3584 // The request was for an entry already in our buffer.
3589 // Caller is requesting a new, never accessed before, CE.
3590 // Verify that it is the next one in sequence, which is all
3592 if (index
!= limitIx
) {
3593 #if U_PLATFORM_IS_DARWIN_BASED
3594 os_log(OS_LOG_DEFAULT
, "# CEIBuffer::getPrevious param err, index %d limitIx %d", index
, limitIx
); // <rdar://51554495>
3601 // Manage the circular CE buffer indexing
3604 if (limitIx
- firstIx
>= bufSize
) {
3605 // The buffer is full, knock out the lowest-indexed entry.
3609 UErrorCode status
= U_ZERO_ERROR
;
3611 buf
[i
].ce
= strSearch
->textProcessedIter
->previousProcessed(&buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3621 // #define USEARCH_DEBUG
3623 #ifdef USEARCH_DEBUG
3629 * Find the next break boundary after startIndex. If the UStringSearch object
3630 * has an external break iterator, use that. Otherwise use the internal character
3633 static int32_t nextBoundaryAfter(UStringSearch
*strsrch
, int32_t startIndex
) {
3635 const UChar
*text
= strsrch
->search
->text
;
3636 int32_t textLen
= strsrch
->search
->textLength
;
3638 U_ASSERT(startIndex
>=0);
3639 U_ASSERT(startIndex
<=textLen
);
3641 if (startIndex
>= textLen
) {
3646 int32_t i
= startIndex
;
3647 U16_NEXT(text
, i
, textLen
, c
);
3649 // If we are on a control character, stop without looking for combining marks.
3650 // Control characters do not combine.
3651 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3652 if (gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
) {
3656 // The initial character was not a control, and can thus accept trailing
3657 // combining characters. Advance over however many of them there are.
3658 int32_t indexOfLastCharChecked
;
3660 indexOfLastCharChecked
= i
;
3664 U16_NEXT(text
, i
, textLen
, c
);
3665 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3666 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3670 return indexOfLastCharChecked
;
3671 #elif !UCONFIG_NO_BREAK_ITERATION
3672 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3674 if (breakiterator
== NULL
) {
3675 breakiterator
= strsrch
->search
->internalBreakIter
;
3678 if (breakiterator
!= NULL
) {
3679 return ubrk_following(breakiterator
, startIndex
);
3684 // **** or should we use the original code? ****
3691 * Returns TRUE if index is on a break boundary. If the UStringSearch
3692 * has an external break iterator, test using that, otherwise test
3693 * using the internal character break iterator.
3695 static UBool
isBreakBoundary(UStringSearch
*strsrch
, int32_t index
) {
3697 const UChar
*text
= strsrch
->search
->text
;
3698 int32_t textLen
= strsrch
->search
->textLength
;
3701 U_ASSERT(index
<=textLen
);
3703 if (index
>=textLen
|| index
<=0) {
3707 // If the character at the current index is not a GRAPHEME_EXTEND
3708 // then we can not be within a combining sequence.
3710 U16_GET(text
, 0, index
, textLen
, c
);
3711 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3712 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3716 // We are at a combining mark. If the preceding character is anything
3717 // except a CONTROL, CR or LF, we are in a combining sequence.
3718 U16_PREV(text
, 0, index
, c
);
3719 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3720 UBool combining
= !(gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
);
3722 #elif !UCONFIG_NO_BREAK_ITERATION
3723 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3725 if (breakiterator
== NULL
) {
3726 breakiterator
= strsrch
->search
->internalBreakIter
;
3729 return (breakiterator
!= NULL
&& ubrk_isBoundary(breakiterator
, index
));
3731 // **** or use the original code? ****
3737 static UBool
onBreakBoundaries(const UStringSearch
*strsrch
, int32_t start
, int32_t end
)
3739 #if !UCONFIG_NO_BREAK_ITERATION
3740 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3742 if (breakiterator
!= NULL
) {
3743 int32_t startindex
= ubrk_first(breakiterator
);
3744 int32_t endindex
= ubrk_last(breakiterator
);
3746 // out-of-range indexes are never boundary positions
3747 if (start
< startindex
|| start
> endindex
||
3748 end
< startindex
|| end
> endindex
) {
3752 return ubrk_isBoundary(breakiterator
, start
) &&
3753 ubrk_isBoundary(breakiterator
, end
);
3766 } UCompareCEsResult
;
3767 #define U_CE_LEVEL2_BASE 0x00000005
3768 #define U_CE_LEVEL3_BASE 0x00050000
3770 static UCompareCEsResult
compareCE64s(int64_t targCE
, int64_t patCE
, int16_t compareType
) {
3771 if (targCE
== patCE
) {
3774 if (compareType
== 0) {
3775 return U_CE_NO_MATCH
;
3778 int64_t targCEshifted
= targCE
>> 32;
3779 int64_t patCEshifted
= patCE
>> 32;
3783 int32_t targLev1
= (int32_t)(targCEshifted
& mask
);
3784 int32_t patLev1
= (int32_t)(patCEshifted
& mask
);
3785 if ( targLev1
!= patLev1
) {
3786 if ( targLev1
== 0 ) {
3787 return U_CE_SKIP_TARG
;
3789 if ( patLev1
== 0 && compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
3790 return U_CE_SKIP_PATN
;
3792 return U_CE_NO_MATCH
;
3796 int32_t targLev2
= (int32_t)(targCEshifted
& mask
);
3797 int32_t patLev2
= (int32_t)(patCEshifted
& mask
);
3798 if ( targLev2
!= patLev2
) {
3799 if ( targLev2
== 0 ) {
3800 return U_CE_SKIP_TARG
;
3802 if ( patLev2
== 0 && compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
3803 return U_CE_SKIP_PATN
;
3805 return (patLev2
== U_CE_LEVEL2_BASE
|| (compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
&& targLev2
== U_CE_LEVEL2_BASE
) )?
3806 U_CE_MATCH
: U_CE_NO_MATCH
;
3810 int32_t targLev3
= (int32_t)(targCE
& mask
);
3811 int32_t patLev3
= (int32_t)(patCE
& mask
);
3812 if ( targLev3
!= patLev3
) {
3813 return (patLev3
== U_CE_LEVEL3_BASE
|| (compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
&& targLev3
== U_CE_LEVEL3_BASE
) )?
3814 U_CE_MATCH
: U_CE_NO_MATCH
;
3821 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3826 UChar32
codePointAt(const USearch
&search
, int32_t index
) {
3827 if (index
< search
.textLength
) {
3829 U16_NEXT(search
.text
, index
, search
.textLength
, c
);
3835 UChar32
codePointBefore(const USearch
&search
, int32_t index
) {
3838 U16_PREV(search
.text
, 0, index
, c
);
3846 U_CAPI UBool U_EXPORT2
usearch_search(UStringSearch
*strsrch
,
3848 int32_t *matchStart
,
3849 int32_t *matchLimit
,
3852 if (U_FAILURE(*status
)) {
3856 // TODO: reject search patterns beginning with a combining char.
3858 #ifdef USEARCH_DEBUG
3859 if (getenv("USEARCH_DEBUG") != NULL
) {
3860 printf("Pattern CEs\n");
3861 for (int ii
=0; ii
<strsrch
->pattern
.cesLength
; ii
++) {
3862 printf(" %8x", strsrch
->pattern
.ces
[ii
]);
3868 // Input parameter sanity check.
3869 // TODO: should input indicies clip to the text length
3870 // in the same way that UText does.
3871 if(strsrch
->pattern
.cesLength
== 0 ||
3873 startIdx
> strsrch
->search
->textLength
||
3874 strsrch
->pattern
.ces
== NULL
) {
3875 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3879 if (strsrch
->pattern
.pces
== NULL
) {
3880 initializePatternPCETable(strsrch
, status
);
3883 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
3884 CEIBuffer
ceb(strsrch
, status
);
3887 int32_t targetIx
= 0;
3888 const CEI
*targetCEI
= NULL
;
3892 int32_t mStart
= -1;
3893 int32_t mLimit
= -1;
3899 // Outer loop moves over match starting positions in the
3901 // Here we see the target as a sequence of collation elements, resulting from the following:
3902 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3903 // (for example, digraphs such as IJ may be broken into two characters).
3904 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3905 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3906 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3907 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3908 // then the CE is deleted, so the following code sees only CEs that are relevant.
3909 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3910 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3911 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3913 for(targetIx
=0; ; targetIx
++)
3916 // Inner loop checks for a match beginning at each
3917 // position from the outer loop.
3918 int32_t targetIxOffset
= 0;
3920 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3921 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3922 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3923 const CEI
*firstCEI
= ceb
.get(targetIx
);
3924 if (firstCEI
== NULL
) {
3925 *status
= U_INTERNAL_PROGRAM_ERROR
;
3930 for (patIx
=0; patIx
<strsrch
->pattern
.pcesLength
; patIx
++) {
3931 patCE
= strsrch
->pattern
.pces
[patIx
];
3932 targetCEI
= ceb
.get(targetIx
+patIx
+targetIxOffset
);
3933 // Compare CE from target string with CE from the pattern.
3934 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3935 // which will fail the compare, below.
3936 UCompareCEsResult ceMatch
= compareCE64s(targetCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
3937 if ( ceMatch
== U_CE_NO_MATCH
) {
3940 } else if ( ceMatch
> U_CE_NO_MATCH
) {
3941 if ( ceMatch
== U_CE_SKIP_TARG
) {
3942 // redo with same patCE, next targCE
3945 } else { // ceMatch == U_CE_SKIP_PATN
3946 // redo with same targCE, next patCE
3951 targetIxOffset
+= strsrch
->pattern
.pcesLength
; // this is now the offset in target CE space to end of the match so far
3953 if (!found
&& ((targetCEI
== NULL
) || (targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
))) {
3954 // No match at this targetIx. Try again at the next.
3959 // No match at all, we have run off the end of the target text.
3964 // We have found a match in CE space.
3965 // Now determine the bounds in string index space.
3966 // There still is a chance of match failure if the CE range not correspond to
3967 // an acceptable character range.
3969 const CEI
*lastCEI
= ceb
.get(targetIx
+ targetIxOffset
- 1);
3971 mStart
= firstCEI
->lowIndex
;
3972 minLimit
= lastCEI
->lowIndex
;
3974 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3975 // extended to the end of input, and the match is good.
3977 // Look at the high and low indices of the CE following the match. If
3978 // they are the same it means one of two things:
3979 // 1. The match extended to the last CE from the target text, which is OK, or
3980 // 2. The last CE that was part of the match is in an expansion that extends
3981 // to the first CE after the match. In this case, we reject the match.
3982 const CEI
*nextCEI
= 0;
3983 if (strsrch
->search
->elementComparisonType
== 0) {
3984 nextCEI
= ceb
.get(targetIx
+ targetIxOffset
);
3985 maxLimit
= nextCEI
->lowIndex
;
3986 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
3990 for ( ; ; ++targetIxOffset
) {
3991 nextCEI
= ceb
.get(targetIx
+ targetIxOffset
);
3992 maxLimit
= nextCEI
->lowIndex
;
3993 // If we are at the end of the target too, match succeeds
3994 if ( nextCEI
->ce
== UCOL_PROCESSED_NULLORDER
) {
3997 // As long as the next CE has primary weight of 0,
3998 // it is part of the last target element matched by the pattern;
3999 // make sure it can be part of a match with the last patCE
4000 if ( (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) == 0 ) {
4001 UCompareCEsResult ceMatch
= compareCE64s(nextCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
4002 if ( ceMatch
== U_CE_NO_MATCH
|| ceMatch
== U_CE_SKIP_PATN
) {
4006 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
4007 // target element, but it has non-zero primary weight => match fails
4008 } else if ( nextCEI
->lowIndex
== nextCEI
->highIndex
) {
4011 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4019 // Check for the start of the match being within a combining sequence.
4020 // This can happen if the pattern itself begins with a combining char, and
4021 // the match found combining marks in the target text that were attached
4022 // to something else.
4023 // This type of match should be rejected for not completely consuming a
4024 // combining sequence.
4025 if (!isBreakBoundary(strsrch
, mStart
)) {
4029 // Check for the start of the match being within an Collation Element Expansion,
4030 // meaning that the first char of the match is only partially matched.
4031 // With exapnsions, the first CE will report the index of the source
4032 // character, and all subsequent (expansions) CEs will report the source index of the
4033 // _following_ character.
4034 int32_t secondIx
= firstCEI
->highIndex
;
4035 if (mStart
== secondIx
) {
4039 // Allow matches to end in the middle of a grapheme cluster if the following
4040 // conditions are met; this is needed to make prefix search work properly in
4041 // Indic, see #11750
4042 // * the default breakIter is being used
4043 // * the next collation element after this combining sequence
4044 // - has non-zero primary weight
4045 // - corresponds to a separate character following the one at end of the current match
4046 // (the second of these conditions, and perhaps both, may be redundant given the
4047 // subsequent check for normalization boundary; however they are likely much faster
4048 // tests in any case)
4049 // * the match limit is a normalization boundary
4050 UBool allowMidclusterMatch
= FALSE
;
4051 if (strsrch
->search
->text
!= NULL
&& strsrch
->search
->textLength
> maxLimit
) {
4052 allowMidclusterMatch
=
4053 strsrch
->search
->breakIter
== NULL
&&
4054 nextCEI
!= NULL
&& (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) != 0 &&
4055 maxLimit
>= lastCEI
->highIndex
&& nextCEI
->highIndex
> maxLimit
&&
4056 (strsrch
->nfd
->hasBoundaryBefore(codePointAt(*strsrch
->search
, maxLimit
)) ||
4057 strsrch
->nfd
->hasBoundaryAfter(codePointBefore(*strsrch
->search
, maxLimit
)));
4059 // If those conditions are met, then:
4060 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4061 // the match limit may be backed off to a previous break boundary. This handles
4062 // cases in which mLimit includes target characters that are ignorable with current
4063 // settings (such as space) and which extend beyond the pattern match.
4064 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4065 // * do NOT require that match limit be on a breakIter boundary
4067 // Advance the match end position to the first acceptable match boundary.
4068 // This advances the index over any combining charcters.
4070 if (minLimit
< maxLimit
) {
4071 // When the last CE's low index is same with its high index, the CE is likely
4072 // a part of expansion. In this case, the index is located just after the
4073 // character corresponding to the CEs compared above. If the index is right
4074 // at the break boundary, move the position to the next boundary will result
4075 // incorrect match length when there are ignorable characters exist between
4076 // the position and the next character produces CE(s). See ticket#8482.
4077 if (minLimit
== lastCEI
->highIndex
&& isBreakBoundary(strsrch
, minLimit
)) {
4080 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4081 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4082 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4083 // (i.e. we back off mLimit to the previous breakIterator boundary).
4084 if (nba
>= lastCEI
->highIndex
&& (!allowMidclusterMatch
|| nba
< maxLimit
)) {
4090 #ifdef USEARCH_DEBUG
4091 if (getenv("USEARCH_DEBUG") != NULL
) {
4092 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4096 if (!allowMidclusterMatch
) {
4097 // If advancing to the end of a combining sequence in character indexing space
4098 // advanced us beyond the end of the match in CE space, reject this match.
4099 if (mLimit
> maxLimit
) {
4103 if (!isBreakBoundary(strsrch
, mLimit
)) {
4108 if (! checkIdentical(strsrch
, mStart
, mLimit
)) {
4117 #ifdef USEARCH_DEBUG
4118 if (getenv("USEARCH_DEBUG") != NULL
) {
4119 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4120 int32_t lastToPrint
= ceb
.limitIx
+2;
4121 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4122 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4124 printf("\n%s\n", found
? "match found" : "no match");
4128 // All Done. Store back the match bounds to the caller.
4135 if (matchStart
!= NULL
) {
4136 *matchStart
= mStart
;
4139 if (matchLimit
!= NULL
) {
4140 *matchLimit
= mLimit
;
4146 U_CAPI UBool U_EXPORT2
usearch_searchBackwards(UStringSearch
*strsrch
,
4148 int32_t *matchStart
,
4149 int32_t *matchLimit
,
4152 if (U_FAILURE(*status
)) {
4156 // TODO: reject search patterns beginning with a combining char.
4158 #ifdef USEARCH_DEBUG
4159 if (getenv("USEARCH_DEBUG") != NULL
) {
4160 printf("Pattern CEs\n");
4161 for (int ii
=0; ii
<strsrch
->pattern
.cesLength
; ii
++) {
4162 printf(" %8x", strsrch
->pattern
.ces
[ii
]);
4168 // Input parameter sanity check.
4169 // TODO: should input indicies clip to the text length
4170 // in the same way that UText does.
4171 if(strsrch
->pattern
.cesLength
== 0 ||
4173 startIdx
> strsrch
->search
->textLength
||
4174 strsrch
->pattern
.ces
== NULL
) {
4175 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
4179 if (strsrch
->pattern
.pces
== NULL
) {
4180 initializePatternPCETable(strsrch
, status
);
4183 CEIBuffer
ceb(strsrch
, status
);
4184 int32_t targetIx
= 0;
4187 * Pre-load the buffer with the CE's for the grapheme
4188 * after our starting position so that we're sure that
4189 * we can look at the CE following the match when we
4190 * check the match boundaries.
4192 * This will also pre-fetch the first CE that we'll
4193 * consider for the match.
4195 if (startIdx
< strsrch
->search
->textLength
) {
4196 UBreakIterator
*bi
= strsrch
->search
->internalBreakIter
;
4197 int32_t next
= ubrk_following(bi
, startIdx
);
4199 ucol_setOffset(strsrch
->textIter
, next
, status
);
4201 for (targetIx
= 0; ; targetIx
+= 1) {
4202 if (ceb
.getPrevious(targetIx
)->lowIndex
< startIdx
) {
4207 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
4211 const CEI
*targetCEI
= NULL
;
4215 int32_t limitIx
= targetIx
;
4216 int32_t mStart
= -1;
4217 int32_t mLimit
= -1;
4223 // Outer loop moves over match starting positions in the
4225 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4226 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4227 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4228 // and the beginning of the base text.
4229 for(targetIx
= limitIx
; ; targetIx
+= 1)
4232 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4233 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4234 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4235 const CEI
*lastCEI
= ceb
.getPrevious(targetIx
);
4236 if (lastCEI
== NULL
) {
4237 *status
= U_INTERNAL_PROGRAM_ERROR
;
4241 // Inner loop checks for a match beginning at each
4242 // position from the outer loop.
4243 int32_t targetIxOffset
= 0;
4244 for (patIx
= strsrch
->pattern
.pcesLength
- 1; patIx
>= 0; patIx
-= 1) {
4245 int64_t patCE
= strsrch
->pattern
.pces
[patIx
];
4247 targetCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.pcesLength
- 1 - patIx
+ targetIxOffset
);
4248 // Compare CE from target string with CE from the pattern.
4249 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4250 // which will fail the compare, below.
4251 UCompareCEsResult ceMatch
= compareCE64s(targetCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
4252 if ( ceMatch
== U_CE_NO_MATCH
) {
4255 } else if ( ceMatch
> U_CE_NO_MATCH
) {
4256 if ( ceMatch
== U_CE_SKIP_TARG
) {
4257 // redo with same patCE, next targCE
4260 } else { // ceMatch == U_CE_SKIP_PATN
4261 // redo with same targCE, next patCE
4267 if (!found
&& ((targetCEI
== NULL
) || (targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
))) {
4268 // No match at this targetIx. Try again at the next.
4273 // No match at all, we have run off the end of the target text.
4278 // We have found a match in CE space.
4279 // Now determine the bounds in string index space.
4280 // There still is a chance of match failure if the CE range not correspond to
4281 // an acceptable character range.
4283 const CEI
*firstCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.pcesLength
- 1 + targetIxOffset
);
4284 mStart
= firstCEI
->lowIndex
;
4286 // Check for the start of the match being within a combining sequence.
4287 // This can happen if the pattern itself begins with a combining char, and
4288 // the match found combining marks in the target text that were attached
4289 // to something else.
4290 // This type of match should be rejected for not completely consuming a
4291 // combining sequence.
4292 if (!isBreakBoundary(strsrch
, mStart
)) {
4296 // Look at the high index of the first CE in the match. If it's the same as the
4297 // low index, the first CE in the match is in the middle of an expansion.
4298 if (mStart
== firstCEI
->highIndex
) {
4303 minLimit
= lastCEI
->lowIndex
;
4306 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4307 // extended to the end of input, and the match is good.
4309 // Look at the high and low indices of the CE following the match. If
4310 // they are the same it means one of two things:
4311 // 1. The match extended to the last CE from the target text, which is OK, or
4312 // 2. The last CE that was part of the match is in an expansion that extends
4313 // to the first CE after the match. In this case, we reject the match.
4314 const CEI
*nextCEI
= ceb
.getPrevious(targetIx
- 1);
4316 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
4320 mLimit
= maxLimit
= nextCEI
->lowIndex
;
4322 // Allow matches to end in the middle of a grapheme cluster if the following
4323 // conditions are met; this is needed to make prefix search work properly in
4324 // Indic, see #11750
4325 // * the default breakIter is being used
4326 // * the next collation element after this combining sequence
4327 // - has non-zero primary weight
4328 // - corresponds to a separate character following the one at end of the current match
4329 // (the second of these conditions, and perhaps both, may be redundant given the
4330 // subsequent check for normalization boundary; however they are likely much faster
4331 // tests in any case)
4332 // * the match limit is a normalization boundary
4333 UBool allowMidclusterMatch
= FALSE
;
4334 if (strsrch
->search
->text
!= NULL
&& strsrch
->search
->textLength
> maxLimit
) {
4335 allowMidclusterMatch
=
4336 strsrch
->search
->breakIter
== NULL
&&
4337 nextCEI
!= NULL
&& (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) != 0 &&
4338 maxLimit
>= lastCEI
->highIndex
&& nextCEI
->highIndex
> maxLimit
&&
4339 (strsrch
->nfd
->hasBoundaryBefore(codePointAt(*strsrch
->search
, maxLimit
)) ||
4340 strsrch
->nfd
->hasBoundaryAfter(codePointBefore(*strsrch
->search
, maxLimit
)));
4342 // If those conditions are met, then:
4343 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4344 // the match limit may be backed off to a previous break boundary. This handles
4345 // cases in which mLimit includes target characters that are ignorable with current
4346 // settings (such as space) and which extend beyond the pattern match.
4347 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4348 // * do NOT require that match limit be on a breakIter boundary
4350 // Advance the match end position to the first acceptable match boundary.
4351 // This advances the index over any combining characters.
4352 if (minLimit
< maxLimit
) {
4353 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4354 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4355 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4356 // (i.e. we back off mLimit to the previous breakIterator boundary).
4357 if (nba
>= lastCEI
->highIndex
&& (!allowMidclusterMatch
|| nba
< maxLimit
)) {
4362 if (!allowMidclusterMatch
) {
4363 // If advancing to the end of a combining sequence in character indexing space
4364 // advanced us beyond the end of the match in CE space, reject this match.
4365 if (mLimit
> maxLimit
) {
4369 // Make sure the end of the match is on a break boundary
4370 if (!isBreakBoundary(strsrch
, mLimit
)) {
4376 // No non-ignorable CEs after this point.
4377 // The maximum position is detected by boundary after
4378 // the last non-ignorable CE. Combining sequence
4379 // across the start index will be truncated.
4380 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4381 mLimit
= maxLimit
= (nba
> 0) && (startIdx
> nba
) ? nba
: startIdx
;
4384 #ifdef USEARCH_DEBUG
4385 if (getenv("USEARCH_DEBUG") != NULL
) {
4386 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4391 if (! checkIdentical(strsrch
, mStart
, mLimit
)) {
4400 #ifdef USEARCH_DEBUG
4401 if (getenv("USEARCH_DEBUG") != NULL
) {
4402 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4403 int32_t lastToPrint
= ceb
.limitIx
+2;
4404 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4405 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4407 printf("\n%s\n", found
? "match found" : "no match");
4411 // All Done. Store back the match bounds to the caller.
4418 if (matchStart
!= NULL
) {
4419 *matchStart
= mStart
;
4422 if (matchLimit
!= NULL
) {
4423 *matchLimit
= mLimit
;
4429 // internal use methods declared in usrchimp.h -----------------------------
4431 UBool
usearch_handleNextExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4433 if (U_FAILURE(*status
)) {
4434 setMatchNotFound(strsrch
);
4439 UCollationElements
*coleiter
= strsrch
->textIter
;
4440 int32_t textlength
= strsrch
->search
->textLength
;
4441 int32_t *patternce
= strsrch
->pattern
.ces
;
4442 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4443 int32_t textoffset
= ucol_getOffset(coleiter
);
4445 // status used in setting coleiter offset, since offset is checked in
4446 // shiftForward before setting the coleiter offset, status never
4448 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4450 while (textoffset
<= textlength
)
4452 uint32_t patternceindex
= patterncelength
- 1;
4454 UBool found
= FALSE
;
4455 int32_t lastce
= UCOL_NULLORDER
;
4457 setColEIterOffset(coleiter
, textoffset
);
4460 // finding the last pattern ce match, imagine composite characters
4461 // for example: search for pattern A in text \u00C0
4462 // we'll have to skip \u0300 the grave first before we get to A
4463 targetce
= ucol_previous(coleiter
, status
);
4464 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4468 targetce
= getCE(strsrch
, targetce
);
4469 if (targetce
== UCOL_IGNORABLE
&& inNormBuf(coleiter
)) {
4470 // this is for the text \u0315\u0300 that requires
4471 // normalization and pattern \u0300, where \u0315 is ignorable
4474 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4477 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4478 if (targetce
== patternce
[patternceindex
]) {
4479 // the first ce can be a contraction
4483 if (!hasExpansion(coleiter
)) {
4489 //targetce = lastce;
4491 while (found
&& patternceindex
> 0) {
4493 targetce
= ucol_previous(coleiter
, status
);
4494 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4498 targetce
= getCE(strsrch
, targetce
);
4499 if (targetce
== UCOL_IGNORABLE
) {
4504 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4505 found
= found
&& targetce
== patternce
[patternceindex
];
4511 if (U_FAILURE(*status
)) {
4514 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4516 // status checked at loop.
4517 patternceindex
= patterncelength
;
4521 if (checkNextExactMatch(strsrch
, &textoffset
, status
)) {
4522 // status checked in ucol_setOffset
4523 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4527 setMatchNotFound(strsrch
);
4530 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4534 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4535 strsrch
->search
->matchedIndex
= start
;
4536 strsrch
->search
->matchedLength
= end
- start
;
4539 setMatchNotFound(strsrch
);
4545 UBool
usearch_handleNextCanonical(UStringSearch
*strsrch
, UErrorCode
*status
)
4547 if (U_FAILURE(*status
)) {
4548 setMatchNotFound(strsrch
);
4553 UCollationElements
*coleiter
= strsrch
->textIter
;
4554 int32_t textlength
= strsrch
->search
->textLength
;
4555 int32_t *patternce
= strsrch
->pattern
.ces
;
4556 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4557 int32_t textoffset
= ucol_getOffset(coleiter
);
4558 UBool hasPatternAccents
=
4559 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4561 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4563 strsrch
->canonicalPrefixAccents
[0] = 0;
4564 strsrch
->canonicalSuffixAccents
[0] = 0;
4566 while (textoffset
<= textlength
)
4568 int32_t patternceindex
= patterncelength
- 1;
4570 UBool found
= FALSE
;
4571 int32_t lastce
= UCOL_NULLORDER
;
4573 setColEIterOffset(coleiter
, textoffset
);
4576 // finding the last pattern ce match, imagine composite characters
4577 // for example: search for pattern A in text \u00C0
4578 // we'll have to skip \u0300 the grave first before we get to A
4579 targetce
= ucol_previous(coleiter
, status
);
4580 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4584 targetce
= getCE(strsrch
, targetce
);
4585 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4588 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4589 if (targetce
== patternce
[patternceindex
]) {
4590 // the first ce can be a contraction
4594 if (!hasExpansion(coleiter
)) {
4600 while (found
&& patternceindex
> 0) {
4601 targetce
= ucol_previous(coleiter
, status
);
4602 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4606 targetce
= getCE(strsrch
, targetce
);
4607 if (targetce
== UCOL_IGNORABLE
) {
4612 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4613 found
= found
&& targetce
== patternce
[patternceindex
];
4616 // initializing the rearranged accent array
4617 if (hasPatternAccents
&& !found
) {
4618 strsrch
->canonicalPrefixAccents
[0] = 0;
4619 strsrch
->canonicalSuffixAccents
[0] = 0;
4620 if (U_FAILURE(*status
)) {
4623 found
= doNextCanonicalMatch(strsrch
, textoffset
, status
);
4627 if (U_FAILURE(*status
)) {
4630 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4632 // status checked at loop
4633 patternceindex
= patterncelength
;
4637 if (checkNextCanonicalMatch(strsrch
, &textoffset
, status
)) {
4638 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4642 setMatchNotFound(strsrch
);
4645 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4649 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4650 strsrch
->search
->matchedIndex
= start
;
4651 strsrch
->search
->matchedLength
= end
- start
;
4654 setMatchNotFound(strsrch
);
4660 UBool
usearch_handlePreviousExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4662 if (U_FAILURE(*status
)) {
4663 setMatchNotFound(strsrch
);
4668 UCollationElements
*coleiter
= strsrch
->textIter
;
4669 int32_t *patternce
= strsrch
->pattern
.ces
;
4670 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4671 int32_t textoffset
= ucol_getOffset(coleiter
);
4673 // shifting it check for setting offset
4674 // if setOffset is called previously or there was no previous match, we
4675 // leave the offset as it is.
4676 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4677 textoffset
= strsrch
->search
->matchedIndex
;
4680 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
4683 while (textoffset
>= 0)
4685 int32_t patternceindex
= 1;
4687 UBool found
= FALSE
;
4688 int32_t firstce
= UCOL_NULLORDER
;
4690 // if status is a failure, ucol_setOffset does nothing
4691 setColEIterOffset(coleiter
, textoffset
);
4694 // finding the first pattern ce match, imagine composite
4695 // characters. for example: search for pattern \u0300 in text
4696 // \u00C0, we'll have to skip A first before we get to
4697 // \u0300 the grave accent
4698 targetce
= ucol_next(coleiter
, status
);
4699 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4703 targetce
= getCE(strsrch
, targetce
);
4704 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
4707 if (targetce
== UCOL_IGNORABLE
&& strsrch
->strength
!= UCOL_PRIMARY
) {
4710 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4711 if (targetce
== patternce
[0]) {
4715 if (!hasExpansion(coleiter
)) {
4716 // checking for accents in composite character
4722 //targetce = firstce;
4724 while (found
&& (patternceindex
< patterncelength
)) {
4726 targetce
= ucol_next(coleiter
, status
);
4727 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4731 targetce
= getCE(strsrch
, targetce
);
4732 if (targetce
== UCOL_IGNORABLE
) {
4736 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4737 found
= found
&& targetce
== patternce
[patternceindex
];
4744 if (U_FAILURE(*status
)) {
4748 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
4754 if (checkPreviousExactMatch(strsrch
, &textoffset
, status
)) {
4755 setColEIterOffset(coleiter
, textoffset
);
4759 setMatchNotFound(strsrch
);
4764 if (strsrch
->search
->isOverlap
) {
4765 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4766 textOffset
= strsrch
->search
->matchedIndex
+ strsrch
->search
->matchedLength
- 1;
4768 // move the start position at the end of possible match
4769 initializePatternPCETable(strsrch
, status
);
4770 if (!initTextProcessedIter(strsrch
, status
)) {
4771 setMatchNotFound(strsrch
);
4774 for (int32_t nPCEs
= 0; nPCEs
< strsrch
->pattern
.pcesLength
- 1; nPCEs
++) {
4775 int64_t pce
= strsrch
->textProcessedIter
->nextProcessed(NULL
, NULL
, status
);
4776 if (pce
== UCOL_PROCESSED_NULLORDER
) {
4777 // at the end of the text
4781 if (U_FAILURE(*status
)) {
4782 setMatchNotFound(strsrch
);
4785 textOffset
= ucol_getOffset(strsrch
->textIter
);
4788 textOffset
= ucol_getOffset(strsrch
->textIter
);
4794 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4795 strsrch
->search
->matchedIndex
= start
;
4796 strsrch
->search
->matchedLength
= end
- start
;
4799 setMatchNotFound(strsrch
);
4805 UBool
usearch_handlePreviousCanonical(UStringSearch
*strsrch
,
4808 if (U_FAILURE(*status
)) {
4809 setMatchNotFound(strsrch
);
4814 UCollationElements
*coleiter
= strsrch
->textIter
;
4815 int32_t *patternce
= strsrch
->pattern
.ces
;
4816 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4817 int32_t textoffset
= ucol_getOffset(coleiter
);
4818 UBool hasPatternAccents
=
4819 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4821 // shifting it check for setting offset
4822 // if setOffset is called previously or there was no previous match, we
4823 // leave the offset as it is.
4824 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4825 textoffset
= strsrch
->search
->matchedIndex
;
4828 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
4830 strsrch
->canonicalPrefixAccents
[0] = 0;
4831 strsrch
->canonicalSuffixAccents
[0] = 0;
4833 while (textoffset
>= 0)
4835 int32_t patternceindex
= 1;
4837 UBool found
= FALSE
;
4838 int32_t firstce
= UCOL_NULLORDER
;
4840 setColEIterOffset(coleiter
, textoffset
);
4842 // finding the first pattern ce match, imagine composite
4843 // characters. for example: search for pattern \u0300 in text
4844 // \u00C0, we'll have to skip A first before we get to
4845 // \u0300 the grave accent
4846 targetce
= ucol_next(coleiter
, status
);
4847 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4851 targetce
= getCE(strsrch
, targetce
);
4852 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
4856 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4857 if (targetce
== patternce
[0]) {
4858 // the first ce can be a contraction
4862 if (!hasExpansion(coleiter
)) {
4863 // checking for accents in composite character
4871 while (found
&& patternceindex
< patterncelength
) {
4872 targetce
= ucol_next(coleiter
, status
);
4873 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4877 targetce
= getCE(strsrch
, targetce
);
4878 if (targetce
== UCOL_IGNORABLE
) {
4882 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4883 found
= found
&& targetce
== patternce
[patternceindex
];
4887 // initializing the rearranged accent array
4888 if (hasPatternAccents
&& !found
) {
4889 strsrch
->canonicalPrefixAccents
[0] = 0;
4890 strsrch
->canonicalSuffixAccents
[0] = 0;
4891 if (U_FAILURE(*status
)) {
4894 found
= doPreviousCanonicalMatch(strsrch
, textoffset
, status
);
4898 if (U_FAILURE(*status
)) {
4901 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
4907 if (checkPreviousCanonicalMatch(strsrch
, &textoffset
, status
)) {
4908 setColEIterOffset(coleiter
, textoffset
);
4912 setMatchNotFound(strsrch
);
4917 if (strsrch
->search
->isOverlap
) {
4918 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4919 textOffset
= strsrch
->search
->matchedIndex
+ strsrch
->search
->matchedLength
- 1;
4921 // move the start position at the end of possible match
4922 initializePatternPCETable(strsrch
, status
);
4923 if (!initTextProcessedIter(strsrch
, status
)) {
4924 setMatchNotFound(strsrch
);
4927 for (int32_t nPCEs
= 0; nPCEs
< strsrch
->pattern
.pcesLength
- 1; nPCEs
++) {
4928 int64_t pce
= strsrch
->textProcessedIter
->nextProcessed(NULL
, NULL
, status
);
4929 if (pce
== UCOL_PROCESSED_NULLORDER
) {
4930 // at the end of the text
4934 if (U_FAILURE(*status
)) {
4935 setMatchNotFound(strsrch
);
4938 textOffset
= ucol_getOffset(strsrch
->textIter
);
4941 textOffset
= ucol_getOffset(strsrch
->textIter
);
4947 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4948 strsrch
->search
->matchedIndex
= start
;
4949 strsrch
->search
->matchedLength
= end
- start
;
4952 setMatchNotFound(strsrch
);
4958 #endif /* #if !UCONFIG_NO_COLLATION */