2 **********************************************************************
3 * Copyright (C) 2001-2015 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unicode/utf16.h"
18 #include "normalizer2impl.h"
27 // don't use Boyer-Moore
28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
31 // internal definition ---------------------------------------------------
33 #define LAST_BYTE_MASK_ 0xFF
34 #define SECOND_LAST_BYTE_SHIFT_ 8
35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
37 static const Normalizer2Impl
*g_nfcImpl
= NULL
;
39 // internal methods -------------------------------------------------
42 * Fast collation element iterator setOffset.
43 * This function does not check for bounds.
44 * @param coleiter collation element iterator
45 * @param offset to set
48 inline void setColEIterOffset(UCollationElements
*elems
,
51 // Note: Not "fast" any more after the 2013 collation rewrite.
52 // We do not want to expose more internals than necessary.
53 UErrorCode status
= U_ZERO_ERROR
;
54 ucol_setOffset(elems
, offset
, &status
);
58 * Getting the mask for collation strength
59 * @param strength collation strength
60 * @return collation element mask
63 inline uint32_t getMask(UCollationStrength strength
)
68 return UCOL_PRIMARYORDERMASK
;
70 return UCOL_SECONDARYORDERMASK
| UCOL_PRIMARYORDERMASK
;
72 return UCOL_TERTIARYORDERMASK
| UCOL_SECONDARYORDERMASK
|
73 UCOL_PRIMARYORDERMASK
;
78 * @param ce 32-bit collation element
82 inline int hashFromCE32(uint32_t ce
)
85 ((((((ce
>> 24) * 37) +
89 hc
%= MAX_TABLE_SIZE_
;
91 hc
+= MAX_TABLE_SIZE_
;
97 static UBool U_CALLCONV
98 usearch_cleanup(void) {
105 * Initializing the fcd tables.
106 * Internal method, status assumed to be a success.
107 * @param status output error if any, caller to check status before calling
108 * method, status assumed to be success when passed in.
111 inline void initializeFCD(UErrorCode
*status
)
113 if (g_nfcImpl
== NULL
) {
114 g_nfcImpl
= Normalizer2Factory::getNFCImpl(*status
);
115 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH
, usearch_cleanup
);
120 * Gets the fcd value for a character at the argument index.
121 * This method takes into accounts of the supplementary characters.
122 * @param str UTF16 string where character for fcd retrieval resides
123 * @param offset position of the character whose fcd is to be retrieved, to be
124 * overwritten with the next character position, taking
125 * surrogate characters into consideration.
126 * @param strlength length of the argument string
130 uint16_t getFCD(const UChar
*str
, int32_t *offset
,
133 const UChar
*temp
= str
+ *offset
;
134 uint16_t result
= g_nfcImpl
->nextFCD16(temp
, str
+ strlength
);
135 *offset
= (int32_t)(temp
- str
);
140 * Getting the modified collation elements taking into account the collation
142 * @param strsrch string search data
144 * @return the modified collation element
147 inline int32_t getCE(const UStringSearch
*strsrch
, uint32_t sourcece
)
149 // note for tertiary we can't use the collator->tertiaryMask, that
150 // is a preprocessed mask that takes into account case options. since
151 // we are only concerned with exact matches, we don't need that.
152 sourcece
&= strsrch
->ceMask
;
154 if (strsrch
->toShift
) {
155 // alternate handling here, since only the 16 most significant digits
156 // is only used, we can safely do a compare without masking
157 // if the ce is a variable, we mask and get only the primary values
158 // no shifting to quartenary is required since all primary values
159 // less than variabletop will need to be masked off anyway.
160 if (strsrch
->variableTop
> sourcece
) {
161 if (strsrch
->strength
>= UCOL_QUATERNARY
) {
162 sourcece
&= UCOL_PRIMARYORDERMASK
;
165 sourcece
= UCOL_IGNORABLE
;
168 } else if (strsrch
->strength
>= UCOL_QUATERNARY
&& sourcece
== UCOL_IGNORABLE
) {
176 * Allocate a memory and returns NULL if it failed.
177 * Internal method, status assumed to be a success.
178 * @param size to allocate
179 * @param status output error if any, caller to check status before calling
180 * method, status assumed to be success when passed in.
181 * @return newly allocated array, NULL otherwise
184 inline void * allocateMemory(uint32_t size
, UErrorCode
*status
)
186 uint32_t *result
= (uint32_t *)uprv_malloc(size
);
187 if (result
== NULL
) {
188 *status
= U_MEMORY_ALLOCATION_ERROR
;
194 * Adds a uint32_t value to a destination array.
195 * Creates a new array if we run out of space. The caller will have to
196 * manually deallocate the newly allocated array.
197 * Internal method, status assumed to be success, caller has to check status
198 * before calling this method. destination not to be NULL and has at least
199 * size destinationlength.
200 * @param destination target array
201 * @param offset destination offset to add value
202 * @param destinationlength target array size, return value for the new size
203 * @param value to be added
204 * @param increments incremental size expected
205 * @param status output error if any, caller to check status before calling
206 * method, status assumed to be success when passed in.
207 * @return new destination array, destination if there was no new allocation
210 inline int32_t * addTouint32_tArray(int32_t *destination
,
212 uint32_t *destinationlength
,
217 uint32_t newlength
= *destinationlength
;
218 if (offset
+ 1 == newlength
) {
219 newlength
+= increments
;
220 int32_t *temp
= (int32_t *)allocateMemory(
221 sizeof(int32_t) * newlength
, status
);
222 if (U_FAILURE(*status
)) {
225 uprv_memcpy(temp
, destination
, sizeof(int32_t) * offset
);
226 *destinationlength
= newlength
;
229 destination
[offset
] = value
;
234 * Adds a uint64_t value to a destination array.
235 * Creates a new array if we run out of space. The caller will have to
236 * manually deallocate the newly allocated array.
237 * Internal method, status assumed to be success, caller has to check status
238 * before calling this method. destination not to be NULL and has at least
239 * size destinationlength.
240 * @param destination target array
241 * @param offset destination offset to add value
242 * @param destinationlength target array size, return value for the new size
243 * @param value to be added
244 * @param increments incremental size expected
245 * @param status output error if any, caller to check status before calling
246 * method, status assumed to be success when passed in.
247 * @return new destination array, destination if there was no new allocation
250 inline int64_t * addTouint64_tArray(int64_t *destination
,
252 uint32_t *destinationlength
,
257 uint32_t newlength
= *destinationlength
;
258 if (offset
+ 1 == newlength
) {
259 newlength
+= increments
;
260 int64_t *temp
= (int64_t *)allocateMemory(
261 sizeof(int64_t) * newlength
, status
);
263 if (U_FAILURE(*status
)) {
267 uprv_memcpy(temp
, destination
, sizeof(int64_t) * offset
);
268 *destinationlength
= newlength
;
272 destination
[offset
] = value
;
278 * Initializing the ce table for a pattern.
279 * Stores non-ignorable collation keys.
280 * Table size will be estimated by the size of the pattern text. Table
281 * expansion will be perform as we go along. Adding 1 to ensure that the table
282 * size definitely increases.
283 * Internal method, status assumed to be a success.
284 * @param strsrch string search data
285 * @param status output error if any, caller to check status before calling
286 * method, status assumed to be success when passed in.
287 * @return total number of expansions
290 inline uint16_t initializePatternCETable(UStringSearch
*strsrch
,
293 UPattern
*pattern
= &(strsrch
->pattern
);
294 uint32_t cetablesize
= INITIAL_ARRAY_SIZE_
;
295 int32_t *cetable
= pattern
->cesBuffer
;
296 uint32_t patternlength
= pattern
->textLength
;
297 UCollationElements
*coleiter
= strsrch
->utilIter
;
299 if (coleiter
== NULL
) {
300 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
301 patternlength
, status
);
302 // status will be checked in ucol_next(..) later and if it is an
303 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
305 strsrch
->utilIter
= coleiter
;
308 ucol_setText(coleiter
, pattern
->text
, pattern
->textLength
, status
);
310 if(U_FAILURE(*status
)) {
314 if (pattern
->ces
!= cetable
&& pattern
->ces
) {
315 uprv_free(pattern
->ces
);
322 while ((ce
= ucol_next(coleiter
, status
)) != UCOL_NULLORDER
&&
323 U_SUCCESS(*status
)) {
324 uint32_t newce
= getCE(strsrch
, ce
);
326 int32_t *temp
= addTouint32_tArray(cetable
, offset
, &cetablesize
,
328 patternlength
- ucol_getOffset(coleiter
) + 1,
330 if (U_FAILURE(*status
)) {
334 if (cetable
!= temp
&& cetable
!= pattern
->cesBuffer
) {
339 result
+= (uint16_t)(ucol_getMaxExpansion(coleiter
, ce
) - 1);
343 pattern
->ces
= cetable
;
344 pattern
->cesLength
= offset
;
350 * Initializing the pce table for a pattern.
351 * Stores non-ignorable collation keys.
352 * Table size will be estimated by the size of the pattern text. Table
353 * expansion will be perform as we go along. Adding 1 to ensure that the table
354 * size definitely increases.
355 * Internal method, status assumed to be a success.
356 * @param strsrch string search data
357 * @param status output error if any, caller to check status before calling
358 * method, status assumed to be success when passed in.
359 * @return total number of expansions
362 inline uint16_t initializePatternPCETable(UStringSearch
*strsrch
,
365 UPattern
*pattern
= &(strsrch
->pattern
);
366 uint32_t pcetablesize
= INITIAL_ARRAY_SIZE_
;
367 int64_t *pcetable
= pattern
->pcesBuffer
;
368 uint32_t patternlength
= pattern
->textLength
;
369 UCollationElements
*coleiter
= strsrch
->utilIter
;
371 if (coleiter
== NULL
) {
372 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
373 patternlength
, status
);
374 // status will be checked in ucol_next(..) later and if it is an
375 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
377 strsrch
->utilIter
= coleiter
;
379 ucol_setText(coleiter
, pattern
->text
, pattern
->textLength
, status
);
381 if(U_FAILURE(*status
)) {
385 if (pattern
->pces
!= pcetable
&& pattern
->pces
!= NULL
) {
386 uprv_free(pattern
->pces
);
393 icu::UCollationPCE
iter(coleiter
);
395 // ** Should processed CEs be signed or unsigned?
396 // ** (the rest of the code in this file seems to play fast-and-loose with
397 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
398 while ((pce
= iter
.nextProcessed(NULL
, NULL
, status
)) != UCOL_PROCESSED_NULLORDER
&&
399 U_SUCCESS(*status
)) {
400 int64_t *temp
= addTouint64_tArray(pcetable
, offset
, &pcetablesize
,
402 patternlength
- ucol_getOffset(coleiter
) + 1,
405 if (U_FAILURE(*status
)) {
411 if (pcetable
!= temp
&& pcetable
!= pattern
->pcesBuffer
) {
416 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
419 pcetable
[offset
] = 0;
420 pattern
->pces
= pcetable
;
421 pattern
->pcesLength
= offset
;
427 * Initializes the pattern struct.
428 * Internal method, status assumed to be success.
429 * @param strsrch UStringSearch data storage
430 * @param status output error if any, caller to check status before calling
431 * method, status assumed to be success when passed in.
432 * @return expansionsize the total expansion size of the pattern
435 inline int16_t initializePattern(UStringSearch
*strsrch
, UErrorCode
*status
)
437 if (U_FAILURE(*status
)) { return 0; }
438 UPattern
*pattern
= &(strsrch
->pattern
);
439 const UChar
*patterntext
= pattern
->text
;
440 int32_t length
= pattern
->textLength
;
443 // Since the strength is primary, accents are ignored in the pattern.
444 if (strsrch
->strength
== UCOL_PRIMARY
) {
445 pattern
->hasPrefixAccents
= 0;
446 pattern
->hasSuffixAccents
= 0;
448 pattern
->hasPrefixAccents
= getFCD(patterntext
, &index
, length
) >>
449 SECOND_LAST_BYTE_SHIFT_
;
451 U16_BACK_1(patterntext
, 0, index
);
452 pattern
->hasSuffixAccents
= getFCD(patterntext
, &index
, length
) &
457 if (strsrch
->pattern
.pces
!= NULL
) {
458 if (strsrch
->pattern
.pces
!= strsrch
->pattern
.pcesBuffer
) {
459 uprv_free(strsrch
->pattern
.pces
);
462 strsrch
->pattern
.pces
= NULL
;
465 // since intializePattern is an internal method status is a success.
466 return initializePatternCETable(strsrch
, status
);
470 * Initializing shift tables, with the default values.
471 * If a corresponding default value is 0, the shift table is not set.
472 * @param shift table for forwards shift
473 * @param backshift table for backwards shift
474 * @param cetable table containing pattern ce
475 * @param cesize size of the pattern ces
476 * @param expansionsize total size of the expansions
477 * @param defaultforward the default forward value
478 * @param defaultbackward the default backward value
481 inline void setShiftTable(int16_t shift
[], int16_t backshift
[],
482 int32_t *cetable
, int32_t cesize
,
483 int16_t expansionsize
,
484 int16_t defaultforward
,
485 int16_t defaultbackward
)
487 // estimate the value to shift. to do that we estimate the smallest
488 // number of characters to give the relevant ces, ie approximately
489 // the number of ces minus their expansion, since expansions can come
492 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
493 shift
[count
] = defaultforward
;
495 cesize
--; // down to the last index
496 for (count
= 0; count
< cesize
; count
++) {
497 // number of ces from right of array to the count
498 int temp
= defaultforward
- count
- 1;
499 shift
[hashFromCE32(cetable
[count
])] = temp
> 1 ? temp
: 1;
501 shift
[hashFromCE32(cetable
[cesize
])] = 1;
502 // for ignorables we just shift by one. see test examples.
503 shift
[hashFromCE32(0)] = 1;
505 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
506 backshift
[count
] = defaultbackward
;
508 for (count
= cesize
; count
> 0; count
--) {
509 // the original value count does not seem to work
510 backshift
[hashFromCE32(cetable
[count
])] = count
> expansionsize
?
511 (int16_t)(count
- expansionsize
) : 1;
513 backshift
[hashFromCE32(cetable
[0])] = 1;
514 backshift
[hashFromCE32(0)] = 1;
518 * Building of the pattern collation element list and the boyer moore strsrch
520 * The canonical match will only be performed after the default match fails.
521 * For both cases we need to remember the size of the composed and decomposed
522 * versions of the string. Since the Boyer-Moore shift calculations shifts by
523 * a number of characters in the text and tries to match the pattern from that
524 * offset, the shift value can not be too large in case we miss some
525 * characters. To choose a right shift size, we estimate the NFC form of the
526 * and use its size as a shift guide. The NFC form should be the small
527 * possible representation of the pattern. Anyways, we'll err on the smaller
528 * shift size. Hence the calculation for minlength.
529 * Canonical match will be performed slightly differently. We'll split the
530 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
531 * the first and last base character (MS), the ending accents (EA). Matches
532 * will be done on MS first, and only when we match MS then some processing
533 * will be required for the prefix and end accents in order to determine if
534 * they match PA and EA. Hence the default shift values
535 * for the canonical match will take the size of either end's accent into
536 * consideration. Forwards search will take the end accents into consideration
537 * for the default shift values and the backwards search will take the prefix
538 * accents into consideration.
539 * If pattern has no non-ignorable ce, we return a illegal argument error.
540 * Internal method, status assumed to be success.
541 * @param strsrch UStringSearch data storage
542 * @param status for output errors if it occurs, status is assumed to be a
543 * success when it is passed in.
546 inline void initialize(UStringSearch
*strsrch
, UErrorCode
*status
)
548 int16_t expandlength
= initializePattern(strsrch
, status
);
549 if (U_SUCCESS(*status
) && strsrch
->pattern
.cesLength
> 0) {
550 UPattern
*pattern
= &strsrch
->pattern
;
551 int32_t cesize
= pattern
->cesLength
;
553 int16_t minlength
= cesize
> expandlength
554 ? (int16_t)cesize
- expandlength
: 1;
555 pattern
->defaultShiftSize
= minlength
;
556 setShiftTable(pattern
->shift
, pattern
->backShift
, pattern
->ces
,
557 cesize
, expandlength
, minlength
, minlength
);
560 strsrch
->pattern
.defaultShiftSize
= 0;
565 * Check to make sure that the match length is at the end of the character by
566 * using the breakiterator.
567 * @param strsrch string search data
568 * @param start target text start offset
569 * @param end target text end offset
572 void checkBreakBoundary(const UStringSearch
*strsrch
, int32_t * /*start*/,
575 #if !UCONFIG_NO_BREAK_ITERATION
576 UBreakIterator
*breakiterator
= strsrch
->search
->internalBreakIter
;
578 int32_t matchend
= *end
;
579 //int32_t matchstart = *start;
581 if (!ubrk_isBoundary(breakiterator
, matchend
)) {
582 *end
= ubrk_following(breakiterator
, matchend
);
585 /* Check the start of the matched text to make sure it doesn't have any accents
586 * before it. This code may not be necessary and so it is commented out */
587 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
588 *start = ubrk_preceding(breakiterator, matchstart);
595 * Determine whether the target text in UStringSearch bounded by the offset
596 * start and end is one or more whole units of text as
597 * determined by the breakiterator in UStringSearch.
598 * @param strsrch string search data
599 * @param start target text start offset
600 * @param end target text end offset
603 UBool
isBreakUnit(const UStringSearch
*strsrch
, int32_t start
,
606 #if !UCONFIG_NO_BREAK_ITERATION
607 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
610 int32_t startindex
= ubrk_first(breakiterator
);
611 int32_t endindex
= ubrk_last(breakiterator
);
613 // out-of-range indexes are never boundary positions
614 if (start
< startindex
|| start
> endindex
||
615 end
< startindex
|| end
> endindex
) {
618 // otherwise, we can use following() on the position before the
619 // specified one and return true of the position we get back is the
620 // one the user specified
621 UBool result
= (start
== startindex
||
622 ubrk_following(breakiterator
, start
- 1) == start
) &&
624 ubrk_following(breakiterator
, end
- 1) == end
);
626 // iterates the individual ces
627 UCollationElements
*coleiter
= strsrch
->utilIter
;
628 const UChar
*text
= strsrch
->search
->text
+
630 UErrorCode status
= U_ZERO_ERROR
;
631 ucol_setText(coleiter
, text
, end
- start
, &status
);
632 for (int32_t count
= 0; count
< strsrch
->pattern
.cesLength
;
634 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
635 if (ce
== UCOL_IGNORABLE
) {
639 if (U_FAILURE(status
) || ce
!= strsrch
->pattern
.ces
[count
]) {
643 int32_t nextce
= ucol_next(coleiter
, &status
);
644 while (ucol_getOffset(coleiter
) == (end
- start
)
645 && getCE(strsrch
, nextce
) == UCOL_IGNORABLE
) {
646 nextce
= ucol_next(coleiter
, &status
);
648 if (ucol_getOffset(coleiter
) == (end
- start
)
649 && nextce
!= UCOL_NULLORDER
) {
650 // extra collation elements at the end of the match
661 * Getting the next base character offset if current offset is an accent,
662 * or the current offset if the current character contains a base character.
663 * accents the following base character will be returned
665 * @param textoffset current offset
666 * @param textlength length of text string
667 * @return the next base character or the current offset
668 * if the current character is contains a base character.
671 inline int32_t getNextBaseOffset(const UChar
*text
,
675 if (textoffset
< textlength
) {
676 int32_t temp
= textoffset
;
677 if (getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
678 while (temp
< textlength
) {
679 int32_t result
= temp
;
680 if ((getFCD(text
, &temp
, textlength
) >>
681 SECOND_LAST_BYTE_SHIFT_
) == 0) {
692 * Gets the next base character offset depending on the string search pattern
694 * @param strsrch string search data
695 * @param textoffset current offset, one offset away from the last character
697 * @return start index of the next base character or the current offset
698 * if the current character is contains a base character.
701 inline int32_t getNextUStringSearchBaseOffset(UStringSearch
*strsrch
,
704 int32_t textlength
= strsrch
->search
->textLength
;
705 if (strsrch
->pattern
.hasSuffixAccents
&&
706 textoffset
< textlength
) {
707 int32_t temp
= textoffset
;
708 const UChar
*text
= strsrch
->search
->text
;
709 U16_BACK_1(text
, 0, temp
);
710 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
711 return getNextBaseOffset(text
, textoffset
, textlength
);
718 * Shifting the collation element iterator position forward to prepare for
719 * a following match. If the last character is a unsafe character, we'll only
720 * shift by 1 to capture contractions, normalization etc.
721 * Internal method, status assumed to be success.
722 * @param text strsrch string search data
723 * @param textoffset start text position to do search
724 * @param ce the text ce which failed the match.
725 * @param patternceindex index of the ce within the pattern ce buffer which
727 * @return final offset
730 inline int32_t shiftForward(UStringSearch
*strsrch
,
733 int32_t patternceindex
)
735 UPattern
*pattern
= &(strsrch
->pattern
);
736 if (ce
!= UCOL_NULLORDER
) {
737 int32_t shift
= pattern
->shift
[hashFromCE32(ce
)];
738 // this is to adjust for characters in the middle of the
739 // substring for matching that failed.
740 int32_t adjust
= pattern
->cesLength
- patternceindex
;
741 if (adjust
> 1 && shift
>= adjust
) {
747 textoffset
+= pattern
->defaultShiftSize
;
750 textoffset
= getNextUStringSearchBaseOffset(strsrch
, textoffset
);
751 // check for unsafe characters
752 // * if it is the start or middle of a contraction: to be done after
753 // a initial match is found
754 // * thai or lao base consonant character: similar to contraction
755 // * high surrogate character: similar to contraction
756 // * next character is a accent: shift to the next base character
759 #endif // #if BOYER_MOORE
762 * sets match not found
763 * @param strsrch string search data
766 inline void setMatchNotFound(UStringSearch
*strsrch
)
768 // this method resets the match result regardless of the error status.
769 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
770 strsrch
->search
->matchedLength
= 0;
771 if (strsrch
->search
->isForwardSearching
) {
772 setColEIterOffset(strsrch
->textIter
, strsrch
->search
->textLength
);
775 setColEIterOffset(strsrch
->textIter
, 0);
781 * Gets the offset to the next safe point in text.
782 * ie. not the middle of a contraction, swappable characters or supplementary
784 * @param collator collation sata
785 * @param text string to work with
786 * @param textoffset offset in string
787 * @param textlength length of text string
788 * @return offset to the next safe character
791 inline int32_t getNextSafeOffset(const UCollator
*collator
,
796 int32_t result
= textoffset
; // first contraction character
797 while (result
!= textlength
&& ucol_unsafeCP(text
[result
], collator
)) {
804 * This checks for accents in the potential match started with a .
805 * composite character.
806 * This is really painful... we have to check that composite character do not
807 * have any extra accents. We have to normalize the potential match and find
808 * the immediate decomposed character before the match.
809 * The first composite character would have been taken care of by the fcd
810 * checks in checkForwardExactMatch.
811 * This is the slow path after the fcd of the first character and
812 * the last character has been checked by checkForwardExactMatch and we
813 * determine that the potential match has extra non-ignorable preceding
815 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
816 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
817 * Note here that accents checking are slow and cautioned in the API docs.
818 * Internal method, status assumed to be a success, caller should check status
819 * before calling this method
820 * @param strsrch string search data
821 * @param start index of the potential unfriendly composite character
822 * @param end index of the potential unfriendly composite character
823 * @param status output error status if any.
824 * @return TRUE if there is non-ignorable accents before at the beginning
825 * of the match, FALSE otherwise.
829 UBool
checkExtraMatchAccents(const UStringSearch
*strsrch
, int32_t start
,
833 UBool result
= FALSE
;
834 if (strsrch
->pattern
.hasPrefixAccents
) {
835 int32_t length
= end
- start
;
837 const UChar
*text
= strsrch
->search
->text
+ start
;
839 U16_FWD_1(text
, offset
, length
);
840 // we are only concerned with the first composite character
841 if (unorm_quickCheck(text
, offset
, UNORM_NFD
, status
) == UNORM_NO
) {
842 int32_t safeoffset
= getNextSafeOffset(strsrch
->collator
,
844 if (safeoffset
!= length
) {
848 UChar buffer
[INITIAL_ARRAY_SIZE_
];
849 int32_t size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0,
850 buffer
, INITIAL_ARRAY_SIZE_
,
852 if (U_FAILURE(*status
)) {
855 if (size
>= INITIAL_ARRAY_SIZE_
) {
856 norm
= (UChar
*)allocateMemory((size
+ 1) * sizeof(UChar
),
858 // if allocation failed, status will be set to
859 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
861 size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0, norm
,
863 if (U_FAILURE(*status
) && norm
!= NULL
) {
872 UCollationElements
*coleiter
= strsrch
->utilIter
;
873 ucol_setText(coleiter
, norm
, size
, status
);
874 uint32_t firstce
= strsrch
->pattern
.ces
[0];
875 UBool ignorable
= TRUE
;
876 uint32_t ce
= UCOL_IGNORABLE
;
877 while (U_SUCCESS(*status
) && ce
!= firstce
&& ce
!= (uint32_t)UCOL_NULLORDER
) {
878 offset
= ucol_getOffset(coleiter
);
879 if (ce
!= firstce
&& ce
!= UCOL_IGNORABLE
) {
882 ce
= ucol_next(coleiter
, status
);
885 U16_PREV(norm
, 0, offset
, codepoint
);
886 result
= !ignorable
&& (u_getCombiningClass(codepoint
) != 0);
888 if (norm
!= buffer
) {
898 * Used by exact matches, checks if there are accents before the match.
899 * This is really painful... we have to check that composite characters at
900 * the start of the matches have to not have any extra accents.
901 * We check the FCD of the character first, if it starts with an accent and
902 * the first pattern ce does not match the first ce of the character, we bail.
903 * Otherwise we try normalizing the first composite
904 * character and find the immediate decomposed character before the match to
905 * see if it is an non-ignorable accent.
906 * Now normalizing the first composite character is enough because we ensure
907 * that when the match is passed in here with extra beginning ces, the
908 * first or last ce that match has to occur within the first character.
909 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
910 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
911 * Note here that accents checking are slow and cautioned in the API docs.
912 * @param strsrch string search data
913 * @param start offset
915 * @return TRUE if there are accents on either side of the match,
919 UBool
hasAccentsBeforeMatch(const UStringSearch
*strsrch
, int32_t start
,
922 if (strsrch
->pattern
.hasPrefixAccents
) {
923 UCollationElements
*coleiter
= strsrch
->textIter
;
924 UErrorCode status
= U_ZERO_ERROR
;
925 // we have been iterating forwards previously
926 uint32_t ignorable
= TRUE
;
927 int32_t firstce
= strsrch
->pattern
.ces
[0];
929 setColEIterOffset(coleiter
, start
);
930 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
931 if (U_FAILURE(status
)) {
934 while (ce
!= firstce
) {
935 if (ce
!= UCOL_IGNORABLE
) {
938 ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
939 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
943 if (!ignorable
&& inNormBuf(coleiter
)) {
944 // within normalization buffer, discontiguous handled here
949 int32_t temp
= start
;
951 // accent = (getFCD(strsrch->search->text, &temp,
952 // strsrch->search->textLength)
953 // >> SECOND_LAST_BYTE_SHIFT_);
954 // however this code does not work well with VC7 .net in release mode.
955 // maybe the inlines for getFCD combined with shifting has bugs in
956 // VC7. anyways this is a work around.
957 UBool accent
= getFCD(strsrch
->search
->text
, &temp
,
958 strsrch
->search
->textLength
) > 0xFF;
960 return checkExtraMatchAccents(strsrch
, start
, end
, &status
);
967 U16_BACK_1(strsrch
->search
->text
, 0, temp
);
968 if (getFCD(strsrch
->search
->text
, &temp
,
969 strsrch
->search
->textLength
) & LAST_BYTE_MASK_
) {
970 setColEIterOffset(coleiter
, start
);
971 ce
= ucol_previous(coleiter
, &status
);
972 if (U_FAILURE(status
) ||
973 (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
)) {
984 * Used by exact matches, checks if there are accents bounding the match.
985 * Note this is the initial boundary check. If the potential match
986 * starts or ends with composite characters, the accents in those
987 * characters will be determined later.
988 * Not doing backwards iteration here, since discontiguos contraction for
989 * backwards collation element iterator, use up too many characters.
990 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
991 * should fail since there is a acute at the end of \u01FA
992 * Note here that accents checking are slow and cautioned in the API docs.
993 * @param strsrch string search data
994 * @param start offset of match
995 * @param end end offset of the match
996 * @return TRUE if there are accents on either side of the match,
1000 UBool
hasAccentsAfterMatch(const UStringSearch
*strsrch
, int32_t start
,
1003 if (strsrch
->pattern
.hasSuffixAccents
) {
1004 const UChar
*text
= strsrch
->search
->text
;
1006 int32_t textlength
= strsrch
->search
->textLength
;
1007 U16_BACK_1(text
, 0, temp
);
1008 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
1009 int32_t firstce
= strsrch
->pattern
.ces
[0];
1010 UCollationElements
*coleiter
= strsrch
->textIter
;
1011 UErrorCode status
= U_ZERO_ERROR
;
1013 setColEIterOffset(coleiter
, start
);
1014 while ((ce
= getCE(strsrch
, ucol_next(coleiter
, &status
))) != firstce
) {
1015 if (U_FAILURE(status
) || ce
== UCOL_NULLORDER
) {
1020 while (count
< strsrch
->pattern
.cesLength
) {
1021 if (getCE(strsrch
, ucol_next(coleiter
, &status
))
1022 == UCOL_IGNORABLE
) {
1023 // Thai can give an ignorable here.
1026 if (U_FAILURE(status
)) {
1032 ce
= ucol_next(coleiter
, &status
);
1033 if (U_FAILURE(status
)) {
1036 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1037 ce
= getCE(strsrch
, ce
);
1039 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
1040 if (ucol_getOffset(coleiter
) <= end
) {
1043 if (getFCD(text
, &end
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
1051 #endif // #if BOYER_MOORE
1054 * Checks if the offset runs out of the text string
1056 * @param textlength of the text string
1057 * @return TRUE if offset is out of bounds, FALSE otherwise
1060 inline UBool
isOutOfBounds(int32_t textlength
, int32_t offset
)
1062 return offset
< 0 || offset
> textlength
;
1066 * Checks for identical match
1067 * @param strsrch string search data
1068 * @param start offset of possible match
1069 * @param end offset of possible match
1070 * @return TRUE if identical match is found
1073 inline UBool
checkIdentical(const UStringSearch
*strsrch
, int32_t start
,
1076 if (strsrch
->strength
!= UCOL_IDENTICAL
) {
1080 // Note: We could use Normalizer::compare() or similar, but for short strings
1081 // which may not be in FCD it might be faster to just NFD them.
1082 UErrorCode status
= U_ZERO_ERROR
;
1083 UnicodeString t2
, p2
;
1084 strsrch
->nfd
->normalize(
1085 UnicodeString(FALSE
, strsrch
->search
->text
+ start
, end
- start
), t2
, status
);
1086 strsrch
->nfd
->normalize(
1087 UnicodeString(FALSE
, strsrch
->pattern
.text
, strsrch
->pattern
.textLength
), p2
, status
);
1088 // return FALSE if NFD failed
1089 return U_SUCCESS(status
) && t2
== p2
;
1094 * Checks to see if the match is repeated
1095 * @param strsrch string search data
1096 * @param start new match start index
1097 * @param end new match end index
1098 * @return TRUE if the the match is repeated, FALSE otherwise
1101 inline UBool
checkRepeatedMatch(UStringSearch
*strsrch
,
1105 int32_t lastmatchindex
= strsrch
->search
->matchedIndex
;
1107 if (lastmatchindex
== USEARCH_DONE
) {
1110 if (strsrch
->search
->isForwardSearching
) {
1111 result
= start
<= lastmatchindex
;
1114 result
= start
>= lastmatchindex
;
1116 if (!result
&& !strsrch
->search
->isOverlap
) {
1117 if (strsrch
->search
->isForwardSearching
) {
1118 result
= start
< lastmatchindex
+ strsrch
->search
->matchedLength
;
1121 result
= end
> lastmatchindex
;
1128 * Gets the collation element iterator's current offset.
1129 * @param coleiter collation element iterator
1130 * @param forwards flag TRUE if we are moving in th forwards direction
1131 * @return current offset
1134 inline int32_t getColElemIterOffset(const UCollationElements
*coleiter
,
1137 int32_t result
= ucol_getOffset(coleiter
);
1138 // intricacies of the the backwards collation element iterator
1139 if (FALSE
&& !forwards
&& inNormBuf(coleiter
) && !isFCDPointerNull(coleiter
)) {
1146 * Checks match for contraction.
1147 * If the match ends with a partial contraction we fail.
1148 * If the match starts too far off (because of backwards iteration) we try to
1149 * chip off the extra characters depending on whether a breakiterator has
1151 * Internal method, error assumed to be success, caller has to check status
1152 * before calling this method.
1153 * @param strsrch string search data
1154 * @param start offset of potential match, to be modified if necessary
1155 * @param end offset of potential match, to be modified if necessary
1156 * @param status output error status if any
1157 * @return TRUE if match passes the contraction test, FALSE otherwise
1161 UBool
checkNextExactContractionMatch(UStringSearch
*strsrch
,
1163 int32_t *end
, UErrorCode
*status
)
1165 UCollationElements
*coleiter
= strsrch
->textIter
;
1166 int32_t textlength
= strsrch
->search
->textLength
;
1167 int32_t temp
= *start
;
1168 const UCollator
*collator
= strsrch
->collator
;
1169 const UChar
*text
= strsrch
->search
->text
;
1170 // This part checks if either ends of the match contains potential
1171 // contraction. If so we'll have to iterate through them
1172 // The start contraction needs to be checked since ucol_previous dumps
1173 // all characters till the first safe character into the buffer.
1174 // *start + 1 is used to test for the unsafe characters instead of *start
1175 // because ucol_prev takes all unsafe characters till the first safe
1176 // character ie *start. so by testing *start + 1, we can estimate if
1177 // excess prefix characters has been included in the potential search
1179 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1180 (*start
+ 1 < textlength
1181 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1182 int32_t expansion
= getExpansionPrefix(coleiter
);
1183 UBool expandflag
= expansion
> 0;
1184 setColEIterOffset(coleiter
, *start
);
1185 while (expansion
> 0) {
1186 // getting rid of the redundant ce, caused by setOffset.
1187 // since backward contraction/expansion may have extra ces if we
1188 // are in the normalization buffer, hasAccentsBeforeMatch would
1189 // have taken care of it.
1190 // E.g. the character \u01FA will have an expansion of 3, but if
1191 // we are only looking for acute and ring \u030A and \u0301, we'll
1192 // have to skip the first ce in the expansion buffer.
1193 ucol_next(coleiter
, status
);
1194 if (U_FAILURE(*status
)) {
1197 if (ucol_getOffset(coleiter
) != temp
) {
1199 temp
= ucol_getOffset(coleiter
);
1204 int32_t *patternce
= strsrch
->pattern
.ces
;
1205 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
1207 while (count
< patterncelength
) {
1208 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1209 if (ce
== UCOL_IGNORABLE
) {
1212 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1214 temp
= ucol_getOffset(coleiter
);
1216 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1218 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1228 * Checks and sets the match information if found.
1231 * <li> the potential match does not repeat the previous match
1232 * <li> boundaries are correct
1233 * <li> exact matches has no extra accents
1234 * <li> identical matchesb
1235 * <li> potential match does not end in the middle of a contraction
1237 * Otherwise the offset will be shifted to the next character.
1238 * Internal method, status assumed to be success, caller has to check status
1239 * before calling this method.
1240 * @param strsrch string search data
1241 * @param textoffset offset in the collation element text. the returned value
1242 * will be the truncated end offset of the match or the new start
1244 * @param status output error status if any
1245 * @return TRUE if the match is valid, FALSE otherwise
1248 inline UBool
checkNextExactMatch(UStringSearch
*strsrch
,
1249 int32_t *textoffset
, UErrorCode
*status
)
1251 UCollationElements
*coleiter
= strsrch
->textIter
;
1252 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1254 if (!checkNextExactContractionMatch(strsrch
, &start
, textoffset
, status
)) {
1258 // this totally matches, however we need to check if it is repeating
1259 if (!isBreakUnit(strsrch
, start
, *textoffset
) ||
1260 checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1261 hasAccentsBeforeMatch(strsrch
, start
, *textoffset
) ||
1262 !checkIdentical(strsrch
, start
, *textoffset
) ||
1263 hasAccentsAfterMatch(strsrch
, start
, *textoffset
)) {
1266 *textoffset
= getNextUStringSearchBaseOffset(strsrch
, *textoffset
);
1270 //Add breakiterator boundary check for primary strength search.
1271 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
1272 checkBreakBoundary(strsrch
, &start
, textoffset
);
1275 // totally match, we will get rid of the ending ignorables.
1276 strsrch
->search
->matchedIndex
= start
;
1277 strsrch
->search
->matchedLength
= *textoffset
- start
;
1282 * Getting the previous base character offset, or the current offset if the
1283 * current character is a base character
1284 * @param text string
1285 * @param textoffset one offset after the current character
1286 * @return the offset of the next character after the base character or the first
1287 * composed character with accents
1290 inline int32_t getPreviousBaseOffset(const UChar
*text
,
1293 if (textoffset
> 0) {
1295 int32_t result
= textoffset
;
1296 U16_BACK_1(text
, 0, textoffset
);
1297 int32_t temp
= textoffset
;
1298 uint16_t fcd
= getFCD(text
, &temp
, result
);
1299 if ((fcd
>> SECOND_LAST_BYTE_SHIFT_
) == 0) {
1300 if (fcd
& LAST_BYTE_MASK_
) {
1305 if (textoffset
== 0) {
1314 * Getting the indexes of the accents that are not blocked in the argument
1316 * @param accents array of accents in nfd terminated by a 0.
1317 * @param accentsindex array of indexes of the accents that are not blocked
1320 inline int getUnblockedAccentIndex(UChar
*accents
, int32_t *accentsindex
)
1323 int32_t length
= u_strlen(accents
);
1324 UChar32 codepoint
= 0;
1328 while (index
< length
) {
1330 U16_NEXT(accents
, index
, length
, codepoint
);
1331 if (u_getCombiningClass(codepoint
) != cclass
) {
1332 cclass
= u_getCombiningClass(codepoint
);
1333 accentsindex
[result
] = temp
;
1337 accentsindex
[result
] = length
;
1342 * Appends 3 UChar arrays to a destination array.
1343 * Creates a new array if we run out of space. The caller will have to
1344 * manually deallocate the newly allocated array.
1345 * Internal method, status assumed to be success, caller has to check status
1346 * before calling this method. destination not to be NULL and has at least
1347 * size destinationlength.
1348 * @param destination target array
1349 * @param destinationlength target array size, returning the appended length
1350 * @param source1 null-terminated first array
1351 * @param source2 second array
1352 * @param source2length length of seond array
1353 * @param source3 null-terminated third array
1354 * @param status error status if any
1355 * @return new destination array, destination if there was no new allocation
1358 inline UChar
* addToUCharArray( UChar
*destination
,
1359 int32_t *destinationlength
,
1360 const UChar
*source1
,
1361 const UChar
*source2
,
1362 int32_t source2length
,
1363 const UChar
*source3
,
1366 int32_t source1length
= source1
? u_strlen(source1
) : 0;
1367 int32_t source3length
= source3
? u_strlen(source3
) : 0;
1368 if (*destinationlength
< source1length
+ source2length
+ source3length
+
1371 destination
= (UChar
*)allocateMemory(
1372 (source1length
+ source2length
+ source3length
+ 1) * sizeof(UChar
),
1374 // if error allocating memory, status will be
1375 // U_MEMORY_ALLOCATION_ERROR
1376 if (U_FAILURE(*status
)) {
1377 *destinationlength
= 0;
1381 if (source1length
!= 0) {
1382 uprv_memcpy(destination
, source1
, sizeof(UChar
) * source1length
);
1384 if (source2length
!= 0) {
1385 uprv_memcpy(destination
+ source1length
, source2
,
1386 sizeof(UChar
) * source2length
);
1388 if (source3length
!= 0) {
1389 uprv_memcpy(destination
+ source1length
+ source2length
, source3
,
1390 sizeof(UChar
) * source3length
);
1392 *destinationlength
= source1length
+ source2length
+ source3length
;
1397 * Running through a collation element iterator to see if the contents matches
1398 * pattern in string search data
1399 * @param strsrch string search data
1400 * @param coleiter collation element iterator
1401 * @return TRUE if a match if found, FALSE otherwise
1404 inline UBool
checkCollationMatch(const UStringSearch
*strsrch
,
1405 UCollationElements
*coleiter
)
1407 int patternceindex
= strsrch
->pattern
.cesLength
;
1408 int32_t *patternce
= strsrch
->pattern
.ces
;
1409 UErrorCode status
= U_ZERO_ERROR
;
1410 while (patternceindex
> 0) {
1411 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
1412 if (ce
== UCOL_IGNORABLE
) {
1415 if (U_FAILURE(status
) || ce
!= *patternce
) {
1425 * Rearranges the front accents to try matching.
1426 * Prefix accents in the text will be grouped according to their combining
1427 * class and the groups will be mixed and matched to try find the perfect
1428 * match with the pattern.
1429 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1430 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1431 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1433 * step 2: check if any of the generated substrings matches the pattern.
1434 * Internal method, status is assumed to be success, caller has to check status
1435 * before calling this method.
1436 * @param strsrch string search match
1437 * @param start first offset of the accents to start searching
1438 * @param end start of the last accent set
1439 * @param status output error status if any
1440 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1441 * offset of the match. Note this start includes all preceding accents.
1444 int32_t doNextCanonicalPrefixMatch(UStringSearch
*strsrch
,
1449 const UChar
*text
= strsrch
->search
->text
;
1450 int32_t textlength
= strsrch
->search
->textLength
;
1451 int32_t tempstart
= start
;
1453 if ((getFCD(text
, &tempstart
, textlength
) & LAST_BYTE_MASK_
) == 0) {
1454 // die... failed at a base character
1455 return USEARCH_DONE
;
1458 int32_t offset
= getNextBaseOffset(text
, tempstart
, textlength
);
1459 start
= getPreviousBaseOffset(text
, tempstart
);
1461 UChar accents
[INITIAL_ARRAY_SIZE_
];
1462 // normalizing the offensive string
1463 unorm_normalize(text
+ start
, offset
- start
, UNORM_NFD
, 0, accents
,
1464 INITIAL_ARRAY_SIZE_
, status
);
1465 if (U_FAILURE(*status
)) {
1466 return USEARCH_DONE
;
1469 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1470 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1472 int32_t count
= (2 << (accentsize
- 1)) - 1;
1473 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1474 UCollationElements
*coleiter
= strsrch
->utilIter
;
1475 while (U_SUCCESS(*status
) && count
> 0) {
1476 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
1477 // copy the base characters
1478 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1479 *rearrange
++ = accents
[k
];
1481 // forming all possible canonical rearrangement by dropping
1483 for (int i
= 0; i
<= accentsize
- 1; i
++) {
1484 int32_t mask
= 1 << (accentsize
- i
- 1);
1486 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1487 *rearrange
++ = accents
[j
];
1492 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
1493 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
1494 strsrch
->canonicalPrefixAccents
,
1495 strsrch
->search
->text
+ offset
,
1497 strsrch
->canonicalSuffixAccents
,
1500 // if status is a failure, ucol_setText does nothing.
1501 // run the collator iterator through this match
1502 ucol_setText(coleiter
, match
, matchsize
, status
);
1503 if (U_SUCCESS(*status
)) {
1504 if (checkCollationMatch(strsrch
, coleiter
)) {
1505 if (match
!= buffer
) {
1513 return USEARCH_DONE
;
1517 * Gets the offset to the safe point in text before textoffset.
1518 * ie. not the middle of a contraction, swappable characters or supplementary
1520 * @param collator collation sata
1521 * @param text string to work with
1522 * @param textoffset offset in string
1523 * @param textlength length of text string
1524 * @return offset to the previous safe character
1527 inline uint32_t getPreviousSafeOffset(const UCollator
*collator
,
1531 int32_t result
= textoffset
; // first contraction character
1532 while (result
!= 0 && ucol_unsafeCP(text
[result
- 1], collator
)) {
1536 // the first contraction character is consider unsafe here
1543 * Cleaning up after we passed the safe zone
1544 * @param strsrch string search data
1545 * @param safetext safe text array
1546 * @param safebuffer safe text buffer
1547 * @param coleiter collation element iterator for safe text
1550 inline void cleanUpSafeText(const UStringSearch
*strsrch
, UChar
*safetext
,
1553 if (safetext
!= safebuffer
&& safetext
!= strsrch
->canonicalSuffixAccents
)
1555 uprv_free(safetext
);
1560 * Take the rearranged end accents and tries matching. If match failed at
1561 * a seperate preceding set of accents (seperated from the rearranged on by
1562 * at least a base character) then we rearrange the preceding accents and
1563 * tries matching again.
1564 * We allow skipping of the ends of the accent set if the ces do not match.
1565 * However if the failure is found before the accent set, it fails.
1566 * Internal method, status assumed to be success, caller has to check status
1567 * before calling this method.
1568 * @param strsrch string search data
1569 * @param textoffset of the start of the rearranged accent
1570 * @param status output error status if any
1571 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1572 * offset of the match. Note this start includes all preceding accents.
1575 int32_t doNextCanonicalSuffixMatch(UStringSearch
*strsrch
,
1579 const UChar
*text
= strsrch
->search
->text
;
1580 const UCollator
*collator
= strsrch
->collator
;
1581 int32_t safelength
= 0;
1583 int32_t safetextlength
;
1584 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
1585 UCollationElements
*coleiter
= strsrch
->utilIter
;
1586 int32_t safeoffset
= textoffset
;
1588 if (textoffset
!= 0 && ucol_unsafeCP(strsrch
->canonicalSuffixAccents
[0],
1590 safeoffset
= getPreviousSafeOffset(collator
, text
, textoffset
);
1591 safelength
= textoffset
- safeoffset
;
1592 safetextlength
= INITIAL_ARRAY_SIZE_
;
1593 safetext
= addToUCharArray(safebuffer
, &safetextlength
, NULL
,
1594 text
+ safeoffset
, safelength
,
1595 strsrch
->canonicalSuffixAccents
,
1599 safetextlength
= u_strlen(strsrch
->canonicalSuffixAccents
);
1600 safetext
= strsrch
->canonicalSuffixAccents
;
1603 // if status is a failure, ucol_setText does nothing
1604 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
1605 // status checked in loop below
1607 int32_t *ce
= strsrch
->pattern
.ces
;
1608 int32_t celength
= strsrch
->pattern
.cesLength
;
1609 int ceindex
= celength
- 1;
1610 UBool isSafe
= TRUE
; // indication flag for position in safe zone
1612 while (ceindex
>= 0) {
1613 int32_t textce
= ucol_previous(coleiter
, status
);
1614 if (U_FAILURE(*status
)) {
1616 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1618 return USEARCH_DONE
;
1620 if (textce
== UCOL_NULLORDER
) {
1621 // check if we have passed the safe buffer
1622 if (coleiter
== strsrch
->textIter
) {
1623 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1624 return USEARCH_DONE
;
1626 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1627 safetext
= safebuffer
;
1628 coleiter
= strsrch
->textIter
;
1629 setColEIterOffset(coleiter
, safeoffset
);
1630 // status checked at the start of the loop
1634 textce
= getCE(strsrch
, textce
);
1635 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
1636 // do the beginning stuff
1637 int32_t failedoffset
= getColElemIterOffset(coleiter
, FALSE
);
1638 if (isSafe
&& failedoffset
>= safelength
) {
1639 // alas... no hope. failed at rearranged accent set
1640 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1641 return USEARCH_DONE
;
1645 failedoffset
+= safeoffset
;
1646 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1649 // try rearranging the front accents
1650 int32_t result
= doNextCanonicalPrefixMatch(strsrch
,
1651 failedoffset
, textoffset
, status
);
1652 if (result
!= USEARCH_DONE
) {
1653 // if status is a failure, ucol_setOffset does nothing
1654 setColEIterOffset(strsrch
->textIter
, result
);
1656 if (U_FAILURE(*status
)) {
1657 return USEARCH_DONE
;
1662 if (textce
== ce
[ceindex
]) {
1668 int32_t result
= getColElemIterOffset(coleiter
, FALSE
);
1669 // sets the text iterator here with the correct expansion and offset
1670 int32_t leftoverces
= getExpansionPrefix(coleiter
);
1671 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1672 if (result
>= safelength
) {
1673 result
= textoffset
;
1676 result
+= safeoffset
;
1678 setColEIterOffset(strsrch
->textIter
, result
);
1679 strsrch
->textIter
->iteratordata_
.toReturn
=
1680 setExpansionPrefix(strsrch
->textIter
, leftoverces
);
1684 return ucol_getOffset(coleiter
);
1688 * Trying out the substring and sees if it can be a canonical match.
1689 * This will try normalizing the end accents and arranging them into canonical
1690 * equivalents and check their corresponding ces with the pattern ce.
1691 * Suffix accents in the text will be grouped according to their combining
1692 * class and the groups will be mixed and matched to try find the perfect
1693 * match with the pattern.
1694 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1695 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1696 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1698 * step 2: check if any of the generated substrings matches the pattern.
1699 * Internal method, status assumed to be success, caller has to check status
1700 * before calling this method.
1701 * @param strsrch string search data
1702 * @param textoffset end offset in the collation element text that ends with
1703 * the accents to be rearranged
1704 * @param status error status if any
1705 * @return TRUE if the match is valid, FALSE otherwise
1708 UBool
doNextCanonicalMatch(UStringSearch
*strsrch
,
1712 const UChar
*text
= strsrch
->search
->text
;
1713 int32_t temp
= textoffset
;
1714 U16_BACK_1(text
, 0, temp
);
1715 if ((getFCD(text
, &temp
, textoffset
) & LAST_BYTE_MASK_
) == 0) {
1716 UCollationElements
*coleiter
= strsrch
->textIter
;
1717 int32_t offset
= getColElemIterOffset(coleiter
, FALSE
);
1718 if (strsrch
->pattern
.hasPrefixAccents
) {
1719 offset
= doNextCanonicalPrefixMatch(strsrch
, offset
, textoffset
,
1721 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
1722 setColEIterOffset(coleiter
, offset
);
1729 if (!strsrch
->pattern
.hasSuffixAccents
) {
1733 UChar accents
[INITIAL_ARRAY_SIZE_
];
1734 // offset to the last base character in substring to search
1735 int32_t baseoffset
= getPreviousBaseOffset(text
, textoffset
);
1736 // normalizing the offensive string
1737 unorm_normalize(text
+ baseoffset
, textoffset
- baseoffset
, UNORM_NFD
,
1738 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
1739 // status checked in loop below
1741 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1742 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
1744 // 2 power n - 1 plus the full set of accents
1745 int32_t count
= (2 << (size
- 1)) - 1;
1746 while (U_SUCCESS(*status
) && count
> 0) {
1747 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
1748 // copy the base characters
1749 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1750 *rearrange
++ = accents
[k
];
1752 // forming all possible canonical rearrangement by dropping
1754 for (int i
= 0; i
<= size
- 1; i
++) {
1755 int32_t mask
= 1 << (size
- i
- 1);
1757 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1758 *rearrange
++ = accents
[j
];
1763 int32_t offset
= doNextCanonicalSuffixMatch(strsrch
, baseoffset
,
1765 if (offset
!= USEARCH_DONE
) {
1766 return TRUE
; // match found
1774 * Gets the previous base character offset depending on the string search
1776 * @param strsrch string search data
1777 * @param textoffset current offset, current character
1778 * @return the offset of the next character after this base character or itself
1779 * if it is a composed character with accents
1782 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch
*strsrch
,
1785 if (strsrch
->pattern
.hasPrefixAccents
&& textoffset
> 0) {
1786 const UChar
*text
= strsrch
->search
->text
;
1787 int32_t offset
= textoffset
;
1788 if (getFCD(text
, &offset
, strsrch
->search
->textLength
) >>
1789 SECOND_LAST_BYTE_SHIFT_
) {
1790 return getPreviousBaseOffset(text
, textoffset
);
1797 * Checks match for contraction.
1798 * If the match ends with a partial contraction we fail.
1799 * If the match starts too far off (because of backwards iteration) we try to
1800 * chip off the extra characters
1801 * Internal method, status assumed to be success, caller has to check status
1802 * before calling this method.
1803 * @param strsrch string search data
1804 * @param start offset of potential match, to be modified if necessary
1805 * @param end offset of potential match, to be modified if necessary
1806 * @param status output error status if any
1807 * @return TRUE if match passes the contraction test, FALSE otherwise
1810 UBool
checkNextCanonicalContractionMatch(UStringSearch
*strsrch
,
1815 UCollationElements
*coleiter
= strsrch
->textIter
;
1816 int32_t textlength
= strsrch
->search
->textLength
;
1817 int32_t temp
= *start
;
1818 const UCollator
*collator
= strsrch
->collator
;
1819 const UChar
*text
= strsrch
->search
->text
;
1820 // This part checks if either ends of the match contains potential
1821 // contraction. If so we'll have to iterate through them
1822 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1823 (*start
+ 1 < textlength
1824 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1825 int32_t expansion
= getExpansionPrefix(coleiter
);
1826 UBool expandflag
= expansion
> 0;
1827 setColEIterOffset(coleiter
, *start
);
1828 while (expansion
> 0) {
1829 // getting rid of the redundant ce, caused by setOffset.
1830 // since backward contraction/expansion may have extra ces if we
1831 // are in the normalization buffer, hasAccentsBeforeMatch would
1832 // have taken care of it.
1833 // E.g. the character \u01FA will have an expansion of 3, but if
1834 // we are only looking for acute and ring \u030A and \u0301, we'll
1835 // have to skip the first ce in the expansion buffer.
1836 ucol_next(coleiter
, status
);
1837 if (U_FAILURE(*status
)) {
1840 if (ucol_getOffset(coleiter
) != temp
) {
1842 temp
= ucol_getOffset(coleiter
);
1847 int32_t *patternce
= strsrch
->pattern
.ces
;
1848 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
1850 int32_t textlength
= strsrch
->search
->textLength
;
1851 while (count
< patterncelength
) {
1852 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1853 // status checked below, note that if status is a failure
1854 // ucol_next returns UCOL_NULLORDER
1855 if (ce
== UCOL_IGNORABLE
) {
1858 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1860 temp
= ucol_getOffset(coleiter
);
1863 if (count
== 0 && ce
!= patternce
[0]) {
1864 // accents may have extra starting ces, this occurs when a
1865 // pure accent pattern is matched without rearrangement
1866 // text \u0325\u0300 and looking for \u0300
1867 int32_t expected
= patternce
[0];
1868 if (getFCD(text
, start
, textlength
) & LAST_BYTE_MASK_
) {
1869 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1870 while (U_SUCCESS(*status
) && ce
!= expected
&&
1871 ce
!= UCOL_NULLORDER
&&
1872 ucol_getOffset(coleiter
) <= *end
) {
1873 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1877 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1879 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1889 * Checks and sets the match information if found.
1892 * <li> the potential match does not repeat the previous match
1893 * <li> boundaries are correct
1894 * <li> potential match does not end in the middle of a contraction
1895 * <li> identical matches
1897 * Otherwise the offset will be shifted to the next character.
1898 * Internal method, status assumed to be success, caller has to check the
1899 * status before calling this method.
1900 * @param strsrch string search data
1901 * @param textoffset offset in the collation element text. the returned value
1902 * will be the truncated end offset of the match or the new start
1904 * @param status output error status if any
1905 * @return TRUE if the match is valid, FALSE otherwise
1908 inline UBool
checkNextCanonicalMatch(UStringSearch
*strsrch
,
1909 int32_t *textoffset
,
1912 // to ensure that the start and ends are not composite characters
1913 UCollationElements
*coleiter
= strsrch
->textIter
;
1914 // if we have a canonical accent match
1915 if ((strsrch
->pattern
.hasSuffixAccents
&&
1916 strsrch
->canonicalSuffixAccents
[0]) ||
1917 (strsrch
->pattern
.hasPrefixAccents
&&
1918 strsrch
->canonicalPrefixAccents
[0])) {
1919 strsrch
->search
->matchedIndex
= getPreviousUStringSearchBaseOffset(
1921 ucol_getOffset(coleiter
));
1922 strsrch
->search
->matchedLength
= *textoffset
-
1923 strsrch
->search
->matchedIndex
;
1927 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1928 if (!checkNextCanonicalContractionMatch(strsrch
, &start
, textoffset
,
1929 status
) || U_FAILURE(*status
)) {
1933 start
= getPreviousUStringSearchBaseOffset(strsrch
, start
);
1934 // this totally matches, however we need to check if it is repeating
1935 if (checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1936 !isBreakUnit(strsrch
, start
, *textoffset
) ||
1937 !checkIdentical(strsrch
, start
, *textoffset
)) {
1939 *textoffset
= getNextBaseOffset(strsrch
->search
->text
, *textoffset
,
1940 strsrch
->search
->textLength
);
1944 strsrch
->search
->matchedIndex
= start
;
1945 strsrch
->search
->matchedLength
= *textoffset
- start
;
1950 * Shifting the collation element iterator position forward to prepare for
1951 * a preceding match. If the first character is a unsafe character, we'll only
1952 * shift by 1 to capture contractions, normalization etc.
1953 * Internal method, status assumed to be success, caller has to check status
1954 * before calling this method.
1955 * @param text strsrch string search data
1956 * @param textoffset start text position to do search
1957 * @param ce the text ce which failed the match.
1958 * @param patternceindex index of the ce within the pattern ce buffer which
1960 * @return final offset
1963 inline int32_t reverseShift(UStringSearch
*strsrch
,
1966 int32_t patternceindex
)
1968 if (strsrch
->search
->isOverlap
) {
1969 if (textoffset
!= strsrch
->search
->textLength
) {
1973 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1977 if (ce
!= UCOL_NULLORDER
) {
1978 int32_t shift
= strsrch
->pattern
.backShift
[hashFromCE32(ce
)];
1980 // this is to adjust for characters in the middle of the substring
1981 // for matching that failed.
1982 int32_t adjust
= patternceindex
;
1983 if (adjust
> 1 && shift
> adjust
) {
1984 shift
-= adjust
- 1;
1986 textoffset
-= shift
;
1989 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1992 textoffset
= getPreviousUStringSearchBaseOffset(strsrch
, textoffset
);
1997 * Checks match for contraction.
1998 * If the match starts with a partial contraction we fail.
1999 * Internal method, status assumed to be success, caller has to check status
2000 * before calling this method.
2001 * @param strsrch string search data
2002 * @param start offset of potential match, to be modified if necessary
2003 * @param end offset of potential match, to be modified if necessary
2004 * @param status output error status if any
2005 * @return TRUE if match passes the contraction test, FALSE otherwise
2008 UBool
checkPreviousExactContractionMatch(UStringSearch
*strsrch
,
2010 int32_t *end
, UErrorCode
*status
)
2012 UCollationElements
*coleiter
= strsrch
->textIter
;
2013 int32_t textlength
= strsrch
->search
->textLength
;
2014 int32_t temp
= *end
;
2015 const UCollator
*collator
= strsrch
->collator
;
2016 const UChar
*text
= strsrch
->search
->text
;
2017 // This part checks if either if the start of the match contains potential
2018 // contraction. If so we'll have to iterate through them
2019 // Since we used ucol_next while previously looking for the potential
2020 // match, this guarantees that our end will not be a partial contraction,
2021 // or a partial supplementary character.
2022 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2023 int32_t expansion
= getExpansionSuffix(coleiter
);
2024 UBool expandflag
= expansion
> 0;
2025 setColEIterOffset(coleiter
, *end
);
2026 while (U_SUCCESS(*status
) && expansion
> 0) {
2027 // getting rid of the redundant ce
2028 // since forward contraction/expansion may have extra ces
2029 // if we are in the normalization buffer, hasAccentsBeforeMatch
2030 // would have taken care of it.
2031 // E.g. the character \u01FA will have an expansion of 3, but if
2032 // we are only looking for A ring A\u030A, we'll have to skip the
2033 // last ce in the expansion buffer
2034 ucol_previous(coleiter
, status
);
2035 if (U_FAILURE(*status
)) {
2038 if (ucol_getOffset(coleiter
) != temp
) {
2040 temp
= ucol_getOffset(coleiter
);
2045 int32_t *patternce
= strsrch
->pattern
.ces
;
2046 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
2047 int32_t count
= patterncelength
;
2049 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2050 // status checked below, note that if status is a failure
2051 // ucol_previous returns UCOL_NULLORDER
2052 if (ce
== UCOL_IGNORABLE
) {
2055 if (expandflag
&& count
== 0 &&
2056 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2058 temp
= ucol_getOffset(coleiter
);
2060 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2062 *start
= getPreviousBaseOffset(text
, *start
);
2072 * Checks and sets the match information if found.
2075 * <li> the current match does not repeat the last match
2076 * <li> boundaries are correct
2077 * <li> exact matches has no extra accents
2078 * <li> identical matches
2080 * Otherwise the offset will be shifted to the preceding character.
2081 * Internal method, status assumed to be success, caller has to check status
2082 * before calling this method.
2083 * @param strsrch string search data
2085 * @param coleiter collation element iterator
2086 * @param text string
2087 * @param textoffset offset in the collation element text. the returned value
2088 * will be the truncated start offset of the match or the new start
2090 * @param status output error status if any
2091 * @return TRUE if the match is valid, FALSE otherwise
2094 inline UBool
checkPreviousExactMatch(UStringSearch
*strsrch
,
2095 int32_t *textoffset
,
2098 // to ensure that the start and ends are not composite characters
2099 int32_t end
= ucol_getOffset(strsrch
->textIter
);
2100 if (!checkPreviousExactContractionMatch(strsrch
, textoffset
, &end
, status
)
2101 || U_FAILURE(*status
)) {
2105 // this totally matches, however we need to check if it is repeating
2107 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2108 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2109 hasAccentsBeforeMatch(strsrch
, *textoffset
, end
) ||
2110 !checkIdentical(strsrch
, *textoffset
, end
) ||
2111 hasAccentsAfterMatch(strsrch
, *textoffset
, end
)) {
2113 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2118 //Add breakiterator boundary check for primary strength search.
2119 if (!strsrch
->search
->breakIter
&& strsrch
->strength
== UCOL_PRIMARY
) {
2120 checkBreakBoundary(strsrch
, textoffset
, &end
);
2123 strsrch
->search
->matchedIndex
= *textoffset
;
2124 strsrch
->search
->matchedLength
= end
- *textoffset
;
2129 * Rearranges the end accents to try matching.
2130 * Suffix accents in the text will be grouped according to their combining
2131 * class and the groups will be mixed and matched to try find the perfect
2132 * match with the pattern.
2133 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2134 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2135 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2137 * step 2: check if any of the generated substrings matches the pattern.
2138 * Internal method, status assumed to be success, user has to check status
2139 * before calling this method.
2140 * @param strsrch string search match
2141 * @param start offset of the first base character
2142 * @param end start of the last accent set
2143 * @param status only error status if any
2144 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2145 * offset of the match. Note this start includes all following accents.
2148 int32_t doPreviousCanonicalSuffixMatch(UStringSearch
*strsrch
,
2153 const UChar
*text
= strsrch
->search
->text
;
2154 int32_t tempend
= end
;
2156 U16_BACK_1(text
, 0, tempend
);
2157 if (!(getFCD(text
, &tempend
, strsrch
->search
->textLength
) &
2159 // die... failed at a base character
2160 return USEARCH_DONE
;
2162 end
= getNextBaseOffset(text
, end
, strsrch
->search
->textLength
);
2164 if (U_SUCCESS(*status
)) {
2165 UChar accents
[INITIAL_ARRAY_SIZE_
];
2166 int32_t offset
= getPreviousBaseOffset(text
, end
);
2167 // normalizing the offensive string
2168 unorm_normalize(text
+ offset
, end
- offset
, UNORM_NFD
, 0, accents
,
2169 INITIAL_ARRAY_SIZE_
, status
);
2171 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2172 int32_t accentsize
= getUnblockedAccentIndex(accents
,
2174 int32_t count
= (2 << (accentsize
- 1)) - 1;
2175 UChar buffer
[INITIAL_ARRAY_SIZE_
];
2176 UCollationElements
*coleiter
= strsrch
->utilIter
;
2177 while (U_SUCCESS(*status
) && count
> 0) {
2178 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
2179 // copy the base characters
2180 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2181 *rearrange
++ = accents
[k
];
2183 // forming all possible canonical rearrangement by dropping
2185 for (int i
= 0; i
<= accentsize
- 1; i
++) {
2186 int32_t mask
= 1 << (accentsize
- i
- 1);
2188 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2189 *rearrange
++ = accents
[j
];
2194 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
2195 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
2196 strsrch
->canonicalPrefixAccents
,
2197 strsrch
->search
->text
+ start
,
2199 strsrch
->canonicalSuffixAccents
,
2202 // run the collator iterator through this match
2203 // if status is a failure ucol_setText does nothing
2204 ucol_setText(coleiter
, match
, matchsize
, status
);
2205 if (U_SUCCESS(*status
)) {
2206 if (checkCollationMatch(strsrch
, coleiter
)) {
2207 if (match
!= buffer
) {
2216 return USEARCH_DONE
;
2220 * Take the rearranged start accents and tries matching. If match failed at
2221 * a seperate following set of accents (seperated from the rearranged on by
2222 * at least a base character) then we rearrange the preceding accents and
2223 * tries matching again.
2224 * We allow skipping of the ends of the accent set if the ces do not match.
2225 * However if the failure is found before the accent set, it fails.
2226 * Internal method, status assumed to be success, caller has to check status
2227 * before calling this method.
2228 * @param strsrch string search data
2229 * @param textoffset of the ends of the rearranged accent
2230 * @param status output error status if any
2231 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2232 * offset of the match. Note this start includes all following accents.
2235 int32_t doPreviousCanonicalPrefixMatch(UStringSearch
*strsrch
,
2239 const UChar
*text
= strsrch
->search
->text
;
2240 const UCollator
*collator
= strsrch
->collator
;
2241 int32_t safelength
= 0;
2243 int32_t safetextlength
;
2244 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
2245 int32_t safeoffset
= textoffset
;
2248 ucol_unsafeCP(strsrch
->canonicalPrefixAccents
[
2249 u_strlen(strsrch
->canonicalPrefixAccents
) - 1
2251 safeoffset
= getNextSafeOffset(collator
, text
, textoffset
,
2252 strsrch
->search
->textLength
);
2253 safelength
= safeoffset
- textoffset
;
2254 safetextlength
= INITIAL_ARRAY_SIZE_
;
2255 safetext
= addToUCharArray(safebuffer
, &safetextlength
,
2256 strsrch
->canonicalPrefixAccents
,
2257 text
+ textoffset
, safelength
,
2261 safetextlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2262 safetext
= strsrch
->canonicalPrefixAccents
;
2265 UCollationElements
*coleiter
= strsrch
->utilIter
;
2266 // if status is a failure, ucol_setText does nothing
2267 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
2268 // status checked in loop below
2270 int32_t *ce
= strsrch
->pattern
.ces
;
2271 int32_t celength
= strsrch
->pattern
.cesLength
;
2273 UBool isSafe
= TRUE
; // safe zone indication flag for position
2274 int32_t prefixlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2276 while (ceindex
< celength
) {
2277 int32_t textce
= ucol_next(coleiter
, status
);
2278 if (U_FAILURE(*status
)) {
2280 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2282 return USEARCH_DONE
;
2284 if (textce
== UCOL_NULLORDER
) {
2285 // check if we have passed the safe buffer
2286 if (coleiter
== strsrch
->textIter
) {
2287 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2288 return USEARCH_DONE
;
2290 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2291 safetext
= safebuffer
;
2292 coleiter
= strsrch
->textIter
;
2293 setColEIterOffset(coleiter
, safeoffset
);
2294 // status checked at the start of the loop
2298 textce
= getCE(strsrch
, textce
);
2299 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
2300 // do the beginning stuff
2301 int32_t failedoffset
= ucol_getOffset(coleiter
);
2302 if (isSafe
&& failedoffset
<= prefixlength
) {
2303 // alas... no hope. failed at rearranged accent set
2304 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2305 return USEARCH_DONE
;
2309 failedoffset
= safeoffset
- failedoffset
;
2310 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2313 // try rearranging the end accents
2314 int32_t result
= doPreviousCanonicalSuffixMatch(strsrch
,
2315 textoffset
, failedoffset
, status
);
2316 if (result
!= USEARCH_DONE
) {
2317 // if status is a failure, ucol_setOffset does nothing
2318 setColEIterOffset(strsrch
->textIter
, result
);
2320 if (U_FAILURE(*status
)) {
2321 return USEARCH_DONE
;
2326 if (textce
== ce
[ceindex
]) {
2332 int32_t result
= ucol_getOffset(coleiter
);
2333 // sets the text iterator here with the correct expansion and offset
2334 int32_t leftoverces
= getExpansionSuffix(coleiter
);
2335 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2336 if (result
<= prefixlength
) {
2337 result
= textoffset
;
2340 result
= textoffset
+ (safeoffset
- result
);
2342 setColEIterOffset(strsrch
->textIter
, result
);
2343 setExpansionSuffix(strsrch
->textIter
, leftoverces
);
2347 return ucol_getOffset(coleiter
);
2351 * Trying out the substring and sees if it can be a canonical match.
2352 * This will try normalizing the starting accents and arranging them into
2353 * canonical equivalents and check their corresponding ces with the pattern ce.
2354 * Prefix accents in the text will be grouped according to their combining
2355 * class and the groups will be mixed and matched to try find the perfect
2356 * match with the pattern.
2357 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2358 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2359 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2361 * step 2: check if any of the generated substrings matches the pattern.
2362 * Internal method, status assumed to be success, caller has to check status
2363 * before calling this method.
2364 * @param strsrch string search data
2365 * @param textoffset start offset in the collation element text that starts
2366 * with the accents to be rearranged
2367 * @param status output error status if any
2368 * @return TRUE if the match is valid, FALSE otherwise
2371 UBool
doPreviousCanonicalMatch(UStringSearch
*strsrch
,
2375 const UChar
*text
= strsrch
->search
->text
;
2376 int32_t temp
= textoffset
;
2377 int32_t textlength
= strsrch
->search
->textLength
;
2378 if ((getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) == 0) {
2379 UCollationElements
*coleiter
= strsrch
->textIter
;
2380 int32_t offset
= ucol_getOffset(coleiter
);
2381 if (strsrch
->pattern
.hasSuffixAccents
) {
2382 offset
= doPreviousCanonicalSuffixMatch(strsrch
, textoffset
,
2384 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
2385 setColEIterOffset(coleiter
, offset
);
2392 if (!strsrch
->pattern
.hasPrefixAccents
) {
2396 UChar accents
[INITIAL_ARRAY_SIZE_
];
2397 // offset to the last base character in substring to search
2398 int32_t baseoffset
= getNextBaseOffset(text
, textoffset
, textlength
);
2399 // normalizing the offensive string
2400 unorm_normalize(text
+ textoffset
, baseoffset
- textoffset
, UNORM_NFD
,
2401 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
2402 // status checked in loop
2404 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2405 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
2407 // 2 power n - 1 plus the full set of accents
2408 int32_t count
= (2 << (size
- 1)) - 1;
2409 while (U_SUCCESS(*status
) && count
> 0) {
2410 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
2411 // copy the base characters
2412 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2413 *rearrange
++ = accents
[k
];
2415 // forming all possible canonical rearrangement by dropping
2417 for (int i
= 0; i
<= size
- 1; i
++) {
2418 int32_t mask
= 1 << (size
- i
- 1);
2420 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2421 *rearrange
++ = accents
[j
];
2426 int32_t offset
= doPreviousCanonicalPrefixMatch(strsrch
,
2427 baseoffset
, status
);
2428 if (offset
!= USEARCH_DONE
) {
2429 return TRUE
; // match found
2437 * Checks match for contraction.
2438 * If the match starts with a partial contraction we fail.
2439 * Internal method, status assumed to be success, caller has to check status
2440 * before calling this method.
2441 * @param strsrch string search data
2442 * @param start offset of potential match, to be modified if necessary
2443 * @param end offset of potential match, to be modified if necessary
2444 * @param status only error status if any
2445 * @return TRUE if match passes the contraction test, FALSE otherwise
2448 UBool
checkPreviousCanonicalContractionMatch(UStringSearch
*strsrch
,
2450 int32_t *end
, UErrorCode
*status
)
2452 UCollationElements
*coleiter
= strsrch
->textIter
;
2453 int32_t textlength
= strsrch
->search
->textLength
;
2454 int32_t temp
= *end
;
2455 const UCollator
*collator
= strsrch
->collator
;
2456 const UChar
*text
= strsrch
->search
->text
;
2457 // This part checks if either if the start of the match contains potential
2458 // contraction. If so we'll have to iterate through them
2459 // Since we used ucol_next while previously looking for the potential
2460 // match, this guarantees that our end will not be a partial contraction,
2461 // or a partial supplementary character.
2462 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2463 int32_t expansion
= getExpansionSuffix(coleiter
);
2464 UBool expandflag
= expansion
> 0;
2465 setColEIterOffset(coleiter
, *end
);
2466 while (expansion
> 0) {
2467 // getting rid of the redundant ce
2468 // since forward contraction/expansion may have extra ces
2469 // if we are in the normalization buffer, hasAccentsBeforeMatch
2470 // would have taken care of it.
2471 // E.g. the character \u01FA will have an expansion of 3, but if
2472 // we are only looking for A ring A\u030A, we'll have to skip the
2473 // last ce in the expansion buffer
2474 ucol_previous(coleiter
, status
);
2475 if (U_FAILURE(*status
)) {
2478 if (ucol_getOffset(coleiter
) != temp
) {
2480 temp
= ucol_getOffset(coleiter
);
2485 int32_t *patternce
= strsrch
->pattern
.ces
;
2486 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
2487 int32_t count
= patterncelength
;
2489 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2490 // status checked below, note that if status is a failure
2491 // ucol_previous returns UCOL_NULLORDER
2492 if (ce
== UCOL_IGNORABLE
) {
2495 if (expandflag
&& count
== 0 &&
2496 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2498 temp
= ucol_getOffset(coleiter
);
2500 if (count
== patterncelength
&&
2501 ce
!= patternce
[patterncelength
- 1]) {
2502 // accents may have extra starting ces, this occurs when a
2503 // pure accent pattern is matched without rearrangement
2504 int32_t expected
= patternce
[patterncelength
- 1];
2505 U16_BACK_1(text
, 0, *end
);
2506 if (getFCD(text
, end
, textlength
) & LAST_BYTE_MASK_
) {
2507 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2508 while (U_SUCCESS(*status
) && ce
!= expected
&&
2509 ce
!= UCOL_NULLORDER
&&
2510 ucol_getOffset(coleiter
) <= *start
) {
2511 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2515 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2517 *start
= getPreviousBaseOffset(text
, *start
);
2527 * Checks and sets the match information if found.
2530 * <li> the potential match does not repeat the previous match
2531 * <li> boundaries are correct
2532 * <li> potential match does not end in the middle of a contraction
2533 * <li> identical matches
2535 * Otherwise the offset will be shifted to the next character.
2536 * Internal method, status assumed to be success, caller has to check status
2537 * before calling this method.
2538 * @param strsrch string search data
2539 * @param textoffset offset in the collation element text. the returned value
2540 * will be the truncated start offset of the match or the new start
2542 * @param status only error status if any
2543 * @return TRUE if the match is valid, FALSE otherwise
2546 inline UBool
checkPreviousCanonicalMatch(UStringSearch
*strsrch
,
2547 int32_t *textoffset
,
2550 // to ensure that the start and ends are not composite characters
2551 UCollationElements
*coleiter
= strsrch
->textIter
;
2552 // if we have a canonical accent match
2553 if ((strsrch
->pattern
.hasSuffixAccents
&&
2554 strsrch
->canonicalSuffixAccents
[0]) ||
2555 (strsrch
->pattern
.hasPrefixAccents
&&
2556 strsrch
->canonicalPrefixAccents
[0])) {
2557 strsrch
->search
->matchedIndex
= *textoffset
;
2558 strsrch
->search
->matchedLength
=
2559 getNextUStringSearchBaseOffset(strsrch
,
2560 getColElemIterOffset(coleiter
, FALSE
))
2565 int32_t end
= ucol_getOffset(coleiter
);
2566 if (!checkPreviousCanonicalContractionMatch(strsrch
, textoffset
, &end
,
2568 U_FAILURE(*status
)) {
2572 end
= getNextUStringSearchBaseOffset(strsrch
, end
);
2573 // this totally matches, however we need to check if it is repeating
2574 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2575 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2576 !checkIdentical(strsrch
, *textoffset
, end
)) {
2578 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2583 strsrch
->search
->matchedIndex
= *textoffset
;
2584 strsrch
->search
->matchedLength
= end
- *textoffset
;
2587 #endif // #if BOYER_MOORE
2589 // constructors and destructor -------------------------------------------
2591 U_CAPI UStringSearch
* U_EXPORT2
usearch_open(const UChar
*pattern
,
2592 int32_t patternlength
,
2596 UBreakIterator
*breakiter
,
2599 if (U_FAILURE(*status
)) {
2602 #if UCONFIG_NO_BREAK_ITERATION
2603 if (breakiter
!= NULL
) {
2604 *status
= U_UNSUPPORTED_ERROR
;
2609 // ucol_open internally checks for status
2610 UCollator
*collator
= ucol_open(locale
, status
);
2611 // pattern, text checks are done in usearch_openFromCollator
2612 UStringSearch
*result
= usearch_openFromCollator(pattern
,
2613 patternlength
, text
, textlength
,
2614 collator
, breakiter
, status
);
2616 if (result
== NULL
|| U_FAILURE(*status
)) {
2618 ucol_close(collator
);
2623 result
->ownCollator
= TRUE
;
2627 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2631 U_CAPI UStringSearch
* U_EXPORT2
usearch_openFromCollator(
2632 const UChar
*pattern
,
2633 int32_t patternlength
,
2636 const UCollator
*collator
,
2637 UBreakIterator
*breakiter
,
2640 if (U_FAILURE(*status
)) {
2643 #if UCONFIG_NO_BREAK_ITERATION
2644 if (breakiter
!= NULL
) {
2645 *status
= U_UNSUPPORTED_ERROR
;
2649 if (pattern
== NULL
|| text
== NULL
|| collator
== NULL
) {
2650 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2654 // string search does not really work when numeric collation is turned on
2655 if(ucol_getAttribute(collator
, UCOL_NUMERIC_COLLATION
, status
) == UCOL_ON
) {
2656 *status
= U_UNSUPPORTED_ERROR
;
2660 if (U_SUCCESS(*status
)) {
2661 initializeFCD(status
);
2662 if (U_FAILURE(*status
)) {
2666 UStringSearch
*result
;
2667 if (textlength
== -1) {
2668 textlength
= u_strlen(text
);
2670 if (patternlength
== -1) {
2671 patternlength
= u_strlen(pattern
);
2673 if (textlength
<= 0 || patternlength
<= 0) {
2674 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2678 result
= (UStringSearch
*)uprv_malloc(sizeof(UStringSearch
));
2679 if (result
== NULL
) {
2680 *status
= U_MEMORY_ALLOCATION_ERROR
;
2684 result
->collator
= collator
;
2685 result
->strength
= ucol_getStrength(collator
);
2686 result
->ceMask
= getMask(result
->strength
);
2688 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2690 result
->variableTop
= ucol_getVariableTop(collator
, status
);
2692 result
->nfd
= Normalizer2::getNFDInstance(*status
);
2694 if (U_FAILURE(*status
)) {
2699 result
->search
= (USearch
*)uprv_malloc(sizeof(USearch
));
2700 if (result
->search
== NULL
) {
2701 *status
= U_MEMORY_ALLOCATION_ERROR
;
2706 result
->search
->text
= text
;
2707 result
->search
->textLength
= textlength
;
2709 result
->pattern
.text
= pattern
;
2710 result
->pattern
.textLength
= patternlength
;
2711 result
->pattern
.ces
= NULL
;
2712 result
->pattern
.pces
= NULL
;
2714 result
->search
->breakIter
= breakiter
;
2715 #if !UCONFIG_NO_BREAK_ITERATION
2716 result
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(result
->collator
, ULOC_VALID_LOCALE
, status
), text
, textlength
, status
);
2718 ubrk_setText(breakiter
, text
, textlength
, status
);
2722 result
->ownCollator
= FALSE
;
2723 result
->search
->matchedLength
= 0;
2724 result
->search
->matchedIndex
= USEARCH_DONE
;
2725 result
->utilIter
= NULL
;
2726 result
->textIter
= ucol_openElements(collator
, text
,
2727 textlength
, status
);
2728 result
->textProcessedIter
= NULL
;
2729 if (U_FAILURE(*status
)) {
2730 usearch_close(result
);
2734 result
->search
->isOverlap
= FALSE
;
2735 result
->search
->isCanonicalMatch
= FALSE
;
2736 result
->search
->elementComparisonType
= 0;
2737 result
->search
->isForwardSearching
= TRUE
;
2738 result
->search
->reset
= TRUE
;
2740 initialize(result
, status
);
2742 if (U_FAILURE(*status
)) {
2743 usearch_close(result
);
2752 U_CAPI
void U_EXPORT2
usearch_close(UStringSearch
*strsrch
)
2755 if (strsrch
->pattern
.ces
!= strsrch
->pattern
.cesBuffer
&&
2756 strsrch
->pattern
.ces
) {
2757 uprv_free(strsrch
->pattern
.ces
);
2760 if (strsrch
->pattern
.pces
!= NULL
&&
2761 strsrch
->pattern
.pces
!= strsrch
->pattern
.pcesBuffer
) {
2762 uprv_free(strsrch
->pattern
.pces
);
2765 delete strsrch
->textProcessedIter
;
2766 ucol_closeElements(strsrch
->textIter
);
2767 ucol_closeElements(strsrch
->utilIter
);
2769 if (strsrch
->ownCollator
&& strsrch
->collator
) {
2770 ucol_close((UCollator
*)strsrch
->collator
);
2773 #if !UCONFIG_NO_BREAK_ITERATION
2774 if (strsrch
->search
->internalBreakIter
) {
2775 ubrk_close(strsrch
->search
->internalBreakIter
);
2779 uprv_free(strsrch
->search
);
2786 UBool
initTextProcessedIter(UStringSearch
*strsrch
, UErrorCode
*status
) {
2787 if (U_FAILURE(*status
)) { return FALSE
; }
2788 if (strsrch
->textProcessedIter
== NULL
) {
2789 strsrch
->textProcessedIter
= new icu::UCollationPCE(strsrch
->textIter
);
2790 if (strsrch
->textProcessedIter
== NULL
) {
2791 *status
= U_MEMORY_ALLOCATION_ERROR
;
2795 strsrch
->textProcessedIter
->init(strsrch
->textIter
);
2802 // set and get methods --------------------------------------------------
2804 U_CAPI
void U_EXPORT2
usearch_setOffset(UStringSearch
*strsrch
,
2808 if (U_SUCCESS(*status
) && strsrch
) {
2809 if (isOutOfBounds(strsrch
->search
->textLength
, position
)) {
2810 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2813 setColEIterOffset(strsrch
->textIter
, position
);
2815 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2816 strsrch
->search
->matchedLength
= 0;
2817 strsrch
->search
->reset
= FALSE
;
2821 U_CAPI
int32_t U_EXPORT2
usearch_getOffset(const UStringSearch
*strsrch
)
2824 int32_t result
= ucol_getOffset(strsrch
->textIter
);
2825 if (isOutOfBounds(strsrch
->search
->textLength
, result
)) {
2826 return USEARCH_DONE
;
2830 return USEARCH_DONE
;
2833 U_CAPI
void U_EXPORT2
usearch_setAttribute(UStringSearch
*strsrch
,
2834 USearchAttribute attribute
,
2835 USearchAttributeValue value
,
2838 if (U_SUCCESS(*status
) && strsrch
) {
2841 case USEARCH_OVERLAP
:
2842 strsrch
->search
->isOverlap
= (value
== USEARCH_ON
? TRUE
: FALSE
);
2844 case USEARCH_CANONICAL_MATCH
:
2845 strsrch
->search
->isCanonicalMatch
= (value
== USEARCH_ON
? TRUE
:
2848 case USEARCH_ELEMENT_COMPARISON
:
2849 if (value
== USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD
|| value
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
2850 strsrch
->search
->elementComparisonType
= (int16_t)value
;
2852 strsrch
->search
->elementComparisonType
= 0;
2855 case USEARCH_ATTRIBUTE_COUNT
:
2857 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2860 if (value
== USEARCH_ATTRIBUTE_VALUE_COUNT
) {
2861 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2865 U_CAPI USearchAttributeValue U_EXPORT2
usearch_getAttribute(
2866 const UStringSearch
*strsrch
,
2867 USearchAttribute attribute
)
2870 switch (attribute
) {
2871 case USEARCH_OVERLAP
:
2872 return (strsrch
->search
->isOverlap
== TRUE
? USEARCH_ON
:
2874 case USEARCH_CANONICAL_MATCH
:
2875 return (strsrch
->search
->isCanonicalMatch
== TRUE
? USEARCH_ON
:
2877 case USEARCH_ELEMENT_COMPARISON
:
2879 int16_t value
= strsrch
->search
->elementComparisonType
;
2880 if (value
== USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD
|| value
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
2881 return (USearchAttributeValue
)value
;
2883 return USEARCH_STANDARD_ELEMENT_COMPARISON
;
2886 case USEARCH_ATTRIBUTE_COUNT
:
2887 return USEARCH_DEFAULT
;
2890 return USEARCH_DEFAULT
;
2893 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedStart(
2894 const UStringSearch
*strsrch
)
2896 if (strsrch
== NULL
) {
2897 return USEARCH_DONE
;
2899 return strsrch
->search
->matchedIndex
;
2903 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedText(const UStringSearch
*strsrch
,
2905 int32_t resultCapacity
,
2908 if (U_FAILURE(*status
)) {
2909 return USEARCH_DONE
;
2911 if (strsrch
== NULL
|| resultCapacity
< 0 || (resultCapacity
> 0 &&
2913 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2914 return USEARCH_DONE
;
2917 int32_t copylength
= strsrch
->search
->matchedLength
;
2918 int32_t copyindex
= strsrch
->search
->matchedIndex
;
2919 if (copyindex
== USEARCH_DONE
) {
2920 u_terminateUChars(result
, resultCapacity
, 0, status
);
2921 return USEARCH_DONE
;
2924 if (resultCapacity
< copylength
) {
2925 copylength
= resultCapacity
;
2927 if (copylength
> 0) {
2928 uprv_memcpy(result
, strsrch
->search
->text
+ copyindex
,
2929 copylength
* sizeof(UChar
));
2931 return u_terminateUChars(result
, resultCapacity
,
2932 strsrch
->search
->matchedLength
, status
);
2935 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedLength(
2936 const UStringSearch
*strsrch
)
2939 return strsrch
->search
->matchedLength
;
2941 return USEARCH_DONE
;
2944 #if !UCONFIG_NO_BREAK_ITERATION
2946 U_CAPI
void U_EXPORT2
usearch_setBreakIterator(UStringSearch
*strsrch
,
2947 UBreakIterator
*breakiter
,
2950 if (U_SUCCESS(*status
) && strsrch
) {
2951 strsrch
->search
->breakIter
= breakiter
;
2953 ubrk_setText(breakiter
, strsrch
->search
->text
,
2954 strsrch
->search
->textLength
, status
);
2959 U_CAPI
const UBreakIterator
* U_EXPORT2
2960 usearch_getBreakIterator(const UStringSearch
*strsrch
)
2963 return strsrch
->search
->breakIter
;
2970 U_CAPI
void U_EXPORT2
usearch_setText( UStringSearch
*strsrch
,
2975 if (U_SUCCESS(*status
)) {
2976 if (strsrch
== NULL
|| text
== NULL
|| textlength
< -1 ||
2978 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2981 if (textlength
== -1) {
2982 textlength
= u_strlen(text
);
2984 strsrch
->search
->text
= text
;
2985 strsrch
->search
->textLength
= textlength
;
2986 ucol_setText(strsrch
->textIter
, text
, textlength
, status
);
2987 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2988 strsrch
->search
->matchedLength
= 0;
2989 strsrch
->search
->reset
= TRUE
;
2990 #if !UCONFIG_NO_BREAK_ITERATION
2991 if (strsrch
->search
->breakIter
!= NULL
) {
2992 ubrk_setText(strsrch
->search
->breakIter
, text
,
2993 textlength
, status
);
2995 ubrk_setText(strsrch
->search
->internalBreakIter
, text
, textlength
, status
);
3001 U_CAPI
const UChar
* U_EXPORT2
usearch_getText(const UStringSearch
*strsrch
,
3005 *length
= strsrch
->search
->textLength
;
3006 return strsrch
->search
->text
;
3011 U_CAPI
void U_EXPORT2
usearch_setCollator( UStringSearch
*strsrch
,
3012 const UCollator
*collator
,
3015 if (U_SUCCESS(*status
)) {
3016 if (collator
== NULL
) {
3017 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3022 delete strsrch
->textProcessedIter
;
3023 strsrch
->textProcessedIter
= NULL
;
3024 ucol_closeElements(strsrch
->textIter
);
3025 ucol_closeElements(strsrch
->utilIter
);
3026 strsrch
->textIter
= strsrch
->utilIter
= NULL
;
3027 if (strsrch
->ownCollator
&& (strsrch
->collator
!= collator
)) {
3028 ucol_close((UCollator
*)strsrch
->collator
);
3029 strsrch
->ownCollator
= FALSE
;
3031 strsrch
->collator
= collator
;
3032 strsrch
->strength
= ucol_getStrength(collator
);
3033 strsrch
->ceMask
= getMask(strsrch
->strength
);
3034 #if !UCONFIG_NO_BREAK_ITERATION
3035 ubrk_close(strsrch
->search
->internalBreakIter
);
3036 strsrch
->search
->internalBreakIter
= ubrk_open(UBRK_CHARACTER
, ucol_getLocaleByType(collator
, ULOC_VALID_LOCALE
, status
),
3037 strsrch
->search
->text
, strsrch
->search
->textLength
, status
);
3039 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3041 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
3043 // if status is a failure, ucol_getVariableTop returns 0
3044 strsrch
->variableTop
= ucol_getVariableTop(collator
, status
);
3045 strsrch
->textIter
= ucol_openElements(collator
,
3046 strsrch
->search
->text
,
3047 strsrch
->search
->textLength
,
3049 strsrch
->utilIter
= ucol_openElements(
3050 collator
, strsrch
->pattern
.text
, strsrch
->pattern
.textLength
, status
);
3051 // initialize() _after_ setting the iterators for the new collator.
3052 initialize(strsrch
, status
);
3055 // **** are these calls needed?
3056 // **** we call uprv_init_pce in initializePatternPCETable
3057 // **** and the CEIBuffer constructor...
3059 uprv_init_pce(strsrch
->textIter
);
3060 uprv_init_pce(strsrch
->utilIter
);
3065 U_CAPI UCollator
* U_EXPORT2
usearch_getCollator(const UStringSearch
*strsrch
)
3068 return (UCollator
*)strsrch
->collator
;
3073 U_CAPI
void U_EXPORT2
usearch_setPattern( UStringSearch
*strsrch
,
3074 const UChar
*pattern
,
3075 int32_t patternlength
,
3078 if (U_SUCCESS(*status
)) {
3079 if (strsrch
== NULL
|| pattern
== NULL
) {
3080 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3083 if (patternlength
== -1) {
3084 patternlength
= u_strlen(pattern
);
3086 if (patternlength
== 0) {
3087 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3090 strsrch
->pattern
.text
= pattern
;
3091 strsrch
->pattern
.textLength
= patternlength
;
3092 initialize(strsrch
, status
);
3097 U_CAPI
const UChar
* U_EXPORT2
3098 usearch_getPattern(const UStringSearch
*strsrch
,
3102 *length
= strsrch
->pattern
.textLength
;
3103 return strsrch
->pattern
.text
;
3108 // miscellanous methods --------------------------------------------------
3110 U_CAPI
int32_t U_EXPORT2
usearch_first(UStringSearch
*strsrch
,
3113 if (strsrch
&& U_SUCCESS(*status
)) {
3114 strsrch
->search
->isForwardSearching
= TRUE
;
3115 usearch_setOffset(strsrch
, 0, status
);
3116 if (U_SUCCESS(*status
)) {
3117 return usearch_next(strsrch
, status
);
3120 return USEARCH_DONE
;
3123 U_CAPI
int32_t U_EXPORT2
usearch_following(UStringSearch
*strsrch
,
3127 if (strsrch
&& U_SUCCESS(*status
)) {
3128 strsrch
->search
->isForwardSearching
= TRUE
;
3129 // position checked in usearch_setOffset
3130 usearch_setOffset(strsrch
, position
, status
);
3131 if (U_SUCCESS(*status
)) {
3132 return usearch_next(strsrch
, status
);
3135 return USEARCH_DONE
;
3138 U_CAPI
int32_t U_EXPORT2
usearch_last(UStringSearch
*strsrch
,
3141 if (strsrch
&& U_SUCCESS(*status
)) {
3142 strsrch
->search
->isForwardSearching
= FALSE
;
3143 usearch_setOffset(strsrch
, strsrch
->search
->textLength
, status
);
3144 if (U_SUCCESS(*status
)) {
3145 return usearch_previous(strsrch
, status
);
3148 return USEARCH_DONE
;
3151 U_CAPI
int32_t U_EXPORT2
usearch_preceding(UStringSearch
*strsrch
,
3155 if (strsrch
&& U_SUCCESS(*status
)) {
3156 strsrch
->search
->isForwardSearching
= FALSE
;
3157 // position checked in usearch_setOffset
3158 usearch_setOffset(strsrch
, position
, status
);
3159 if (U_SUCCESS(*status
)) {
3160 return usearch_previous(strsrch
, status
);
3163 return USEARCH_DONE
;
3167 * If a direction switch is required, we'll count the number of ces till the
3168 * beginning of the collation element iterator and iterate forwards that
3169 * number of times. This is so that we get to the correct point within the
3170 * string to continue the search in. Imagine when we are in the middle of the
3171 * normalization buffer when the change in direction is request. arrrgghh....
3172 * After searching the offset within the collation element iterator will be
3173 * shifted to the start of the match. If a match is not found, the offset would
3174 * have been set to the end of the text string in the collation element
3176 * Okay, here's my take on normalization buffer. The only time when there can
3177 * be 2 matches within the same normalization is when the pattern is consists
3178 * of all accents. But since the offset returned is from the text string, we
3179 * should not confuse the caller by returning the second match within the
3180 * same normalization buffer. If we do, the 2 results will have the same match
3181 * offsets, and that'll be confusing. I'll return the next match that doesn't
3182 * fall within the same normalization buffer. Note this does not affect the
3183 * results of matches spanning the text and the normalization buffer.
3184 * The position to start searching is taken from the collation element
3185 * iterator. Callers of this API would have to set the offset in the collation
3186 * element iterator before using this method.
3188 U_CAPI
int32_t U_EXPORT2
usearch_next(UStringSearch
*strsrch
,
3191 if (U_SUCCESS(*status
) && strsrch
) {
3192 // note offset is either equivalent to the start of the previous match
3193 // or is set by the user
3194 int32_t offset
= usearch_getOffset(strsrch
);
3195 USearch
*search
= strsrch
->search
;
3196 search
->reset
= FALSE
;
3197 int32_t textlength
= search
->textLength
;
3198 if (search
->isForwardSearching
) {
3200 if (offset
== textlength
3201 || (!search
->isOverlap
&&
3202 (offset
+ strsrch
->pattern
.defaultShiftSize
> textlength
||
3203 (search
->matchedIndex
!= USEARCH_DONE
&&
3204 offset
+ search
->matchedLength
>= textlength
)))) {
3205 // not enough characters to match
3206 setMatchNotFound(strsrch
);
3207 return USEARCH_DONE
;
3210 if (offset
== textlength
||
3211 (! search
->isOverlap
&&
3212 (search
->matchedIndex
!= USEARCH_DONE
&&
3213 offset
+ search
->matchedLength
> textlength
))) {
3214 // not enough characters to match
3215 setMatchNotFound(strsrch
);
3216 return USEARCH_DONE
;
3221 // switching direction.
3222 // if matchedIndex == USEARCH_DONE, it means that either a
3223 // setOffset has been called or that previous ran off the text
3224 // string. the iterator would have been set to offset 0 if a
3225 // match is not found.
3226 search
->isForwardSearching
= TRUE
;
3227 if (search
->matchedIndex
!= USEARCH_DONE
) {
3228 // there's no need to set the collation element iterator
3229 // the next call to next will set the offset.
3230 return search
->matchedIndex
;
3234 if (U_SUCCESS(*status
)) {
3235 if (strsrch
->pattern
.cesLength
== 0) {
3236 if (search
->matchedIndex
== USEARCH_DONE
) {
3237 search
->matchedIndex
= offset
;
3239 else { // moves by codepoints
3240 U16_FWD_1(search
->text
, search
->matchedIndex
, textlength
);
3243 search
->matchedLength
= 0;
3244 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3245 // status checked below
3246 if (search
->matchedIndex
== textlength
) {
3247 search
->matchedIndex
= USEARCH_DONE
;
3251 if (search
->matchedLength
> 0) {
3252 // if matchlength is 0 we are at the start of the iteration
3253 if (search
->isOverlap
) {
3254 ucol_setOffset(strsrch
->textIter
, offset
+ 1, status
);
3257 ucol_setOffset(strsrch
->textIter
,
3258 offset
+ search
->matchedLength
, status
);
3262 // for boundary check purposes. this will ensure that the
3263 // next match will not preceed the current offset
3264 // note search->matchedIndex will always be set to something
3266 search
->matchedIndex
= offset
- 1;
3269 if (search
->isCanonicalMatch
) {
3270 // can't use exact here since extra accents are allowed.
3271 usearch_handleNextCanonical(strsrch
, status
);
3274 usearch_handleNextExact(strsrch
, status
);
3278 if (U_FAILURE(*status
)) {
3279 return USEARCH_DONE
;
3283 if (search
->matchedIndex
== USEARCH_DONE
) {
3284 ucol_setOffset(strsrch
->textIter
, search
->textLength
, status
);
3286 ucol_setOffset(strsrch
->textIter
, search
->matchedIndex
, status
);
3290 return search
->matchedIndex
;
3293 return USEARCH_DONE
;
3296 U_CAPI
int32_t U_EXPORT2
usearch_previous(UStringSearch
*strsrch
,
3299 if (U_SUCCESS(*status
) && strsrch
) {
3301 USearch
*search
= strsrch
->search
;
3302 if (search
->reset
) {
3303 offset
= search
->textLength
;
3304 search
->isForwardSearching
= FALSE
;
3305 search
->reset
= FALSE
;
3306 setColEIterOffset(strsrch
->textIter
, offset
);
3309 offset
= usearch_getOffset(strsrch
);
3312 int32_t matchedindex
= search
->matchedIndex
;
3313 if (search
->isForwardSearching
== TRUE
) {
3314 // switching direction.
3315 // if matchedIndex == USEARCH_DONE, it means that either a
3316 // setOffset has been called or that next ran off the text
3317 // string. the iterator would have been set to offset textLength if
3318 // a match is not found.
3319 search
->isForwardSearching
= FALSE
;
3320 if (matchedindex
!= USEARCH_DONE
) {
3321 return matchedindex
;
3326 if (offset
== 0 || matchedindex
== 0 ||
3327 (!search
->isOverlap
&&
3328 (offset
< strsrch
->pattern
.defaultShiftSize
||
3329 (matchedindex
!= USEARCH_DONE
&&
3330 matchedindex
< strsrch
->pattern
.defaultShiftSize
)))) {
3331 // not enough characters to match
3332 setMatchNotFound(strsrch
);
3333 return USEARCH_DONE
;
3336 // Could check pattern length, but the
3337 // linear search will do the right thing
3338 if (offset
== 0 || matchedindex
== 0) {
3339 setMatchNotFound(strsrch
);
3340 return USEARCH_DONE
;
3345 if (U_SUCCESS(*status
)) {
3346 if (strsrch
->pattern
.cesLength
== 0) {
3347 search
->matchedIndex
=
3348 (matchedindex
== USEARCH_DONE
? offset
: matchedindex
);
3349 if (search
->matchedIndex
== 0) {
3350 setMatchNotFound(strsrch
);
3351 // status checked below
3353 else { // move by codepoints
3354 U16_BACK_1(search
->text
, 0, search
->matchedIndex
);
3355 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3356 // status checked below
3357 search
->matchedLength
= 0;
3361 if (strsrch
->search
->isCanonicalMatch
) {
3362 // can't use exact here since extra accents are allowed.
3363 usearch_handlePreviousCanonical(strsrch
, status
);
3364 // status checked below
3367 usearch_handlePreviousExact(strsrch
, status
);
3368 // status checked below
3372 if (U_FAILURE(*status
)) {
3373 return USEARCH_DONE
;
3376 return search
->matchedIndex
;
3379 return USEARCH_DONE
;
3384 U_CAPI
void U_EXPORT2
usearch_reset(UStringSearch
*strsrch
)
3387 reset is setting the attributes that are already in
3388 string search, hence all attributes in the collator should
3389 be retrieved without any problems
3392 UErrorCode status
= U_ZERO_ERROR
;
3393 UBool sameCollAttribute
= TRUE
;
3398 // **** hack to deal w/ how processed CEs encode quaternary ****
3399 UCollationStrength newStrength
= ucol_getStrength(strsrch
->collator
);
3400 if ((strsrch
->strength
< UCOL_QUATERNARY
&& newStrength
>= UCOL_QUATERNARY
) ||
3401 (strsrch
->strength
>= UCOL_QUATERNARY
&& newStrength
< UCOL_QUATERNARY
)) {
3402 sameCollAttribute
= FALSE
;
3405 strsrch
->strength
= ucol_getStrength(strsrch
->collator
);
3406 ceMask
= getMask(strsrch
->strength
);
3407 if (strsrch
->ceMask
!= ceMask
) {
3408 strsrch
->ceMask
= ceMask
;
3409 sameCollAttribute
= FALSE
;
3412 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3413 shift
= ucol_getAttribute(strsrch
->collator
, UCOL_ALTERNATE_HANDLING
,
3414 &status
) == UCOL_SHIFTED
;
3415 if (strsrch
->toShift
!= shift
) {
3416 strsrch
->toShift
= shift
;
3417 sameCollAttribute
= FALSE
;
3420 // if status is a failure, ucol_getVariableTop returns 0
3421 varTop
= ucol_getVariableTop(strsrch
->collator
, &status
);
3422 if (strsrch
->variableTop
!= varTop
) {
3423 strsrch
->variableTop
= varTop
;
3424 sameCollAttribute
= FALSE
;
3426 if (!sameCollAttribute
) {
3427 initialize(strsrch
, &status
);
3429 ucol_setText(strsrch
->textIter
, strsrch
->search
->text
,
3430 strsrch
->search
->textLength
,
3432 strsrch
->search
->matchedLength
= 0;
3433 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
3434 strsrch
->search
->isOverlap
= FALSE
;
3435 strsrch
->search
->isCanonicalMatch
= FALSE
;
3436 strsrch
->search
->elementComparisonType
= 0;
3437 strsrch
->search
->isForwardSearching
= TRUE
;
3438 strsrch
->search
->reset
= TRUE
;
3443 // CEI Collation Element + source text index.
3444 // These structs are kept in the circular buffer.
3456 // CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3458 #define DEFAULT_CEBUFFER_SIZE 96
3459 #define CEBUFFER_EXTRA 32
3460 // Some typical max values to make buffer size more reasonable for asymmetric search.
3461 // #8694 is for a better long-term solution to allocation of this buffer.
3462 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3463 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3464 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3466 CEI defBuf
[DEFAULT_CEBUFFER_SIZE
];
3471 UCollationElements
*ceIter
;
3472 UStringSearch
*strSearch
;
3476 CEIBuffer(UStringSearch
*ss
, UErrorCode
*status
);
3478 const CEI
*get(int32_t index
);
3479 const CEI
*getPrevious(int32_t index
);
3483 CEIBuffer::CEIBuffer(UStringSearch
*ss
, UErrorCode
*status
) {
3486 bufSize
= ss
->pattern
.pcesLength
+ CEBUFFER_EXTRA
;
3487 if (ss
->search
->elementComparisonType
!= 0) {
3488 const UChar
* patText
= ss
->pattern
.text
;
3490 const UChar
* patTextLimit
= patText
+ ss
->pattern
.textLength
;
3491 while ( patText
< patTextLimit
) {
3492 UChar c
= *patText
++;
3493 if (MIGHT_BE_JAMO_L(c
)) {
3494 bufSize
+= MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L
;
3496 // No check for surrogates, we might allocate slightly more buffer than necessary.
3497 bufSize
+= MAX_TARGET_IGNORABLES_PER_PAT_OTHER
;
3502 ceIter
= ss
->textIter
;
3506 if (!initTextProcessedIter(ss
, status
)) { return; }
3508 if (bufSize
>DEFAULT_CEBUFFER_SIZE
) {
3509 buf
= (CEI
*)uprv_malloc(bufSize
* sizeof(CEI
));
3511 *status
= U_MEMORY_ALLOCATION_ERROR
;
3516 // TODO: add a reset or init function so that allocated
3517 // buffers can be retained & reused.
3519 CEIBuffer::~CEIBuffer() {
3520 if (buf
!= defBuf
) {
3526 // Get the CE with the specified index.
3527 // Index must be in the range
3528 // n-history_size < index < n+1
3529 // where n is the largest index to have been fetched by some previous call to this function.
3530 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3532 const CEI
*CEIBuffer::get(int32_t index
) {
3533 int i
= index
% bufSize
;
3535 if (index
>=firstIx
&& index
<limitIx
) {
3536 // The request was for an entry already in our buffer.
3541 // Caller is requesting a new, never accessed before, CE.
3542 // Verify that it is the next one in sequence, which is all
3544 if (index
!= limitIx
) {
3550 // Manage the circular CE buffer indexing
3553 if (limitIx
- firstIx
>= bufSize
) {
3554 // The buffer is full, knock out the lowest-indexed entry.
3558 UErrorCode status
= U_ZERO_ERROR
;
3560 buf
[i
].ce
= strSearch
->textProcessedIter
->nextProcessed(&buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3565 // Get the CE with the specified index.
3566 // Index must be in the range
3567 // n-history_size < index < n+1
3568 // where n is the largest index to have been fetched by some previous call to this function.
3569 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3571 const CEI
*CEIBuffer::getPrevious(int32_t index
) {
3572 int i
= index
% bufSize
;
3574 if (index
>=firstIx
&& index
<limitIx
) {
3575 // The request was for an entry already in our buffer.
3580 // Caller is requesting a new, never accessed before, CE.
3581 // Verify that it is the next one in sequence, which is all
3583 if (index
!= limitIx
) {
3589 // Manage the circular CE buffer indexing
3592 if (limitIx
- firstIx
>= bufSize
) {
3593 // The buffer is full, knock out the lowest-indexed entry.
3597 UErrorCode status
= U_ZERO_ERROR
;
3599 buf
[i
].ce
= strSearch
->textProcessedIter
->previousProcessed(&buf
[i
].lowIndex
, &buf
[i
].highIndex
, &status
);
3609 // #define USEARCH_DEBUG
3611 #ifdef USEARCH_DEBUG
3617 * Find the next break boundary after startIndex. If the UStringSearch object
3618 * has an external break iterator, use that. Otherwise use the internal character
3621 static int32_t nextBoundaryAfter(UStringSearch
*strsrch
, int32_t startIndex
) {
3623 const UChar
*text
= strsrch
->search
->text
;
3624 int32_t textLen
= strsrch
->search
->textLength
;
3626 U_ASSERT(startIndex
>=0);
3627 U_ASSERT(startIndex
<=textLen
);
3629 if (startIndex
>= textLen
) {
3634 int32_t i
= startIndex
;
3635 U16_NEXT(text
, i
, textLen
, c
);
3637 // If we are on a control character, stop without looking for combining marks.
3638 // Control characters do not combine.
3639 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3640 if (gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
) {
3644 // The initial character was not a control, and can thus accept trailing
3645 // combining characters. Advance over however many of them there are.
3646 int32_t indexOfLastCharChecked
;
3648 indexOfLastCharChecked
= i
;
3652 U16_NEXT(text
, i
, textLen
, c
);
3653 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3654 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3658 return indexOfLastCharChecked
;
3659 #elif !UCONFIG_NO_BREAK_ITERATION
3660 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3662 if (breakiterator
== NULL
) {
3663 breakiterator
= strsrch
->search
->internalBreakIter
;
3666 if (breakiterator
!= NULL
) {
3667 return ubrk_following(breakiterator
, startIndex
);
3672 // **** or should we use the original code? ****
3679 * Returns TRUE if index is on a break boundary. If the UStringSearch
3680 * has an external break iterator, test using that, otherwise test
3681 * using the internal character break iterator.
3683 static UBool
isBreakBoundary(UStringSearch
*strsrch
, int32_t index
) {
3685 const UChar
*text
= strsrch
->search
->text
;
3686 int32_t textLen
= strsrch
->search
->textLength
;
3689 U_ASSERT(index
<=textLen
);
3691 if (index
>=textLen
|| index
<=0) {
3695 // If the character at the current index is not a GRAPHEME_EXTEND
3696 // then we can not be within a combining sequence.
3698 U16_GET(text
, 0, index
, textLen
, c
);
3699 int32_t gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3700 if (gcProperty
!= U_GCB_EXTEND
&& gcProperty
!= U_GCB_SPACING_MARK
) {
3704 // We are at a combining mark. If the preceding character is anything
3705 // except a CONTROL, CR or LF, we are in a combining sequence.
3706 U16_PREV(text
, 0, index
, c
);
3707 gcProperty
= u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
3708 UBool combining
= !(gcProperty
==U_GCB_CONTROL
|| gcProperty
==U_GCB_LF
|| gcProperty
==U_GCB_CR
);
3710 #elif !UCONFIG_NO_BREAK_ITERATION
3711 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3713 if (breakiterator
== NULL
) {
3714 breakiterator
= strsrch
->search
->internalBreakIter
;
3717 return (breakiterator
!= NULL
&& ubrk_isBoundary(breakiterator
, index
));
3719 // **** or use the original code? ****
3725 static UBool
onBreakBoundaries(const UStringSearch
*strsrch
, int32_t start
, int32_t end
)
3727 #if !UCONFIG_NO_BREAK_ITERATION
3728 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
3730 if (breakiterator
!= NULL
) {
3731 int32_t startindex
= ubrk_first(breakiterator
);
3732 int32_t endindex
= ubrk_last(breakiterator
);
3734 // out-of-range indexes are never boundary positions
3735 if (start
< startindex
|| start
> endindex
||
3736 end
< startindex
|| end
> endindex
) {
3740 return ubrk_isBoundary(breakiterator
, start
) &&
3741 ubrk_isBoundary(breakiterator
, end
);
3754 } UCompareCEsResult
;
3755 #define U_CE_LEVEL2_BASE 0x00000005
3756 #define U_CE_LEVEL3_BASE 0x00050000
3758 static UCompareCEsResult
compareCE64s(int64_t targCE
, int64_t patCE
, int16_t compareType
) {
3759 if (targCE
== patCE
) {
3762 if (compareType
== 0) {
3763 return U_CE_NO_MATCH
;
3766 int64_t targCEshifted
= targCE
>> 32;
3767 int64_t patCEshifted
= patCE
>> 32;
3771 int32_t targLev1
= (int32_t)(targCEshifted
& mask
);
3772 int32_t patLev1
= (int32_t)(patCEshifted
& mask
);
3773 if ( targLev1
!= patLev1
) {
3774 if ( targLev1
== 0 ) {
3775 return U_CE_SKIP_TARG
;
3777 if ( patLev1
== 0 && compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
3778 return U_CE_SKIP_PATN
;
3780 return U_CE_NO_MATCH
;
3784 int32_t targLev2
= (int32_t)(targCEshifted
& mask
);
3785 int32_t patLev2
= (int32_t)(patCEshifted
& mask
);
3786 if ( targLev2
!= patLev2
) {
3787 if ( targLev2
== 0 ) {
3788 return U_CE_SKIP_TARG
;
3790 if ( patLev2
== 0 && compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
) {
3791 return U_CE_SKIP_PATN
;
3793 return (patLev2
== U_CE_LEVEL2_BASE
|| (compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
&& targLev2
== U_CE_LEVEL2_BASE
) )?
3794 U_CE_MATCH
: U_CE_NO_MATCH
;
3798 int32_t targLev3
= (int32_t)(targCE
& mask
);
3799 int32_t patLev3
= (int32_t)(patCE
& mask
);
3800 if ( targLev3
!= patLev3
) {
3801 return (patLev3
== U_CE_LEVEL3_BASE
|| (compareType
== USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD
&& targLev3
== U_CE_LEVEL3_BASE
) )?
3802 U_CE_MATCH
: U_CE_NO_MATCH
;
3809 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3814 UChar32
codePointAt(const USearch
&search
, int32_t index
) {
3815 if (index
< search
.textLength
) {
3817 U16_NEXT(search
.text
, index
, search
.textLength
, c
);
3823 UChar32
codePointBefore(const USearch
&search
, int32_t index
) {
3826 U16_PREV(search
.text
, 0, index
, c
);
3834 U_CAPI UBool U_EXPORT2
usearch_search(UStringSearch
*strsrch
,
3836 int32_t *matchStart
,
3837 int32_t *matchLimit
,
3840 if (U_FAILURE(*status
)) {
3844 // TODO: reject search patterns beginning with a combining char.
3846 #ifdef USEARCH_DEBUG
3847 if (getenv("USEARCH_DEBUG") != NULL
) {
3848 printf("Pattern CEs\n");
3849 for (int ii
=0; ii
<strsrch
->pattern
.cesLength
; ii
++) {
3850 printf(" %8x", strsrch
->pattern
.ces
[ii
]);
3856 // Input parameter sanity check.
3857 // TODO: should input indicies clip to the text length
3858 // in the same way that UText does.
3859 if(strsrch
->pattern
.cesLength
== 0 ||
3861 startIdx
> strsrch
->search
->textLength
||
3862 strsrch
->pattern
.ces
== NULL
) {
3863 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
3867 if (strsrch
->pattern
.pces
== NULL
) {
3868 initializePatternPCETable(strsrch
, status
);
3871 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
3872 CEIBuffer
ceb(strsrch
, status
);
3875 int32_t targetIx
= 0;
3876 const CEI
*targetCEI
= NULL
;
3880 int32_t mStart
= -1;
3881 int32_t mLimit
= -1;
3887 // Outer loop moves over match starting positions in the
3889 // Here we see the target as a sequence of collation elements, resulting from the following:
3890 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3891 // (for example, digraphs such as IJ may be broken into two characters).
3892 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3893 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3894 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3895 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3896 // then the CE is deleted, so the following code sees only CEs that are relevant.
3897 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3898 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3899 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3901 for(targetIx
=0; ; targetIx
++)
3904 // Inner loop checks for a match beginning at each
3905 // position from the outer loop.
3906 int32_t targetIxOffset
= 0;
3908 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3909 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3910 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3911 const CEI
*firstCEI
= ceb
.get(targetIx
);
3912 if (firstCEI
== NULL
) {
3913 *status
= U_INTERNAL_PROGRAM_ERROR
;
3918 for (patIx
=0; patIx
<strsrch
->pattern
.pcesLength
; patIx
++) {
3919 patCE
= strsrch
->pattern
.pces
[patIx
];
3920 targetCEI
= ceb
.get(targetIx
+patIx
+targetIxOffset
);
3921 // Compare CE from target string with CE from the pattern.
3922 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3923 // which will fail the compare, below.
3924 UCompareCEsResult ceMatch
= compareCE64s(targetCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
3925 if ( ceMatch
== U_CE_NO_MATCH
) {
3928 } else if ( ceMatch
> U_CE_NO_MATCH
) {
3929 if ( ceMatch
== U_CE_SKIP_TARG
) {
3930 // redo with same patCE, next targCE
3933 } else { // ceMatch == U_CE_SKIP_PATN
3934 // redo with same targCE, next patCE
3939 targetIxOffset
+= strsrch
->pattern
.pcesLength
; // this is now the offset in target CE space to end of the match so far
3941 if (!found
&& ((targetCEI
== NULL
) || (targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
))) {
3942 // No match at this targetIx. Try again at the next.
3947 // No match at all, we have run off the end of the target text.
3952 // We have found a match in CE space.
3953 // Now determine the bounds in string index space.
3954 // There still is a chance of match failure if the CE range not correspond to
3955 // an acceptable character range.
3957 const CEI
*lastCEI
= ceb
.get(targetIx
+ targetIxOffset
- 1);
3959 mStart
= firstCEI
->lowIndex
;
3960 minLimit
= lastCEI
->lowIndex
;
3962 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3963 // extended to the end of input, and the match is good.
3965 // Look at the high and low indices of the CE following the match. If
3966 // they are the same it means one of two things:
3967 // 1. The match extended to the last CE from the target text, which is OK, or
3968 // 2. The last CE that was part of the match is in an expansion that extends
3969 // to the first CE after the match. In this case, we reject the match.
3970 const CEI
*nextCEI
= 0;
3971 if (strsrch
->search
->elementComparisonType
== 0) {
3972 nextCEI
= ceb
.get(targetIx
+ targetIxOffset
);
3973 maxLimit
= nextCEI
->lowIndex
;
3974 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
3978 for ( ; ; ++targetIxOffset
) {
3979 nextCEI
= ceb
.get(targetIx
+ targetIxOffset
);
3980 maxLimit
= nextCEI
->lowIndex
;
3981 // If we are at the end of the target too, match succeeds
3982 if ( nextCEI
->ce
== UCOL_PROCESSED_NULLORDER
) {
3985 // As long as the next CE has primary weight of 0,
3986 // it is part of the last target element matched by the pattern;
3987 // make sure it can be part of a match with the last patCE
3988 if ( (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) == 0 ) {
3989 UCompareCEsResult ceMatch
= compareCE64s(nextCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
3990 if ( ceMatch
== U_CE_NO_MATCH
|| ceMatch
== U_CE_SKIP_PATN
) {
3994 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3995 // target element, but it has non-zero primary weight => match fails
3996 } else if ( nextCEI
->lowIndex
== nextCEI
->highIndex
) {
3999 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4007 // Check for the start of the match being within a combining sequence.
4008 // This can happen if the pattern itself begins with a combining char, and
4009 // the match found combining marks in the target text that were attached
4010 // to something else.
4011 // This type of match should be rejected for not completely consuming a
4012 // combining sequence.
4013 if (!isBreakBoundary(strsrch
, mStart
)) {
4017 // Check for the start of the match being within an Collation Element Expansion,
4018 // meaning that the first char of the match is only partially matched.
4019 // With exapnsions, the first CE will report the index of the source
4020 // character, and all subsequent (expansions) CEs will report the source index of the
4021 // _following_ character.
4022 int32_t secondIx
= firstCEI
->highIndex
;
4023 if (mStart
== secondIx
) {
4027 // Allow matches to end in the middle of a grapheme cluster if the following
4028 // conditions are met; this is needed to make prefix search work properly in
4029 // Indic, see #11750
4030 // * the default breakIter is being used
4031 // * the next collation element after this combining sequence
4032 // - has non-zero primary weight
4033 // - corresponds to a separate character following the one at end of the current match
4034 // (the second of these conditions, and perhaps both, may be redundant given the
4035 // subsequent check for normalization boundary; however they are likely much faster
4036 // tests in any case)
4037 // * the match limit is a normalization boundary
4038 UBool allowMidclusterMatch
= FALSE
;
4039 if (strsrch
->search
->text
!= NULL
&& strsrch
->search
->textLength
> maxLimit
) {
4040 allowMidclusterMatch
=
4041 strsrch
->search
->breakIter
== NULL
&&
4042 nextCEI
!= NULL
&& (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) != 0 &&
4043 maxLimit
>= lastCEI
->highIndex
&& nextCEI
->highIndex
> maxLimit
&&
4044 (strsrch
->nfd
->hasBoundaryBefore(codePointAt(*strsrch
->search
, maxLimit
)) ||
4045 strsrch
->nfd
->hasBoundaryAfter(codePointBefore(*strsrch
->search
, maxLimit
)));
4047 // If those conditions are met, then:
4048 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4049 // the match limit may be backed off to a previous break boundary. This handles
4050 // cases in which mLimit includes target characters that are ignorable with current
4051 // settings (such as space) and which extend beyond the pattern match.
4052 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4053 // * do NOT require that match limit be on a breakIter boundary
4055 // Advance the match end position to the first acceptable match boundary.
4056 // This advances the index over any combining charcters.
4058 if (minLimit
< maxLimit
) {
4059 // When the last CE's low index is same with its high index, the CE is likely
4060 // a part of expansion. In this case, the index is located just after the
4061 // character corresponding to the CEs compared above. If the index is right
4062 // at the break boundary, move the position to the next boundary will result
4063 // incorrect match length when there are ignorable characters exist between
4064 // the position and the next character produces CE(s). See ticket#8482.
4065 if (minLimit
== lastCEI
->highIndex
&& isBreakBoundary(strsrch
, minLimit
)) {
4068 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4069 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4070 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4071 // (i.e. we back off mLimit to the previous breakIterator boundary).
4072 if (nba
>= lastCEI
->highIndex
&& (!allowMidclusterMatch
|| nba
< maxLimit
)) {
4078 #ifdef USEARCH_DEBUG
4079 if (getenv("USEARCH_DEBUG") != NULL
) {
4080 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4084 if (!allowMidclusterMatch
) {
4085 // If advancing to the end of a combining sequence in character indexing space
4086 // advanced us beyond the end of the match in CE space, reject this match.
4087 if (mLimit
> maxLimit
) {
4091 if (!isBreakBoundary(strsrch
, mLimit
)) {
4096 if (! checkIdentical(strsrch
, mStart
, mLimit
)) {
4105 #ifdef USEARCH_DEBUG
4106 if (getenv("USEARCH_DEBUG") != NULL
) {
4107 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4108 int32_t lastToPrint
= ceb
.limitIx
+2;
4109 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4110 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4112 printf("\n%s\n", found
? "match found" : "no match");
4116 // All Done. Store back the match bounds to the caller.
4123 if (matchStart
!= NULL
) {
4124 *matchStart
= mStart
;
4127 if (matchLimit
!= NULL
) {
4128 *matchLimit
= mLimit
;
4134 U_CAPI UBool U_EXPORT2
usearch_searchBackwards(UStringSearch
*strsrch
,
4136 int32_t *matchStart
,
4137 int32_t *matchLimit
,
4140 if (U_FAILURE(*status
)) {
4144 // TODO: reject search patterns beginning with a combining char.
4146 #ifdef USEARCH_DEBUG
4147 if (getenv("USEARCH_DEBUG") != NULL
) {
4148 printf("Pattern CEs\n");
4149 for (int ii
=0; ii
<strsrch
->pattern
.cesLength
; ii
++) {
4150 printf(" %8x", strsrch
->pattern
.ces
[ii
]);
4156 // Input parameter sanity check.
4157 // TODO: should input indicies clip to the text length
4158 // in the same way that UText does.
4159 if(strsrch
->pattern
.cesLength
== 0 ||
4161 startIdx
> strsrch
->search
->textLength
||
4162 strsrch
->pattern
.ces
== NULL
) {
4163 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
4167 if (strsrch
->pattern
.pces
== NULL
) {
4168 initializePatternPCETable(strsrch
, status
);
4171 CEIBuffer
ceb(strsrch
, status
);
4172 int32_t targetIx
= 0;
4175 * Pre-load the buffer with the CE's for the grapheme
4176 * after our starting position so that we're sure that
4177 * we can look at the CE following the match when we
4178 * check the match boundaries.
4180 * This will also pre-fetch the first CE that we'll
4181 * consider for the match.
4183 if (startIdx
< strsrch
->search
->textLength
) {
4184 UBreakIterator
*bi
= strsrch
->search
->internalBreakIter
;
4185 int32_t next
= ubrk_following(bi
, startIdx
);
4187 ucol_setOffset(strsrch
->textIter
, next
, status
);
4189 for (targetIx
= 0; ; targetIx
+= 1) {
4190 if (ceb
.getPrevious(targetIx
)->lowIndex
< startIdx
) {
4195 ucol_setOffset(strsrch
->textIter
, startIdx
, status
);
4199 const CEI
*targetCEI
= NULL
;
4203 int32_t limitIx
= targetIx
;
4204 int32_t mStart
= -1;
4205 int32_t mLimit
= -1;
4211 // Outer loop moves over match starting positions in the
4213 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4214 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4215 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4216 // and the beginning of the base text.
4217 for(targetIx
= limitIx
; ; targetIx
+= 1)
4220 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4221 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4222 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4223 const CEI
*lastCEI
= ceb
.getPrevious(targetIx
);
4224 if (lastCEI
== NULL
) {
4225 *status
= U_INTERNAL_PROGRAM_ERROR
;
4229 // Inner loop checks for a match beginning at each
4230 // position from the outer loop.
4231 int32_t targetIxOffset
= 0;
4232 for (patIx
= strsrch
->pattern
.pcesLength
- 1; patIx
>= 0; patIx
-= 1) {
4233 int64_t patCE
= strsrch
->pattern
.pces
[patIx
];
4235 targetCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.pcesLength
- 1 - patIx
+ targetIxOffset
);
4236 // Compare CE from target string with CE from the pattern.
4237 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4238 // which will fail the compare, below.
4239 UCompareCEsResult ceMatch
= compareCE64s(targetCEI
->ce
, patCE
, strsrch
->search
->elementComparisonType
);
4240 if ( ceMatch
== U_CE_NO_MATCH
) {
4243 } else if ( ceMatch
> U_CE_NO_MATCH
) {
4244 if ( ceMatch
== U_CE_SKIP_TARG
) {
4245 // redo with same patCE, next targCE
4248 } else { // ceMatch == U_CE_SKIP_PATN
4249 // redo with same targCE, next patCE
4255 if (!found
&& ((targetCEI
== NULL
) || (targetCEI
->ce
!= UCOL_PROCESSED_NULLORDER
))) {
4256 // No match at this targetIx. Try again at the next.
4261 // No match at all, we have run off the end of the target text.
4266 // We have found a match in CE space.
4267 // Now determine the bounds in string index space.
4268 // There still is a chance of match failure if the CE range not correspond to
4269 // an acceptable character range.
4271 const CEI
*firstCEI
= ceb
.getPrevious(targetIx
+ strsrch
->pattern
.pcesLength
- 1 + targetIxOffset
);
4272 mStart
= firstCEI
->lowIndex
;
4274 // Check for the start of the match being within a combining sequence.
4275 // This can happen if the pattern itself begins with a combining char, and
4276 // the match found combining marks in the target text that were attached
4277 // to something else.
4278 // This type of match should be rejected for not completely consuming a
4279 // combining sequence.
4280 if (!isBreakBoundary(strsrch
, mStart
)) {
4284 // Look at the high index of the first CE in the match. If it's the same as the
4285 // low index, the first CE in the match is in the middle of an expansion.
4286 if (mStart
== firstCEI
->highIndex
) {
4291 minLimit
= lastCEI
->lowIndex
;
4294 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4295 // extended to the end of input, and the match is good.
4297 // Look at the high and low indices of the CE following the match. If
4298 // they are the same it means one of two things:
4299 // 1. The match extended to the last CE from the target text, which is OK, or
4300 // 2. The last CE that was part of the match is in an expansion that extends
4301 // to the first CE after the match. In this case, we reject the match.
4302 const CEI
*nextCEI
= ceb
.getPrevious(targetIx
- 1);
4304 if (nextCEI
->lowIndex
== nextCEI
->highIndex
&& nextCEI
->ce
!= UCOL_PROCESSED_NULLORDER
) {
4308 mLimit
= maxLimit
= nextCEI
->lowIndex
;
4310 // Allow matches to end in the middle of a grapheme cluster if the following
4311 // conditions are met; this is needed to make prefix search work properly in
4312 // Indic, see #11750
4313 // * the default breakIter is being used
4314 // * the next collation element after this combining sequence
4315 // - has non-zero primary weight
4316 // - corresponds to a separate character following the one at end of the current match
4317 // (the second of these conditions, and perhaps both, may be redundant given the
4318 // subsequent check for normalization boundary; however they are likely much faster
4319 // tests in any case)
4320 // * the match limit is a normalization boundary
4321 UBool allowMidclusterMatch
= FALSE
;
4322 if (strsrch
->search
->text
!= NULL
&& strsrch
->search
->textLength
> maxLimit
) {
4323 allowMidclusterMatch
=
4324 strsrch
->search
->breakIter
== NULL
&&
4325 nextCEI
!= NULL
&& (((nextCEI
->ce
) >> 32) & 0xFFFF0000UL
) != 0 &&
4326 maxLimit
>= lastCEI
->highIndex
&& nextCEI
->highIndex
> maxLimit
&&
4327 (strsrch
->nfd
->hasBoundaryBefore(codePointAt(*strsrch
->search
, maxLimit
)) ||
4328 strsrch
->nfd
->hasBoundaryAfter(codePointBefore(*strsrch
->search
, maxLimit
)));
4330 // If those conditions are met, then:
4331 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4332 // the match limit may be backed off to a previous break boundary. This handles
4333 // cases in which mLimit includes target characters that are ignorable with current
4334 // settings (such as space) and which extend beyond the pattern match.
4335 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4336 // * do NOT require that match limit be on a breakIter boundary
4338 // Advance the match end position to the first acceptable match boundary.
4339 // This advances the index over any combining characters.
4340 if (minLimit
< maxLimit
) {
4341 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4342 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4343 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4344 // (i.e. we back off mLimit to the previous breakIterator boundary).
4345 if (nba
>= lastCEI
->highIndex
&& (!allowMidclusterMatch
|| nba
< maxLimit
)) {
4350 if (!allowMidclusterMatch
) {
4351 // If advancing to the end of a combining sequence in character indexing space
4352 // advanced us beyond the end of the match in CE space, reject this match.
4353 if (mLimit
> maxLimit
) {
4357 // Make sure the end of the match is on a break boundary
4358 if (!isBreakBoundary(strsrch
, mLimit
)) {
4364 // No non-ignorable CEs after this point.
4365 // The maximum position is detected by boundary after
4366 // the last non-ignorable CE. Combining sequence
4367 // across the start index will be truncated.
4368 int32_t nba
= nextBoundaryAfter(strsrch
, minLimit
);
4369 mLimit
= maxLimit
= (nba
> 0) && (startIdx
> nba
) ? nba
: startIdx
;
4372 #ifdef USEARCH_DEBUG
4373 if (getenv("USEARCH_DEBUG") != NULL
) {
4374 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit
, maxLimit
, mLimit
);
4379 if (! checkIdentical(strsrch
, mStart
, mLimit
)) {
4388 #ifdef USEARCH_DEBUG
4389 if (getenv("USEARCH_DEBUG") != NULL
) {
4390 printf("Target CEs [%d .. %d]\n", ceb
.firstIx
, ceb
.limitIx
);
4391 int32_t lastToPrint
= ceb
.limitIx
+2;
4392 for (int ii
=ceb
.firstIx
; ii
<lastToPrint
; ii
++) {
4393 printf("%8x@%d ", ceb
.get(ii
)->ce
, ceb
.get(ii
)->srcIndex
);
4395 printf("\n%s\n", found
? "match found" : "no match");
4399 // All Done. Store back the match bounds to the caller.
4406 if (matchStart
!= NULL
) {
4407 *matchStart
= mStart
;
4410 if (matchLimit
!= NULL
) {
4411 *matchLimit
= mLimit
;
4417 // internal use methods declared in usrchimp.h -----------------------------
4419 UBool
usearch_handleNextExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4421 if (U_FAILURE(*status
)) {
4422 setMatchNotFound(strsrch
);
4427 UCollationElements
*coleiter
= strsrch
->textIter
;
4428 int32_t textlength
= strsrch
->search
->textLength
;
4429 int32_t *patternce
= strsrch
->pattern
.ces
;
4430 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4431 int32_t textoffset
= ucol_getOffset(coleiter
);
4433 // status used in setting coleiter offset, since offset is checked in
4434 // shiftForward before setting the coleiter offset, status never
4436 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4438 while (textoffset
<= textlength
)
4440 uint32_t patternceindex
= patterncelength
- 1;
4442 UBool found
= FALSE
;
4443 int32_t lastce
= UCOL_NULLORDER
;
4445 setColEIterOffset(coleiter
, textoffset
);
4448 // finding the last pattern ce match, imagine composite characters
4449 // for example: search for pattern A in text \u00C0
4450 // we'll have to skip \u0300 the grave first before we get to A
4451 targetce
= ucol_previous(coleiter
, status
);
4452 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4456 targetce
= getCE(strsrch
, targetce
);
4457 if (targetce
== UCOL_IGNORABLE
&& inNormBuf(coleiter
)) {
4458 // this is for the text \u0315\u0300 that requires
4459 // normalization and pattern \u0300, where \u0315 is ignorable
4462 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4465 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4466 if (targetce
== patternce
[patternceindex
]) {
4467 // the first ce can be a contraction
4471 if (!hasExpansion(coleiter
)) {
4477 //targetce = lastce;
4479 while (found
&& patternceindex
> 0) {
4481 targetce
= ucol_previous(coleiter
, status
);
4482 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4486 targetce
= getCE(strsrch
, targetce
);
4487 if (targetce
== UCOL_IGNORABLE
) {
4492 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4493 found
= found
&& targetce
== patternce
[patternceindex
];
4499 if (U_FAILURE(*status
)) {
4502 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4504 // status checked at loop.
4505 patternceindex
= patterncelength
;
4509 if (checkNextExactMatch(strsrch
, &textoffset
, status
)) {
4510 // status checked in ucol_setOffset
4511 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4515 setMatchNotFound(strsrch
);
4518 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4522 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4523 strsrch
->search
->matchedIndex
= start
;
4524 strsrch
->search
->matchedLength
= end
- start
;
4527 setMatchNotFound(strsrch
);
4533 UBool
usearch_handleNextCanonical(UStringSearch
*strsrch
, UErrorCode
*status
)
4535 if (U_FAILURE(*status
)) {
4536 setMatchNotFound(strsrch
);
4541 UCollationElements
*coleiter
= strsrch
->textIter
;
4542 int32_t textlength
= strsrch
->search
->textLength
;
4543 int32_t *patternce
= strsrch
->pattern
.ces
;
4544 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4545 int32_t textoffset
= ucol_getOffset(coleiter
);
4546 UBool hasPatternAccents
=
4547 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4549 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
4551 strsrch
->canonicalPrefixAccents
[0] = 0;
4552 strsrch
->canonicalSuffixAccents
[0] = 0;
4554 while (textoffset
<= textlength
)
4556 int32_t patternceindex
= patterncelength
- 1;
4558 UBool found
= FALSE
;
4559 int32_t lastce
= UCOL_NULLORDER
;
4561 setColEIterOffset(coleiter
, textoffset
);
4564 // finding the last pattern ce match, imagine composite characters
4565 // for example: search for pattern A in text \u00C0
4566 // we'll have to skip \u0300 the grave first before we get to A
4567 targetce
= ucol_previous(coleiter
, status
);
4568 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4572 targetce
= getCE(strsrch
, targetce
);
4573 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
4576 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4577 if (targetce
== patternce
[patternceindex
]) {
4578 // the first ce can be a contraction
4582 if (!hasExpansion(coleiter
)) {
4588 while (found
&& patternceindex
> 0) {
4589 targetce
= ucol_previous(coleiter
, status
);
4590 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4594 targetce
= getCE(strsrch
, targetce
);
4595 if (targetce
== UCOL_IGNORABLE
) {
4600 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4601 found
= found
&& targetce
== patternce
[patternceindex
];
4604 // initializing the rearranged accent array
4605 if (hasPatternAccents
&& !found
) {
4606 strsrch
->canonicalPrefixAccents
[0] = 0;
4607 strsrch
->canonicalSuffixAccents
[0] = 0;
4608 if (U_FAILURE(*status
)) {
4611 found
= doNextCanonicalMatch(strsrch
, textoffset
, status
);
4615 if (U_FAILURE(*status
)) {
4618 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
4620 // status checked at loop
4621 patternceindex
= patterncelength
;
4625 if (checkNextCanonicalMatch(strsrch
, &textoffset
, status
)) {
4626 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
4630 setMatchNotFound(strsrch
);
4633 int32_t textOffset
= ucol_getOffset(strsrch
->textIter
);
4637 if (usearch_search(strsrch
, textOffset
, &start
, &end
, status
)) {
4638 strsrch
->search
->matchedIndex
= start
;
4639 strsrch
->search
->matchedLength
= end
- start
;
4642 setMatchNotFound(strsrch
);
4648 UBool
usearch_handlePreviousExact(UStringSearch
*strsrch
, UErrorCode
*status
)
4650 if (U_FAILURE(*status
)) {
4651 setMatchNotFound(strsrch
);
4656 UCollationElements
*coleiter
= strsrch
->textIter
;
4657 int32_t *patternce
= strsrch
->pattern
.ces
;
4658 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4659 int32_t textoffset
= ucol_getOffset(coleiter
);
4661 // shifting it check for setting offset
4662 // if setOffset is called previously or there was no previous match, we
4663 // leave the offset as it is.
4664 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4665 textoffset
= strsrch
->search
->matchedIndex
;
4668 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
4671 while (textoffset
>= 0)
4673 int32_t patternceindex
= 1;
4675 UBool found
= FALSE
;
4676 int32_t firstce
= UCOL_NULLORDER
;
4678 // if status is a failure, ucol_setOffset does nothing
4679 setColEIterOffset(coleiter
, textoffset
);
4682 // finding the first pattern ce match, imagine composite
4683 // characters. for example: search for pattern \u0300 in text
4684 // \u00C0, we'll have to skip A first before we get to
4685 // \u0300 the grave accent
4686 targetce
= ucol_next(coleiter
, status
);
4687 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4691 targetce
= getCE(strsrch
, targetce
);
4692 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
4695 if (targetce
== UCOL_IGNORABLE
&& strsrch
->strength
!= UCOL_PRIMARY
) {
4698 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4699 if (targetce
== patternce
[0]) {
4703 if (!hasExpansion(coleiter
)) {
4704 // checking for accents in composite character
4710 //targetce = firstce;
4712 while (found
&& (patternceindex
< patterncelength
)) {
4714 targetce
= ucol_next(coleiter
, status
);
4715 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4719 targetce
= getCE(strsrch
, targetce
);
4720 if (targetce
== UCOL_IGNORABLE
) {
4724 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4725 found
= found
&& targetce
== patternce
[patternceindex
];
4732 if (U_FAILURE(*status
)) {
4736 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
4742 if (checkPreviousExactMatch(strsrch
, &textoffset
, status
)) {
4743 setColEIterOffset(coleiter
, textoffset
);
4747 setMatchNotFound(strsrch
);
4752 if (strsrch
->search
->isOverlap
) {
4753 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4754 textOffset
= strsrch
->search
->matchedIndex
+ strsrch
->search
->matchedLength
- 1;
4756 // move the start position at the end of possible match
4757 initializePatternPCETable(strsrch
, status
);
4758 if (!initTextProcessedIter(strsrch
, status
)) {
4759 setMatchNotFound(strsrch
);
4762 for (int32_t nPCEs
= 0; nPCEs
< strsrch
->pattern
.pcesLength
- 1; nPCEs
++) {
4763 int64_t pce
= strsrch
->textProcessedIter
->nextProcessed(NULL
, NULL
, status
);
4764 if (pce
== UCOL_PROCESSED_NULLORDER
) {
4765 // at the end of the text
4769 if (U_FAILURE(*status
)) {
4770 setMatchNotFound(strsrch
);
4773 textOffset
= ucol_getOffset(strsrch
->textIter
);
4776 textOffset
= ucol_getOffset(strsrch
->textIter
);
4782 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4783 strsrch
->search
->matchedIndex
= start
;
4784 strsrch
->search
->matchedLength
= end
- start
;
4787 setMatchNotFound(strsrch
);
4793 UBool
usearch_handlePreviousCanonical(UStringSearch
*strsrch
,
4796 if (U_FAILURE(*status
)) {
4797 setMatchNotFound(strsrch
);
4802 UCollationElements
*coleiter
= strsrch
->textIter
;
4803 int32_t *patternce
= strsrch
->pattern
.ces
;
4804 int32_t patterncelength
= strsrch
->pattern
.cesLength
;
4805 int32_t textoffset
= ucol_getOffset(coleiter
);
4806 UBool hasPatternAccents
=
4807 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
4809 // shifting it check for setting offset
4810 // if setOffset is called previously or there was no previous match, we
4811 // leave the offset as it is.
4812 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4813 textoffset
= strsrch
->search
->matchedIndex
;
4816 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
4818 strsrch
->canonicalPrefixAccents
[0] = 0;
4819 strsrch
->canonicalSuffixAccents
[0] = 0;
4821 while (textoffset
>= 0)
4823 int32_t patternceindex
= 1;
4825 UBool found
= FALSE
;
4826 int32_t firstce
= UCOL_NULLORDER
;
4828 setColEIterOffset(coleiter
, textoffset
);
4830 // finding the first pattern ce match, imagine composite
4831 // characters. for example: search for pattern \u0300 in text
4832 // \u00C0, we'll have to skip A first before we get to
4833 // \u0300 the grave accent
4834 targetce
= ucol_next(coleiter
, status
);
4835 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4839 targetce
= getCE(strsrch
, targetce
);
4840 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
4844 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4845 if (targetce
== patternce
[0]) {
4846 // the first ce can be a contraction
4850 if (!hasExpansion(coleiter
)) {
4851 // checking for accents in composite character
4859 while (found
&& patternceindex
< patterncelength
) {
4860 targetce
= ucol_next(coleiter
, status
);
4861 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
4865 targetce
= getCE(strsrch
, targetce
);
4866 if (targetce
== UCOL_IGNORABLE
) {
4870 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4871 found
= found
&& targetce
== patternce
[patternceindex
];
4875 // initializing the rearranged accent array
4876 if (hasPatternAccents
&& !found
) {
4877 strsrch
->canonicalPrefixAccents
[0] = 0;
4878 strsrch
->canonicalSuffixAccents
[0] = 0;
4879 if (U_FAILURE(*status
)) {
4882 found
= doPreviousCanonicalMatch(strsrch
, textoffset
, status
);
4886 if (U_FAILURE(*status
)) {
4889 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
4895 if (checkPreviousCanonicalMatch(strsrch
, &textoffset
, status
)) {
4896 setColEIterOffset(coleiter
, textoffset
);
4900 setMatchNotFound(strsrch
);
4905 if (strsrch
->search
->isOverlap
) {
4906 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
4907 textOffset
= strsrch
->search
->matchedIndex
+ strsrch
->search
->matchedLength
- 1;
4909 // move the start position at the end of possible match
4910 initializePatternPCETable(strsrch
, status
);
4911 if (!initTextProcessedIter(strsrch
, status
)) {
4912 setMatchNotFound(strsrch
);
4915 for (int32_t nPCEs
= 0; nPCEs
< strsrch
->pattern
.pcesLength
- 1; nPCEs
++) {
4916 int64_t pce
= strsrch
->textProcessedIter
->nextProcessed(NULL
, NULL
, status
);
4917 if (pce
== UCOL_PROCESSED_NULLORDER
) {
4918 // at the end of the text
4922 if (U_FAILURE(*status
)) {
4923 setMatchNotFound(strsrch
);
4926 textOffset
= ucol_getOffset(strsrch
->textIter
);
4929 textOffset
= ucol_getOffset(strsrch
->textIter
);
4935 if (usearch_searchBackwards(strsrch
, textOffset
, &start
, &end
, status
)) {
4936 strsrch
->search
->matchedIndex
= start
;
4937 strsrch
->search
->matchedLength
= end
- start
;
4940 setMatchNotFound(strsrch
);
4946 #endif /* #if !UCONFIG_NO_COLLATION */