2 **********************************************************************
3 * Copyright (C) 2001-2003 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
22 // internal definition ---------------------------------------------------
24 #define LAST_BYTE_MASK_ 0xFF
25 #define SECOND_LAST_BYTE_SHIFT_ 8
26 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
28 static const uint16_t *FCD_
= NULL
;
30 // internal methods -------------------------------------------------
33 * Fast collation element iterator setOffset.
34 * This function does not check for bounds.
35 * @param coleiter collation element iterator
36 * @param offset to set
39 inline void setColEIterOffset(UCollationElements
*elems
,
42 collIterate
*ci
= &(elems
->iteratordata_
);
43 ci
->pos
= ci
->string
+ offset
;
44 ci
->CEpos
= ci
->toReturn
= ci
->CEs
;
45 if (ci
->flags
& UCOL_ITER_INNORMBUF
) {
46 ci
->flags
= ci
->origFlags
;
48 ci
->fcdPosition
= NULL
;
52 * Getting the mask for collation strength
53 * @param strength collation strength
54 * @return collation element mask
57 inline uint32_t getMask(UCollationStrength strength
)
62 return UCOL_PRIMARYORDERMASK
;
64 return UCOL_SECONDARYORDERMASK
| UCOL_PRIMARYORDERMASK
;
66 return UCOL_TERTIARYORDERMASK
| UCOL_SECONDARYORDERMASK
|
67 UCOL_PRIMARYORDERMASK
;
72 * This is to squeeze the 21bit ces into a 256 table
73 * @param ce collation element
74 * @return collapsed version of the collation element
77 inline int hash(uint32_t ce
)
79 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
80 // well with the new collation where most of the latin 1 characters
81 // are of the value xx000xxx. their hashes will most of the time be 0
82 // to be discussed on the hash algo.
83 return UCOL_PRIMARYORDER(ce
) % MAX_TABLE_SIZE_
;
87 * Initializing the fcd tables.
88 * Internal method, status assumed to be a success.
89 * @param status output error if any, caller to check status before calling
90 * method, status assumed to be success when passed in.
93 inline void initializeFCD(UErrorCode
*status
)
96 FCD_
= unorm_getFCDTrie(status
);
101 * Gets the fcd value for a character at the argument index.
102 * This method takes into accounts of the supplementary characters.
103 * @param str UTF16 string where character for fcd retrieval resides
104 * @param offset position of the character whose fcd is to be retrieved, to be
105 * overwritten with the next character position, taking
106 * surrogate characters into consideration.
107 * @param strlength length of the argument string
111 inline uint16_t getFCD(const UChar
*str
, int32_t *offset
,
114 int32_t temp
= *offset
;
116 UChar ch
= str
[temp
];
117 result
= unorm_getFCD16(FCD_
, ch
);
120 if (result
&& temp
!= strlength
&& UTF_IS_FIRST_SURROGATE(ch
)) {
122 if (UTF_IS_SECOND_SURROGATE(ch
)) {
123 result
= unorm_getFCD16FromSurrogatePair(FCD_
, result
, ch
);
134 * Getting the modified collation elements taking into account the collation
136 * @param strsrch string search data
138 * @return the modified collation element
141 inline uint32_t getCE(const UStringSearch
*strsrch
, uint32_t sourcece
)
143 // note for tertiary we can't use the collator->tertiaryMask, that
144 // is a preprocessed mask that takes into account case options. since
145 // we are only concerned with exact matches, we don't need that.
146 sourcece
&= strsrch
->ceMask
;
148 if (strsrch
->toShift
) {
149 // alternate handling here, since only the 16 most significant digits
150 // is only used, we can safely do a compare without masking
151 // if the ce is a variable, we mask and get only the primary values
152 // no shifting to quartenary is required since all primary values
153 // less than variabletop will need to be masked off anyway.
154 if (strsrch
->variableTop
> sourcece
) {
155 if (strsrch
->strength
== UCOL_QUATERNARY
) {
156 sourcece
&= UCOL_PRIMARYORDERMASK
;
159 sourcece
= UCOL_IGNORABLE
;
168 * Allocate a memory and returns NULL if it failed.
169 * Internal method, status assumed to be a success.
170 * @param size to allocate
171 * @param status output error if any, caller to check status before calling
172 * method, status assumed to be success when passed in.
173 * @return newly allocated array, NULL otherwise
176 inline void * allocateMemory(uint32_t size
, UErrorCode
*status
)
178 uint32_t *result
= (uint32_t *)uprv_malloc(size
);
179 if (result
== NULL
) {
180 *status
= U_MEMORY_ALLOCATION_ERROR
;
186 * Adds a uint32_t value to a destination array.
187 * Creates a new array if we run out of space. The caller will have to
188 * manually deallocate the newly allocated array.
189 * Internal method, status assumed to be success, caller has to check status
190 * before calling this method. destination not to be NULL and has at least
191 * size destinationlength.
192 * @param destination target array
193 * @param offset destination offset to add value
194 * @param destinationlength target array size, return value for the new size
195 * @param value to be added
196 * @param increments incremental size expected
197 * @param status output error if any, caller to check status before calling
198 * method, status assumed to be success when passed in.
199 * @return new destination array, destination if there was no new allocation
202 inline uint32_t * addTouint32_tArray(uint32_t *destination
,
204 uint32_t *destinationlength
,
209 uint32_t newlength
= *destinationlength
;
210 if (offset
+ 1 == newlength
) {
211 newlength
+= increments
;
212 uint32_t *temp
= (uint32_t *)allocateMemory(
213 sizeof(uint32_t) * newlength
, status
);
214 if (U_FAILURE(*status
)) {
217 uprv_memcpy(temp
, destination
, sizeof(uint32_t) * offset
);
218 *destinationlength
= newlength
;
221 destination
[offset
] = value
;
226 * Initializing the ce table for a pattern.
227 * Stores non-ignorable collation keys.
228 * Table size will be estimated by the size of the pattern text. Table
229 * expansion will be performed as we go along. Adding 1 to ensure that the table
230 * size definitely increases.
231 * Internal method, status assumed to be a success.
232 * @param strsrch string search data
233 * @param status output error if any, caller to check status before calling
234 * method, status assumed to be success when passed in.
235 * @return total number of expansions
238 inline uint16_t initializePatternCETable(UStringSearch
*strsrch
,
241 UPattern
*pattern
= &(strsrch
->pattern
);
242 uint32_t cetablesize
= INITIAL_ARRAY_SIZE_
;
243 uint32_t *cetable
= pattern
->CEBuffer
;
244 uint32_t patternlength
= pattern
->textLength
;
245 UCollationElements
*coleiter
= strsrch
->utilIter
;
247 if (coleiter
== NULL
) {
248 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
249 patternlength
, status
);
250 // status will be checked in ucol_next(..) later and if it is an
251 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
253 strsrch
->utilIter
= coleiter
;
256 uprv_init_collIterate(strsrch
->collator
, pattern
->text
,
258 &coleiter
->iteratordata_
);
261 if (pattern
->CE
!= cetable
&& pattern
->CE
) {
262 uprv_free(pattern
->CE
);
269 while ((ce
= ucol_next(coleiter
, status
)) != UCOL_NULLORDER
&&
270 U_SUCCESS(*status
)) {
271 uint32_t newce
= getCE(strsrch
, ce
);
273 uint32_t *temp
= addTouint32_tArray(cetable
, offset
, &cetablesize
,
275 patternlength
- ucol_getOffset(coleiter
) + 1,
277 if (U_FAILURE(*status
)) {
281 if (cetable
!= temp
&& cetable
!= pattern
->CEBuffer
) {
286 result
+= (uint16_t)(ucol_getMaxExpansion(coleiter
, ce
) - 1);
290 pattern
->CE
= cetable
;
291 pattern
->CELength
= offset
;
297 * Initializes the pattern struct.
298 * Internal method, status assumed to be success.
299 * @param strsrch UStringSearch data storage
300 * @param status output error if any, caller to check status before calling
301 * method, status assumed to be success when passed in.
302 * @return expansionsize the total expansion size of the pattern
305 inline int16_t initializePattern(UStringSearch
*strsrch
, UErrorCode
*status
)
307 UPattern
*pattern
= &(strsrch
->pattern
);
308 const UChar
*patterntext
= pattern
->text
;
309 int32_t length
= pattern
->textLength
;
312 pattern
->hasPrefixAccents
= getFCD(patterntext
, &index
, length
) >>
313 SECOND_LAST_BYTE_SHIFT_
;
315 UTF_BACK_1(patterntext
, 0, index
);
316 pattern
->hasSuffixAccents
= getFCD(patterntext
, &index
, length
) &
318 // since intializePattern is an internal method status is a success.
319 return initializePatternCETable(strsrch
, status
);
323 * Initializing shift tables, with the default values.
324 * If a corresponding default value is 0, the shift table is not set.
325 * @param shift table for forwards shift
326 * @param backshift table for backwards shift
327 * @param cetable table containing pattern ce
328 * @param cesize size of the pattern ces
329 * @param expansionsize total size of the expansions
330 * @param defaultforward the default forward value
331 * @param defaultbackward the default backward value
334 inline void setShiftTable(int16_t shift
[], int16_t backshift
[],
335 uint32_t *cetable
, int32_t cesize
,
336 int16_t expansionsize
,
337 int16_t defaultforward
,
338 int16_t defaultbackward
)
340 // estimate the value to shift. to do that we estimate the smallest
341 // number of characters to give the relevant ces, ie approximately
342 // the number of ces minus their expansion, since expansions can come
345 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
346 shift
[count
] = defaultforward
;
348 cesize
--; // down to the last index
349 for (count
= 0; count
< cesize
; count
++) {
350 // number of ces from right of array to the count
351 int temp
= defaultforward
- count
- 1;
352 shift
[hash(cetable
[count
])] = temp
> 1 ? temp
: 1;
354 shift
[hash(cetable
[cesize
])] = 1;
355 // for ignorables we just shift by one. see test examples.
358 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
359 backshift
[count
] = defaultbackward
;
361 for (count
= cesize
; count
> 0; count
--) {
362 // the original value count does not seem to work
363 backshift
[hash(cetable
[count
])] = count
> expansionsize
?
364 (int16_t)(count
- expansionsize
) : 1;
366 backshift
[hash(cetable
[0])] = 1;
367 backshift
[hash(0)] = 1;
371 * Building of the pattern collation element list and the boyer moore strsrch
373 * The canonical match will only be performed after the default match fails.
374 * For both cases we need to remember the size of the composed and decomposed
375 * versions of the string. Since the Boyer-Moore shift calculations shifts by
376 * a number of characters in the text and tries to match the pattern from that
377 * offset, the shift value can not be too large in case we miss some
378 * characters. To choose a right shift size, we estimate the NFC form of the
379 * and use its size as a shift guide. The NFC form should be the small
380 * possible representation of the pattern. Anyways, we'll err on the smaller
381 * shift size. Hence the calculation for minlength.
382 * Canonical match will be performed slightly differently. We'll split the
383 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
384 * the first and last base character (MS), the ending accents (EA). Matches
385 * will be done on MS first, and only when we match MS then some processing
386 * will be required for the prefix and end accents in order to determine if
387 * they match PA and EA. Hence the default shift values
388 * for the canonical match will take the size of either end's accent into
389 * consideration. Forwards search will take the end accents into consideration
390 * for the default shift values and the backwards search will take the prefix
391 * accents into consideration.
392 * If pattern has no non-ignorable ce, we return a illegal argument error.
393 * Internal method, status assumed to be success.
394 * @param strsrch UStringSearch data storage
395 * @param status for output errors if it occurs, status is assumed to be a
396 * success when it is passed in.
399 inline void initialize(UStringSearch
*strsrch
, UErrorCode
*status
)
401 int16_t expandlength
= initializePattern(strsrch
, status
);
402 if (U_SUCCESS(*status
) && strsrch
->pattern
.CELength
> 0) {
403 UPattern
*pattern
= &strsrch
->pattern
;
404 int32_t cesize
= pattern
->CELength
;
406 int16_t minlength
= cesize
> expandlength
407 ? (int16_t)cesize
- expandlength
: 1;
408 pattern
->defaultShiftSize
= minlength
;
409 setShiftTable(pattern
->shift
, pattern
->backShift
, pattern
->CE
,
410 cesize
, expandlength
, minlength
, minlength
);
413 strsrch
->pattern
.defaultShiftSize
= 0;
417 * Determine whether the target text in UStringSearch bounded by the offset
418 * start and end is one or more whole units of text as
419 * determined by the breakiterator in UStringSearch.
420 * @param strsrch string search data
421 * @param start target text start offset
422 * @param end target text end offset
425 inline UBool
isBreakUnit(const UStringSearch
*strsrch
, int32_t start
,
428 #if !UCONFIG_NO_BREAK_ITERATION
429 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
431 int32_t startindex
= ubrk_first(breakiterator
);
432 int32_t endindex
= ubrk_last(breakiterator
);
434 // out-of-range indexes are never boundary positions
435 if (start
< startindex
|| start
> endindex
||
436 end
< startindex
|| end
> endindex
) {
439 // otherwise, we can use following() on the position before the
440 // specified one and return true if the position we get back is the
441 // one the user specified
442 UBool result
= (start
== startindex
||
443 ubrk_following(breakiterator
, start
- 1) == start
) &&
445 ubrk_following(breakiterator
, end
- 1) == end
);
447 // iterates the individual ces
448 UCollationElements
*coleiter
= strsrch
->utilIter
;
449 const UChar
*text
= strsrch
->search
->text
+
451 UErrorCode status
= U_ZERO_ERROR
;
452 ucol_setText(coleiter
, text
, end
- start
, &status
);
453 for (int32_t count
= 0; count
< strsrch
->pattern
.CELength
;
455 uint32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
456 if (ce
== UCOL_IGNORABLE
) {
460 if (U_FAILURE(status
) || ce
!= strsrch
->pattern
.CE
[count
]) {
464 uint32_t nextce
= ucol_next(coleiter
, &status
);
465 while (ucol_getOffset(coleiter
) == (end
- start
)
466 && getCE(strsrch
, nextce
) == UCOL_IGNORABLE
) {
467 nextce
= ucol_next(coleiter
, &status
);
469 if (ucol_getOffset(coleiter
) == (end
- start
)
470 && nextce
!= UCOL_NULLORDER
) {
471 // extra collation elements at the end of the match
482 * Getting the next base character offset if current offset is an accent,
483 * or the current offset if the current character contains a base character.
484 * accents the following base character will be returned
486 * @param textoffset current offset
487 * @param textlength length of text string
488 * @return the next base character or the current offset
489 * if the current character is contains a base character.
492 inline int32_t getNextBaseOffset(const UChar
*text
,
496 if (textoffset
< textlength
) {
497 int32_t temp
= textoffset
;
498 if (getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
499 while (temp
< textlength
) {
500 int32_t result
= temp
;
501 if ((getFCD(text
, &temp
, textlength
) >>
502 SECOND_LAST_BYTE_SHIFT_
) == 0) {
513 * Gets the next base character offset depending on the string search pattern
515 * @param strsrch string search data
516 * @param textoffset current offset, one offset away from the last character
518 * @return start index of the next base character or the current offset
519 * if the current character is contains a base character.
522 inline int32_t getNextUStringSearchBaseOffset(UStringSearch
*strsrch
,
525 int32_t textlength
= strsrch
->search
->textLength
;
526 if (strsrch
->pattern
.hasSuffixAccents
&&
527 textoffset
< textlength
) {
528 int32_t temp
= textoffset
;
529 const UChar
*text
= strsrch
->search
->text
;
530 UTF_BACK_1(text
, 0, temp
);
531 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
532 return getNextBaseOffset(text
, textoffset
, textlength
);
539 * Shifting the collation element iterator position forward to prepare for
540 * a following match. If the last character is a unsafe character, we'll only
541 * shift by 1 to capture contractions, normalization etc.
542 * Internal method, status assumed to be success.
543 * @param text strsrch string search data
544 * @param textoffset start text position to do search
545 * @param ce the text ce which failed the match.
546 * @param patternceindex index of the ce within the pattern ce buffer which
548 * @return final offset
551 inline int32_t shiftForward(UStringSearch
*strsrch
,
554 int32_t patternceindex
)
556 UPattern
*pattern
= &(strsrch
->pattern
);
557 if (ce
!= UCOL_NULLORDER
) {
558 int32_t shift
= pattern
->shift
[hash(ce
)];
559 // this is to adjust for characters in the middle of the
560 // substring for matching that failed.
561 int32_t adjust
= pattern
->CELength
- patternceindex
;
562 if (adjust
> 1 && shift
>= adjust
) {
568 textoffset
+= pattern
->defaultShiftSize
;
571 textoffset
= getNextUStringSearchBaseOffset(strsrch
, textoffset
);
572 // check for unsafe characters
573 // * if it is the start or middle of a contraction: to be done after
574 // a initial match is found
575 // * thai or lao base consonant character: similar to contraction
576 // * high surrogate character: similar to contraction
577 // * next character is a accent: shift to the next base character
582 * sets match not found
583 * @param strsrch string search data
586 inline void setMatchNotFound(UStringSearch
*strsrch
)
588 // this method resets the match result regardless of the error status.
589 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
590 strsrch
->search
->matchedLength
= 0;
591 if (strsrch
->search
->isForwardSearching
) {
592 setColEIterOffset(strsrch
->textIter
, strsrch
->search
->textLength
);
595 setColEIterOffset(strsrch
->textIter
, 0);
600 * Gets the offset to the next safe point in text.
601 * ie. not the middle of a contraction, swappable characters or supplementary
603 * @param collator collation sata
604 * @param text string to work with
605 * @param textoffset offset in string
606 * @param textlength length of text string
607 * @return offset to the next safe character
610 inline int32_t getNextSafeOffset(const UCollator
*collator
,
615 int32_t result
= textoffset
; // first contraction character
616 while (result
!= textlength
&& ucol_unsafeCP(text
[result
], collator
)) {
623 * This checks for accents in the potential match started with a
624 * composite character.
625 * This is really painful... we have to check that composite character do not
626 * have any extra accents. We have to normalize the potential match and find
627 * the immediate decomposed character before the match.
628 * The first composite character would have been taken care of by the fcd
629 * checks in checkForwardExactMatch.
630 * This is the slow path after the fcd of the first character and
631 * the last character has been checked by checkForwardExactMatch and we
632 * determine that the potential match has extra non-ignorable preceding
634 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
635 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
636 * Note here that accents checking are slow and cautioned in the API docs.
637 * Internal method, status assumed to be a success, caller should check status
638 * before calling this method
639 * @param strsrch string search data
640 * @param start index of the potential unfriendly composite character
641 * @param end index of the potential unfriendly composite character
642 * @param status output error status if any.
643 * @return TRUE if there are non-ignorable accents at the beginning
644 * of the match, FALSE otherwise.
648 UBool
checkExtraMatchAccents(const UStringSearch
*strsrch
, int32_t start
,
652 UBool result
= FALSE
;
653 if (strsrch
->pattern
.hasPrefixAccents
) {
654 int32_t length
= end
- start
;
656 const UChar
*text
= strsrch
->search
->text
+ start
;
658 UTF_FWD_1(text
, offset
, length
);
659 // we are only concerned with the first composite character
660 if (unorm_quickCheck(text
, offset
, UNORM_NFD
, status
) == UNORM_NO
) {
661 int32_t safeoffset
= getNextSafeOffset(strsrch
->collator
,
663 if (safeoffset
!= length
) {
667 UChar buffer
[INITIAL_ARRAY_SIZE_
];
668 int32_t size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0,
669 buffer
, INITIAL_ARRAY_SIZE_
,
671 if (U_FAILURE(*status
)) {
674 if (size
>= INITIAL_ARRAY_SIZE_
) {
675 norm
= (UChar
*)allocateMemory((size
+ 1) * sizeof(UChar
),
677 // if allocation failed, status will be set to
678 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
680 size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0, norm
,
682 if (U_FAILURE(*status
) && norm
!= NULL
) {
691 UCollationElements
*coleiter
= strsrch
->utilIter
;
692 ucol_setText(coleiter
, norm
, size
, status
);
693 uint32_t firstce
= strsrch
->pattern
.CE
[0];
694 UBool ignorable
= TRUE
;
695 uint32_t ce
= UCOL_IGNORABLE
;
696 while (U_SUCCESS(*status
) && ce
!= firstce
) {
697 offset
= ucol_getOffset(coleiter
);
698 if (ce
!= firstce
&& ce
!= UCOL_IGNORABLE
) {
701 ce
= ucol_next(coleiter
, status
);
704 UTF_PREV_CHAR(norm
, 0, offset
, codepoint
);
705 result
= !ignorable
&& (u_getCombiningClass(codepoint
) != 0);
707 if (norm
!= buffer
) {
717 * Used by exact matches, checks if there are accents before the match.
718 * This is really painful... we have to check that composite characters at
719 * the start of the matches have to not have any extra accents.
720 * We check the FCD of the character first, if it starts with an accent and
721 * the first pattern ce does not match the first ce of the character, we bail.
722 * Otherwise we try normalizing the first composite
723 * character and find the immediate decomposed character before the match to
724 * see if it is a non-ignorable accent.
725 * Now normalizing the first composite character is enough because we ensure
726 * that when the match is passed in here with extra beginning ces, the
727 * first or last ce that match has to occur within the first character.
728 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
729 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
730 * Note here that accents checking are slow and cautioned in the API docs.
731 * @param strsrch string search data
732 * @param start offset
734 * @return TRUE if there are accents on either side of the match,
738 UBool
hasAccentsBeforeMatch(const UStringSearch
*strsrch
, int32_t start
,
741 if (strsrch
->pattern
.hasPrefixAccents
) {
742 UCollationElements
*coleiter
= strsrch
->textIter
;
743 UErrorCode status
= U_ZERO_ERROR
;
744 // we have been iterating forwards previously
745 uint32_t ignorable
= TRUE
;
746 uint32_t firstce
= strsrch
->pattern
.CE
[0];
748 setColEIterOffset(coleiter
, start
);
749 uint32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
750 if (U_FAILURE(status
)) {
753 while (ce
!= firstce
) {
754 if (ce
!= UCOL_IGNORABLE
) {
757 ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
758 if (U_FAILURE(status
)) {
762 if (!ignorable
&& inNormBuf(coleiter
)) {
763 // within normalization buffer, discontiguous handled here
768 int32_t temp
= start
;
770 // accent = (getFCD(strsrch->search->text, &temp,
771 // strsrch->search->textLength)
772 // >> SECOND_LAST_BYTE_SHIFT_);
773 // however this code does not work well with VC7 .net in release mode.
774 // maybe the inlines for getFCD combined with shifting has bugs in
775 // VC7. anyways this is a work around.
776 UBool accent
= getFCD(strsrch
->search
->text
, &temp
,
777 strsrch
->search
->textLength
) > 0xFF;
779 return checkExtraMatchAccents(strsrch
, start
, end
, &status
);
786 UTF_BACK_1(strsrch
->search
->text
, 0, temp
);
787 if (getFCD(strsrch
->search
->text
, &temp
,
788 strsrch
->search
->textLength
) & LAST_BYTE_MASK_
) {
789 setColEIterOffset(coleiter
, start
);
790 ce
= ucol_previous(coleiter
, &status
);
791 if (U_FAILURE(status
) ||
792 (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
)) {
803 * Used by exact matches, checks if there are accents bounding the match.
804 * Note this is the initial boundary check. If the potential match
805 * starts or ends with composite characters, the accents in those
806 * characters will be determined later.
807 * Not doing backwards iteration here, since discontiguous contraction for
808 * backwards collation element iterator, use up too many characters.
809 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
810 * should fail since there is an acute at the end of \u01FA
811 * Note here that accents checking are slow and cautioned in the API docs.
812 * @param strsrch string search data
813 * @param start offset of match
814 * @param end end offset of the match
815 * @return TRUE if there are accents on either side of the match,
819 UBool
hasAccentsAfterMatch(const UStringSearch
*strsrch
, int32_t start
,
822 if (strsrch
->pattern
.hasSuffixAccents
) {
823 const UChar
*text
= strsrch
->search
->text
;
825 int32_t textlength
= strsrch
->search
->textLength
;
826 UTF_BACK_1(text
, 0, temp
);
827 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
828 uint32_t firstce
= strsrch
->pattern
.CE
[0];
829 UCollationElements
*coleiter
= strsrch
->textIter
;
830 UErrorCode status
= U_ZERO_ERROR
;
831 setColEIterOffset(coleiter
, start
);
832 while (getCE(strsrch
, ucol_next(coleiter
, &status
)) != firstce
) {
833 if (U_FAILURE(status
)) {
838 while (count
< strsrch
->pattern
.CELength
) {
839 if (getCE(strsrch
, ucol_next(coleiter
, &status
))
841 // Thai can give an ignorable here.
844 if (U_FAILURE(status
)) {
849 uint32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
850 if (U_FAILURE(status
)) {
853 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
854 if (ucol_getOffset(coleiter
) <= end
) {
857 if (getFCD(text
, &end
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
867 * Checks if the offset runs out of the text string
869 * @param textlength of the text string
870 * @return TRUE if offset is out of bounds, FALSE otherwise
873 inline UBool
isOutOfBounds(int32_t textlength
, int32_t offset
)
875 return offset
< 0 || offset
> textlength
;
879 * Checks for identical match
880 * @param strsrch string search data
881 * @param start offset of possible match
882 * @param end offset of possible match
883 * @return TRUE if identical match is found
886 inline UBool
checkIdentical(const UStringSearch
*strsrch
, int32_t start
,
889 int32_t length
= end
- start
;
890 if (strsrch
->strength
!= UCOL_IDENTICAL
) {
894 UErrorCode status
= U_ZERO_ERROR
;
895 int decomplength
= unorm_decompose(NULL
, -1,
896 strsrch
->search
->text
+ start
, length
,
898 if (decomplength
!= unorm_decompose(NULL
, -1, strsrch
->pattern
.text
,
899 strsrch
->pattern
.textLength
,
900 FALSE
, 0, &status
)) {
904 UChar
*text
= (UChar
*)uprv_malloc(decomplength
* sizeof(UChar
));
905 UChar
*pattern
= (UChar
*)uprv_malloc(decomplength
* sizeof(UChar
));
906 unorm_decompose(text
, decomplength
, strsrch
->search
->text
+ start
,
907 length
, FALSE
, 0, &status
);
908 unorm_decompose(pattern
, decomplength
, strsrch
->pattern
.text
,
909 strsrch
->pattern
.textLength
, FALSE
, 0, &status
);
910 UBool result
= (uprv_memcmp(pattern
, text
, decomplength
* sizeof(UChar
))
918 * Checks to see if the match is repeated
919 * @param strsrch string search data
920 * @param start new match start index
921 * @param end new match end index
922 * @return TRUE if the the match is repeated, FALSE otherwise
925 inline UBool
checkRepeatedMatch(UStringSearch
*strsrch
,
929 int32_t lastmatchindex
= strsrch
->search
->matchedIndex
;
931 if (lastmatchindex
== USEARCH_DONE
) {
934 if (strsrch
->search
->isForwardSearching
) {
935 result
= start
<= lastmatchindex
;
938 result
= start
>= lastmatchindex
;
940 if (!strsrch
->search
->isOverlap
) {
941 if (strsrch
->search
->isForwardSearching
) {
942 result
= start
< lastmatchindex
+ strsrch
->search
->matchedLength
;
945 result
= end
> lastmatchindex
;
952 * Gets the collation element iterator's current offset.
953 * @param coleiter collation element iterator
954 * @param forwards flag TRUE if we are moving in th forwards direction
955 * @return current offset
958 inline int32_t getColElemIterOffset(const UCollationElements
*coleiter
,
961 int32_t result
= ucol_getOffset(coleiter
);
962 // intricacies of the the backwards collation element iterator
963 if (!forwards
&& inNormBuf(coleiter
) && !isFCDPointerNull(coleiter
)) {
970 * Checks match for contraction.
971 * If the match ends with a partial contraction we fail.
972 * If the match starts too far off (because of backwards iteration) we try to
973 * chip off the extra characters depending on whether a breakiterator has
975 * Internal method, error assumed to be success, caller has to check status
976 * before calling this method.
977 * @param strsrch string search data
978 * @param start offset of potential match, to be modified if necessary
979 * @param end offset of potential match, to be modified if necessary
980 * @param status output error status if any
981 * @return TRUE if match passes the contraction test, FALSE otherwise
985 UBool
checkNextExactContractionMatch(UStringSearch
*strsrch
,
987 int32_t *end
, UErrorCode
*status
)
989 UCollationElements
*coleiter
= strsrch
->textIter
;
990 int32_t textlength
= strsrch
->search
->textLength
;
991 int32_t temp
= *start
;
992 const UCollator
*collator
= strsrch
->collator
;
993 const UChar
*text
= strsrch
->search
->text
;
994 // This part checks if either ends of the match contains potential
995 // contraction. If so we'll have to iterate through them
996 // The start contraction needs to be checked since ucol_previous dumps
997 // all characters till the first safe character into the buffer.
998 // *start + 1 is used to test for the unsafe characters instead of *start
999 // because ucol_prev takes all unsafe characters till the first safe
1000 // character ie *start. so by testing *start + 1, we can estimate if
1001 // excess prefix characters has been included in the potential search
1003 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1004 (*start
+ 1 < textlength
1005 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1006 int32_t expansion
= getExpansionPrefix(coleiter
);
1007 UBool expandflag
= expansion
> 0;
1008 setColEIterOffset(coleiter
, *start
);
1009 while (expansion
> 0) {
1010 // getting rid of the redundant ce, caused by setOffset.
1011 // since backward contraction/expansion may have extra ces if we
1012 // are in the normalization buffer, hasAccentsBeforeMatch would
1013 // have taken care of it.
1014 // E.g. the character \u01FA will have an expansion of 3, but if
1015 // we are only looking for acute and ring \u030A and \u0301, we'll
1016 // have to skip the first ce in the expansion buffer.
1017 ucol_next(coleiter
, status
);
1018 if (U_FAILURE(*status
)) {
1021 if (ucol_getOffset(coleiter
) != temp
) {
1023 temp
= ucol_getOffset(coleiter
);
1028 uint32_t *patternce
= strsrch
->pattern
.CE
;
1029 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1031 while (count
< patterncelength
) {
1032 uint32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1033 if (ce
== UCOL_IGNORABLE
) {
1036 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1038 temp
= ucol_getOffset(coleiter
);
1040 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1042 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1052 * Checks and sets the match information if found.
1055 * <li> the potential match does not repeat the previous match
1056 * <li> boundaries are correct
1057 * <li> exact matches has no extra accents
1058 * <li> identical matchesb
1059 * <li> potential match does not end in the middle of a contraction
1061 * Otherwise the offset will be shifted to the next character.
1062 * Internal method, status assumed to be success, caller has to check status
1063 * before calling this method.
1064 * @param strsrch string search data
1065 * @param textoffset offset in the collation element text. the returned value
1066 * will be the truncated end offset of the match or the new start
1068 * @param status output error status if any
1069 * @return TRUE if the match is valid, FALSE otherwise
1072 inline UBool
checkNextExactMatch(UStringSearch
*strsrch
,
1073 int32_t *textoffset
, UErrorCode
*status
)
1075 UCollationElements
*coleiter
= strsrch
->textIter
;
1076 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1078 if (!checkNextExactContractionMatch(strsrch
, &start
, textoffset
, status
)) {
1082 // this totally matches, however we need to check if it is repeating
1083 if (!isBreakUnit(strsrch
, start
, *textoffset
) ||
1084 checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1085 hasAccentsBeforeMatch(strsrch
, start
, *textoffset
) ||
1086 !checkIdentical(strsrch
, start
, *textoffset
) ||
1087 hasAccentsAfterMatch(strsrch
, start
, *textoffset
)) {
1090 *textoffset
= getNextUStringSearchBaseOffset(strsrch
, *textoffset
);
1094 // totally match, we will get rid of the ending ignorables.
1095 strsrch
->search
->matchedIndex
= start
;
1096 strsrch
->search
->matchedLength
= *textoffset
- start
;
1101 * Getting the previous base character offset, or the current offset if the
1102 * current character is a base character
1103 * @param text string
1104 * @param textoffset one offset after the current character
1105 * @return the offset of the next character after the base character or the first
1106 * composed character with accents
1109 inline int32_t getPreviousBaseOffset(const UChar
*text
,
1112 if (textoffset
> 0) {
1114 int32_t result
= textoffset
;
1115 UTF_BACK_1(text
, 0, textoffset
);
1116 int32_t temp
= textoffset
;
1117 uint16_t fcd
= getFCD(text
, &temp
, result
);
1118 if ((fcd
>> SECOND_LAST_BYTE_SHIFT_
) == 0) {
1119 if (fcd
& LAST_BYTE_MASK_
) {
1124 if (textoffset
== 0) {
1133 * Getting the indexes of the accents that are not blocked in the argument
1135 * @param accents array of accents in nfd terminated by a 0.
1136 * @param accentsindex array of indexes of the accents that are not blocked
1139 inline int getUnblockedAccentIndex(UChar
*accents
, int32_t *accentsindex
)
1142 int32_t length
= u_strlen(accents
);
1143 UChar32 codepoint
= 0;
1147 while (index
< length
) {
1149 UTF_NEXT_CHAR(accents
, index
, length
, codepoint
);
1150 if (u_getCombiningClass(codepoint
) != cclass
) {
1151 cclass
= u_getCombiningClass(codepoint
);
1152 accentsindex
[result
] = temp
;
1156 accentsindex
[result
] = length
;
1161 * Appends 3 UChar arrays to a destination array.
1162 * Creates a new array if we run out of space. The caller will have to
1163 * manually deallocate the newly allocated array.
1164 * Internal method, status assumed to be success, caller has to check status
1165 * before calling this method. destination not to be NULL and has at least
1166 * size destinationlength.
1167 * @param destination target array
1168 * @param destinationlength target array size, returning the appended length
1169 * @param source1 null-terminated first array
1170 * @param source2 second array
1171 * @param source2length length of seond array
1172 * @param source3 null-terminated third array
1173 * @param status error status if any
1174 * @return new destination array, destination if there was no new allocation
1177 inline UChar
* addToUCharArray( UChar
*destination
,
1178 int32_t *destinationlength
,
1179 const UChar
*source1
,
1180 const UChar
*source2
,
1181 int32_t source2length
,
1182 const UChar
*source3
,
1185 int32_t source1length
= source1
? u_strlen(source1
) : 0;
1186 int32_t source3length
= source3
? u_strlen(source3
) : 0;
1187 if (*destinationlength
< source1length
+ source2length
+ source3length
+
1190 destination
= (UChar
*)allocateMemory(
1191 (source1length
+ source2length
+ source3length
+ 1) * sizeof(UChar
),
1193 // if error allocating memory, status will be
1194 // U_MEMORY_ALLOCATION_ERROR
1195 if (U_FAILURE(*status
)) {
1196 *destinationlength
= 0;
1200 if (source1length
!= 0) {
1201 uprv_memcpy(destination
, source1
, sizeof(UChar
) * source1length
);
1203 if (source2length
!= 0) {
1204 uprv_memcpy(destination
+ source1length
, source2
,
1205 sizeof(UChar
) * source2length
);
1207 if (source3length
!= 0) {
1208 uprv_memcpy(destination
+ source1length
+ source2length
, source3
,
1209 sizeof(UChar
) * source3length
);
1211 *destinationlength
= source1length
+ source2length
+ source3length
;
1216 * Running through a collation element iterator to see if the contents matches
1217 * pattern in string search data
1218 * @param strsrch string search data
1219 * @param coleiter collation element iterator
1220 * @return TRUE if a match if found, FALSE otherwise
1223 inline UBool
checkCollationMatch(const UStringSearch
*strsrch
,
1224 UCollationElements
*coleiter
)
1226 int patternceindex
= strsrch
->pattern
.CELength
;
1227 uint32_t *patternce
= strsrch
->pattern
.CE
;
1228 UErrorCode status
= U_ZERO_ERROR
;
1229 while (patternceindex
> 0) {
1230 uint32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
1231 if (ce
== UCOL_IGNORABLE
) {
1234 if (U_FAILURE(status
) || ce
!= *patternce
) {
1244 * Rearranges the front accents to try matching.
1245 * Prefix accents in the text will be grouped according to their combining
1246 * class and the groups will be mixed and matched to try find the perfect
1247 * match with the pattern.
1248 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1249 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1250 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1252 * step 2: check if any of the generated substrings matches the pattern.
1253 * Internal method, status is assumed to be success, caller has to check status
1254 * before calling this method.
1255 * @param strsrch string search match
1256 * @param start first offset of the accents to start searching
1257 * @param end start of the last accent set
1258 * @param status output error status if any
1259 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1260 * offset of the match. Note this start includes all preceding accents.
1263 int32_t doNextCanonicalPrefixMatch(UStringSearch
*strsrch
,
1268 const UChar
*text
= strsrch
->search
->text
;
1269 int32_t textlength
= strsrch
->search
->textLength
;
1270 int32_t tempstart
= start
;
1272 if ((getFCD(text
, &tempstart
, textlength
) & LAST_BYTE_MASK_
) == 0) {
1273 // die... failed at a base character
1274 return USEARCH_DONE
;
1277 int32_t offset
= getNextBaseOffset(text
, tempstart
, textlength
);
1278 start
= getPreviousBaseOffset(text
, tempstart
);
1280 UChar accents
[INITIAL_ARRAY_SIZE_
];
1281 // normalizing the offensive string
1282 unorm_normalize(text
+ start
, offset
- start
, UNORM_NFD
, 0, accents
,
1283 INITIAL_ARRAY_SIZE_
, status
);
1284 if (U_FAILURE(*status
)) {
1285 return USEARCH_DONE
;
1288 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1289 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1291 int32_t count
= (2 << (accentsize
- 1)) - 2;
1292 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1293 UCollationElements
*coleiter
= strsrch
->utilIter
;
1294 while (U_SUCCESS(*status
) && count
> 0) {
1295 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
1296 // copy the base characters
1297 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1298 *rearrange
++ = accents
[k
];
1300 // forming all possible canonical rearrangement by dropping
1302 for (int i
= 0; i
<= accentsize
- 1; i
++) {
1303 int32_t mask
= 1 << (accentsize
- i
- 1);
1305 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1306 *rearrange
++ = accents
[j
];
1311 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
1312 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
1313 strsrch
->canonicalPrefixAccents
,
1314 strsrch
->search
->text
+ offset
,
1316 strsrch
->canonicalSuffixAccents
,
1319 // if status is a failure, ucol_setText does nothing.
1320 // run the collator iterator through this match
1321 ucol_setText(coleiter
, match
, matchsize
, status
);
1322 if (U_SUCCESS(*status
)) {
1323 if (checkCollationMatch(strsrch
, coleiter
)) {
1324 if (match
!= buffer
) {
1332 return USEARCH_DONE
;
1336 * Gets the offset to the safe point in text before textoffset.
1337 * ie. not the middle of a contraction, swappable characters or supplementary
1339 * @param collator collation sata
1340 * @param text string to work with
1341 * @param textoffset offset in string
1342 * @param textlength length of text string
1343 * @return offset to the previous safe character
1346 inline uint32_t getPreviousSafeOffset(const UCollator
*collator
,
1350 int32_t result
= textoffset
; // first contraction character
1351 while (result
!= 0 && ucol_unsafeCP(text
[result
- 1], collator
)) {
1355 // the first contraction character is consider unsafe here
1362 * Cleaning up after we passed the safe zone
1363 * @param strsrch string search data
1364 * @param safetext safe text array
1365 * @param safebuffer safe text buffer
1366 * @param coleiter collation element iterator for safe text
1369 inline void cleanUpSafeText(const UStringSearch
*strsrch
, UChar
*safetext
,
1372 if (safetext
!= safebuffer
&& safetext
!= strsrch
->canonicalSuffixAccents
)
1374 uprv_free(safetext
);
1379 * Take the rearranged end accents and tries matching. If match failed at
1380 * a seperate preceding set of accents (seperated from the rearranged on by
1381 * at least a base character) then we rearrange the preceding accents and
1382 * tries matching again.
1383 * We allow skipping of the ends of the accent set if the ces do not match.
1384 * However if the failure is found before the accent set, it fails.
1385 * Internal method, status assumed to be success, caller has to check status
1386 * before calling this method.
1387 * @param strsrch string search data
1388 * @param textoffset of the start of the rearranged accent
1389 * @param status output error status if any
1390 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1391 * offset of the match. Note this start includes all preceding accents.
1394 int32_t doNextCanonicalSuffixMatch(UStringSearch
*strsrch
,
1398 const UChar
*text
= strsrch
->search
->text
;
1399 const UCollator
*collator
= strsrch
->collator
;
1400 int32_t safelength
= 0;
1402 int32_t safetextlength
;
1403 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
1404 UCollationElements
*coleiter
= strsrch
->utilIter
;
1405 int32_t safeoffset
= textoffset
;
1407 if (textoffset
!= 0 && ucol_unsafeCP(strsrch
->canonicalSuffixAccents
[0],
1409 safeoffset
= getPreviousSafeOffset(collator
, text
, textoffset
);
1410 safelength
= textoffset
- safeoffset
;
1411 safetextlength
= INITIAL_ARRAY_SIZE_
;
1412 safetext
= addToUCharArray(safebuffer
, &safetextlength
, NULL
,
1413 text
+ safeoffset
, safelength
,
1414 strsrch
->canonicalSuffixAccents
,
1418 safetextlength
= u_strlen(strsrch
->canonicalSuffixAccents
);
1419 safetext
= strsrch
->canonicalSuffixAccents
;
1422 // if status is a failure, ucol_setText does nothing
1423 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
1424 // status checked in loop below
1426 uint32_t *ce
= strsrch
->pattern
.CE
;
1427 uint32_t celength
= strsrch
->pattern
.CELength
;
1428 int ceindex
= celength
- 1;
1429 UBool isSafe
= TRUE
; // indication flag for position in safe zone
1431 while (ceindex
>= 0) {
1432 uint32_t textce
= ucol_previous(coleiter
, status
);
1433 if (U_FAILURE(*status
)) {
1435 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1437 return USEARCH_DONE
;
1439 if (textce
== UCOL_NULLORDER
) {
1440 // check if we have passed the safe buffer
1441 if (coleiter
== strsrch
->textIter
) {
1442 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1443 return USEARCH_DONE
;
1445 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1446 safetext
= safebuffer
;
1447 coleiter
= strsrch
->textIter
;
1448 setColEIterOffset(coleiter
, safeoffset
);
1449 // status checked at the start of the loop
1453 textce
= getCE(strsrch
, textce
);
1454 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
1455 // do the beginning stuff
1456 int32_t failedoffset
= getColElemIterOffset(coleiter
, FALSE
);
1457 if (isSafe
&& failedoffset
>= safelength
) {
1458 // alas... no hope. failed at rearranged accent set
1459 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1460 return USEARCH_DONE
;
1464 failedoffset
+= safeoffset
;
1465 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1468 // try rearranging the front accents
1469 int32_t result
= doNextCanonicalPrefixMatch(strsrch
,
1470 failedoffset
, textoffset
, status
);
1471 if (result
!= USEARCH_DONE
) {
1472 // if status is a failure, ucol_setOffset does nothing
1473 setColEIterOffset(strsrch
->textIter
, result
);
1475 if (U_FAILURE(*status
)) {
1476 return USEARCH_DONE
;
1481 if (textce
== ce
[ceindex
]) {
1487 int32_t result
= getColElemIterOffset(coleiter
, FALSE
);
1488 // sets the text iterator here with the correct expansion and offset
1489 int32_t leftoverces
= getExpansionPrefix(coleiter
);
1490 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1491 if (result
>= safelength
) {
1492 result
= textoffset
;
1495 result
+= safeoffset
;
1497 setColEIterOffset(strsrch
->textIter
, result
);
1498 strsrch
->textIter
->iteratordata_
.toReturn
=
1499 setExpansionPrefix(strsrch
->textIter
, leftoverces
);
1503 return ucol_getOffset(coleiter
);
1507 * Trying out the substring and sees if it can be a canonical match.
1508 * This will try normalizing the end accents and arranging them into canonical
1509 * equivalents and check their corresponding ces with the pattern ce.
1510 * Suffix accents in the text will be grouped according to their combining
1511 * class and the groups will be mixed and matched to try find the perfect
1512 * match with the pattern.
1513 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1514 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1515 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1517 * step 2: check if any of the generated substrings matches the pattern.
1518 * Internal method, status assumed to be success, caller has to check status
1519 * before calling this method.
1520 * @param strsrch string search data
1521 * @param textoffset end offset in the collation element text that ends with
1522 * the accents to be rearranged
1523 * @param status error status if any
1524 * @return TRUE if the match is valid, FALSE otherwise
1527 UBool
doNextCanonicalMatch(UStringSearch
*strsrch
,
1531 const UChar
*text
= strsrch
->search
->text
;
1532 int32_t temp
= textoffset
;
1533 UTF_BACK_1(text
, 0, temp
);
1534 if ((getFCD(text
, &temp
, textoffset
) & LAST_BYTE_MASK_
) == 0) {
1535 UCollationElements
*coleiter
= strsrch
->textIter
;
1536 int32_t offset
= getColElemIterOffset(coleiter
, FALSE
);
1537 if (strsrch
->pattern
.hasPrefixAccents
) {
1538 offset
= doNextCanonicalPrefixMatch(strsrch
, offset
, textoffset
,
1540 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
1541 setColEIterOffset(coleiter
, offset
);
1548 if (!strsrch
->pattern
.hasSuffixAccents
) {
1552 UChar accents
[INITIAL_ARRAY_SIZE_
];
1553 // offset to the last base character in substring to search
1554 int32_t baseoffset
= getPreviousBaseOffset(text
, textoffset
);
1555 // normalizing the offensive string
1556 unorm_normalize(text
+ baseoffset
, textoffset
- baseoffset
, UNORM_NFD
,
1557 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
1558 // status checked in loop below
1560 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1561 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
1563 // 2 power n - 1 minus the full set of accents
1564 int32_t count
= (2 << (size
- 1)) - 2;
1565 while (U_SUCCESS(*status
) && count
> 0) {
1566 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
1567 // copy the base characters
1568 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1569 *rearrange
++ = accents
[k
];
1571 // forming all possible canonical rearrangement by dropping
1573 for (int i
= 0; i
<= size
- 1; i
++) {
1574 int32_t mask
= 1 << (size
- i
- 1);
1576 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1577 *rearrange
++ = accents
[j
];
1582 int32_t offset
= doNextCanonicalSuffixMatch(strsrch
, baseoffset
,
1584 if (offset
!= USEARCH_DONE
) {
1585 return TRUE
; // match found
1593 * Gets the previous base character offset depending on the string search
1595 * @param strsrch string search data
1596 * @param textoffset current offset, current character
1597 * @return the offset of the next character after this base character or itself
1598 * if it is a composed character with accents
1601 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch
*strsrch
,
1604 if (strsrch
->pattern
.hasPrefixAccents
&& textoffset
> 0) {
1605 const UChar
*text
= strsrch
->search
->text
;
1606 int32_t offset
= textoffset
;
1607 if (getFCD(text
, &offset
, strsrch
->search
->textLength
) >>
1608 SECOND_LAST_BYTE_SHIFT_
) {
1609 return getPreviousBaseOffset(text
, textoffset
);
1616 * Checks match for contraction.
1617 * If the match ends with a partial contraction we fail.
1618 * If the match starts too far off (because of backwards iteration) we try to
1619 * chip off the extra characters
1620 * Internal method, status assumed to be success, caller has to check status
1621 * before calling this method.
1622 * @param strsrch string search data
1623 * @param start offset of potential match, to be modified if necessary
1624 * @param end offset of potential match, to be modified if necessary
1625 * @param status output error status if any
1626 * @return TRUE if match passes the contraction test, FALSE otherwise
1629 UBool
checkNextCanonicalContractionMatch(UStringSearch
*strsrch
,
1634 UCollationElements
*coleiter
= strsrch
->textIter
;
1635 int32_t textlength
= strsrch
->search
->textLength
;
1636 int32_t temp
= *start
;
1637 const UCollator
*collator
= strsrch
->collator
;
1638 const UChar
*text
= strsrch
->search
->text
;
1639 // This part checks if either ends of the match contains potential
1640 // contraction. If so we'll have to iterate through them
1641 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1642 (*start
+ 1 < textlength
1643 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1644 int32_t expansion
= getExpansionPrefix(coleiter
);
1645 UBool expandflag
= expansion
> 0;
1646 setColEIterOffset(coleiter
, *start
);
1647 while (expansion
> 0) {
1648 // getting rid of the redundant ce, caused by setOffset.
1649 // since backward contraction/expansion may have extra ces if we
1650 // are in the normalization buffer, hasAccentsBeforeMatch would
1651 // have taken care of it.
1652 // E.g. the character \u01FA will have an expansion of 3, but if
1653 // we are only looking for acute and ring \u030A and \u0301, we'll
1654 // have to skip the first ce in the expansion buffer.
1655 ucol_next(coleiter
, status
);
1656 if (U_FAILURE(*status
)) {
1659 if (ucol_getOffset(coleiter
) != temp
) {
1661 temp
= ucol_getOffset(coleiter
);
1666 uint32_t *patternce
= strsrch
->pattern
.CE
;
1667 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1669 int32_t textlength
= strsrch
->search
->textLength
;
1670 while (count
< patterncelength
) {
1671 uint32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1672 // status checked below, note that if status is a failure
1673 // ucol_next returns UCOL_NULLORDER
1674 if (ce
== UCOL_IGNORABLE
) {
1677 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1679 temp
= ucol_getOffset(coleiter
);
1682 if (count
== 0 && ce
!= patternce
[0]) {
1683 // accents may have extra starting ces, this occurs when a
1684 // pure accent pattern is matched without rearrangement
1685 // text \u0325\u0300 and looking for \u0300
1686 uint32_t expected
= patternce
[0];
1687 if (getFCD(text
, start
, textlength
) & LAST_BYTE_MASK_
) {
1688 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1689 while (U_SUCCESS(*status
) && ce
!= expected
&&
1690 ce
!= UCOL_NULLORDER
&&
1691 ucol_getOffset(coleiter
) <= *end
) {
1692 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1696 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1698 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1708 * Checks and sets the match information if found.
1711 * <li> the potential match does not repeat the previous match
1712 * <li> boundaries are correct
1713 * <li> potential match does not end in the middle of a contraction
1714 * <li> identical matches
1716 * Otherwise the offset will be shifted to the next character.
1717 * Internal method, status assumed to be success, caller has to check the
1718 * status before calling this method.
1719 * @param strsrch string search data
1720 * @param textoffset offset in the collation element text. the returned value
1721 * will be the truncated end offset of the match or the new start
1723 * @param status output error status if any
1724 * @return TRUE if the match is valid, FALSE otherwise
1727 inline UBool
checkNextCanonicalMatch(UStringSearch
*strsrch
,
1728 int32_t *textoffset
,
1731 // to ensure that the start and ends are not composite characters
1732 UCollationElements
*coleiter
= strsrch
->textIter
;
1733 // if we have a canonical accent match
1734 if ((strsrch
->pattern
.hasSuffixAccents
&&
1735 strsrch
->canonicalSuffixAccents
[0]) ||
1736 (strsrch
->pattern
.hasPrefixAccents
&&
1737 strsrch
->canonicalPrefixAccents
[0])) {
1738 strsrch
->search
->matchedIndex
= getPreviousUStringSearchBaseOffset(
1740 ucol_getOffset(coleiter
));
1741 strsrch
->search
->matchedLength
= *textoffset
-
1742 strsrch
->search
->matchedIndex
;
1746 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1747 if (!checkNextCanonicalContractionMatch(strsrch
, &start
, textoffset
,
1748 status
) || U_FAILURE(*status
)) {
1752 start
= getPreviousUStringSearchBaseOffset(strsrch
, start
);
1753 // this totally matches, however we need to check if it is repeating
1754 if (checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1755 !isBreakUnit(strsrch
, start
, *textoffset
) ||
1756 !checkIdentical(strsrch
, start
, *textoffset
)) {
1758 *textoffset
= getNextBaseOffset(strsrch
->search
->text
, *textoffset
,
1759 strsrch
->search
->textLength
);
1763 strsrch
->search
->matchedIndex
= start
;
1764 strsrch
->search
->matchedLength
= *textoffset
- start
;
1769 * Shifting the collation element iterator position forward to prepare for
1770 * a preceding match. If the first character is a unsafe character, we'll only
1771 * shift by 1 to capture contractions, normalization etc.
1772 * Internal method, status assumed to be success, caller has to check status
1773 * before calling this method.
1774 * @param text strsrch string search data
1775 * @param textoffset start text position to do search
1776 * @param ce the text ce which failed the match.
1777 * @param patternceindex index of the ce within the pattern ce buffer which
1779 * @return final offset
1782 inline int32_t reverseShift(UStringSearch
*strsrch
,
1785 int32_t patternceindex
)
1787 if (strsrch
->search
->isOverlap
) {
1788 if (textoffset
!= strsrch
->search
->textLength
) {
1792 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1796 if (ce
!= UCOL_NULLORDER
) {
1797 int32_t shift
= strsrch
->pattern
.backShift
[hash(ce
)];
1799 // this is to adjust for characters in the middle of the substring
1800 // for matching that failed.
1801 int32_t adjust
= patternceindex
;
1802 if (adjust
> 1 && shift
> adjust
) {
1803 shift
-= adjust
- 1;
1805 textoffset
-= shift
;
1808 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1811 textoffset
= getPreviousUStringSearchBaseOffset(strsrch
, textoffset
);
1816 * Checks match for contraction.
1817 * If the match starts with a partial contraction we fail.
1818 * Internal method, status assumed to be success, caller has to check status
1819 * before calling this method.
1820 * @param strsrch string search data
1821 * @param start offset of potential match, to be modified if necessary
1822 * @param end offset of potential match, to be modified if necessary
1823 * @param status output error status if any
1824 * @return TRUE if match passes the contraction test, FALSE otherwise
1827 UBool
checkPreviousExactContractionMatch(UStringSearch
*strsrch
,
1829 int32_t *end
, UErrorCode
*status
)
1831 UCollationElements
*coleiter
= strsrch
->textIter
;
1832 int32_t textlength
= strsrch
->search
->textLength
;
1833 int32_t temp
= *end
;
1834 const UCollator
*collator
= strsrch
->collator
;
1835 const UChar
*text
= strsrch
->search
->text
;
1836 // This part checks if either if the start of the match contains potential
1837 // contraction. If so we'll have to iterate through them
1838 // Since we used ucol_next while previously looking for the potential
1839 // match, this guarantees that our end will not be a partial contraction,
1840 // or a partial supplementary character.
1841 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
1842 int32_t expansion
= getExpansionSuffix(coleiter
);
1843 UBool expandflag
= expansion
> 0;
1844 setColEIterOffset(coleiter
, *end
);
1845 while (U_SUCCESS(*status
) && expansion
> 0) {
1846 // getting rid of the redundant ce
1847 // since forward contraction/expansion may have extra ces
1848 // if we are in the normalization buffer, hasAccentsBeforeMatch
1849 // would have taken care of it.
1850 // E.g. the character \u01FA will have an expansion of 3, but if
1851 // we are only looking for A ring A\u030A, we'll have to skip the
1852 // last ce in the expansion buffer
1853 ucol_previous(coleiter
, status
);
1854 if (U_FAILURE(*status
)) {
1857 if (ucol_getOffset(coleiter
) != temp
) {
1859 temp
= ucol_getOffset(coleiter
);
1864 uint32_t *patternce
= strsrch
->pattern
.CE
;
1865 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1866 int32_t count
= patterncelength
;
1868 uint32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
1869 // status checked below, note that if status is a failure
1870 // ucol_previous returns UCOL_NULLORDER
1871 if (ce
== UCOL_IGNORABLE
) {
1874 if (expandflag
&& count
== 0 &&
1875 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
1877 temp
= ucol_getOffset(coleiter
);
1879 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
1881 *start
= getPreviousBaseOffset(text
, *start
);
1891 * Checks and sets the match information if found.
1894 * <li> the current match does not repeat the last match
1895 * <li> boundaries are correct
1896 * <li> exact matches has no extra accents
1897 * <li> identical matches
1899 * Otherwise the offset will be shifted to the preceding character.
1900 * Internal method, status assumed to be success, caller has to check status
1901 * before calling this method.
1902 * @param strsrch string search data
1904 * @param coleiter collation element iterator
1905 * @param text string
1906 * @param textoffset offset in the collation element text. the returned value
1907 * will be the truncated start offset of the match or the new start
1909 * @param status output error status if any
1910 * @return TRUE if the match is valid, FALSE otherwise
1913 inline UBool
checkPreviousExactMatch(UStringSearch
*strsrch
,
1914 int32_t *textoffset
,
1917 // to ensure that the start and ends are not composite characters
1918 int32_t end
= ucol_getOffset(strsrch
->textIter
);
1919 if (!checkPreviousExactContractionMatch(strsrch
, textoffset
, &end
, status
)
1920 || U_FAILURE(*status
)) {
1924 // this totally matches, however we need to check if it is repeating
1926 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
1927 !isBreakUnit(strsrch
, *textoffset
, end
) ||
1928 hasAccentsBeforeMatch(strsrch
, *textoffset
, end
) ||
1929 !checkIdentical(strsrch
, *textoffset
, end
) ||
1930 hasAccentsAfterMatch(strsrch
, *textoffset
, end
)) {
1932 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
1936 strsrch
->search
->matchedIndex
= *textoffset
;
1937 strsrch
->search
->matchedLength
= end
- *textoffset
;
1942 * Rearranges the end accents to try matching.
1943 * Suffix accents in the text will be grouped according to their combining
1944 * class and the groups will be mixed and matched to try find the perfect
1945 * match with the pattern.
1946 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1947 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1948 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1950 * step 2: check if any of the generated substrings matches the pattern.
1951 * Internal method, status assumed to be success, user has to check status
1952 * before calling this method.
1953 * @param strsrch string search match
1954 * @param start offset of the first base character
1955 * @param end start of the last accent set
1956 * @param status only error status if any
1957 * @return USEARCH_DONE if a match is not found, otherwise return the ending
1958 * offset of the match. Note this start includes all following accents.
1961 int32_t doPreviousCanonicalSuffixMatch(UStringSearch
*strsrch
,
1966 const UChar
*text
= strsrch
->search
->text
;
1967 int32_t tempend
= end
;
1969 UTF_BACK_1(text
, 0, tempend
);
1970 if (!(getFCD(text
, &tempend
, strsrch
->search
->textLength
) &
1972 // die... failed at a base character
1973 return USEARCH_DONE
;
1975 end
= getNextBaseOffset(text
, end
, strsrch
->search
->textLength
);
1977 if (U_SUCCESS(*status
)) {
1978 UChar accents
[INITIAL_ARRAY_SIZE_
];
1979 int32_t offset
= getPreviousBaseOffset(text
, end
);
1980 // normalizing the offensive string
1981 unorm_normalize(text
+ offset
, end
- offset
, UNORM_NFD
, 0, accents
,
1982 INITIAL_ARRAY_SIZE_
, status
);
1984 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1985 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1987 int32_t count
= (2 << (accentsize
- 1)) - 2;
1988 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1989 UCollationElements
*coleiter
= strsrch
->utilIter
;
1990 while (U_SUCCESS(*status
) && count
> 0) {
1991 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
1992 // copy the base characters
1993 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1994 *rearrange
++ = accents
[k
];
1996 // forming all possible canonical rearrangement by dropping
1998 for (int i
= 0; i
<= accentsize
- 1; i
++) {
1999 int32_t mask
= 1 << (accentsize
- i
- 1);
2001 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2002 *rearrange
++ = accents
[j
];
2007 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
2008 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
2009 strsrch
->canonicalPrefixAccents
,
2010 strsrch
->search
->text
+ start
,
2012 strsrch
->canonicalSuffixAccents
,
2015 // run the collator iterator through this match
2016 // if status is a failure ucol_setText does nothing
2017 ucol_setText(coleiter
, match
, matchsize
, status
);
2018 if (U_SUCCESS(*status
)) {
2019 if (checkCollationMatch(strsrch
, coleiter
)) {
2020 if (match
!= buffer
) {
2029 return USEARCH_DONE
;
2033 * Take the rearranged start accents and tries matching. If match failed at
2034 * a seperate following set of accents (seperated from the rearranged on by
2035 * at least a base character) then we rearrange the preceding accents and
2036 * tries matching again.
2037 * We allow skipping of the ends of the accent set if the ces do not match.
2038 * However if the failure is found before the accent set, it fails.
2039 * Internal method, status assumed to be success, caller has to check status
2040 * before calling this method.
2041 * @param strsrch string search data
2042 * @param textoffset of the ends of the rearranged accent
2043 * @param status output error status if any
2044 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2045 * offset of the match. Note this start includes all following accents.
2048 int32_t doPreviousCanonicalPrefixMatch(UStringSearch
*strsrch
,
2052 const UChar
*text
= strsrch
->search
->text
;
2053 const UCollator
*collator
= strsrch
->collator
;
2054 int32_t safelength
= 0;
2056 int32_t safetextlength
;
2057 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
2058 int32_t safeoffset
= textoffset
;
2061 ucol_unsafeCP(strsrch
->canonicalPrefixAccents
[
2062 u_strlen(strsrch
->canonicalPrefixAccents
) - 1
2064 safeoffset
= getNextSafeOffset(collator
, text
, textoffset
,
2065 strsrch
->search
->textLength
);
2066 safelength
= safeoffset
- textoffset
;
2067 safetextlength
= INITIAL_ARRAY_SIZE_
;
2068 safetext
= addToUCharArray(safebuffer
, &safetextlength
,
2069 strsrch
->canonicalPrefixAccents
,
2070 text
+ textoffset
, safelength
,
2074 safetextlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2075 safetext
= strsrch
->canonicalPrefixAccents
;
2078 UCollationElements
*coleiter
= strsrch
->utilIter
;
2079 // if status is a failure, ucol_setText does nothing
2080 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
2081 // status checked in loop below
2083 uint32_t *ce
= strsrch
->pattern
.CE
;
2084 int32_t celength
= strsrch
->pattern
.CELength
;
2086 UBool isSafe
= TRUE
; // safe zone indication flag for position
2087 int32_t prefixlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2089 while (ceindex
< celength
) {
2090 uint32_t textce
= ucol_next(coleiter
, status
);
2091 if (U_FAILURE(*status
)) {
2093 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2095 return USEARCH_DONE
;
2097 if (textce
== UCOL_NULLORDER
) {
2098 // check if we have passed the safe buffer
2099 if (coleiter
== strsrch
->textIter
) {
2100 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2101 return USEARCH_DONE
;
2103 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2104 safetext
= safebuffer
;
2105 coleiter
= strsrch
->textIter
;
2106 setColEIterOffset(coleiter
, safeoffset
);
2107 // status checked at the start of the loop
2111 textce
= getCE(strsrch
, textce
);
2112 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
2113 // do the beginning stuff
2114 int32_t failedoffset
= ucol_getOffset(coleiter
);
2115 if (isSafe
&& failedoffset
<= prefixlength
) {
2116 // alas... no hope. failed at rearranged accent set
2117 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2118 return USEARCH_DONE
;
2122 failedoffset
= safeoffset
- failedoffset
;
2123 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2126 // try rearranging the end accents
2127 int32_t result
= doPreviousCanonicalSuffixMatch(strsrch
,
2128 textoffset
, failedoffset
, status
);
2129 if (result
!= USEARCH_DONE
) {
2130 // if status is a failure, ucol_setOffset does nothing
2131 setColEIterOffset(strsrch
->textIter
, result
);
2133 if (U_FAILURE(*status
)) {
2134 return USEARCH_DONE
;
2139 if (textce
== ce
[ceindex
]) {
2145 int32_t result
= ucol_getOffset(coleiter
);
2146 // sets the text iterator here with the correct expansion and offset
2147 int32_t leftoverces
= getExpansionSuffix(coleiter
);
2148 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2149 if (result
<= prefixlength
) {
2150 result
= textoffset
;
2153 result
= textoffset
+ (safeoffset
- result
);
2155 setColEIterOffset(strsrch
->textIter
, result
);
2156 setExpansionSuffix(strsrch
->textIter
, leftoverces
);
2160 return ucol_getOffset(coleiter
);
2164 * Trying out the substring and sees if it can be a canonical match.
2165 * This will try normalizing the starting accents and arranging them into
2166 * canonical equivalents and check their corresponding ces with the pattern ce.
2167 * Prefix accents in the text will be grouped according to their combining
2168 * class and the groups will be mixed and matched to try find the perfect
2169 * match with the pattern.
2170 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2171 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2172 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2174 * step 2: check if any of the generated substrings matches the pattern.
2175 * Internal method, status assumed to be success, caller has to check status
2176 * before calling this method.
2177 * @param strsrch string search data
2178 * @param textoffset start offset in the collation element text that starts
2179 * with the accents to be rearranged
2180 * @param status output error status if any
2181 * @return TRUE if the match is valid, FALSE otherwise
2184 UBool
doPreviousCanonicalMatch(UStringSearch
*strsrch
,
2188 const UChar
*text
= strsrch
->search
->text
;
2189 int32_t temp
= textoffset
;
2190 int32_t textlength
= strsrch
->search
->textLength
;
2191 if ((getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) == 0) {
2192 UCollationElements
*coleiter
= strsrch
->textIter
;
2193 int32_t offset
= ucol_getOffset(coleiter
);
2194 if (strsrch
->pattern
.hasSuffixAccents
) {
2195 offset
= doPreviousCanonicalSuffixMatch(strsrch
, textoffset
,
2197 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
2198 setColEIterOffset(coleiter
, offset
);
2205 if (!strsrch
->pattern
.hasPrefixAccents
) {
2209 UChar accents
[INITIAL_ARRAY_SIZE_
];
2210 // offset to the last base character in substring to search
2211 int32_t baseoffset
= getNextBaseOffset(text
, textoffset
, textlength
);
2212 // normalizing the offensive string
2213 unorm_normalize(text
+ textoffset
, baseoffset
- textoffset
, UNORM_NFD
,
2214 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
2215 // status checked in loop
2217 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2218 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
2220 // 2 power n - 1 minus the full set of accents
2221 int32_t count
= (2 << (size
- 1)) - 2;
2222 while (U_SUCCESS(*status
) && count
> 0) {
2223 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
2224 // copy the base characters
2225 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2226 *rearrange
++ = accents
[k
];
2228 // forming all possible canonical rearrangement by dropping
2230 for (int i
= 0; i
<= size
- 1; i
++) {
2231 int32_t mask
= 1 << (size
- i
- 1);
2233 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2234 *rearrange
++ = accents
[j
];
2239 int32_t offset
= doPreviousCanonicalPrefixMatch(strsrch
,
2240 baseoffset
, status
);
2241 if (offset
!= USEARCH_DONE
) {
2242 return TRUE
; // match found
2250 * Checks match for contraction.
2251 * If the match starts with a partial contraction we fail.
2252 * Internal method, status assumed to be success, caller has to check status
2253 * before calling this method.
2254 * @param strsrch string search data
2255 * @param start offset of potential match, to be modified if necessary
2256 * @param end offset of potential match, to be modified if necessary
2257 * @param status only error status if any
2258 * @return TRUE if match passes the contraction test, FALSE otherwise
2261 UBool
checkPreviousCanonicalContractionMatch(UStringSearch
*strsrch
,
2263 int32_t *end
, UErrorCode
*status
)
2265 UCollationElements
*coleiter
= strsrch
->textIter
;
2266 int32_t textlength
= strsrch
->search
->textLength
;
2267 int32_t temp
= *end
;
2268 const UCollator
*collator
= strsrch
->collator
;
2269 const UChar
*text
= strsrch
->search
->text
;
2270 // This part checks if either if the start of the match contains potential
2271 // contraction. If so we'll have to iterate through them
2272 // Since we used ucol_next while previously looking for the potential
2273 // match, this guarantees that our end will not be a partial contraction,
2274 // or a partial supplementary character.
2275 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2276 int32_t expansion
= getExpansionSuffix(coleiter
);
2277 UBool expandflag
= expansion
> 0;
2278 setColEIterOffset(coleiter
, *end
);
2279 while (expansion
> 0) {
2280 // getting rid of the redundant ce
2281 // since forward contraction/expansion may have extra ces
2282 // if we are in the normalization buffer, hasAccentsBeforeMatch
2283 // would have taken care of it.
2284 // E.g. the character \u01FA will have an expansion of 3, but if
2285 // we are only looking for A ring A\u030A, we'll have to skip the
2286 // last ce in the expansion buffer
2287 ucol_previous(coleiter
, status
);
2288 if (U_FAILURE(*status
)) {
2291 if (ucol_getOffset(coleiter
) != temp
) {
2293 temp
= ucol_getOffset(coleiter
);
2298 uint32_t *patternce
= strsrch
->pattern
.CE
;
2299 int32_t patterncelength
= strsrch
->pattern
.CELength
;
2300 int32_t count
= patterncelength
;
2302 uint32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2303 // status checked below, note that if status is a failure
2304 // ucol_previous returns UCOL_NULLORDER
2305 if (ce
== UCOL_IGNORABLE
) {
2308 if (expandflag
&& count
== 0 &&
2309 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2311 temp
= ucol_getOffset(coleiter
);
2313 if (count
== patterncelength
&&
2314 ce
!= patternce
[patterncelength
- 1]) {
2315 // accents may have extra starting ces, this occurs when a
2316 // pure accent pattern is matched without rearrangement
2317 uint32_t expected
= patternce
[patterncelength
- 1];
2318 UTF_BACK_1(text
, 0, *end
);
2319 if (getFCD(text
, end
, textlength
) & LAST_BYTE_MASK_
) {
2320 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2321 while (U_SUCCESS(*status
) && ce
!= expected
&&
2322 ce
!= UCOL_NULLORDER
&&
2323 ucol_getOffset(coleiter
) <= *start
) {
2324 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2328 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2330 *start
= getPreviousBaseOffset(text
, *start
);
2340 * Checks and sets the match information if found.
2343 * <li> the potential match does not repeat the previous match
2344 * <li> boundaries are correct
2345 * <li> potential match does not end in the middle of a contraction
2346 * <li> identical matches
2348 * Otherwise the offset will be shifted to the next character.
2349 * Internal method, status assumed to be success, caller has to check status
2350 * before calling this method.
2351 * @param strsrch string search data
2352 * @param textoffset offset in the collation element text. the returned value
2353 * will be the truncated start offset of the match or the new start
2355 * @param status only error status if any
2356 * @return TRUE if the match is valid, FALSE otherwise
2359 inline UBool
checkPreviousCanonicalMatch(UStringSearch
*strsrch
,
2360 int32_t *textoffset
,
2363 // to ensure that the start and ends are not composite characters
2364 UCollationElements
*coleiter
= strsrch
->textIter
;
2365 // if we have a canonical accent match
2366 if ((strsrch
->pattern
.hasSuffixAccents
&&
2367 strsrch
->canonicalSuffixAccents
[0]) ||
2368 (strsrch
->pattern
.hasPrefixAccents
&&
2369 strsrch
->canonicalPrefixAccents
[0])) {
2370 strsrch
->search
->matchedIndex
= *textoffset
;
2371 strsrch
->search
->matchedLength
=
2372 getNextUStringSearchBaseOffset(strsrch
,
2373 getColElemIterOffset(coleiter
, FALSE
))
2378 int32_t end
= ucol_getOffset(coleiter
);
2379 if (!checkPreviousCanonicalContractionMatch(strsrch
, textoffset
, &end
,
2381 U_FAILURE(*status
)) {
2385 end
= getNextUStringSearchBaseOffset(strsrch
, end
);
2386 // this totally matches, however we need to check if it is repeating
2387 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2388 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2389 !checkIdentical(strsrch
, *textoffset
, end
)) {
2391 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2396 strsrch
->search
->matchedIndex
= *textoffset
;
2397 strsrch
->search
->matchedLength
= end
- *textoffset
;
2401 // constructors and destructor -------------------------------------------
2403 U_CAPI UStringSearch
* U_EXPORT2
usearch_open(const UChar
*pattern
,
2404 int32_t patternlength
,
2408 UBreakIterator
*breakiter
,
2411 if (U_FAILURE(*status
)) {
2414 #if UCONFIG_NO_BREAK_ITERATION
2415 if (breakiter
!= NULL
) {
2416 *status
= U_UNSUPPORTED_ERROR
;
2421 // ucol_open internally checks for status
2422 UCollator
*collator
= ucol_open(locale
, status
);
2423 // pattern, text checks are done in usearch_openFromCollator
2424 UStringSearch
*result
= usearch_openFromCollator(pattern
,
2425 patternlength
, text
, textlength
,
2426 collator
, breakiter
, status
);
2428 if (result
== NULL
|| U_FAILURE(*status
)) {
2430 ucol_close(collator
);
2435 result
->ownCollator
= TRUE
;
2439 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2443 U_CAPI UStringSearch
* U_EXPORT2
usearch_openFromCollator(
2444 const UChar
*pattern
,
2445 int32_t patternlength
,
2448 const UCollator
*collator
,
2449 UBreakIterator
*breakiter
,
2452 if (U_FAILURE(*status
)) {
2455 #if UCONFIG_NO_BREAK_ITERATION
2456 if (breakiter
!= NULL
) {
2457 *status
= U_UNSUPPORTED_ERROR
;
2461 if (pattern
== NULL
|| text
== NULL
|| collator
== NULL
) {
2462 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2465 if (U_SUCCESS(*status
)) {
2466 initializeFCD(status
);
2467 if (U_FAILURE(*status
)) {
2471 UStringSearch
*result
;
2472 if (textlength
== -1) {
2473 textlength
= u_strlen(text
);
2475 if (patternlength
== -1) {
2476 patternlength
= u_strlen(pattern
);
2478 if (textlength
<= 0 || patternlength
<= 0) {
2479 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2483 result
= (UStringSearch
*)uprv_malloc(sizeof(UStringSearch
));
2484 if (result
== NULL
) {
2485 *status
= U_MEMORY_ALLOCATION_ERROR
;
2489 result
->collator
= collator
;
2490 result
->strength
= ucol_getStrength(collator
);
2491 result
->ceMask
= getMask(result
->strength
);
2493 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2495 result
->variableTop
= ucol_getVariableTop(collator
, status
);
2497 if (U_FAILURE(*status
)) {
2502 result
->search
= (USearch
*)uprv_malloc(sizeof(USearch
));
2503 if (result
->search
== NULL
) {
2504 *status
= U_MEMORY_ALLOCATION_ERROR
;
2509 result
->search
->text
= text
;
2510 result
->search
->textLength
= textlength
;
2512 result
->pattern
.text
= pattern
;
2513 result
->pattern
.textLength
= patternlength
;
2514 result
->pattern
.CE
= NULL
;
2516 result
->search
->breakIter
= breakiter
;
2517 #if !UCONFIG_NO_BREAK_ITERATION
2519 ubrk_setText(breakiter
, text
, textlength
, status
);
2523 result
->ownCollator
= FALSE
;
2524 result
->search
->matchedLength
= 0;
2525 result
->search
->matchedIndex
= USEARCH_DONE
;
2526 result
->textIter
= ucol_openElements(collator
, text
,
2527 textlength
, status
);
2528 if (U_FAILURE(*status
)) {
2529 usearch_close(result
);
2533 result
->utilIter
= NULL
;
2535 result
->search
->isOverlap
= FALSE
;
2536 result
->search
->isCanonicalMatch
= FALSE
;
2537 result
->search
->isForwardSearching
= TRUE
;
2538 result
->search
->reset
= TRUE
;
2540 initialize(result
, status
);
2542 if (U_FAILURE(*status
)) {
2543 usearch_close(result
);
2552 U_CAPI
void U_EXPORT2
usearch_close(UStringSearch
*strsrch
)
2555 if (strsrch
->pattern
.CE
!= strsrch
->pattern
.CEBuffer
&&
2556 strsrch
->pattern
.CE
) {
2557 uprv_free(strsrch
->pattern
.CE
);
2559 ucol_closeElements(strsrch
->textIter
);
2560 ucol_closeElements(strsrch
->utilIter
);
2561 if (strsrch
->ownCollator
&& strsrch
->collator
) {
2562 ucol_close((UCollator
*)strsrch
->collator
);
2564 uprv_free(strsrch
->search
);
2569 // set and get methods --------------------------------------------------
2571 U_CAPI
void U_EXPORT2
usearch_setOffset(UStringSearch
*strsrch
,
2575 if (U_SUCCESS(*status
) && strsrch
) {
2576 if (isOutOfBounds(strsrch
->search
->textLength
, position
)) {
2577 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2580 setColEIterOffset(strsrch
->textIter
, position
);
2582 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2583 strsrch
->search
->matchedLength
= 0;
2584 strsrch
->search
->reset
= FALSE
;
2588 U_CAPI
int32_t U_EXPORT2
usearch_getOffset(const UStringSearch
*strsrch
)
2591 int32_t result
= ucol_getOffset(strsrch
->textIter
);
2592 if (isOutOfBounds(strsrch
->search
->textLength
, result
)) {
2593 return USEARCH_DONE
;
2597 return USEARCH_DONE
;
2600 U_CAPI
void U_EXPORT2
usearch_setAttribute(UStringSearch
*strsrch
,
2601 USearchAttribute attribute
,
2602 USearchAttributeValue value
,
2605 if (U_SUCCESS(*status
) && strsrch
) {
2608 case USEARCH_OVERLAP
:
2609 strsrch
->search
->isOverlap
= (value
== USEARCH_ON
? TRUE
: FALSE
);
2611 case USEARCH_CANONICAL_MATCH
:
2612 strsrch
->search
->isCanonicalMatch
= (value
== USEARCH_ON
? TRUE
:
2615 case USEARCH_ATTRIBUTE_COUNT
:
2617 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2620 if (value
== USEARCH_ATTRIBUTE_VALUE_COUNT
) {
2621 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2625 U_CAPI USearchAttributeValue U_EXPORT2
usearch_getAttribute(
2626 const UStringSearch
*strsrch
,
2627 USearchAttribute attribute
)
2630 switch (attribute
) {
2631 case USEARCH_OVERLAP
:
2632 return (strsrch
->search
->isOverlap
== TRUE
? USEARCH_ON
:
2634 case USEARCH_CANONICAL_MATCH
:
2635 return (strsrch
->search
->isCanonicalMatch
== TRUE
? USEARCH_ON
:
2637 case USEARCH_ATTRIBUTE_COUNT
:
2638 return USEARCH_DEFAULT
;
2641 return USEARCH_DEFAULT
;
2644 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedStart(
2645 const UStringSearch
*strsrch
)
2647 if (strsrch
== NULL
) {
2648 return USEARCH_DONE
;
2650 return strsrch
->search
->matchedIndex
;
2654 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedText(const UStringSearch
*strsrch
,
2656 int32_t resultCapacity
,
2659 if (U_FAILURE(*status
)) {
2660 return USEARCH_DONE
;
2662 if (strsrch
== NULL
|| resultCapacity
< 0 || (resultCapacity
> 0 &&
2664 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2665 return USEARCH_DONE
;
2668 int32_t copylength
= strsrch
->search
->matchedLength
;
2669 int32_t copyindex
= strsrch
->search
->matchedIndex
;
2670 if (copyindex
== USEARCH_DONE
) {
2671 u_terminateUChars(result
, resultCapacity
, 0, status
);
2672 return USEARCH_DONE
;
2675 if (resultCapacity
< copylength
) {
2676 copylength
= resultCapacity
;
2678 if (copylength
> 0) {
2679 uprv_memcpy(result
, strsrch
->search
->text
+ copyindex
,
2680 copylength
* sizeof(UChar
));
2682 return u_terminateUChars(result
, resultCapacity
,
2683 strsrch
->search
->matchedLength
, status
);
2686 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedLength(
2687 const UStringSearch
*strsrch
)
2690 return strsrch
->search
->matchedLength
;
2692 return USEARCH_DONE
;
2695 #if !UCONFIG_NO_BREAK_ITERATION
2697 U_CAPI
void U_EXPORT2
usearch_setBreakIterator(UStringSearch
*strsrch
,
2698 UBreakIterator
*breakiter
,
2701 if (U_SUCCESS(*status
) && strsrch
) {
2702 strsrch
->search
->breakIter
= breakiter
;
2704 ubrk_setText(breakiter
, strsrch
->search
->text
,
2705 strsrch
->search
->textLength
, status
);
2710 U_CAPI
const UBreakIterator
* U_EXPORT2
2711 usearch_getBreakIterator(const UStringSearch
*strsrch
)
2714 return strsrch
->search
->breakIter
;
2721 U_CAPI
void U_EXPORT2
usearch_setText( UStringSearch
*strsrch
,
2726 if (U_SUCCESS(*status
)) {
2727 if (strsrch
== NULL
|| text
== NULL
|| textlength
< -1 ||
2729 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2732 if (textlength
== -1) {
2733 textlength
= u_strlen(text
);
2735 strsrch
->search
->text
= text
;
2736 strsrch
->search
->textLength
= textlength
;
2737 ucol_setText(strsrch
->textIter
, text
, textlength
, status
);
2738 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2739 strsrch
->search
->matchedLength
= 0;
2740 strsrch
->search
->reset
= TRUE
;
2741 #if !UCONFIG_NO_BREAK_ITERATION
2742 if (strsrch
->search
->breakIter
!= NULL
) {
2743 ubrk_setText(strsrch
->search
->breakIter
, text
,
2744 textlength
, status
);
2751 U_CAPI
const UChar
* U_EXPORT2
usearch_getText(const UStringSearch
*strsrch
,
2755 *length
= strsrch
->search
->textLength
;
2756 return strsrch
->search
->text
;
2761 U_CAPI
void U_EXPORT2
usearch_setCollator( UStringSearch
*strsrch
,
2762 const UCollator
*collator
,
2765 if (U_SUCCESS(*status
)) {
2766 if (collator
== NULL
) {
2767 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2771 if (strsrch
->ownCollator
&& (strsrch
->collator
!= collator
)) {
2772 ucol_close((UCollator
*)strsrch
->collator
);
2773 strsrch
->ownCollator
= FALSE
;
2775 strsrch
->collator
= collator
;
2776 strsrch
->strength
= ucol_getStrength(collator
);
2777 strsrch
->ceMask
= getMask(strsrch
->strength
);
2778 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
2780 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2782 // if status is a failure, ucol_getVariableTop returns 0
2783 strsrch
->variableTop
= ucol_getVariableTop(collator
, status
);
2784 if (U_SUCCESS(*status
)) {
2785 initialize(strsrch
, status
);
2786 if (U_SUCCESS(*status
)) {
2787 uprv_init_collIterate(collator
, strsrch
->search
->text
,
2788 strsrch
->search
->textLength
,
2789 &(strsrch
->textIter
->iteratordata_
));
2790 strsrch
->utilIter
->iteratordata_
.coll
= collator
;
2797 U_CAPI UCollator
* U_EXPORT2
usearch_getCollator(const UStringSearch
*strsrch
)
2800 return (UCollator
*)strsrch
->collator
;
2805 U_CAPI
void U_EXPORT2
usearch_setPattern( UStringSearch
*strsrch
,
2806 const UChar
*pattern
,
2807 int32_t patternlength
,
2810 if (U_SUCCESS(*status
)) {
2811 if (strsrch
== NULL
|| pattern
== NULL
) {
2812 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2815 if (patternlength
== -1) {
2816 patternlength
= u_strlen(pattern
);
2818 if (patternlength
== 0) {
2819 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2822 strsrch
->pattern
.text
= pattern
;
2823 strsrch
->pattern
.textLength
= patternlength
;
2824 initialize(strsrch
, status
);
2829 U_CAPI
const UChar
* U_EXPORT2
2830 usearch_getPattern(const UStringSearch
*strsrch
,
2834 *length
= strsrch
->pattern
.textLength
;
2835 return strsrch
->pattern
.text
;
2840 // miscellaneous methods --------------------------------------------------
2842 U_CAPI
int32_t U_EXPORT2
usearch_first(UStringSearch
*strsrch
,
2845 if (strsrch
&& U_SUCCESS(*status
)) {
2846 strsrch
->search
->isForwardSearching
= TRUE
;
2847 usearch_setOffset(strsrch
, 0, status
);
2848 if (U_SUCCESS(*status
)) {
2849 return usearch_next(strsrch
, status
);
2852 return USEARCH_DONE
;
2855 U_CAPI
int32_t U_EXPORT2
usearch_following(UStringSearch
*strsrch
,
2859 if (strsrch
&& U_SUCCESS(*status
)) {
2860 strsrch
->search
->isForwardSearching
= TRUE
;
2861 // position checked in usearch_setOffset
2862 usearch_setOffset(strsrch
, position
, status
);
2863 if (U_SUCCESS(*status
)) {
2864 return usearch_next(strsrch
, status
);
2867 return USEARCH_DONE
;
2870 U_CAPI
int32_t U_EXPORT2
usearch_last(UStringSearch
*strsrch
,
2873 if (strsrch
&& U_SUCCESS(*status
)) {
2874 strsrch
->search
->isForwardSearching
= FALSE
;
2875 usearch_setOffset(strsrch
, strsrch
->search
->textLength
, status
);
2876 if (U_SUCCESS(*status
)) {
2877 return usearch_previous(strsrch
, status
);
2880 return USEARCH_DONE
;
2883 U_CAPI
int32_t U_EXPORT2
usearch_preceding(UStringSearch
*strsrch
,
2887 if (strsrch
&& U_SUCCESS(*status
)) {
2888 strsrch
->search
->isForwardSearching
= FALSE
;
2889 // position checked in usearch_setOffset
2890 usearch_setOffset(strsrch
, position
, status
);
2891 if (U_SUCCESS(*status
)) {
2892 return usearch_previous(strsrch
, status
);
2895 return USEARCH_DONE
;
2899 * If a direction switch is required, we'll count the number of ces till the
2900 * beginning of the collation element iterator and iterate forwards that
2901 * number of times. This is so that we get to the correct point within the
2902 * string to continue the search in. Imagine when we are in the middle of the
2903 * normalization buffer when the change in direction is requested. arrrgghh....
2904 * After searching the offset within the collation element iterator will be
2905 * shifted to the start of the match. If a match is not found, the offset would
2906 * have been set to the end of the text string in the collation element
2908 * Okay, here's my take on normalization buffer. The only time when there can
2909 * be 2 matches within the same normalization is when the pattern consists
2910 * of all accents. But since the offset returned is from the text string, we
2911 * should not confuse the caller by returning the second match within the
2912 * same normalization buffer. If we do, the 2 results will have the same match
2913 * offsets, and that'll be confusing. I'll return the next match that doesn't
2914 * fall within the same normalization buffer. Note this does not affect the
2915 * results of matches spanning the text and the normalization buffer.
2916 * The position to start searching is taken from the collation element
2917 * iterator. Callers of this API would have to set the offset in the collation
2918 * element iterator before using this method.
2920 U_CAPI
int32_t U_EXPORT2
usearch_next(UStringSearch
*strsrch
,
2923 if (U_SUCCESS(*status
) && strsrch
) {
2924 int32_t offset
= usearch_getOffset(strsrch
);
2925 USearch
*search
= strsrch
->search
;
2926 search
->reset
= FALSE
;
2927 int32_t textlength
= search
->textLength
;
2928 int32_t matchedindex
= search
->matchedIndex
;
2929 if (search
->isForwardSearching
) {
2930 if (offset
== textlength
|| matchedindex
== textlength
||
2931 (!search
->isOverlap
&&
2932 (offset
+ strsrch
->pattern
.defaultShiftSize
> textlength
||
2933 (matchedindex
!= USEARCH_DONE
&&
2934 matchedindex
+ search
->matchedLength
>= textlength
)))) {
2935 // not enough characters to match
2936 setMatchNotFound(strsrch
);
2937 return USEARCH_DONE
;
2941 // switching direction.
2942 // if matchedIndex == USEARCH_DONE, it means that either a
2943 // setOffset has been called or that previous ran off the text
2944 // string. the iterator would have been set to offset 0 if a
2945 // match is not found.
2946 search
->isForwardSearching
= TRUE
;
2947 if (matchedindex
!= USEARCH_DONE
) {
2948 // there's no need to set the collation element iterator
2949 // the next call to next will set the offset.
2950 return matchedindex
;
2954 if (U_SUCCESS(*status
)) {
2955 if (strsrch
->pattern
.CELength
== 0) {
2956 if (matchedindex
== USEARCH_DONE
) {
2957 search
->matchedIndex
= offset
;
2959 else { // moves by codepoints
2960 UTF_FWD_1(search
->text
, search
->matchedIndex
, textlength
);
2963 search
->matchedLength
= 0;
2964 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
2965 // status checked below
2966 if (search
->matchedIndex
== textlength
) {
2967 search
->matchedIndex
= USEARCH_DONE
;
2971 if (search
->matchedLength
> 0) {
2972 // if matchlength is 0 we are at the start of the iteration
2973 int offset
= ucol_getOffset(strsrch
->textIter
);
2974 if (search
->isOverlap
) {
2975 ucol_setOffset(strsrch
->textIter
, offset
+ 1, status
);
2978 ucol_setOffset(strsrch
->textIter
,
2979 offset
+ search
->matchedLength
, status
);
2982 if (search
->isCanonicalMatch
) {
2983 // can't use exact here since extra accents are allowed.
2984 usearch_handleNextCanonical(strsrch
, status
);
2987 usearch_handleNextExact(strsrch
, status
);
2991 if (U_FAILURE(*status
)) {
2992 return USEARCH_DONE
;
2995 return search
->matchedIndex
;
2998 return USEARCH_DONE
;
3001 U_CAPI
int32_t U_EXPORT2
usearch_previous(UStringSearch
*strsrch
,
3004 if (U_SUCCESS(*status
) && strsrch
) {
3006 USearch
*search
= strsrch
->search
;
3007 if (search
->reset
) {
3008 offset
= search
->textLength
;
3009 search
->isForwardSearching
= FALSE
;
3010 search
->reset
= FALSE
;
3011 setColEIterOffset(strsrch
->textIter
, offset
);
3014 offset
= usearch_getOffset(strsrch
);
3017 int32_t matchedindex
= search
->matchedIndex
;
3018 if (search
->isForwardSearching
== TRUE
) {
3019 // switching direction.
3020 // if matchedIndex == USEARCH_DONE, it means that either a
3021 // setOffset has been called or that next ran off the text
3022 // string. the iterator would have been set to offset textLength if
3023 // a match is not found.
3024 search
->isForwardSearching
= FALSE
;
3025 if (matchedindex
!= USEARCH_DONE
) {
3026 return matchedindex
;
3030 if (offset
== 0 || matchedindex
== 0 ||
3031 (!search
->isOverlap
&&
3032 (offset
< strsrch
->pattern
.defaultShiftSize
||
3033 (matchedindex
!= USEARCH_DONE
&&
3034 matchedindex
< strsrch
->pattern
.defaultShiftSize
)))) {
3035 // not enough characters to match
3036 setMatchNotFound(strsrch
);
3037 return USEARCH_DONE
;
3041 if (U_SUCCESS(*status
)) {
3042 if (strsrch
->pattern
.CELength
== 0) {
3043 search
->matchedIndex
=
3044 (matchedindex
== USEARCH_DONE
? offset
: matchedindex
);
3045 if (search
->matchedIndex
== 0) {
3046 setMatchNotFound(strsrch
);
3047 // status checked below
3049 else { // move by codepoints
3050 UTF_BACK_1(search
->text
, 0, search
->matchedIndex
);
3051 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3052 // status checked below
3053 search
->matchedLength
= 0;
3057 if (strsrch
->search
->isCanonicalMatch
) {
3058 // can't use exact here since extra accents are allowed.
3059 usearch_handlePreviousCanonical(strsrch
, status
);
3060 // status checked below
3063 usearch_handlePreviousExact(strsrch
, status
);
3064 // status checked below
3068 if (U_FAILURE(*status
)) {
3069 return USEARCH_DONE
;
3072 return search
->matchedIndex
;
3075 return USEARCH_DONE
;
/**
 * Brings the string search back to its initial state.
 * The collation strength, the alternate-handling (shifted) flag and the
 * variable top are read again from strsrch->collator and stored on strsrch;
 * if any of the stored values changed, initialize(strsrch, &status) is
 * called. The text iterator is then re-initialized over search->text and
 * the match/direction/option fields of strsrch->search are reset.
 * @param strsrch string search data to reset
 * NOTE(review): this copy of the function is missing lines (braces, the
 * declarations of ceMask, shift and varTop, and whatever guard encloses the
 * body); restore them from the upstream file before compiling.
 * NOTE(review): status is a local and the function returns void, so in the
 * lines visible here a failure from the collator calls or from initialize
 * is not reported to the caller.
 */
3080 U_CAPI
void U_EXPORT2
usearch_reset(UStringSearch
*strsrch
)
3083 reset is setting the attributes that are already in
3084 string search, hence all attributes in the collator should
3085 be retrieved without any problems
3088 UErrorCode status
= U_ZERO_ERROR
;
// stays TRUE only if strength mask, shifted flag and variable top are all unchanged
3089 UBool sameCollAttribute
= TRUE
;
// refresh the cached strength and the CE mask derived from it
3094 strsrch
->strength
= ucol_getStrength(strsrch
->collator
);
3095 ceMask
= getMask(strsrch
->strength
);
3096 if (strsrch
->ceMask
!= ceMask
) {
3097 strsrch
->ceMask
= ceMask
;
3098 sameCollAttribute
= FALSE
;
3100 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3101 shift
= ucol_getAttribute(strsrch
->collator
, UCOL_ALTERNATE_HANDLING
,
3102 &status
) == UCOL_SHIFTED
;
3103 if (strsrch
->toShift
!= shift
) {
3104 strsrch
->toShift
= shift
;
3105 sameCollAttribute
= FALSE
;
3108 // if status is a failure, ucol_getVariableTop returns 0
3109 varTop
= ucol_getVariableTop(strsrch
->collator
, &status
);
3110 if (strsrch
->variableTop
!= varTop
) {
3111 strsrch
->variableTop
= varTop
;
3112 sameCollAttribute
= FALSE
;
// a changed collator attribute triggers a fresh initialize() of the search data
3114 if (!sameCollAttribute
) {
3115 initialize(strsrch
, &status
);
// restart the text iterator over the full search text
3117 uprv_init_collIterate(strsrch
->collator
, strsrch
->search
->text
,
3118 strsrch
->search
->textLength
,
3119 &(strsrch
->textIter
->iteratordata_
));
// forget any previous match and restore the option and direction fields
3120 strsrch
->search
->matchedLength
= 0;
3121 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
3122 strsrch
->search
->isOverlap
= FALSE
;
3123 strsrch
->search
->isCanonicalMatch
= FALSE
;
3124 strsrch
->search
->isForwardSearching
= TRUE
;
3125 strsrch
->search
->reset
= TRUE
;
3129 // internal use methods declared in usrchimp.h -----------------------------
/**
 * Forward step of the exact-match search.
 * From the lines visible here: the scan starts at the current offset of the
 * text iterator, which is first advanced with shiftForward. While the offset
 * stays <= search->textLength, the iterator is placed at that offset and the
 * collation elements before it are read backwards with ucol_previous, mapped
 * through getCE and compared with pattern.CE, starting from the last pattern
 * element. A surviving candidate is passed to checkNextExactMatch; when that
 * accepts, the text iterator is moved to search->matchedIndex.
 * setMatchNotFound is called when *status is already a failure on entry and
 * again after the scan loop.
 * @param strsrch string search data
 * @param status error code, checked on entry and after each iterator read
 * @return UBool; the return statements are not visible in this copy,
 *         presumably TRUE when a match was found - confirm against upstream
 * NOTE(review): this copy is missing lines (braces, the declaration of
 * targetce, break/continue/return statements and the last argument of both
 * shiftForward calls); restore them from the upstream file before compiling.
 */
3131 UBool
usearch_handleNextExact(UStringSearch
*strsrch
, UErrorCode
*status
)
3133 if (U_FAILURE(*status
)) {
3134 setMatchNotFound(strsrch
);
3138 UCollationElements
*coleiter
= strsrch
->textIter
;
3139 int32_t textlength
= strsrch
->search
->textLength
;
3140 uint32_t *patternce
= strsrch
->pattern
.CE
;
3141 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3142 int32_t textoffset
= ucol_getOffset(coleiter
);
3144 // status used in setting coleiter offset, since offset is checked in
3145 // shiftForward before setting the coleiter offset, status never
3147 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
3149 while (textoffset
<= textlength
)
// comparison starts at the last pattern CE
// NOTE(review): declared uint32_t here but int32_t in
// usearch_handleNextCanonical; with patterncelength == 0 this wraps to a
// huge index - confirm callers never get here with an empty pattern
3151 uint32_t patternceindex
= patterncelength
- 1;
3153 UBool found
= FALSE
;
3154 uint32_t lastce
= UCOL_NULLORDER
;
3156 setColEIterOffset(coleiter
, textoffset
);
3159 // finding the last pattern ce match, imagine composite characters
3160 // for example: search for pattern A in text \u00C0
3161 // we'll have to skip \u0300 the grave first before we get to A
3162 targetce
= ucol_previous(coleiter
, status
);
3163 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3167 targetce
= getCE(strsrch
, targetce
);
3168 if (targetce
== UCOL_IGNORABLE
&& inNormBuf(coleiter
)) {
3169 // this is for the text \u0315\u0300 that requires
3170 // normalization and pattern \u0300, where \u0315 is ignorable
3173 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
3176 if (targetce
== patternce
[patternceindex
]) {
3177 // the first ce can be a contraction
3181 if (!hasExpansion(coleiter
)) {
// the remaining pattern CEs are compared while walking further backwards;
// ignorable text CEs get their own branch (its body is not visible here)
3189 while (found
&& patternceindex
> 0) {
3190 targetce
= ucol_previous(coleiter
, status
);
3191 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3195 targetce
= getCE(strsrch
, targetce
);
3196 if (targetce
== UCOL_IGNORABLE
) {
3201 found
= found
&& targetce
== patternce
[patternceindex
];
// presumably the mismatch path: the enclosing condition is not visible in this copy
3205 if (U_FAILURE(*status
)) {
3208 textoffset
= shiftForward(strsrch
, textoffset
, targetce
,
3210 // status checked at loop.
3211 patternceindex
= patterncelength
;
3215 if (checkNextExactMatch(strsrch
, &textoffset
, status
)) {
3216 // NOTE(review): setColEIterOffset takes no status and does no bounds check
3217 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
3221 setMatchNotFound(strsrch
);
/**
 * Forward step of the canonical-match search.
 * From the lines visible here: same outer scan as the exact variant (advance
 * with shiftForward, loop while the offset is <= search->textLength, read
 * collation elements backwards with ucol_previous and compare them with
 * pattern.CE from its last element). In addition, when the pattern has
 * prefix or suffix accents and the plain comparison did not succeed, the
 * canonicalPrefixAccents and canonicalSuffixAccents buffers are emptied and
 * doNextCanonicalMatch is tried at the same offset. A surviving candidate is
 * passed to checkNextCanonicalMatch; when that accepts, the text iterator is
 * moved to search->matchedIndex.
 * setMatchNotFound is called when *status is already a failure on entry and
 * again after the scan loop.
 * @param strsrch string search data
 * @param status error code, checked on entry and after each iterator read
 * @return UBool; the return statements are not visible in this copy,
 *         presumably TRUE when a match was found - confirm against upstream
 * NOTE(review): this copy is missing lines (braces, the declaration of
 * targetce, break/continue/return statements and the last argument of both
 * shiftForward calls); restore them from the upstream file before compiling.
 */
3225 UBool
usearch_handleNextCanonical(UStringSearch
*strsrch
, UErrorCode
*status
)
3227 if (U_FAILURE(*status
)) {
3228 setMatchNotFound(strsrch
);
3232 UCollationElements
*coleiter
= strsrch
->textIter
;
3233 int32_t textlength
= strsrch
->search
->textLength
;
3234 uint32_t *patternce
= strsrch
->pattern
.CE
;
3235 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3236 int32_t textoffset
= ucol_getOffset(coleiter
);
// TRUE when the pattern carries accents at either end
3237 UBool hasPatternAccents
=
3238 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
3240 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
// start with empty accent buffers
3242 strsrch
->canonicalPrefixAccents
[0] = 0;
3243 strsrch
->canonicalSuffixAccents
[0] = 0;
3245 while (textoffset
<= textlength
)
// comparison starts at the last pattern CE
3247 int32_t patternceindex
= patterncelength
- 1;
3249 UBool found
= FALSE
;
3250 uint32_t lastce
= UCOL_NULLORDER
;
3252 setColEIterOffset(coleiter
, textoffset
);
3255 // finding the last pattern ce match, imagine composite characters
3256 // for example: search for pattern A in text \u00C0
3257 // we'll have to skip \u0300 the grave first before we get to A
3258 targetce
= ucol_previous(coleiter
, status
);
3259 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3263 targetce
= getCE(strsrch
, targetce
);
3264 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
3267 if (targetce
== patternce
[patternceindex
]) {
3268 // the first ce can be a contraction
3272 if (!hasExpansion(coleiter
)) {
// the remaining pattern CEs are compared while walking further backwards;
// ignorable text CEs get their own branch (its body is not visible here)
3279 while (found
&& patternceindex
> 0) {
3280 targetce
= ucol_previous(coleiter
, status
);
3281 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3285 targetce
= getCE(strsrch
, targetce
);
3286 if (targetce
== UCOL_IGNORABLE
) {
3291 found
= found
&& targetce
== patternce
[patternceindex
];
3294 // initializing the rearranged accent array
// canonical fallback: only tried when the pattern has accents and the
// plain CE comparison above did not succeed
3295 if (hasPatternAccents
&& !found
) {
3296 strsrch
->canonicalPrefixAccents
[0] = 0;
3297 strsrch
->canonicalSuffixAccents
[0] = 0;
3298 if (U_FAILURE(*status
)) {
3301 found
= doNextCanonicalMatch(strsrch
, textoffset
, status
);
// presumably the mismatch path: the enclosing condition is not visible in this copy
3305 if (U_FAILURE(*status
)) {
3308 textoffset
= shiftForward(strsrch
, textoffset
, targetce
,
3310 // status checked at loop
3311 patternceindex
= patterncelength
;
3315 if (checkNextCanonicalMatch(strsrch
, &textoffset
, status
)) {
3316 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
3320 setMatchNotFound(strsrch
);
/**
 * Backward step of the exact-match search.
 * From the lines visible here: the scan starts at the current offset of the
 * text iterator, or at search->matchedIndex when that is not USEARCH_DONE,
 * and is first moved with reverseShift. While the offset stays >= 0, the
 * iterator is placed at that offset and collation elements are read forwards
 * with ucol_next, mapped through getCE and compared with pattern.CE, first
 * against element 0 and then against elements 1 .. CELength-1. A surviving
 * candidate is passed to checkPreviousExactMatch; when that accepts, the
 * text iterator is moved to the resulting textoffset.
 * setMatchNotFound is called when *status is already a failure on entry and
 * again after the scan loop.
 * @param strsrch string search data
 * @param status error code, checked on entry and after each iterator read
 * @return UBool; the return statements are not visible in this copy,
 *         presumably TRUE when a match was found - confirm against upstream
 * NOTE(review): this copy is missing lines (braces, the declaration of
 * targetce, break/continue/return statements, the patternceindex update and
 * the last argument of both reverseShift calls); restore them from the
 * upstream file before compiling.
 */
3324 UBool
usearch_handlePreviousExact(UStringSearch
*strsrch
, UErrorCode
*status
)
3326 if (U_FAILURE(*status
)) {
3327 setMatchNotFound(strsrch
);
3331 UCollationElements
*coleiter
= strsrch
->textIter
;
3332 uint32_t *patternce
= strsrch
->pattern
.CE
;
3333 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3334 int32_t textoffset
= ucol_getOffset(coleiter
);
3336 // before shifting, check whether a previous match fixes the start offset:
3337 // if setOffset is called previously or there was no previous match, we
3338 // leave the offset as it is.
3339 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
3340 textoffset
= strsrch
->search
->matchedIndex
;
3343 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
3346 while (textoffset
>= 0)
// pattern CE 0 is matched by the first scan below, so the follow-up loop starts at index 1
3348 int32_t patternceindex
= 1;
3350 UBool found
= FALSE
;
3351 uint32_t firstce
= UCOL_NULLORDER
;
3353 // NOTE(review): setColEIterOffset takes no status and does no bounds check
3354 setColEIterOffset(coleiter
, textoffset
);
3357 // finding the first pattern ce match, imagine composite
3358 // characters. for example: search for pattern \u0300 in text
3359 // \u00C0, we'll have to skip A first before we get to
3360 // \u0300 the grave accent
3361 targetce
= ucol_next(coleiter
, status
);
3362 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3366 targetce
= getCE(strsrch
, targetce
);
3367 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
3370 if (targetce
== UCOL_IGNORABLE
) {
3373 if (targetce
== patternce
[0]) {
3377 if (!hasExpansion(coleiter
)) {
3378 // checking for accents in composite character
// the remaining pattern CEs are compared while reading further forwards;
// ignorable text CEs get their own branch (its body is not visible here)
3386 while (found
&& (patternceindex
< patterncelength
)) {
3387 targetce
= ucol_next(coleiter
, status
);
3388 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3392 targetce
= getCE(strsrch
, targetce
);
3393 if (targetce
== UCOL_IGNORABLE
) {
3397 found
= found
&& targetce
== patternce
[patternceindex
];
// presumably the mismatch path: the enclosing condition is not visible in this copy
3402 if (U_FAILURE(*status
)) {
3405 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
3411 if (checkPreviousExactMatch(strsrch
, &textoffset
, status
)) {
3412 setColEIterOffset(coleiter
, textoffset
);
3416 setMatchNotFound(strsrch
);
/**
 * Backward step of the canonical-match search.
 * From the lines visible here: same outer scan as the exact backward variant
 * (start at the iterator offset or at search->matchedIndex when that is not
 * USEARCH_DONE, move with reverseShift, loop while the offset is >= 0, read
 * collation elements forwards with ucol_next and compare them with
 * pattern.CE starting at element 0). In addition, when the pattern has
 * prefix or suffix accents and the plain comparison did not succeed, the
 * canonicalPrefixAccents and canonicalSuffixAccents buffers are emptied and
 * doPreviousCanonicalMatch is tried at the same offset. A surviving
 * candidate is passed to checkPreviousCanonicalMatch; when that accepts, the
 * text iterator is moved to the resulting textoffset.
 * setMatchNotFound is called when *status is already a failure on entry and
 * again after the scan loop.
 * @param strsrch string search data
 * @param status error code (its parameter line is missing from this copy,
 *        but the body dereferences it), checked on entry and after each
 *        iterator read
 * @return UBool; the return statements are not visible in this copy,
 *         presumably TRUE when a match was found - confirm against upstream
 * NOTE(review): this copy is missing lines (the second parameter, braces,
 * the declaration of targetce, break/continue/return statements, the
 * patternceindex update and the last argument of both reverseShift calls);
 * restore them from the upstream file before compiling.
 */
3420 UBool
usearch_handlePreviousCanonical(UStringSearch
*strsrch
,
3423 if (U_FAILURE(*status
)) {
3424 setMatchNotFound(strsrch
);
3428 UCollationElements
*coleiter
= strsrch
->textIter
;
3429 uint32_t *patternce
= strsrch
->pattern
.CE
;
3430 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3431 int32_t textoffset
= ucol_getOffset(coleiter
);
// TRUE when the pattern carries accents at either end
3432 UBool hasPatternAccents
=
3433 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
3435 // before shifting, check whether a previous match fixes the start offset:
3436 // if setOffset is called previously or there was no previous match, we
3437 // leave the offset as it is.
3438 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
3439 textoffset
= strsrch
->search
->matchedIndex
;
3442 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
// start with empty accent buffers
3444 strsrch
->canonicalPrefixAccents
[0] = 0;
3445 strsrch
->canonicalSuffixAccents
[0] = 0;
3447 while (textoffset
>= 0)
// pattern CE 0 is matched by the first scan below, so the follow-up loop starts at index 1
3449 int32_t patternceindex
= 1;
3451 UBool found
= FALSE
;
3452 uint32_t firstce
= UCOL_NULLORDER
;
3454 setColEIterOffset(coleiter
, textoffset
);
3456 // finding the first pattern ce match, imagine composite
3457 // characters. for example: search for pattern \u0300 in text
3458 // \u00C0, we'll have to skip A first before we get to
3459 // \u0300 the grave accent
3460 targetce
= ucol_next(coleiter
, status
);
3461 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3465 targetce
= getCE(strsrch
, targetce
);
3466 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
3470 if (targetce
== patternce
[0]) {
3471 // the first ce can be a contraction
3475 if (!hasExpansion(coleiter
)) {
3476 // checking for accents in composite character
// the remaining pattern CEs are compared while reading further forwards;
// ignorable text CEs get their own branch (its body is not visible here)
3484 while (found
&& patternceindex
< patterncelength
) {
3485 targetce
= ucol_next(coleiter
, status
);
3486 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3490 targetce
= getCE(strsrch
, targetce
);
3491 if (targetce
== UCOL_IGNORABLE
) {
3495 found
= found
&& targetce
== patternce
[patternceindex
];
3499 // initializing the rearranged accent array
// canonical fallback: only tried when the pattern has accents and the
// plain CE comparison above did not succeed
3500 if (hasPatternAccents
&& !found
) {
3501 strsrch
->canonicalPrefixAccents
[0] = 0;
3502 strsrch
->canonicalSuffixAccents
[0] = 0;
3503 if (U_FAILURE(*status
)) {
3506 found
= doPreviousCanonicalMatch(strsrch
, textoffset
, status
);
// presumably the mismatch path: the enclosing condition is not visible in this copy
3510 if (U_FAILURE(*status
)) {
3513 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
3519 if (checkPreviousCanonicalMatch(strsrch
, &textoffset
, status
)) {
3520 setColEIterOffset(coleiter
, textoffset
);
3524 setMatchNotFound(strsrch
);
3528 #endif /* #if !UCONFIG_NO_COLLATION */