2 **********************************************************************
3 * Copyright (C) 2001-2006 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
23 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
25 // internal definition ---------------------------------------------------
27 #define LAST_BYTE_MASK_ 0xFF
28 #define SECOND_LAST_BYTE_SHIFT_ 8
29 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
31 static const uint16_t *FCD_
= NULL
;
33 // internal methods -------------------------------------------------
36 * Fast collation element iterator setOffset.
37 * This function does not check for bounds.
38 * @param coleiter collation element iterator
39 * @param offset to set
42 inline void setColEIterOffset(UCollationElements
*elems
,
45 collIterate
*ci
= &(elems
->iteratordata_
);
46 ci
->pos
= ci
->string
+ offset
;
47 ci
->CEpos
= ci
->toReturn
= ci
->CEs
;
48 if (ci
->flags
& UCOL_ITER_INNORMBUF
) {
49 ci
->flags
= ci
->origFlags
;
51 ci
->fcdPosition
= NULL
;
55 * Getting the mask for collation strength
56 * @param strength collation strength
57 * @return collation element mask
60 inline uint32_t getMask(UCollationStrength strength
)
65 return UCOL_PRIMARYORDERMASK
;
67 return UCOL_SECONDARYORDERMASK
| UCOL_PRIMARYORDERMASK
;
69 return UCOL_TERTIARYORDERMASK
| UCOL_SECONDARYORDERMASK
|
70 UCOL_PRIMARYORDERMASK
;
75 * This is to squeeze the 21bit ces into a 256 table
76 * @param ce collation element
77 * @return collapsed version of the collation element
80 inline int hash(uint32_t ce
)
82 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
83 // well with the new collation where most of the latin 1 characters
84 // are of the value xx000xxx. their hashes will most of the time be 0
85 // to be discussed on the hash algo.
86 return UCOL_PRIMARYORDER(ce
) % MAX_TABLE_SIZE_
;
90 static UBool U_CALLCONV
91 usearch_cleanup(void) {
98 * Initializing the fcd tables.
99 * Internal method, status assumed to be a success.
100 * @param status output error if any, caller to check status before calling
101 * method, status assumed to be success when passed in.
104 inline void initializeFCD(UErrorCode
*status
)
107 FCD_
= unorm_getFCDTrie(status
);
108 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH
, usearch_cleanup
);
113 * Gets the fcd value for a character at the argument index.
114 * This method takes into accounts of the supplementary characters.
115 * @param str UTF16 string where character for fcd retrieval resides
116 * @param offset position of the character whose fcd is to be retrieved, to be
117 * overwritten with the next character position, taking
118 * surrogate characters into consideration.
119 * @param strlength length of the argument string
123 uint16_t getFCD(const UChar
*str
, int32_t *offset
,
126 int32_t temp
= *offset
;
128 UChar ch
= str
[temp
];
129 result
= unorm_getFCD16(FCD_
, ch
);
132 if (result
&& temp
!= strlength
&& UTF_IS_FIRST_SURROGATE(ch
)) {
134 if (UTF_IS_SECOND_SURROGATE(ch
)) {
135 result
= unorm_getFCD16FromSurrogatePair(FCD_
, result
, ch
);
146 * Getting the modified collation elements taking into account the collation
148 * @param strsrch string search data
150 * @return the modified collation element
153 inline int32_t getCE(const UStringSearch
*strsrch
, uint32_t sourcece
)
155 // note for tertiary we can't use the collator->tertiaryMask, that
156 // is a preprocessed mask that takes into account case options. since
157 // we are only concerned with exact matches, we don't need that.
158 sourcece
&= strsrch
->ceMask
;
160 if (strsrch
->toShift
) {
161 // alternate handling here, since only the 16 most significant digits
162 // is only used, we can safely do a compare without masking
163 // if the ce is a variable, we mask and get only the primary values
164 // no shifting to quartenary is required since all primary values
165 // less than variabletop will need to be masked off anyway.
166 if (strsrch
->variableTop
> sourcece
) {
167 if (strsrch
->strength
== UCOL_QUATERNARY
) {
168 sourcece
&= UCOL_PRIMARYORDERMASK
;
171 sourcece
= UCOL_IGNORABLE
;
180 * Allocate a memory and returns NULL if it failed.
181 * Internal method, status assumed to be a success.
182 * @param size to allocate
183 * @param status output error if any, caller to check status before calling
184 * method, status assumed to be success when passed in.
185 * @return newly allocated array, NULL otherwise
188 inline void * allocateMemory(uint32_t size
, UErrorCode
*status
)
190 uint32_t *result
= (uint32_t *)uprv_malloc(size
);
191 if (result
== NULL
) {
192 *status
= U_MEMORY_ALLOCATION_ERROR
;
198 * Adds a uint32_t value to a destination array.
199 * Creates a new array if we run out of space. The caller will have to
200 * manually deallocate the newly allocated array.
201 * Internal method, status assumed to be success, caller has to check status
202 * before calling this method. destination not to be NULL and has at least
203 * size destinationlength.
204 * @param destination target array
205 * @param offset destination offset to add value
206 * @param destinationlength target array size, return value for the new size
207 * @param value to be added
208 * @param increments incremental size expected
209 * @param status output error if any, caller to check status before calling
210 * method, status assumed to be success when passed in.
211 * @return new destination array, destination if there was no new allocation
214 inline int32_t * addTouint32_tArray(int32_t *destination
,
216 uint32_t *destinationlength
,
221 uint32_t newlength
= *destinationlength
;
222 if (offset
+ 1 == newlength
) {
223 newlength
+= increments
;
224 int32_t *temp
= (int32_t *)allocateMemory(
225 sizeof(int32_t) * newlength
, status
);
226 if (U_FAILURE(*status
)) {
229 uprv_memcpy(temp
, destination
, sizeof(int32_t) * offset
);
230 *destinationlength
= newlength
;
233 destination
[offset
] = value
;
238 * Initializing the ce table for a pattern.
239 * Stores non-ignorable collation keys.
240 * Table size will be estimated by the size of the pattern text. Table
241 * expansion will be performed as we go along. Adding 1 to ensure that the table
242 * size definitely increases.
243 * Internal method, status assumed to be a success.
244 * @param strsrch string search data
245 * @param status output error if any, caller to check status before calling
246 * method, status assumed to be success when passed in.
247 * @return total number of expansions
250 inline uint16_t initializePatternCETable(UStringSearch
*strsrch
,
253 UPattern
*pattern
= &(strsrch
->pattern
);
254 uint32_t cetablesize
= INITIAL_ARRAY_SIZE_
;
255 int32_t *cetable
= pattern
->CEBuffer
;
256 uint32_t patternlength
= pattern
->textLength
;
257 UCollationElements
*coleiter
= strsrch
->utilIter
;
259 if (coleiter
== NULL
) {
260 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
261 patternlength
, status
);
262 // status will be checked in ucol_next(..) later and if it is an
263 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
265 strsrch
->utilIter
= coleiter
;
268 uprv_init_collIterate(strsrch
->collator
, pattern
->text
,
270 &coleiter
->iteratordata_
);
273 if (pattern
->CE
!= cetable
&& pattern
->CE
) {
274 uprv_free(pattern
->CE
);
281 while ((ce
= ucol_next(coleiter
, status
)) != UCOL_NULLORDER
&&
282 U_SUCCESS(*status
)) {
283 uint32_t newce
= getCE(strsrch
, ce
);
285 int32_t *temp
= addTouint32_tArray(cetable
, offset
, &cetablesize
,
287 patternlength
- ucol_getOffset(coleiter
) + 1,
289 if (U_FAILURE(*status
)) {
293 if (cetable
!= temp
&& cetable
!= pattern
->CEBuffer
) {
298 result
+= (uint16_t)(ucol_getMaxExpansion(coleiter
, ce
) - 1);
302 pattern
->CE
= cetable
;
303 pattern
->CELength
= offset
;
309 * Initializes the pattern struct.
310 * Internal method, status assumed to be success.
311 * @param strsrch UStringSearch data storage
312 * @param status output error if any, caller to check status before calling
313 * method, status assumed to be success when passed in.
314 * @return expansionsize the total expansion size of the pattern
317 inline int16_t initializePattern(UStringSearch
*strsrch
, UErrorCode
*status
)
319 UPattern
*pattern
= &(strsrch
->pattern
);
320 const UChar
*patterntext
= pattern
->text
;
321 int32_t length
= pattern
->textLength
;
324 pattern
->hasPrefixAccents
= getFCD(patterntext
, &index
, length
) >>
325 SECOND_LAST_BYTE_SHIFT_
;
327 UTF_BACK_1(patterntext
, 0, index
);
328 pattern
->hasSuffixAccents
= getFCD(patterntext
, &index
, length
) &
330 // since intializePattern is an internal method status is a success.
331 return initializePatternCETable(strsrch
, status
);
335 * Initializing shift tables, with the default values.
336 * If a corresponding default value is 0, the shift table is not set.
337 * @param shift table for forwards shift
338 * @param backshift table for backwards shift
339 * @param cetable table containing pattern ce
340 * @param cesize size of the pattern ces
341 * @param expansionsize total size of the expansions
342 * @param defaultforward the default forward value
343 * @param defaultbackward the default backward value
346 inline void setShiftTable(int16_t shift
[], int16_t backshift
[],
347 int32_t *cetable
, int32_t cesize
,
348 int16_t expansionsize
,
349 int16_t defaultforward
,
350 int16_t defaultbackward
)
352 // estimate the value to shift. to do that we estimate the smallest
353 // number of characters to give the relevant ces, ie approximately
354 // the number of ces minus their expansion, since expansions can come
357 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
358 shift
[count
] = defaultforward
;
360 cesize
--; // down to the last index
361 for (count
= 0; count
< cesize
; count
++) {
362 // number of ces from right of array to the count
363 int temp
= defaultforward
- count
- 1;
364 shift
[hash(cetable
[count
])] = temp
> 1 ? temp
: 1;
366 shift
[hash(cetable
[cesize
])] = 1;
367 // for ignorables we just shift by one. see test examples.
370 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
371 backshift
[count
] = defaultbackward
;
373 for (count
= cesize
; count
> 0; count
--) {
374 // the original value count does not seem to work
375 backshift
[hash(cetable
[count
])] = count
> expansionsize
?
376 (int16_t)(count
- expansionsize
) : 1;
378 backshift
[hash(cetable
[0])] = 1;
379 backshift
[hash(0)] = 1;
383 * Building of the pattern collation element list and the boyer moore strsrch
385 * The canonical match will only be performed after the default match fails.
386 * For both cases we need to remember the size of the composed and decomposed
387 * versions of the string. Since the Boyer-Moore shift calculations shifts by
388 * a number of characters in the text and tries to match the pattern from that
389 * offset, the shift value can not be too large in case we miss some
390 * characters. To choose a right shift size, we estimate the NFC form of the
391 * and use its size as a shift guide. The NFC form should be the small
392 * possible representation of the pattern. Anyways, we'll err on the smaller
393 * shift size. Hence the calculation for minlength.
394 * Canonical match will be performed slightly differently. We'll split the
395 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
396 * the first and last base character (MS), the ending accents (EA). Matches
397 * will be done on MS first, and only when we match MS then some processing
398 * will be required for the prefix and end accents in order to determine if
399 * they match PA and EA. Hence the default shift values
400 * for the canonical match will take the size of either end's accent into
401 * consideration. Forwards search will take the end accents into consideration
402 * for the default shift values and the backwards search will take the prefix
403 * accents into consideration.
404 * If pattern has no non-ignorable ce, we return a illegal argument error.
405 * Internal method, status assumed to be success.
406 * @param strsrch UStringSearch data storage
407 * @param status for output errors if it occurs, status is assumed to be a
408 * success when it is passed in.
411 inline void initialize(UStringSearch
*strsrch
, UErrorCode
*status
)
413 int16_t expandlength
= initializePattern(strsrch
, status
);
414 if (U_SUCCESS(*status
) && strsrch
->pattern
.CELength
> 0) {
415 UPattern
*pattern
= &strsrch
->pattern
;
416 int32_t cesize
= pattern
->CELength
;
418 int16_t minlength
= cesize
> expandlength
419 ? (int16_t)cesize
- expandlength
: 1;
420 pattern
->defaultShiftSize
= minlength
;
421 setShiftTable(pattern
->shift
, pattern
->backShift
, pattern
->CE
,
422 cesize
, expandlength
, minlength
, minlength
);
425 strsrch
->pattern
.defaultShiftSize
= 0;
429 * Determine whether the target text in UStringSearch bounded by the offset
430 * start and end is one or more whole units of text as
431 * determined by the breakiterator in UStringSearch.
432 * @param strsrch string search data
433 * @param start target text start offset
434 * @param end target text end offset
437 UBool
isBreakUnit(const UStringSearch
*strsrch
, int32_t start
,
440 #if !UCONFIG_NO_BREAK_ITERATION
441 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
443 int32_t startindex
= ubrk_first(breakiterator
);
444 int32_t endindex
= ubrk_last(breakiterator
);
446 // out-of-range indexes are never boundary positions
447 if (start
< startindex
|| start
> endindex
||
448 end
< startindex
|| end
> endindex
) {
451 // otherwise, we can use following() on the position before the
452 // specified one and return true of the position we get back is the
453 // one the user specified
454 UBool result
= (start
== startindex
||
455 ubrk_following(breakiterator
, start
- 1) == start
) &&
457 ubrk_following(breakiterator
, end
- 1) == end
);
459 // iterates the individual ces
460 UCollationElements
*coleiter
= strsrch
->utilIter
;
461 const UChar
*text
= strsrch
->search
->text
+
463 UErrorCode status
= U_ZERO_ERROR
;
464 ucol_setText(coleiter
, text
, end
- start
, &status
);
465 for (int32_t count
= 0; count
< strsrch
->pattern
.CELength
;
467 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
468 if (ce
== UCOL_IGNORABLE
) {
472 if (U_FAILURE(status
) || ce
!= strsrch
->pattern
.CE
[count
]) {
476 int32_t nextce
= ucol_next(coleiter
, &status
);
477 while (ucol_getOffset(coleiter
) == (end
- start
)
478 && getCE(strsrch
, nextce
) == UCOL_IGNORABLE
) {
479 nextce
= ucol_next(coleiter
, &status
);
481 if (ucol_getOffset(coleiter
) == (end
- start
)
482 && nextce
!= UCOL_NULLORDER
) {
483 // extra collation elements at the end of the match
494 * Getting the next base character offset if current offset is an accent,
495 * or the current offset if the current character contains a base character.
496 * accents the following base character will be returned
498 * @param textoffset current offset
499 * @param textlength length of text string
500 * @return the next base character or the current offset
501 * if the current character is contains a base character.
504 inline int32_t getNextBaseOffset(const UChar
*text
,
508 if (textoffset
< textlength
) {
509 int32_t temp
= textoffset
;
510 if (getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
511 while (temp
< textlength
) {
512 int32_t result
= temp
;
513 if ((getFCD(text
, &temp
, textlength
) >>
514 SECOND_LAST_BYTE_SHIFT_
) == 0) {
525 * Gets the next base character offset depending on the string search pattern
527 * @param strsrch string search data
528 * @param textoffset current offset, one offset away from the last character
530 * @return start index of the next base character or the current offset
531 * if the current character is contains a base character.
534 inline int32_t getNextUStringSearchBaseOffset(UStringSearch
*strsrch
,
537 int32_t textlength
= strsrch
->search
->textLength
;
538 if (strsrch
->pattern
.hasSuffixAccents
&&
539 textoffset
< textlength
) {
540 int32_t temp
= textoffset
;
541 const UChar
*text
= strsrch
->search
->text
;
542 UTF_BACK_1(text
, 0, temp
);
543 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
544 return getNextBaseOffset(text
, textoffset
, textlength
);
551 * Shifting the collation element iterator position forward to prepare for
552 * a following match. If the last character is a unsafe character, we'll only
553 * shift by 1 to capture contractions, normalization etc.
554 * Internal method, status assumed to be success.
555 * @param text strsrch string search data
556 * @param textoffset start text position to do search
557 * @param ce the text ce which failed the match.
558 * @param patternceindex index of the ce within the pattern ce buffer which
560 * @return final offset
563 inline int32_t shiftForward(UStringSearch
*strsrch
,
566 int32_t patternceindex
)
568 UPattern
*pattern
= &(strsrch
->pattern
);
569 if (ce
!= UCOL_NULLORDER
) {
570 int32_t shift
= pattern
->shift
[hash(ce
)];
571 // this is to adjust for characters in the middle of the
572 // substring for matching that failed.
573 int32_t adjust
= pattern
->CELength
- patternceindex
;
574 if (adjust
> 1 && shift
>= adjust
) {
580 textoffset
+= pattern
->defaultShiftSize
;
583 textoffset
= getNextUStringSearchBaseOffset(strsrch
, textoffset
);
584 // check for unsafe characters
585 // * if it is the start or middle of a contraction: to be done after
586 // a initial match is found
587 // * thai or lao base consonant character: similar to contraction
588 // * high surrogate character: similar to contraction
589 // * next character is a accent: shift to the next base character
594 * sets match not found
595 * @param strsrch string search data
598 inline void setMatchNotFound(UStringSearch
*strsrch
)
600 // this method resets the match result regardless of the error status.
601 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
602 strsrch
->search
->matchedLength
= 0;
603 if (strsrch
->search
->isForwardSearching
) {
604 setColEIterOffset(strsrch
->textIter
, strsrch
->search
->textLength
);
607 setColEIterOffset(strsrch
->textIter
, 0);
612 * Gets the offset to the next safe point in text.
613 * ie. not the middle of a contraction, swappable characters or supplementary
615 * @param collator collation sata
616 * @param text string to work with
617 * @param textoffset offset in string
618 * @param textlength length of text string
619 * @return offset to the next safe character
622 inline int32_t getNextSafeOffset(const UCollator
*collator
,
627 int32_t result
= textoffset
; // first contraction character
628 while (result
!= textlength
&& ucol_unsafeCP(text
[result
], collator
)) {
635 * This checks for accents in the potential match started with a
636 * composite character.
637 * This is really painful... we have to check that composite character do not
638 * have any extra accents. We have to normalize the potential match and find
639 * the immediate decomposed character before the match.
640 * The first composite character would have been taken care of by the fcd
641 * checks in checkForwardExactMatch.
642 * This is the slow path after the fcd of the first character and
643 * the last character has been checked by checkForwardExactMatch and we
644 * determine that the potential match has extra non-ignorable preceding
646 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
647 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
648 * Note here that accents checking are slow and cautioned in the API docs.
649 * Internal method, status assumed to be a success, caller should check status
650 * before calling this method
651 * @param strsrch string search data
652 * @param start index of the potential unfriendly composite character
653 * @param end index of the potential unfriendly composite character
654 * @param status output error status if any.
655 * @return TRUE if there are non-ignorable accents at the beginning
656 * of the match, FALSE otherwise.
660 UBool
checkExtraMatchAccents(const UStringSearch
*strsrch
, int32_t start
,
664 UBool result
= FALSE
;
665 if (strsrch
->pattern
.hasPrefixAccents
) {
666 int32_t length
= end
- start
;
668 const UChar
*text
= strsrch
->search
->text
+ start
;
670 UTF_FWD_1(text
, offset
, length
);
671 // we are only concerned with the first composite character
672 if (unorm_quickCheck(text
, offset
, UNORM_NFD
, status
) == UNORM_NO
) {
673 int32_t safeoffset
= getNextSafeOffset(strsrch
->collator
,
675 if (safeoffset
!= length
) {
679 UChar buffer
[INITIAL_ARRAY_SIZE_
];
680 int32_t size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0,
681 buffer
, INITIAL_ARRAY_SIZE_
,
683 if (U_FAILURE(*status
)) {
686 if (size
>= INITIAL_ARRAY_SIZE_
) {
687 norm
= (UChar
*)allocateMemory((size
+ 1) * sizeof(UChar
),
689 // if allocation failed, status will be set to
690 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
692 size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0, norm
,
694 if (U_FAILURE(*status
) && norm
!= NULL
) {
703 UCollationElements
*coleiter
= strsrch
->utilIter
;
704 ucol_setText(coleiter
, norm
, size
, status
);
705 uint32_t firstce
= strsrch
->pattern
.CE
[0];
706 UBool ignorable
= TRUE
;
707 uint32_t ce
= UCOL_IGNORABLE
;
708 while (U_SUCCESS(*status
) && ce
!= firstce
) {
709 offset
= ucol_getOffset(coleiter
);
710 if (ce
!= firstce
&& ce
!= UCOL_IGNORABLE
) {
713 ce
= ucol_next(coleiter
, status
);
716 UTF_PREV_CHAR(norm
, 0, offset
, codepoint
);
717 result
= !ignorable
&& (u_getCombiningClass(codepoint
) != 0);
719 if (norm
!= buffer
) {
729 * Used by exact matches, checks if there are accents before the match.
730 * This is really painful... we have to check that composite characters at
731 * the start of the matches have to not have any extra accents.
732 * We check the FCD of the character first, if it starts with an accent and
733 * the first pattern ce does not match the first ce of the character, we bail.
734 * Otherwise we try normalizing the first composite
735 * character and find the immediate decomposed character before the match to
736 * see if it is a non-ignorable accent.
737 * Now normalizing the first composite character is enough because we ensure
738 * that when the match is passed in here with extra beginning ces, the
739 * first or last ce that match has to occur within the first character.
740 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
741 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
742 * Note here that accents checking are slow and cautioned in the API docs.
743 * @param strsrch string search data
744 * @param start offset
746 * @return TRUE if there are accents on either side of the match,
750 UBool
hasAccentsBeforeMatch(const UStringSearch
*strsrch
, int32_t start
,
753 if (strsrch
->pattern
.hasPrefixAccents
) {
754 UCollationElements
*coleiter
= strsrch
->textIter
;
755 UErrorCode status
= U_ZERO_ERROR
;
756 // we have been iterating forwards previously
757 uint32_t ignorable
= TRUE
;
758 int32_t firstce
= strsrch
->pattern
.CE
[0];
760 setColEIterOffset(coleiter
, start
);
761 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
762 if (U_FAILURE(status
)) {
765 while (ce
!= firstce
) {
766 if (ce
!= UCOL_IGNORABLE
) {
769 ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
770 if (U_FAILURE(status
)) {
774 if (!ignorable
&& inNormBuf(coleiter
)) {
775 // within normalization buffer, discontiguous handled here
780 int32_t temp
= start
;
782 // accent = (getFCD(strsrch->search->text, &temp,
783 // strsrch->search->textLength)
784 // >> SECOND_LAST_BYTE_SHIFT_);
785 // however this code does not work well with VC7 .net in release mode.
786 // maybe the inlines for getFCD combined with shifting has bugs in
787 // VC7. anyways this is a work around.
788 UBool accent
= getFCD(strsrch
->search
->text
, &temp
,
789 strsrch
->search
->textLength
) > 0xFF;
791 return checkExtraMatchAccents(strsrch
, start
, end
, &status
);
798 UTF_BACK_1(strsrch
->search
->text
, 0, temp
);
799 if (getFCD(strsrch
->search
->text
, &temp
,
800 strsrch
->search
->textLength
) & LAST_BYTE_MASK_
) {
801 setColEIterOffset(coleiter
, start
);
802 ce
= ucol_previous(coleiter
, &status
);
803 if (U_FAILURE(status
) ||
804 (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
)) {
815 * Used by exact matches, checks if there are accents bounding the match.
816 * Note this is the initial boundary check. If the potential match
817 * starts or ends with composite characters, the accents in those
818 * characters will be determined later.
819 * Not doing backwards iteration here, since discontiguous contractions for
820 * backwards collation element iterator, use up too many characters.
821 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
822 * should fail since there is a acute at the end of \u01FA
823 * Note here that accents checking are slow and cautioned in the API docs.
824 * @param strsrch string search data
825 * @param start offset of match
826 * @param end end offset of the match
827 * @return TRUE if there are accents on either side of the match,
831 UBool
hasAccentsAfterMatch(const UStringSearch
*strsrch
, int32_t start
,
834 if (strsrch
->pattern
.hasSuffixAccents
) {
835 const UChar
*text
= strsrch
->search
->text
;
837 int32_t textlength
= strsrch
->search
->textLength
;
838 UTF_BACK_1(text
, 0, temp
);
839 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
840 int32_t firstce
= strsrch
->pattern
.CE
[0];
841 UCollationElements
*coleiter
= strsrch
->textIter
;
842 UErrorCode status
= U_ZERO_ERROR
;
843 setColEIterOffset(coleiter
, start
);
844 while (getCE(strsrch
, ucol_next(coleiter
, &status
)) != firstce
) {
845 if (U_FAILURE(status
)) {
850 while (count
< strsrch
->pattern
.CELength
) {
851 if (getCE(strsrch
, ucol_next(coleiter
, &status
))
853 // Thai can give an ignorable here.
856 if (U_FAILURE(status
)) {
861 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
862 if (U_FAILURE(status
)) {
865 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
866 if (ucol_getOffset(coleiter
) <= end
) {
869 if (getFCD(text
, &end
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
879 * Checks if the offset runs out of the text string
881 * @param textlength of the text string
882 * @return TRUE if offset is out of bounds, FALSE otherwise
885 inline UBool
isOutOfBounds(int32_t textlength
, int32_t offset
)
887 return offset
< 0 || offset
> textlength
;
891 * Checks for identical match
892 * @param strsrch string search data
893 * @param start offset of possible match
894 * @param end offset of possible match
895 * @return TRUE if identical match is found
898 inline UBool
checkIdentical(const UStringSearch
*strsrch
, int32_t start
,
901 UChar t2
[32], p2
[32];
902 int32_t length
= end
- start
;
903 if (strsrch
->strength
!= UCOL_IDENTICAL
) {
907 UErrorCode status
= U_ZERO_ERROR
, status2
= U_ZERO_ERROR
;
908 int32_t decomplength
= unorm_decompose(t2
, LENGTHOF(t2
),
909 strsrch
->search
->text
+ start
, length
,
911 // use separate status2 in case of buffer overflow
912 if (decomplength
!= unorm_decompose(p2
, LENGTHOF(p2
),
913 strsrch
->pattern
.text
,
914 strsrch
->pattern
.textLength
,
915 FALSE
, 0, &status2
)) {
916 return FALSE
; // lengths are different
920 UChar
*text
, *pattern
;
921 if(U_SUCCESS(status
)) {
924 } else if(status
==U_BUFFER_OVERFLOW_ERROR
) {
925 status
= U_ZERO_ERROR
;
926 // allocate one buffer for both decompositions
927 text
= (UChar
*)uprv_malloc(decomplength
* 2 * U_SIZEOF_UCHAR
);
928 pattern
= text
+ decomplength
;
929 unorm_decompose(text
, decomplength
, strsrch
->search
->text
+ start
,
930 length
, FALSE
, 0, &status
);
931 unorm_decompose(pattern
, decomplength
, strsrch
->pattern
.text
,
932 strsrch
->pattern
.textLength
, FALSE
, 0, &status
);
934 // NFD failed, make sure that u_memcmp() does not overrun t2 & p2
935 // and that we don't uprv_free() an undefined text pointer
939 UBool result
= (UBool
)(u_memcmp(pattern
, text
, decomplength
) == 0);
943 // return FALSE if NFD failed
944 return U_SUCCESS(status
) && result
;
948 * Checks to see if the match is repeated
949 * @param strsrch string search data
950 * @param start new match start index
951 * @param end new match end index
952 * @return TRUE if the the match is repeated, FALSE otherwise
955 inline UBool
checkRepeatedMatch(UStringSearch
*strsrch
,
959 int32_t lastmatchindex
= strsrch
->search
->matchedIndex
;
961 if (lastmatchindex
== USEARCH_DONE
) {
964 if (strsrch
->search
->isForwardSearching
) {
965 result
= start
<= lastmatchindex
;
968 result
= start
>= lastmatchindex
;
970 if (!result
&& !strsrch
->search
->isOverlap
) {
971 if (strsrch
->search
->isForwardSearching
) {
972 result
= start
< lastmatchindex
+ strsrch
->search
->matchedLength
;
975 result
= end
> lastmatchindex
;
982 * Gets the collation element iterator's current offset.
983 * @param coleiter collation element iterator
984 * @param forwards flag TRUE if we are moving in th forwards direction
985 * @return current offset
988 inline int32_t getColElemIterOffset(const UCollationElements
*coleiter
,
991 int32_t result
= ucol_getOffset(coleiter
);
992 // intricacies of the the backwards collation element iterator
993 if (!forwards
&& inNormBuf(coleiter
) && !isFCDPointerNull(coleiter
)) {
1000 * Checks match for contraction.
1001 * If the match ends with a partial contraction we fail.
1002 * If the match starts too far off (because of backwards iteration) we try to
1003 * chip off the extra characters depending on whether a breakiterator has
1005 * Internal method, error assumed to be success, caller has to check status
1006 * before calling this method.
1007 * @param strsrch string search data
1008 * @param start offset of potential match, to be modified if necessary
1009 * @param end offset of potential match, to be modified if necessary
1010 * @param status output error status if any
1011 * @return TRUE if match passes the contraction test, FALSE otherwise
1015 UBool
checkNextExactContractionMatch(UStringSearch
*strsrch
,
1017 int32_t *end
, UErrorCode
*status
)
1019 UCollationElements
*coleiter
= strsrch
->textIter
;
1020 int32_t textlength
= strsrch
->search
->textLength
;
1021 int32_t temp
= *start
;
1022 const UCollator
*collator
= strsrch
->collator
;
1023 const UChar
*text
= strsrch
->search
->text
;
1024 // This part checks if either ends of the match contains potential
1025 // contraction. If so we'll have to iterate through them
1026 // The start contraction needs to be checked since ucol_previous dumps
1027 // all characters till the first safe character into the buffer.
1028 // *start + 1 is used to test for the unsafe characters instead of *start
1029 // because ucol_prev takes all unsafe characters till the first safe
1030 // character ie *start. so by testing *start + 1, we can estimate if
1031 // excess prefix characters has been included in the potential search
1033 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1034 (*start
+ 1 < textlength
1035 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1036 int32_t expansion
= getExpansionPrefix(coleiter
);
1037 UBool expandflag
= expansion
> 0;
1038 setColEIterOffset(coleiter
, *start
);
1039 while (expansion
> 0) {
1040 // getting rid of the redundant ce, caused by setOffset.
1041 // since backward contraction/expansion may have extra ces if we
1042 // are in the normalization buffer, hasAccentsBeforeMatch would
1043 // have taken care of it.
1044 // E.g. the character \u01FA will have an expansion of 3, but if
1045 // we are only looking for acute and ring \u030A and \u0301, we'll
1046 // have to skip the first ce in the expansion buffer.
1047 ucol_next(coleiter
, status
);
1048 if (U_FAILURE(*status
)) {
1051 if (ucol_getOffset(coleiter
) != temp
) {
1053 temp
= ucol_getOffset(coleiter
);
1058 int32_t *patternce
= strsrch
->pattern
.CE
;
1059 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1061 while (count
< patterncelength
) {
1062 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1063 if (ce
== UCOL_IGNORABLE
) {
1066 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1068 temp
= ucol_getOffset(coleiter
);
1070 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1072 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1082 * Checks and sets the match information if found.
1085 * <li> the potential match does not repeat the previous match
1086 * <li> boundaries are correct
1087 * <li> exact matches has no extra accents
1088 * <li> identical matchesb
1089 * <li> potential match does not end in the middle of a contraction
1091 * Otherwise the offset will be shifted to the next character.
1092 * Internal method, status assumed to be success, caller has to check status
1093 * before calling this method.
1094 * @param strsrch string search data
1095 * @param textoffset offset in the collation element text. the returned value
1096 * will be the truncated end offset of the match or the new start
1098 * @param status output error status if any
1099 * @return TRUE if the match is valid, FALSE otherwise
1102 inline UBool
checkNextExactMatch(UStringSearch
*strsrch
,
1103 int32_t *textoffset
, UErrorCode
*status
)
1105 UCollationElements
*coleiter
= strsrch
->textIter
;
1106 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1108 if (!checkNextExactContractionMatch(strsrch
, &start
, textoffset
, status
)) {
1112 // this totally matches, however we need to check if it is repeating
1113 if (!isBreakUnit(strsrch
, start
, *textoffset
) ||
1114 checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1115 hasAccentsBeforeMatch(strsrch
, start
, *textoffset
) ||
1116 !checkIdentical(strsrch
, start
, *textoffset
) ||
1117 hasAccentsAfterMatch(strsrch
, start
, *textoffset
)) {
1120 *textoffset
= getNextUStringSearchBaseOffset(strsrch
, *textoffset
);
1124 // totally match, we will get rid of the ending ignorables.
1125 strsrch
->search
->matchedIndex
= start
;
1126 strsrch
->search
->matchedLength
= *textoffset
- start
;
1131 * Getting the previous base character offset, or the current offset if the
1132 * current character is a base character
1133 * @param text string
1134 * @param textoffset one offset after the current character
1135 * @return the offset of the next character after the base character or the first
1136 * composed character with accents
1139 inline int32_t getPreviousBaseOffset(const UChar
*text
,
1142 if (textoffset
> 0) {
1144 int32_t result
= textoffset
;
1145 UTF_BACK_1(text
, 0, textoffset
);
1146 int32_t temp
= textoffset
;
1147 uint16_t fcd
= getFCD(text
, &temp
, result
);
1148 if ((fcd
>> SECOND_LAST_BYTE_SHIFT_
) == 0) {
1149 if (fcd
& LAST_BYTE_MASK_
) {
1154 if (textoffset
== 0) {
1163 * Getting the indexes of the accents that are not blocked in the argument
1165 * @param accents array of accents in nfd terminated by a 0.
1166 * @param accentsindex array of indexes of the accents that are not blocked
1169 inline int getUnblockedAccentIndex(UChar
*accents
, int32_t *accentsindex
)
1172 int32_t length
= u_strlen(accents
);
1173 UChar32 codepoint
= 0;
1177 while (index
< length
) {
1179 UTF_NEXT_CHAR(accents
, index
, length
, codepoint
);
1180 if (u_getCombiningClass(codepoint
) != cclass
) {
1181 cclass
= u_getCombiningClass(codepoint
);
1182 accentsindex
[result
] = temp
;
1186 accentsindex
[result
] = length
;
1191 * Appends 3 UChar arrays to a destination array.
1192 * Creates a new array if we run out of space. The caller will have to
1193 * manually deallocate the newly allocated array.
1194 * Internal method, status assumed to be success, caller has to check status
1195 * before calling this method. destination not to be NULL and has at least
1196 * size destinationlength.
1197 * @param destination target array
1198 * @param destinationlength target array size, returning the appended length
1199 * @param source1 null-terminated first array
1200 * @param source2 second array
1201 * @param source2length length of seond array
1202 * @param source3 null-terminated third array
1203 * @param status error status if any
1204 * @return new destination array, destination if there was no new allocation
1207 inline UChar
* addToUCharArray( UChar
*destination
,
1208 int32_t *destinationlength
,
1209 const UChar
*source1
,
1210 const UChar
*source2
,
1211 int32_t source2length
,
1212 const UChar
*source3
,
1215 int32_t source1length
= source1
? u_strlen(source1
) : 0;
1216 int32_t source3length
= source3
? u_strlen(source3
) : 0;
1217 if (*destinationlength
< source1length
+ source2length
+ source3length
+
1220 destination
= (UChar
*)allocateMemory(
1221 (source1length
+ source2length
+ source3length
+ 1) * sizeof(UChar
),
1223 // if error allocating memory, status will be
1224 // U_MEMORY_ALLOCATION_ERROR
1225 if (U_FAILURE(*status
)) {
1226 *destinationlength
= 0;
1230 if (source1length
!= 0) {
1231 uprv_memcpy(destination
, source1
, sizeof(UChar
) * source1length
);
1233 if (source2length
!= 0) {
1234 uprv_memcpy(destination
+ source1length
, source2
,
1235 sizeof(UChar
) * source2length
);
1237 if (source3length
!= 0) {
1238 uprv_memcpy(destination
+ source1length
+ source2length
, source3
,
1239 sizeof(UChar
) * source3length
);
1241 *destinationlength
= source1length
+ source2length
+ source3length
;
1246 * Running through a collation element iterator to see if the contents matches
1247 * pattern in string search data
1248 * @param strsrch string search data
1249 * @param coleiter collation element iterator
1250 * @return TRUE if a match if found, FALSE otherwise
1253 inline UBool
checkCollationMatch(const UStringSearch
*strsrch
,
1254 UCollationElements
*coleiter
)
1256 int patternceindex
= strsrch
->pattern
.CELength
;
1257 int32_t *patternce
= strsrch
->pattern
.CE
;
1258 UErrorCode status
= U_ZERO_ERROR
;
1259 while (patternceindex
> 0) {
1260 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
1261 if (ce
== UCOL_IGNORABLE
) {
1264 if (U_FAILURE(status
) || ce
!= *patternce
) {
1274 * Rearranges the front accents to try matching.
1275 * Prefix accents in the text will be grouped according to their combining
1276 * class and the groups will be mixed and matched to try find the perfect
1277 * match with the pattern.
1278 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1279 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1280 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1282 * step 2: check if any of the generated substrings matches the pattern.
1283 * Internal method, status is assumed to be success, caller has to check status
1284 * before calling this method.
1285 * @param strsrch string search match
1286 * @param start first offset of the accents to start searching
1287 * @param end start of the last accent set
1288 * @param status output error status if any
1289 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1290 * offset of the match. Note this start includes all preceding accents.
1293 int32_t doNextCanonicalPrefixMatch(UStringSearch
*strsrch
,
1298 const UChar
*text
= strsrch
->search
->text
;
1299 int32_t textlength
= strsrch
->search
->textLength
;
1300 int32_t tempstart
= start
;
1302 if ((getFCD(text
, &tempstart
, textlength
) & LAST_BYTE_MASK_
) == 0) {
1303 // die... failed at a base character
1304 return USEARCH_DONE
;
1307 int32_t offset
= getNextBaseOffset(text
, tempstart
, textlength
);
1308 start
= getPreviousBaseOffset(text
, tempstart
);
1310 UChar accents
[INITIAL_ARRAY_SIZE_
];
1311 // normalizing the offensive string
1312 unorm_normalize(text
+ start
, offset
- start
, UNORM_NFD
, 0, accents
,
1313 INITIAL_ARRAY_SIZE_
, status
);
1314 if (U_FAILURE(*status
)) {
1315 return USEARCH_DONE
;
1318 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1319 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1321 int32_t count
= (2 << (accentsize
- 1)) - 1;
1322 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1323 UCollationElements
*coleiter
= strsrch
->utilIter
;
1324 while (U_SUCCESS(*status
) && count
> 0) {
1325 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
1326 // copy the base characters
1327 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1328 *rearrange
++ = accents
[k
];
1330 // forming all possible canonical rearrangement by dropping
1332 for (int i
= 0; i
<= accentsize
- 1; i
++) {
1333 int32_t mask
= 1 << (accentsize
- i
- 1);
1335 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1336 *rearrange
++ = accents
[j
];
1341 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
1342 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
1343 strsrch
->canonicalPrefixAccents
,
1344 strsrch
->search
->text
+ offset
,
1346 strsrch
->canonicalSuffixAccents
,
1349 // if status is a failure, ucol_setText does nothing.
1350 // run the collator iterator through this match
1351 ucol_setText(coleiter
, match
, matchsize
, status
);
1352 if (U_SUCCESS(*status
)) {
1353 if (checkCollationMatch(strsrch
, coleiter
)) {
1354 if (match
!= buffer
) {
1362 return USEARCH_DONE
;
1366 * Gets the offset to the safe point in text before textoffset.
1367 * ie. not the middle of a contraction, swappable characters or supplementary
1369 * @param collator collation sata
1370 * @param text string to work with
1371 * @param textoffset offset in string
1372 * @param textlength length of text string
1373 * @return offset to the previous safe character
1376 inline uint32_t getPreviousSafeOffset(const UCollator
*collator
,
1380 int32_t result
= textoffset
; // first contraction character
1381 while (result
!= 0 && ucol_unsafeCP(text
[result
- 1], collator
)) {
1385 // the first contraction character is consider unsafe here
1392 * Cleaning up after we passed the safe zone
1393 * @param strsrch string search data
1394 * @param safetext safe text array
1395 * @param safebuffer safe text buffer
1396 * @param coleiter collation element iterator for safe text
1399 inline void cleanUpSafeText(const UStringSearch
*strsrch
, UChar
*safetext
,
1402 if (safetext
!= safebuffer
&& safetext
!= strsrch
->canonicalSuffixAccents
)
1404 uprv_free(safetext
);
1409 * Take the rearranged end accents and tries matching. If match failed at
1410 * a seperate preceding set of accents (seperated from the rearranged on by
1411 * at least a base character) then we rearrange the preceding accents and
1412 * tries matching again.
1413 * We allow skipping of the ends of the accent set if the ces do not match.
1414 * However if the failure is found before the accent set, it fails.
1415 * Internal method, status assumed to be success, caller has to check status
1416 * before calling this method.
1417 * @param strsrch string search data
1418 * @param textoffset of the start of the rearranged accent
1419 * @param status output error status if any
1420 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1421 * offset of the match. Note this start includes all preceding accents.
1424 int32_t doNextCanonicalSuffixMatch(UStringSearch
*strsrch
,
1428 const UChar
*text
= strsrch
->search
->text
;
1429 const UCollator
*collator
= strsrch
->collator
;
1430 int32_t safelength
= 0;
1432 int32_t safetextlength
;
1433 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
1434 UCollationElements
*coleiter
= strsrch
->utilIter
;
1435 int32_t safeoffset
= textoffset
;
1437 if (textoffset
!= 0 && ucol_unsafeCP(strsrch
->canonicalSuffixAccents
[0],
1439 safeoffset
= getPreviousSafeOffset(collator
, text
, textoffset
);
1440 safelength
= textoffset
- safeoffset
;
1441 safetextlength
= INITIAL_ARRAY_SIZE_
;
1442 safetext
= addToUCharArray(safebuffer
, &safetextlength
, NULL
,
1443 text
+ safeoffset
, safelength
,
1444 strsrch
->canonicalSuffixAccents
,
1448 safetextlength
= u_strlen(strsrch
->canonicalSuffixAccents
);
1449 safetext
= strsrch
->canonicalSuffixAccents
;
1452 // if status is a failure, ucol_setText does nothing
1453 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
1454 // status checked in loop below
1456 int32_t *ce
= strsrch
->pattern
.CE
;
1457 int32_t celength
= strsrch
->pattern
.CELength
;
1458 int ceindex
= celength
- 1;
1459 UBool isSafe
= TRUE
; // indication flag for position in safe zone
1461 while (ceindex
>= 0) {
1462 int32_t textce
= ucol_previous(coleiter
, status
);
1463 if (U_FAILURE(*status
)) {
1465 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1467 return USEARCH_DONE
;
1469 if (textce
== UCOL_NULLORDER
) {
1470 // check if we have passed the safe buffer
1471 if (coleiter
== strsrch
->textIter
) {
1472 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1473 return USEARCH_DONE
;
1475 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1476 safetext
= safebuffer
;
1477 coleiter
= strsrch
->textIter
;
1478 setColEIterOffset(coleiter
, safeoffset
);
1479 // status checked at the start of the loop
1483 textce
= getCE(strsrch
, textce
);
1484 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
1485 // do the beginning stuff
1486 int32_t failedoffset
= getColElemIterOffset(coleiter
, FALSE
);
1487 if (isSafe
&& failedoffset
>= safelength
) {
1488 // alas... no hope. failed at rearranged accent set
1489 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1490 return USEARCH_DONE
;
1494 failedoffset
+= safeoffset
;
1495 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1498 // try rearranging the front accents
1499 int32_t result
= doNextCanonicalPrefixMatch(strsrch
,
1500 failedoffset
, textoffset
, status
);
1501 if (result
!= USEARCH_DONE
) {
1502 // if status is a failure, ucol_setOffset does nothing
1503 setColEIterOffset(strsrch
->textIter
, result
);
1505 if (U_FAILURE(*status
)) {
1506 return USEARCH_DONE
;
1511 if (textce
== ce
[ceindex
]) {
1517 int32_t result
= getColElemIterOffset(coleiter
, FALSE
);
1518 // sets the text iterator here with the correct expansion and offset
1519 int32_t leftoverces
= getExpansionPrefix(coleiter
);
1520 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1521 if (result
>= safelength
) {
1522 result
= textoffset
;
1525 result
+= safeoffset
;
1527 setColEIterOffset(strsrch
->textIter
, result
);
1528 strsrch
->textIter
->iteratordata_
.toReturn
=
1529 setExpansionPrefix(strsrch
->textIter
, leftoverces
);
1533 return ucol_getOffset(coleiter
);
1537 * Trying out the substring and sees if it can be a canonical match.
1538 * This will try normalizing the end accents and arranging them into canonical
1539 * equivalents and check their corresponding ces with the pattern ce.
1540 * Suffix accents in the text will be grouped according to their combining
1541 * class and the groups will be mixed and matched to try find the perfect
1542 * match with the pattern.
1543 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1544 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1545 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1547 * step 2: check if any of the generated substrings matches the pattern.
1548 * Internal method, status assumed to be success, caller has to check status
1549 * before calling this method.
1550 * @param strsrch string search data
1551 * @param textoffset end offset in the collation element text that ends with
1552 * the accents to be rearranged
1553 * @param status error status if any
1554 * @return TRUE if the match is valid, FALSE otherwise
1557 UBool
doNextCanonicalMatch(UStringSearch
*strsrch
,
1561 const UChar
*text
= strsrch
->search
->text
;
1562 int32_t temp
= textoffset
;
1563 UTF_BACK_1(text
, 0, temp
);
1564 if ((getFCD(text
, &temp
, textoffset
) & LAST_BYTE_MASK_
) == 0) {
1565 UCollationElements
*coleiter
= strsrch
->textIter
;
1566 int32_t offset
= getColElemIterOffset(coleiter
, FALSE
);
1567 if (strsrch
->pattern
.hasPrefixAccents
) {
1568 offset
= doNextCanonicalPrefixMatch(strsrch
, offset
, textoffset
,
1570 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
1571 setColEIterOffset(coleiter
, offset
);
1578 if (!strsrch
->pattern
.hasSuffixAccents
) {
1582 UChar accents
[INITIAL_ARRAY_SIZE_
];
1583 // offset to the last base character in substring to search
1584 int32_t baseoffset
= getPreviousBaseOffset(text
, textoffset
);
1585 // normalizing the offensive string
1586 unorm_normalize(text
+ baseoffset
, textoffset
- baseoffset
, UNORM_NFD
,
1587 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
1588 // status checked in loop below
1590 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1591 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
1593 // 2 power n - 1 plus the full set of accents
1594 int32_t count
= (2 << (size
- 1)) - 1;
1595 while (U_SUCCESS(*status
) && count
> 0) {
1596 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
1597 // copy the base characters
1598 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1599 *rearrange
++ = accents
[k
];
1601 // forming all possible canonical rearrangement by dropping
1603 for (int i
= 0; i
<= size
- 1; i
++) {
1604 int32_t mask
= 1 << (size
- i
- 1);
1606 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1607 *rearrange
++ = accents
[j
];
1612 int32_t offset
= doNextCanonicalSuffixMatch(strsrch
, baseoffset
,
1614 if (offset
!= USEARCH_DONE
) {
1615 return TRUE
; // match found
1623 * Gets the previous base character offset depending on the string search
1625 * @param strsrch string search data
1626 * @param textoffset current offset, current character
1627 * @return the offset of the next character after this base character or itself
1628 * if it is a composed character with accents
1631 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch
*strsrch
,
1634 if (strsrch
->pattern
.hasPrefixAccents
&& textoffset
> 0) {
1635 const UChar
*text
= strsrch
->search
->text
;
1636 int32_t offset
= textoffset
;
1637 if (getFCD(text
, &offset
, strsrch
->search
->textLength
) >>
1638 SECOND_LAST_BYTE_SHIFT_
) {
1639 return getPreviousBaseOffset(text
, textoffset
);
1646 * Checks match for contraction.
1647 * If the match ends with a partial contraction we fail.
1648 * If the match starts too far off (because of backwards iteration) we try to
1649 * chip off the extra characters
1650 * Internal method, status assumed to be success, caller has to check status
1651 * before calling this method.
1652 * @param strsrch string search data
1653 * @param start offset of potential match, to be modified if necessary
1654 * @param end offset of potential match, to be modified if necessary
1655 * @param status output error status if any
1656 * @return TRUE if match passes the contraction test, FALSE otherwise
1659 UBool
checkNextCanonicalContractionMatch(UStringSearch
*strsrch
,
1664 UCollationElements
*coleiter
= strsrch
->textIter
;
1665 int32_t textlength
= strsrch
->search
->textLength
;
1666 int32_t temp
= *start
;
1667 const UCollator
*collator
= strsrch
->collator
;
1668 const UChar
*text
= strsrch
->search
->text
;
1669 // This part checks if either ends of the match contains potential
1670 // contraction. If so we'll have to iterate through them
1671 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1672 (*start
+ 1 < textlength
1673 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1674 int32_t expansion
= getExpansionPrefix(coleiter
);
1675 UBool expandflag
= expansion
> 0;
1676 setColEIterOffset(coleiter
, *start
);
1677 while (expansion
> 0) {
1678 // getting rid of the redundant ce, caused by setOffset.
1679 // since backward contraction/expansion may have extra ces if we
1680 // are in the normalization buffer, hasAccentsBeforeMatch would
1681 // have taken care of it.
1682 // E.g. the character \u01FA will have an expansion of 3, but if
1683 // we are only looking for acute and ring \u030A and \u0301, we'll
1684 // have to skip the first ce in the expansion buffer.
1685 ucol_next(coleiter
, status
);
1686 if (U_FAILURE(*status
)) {
1689 if (ucol_getOffset(coleiter
) != temp
) {
1691 temp
= ucol_getOffset(coleiter
);
1696 int32_t *patternce
= strsrch
->pattern
.CE
;
1697 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1699 int32_t textlength
= strsrch
->search
->textLength
;
1700 while (count
< patterncelength
) {
1701 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1702 // status checked below, note that if status is a failure
1703 // ucol_next returns UCOL_NULLORDER
1704 if (ce
== UCOL_IGNORABLE
) {
1707 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1709 temp
= ucol_getOffset(coleiter
);
1712 if (count
== 0 && ce
!= patternce
[0]) {
1713 // accents may have extra starting ces, this occurs when a
1714 // pure accent pattern is matched without rearrangement
1715 // text \u0325\u0300 and looking for \u0300
1716 int32_t expected
= patternce
[0];
1717 if (getFCD(text
, start
, textlength
) & LAST_BYTE_MASK_
) {
1718 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1719 while (U_SUCCESS(*status
) && ce
!= expected
&&
1720 ce
!= UCOL_NULLORDER
&&
1721 ucol_getOffset(coleiter
) <= *end
) {
1722 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1726 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1728 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1738 * Checks and sets the match information if found.
1741 * <li> the potential match does not repeat the previous match
1742 * <li> boundaries are correct
1743 * <li> potential match does not end in the middle of a contraction
1744 * <li> identical matches
1746 * Otherwise the offset will be shifted to the next character.
1747 * Internal method, status assumed to be success, caller has to check the
1748 * status before calling this method.
1749 * @param strsrch string search data
1750 * @param textoffset offset in the collation element text. the returned value
1751 * will be the truncated end offset of the match or the new start
1753 * @param status output error status if any
1754 * @return TRUE if the match is valid, FALSE otherwise
1757 inline UBool
checkNextCanonicalMatch(UStringSearch
*strsrch
,
1758 int32_t *textoffset
,
1761 // to ensure that the start and ends are not composite characters
1762 UCollationElements
*coleiter
= strsrch
->textIter
;
1763 // if we have a canonical accent match
1764 if ((strsrch
->pattern
.hasSuffixAccents
&&
1765 strsrch
->canonicalSuffixAccents
[0]) ||
1766 (strsrch
->pattern
.hasPrefixAccents
&&
1767 strsrch
->canonicalPrefixAccents
[0])) {
1768 strsrch
->search
->matchedIndex
= getPreviousUStringSearchBaseOffset(
1770 ucol_getOffset(coleiter
));
1771 strsrch
->search
->matchedLength
= *textoffset
-
1772 strsrch
->search
->matchedIndex
;
1776 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1777 if (!checkNextCanonicalContractionMatch(strsrch
, &start
, textoffset
,
1778 status
) || U_FAILURE(*status
)) {
1782 start
= getPreviousUStringSearchBaseOffset(strsrch
, start
);
1783 // this totally matches, however we need to check if it is repeating
1784 if (checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1785 !isBreakUnit(strsrch
, start
, *textoffset
) ||
1786 !checkIdentical(strsrch
, start
, *textoffset
)) {
1788 *textoffset
= getNextBaseOffset(strsrch
->search
->text
, *textoffset
,
1789 strsrch
->search
->textLength
);
1793 strsrch
->search
->matchedIndex
= start
;
1794 strsrch
->search
->matchedLength
= *textoffset
- start
;
1799 * Shifting the collation element iterator position forward to prepare for
1800 * a preceding match. If the first character is a unsafe character, we'll only
1801 * shift by 1 to capture contractions, normalization etc.
1802 * Internal method, status assumed to be success, caller has to check status
1803 * before calling this method.
1804 * @param text strsrch string search data
1805 * @param textoffset start text position to do search
1806 * @param ce the text ce which failed the match.
1807 * @param patternceindex index of the ce within the pattern ce buffer which
1809 * @return final offset
1812 inline int32_t reverseShift(UStringSearch
*strsrch
,
1815 int32_t patternceindex
)
1817 if (strsrch
->search
->isOverlap
) {
1818 if (textoffset
!= strsrch
->search
->textLength
) {
1822 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1826 if (ce
!= UCOL_NULLORDER
) {
1827 int32_t shift
= strsrch
->pattern
.backShift
[hash(ce
)];
1829 // this is to adjust for characters in the middle of the substring
1830 // for matching that failed.
1831 int32_t adjust
= patternceindex
;
1832 if (adjust
> 1 && shift
> adjust
) {
1833 shift
-= adjust
- 1;
1835 textoffset
-= shift
;
1838 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1841 textoffset
= getPreviousUStringSearchBaseOffset(strsrch
, textoffset
);
1846 * Checks match for contraction.
1847 * If the match starts with a partial contraction we fail.
1848 * Internal method, status assumed to be success, caller has to check status
1849 * before calling this method.
1850 * @param strsrch string search data
1851 * @param start offset of potential match, to be modified if necessary
1852 * @param end offset of potential match, to be modified if necessary
1853 * @param status output error status if any
1854 * @return TRUE if match passes the contraction test, FALSE otherwise
1857 UBool
checkPreviousExactContractionMatch(UStringSearch
*strsrch
,
1859 int32_t *end
, UErrorCode
*status
)
1861 UCollationElements
*coleiter
= strsrch
->textIter
;
1862 int32_t textlength
= strsrch
->search
->textLength
;
1863 int32_t temp
= *end
;
1864 const UCollator
*collator
= strsrch
->collator
;
1865 const UChar
*text
= strsrch
->search
->text
;
1866 // This part checks if either if the start of the match contains potential
1867 // contraction. If so we'll have to iterate through them
1868 // Since we used ucol_next while previously looking for the potential
1869 // match, this guarantees that our end will not be a partial contraction,
1870 // or a partial supplementary character.
1871 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
1872 int32_t expansion
= getExpansionSuffix(coleiter
);
1873 UBool expandflag
= expansion
> 0;
1874 setColEIterOffset(coleiter
, *end
);
1875 while (U_SUCCESS(*status
) && expansion
> 0) {
1876 // getting rid of the redundant ce
1877 // since forward contraction/expansion may have extra ces
1878 // if we are in the normalization buffer, hasAccentsBeforeMatch
1879 // would have taken care of it.
1880 // E.g. the character \u01FA will have an expansion of 3, but if
1881 // we are only looking for A ring A\u030A, we'll have to skip the
1882 // last ce in the expansion buffer
1883 ucol_previous(coleiter
, status
);
1884 if (U_FAILURE(*status
)) {
1887 if (ucol_getOffset(coleiter
) != temp
) {
1889 temp
= ucol_getOffset(coleiter
);
1894 int32_t *patternce
= strsrch
->pattern
.CE
;
1895 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1896 int32_t count
= patterncelength
;
1898 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
1899 // status checked below, note that if status is a failure
1900 // ucol_previous returns UCOL_NULLORDER
1901 if (ce
== UCOL_IGNORABLE
) {
1904 if (expandflag
&& count
== 0 &&
1905 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
1907 temp
= ucol_getOffset(coleiter
);
1909 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
1911 *start
= getPreviousBaseOffset(text
, *start
);
1921 * Checks and sets the match information if found.
1924 * <li> the current match does not repeat the last match
1925 * <li> boundaries are correct
1926 * <li> exact matches has no extra accents
1927 * <li> identical matches
1929 * Otherwise the offset will be shifted to the preceding character.
1930 * Internal method, status assumed to be success, caller has to check status
1931 * before calling this method.
1932 * @param strsrch string search data
1934 * @param coleiter collation element iterator
1935 * @param text string
1936 * @param textoffset offset in the collation element text. the returned value
1937 * will be the truncated start offset of the match or the new start
1939 * @param status output error status if any
1940 * @return TRUE if the match is valid, FALSE otherwise
1943 inline UBool
checkPreviousExactMatch(UStringSearch
*strsrch
,
1944 int32_t *textoffset
,
1947 // to ensure that the start and ends are not composite characters
1948 int32_t end
= ucol_getOffset(strsrch
->textIter
);
1949 if (!checkPreviousExactContractionMatch(strsrch
, textoffset
, &end
, status
)
1950 || U_FAILURE(*status
)) {
1954 // this totally matches, however we need to check if it is repeating
1956 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
1957 !isBreakUnit(strsrch
, *textoffset
, end
) ||
1958 hasAccentsBeforeMatch(strsrch
, *textoffset
, end
) ||
1959 !checkIdentical(strsrch
, *textoffset
, end
) ||
1960 hasAccentsAfterMatch(strsrch
, *textoffset
, end
)) {
1962 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
1966 strsrch
->search
->matchedIndex
= *textoffset
;
1967 strsrch
->search
->matchedLength
= end
- *textoffset
;
1972 * Rearranges the end accents to try matching.
1973 * Suffix accents in the text will be grouped according to their combining
1974 * class and the groups will be mixed and matched to try find the perfect
1975 * match with the pattern.
1976 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1977 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1978 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1980 * step 2: check if any of the generated substrings matches the pattern.
1981 * Internal method, status assumed to be success, user has to check status
1982 * before calling this method.
1983 * @param strsrch string search match
1984 * @param start offset of the first base character
1985 * @param end start of the last accent set
1986 * @param status only error status if any
1987 * @return USEARCH_DONE if a match is not found, otherwise return the ending
1988 * offset of the match. Note this start includes all following accents.
1991 int32_t doPreviousCanonicalSuffixMatch(UStringSearch
*strsrch
,
1996 const UChar
*text
= strsrch
->search
->text
;
1997 int32_t tempend
= end
;
1999 UTF_BACK_1(text
, 0, tempend
);
2000 if (!(getFCD(text
, &tempend
, strsrch
->search
->textLength
) &
2002 // die... failed at a base character
2003 return USEARCH_DONE
;
2005 end
= getNextBaseOffset(text
, end
, strsrch
->search
->textLength
);
2007 if (U_SUCCESS(*status
)) {
2008 UChar accents
[INITIAL_ARRAY_SIZE_
];
2009 int32_t offset
= getPreviousBaseOffset(text
, end
);
2010 // normalizing the offensive string
2011 unorm_normalize(text
+ offset
, end
- offset
, UNORM_NFD
, 0, accents
,
2012 INITIAL_ARRAY_SIZE_
, status
);
2014 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2015 int32_t accentsize
= getUnblockedAccentIndex(accents
,
2017 int32_t count
= (2 << (accentsize
- 1)) - 1;
2018 UChar buffer
[INITIAL_ARRAY_SIZE_
];
2019 UCollationElements
*coleiter
= strsrch
->utilIter
;
2020 while (U_SUCCESS(*status
) && count
> 0) {
2021 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
2022 // copy the base characters
2023 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2024 *rearrange
++ = accents
[k
];
2026 // forming all possible canonical rearrangement by dropping
2028 for (int i
= 0; i
<= accentsize
- 1; i
++) {
2029 int32_t mask
= 1 << (accentsize
- i
- 1);
2031 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2032 *rearrange
++ = accents
[j
];
2037 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
2038 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
2039 strsrch
->canonicalPrefixAccents
,
2040 strsrch
->search
->text
+ start
,
2042 strsrch
->canonicalSuffixAccents
,
2045 // run the collator iterator through this match
2046 // if status is a failure ucol_setText does nothing
2047 ucol_setText(coleiter
, match
, matchsize
, status
);
2048 if (U_SUCCESS(*status
)) {
2049 if (checkCollationMatch(strsrch
, coleiter
)) {
2050 if (match
!= buffer
) {
2059 return USEARCH_DONE
;
2063 * Take the rearranged start accents and tries matching. If match failed at
2064 * a seperate following set of accents (seperated from the rearranged on by
2065 * at least a base character) then we rearrange the preceding accents and
2066 * tries matching again.
2067 * We allow skipping of the ends of the accent set if the ces do not match.
2068 * However if the failure is found before the accent set, it fails.
2069 * Internal method, status assumed to be success, caller has to check status
2070 * before calling this method.
2071 * @param strsrch string search data
2072 * @param textoffset of the ends of the rearranged accent
2073 * @param status output error status if any
2074 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2075 * offset of the match. Note this start includes all following accents.
2078 int32_t doPreviousCanonicalPrefixMatch(UStringSearch
*strsrch
,
2082 const UChar
*text
= strsrch
->search
->text
;
2083 const UCollator
*collator
= strsrch
->collator
;
2084 int32_t safelength
= 0;
2086 int32_t safetextlength
;
2087 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
2088 int32_t safeoffset
= textoffset
;
2091 ucol_unsafeCP(strsrch
->canonicalPrefixAccents
[
2092 u_strlen(strsrch
->canonicalPrefixAccents
) - 1
2094 safeoffset
= getNextSafeOffset(collator
, text
, textoffset
,
2095 strsrch
->search
->textLength
);
2096 safelength
= safeoffset
- textoffset
;
2097 safetextlength
= INITIAL_ARRAY_SIZE_
;
2098 safetext
= addToUCharArray(safebuffer
, &safetextlength
,
2099 strsrch
->canonicalPrefixAccents
,
2100 text
+ textoffset
, safelength
,
2104 safetextlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2105 safetext
= strsrch
->canonicalPrefixAccents
;
2108 UCollationElements
*coleiter
= strsrch
->utilIter
;
2109 // if status is a failure, ucol_setText does nothing
2110 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
2111 // status checked in loop below
2113 int32_t *ce
= strsrch
->pattern
.CE
;
2114 int32_t celength
= strsrch
->pattern
.CELength
;
2116 UBool isSafe
= TRUE
; // safe zone indication flag for position
2117 int32_t prefixlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2119 while (ceindex
< celength
) {
2120 int32_t textce
= ucol_next(coleiter
, status
);
2121 if (U_FAILURE(*status
)) {
2123 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2125 return USEARCH_DONE
;
2127 if (textce
== UCOL_NULLORDER
) {
2128 // check if we have passed the safe buffer
2129 if (coleiter
== strsrch
->textIter
) {
2130 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2131 return USEARCH_DONE
;
2133 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2134 safetext
= safebuffer
;
2135 coleiter
= strsrch
->textIter
;
2136 setColEIterOffset(coleiter
, safeoffset
);
2137 // status checked at the start of the loop
2141 textce
= getCE(strsrch
, textce
);
2142 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
2143 // do the beginning stuff
2144 int32_t failedoffset
= ucol_getOffset(coleiter
);
2145 if (isSafe
&& failedoffset
<= prefixlength
) {
2146 // alas... no hope. failed at rearranged accent set
2147 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2148 return USEARCH_DONE
;
2152 failedoffset
= safeoffset
- failedoffset
;
2153 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2156 // try rearranging the end accents
2157 int32_t result
= doPreviousCanonicalSuffixMatch(strsrch
,
2158 textoffset
, failedoffset
, status
);
2159 if (result
!= USEARCH_DONE
) {
2160 // if status is a failure, ucol_setOffset does nothing
2161 setColEIterOffset(strsrch
->textIter
, result
);
2163 if (U_FAILURE(*status
)) {
2164 return USEARCH_DONE
;
2169 if (textce
== ce
[ceindex
]) {
2175 int32_t result
= ucol_getOffset(coleiter
);
2176 // sets the text iterator here with the correct expansion and offset
2177 int32_t leftoverces
= getExpansionSuffix(coleiter
);
2178 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2179 if (result
<= prefixlength
) {
2180 result
= textoffset
;
2183 result
= textoffset
+ (safeoffset
- result
);
2185 setColEIterOffset(strsrch
->textIter
, result
);
2186 setExpansionSuffix(strsrch
->textIter
, leftoverces
);
2190 return ucol_getOffset(coleiter
);
2194 * Trying out the substring and sees if it can be a canonical match.
2195 * This will try normalizing the starting accents and arranging them into
2196 * canonical equivalents and check their corresponding ces with the pattern ce.
2197 * Prefix accents in the text will be grouped according to their combining
2198 * class and the groups will be mixed and matched to try find the perfect
2199 * match with the pattern.
2200 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2201 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2202 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2204 * step 2: check if any of the generated substrings matches the pattern.
2205 * Internal method, status assumed to be success, caller has to check status
2206 * before calling this method.
2207 * @param strsrch string search data
2208 * @param textoffset start offset in the collation element text that starts
2209 * with the accents to be rearranged
2210 * @param status output error status if any
2211 * @return TRUE if the match is valid, FALSE otherwise
2214 UBool
doPreviousCanonicalMatch(UStringSearch
*strsrch
,
2218 const UChar
*text
= strsrch
->search
->text
;
2219 int32_t temp
= textoffset
;
2220 int32_t textlength
= strsrch
->search
->textLength
;
2221 if ((getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) == 0) {
2222 UCollationElements
*coleiter
= strsrch
->textIter
;
2223 int32_t offset
= ucol_getOffset(coleiter
);
2224 if (strsrch
->pattern
.hasSuffixAccents
) {
2225 offset
= doPreviousCanonicalSuffixMatch(strsrch
, textoffset
,
2227 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
2228 setColEIterOffset(coleiter
, offset
);
2235 if (!strsrch
->pattern
.hasPrefixAccents
) {
2239 UChar accents
[INITIAL_ARRAY_SIZE_
];
2240 // offset to the last base character in substring to search
2241 int32_t baseoffset
= getNextBaseOffset(text
, textoffset
, textlength
);
2242 // normalizing the offensive string
2243 unorm_normalize(text
+ textoffset
, baseoffset
- textoffset
, UNORM_NFD
,
2244 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
2245 // status checked in loop
2247 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2248 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
2250 // 2 power n - 1 plus the full set of accents
2251 int32_t count
= (2 << (size
- 1)) - 1;
2252 while (U_SUCCESS(*status
) && count
> 0) {
2253 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
2254 // copy the base characters
2255 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2256 *rearrange
++ = accents
[k
];
2258 // forming all possible canonical rearrangement by dropping
2260 for (int i
= 0; i
<= size
- 1; i
++) {
2261 int32_t mask
= 1 << (size
- i
- 1);
2263 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2264 *rearrange
++ = accents
[j
];
2269 int32_t offset
= doPreviousCanonicalPrefixMatch(strsrch
,
2270 baseoffset
, status
);
2271 if (offset
!= USEARCH_DONE
) {
2272 return TRUE
; // match found
2280 * Checks match for contraction.
2281 * If the match starts with a partial contraction we fail.
2282 * Internal method, status assumed to be success, caller has to check status
2283 * before calling this method.
2284 * @param strsrch string search data
2285 * @param start offset of potential match, to be modified if necessary
2286 * @param end offset of potential match, to be modified if necessary
2287 * @param status only error status if any
2288 * @return TRUE if match passes the contraction test, FALSE otherwise
2291 UBool
checkPreviousCanonicalContractionMatch(UStringSearch
*strsrch
,
2293 int32_t *end
, UErrorCode
*status
)
2295 UCollationElements
*coleiter
= strsrch
->textIter
;
2296 int32_t textlength
= strsrch
->search
->textLength
;
2297 int32_t temp
= *end
;
2298 const UCollator
*collator
= strsrch
->collator
;
2299 const UChar
*text
= strsrch
->search
->text
;
2300 // This part checks if either if the start of the match contains potential
2301 // contraction. If so we'll have to iterate through them
2302 // Since we used ucol_next while previously looking for the potential
2303 // match, this guarantees that our end will not be a partial contraction,
2304 // or a partial supplementary character.
2305 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2306 int32_t expansion
= getExpansionSuffix(coleiter
);
2307 UBool expandflag
= expansion
> 0;
2308 setColEIterOffset(coleiter
, *end
);
2309 while (expansion
> 0) {
2310 // getting rid of the redundant ce
2311 // since forward contraction/expansion may have extra ces
2312 // if we are in the normalization buffer, hasAccentsBeforeMatch
2313 // would have taken care of it.
2314 // E.g. the character \u01FA will have an expansion of 3, but if
2315 // we are only looking for A ring A\u030A, we'll have to skip the
2316 // last ce in the expansion buffer
2317 ucol_previous(coleiter
, status
);
2318 if (U_FAILURE(*status
)) {
2321 if (ucol_getOffset(coleiter
) != temp
) {
2323 temp
= ucol_getOffset(coleiter
);
2328 int32_t *patternce
= strsrch
->pattern
.CE
;
2329 int32_t patterncelength
= strsrch
->pattern
.CELength
;
2330 int32_t count
= patterncelength
;
2332 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2333 // status checked below, note that if status is a failure
2334 // ucol_previous returns UCOL_NULLORDER
2335 if (ce
== UCOL_IGNORABLE
) {
2338 if (expandflag
&& count
== 0 &&
2339 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2341 temp
= ucol_getOffset(coleiter
);
2343 if (count
== patterncelength
&&
2344 ce
!= patternce
[patterncelength
- 1]) {
2345 // accents may have extra starting ces, this occurs when a
2346 // pure accent pattern is matched without rearrangement
2347 int32_t expected
= patternce
[patterncelength
- 1];
2348 UTF_BACK_1(text
, 0, *end
);
2349 if (getFCD(text
, end
, textlength
) & LAST_BYTE_MASK_
) {
2350 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2351 while (U_SUCCESS(*status
) && ce
!= expected
&&
2352 ce
!= UCOL_NULLORDER
&&
2353 ucol_getOffset(coleiter
) <= *start
) {
2354 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2358 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2360 *start
= getPreviousBaseOffset(text
, *start
);
2370 * Checks and sets the match information if found.
2373 * <li> the potential match does not repeat the previous match
2374 * <li> boundaries are correct
2375 * <li> potential match does not end in the middle of a contraction
2376 * <li> identical matches
2378 * Otherwise the offset will be shifted to the next character.
2379 * Internal method, status assumed to be success, caller has to check status
2380 * before calling this method.
2381 * @param strsrch string search data
2382 * @param textoffset offset in the collation element text. the returned value
2383 * will be the truncated start offset of the match or the new start
2385 * @param status only error status if any
2386 * @return TRUE if the match is valid, FALSE otherwise
2389 inline UBool
checkPreviousCanonicalMatch(UStringSearch
*strsrch
,
2390 int32_t *textoffset
,
2393 // to ensure that the start and ends are not composite characters
2394 UCollationElements
*coleiter
= strsrch
->textIter
;
2395 // if we have a canonical accent match
2396 if ((strsrch
->pattern
.hasSuffixAccents
&&
2397 strsrch
->canonicalSuffixAccents
[0]) ||
2398 (strsrch
->pattern
.hasPrefixAccents
&&
2399 strsrch
->canonicalPrefixAccents
[0])) {
2400 strsrch
->search
->matchedIndex
= *textoffset
;
2401 strsrch
->search
->matchedLength
=
2402 getNextUStringSearchBaseOffset(strsrch
,
2403 getColElemIterOffset(coleiter
, FALSE
))
2408 int32_t end
= ucol_getOffset(coleiter
);
2409 if (!checkPreviousCanonicalContractionMatch(strsrch
, textoffset
, &end
,
2411 U_FAILURE(*status
)) {
2415 end
= getNextUStringSearchBaseOffset(strsrch
, end
);
2416 // this totally matches, however we need to check if it is repeating
2417 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2418 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2419 !checkIdentical(strsrch
, *textoffset
, end
)) {
2421 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
2426 strsrch
->search
->matchedIndex
= *textoffset
;
2427 strsrch
->search
->matchedLength
= end
- *textoffset
;
2431 // constructors and destructor -------------------------------------------
2433 U_CAPI UStringSearch
* U_EXPORT2
usearch_open(const UChar
*pattern
,
2434 int32_t patternlength
,
2438 UBreakIterator
*breakiter
,
2441 if (U_FAILURE(*status
)) {
2444 #if UCONFIG_NO_BREAK_ITERATION
2445 if (breakiter
!= NULL
) {
2446 *status
= U_UNSUPPORTED_ERROR
;
2451 // ucol_open internally checks for status
2452 UCollator
*collator
= ucol_open(locale
, status
);
2453 // pattern, text checks are done in usearch_openFromCollator
2454 UStringSearch
*result
= usearch_openFromCollator(pattern
,
2455 patternlength
, text
, textlength
,
2456 collator
, breakiter
, status
);
2458 if (result
== NULL
|| U_FAILURE(*status
)) {
2460 ucol_close(collator
);
2465 result
->ownCollator
= TRUE
;
2469 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2473 U_CAPI UStringSearch
* U_EXPORT2
usearch_openFromCollator(
2474 const UChar
*pattern
,
2475 int32_t patternlength
,
2478 const UCollator
*collator
,
2479 UBreakIterator
*breakiter
,
2482 if (U_FAILURE(*status
)) {
2485 #if UCONFIG_NO_BREAK_ITERATION
2486 if (breakiter
!= NULL
) {
2487 *status
= U_UNSUPPORTED_ERROR
;
2491 if (pattern
== NULL
|| text
== NULL
|| collator
== NULL
) {
2492 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2496 // string search does not really work when numeric collation is turned on
2497 if(ucol_getAttribute(collator
, UCOL_NUMERIC_COLLATION
, status
) == UCOL_ON
) {
2498 *status
= U_UNSUPPORTED_ERROR
;
2502 if (U_SUCCESS(*status
)) {
2503 initializeFCD(status
);
2504 if (U_FAILURE(*status
)) {
2508 UStringSearch
*result
;
2509 if (textlength
== -1) {
2510 textlength
= u_strlen(text
);
2512 if (patternlength
== -1) {
2513 patternlength
= u_strlen(pattern
);
2515 if (textlength
<= 0 || patternlength
<= 0) {
2516 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2520 result
= (UStringSearch
*)uprv_malloc(sizeof(UStringSearch
));
2521 if (result
== NULL
) {
2522 *status
= U_MEMORY_ALLOCATION_ERROR
;
2526 result
->collator
= collator
;
2527 result
->strength
= ucol_getStrength(collator
);
2528 result
->ceMask
= getMask(result
->strength
);
2530 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2532 result
->variableTop
= ucol_getVariableTop(collator
, status
);
2534 if (U_FAILURE(*status
)) {
2539 result
->search
= (USearch
*)uprv_malloc(sizeof(USearch
));
2540 if (result
->search
== NULL
) {
2541 *status
= U_MEMORY_ALLOCATION_ERROR
;
2546 result
->search
->text
= text
;
2547 result
->search
->textLength
= textlength
;
2549 result
->pattern
.text
= pattern
;
2550 result
->pattern
.textLength
= patternlength
;
2551 result
->pattern
.CE
= NULL
;
2553 result
->search
->breakIter
= breakiter
;
2554 #if !UCONFIG_NO_BREAK_ITERATION
2556 ubrk_setText(breakiter
, text
, textlength
, status
);
2560 result
->ownCollator
= FALSE
;
2561 result
->search
->matchedLength
= 0;
2562 result
->search
->matchedIndex
= USEARCH_DONE
;
2563 result
->textIter
= ucol_openElements(collator
, text
,
2564 textlength
, status
);
2565 if (U_FAILURE(*status
)) {
2566 usearch_close(result
);
2570 result
->utilIter
= NULL
;
2572 result
->search
->isOverlap
= FALSE
;
2573 result
->search
->isCanonicalMatch
= FALSE
;
2574 result
->search
->isForwardSearching
= TRUE
;
2575 result
->search
->reset
= TRUE
;
2577 initialize(result
, status
);
2579 if (U_FAILURE(*status
)) {
2580 usearch_close(result
);
2589 U_CAPI
void U_EXPORT2
usearch_close(UStringSearch
*strsrch
)
2592 if (strsrch
->pattern
.CE
!= strsrch
->pattern
.CEBuffer
&&
2593 strsrch
->pattern
.CE
) {
2594 uprv_free(strsrch
->pattern
.CE
);
2596 ucol_closeElements(strsrch
->textIter
);
2597 ucol_closeElements(strsrch
->utilIter
);
2598 if (strsrch
->ownCollator
&& strsrch
->collator
) {
2599 ucol_close((UCollator
*)strsrch
->collator
);
2601 uprv_free(strsrch
->search
);
2606 // set and get methods --------------------------------------------------
2608 U_CAPI
void U_EXPORT2
usearch_setOffset(UStringSearch
*strsrch
,
2612 if (U_SUCCESS(*status
) && strsrch
) {
2613 if (isOutOfBounds(strsrch
->search
->textLength
, position
)) {
2614 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2617 setColEIterOffset(strsrch
->textIter
, position
);
2619 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2620 strsrch
->search
->matchedLength
= 0;
2621 strsrch
->search
->reset
= FALSE
;
2625 U_CAPI
int32_t U_EXPORT2
usearch_getOffset(const UStringSearch
*strsrch
)
2628 int32_t result
= ucol_getOffset(strsrch
->textIter
);
2629 if (isOutOfBounds(strsrch
->search
->textLength
, result
)) {
2630 return USEARCH_DONE
;
2634 return USEARCH_DONE
;
2637 U_CAPI
void U_EXPORT2
usearch_setAttribute(UStringSearch
*strsrch
,
2638 USearchAttribute attribute
,
2639 USearchAttributeValue value
,
2642 if (U_SUCCESS(*status
) && strsrch
) {
2645 case USEARCH_OVERLAP
:
2646 strsrch
->search
->isOverlap
= (value
== USEARCH_ON
? TRUE
: FALSE
);
2648 case USEARCH_CANONICAL_MATCH
:
2649 strsrch
->search
->isCanonicalMatch
= (value
== USEARCH_ON
? TRUE
:
2652 case USEARCH_ATTRIBUTE_COUNT
:
2654 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2657 if (value
== USEARCH_ATTRIBUTE_VALUE_COUNT
) {
2658 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2662 U_CAPI USearchAttributeValue U_EXPORT2
usearch_getAttribute(
2663 const UStringSearch
*strsrch
,
2664 USearchAttribute attribute
)
2667 switch (attribute
) {
2668 case USEARCH_OVERLAP
:
2669 return (strsrch
->search
->isOverlap
== TRUE
? USEARCH_ON
:
2671 case USEARCH_CANONICAL_MATCH
:
2672 return (strsrch
->search
->isCanonicalMatch
== TRUE
? USEARCH_ON
:
2674 case USEARCH_ATTRIBUTE_COUNT
:
2675 return USEARCH_DEFAULT
;
2678 return USEARCH_DEFAULT
;
2681 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedStart(
2682 const UStringSearch
*strsrch
)
2684 if (strsrch
== NULL
) {
2685 return USEARCH_DONE
;
2687 return strsrch
->search
->matchedIndex
;
2691 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedText(const UStringSearch
*strsrch
,
2693 int32_t resultCapacity
,
2696 if (U_FAILURE(*status
)) {
2697 return USEARCH_DONE
;
2699 if (strsrch
== NULL
|| resultCapacity
< 0 || (resultCapacity
> 0 &&
2701 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2702 return USEARCH_DONE
;
2705 int32_t copylength
= strsrch
->search
->matchedLength
;
2706 int32_t copyindex
= strsrch
->search
->matchedIndex
;
2707 if (copyindex
== USEARCH_DONE
) {
2708 u_terminateUChars(result
, resultCapacity
, 0, status
);
2709 return USEARCH_DONE
;
2712 if (resultCapacity
< copylength
) {
2713 copylength
= resultCapacity
;
2715 if (copylength
> 0) {
2716 uprv_memcpy(result
, strsrch
->search
->text
+ copyindex
,
2717 copylength
* sizeof(UChar
));
2719 return u_terminateUChars(result
, resultCapacity
,
2720 strsrch
->search
->matchedLength
, status
);
2723 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedLength(
2724 const UStringSearch
*strsrch
)
2727 return strsrch
->search
->matchedLength
;
2729 return USEARCH_DONE
;
2732 #if !UCONFIG_NO_BREAK_ITERATION
2734 U_CAPI
void U_EXPORT2
usearch_setBreakIterator(UStringSearch
*strsrch
,
2735 UBreakIterator
*breakiter
,
2738 if (U_SUCCESS(*status
) && strsrch
) {
2739 strsrch
->search
->breakIter
= breakiter
;
2741 ubrk_setText(breakiter
, strsrch
->search
->text
,
2742 strsrch
->search
->textLength
, status
);
2747 U_CAPI
const UBreakIterator
* U_EXPORT2
2748 usearch_getBreakIterator(const UStringSearch
*strsrch
)
2751 return strsrch
->search
->breakIter
;
2758 U_CAPI
void U_EXPORT2
usearch_setText( UStringSearch
*strsrch
,
2763 if (U_SUCCESS(*status
)) {
2764 if (strsrch
== NULL
|| text
== NULL
|| textlength
< -1 ||
2766 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2769 if (textlength
== -1) {
2770 textlength
= u_strlen(text
);
2772 strsrch
->search
->text
= text
;
2773 strsrch
->search
->textLength
= textlength
;
2774 ucol_setText(strsrch
->textIter
, text
, textlength
, status
);
2775 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2776 strsrch
->search
->matchedLength
= 0;
2777 strsrch
->search
->reset
= TRUE
;
2778 #if !UCONFIG_NO_BREAK_ITERATION
2779 if (strsrch
->search
->breakIter
!= NULL
) {
2780 ubrk_setText(strsrch
->search
->breakIter
, text
,
2781 textlength
, status
);
2788 U_CAPI
const UChar
* U_EXPORT2
usearch_getText(const UStringSearch
*strsrch
,
2792 *length
= strsrch
->search
->textLength
;
2793 return strsrch
->search
->text
;
2798 U_CAPI
void U_EXPORT2
usearch_setCollator( UStringSearch
*strsrch
,
2799 const UCollator
*collator
,
2802 if (U_SUCCESS(*status
)) {
2803 if (collator
== NULL
) {
2804 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2808 if (strsrch
->ownCollator
&& (strsrch
->collator
!= collator
)) {
2809 ucol_close((UCollator
*)strsrch
->collator
);
2810 strsrch
->ownCollator
= FALSE
;
2812 strsrch
->collator
= collator
;
2813 strsrch
->strength
= ucol_getStrength(collator
);
2814 strsrch
->ceMask
= getMask(strsrch
->strength
);
2815 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
2817 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2819 // if status is a failure, ucol_getVariableTop returns 0
2820 strsrch
->variableTop
= ucol_getVariableTop(collator
, status
);
2821 if (U_SUCCESS(*status
)) {
2822 initialize(strsrch
, status
);
2823 if (U_SUCCESS(*status
)) {
2824 uprv_init_collIterate(collator
, strsrch
->search
->text
,
2825 strsrch
->search
->textLength
,
2826 &(strsrch
->textIter
->iteratordata_
));
2827 strsrch
->utilIter
->iteratordata_
.coll
= collator
;
2834 U_CAPI UCollator
* U_EXPORT2
usearch_getCollator(const UStringSearch
*strsrch
)
2837 return (UCollator
*)strsrch
->collator
;
2842 U_CAPI
void U_EXPORT2
usearch_setPattern( UStringSearch
*strsrch
,
2843 const UChar
*pattern
,
2844 int32_t patternlength
,
2847 if (U_SUCCESS(*status
)) {
2848 if (strsrch
== NULL
|| pattern
== NULL
) {
2849 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2852 if (patternlength
== -1) {
2853 patternlength
= u_strlen(pattern
);
2855 if (patternlength
== 0) {
2856 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2859 strsrch
->pattern
.text
= pattern
;
2860 strsrch
->pattern
.textLength
= patternlength
;
2861 initialize(strsrch
, status
);
2866 U_CAPI
const UChar
* U_EXPORT2
2867 usearch_getPattern(const UStringSearch
*strsrch
,
2871 *length
= strsrch
->pattern
.textLength
;
2872 return strsrch
->pattern
.text
;
2877 // miscellaneous methods --------------------------------------------------
2879 U_CAPI
int32_t U_EXPORT2
usearch_first(UStringSearch
*strsrch
,
2882 if (strsrch
&& U_SUCCESS(*status
)) {
2883 strsrch
->search
->isForwardSearching
= TRUE
;
2884 usearch_setOffset(strsrch
, 0, status
);
2885 if (U_SUCCESS(*status
)) {
2886 return usearch_next(strsrch
, status
);
2889 return USEARCH_DONE
;
2892 U_CAPI
int32_t U_EXPORT2
usearch_following(UStringSearch
*strsrch
,
2896 if (strsrch
&& U_SUCCESS(*status
)) {
2897 strsrch
->search
->isForwardSearching
= TRUE
;
2898 // position checked in usearch_setOffset
2899 usearch_setOffset(strsrch
, position
, status
);
2900 if (U_SUCCESS(*status
)) {
2901 return usearch_next(strsrch
, status
);
2904 return USEARCH_DONE
;
2907 U_CAPI
int32_t U_EXPORT2
usearch_last(UStringSearch
*strsrch
,
2910 if (strsrch
&& U_SUCCESS(*status
)) {
2911 strsrch
->search
->isForwardSearching
= FALSE
;
2912 usearch_setOffset(strsrch
, strsrch
->search
->textLength
, status
);
2913 if (U_SUCCESS(*status
)) {
2914 return usearch_previous(strsrch
, status
);
2917 return USEARCH_DONE
;
2920 U_CAPI
int32_t U_EXPORT2
usearch_preceding(UStringSearch
*strsrch
,
2924 if (strsrch
&& U_SUCCESS(*status
)) {
2925 strsrch
->search
->isForwardSearching
= FALSE
;
2926 // position checked in usearch_setOffset
2927 usearch_setOffset(strsrch
, position
, status
);
2928 if (U_SUCCESS(*status
)) {
2929 return usearch_previous(strsrch
, status
);
2932 return USEARCH_DONE
;
2936 * If a direction switch is required, we'll count the number of ces till the
2937 * beginning of the collation element iterator and iterate forwards that
2938 * number of times. This is so that we get to the correct point within the
2939 * string to continue the search in. Imagine when we are in the middle of the
2940 * normalization buffer when the change in direction is request. arrrgghh....
2941 * After searching the offset within the collation element iterator will be
2942 * shifted to the start of the match. If a match is not found, the offset would
2943 * have been set to the end of the text string in the collation element
2945 * Okay, here's my take on normalization buffer. The only time when there can
2946 * be 2 matches within the same normalization is when the pattern is consists
2947 * of all accents. But since the offset returned is from the text string, we
2948 * should not confuse the caller by returning the second match within the
2949 * same normalization buffer. If we do, the 2 results will have the same match
2950 * offsets, and that'll be confusing. I'll return the next match that doesn't
2951 * fall within the same normalization buffer. Note this does not affect the
2952 * results of matches spanning the text and the normalization buffer.
2953 * The position to start searching is taken from the collation element
2954 * iterator. Callers of this API would have to set the offset in the collation
2955 * element iterator before using this method.
2957 U_CAPI
int32_t U_EXPORT2
usearch_next(UStringSearch
*strsrch
,
2960 if (U_SUCCESS(*status
) && strsrch
) {
2961 // note offset is either equivalent to the start of the previous match
2962 // or is set by the user
2963 int32_t offset
= usearch_getOffset(strsrch
);
2964 USearch
*search
= strsrch
->search
;
2965 search
->reset
= FALSE
;
2966 int32_t textlength
= search
->textLength
;
2967 if (search
->isForwardSearching
) {
2968 if (offset
== textlength
2969 || (!search
->isOverlap
&&
2970 (offset
+ strsrch
->pattern
.defaultShiftSize
> textlength
||
2971 (search
->matchedIndex
!= USEARCH_DONE
&&
2972 offset
+ search
->matchedLength
>= textlength
)))) {
2973 // not enough characters to match
2974 setMatchNotFound(strsrch
);
2975 return USEARCH_DONE
;
2979 // switching direction.
2980 // if matchedIndex == USEARCH_DONE, it means that either a
2981 // setOffset has been called or that previous ran off the text
2982 // string. the iterator would have been set to offset 0 if a
2983 // match is not found.
2984 search
->isForwardSearching
= TRUE
;
2985 if (search
->matchedIndex
!= USEARCH_DONE
) {
2986 // there's no need to set the collation element iterator
2987 // the next call to next will set the offset.
2988 return search
->matchedIndex
;
2992 if (U_SUCCESS(*status
)) {
2993 if (strsrch
->pattern
.CELength
== 0) {
2994 if (search
->matchedIndex
== USEARCH_DONE
) {
2995 search
->matchedIndex
= offset
;
2997 else { // moves by codepoints
2998 UTF_FWD_1(search
->text
, search
->matchedIndex
, textlength
);
3001 search
->matchedLength
= 0;
3002 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3003 // status checked below
3004 if (search
->matchedIndex
== textlength
) {
3005 search
->matchedIndex
= USEARCH_DONE
;
3009 if (search
->matchedLength
> 0) {
3010 // if matchlength is 0 we are at the start of the iteration
3011 if (search
->isOverlap
) {
3012 ucol_setOffset(strsrch
->textIter
, offset
+ 1, status
);
3015 ucol_setOffset(strsrch
->textIter
,
3016 offset
+ search
->matchedLength
, status
);
3020 // for boundary check purposes. this will ensure that the
3021 // next match will not preceed the current offset
3022 // note search->matchedIndex will always be set to something
3024 search
->matchedIndex
= offset
- 1;
3027 if (search
->isCanonicalMatch
) {
3028 // can't use exact here since extra accents are allowed.
3029 usearch_handleNextCanonical(strsrch
, status
);
3032 usearch_handleNextExact(strsrch
, status
);
3036 if (U_FAILURE(*status
)) {
3037 return USEARCH_DONE
;
3040 return search
->matchedIndex
;
3043 return USEARCH_DONE
;
3046 U_CAPI
int32_t U_EXPORT2
usearch_previous(UStringSearch
*strsrch
,
3049 if (U_SUCCESS(*status
) && strsrch
) {
3051 USearch
*search
= strsrch
->search
;
3052 if (search
->reset
) {
3053 offset
= search
->textLength
;
3054 search
->isForwardSearching
= FALSE
;
3055 search
->reset
= FALSE
;
3056 setColEIterOffset(strsrch
->textIter
, offset
);
3059 offset
= usearch_getOffset(strsrch
);
3062 int32_t matchedindex
= search
->matchedIndex
;
3063 if (search
->isForwardSearching
== TRUE
) {
3064 // switching direction.
3065 // if matchedIndex == USEARCH_DONE, it means that either a
3066 // setOffset has been called or that next ran off the text
3067 // string. the iterator would have been set to offset textLength if
3068 // a match is not found.
3069 search
->isForwardSearching
= FALSE
;
3070 if (matchedindex
!= USEARCH_DONE
) {
3071 return matchedindex
;
3075 if (offset
== 0 || matchedindex
== 0 ||
3076 (!search
->isOverlap
&&
3077 (offset
< strsrch
->pattern
.defaultShiftSize
||
3078 (matchedindex
!= USEARCH_DONE
&&
3079 matchedindex
< strsrch
->pattern
.defaultShiftSize
)))) {
3080 // not enough characters to match
3081 setMatchNotFound(strsrch
);
3082 return USEARCH_DONE
;
3086 if (U_SUCCESS(*status
)) {
3087 if (strsrch
->pattern
.CELength
== 0) {
3088 search
->matchedIndex
=
3089 (matchedindex
== USEARCH_DONE
? offset
: matchedindex
);
3090 if (search
->matchedIndex
== 0) {
3091 setMatchNotFound(strsrch
);
3092 // status checked below
3094 else { // move by codepoints
3095 UTF_BACK_1(search
->text
, 0, search
->matchedIndex
);
3096 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3097 // status checked below
3098 search
->matchedLength
= 0;
3102 if (strsrch
->search
->isCanonicalMatch
) {
3103 // can't use exact here since extra accents are allowed.
3104 usearch_handlePreviousCanonical(strsrch
, status
);
3105 // status checked below
3108 usearch_handlePreviousExact(strsrch
, status
);
3109 // status checked below
3113 if (U_FAILURE(*status
)) {
3114 return USEARCH_DONE
;
3117 return search
->matchedIndex
;
3120 return USEARCH_DONE
;
3125 U_CAPI
void U_EXPORT2
usearch_reset(UStringSearch
*strsrch
)
3128 reset is setting the attributes that are already in
3129 string search, hence all attributes in the collator should
3130 be retrieved without any problems
3133 UErrorCode status
= U_ZERO_ERROR
;
3134 UBool sameCollAttribute
= TRUE
;
3139 strsrch
->strength
= ucol_getStrength(strsrch
->collator
);
3140 ceMask
= getMask(strsrch
->strength
);
3141 if (strsrch
->ceMask
!= ceMask
) {
3142 strsrch
->ceMask
= ceMask
;
3143 sameCollAttribute
= FALSE
;
3145 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3146 shift
= ucol_getAttribute(strsrch
->collator
, UCOL_ALTERNATE_HANDLING
,
3147 &status
) == UCOL_SHIFTED
;
3148 if (strsrch
->toShift
!= shift
) {
3149 strsrch
->toShift
= shift
;
3150 sameCollAttribute
= FALSE
;
3153 // if status is a failure, ucol_getVariableTop returns 0
3154 varTop
= ucol_getVariableTop(strsrch
->collator
, &status
);
3155 if (strsrch
->variableTop
!= varTop
) {
3156 strsrch
->variableTop
= varTop
;
3157 sameCollAttribute
= FALSE
;
3159 if (!sameCollAttribute
) {
3160 initialize(strsrch
, &status
);
3162 uprv_init_collIterate(strsrch
->collator
, strsrch
->search
->text
,
3163 strsrch
->search
->textLength
,
3164 &(strsrch
->textIter
->iteratordata_
));
3165 strsrch
->search
->matchedLength
= 0;
3166 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
3167 strsrch
->search
->isOverlap
= FALSE
;
3168 strsrch
->search
->isCanonicalMatch
= FALSE
;
3169 strsrch
->search
->isForwardSearching
= TRUE
;
3170 strsrch
->search
->reset
= TRUE
;
3174 // internal use methods declared in usrchimp.h -----------------------------
3176 UBool
usearch_handleNextExact(UStringSearch
*strsrch
, UErrorCode
*status
)
3178 if (U_FAILURE(*status
)) {
3179 setMatchNotFound(strsrch
);
3183 UCollationElements
*coleiter
= strsrch
->textIter
;
3184 int32_t textlength
= strsrch
->search
->textLength
;
3185 int32_t *patternce
= strsrch
->pattern
.CE
;
3186 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3187 int32_t textoffset
= ucol_getOffset(coleiter
);
3189 // status used in setting coleiter offset, since offset is checked in
3190 // shiftForward before setting the coleiter offset, status never
3192 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
3194 while (textoffset
<= textlength
)
3196 uint32_t patternceindex
= patterncelength
- 1;
3198 UBool found
= FALSE
;
3199 int32_t lastce
= UCOL_NULLORDER
;
3201 setColEIterOffset(coleiter
, textoffset
);
3204 // finding the last pattern ce match, imagine composite characters
3205 // for example: search for pattern A in text \u00C0
3206 // we'll have to skip \u0300 the grave first before we get to A
3207 targetce
= ucol_previous(coleiter
, status
);
3208 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3212 targetce
= getCE(strsrch
, targetce
);
3213 if (targetce
== UCOL_IGNORABLE
&& inNormBuf(coleiter
)) {
3214 // this is for the text \u0315\u0300 that requires
3215 // normalization and pattern \u0300, where \u0315 is ignorable
3218 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
3221 if (targetce
== patternce
[patternceindex
]) {
3222 // the first ce can be a contraction
3226 if (!hasExpansion(coleiter
)) {
3234 while (found
&& patternceindex
> 0) {
3235 targetce
= ucol_previous(coleiter
, status
);
3236 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3240 targetce
= getCE(strsrch
, targetce
);
3241 if (targetce
== UCOL_IGNORABLE
) {
3246 found
= found
&& targetce
== patternce
[patternceindex
];
3250 if (U_FAILURE(*status
)) {
3253 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
3255 // status checked at loop.
3256 patternceindex
= patterncelength
;
3260 if (checkNextExactMatch(strsrch
, &textoffset
, status
)) {
3261 // status checked in ucol_setOffset
3262 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
3266 setMatchNotFound(strsrch
);
3270 UBool
usearch_handleNextCanonical(UStringSearch
*strsrch
, UErrorCode
*status
)
3272 if (U_FAILURE(*status
)) {
3273 setMatchNotFound(strsrch
);
3277 UCollationElements
*coleiter
= strsrch
->textIter
;
3278 int32_t textlength
= strsrch
->search
->textLength
;
3279 int32_t *patternce
= strsrch
->pattern
.CE
;
3280 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3281 int32_t textoffset
= ucol_getOffset(coleiter
);
3282 UBool hasPatternAccents
=
3283 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
3285 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
3287 strsrch
->canonicalPrefixAccents
[0] = 0;
3288 strsrch
->canonicalSuffixAccents
[0] = 0;
3290 while (textoffset
<= textlength
)
3292 int32_t patternceindex
= patterncelength
- 1;
3294 UBool found
= FALSE
;
3295 int32_t lastce
= UCOL_NULLORDER
;
3297 setColEIterOffset(coleiter
, textoffset
);
3300 // finding the last pattern ce match, imagine composite characters
3301 // for example: search for pattern A in text \u00C0
3302 // we'll have to skip \u0300 the grave first before we get to A
3303 targetce
= ucol_previous(coleiter
, status
);
3304 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3308 targetce
= getCE(strsrch
, targetce
);
3309 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
3312 if (targetce
== patternce
[patternceindex
]) {
3313 // the first ce can be a contraction
3317 if (!hasExpansion(coleiter
)) {
3323 while (found
&& patternceindex
> 0) {
3324 targetce
= ucol_previous(coleiter
, status
);
3325 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3329 targetce
= getCE(strsrch
, targetce
);
3330 if (targetce
== UCOL_IGNORABLE
) {
3335 found
= found
&& targetce
== patternce
[patternceindex
];
3338 // initializing the rearranged accent array
3339 if (hasPatternAccents
&& !found
) {
3340 strsrch
->canonicalPrefixAccents
[0] = 0;
3341 strsrch
->canonicalSuffixAccents
[0] = 0;
3342 if (U_FAILURE(*status
)) {
3345 found
= doNextCanonicalMatch(strsrch
, textoffset
, status
);
3349 if (U_FAILURE(*status
)) {
3352 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
3354 // status checked at loop
3355 patternceindex
= patterncelength
;
3359 if (checkNextCanonicalMatch(strsrch
, &textoffset
, status
)) {
3360 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
3364 setMatchNotFound(strsrch
);
3368 UBool
usearch_handlePreviousExact(UStringSearch
*strsrch
, UErrorCode
*status
)
3370 if (U_FAILURE(*status
)) {
3371 setMatchNotFound(strsrch
);
3375 UCollationElements
*coleiter
= strsrch
->textIter
;
3376 int32_t *patternce
= strsrch
->pattern
.CE
;
3377 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3378 int32_t textoffset
= ucol_getOffset(coleiter
);
3380 // shifting it check for setting offset
3381 // if setOffset is called previously or there was no previous match, we
3382 // leave the offset as it is.
3383 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
3384 textoffset
= strsrch
->search
->matchedIndex
;
3387 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
3390 while (textoffset
>= 0)
3392 int32_t patternceindex
= 1;
3394 UBool found
= FALSE
;
3395 int32_t firstce
= UCOL_NULLORDER
;
3397 // if status is a failure, ucol_setOffset does nothing
3398 setColEIterOffset(coleiter
, textoffset
);
3401 // finding the first pattern ce match, imagine composite
3402 // characters. for example: search for pattern \u0300 in text
3403 // \u00C0, we'll have to skip A first before we get to
3404 // \u0300 the grave accent
3405 targetce
= ucol_next(coleiter
, status
);
3406 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3410 targetce
= getCE(strsrch
, targetce
);
3411 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
3414 if (targetce
== UCOL_IGNORABLE
) {
3417 if (targetce
== patternce
[0]) {
3421 if (!hasExpansion(coleiter
)) {
3422 // checking for accents in composite character
3430 while (found
&& (patternceindex
< patterncelength
)) {
3431 targetce
= ucol_next(coleiter
, status
);
3432 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3436 targetce
= getCE(strsrch
, targetce
);
3437 if (targetce
== UCOL_IGNORABLE
) {
3441 found
= found
&& targetce
== patternce
[patternceindex
];
3446 if (U_FAILURE(*status
)) {
3449 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
3455 if (checkPreviousExactMatch(strsrch
, &textoffset
, status
)) {
3456 setColEIterOffset(coleiter
, textoffset
);
3460 setMatchNotFound(strsrch
);
3464 UBool
usearch_handlePreviousCanonical(UStringSearch
*strsrch
,
3467 if (U_FAILURE(*status
)) {
3468 setMatchNotFound(strsrch
);
3472 UCollationElements
*coleiter
= strsrch
->textIter
;
3473 int32_t *patternce
= strsrch
->pattern
.CE
;
3474 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3475 int32_t textoffset
= ucol_getOffset(coleiter
);
3476 UBool hasPatternAccents
=
3477 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
3479 // shifting it check for setting offset
3480 // if setOffset is called previously or there was no previous match, we
3481 // leave the offset as it is.
3482 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
3483 textoffset
= strsrch
->search
->matchedIndex
;
3486 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
3488 strsrch
->canonicalPrefixAccents
[0] = 0;
3489 strsrch
->canonicalSuffixAccents
[0] = 0;
3491 while (textoffset
>= 0)
3493 int32_t patternceindex
= 1;
3495 UBool found
= FALSE
;
3496 int32_t firstce
= UCOL_NULLORDER
;
3498 setColEIterOffset(coleiter
, textoffset
);
3500 // finding the first pattern ce match, imagine composite
3501 // characters. for example: search for pattern \u0300 in text
3502 // \u00C0, we'll have to skip A first before we get to
3503 // \u0300 the grave accent
3504 targetce
= ucol_next(coleiter
, status
);
3505 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3509 targetce
= getCE(strsrch
, targetce
);
3510 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
3514 if (targetce
== patternce
[0]) {
3515 // the first ce can be a contraction
3519 if (!hasExpansion(coleiter
)) {
3520 // checking for accents in composite character
3528 while (found
&& patternceindex
< patterncelength
) {
3529 targetce
= ucol_next(coleiter
, status
);
3530 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3534 targetce
= getCE(strsrch
, targetce
);
3535 if (targetce
== UCOL_IGNORABLE
) {
3539 found
= found
&& targetce
== patternce
[patternceindex
];
3543 // initializing the rearranged accent array
3544 if (hasPatternAccents
&& !found
) {
3545 strsrch
->canonicalPrefixAccents
[0] = 0;
3546 strsrch
->canonicalSuffixAccents
[0] = 0;
3547 if (U_FAILURE(*status
)) {
3550 found
= doPreviousCanonicalMatch(strsrch
, textoffset
, status
);
3554 if (U_FAILURE(*status
)) {
3557 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
3563 if (checkPreviousCanonicalMatch(strsrch
, &textoffset
, status
)) {
3564 setColEIterOffset(coleiter
, textoffset
);
3568 setMatchNotFound(strsrch
);
3572 #endif /* #if !UCONFIG_NO_COLLATION */