2 **********************************************************************
3 * Copyright (C) 2001-2004 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
23 // internal definition ---------------------------------------------------
25 #define LAST_BYTE_MASK_ 0xFF
26 #define SECOND_LAST_BYTE_SHIFT_ 8
27 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
29 static const uint16_t *FCD_
= NULL
;
31 // internal methods -------------------------------------------------
34 * Fast collation element iterator setOffset.
35 * This function does not check for bounds.
36 * @param coleiter collation element iterator
37 * @param offset to set
40 inline void setColEIterOffset(UCollationElements
*elems
,
43 collIterate
*ci
= &(elems
->iteratordata_
);
44 ci
->pos
= ci
->string
+ offset
;
45 ci
->CEpos
= ci
->toReturn
= ci
->CEs
;
46 if (ci
->flags
& UCOL_ITER_INNORMBUF
) {
47 ci
->flags
= ci
->origFlags
;
49 ci
->fcdPosition
= NULL
;
53 * Getting the mask for collation strength
54 * @param strength collation strength
55 * @return collation element mask
58 inline uint32_t getMask(UCollationStrength strength
)
63 return UCOL_PRIMARYORDERMASK
;
65 return UCOL_SECONDARYORDERMASK
| UCOL_PRIMARYORDERMASK
;
67 return UCOL_TERTIARYORDERMASK
| UCOL_SECONDARYORDERMASK
|
68 UCOL_PRIMARYORDERMASK
;
73 * This is to squeeze the 21bit ces into a 256 table
74 * @param ce collation element
75 * @return collapsed version of the collation element
78 inline int hash(uint32_t ce
)
80 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
81 // well with the new collation where most of the latin 1 characters
82 // are of the value xx000xxx. their hashes will most of the time be 0
83 // to be discussed on the hash algo.
84 return UCOL_PRIMARYORDER(ce
) % MAX_TABLE_SIZE_
;
88 static UBool U_CALLCONV
89 usearch_cleanup(void) {
96 * Initializing the fcd tables.
97 * Internal method, status assumed to be a success.
98 * @param status output error if any, caller to check status before calling
99 * method, status assumed to be success when passed in.
102 inline void initializeFCD(UErrorCode
*status
)
105 FCD_
= unorm_getFCDTrie(status
);
106 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH
, usearch_cleanup
);
111 * Gets the fcd value for a character at the argument index.
112 * This method takes supplementary characters into account.
113 * @param str UTF16 string where character for fcd retrieval resides
114 * @param offset position of the character whose fcd is to be retrieved, to be
115 * overwritten with the next character position, taking
116 * surrogate characters into consideration.
117 * @param strlength length of the argument string
121 inline uint16_t getFCD(const UChar
*str
, int32_t *offset
,
124 int32_t temp
= *offset
;
126 UChar ch
= str
[temp
];
127 result
= unorm_getFCD16(FCD_
, ch
);
130 if (result
&& temp
!= strlength
&& UTF_IS_FIRST_SURROGATE(ch
)) {
132 if (UTF_IS_SECOND_SURROGATE(ch
)) {
133 result
= unorm_getFCD16FromSurrogatePair(FCD_
, result
, ch
);
144 * Getting the modified collation elements taking into account the collation
146 * @param strsrch string search data
148 * @return the modified collation element
151 inline int32_t getCE(const UStringSearch
*strsrch
, uint32_t sourcece
)
153 // note for tertiary we can't use the collator->tertiaryMask, that
154 // is a preprocessed mask that takes into account case options. since
155 // we are only concerned with exact matches, we don't need that.
156 sourcece
&= strsrch
->ceMask
;
158 if (strsrch
->toShift
) {
159 // alternate handling here, since only the 16 most significant digits
160 // is only used, we can safely do a compare without masking
161 // if the ce is a variable, we mask and get only the primary values
162 // no shifting to quartenary is required since all primary values
163 // less than variabletop will need to be masked off anyway.
164 if (strsrch
->variableTop
> sourcece
) {
165 if (strsrch
->strength
== UCOL_QUATERNARY
) {
166 sourcece
&= UCOL_PRIMARYORDERMASK
;
169 sourcece
= UCOL_IGNORABLE
;
178 * Allocates memory and returns NULL if the allocation failed.
179 * Internal method, status assumed to be a success.
180 * @param size to allocate
181 * @param status output error if any, caller to check status before calling
182 * method, status assumed to be success when passed in.
183 * @return newly allocated array, NULL otherwise
186 inline void * allocateMemory(uint32_t size
, UErrorCode
*status
)
188 uint32_t *result
= (uint32_t *)uprv_malloc(size
);
189 if (result
== NULL
) {
190 *status
= U_MEMORY_ALLOCATION_ERROR
;
196 * Adds a uint32_t value to a destination array.
197 * Creates a new array if we run out of space. The caller will have to
198 * manually deallocate the newly allocated array.
199 * Internal method, status assumed to be success, caller has to check status
200 * before calling this method. destination not to be NULL and has at least
201 * size destinationlength.
202 * @param destination target array
203 * @param offset destination offset to add value
204 * @param destinationlength target array size, return value for the new size
205 * @param value to be added
206 * @param increments incremental size expected
207 * @param status output error if any, caller to check status before calling
208 * method, status assumed to be success when passed in.
209 * @return new destination array, destination if there was no new allocation
212 inline int32_t * addTouint32_tArray(int32_t *destination
,
214 uint32_t *destinationlength
,
219 uint32_t newlength
= *destinationlength
;
220 if (offset
+ 1 == newlength
) {
221 newlength
+= increments
;
222 int32_t *temp
= (int32_t *)allocateMemory(
223 sizeof(int32_t) * newlength
, status
);
224 if (U_FAILURE(*status
)) {
227 uprv_memcpy(temp
, destination
, sizeof(int32_t) * offset
);
228 *destinationlength
= newlength
;
231 destination
[offset
] = value
;
236 * Initializing the ce table for a pattern.
237 * Stores non-ignorable collation keys.
238 * Table size will be estimated by the size of the pattern text. Table
239 * expansion will be performed as we go along. Adding 1 to ensure that the table
240 * size definitely increases.
241 * Internal method, status assumed to be a success.
242 * @param strsrch string search data
243 * @param status output error if any, caller to check status before calling
244 * method, status assumed to be success when passed in.
245 * @return total number of expansions
248 inline uint16_t initializePatternCETable(UStringSearch
*strsrch
,
251 UPattern
*pattern
= &(strsrch
->pattern
);
252 uint32_t cetablesize
= INITIAL_ARRAY_SIZE_
;
253 int32_t *cetable
= pattern
->CEBuffer
;
254 uint32_t patternlength
= pattern
->textLength
;
255 UCollationElements
*coleiter
= strsrch
->utilIter
;
257 if (coleiter
== NULL
) {
258 coleiter
= ucol_openElements(strsrch
->collator
, pattern
->text
,
259 patternlength
, status
);
260 // status will be checked in ucol_next(..) later and if it is an
261 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
263 strsrch
->utilIter
= coleiter
;
266 uprv_init_collIterate(strsrch
->collator
, pattern
->text
,
268 &coleiter
->iteratordata_
);
271 if (pattern
->CE
!= cetable
&& pattern
->CE
) {
272 uprv_free(pattern
->CE
);
279 while ((ce
= ucol_next(coleiter
, status
)) != UCOL_NULLORDER
&&
280 U_SUCCESS(*status
)) {
281 uint32_t newce
= getCE(strsrch
, ce
);
283 int32_t *temp
= addTouint32_tArray(cetable
, offset
, &cetablesize
,
285 patternlength
- ucol_getOffset(coleiter
) + 1,
287 if (U_FAILURE(*status
)) {
291 if (cetable
!= temp
&& cetable
!= pattern
->CEBuffer
) {
296 result
+= (uint16_t)(ucol_getMaxExpansion(coleiter
, ce
) - 1);
300 pattern
->CE
= cetable
;
301 pattern
->CELength
= offset
;
307 * Initializes the pattern struct.
308 * Internal method, status assumed to be success.
309 * @param strsrch UStringSearch data storage
310 * @param status output error if any, caller to check status before calling
311 * method, status assumed to be success when passed in.
312 * @return expansionsize the total expansion size of the pattern
315 inline int16_t initializePattern(UStringSearch
*strsrch
, UErrorCode
*status
)
317 UPattern
*pattern
= &(strsrch
->pattern
);
318 const UChar
*patterntext
= pattern
->text
;
319 int32_t length
= pattern
->textLength
;
322 pattern
->hasPrefixAccents
= getFCD(patterntext
, &index
, length
) >>
323 SECOND_LAST_BYTE_SHIFT_
;
325 UTF_BACK_1(patterntext
, 0, index
);
326 pattern
->hasSuffixAccents
= getFCD(patterntext
, &index
, length
) &
328 // since intializePattern is an internal method status is a success.
329 return initializePatternCETable(strsrch
, status
);
333 * Initializing shift tables, with the default values.
334 * If a corresponding default value is 0, the shift table is not set.
335 * @param shift table for forwards shift
336 * @param backshift table for backwards shift
337 * @param cetable table containing pattern ce
338 * @param cesize size of the pattern ces
339 * @param expansionsize total size of the expansions
340 * @param defaultforward the default forward value
341 * @param defaultbackward the default backward value
344 inline void setShiftTable(int16_t shift
[], int16_t backshift
[],
345 int32_t *cetable
, int32_t cesize
,
346 int16_t expansionsize
,
347 int16_t defaultforward
,
348 int16_t defaultbackward
)
350 // estimate the value to shift. to do that we estimate the smallest
351 // number of characters to give the relevant ces, ie approximately
352 // the number of ces minus their expansion, since expansions can come
355 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
356 shift
[count
] = defaultforward
;
358 cesize
--; // down to the last index
359 for (count
= 0; count
< cesize
; count
++) {
360 // number of ces from right of array to the count
361 int temp
= defaultforward
- count
- 1;
362 shift
[hash(cetable
[count
])] = temp
> 1 ? temp
: 1;
364 shift
[hash(cetable
[cesize
])] = 1;
365 // for ignorables we just shift by one. see test examples.
368 for (count
= 0; count
< MAX_TABLE_SIZE_
; count
++) {
369 backshift
[count
] = defaultbackward
;
371 for (count
= cesize
; count
> 0; count
--) {
372 // the original value count does not seem to work
373 backshift
[hash(cetable
[count
])] = count
> expansionsize
?
374 (int16_t)(count
- expansionsize
) : 1;
376 backshift
[hash(cetable
[0])] = 1;
377 backshift
[hash(0)] = 1;
381 * Building of the pattern collation element list and the boyer moore strsrch
383 * The canonical match will only be performed after the default match fails.
384 * For both cases we need to remember the size of the composed and decomposed
385 * versions of the string. Since the Boyer-Moore shift calculations shifts by
386 * a number of characters in the text and tries to match the pattern from that
387 * offset, the shift value can not be too large in case we miss some
388 * characters. To choose a right shift size, we estimate the NFC form of the
389 * pattern and use its size as a shift guide. The NFC form should be the smallest
390 * possible representation of the pattern. Anyways, we'll err on the smaller
391 * shift size. Hence the calculation for minlength.
392 * Canonical match will be performed slightly differently. We'll split the
393 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
394 * the first and last base character (MS), the ending accents (EA). Matches
395 * will be done on MS first, and only when we match MS then some processing
396 * will be required for the prefix and end accents in order to determine if
397 * they match PA and EA. Hence the default shift values
398 * for the canonical match will take the size of either end's accent into
399 * consideration. Forwards search will take the end accents into consideration
400 * for the default shift values and the backwards search will take the prefix
401 * accents into consideration.
402 * If pattern has no non-ignorable ce, we return an illegal argument error.
403 * Internal method, status assumed to be success.
404 * @param strsrch UStringSearch data storage
405 * @param status for output errors if it occurs, status is assumed to be a
406 * success when it is passed in.
409 inline void initialize(UStringSearch
*strsrch
, UErrorCode
*status
)
411 int16_t expandlength
= initializePattern(strsrch
, status
);
412 if (U_SUCCESS(*status
) && strsrch
->pattern
.CELength
> 0) {
413 UPattern
*pattern
= &strsrch
->pattern
;
414 int32_t cesize
= pattern
->CELength
;
416 int16_t minlength
= cesize
> expandlength
417 ? (int16_t)cesize
- expandlength
: 1;
418 pattern
->defaultShiftSize
= minlength
;
419 setShiftTable(pattern
->shift
, pattern
->backShift
, pattern
->CE
,
420 cesize
, expandlength
, minlength
, minlength
);
423 strsrch
->pattern
.defaultShiftSize
= 0;
427 * Determine whether the target text in UStringSearch bounded by the offset
428 * start and end is one or more whole units of text as
429 * determined by the breakiterator in UStringSearch.
430 * @param strsrch string search data
431 * @param start target text start offset
432 * @param end target text end offset
435 inline UBool
isBreakUnit(const UStringSearch
*strsrch
, int32_t start
,
438 #if !UCONFIG_NO_BREAK_ITERATION
439 UBreakIterator
*breakiterator
= strsrch
->search
->breakIter
;
441 int32_t startindex
= ubrk_first(breakiterator
);
442 int32_t endindex
= ubrk_last(breakiterator
);
444 // out-of-range indexes are never boundary positions
445 if (start
< startindex
|| start
> endindex
||
446 end
< startindex
|| end
> endindex
) {
449 // otherwise, we can use following() on the position before the
450 // specified one and return true of the position we get back is the
451 // one the user specified
452 UBool result
= (start
== startindex
||
453 ubrk_following(breakiterator
, start
- 1) == start
) &&
455 ubrk_following(breakiterator
, end
- 1) == end
);
457 // iterates the individual ces
458 UCollationElements
*coleiter
= strsrch
->utilIter
;
459 const UChar
*text
= strsrch
->search
->text
+
461 UErrorCode status
= U_ZERO_ERROR
;
462 ucol_setText(coleiter
, text
, end
- start
, &status
);
463 for (int32_t count
= 0; count
< strsrch
->pattern
.CELength
;
465 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
466 if (ce
== UCOL_IGNORABLE
) {
470 if (U_FAILURE(status
) || ce
!= strsrch
->pattern
.CE
[count
]) {
474 int32_t nextce
= ucol_next(coleiter
, &status
);
475 while (ucol_getOffset(coleiter
) == (end
- start
)
476 && getCE(strsrch
, nextce
) == UCOL_IGNORABLE
) {
477 nextce
= ucol_next(coleiter
, &status
);
479 if (ucol_getOffset(coleiter
) == (end
- start
)
480 && nextce
!= UCOL_NULLORDER
) {
481 // extra collation elements at the end of the match
492 * Getting the next base character offset if current offset is an accent,
493 * or the current offset if the current character contains a base character.
494 * accents the following base character will be returned
496 * @param textoffset current offset
497 * @param textlength length of text string
498 * @return the next base character or the current offset
499 * if the current character contains a base character.
502 inline int32_t getNextBaseOffset(const UChar
*text
,
506 if (textoffset
< textlength
) {
507 int32_t temp
= textoffset
;
508 if (getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
509 while (temp
< textlength
) {
510 int32_t result
= temp
;
511 if ((getFCD(text
, &temp
, textlength
) >>
512 SECOND_LAST_BYTE_SHIFT_
) == 0) {
523 * Gets the next base character offset depending on the string search pattern
525 * @param strsrch string search data
526 * @param textoffset current offset, one offset away from the last character
528 * @return start index of the next base character or the current offset
529 * if the current character contains a base character.
532 inline int32_t getNextUStringSearchBaseOffset(UStringSearch
*strsrch
,
535 int32_t textlength
= strsrch
->search
->textLength
;
536 if (strsrch
->pattern
.hasSuffixAccents
&&
537 textoffset
< textlength
) {
538 int32_t temp
= textoffset
;
539 const UChar
*text
= strsrch
->search
->text
;
540 UTF_BACK_1(text
, 0, temp
);
541 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
542 return getNextBaseOffset(text
, textoffset
, textlength
);
549 * Shifting the collation element iterator position forward to prepare for
550 * a following match. If the last character is an unsafe character, we'll only
551 * shift by 1 to capture contractions, normalization etc.
552 * Internal method, status assumed to be success.
553 * @param text strsrch string search data
554 * @param textoffset start text position to do search
555 * @param ce the text ce which failed the match.
556 * @param patternceindex index of the ce within the pattern ce buffer which
558 * @return final offset
561 inline int32_t shiftForward(UStringSearch
*strsrch
,
564 int32_t patternceindex
)
566 UPattern
*pattern
= &(strsrch
->pattern
);
567 if (ce
!= UCOL_NULLORDER
) {
568 int32_t shift
= pattern
->shift
[hash(ce
)];
569 // this is to adjust for characters in the middle of the
570 // substring for matching that failed.
571 int32_t adjust
= pattern
->CELength
- patternceindex
;
572 if (adjust
> 1 && shift
>= adjust
) {
578 textoffset
+= pattern
->defaultShiftSize
;
581 textoffset
= getNextUStringSearchBaseOffset(strsrch
, textoffset
);
582 // check for unsafe characters
583 // * if it is the start or middle of a contraction: to be done after
584 // a initial match is found
585 // * thai or lao base consonant character: similar to contraction
586 // * high surrogate character: similar to contraction
587 // * next character is a accent: shift to the next base character
592 * sets match not found
593 * @param strsrch string search data
596 inline void setMatchNotFound(UStringSearch
*strsrch
)
598 // this method resets the match result regardless of the error status.
599 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
600 strsrch
->search
->matchedLength
= 0;
601 if (strsrch
->search
->isForwardSearching
) {
602 setColEIterOffset(strsrch
->textIter
, strsrch
->search
->textLength
);
605 setColEIterOffset(strsrch
->textIter
, 0);
610 * Gets the offset to the next safe point in text.
611 * ie. not the middle of a contraction, swappable characters or supplementary
613 * @param collator collation data
614 * @param text string to work with
615 * @param textoffset offset in string
616 * @param textlength length of text string
617 * @return offset to the next safe character
620 inline int32_t getNextSafeOffset(const UCollator
*collator
,
625 int32_t result
= textoffset
; // first contraction character
626 while (result
!= textlength
&& ucol_unsafeCP(text
[result
], collator
)) {
633 * This checks for accents in a potential match that starts with a
634 * composite character.
635 * This is really painful... we have to check that composite character do not
636 * have any extra accents. We have to normalize the potential match and find
637 * the immediate decomposed character before the match.
638 * The first composite character would have been taken care of by the fcd
639 * checks in checkForwardExactMatch.
640 * This is the slow path after the fcd of the first character and
641 * the last character has been checked by checkForwardExactMatch and we
642 * determine that the potential match has extra non-ignorable preceding
644 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
645 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
646 * Note here that accents checking are slow and cautioned in the API docs.
647 * Internal method, status assumed to be a success, caller should check status
648 * before calling this method
649 * @param strsrch string search data
650 * @param start index of the potential unfriendly composite character
651 * @param end index of the potential unfriendly composite character
652 * @param status output error status if any.
653 * @return TRUE if there are non-ignorable accents at the beginning
654 * of the match, FALSE otherwise.
658 UBool
checkExtraMatchAccents(const UStringSearch
*strsrch
, int32_t start
,
662 UBool result
= FALSE
;
663 if (strsrch
->pattern
.hasPrefixAccents
) {
664 int32_t length
= end
- start
;
666 const UChar
*text
= strsrch
->search
->text
+ start
;
668 UTF_FWD_1(text
, offset
, length
);
669 // we are only concerned with the first composite character
670 if (unorm_quickCheck(text
, offset
, UNORM_NFD
, status
) == UNORM_NO
) {
671 int32_t safeoffset
= getNextSafeOffset(strsrch
->collator
,
673 if (safeoffset
!= length
) {
677 UChar buffer
[INITIAL_ARRAY_SIZE_
];
678 int32_t size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0,
679 buffer
, INITIAL_ARRAY_SIZE_
,
681 if (U_FAILURE(*status
)) {
684 if (size
>= INITIAL_ARRAY_SIZE_
) {
685 norm
= (UChar
*)allocateMemory((size
+ 1) * sizeof(UChar
),
687 // if allocation failed, status will be set to
688 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
690 size
= unorm_normalize(text
, safeoffset
, UNORM_NFD
, 0, norm
,
692 if (U_FAILURE(*status
) && norm
!= NULL
) {
701 UCollationElements
*coleiter
= strsrch
->utilIter
;
702 ucol_setText(coleiter
, norm
, size
, status
);
703 uint32_t firstce
= strsrch
->pattern
.CE
[0];
704 UBool ignorable
= TRUE
;
705 uint32_t ce
= UCOL_IGNORABLE
;
706 while (U_SUCCESS(*status
) && ce
!= firstce
) {
707 offset
= ucol_getOffset(coleiter
);
708 if (ce
!= firstce
&& ce
!= UCOL_IGNORABLE
) {
711 ce
= ucol_next(coleiter
, status
);
714 UTF_PREV_CHAR(norm
, 0, offset
, codepoint
);
715 result
= !ignorable
&& (u_getCombiningClass(codepoint
) != 0);
717 if (norm
!= buffer
) {
727 * Used by exact matches, checks if there are accents before the match.
728 * This is really painful... we have to check that composite characters at
729 * the start of the matches have to not have any extra accents.
730 * We check the FCD of the character first, if it starts with an accent and
731 * the first pattern ce does not match the first ce of the character, we bail.
732 * Otherwise we try normalizing the first composite
733 * character and find the immediate decomposed character before the match to
734 * see if it is a non-ignorable accent.
735 * Now normalizing the first composite character is enough because we ensure
736 * that when the match is passed in here with extra beginning ces, the
737 * first or last ce that match has to occur within the first character.
738 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
739 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
740 * Note here that accents checking are slow and cautioned in the API docs.
741 * @param strsrch string search data
742 * @param start offset
744 * @return TRUE if there are accents on either side of the match,
748 UBool
hasAccentsBeforeMatch(const UStringSearch
*strsrch
, int32_t start
,
751 if (strsrch
->pattern
.hasPrefixAccents
) {
752 UCollationElements
*coleiter
= strsrch
->textIter
;
753 UErrorCode status
= U_ZERO_ERROR
;
754 // we have been iterating forwards previously
755 uint32_t ignorable
= TRUE
;
756 int32_t firstce
= strsrch
->pattern
.CE
[0];
758 setColEIterOffset(coleiter
, start
);
759 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
760 if (U_FAILURE(status
)) {
763 while (ce
!= firstce
) {
764 if (ce
!= UCOL_IGNORABLE
) {
767 ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
768 if (U_FAILURE(status
)) {
772 if (!ignorable
&& inNormBuf(coleiter
)) {
773 // within normalization buffer, discontiguous handled here
778 int32_t temp
= start
;
780 // accent = (getFCD(strsrch->search->text, &temp,
781 // strsrch->search->textLength)
782 // >> SECOND_LAST_BYTE_SHIFT_);
783 // however this code does not work well with VC7 .net in release mode.
784 // maybe the inlines for getFCD combined with shifting has bugs in
785 // VC7. anyways this is a work around.
786 UBool accent
= getFCD(strsrch
->search
->text
, &temp
,
787 strsrch
->search
->textLength
) > 0xFF;
789 return checkExtraMatchAccents(strsrch
, start
, end
, &status
);
796 UTF_BACK_1(strsrch
->search
->text
, 0, temp
);
797 if (getFCD(strsrch
->search
->text
, &temp
,
798 strsrch
->search
->textLength
) & LAST_BYTE_MASK_
) {
799 setColEIterOffset(coleiter
, start
);
800 ce
= ucol_previous(coleiter
, &status
);
801 if (U_FAILURE(status
) ||
802 (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
)) {
813 * Used by exact matches, checks if there are accents bounding the match.
814 * Note this is the initial boundary check. If the potential match
815 * starts or ends with composite characters, the accents in those
816 * characters will be determined later.
817 * Not doing backwards iteration here, since discontiguous contractions for
818 * the backwards collation element iterator use up too many characters.
819 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
820 * should fail since there is an acute at the end of \u01FA
821 * Note here that accents checking are slow and cautioned in the API docs.
822 * @param strsrch string search data
823 * @param start offset of match
824 * @param end end offset of the match
825 * @return TRUE if there are accents on either side of the match,
829 UBool
hasAccentsAfterMatch(const UStringSearch
*strsrch
, int32_t start
,
832 if (strsrch
->pattern
.hasSuffixAccents
) {
833 const UChar
*text
= strsrch
->search
->text
;
835 int32_t textlength
= strsrch
->search
->textLength
;
836 UTF_BACK_1(text
, 0, temp
);
837 if (getFCD(text
, &temp
, textlength
) & LAST_BYTE_MASK_
) {
838 int32_t firstce
= strsrch
->pattern
.CE
[0];
839 UCollationElements
*coleiter
= strsrch
->textIter
;
840 UErrorCode status
= U_ZERO_ERROR
;
841 setColEIterOffset(coleiter
, start
);
842 while (getCE(strsrch
, ucol_next(coleiter
, &status
)) != firstce
) {
843 if (U_FAILURE(status
)) {
848 while (count
< strsrch
->pattern
.CELength
) {
849 if (getCE(strsrch
, ucol_next(coleiter
, &status
))
851 // Thai can give an ignorable here.
854 if (U_FAILURE(status
)) {
859 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
860 if (U_FAILURE(status
)) {
863 if (ce
!= UCOL_NULLORDER
&& ce
!= UCOL_IGNORABLE
) {
864 if (ucol_getOffset(coleiter
) <= end
) {
867 if (getFCD(text
, &end
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) {
877 * Checks if the offset runs out of the text string
879 * @param textlength of the text string
880 * @return TRUE if offset is out of bounds, FALSE otherwise
883 inline UBool
isOutOfBounds(int32_t textlength
, int32_t offset
)
885 return offset
< 0 || offset
> textlength
;
889 * Checks for identical match
890 * @param strsrch string search data
891 * @param start offset of possible match
892 * @param end offset of possible match
893 * @return TRUE if identical match is found
896 inline UBool
checkIdentical(const UStringSearch
*strsrch
, int32_t start
,
899 int32_t length
= end
- start
;
900 if (strsrch
->strength
!= UCOL_IDENTICAL
) {
904 UErrorCode status
= U_ZERO_ERROR
;
905 int decomplength
= unorm_decompose(NULL
, -1,
906 strsrch
->search
->text
+ start
, length
,
908 if (decomplength
!= unorm_decompose(NULL
, -1, strsrch
->pattern
.text
,
909 strsrch
->pattern
.textLength
,
910 FALSE
, 0, &status
)) {
914 UChar
*text
= (UChar
*)uprv_malloc(decomplength
* sizeof(UChar
));
915 UChar
*pattern
= (UChar
*)uprv_malloc(decomplength
* sizeof(UChar
));
916 unorm_decompose(text
, decomplength
, strsrch
->search
->text
+ start
,
917 length
, FALSE
, 0, &status
);
918 unorm_decompose(pattern
, decomplength
, strsrch
->pattern
.text
,
919 strsrch
->pattern
.textLength
, FALSE
, 0, &status
);
920 UBool result
= (uprv_memcmp(pattern
, text
, decomplength
* sizeof(UChar
))
928 * Checks to see if the match is repeated
929 * @param strsrch string search data
930 * @param start new match start index
931 * @param end new match end index
932 * @return TRUE if the match is repeated, FALSE otherwise
935 inline UBool
checkRepeatedMatch(UStringSearch
*strsrch
,
939 int32_t lastmatchindex
= strsrch
->search
->matchedIndex
;
941 if (lastmatchindex
== USEARCH_DONE
) {
944 if (strsrch
->search
->isForwardSearching
) {
945 result
= start
<= lastmatchindex
;
948 result
= start
>= lastmatchindex
;
950 if (!result
&& !strsrch
->search
->isOverlap
) {
951 if (strsrch
->search
->isForwardSearching
) {
952 result
= start
< lastmatchindex
+ strsrch
->search
->matchedLength
;
955 result
= end
> lastmatchindex
;
962 * Gets the collation element iterator's current offset.
963 * @param coleiter collation element iterator
964 * @param forwards flag TRUE if we are moving in the forwards direction
965 * @return current offset
968 inline int32_t getColElemIterOffset(const UCollationElements
*coleiter
,
971 int32_t result
= ucol_getOffset(coleiter
);
972 // intricacies of the the backwards collation element iterator
973 if (!forwards
&& inNormBuf(coleiter
) && !isFCDPointerNull(coleiter
)) {
980 * Checks match for contraction.
981 * If the match ends with a partial contraction we fail.
982 * If the match starts too far off (because of backwards iteration) we try to
983 * chip off the extra characters depending on whether a breakiterator has
985 * Internal method, error assumed to be success, caller has to check status
986 * before calling this method.
987 * @param strsrch string search data
988 * @param start offset of potential match, to be modified if necessary
989 * @param end offset of potential match, to be modified if necessary
990 * @param status output error status if any
991 * @return TRUE if match passes the contraction test, FALSE otherwise
995 UBool
checkNextExactContractionMatch(UStringSearch
*strsrch
,
997 int32_t *end
, UErrorCode
*status
)
999 UCollationElements
*coleiter
= strsrch
->textIter
;
1000 int32_t textlength
= strsrch
->search
->textLength
;
1001 int32_t temp
= *start
;
1002 const UCollator
*collator
= strsrch
->collator
;
1003 const UChar
*text
= strsrch
->search
->text
;
1004 // This part checks if either ends of the match contains potential
1005 // contraction. If so we'll have to iterate through them
1006 // The start contraction needs to be checked since ucol_previous dumps
1007 // all characters till the first safe character into the buffer.
1008 // *start + 1 is used to test for the unsafe characters instead of *start
1009 // because ucol_prev takes all unsafe characters till the first safe
1010 // character ie *start. so by testing *start + 1, we can estimate if
1011 // excess prefix characters has been included in the potential search
1013 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1014 (*start
+ 1 < textlength
1015 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1016 int32_t expansion
= getExpansionPrefix(coleiter
);
1017 UBool expandflag
= expansion
> 0;
1018 setColEIterOffset(coleiter
, *start
);
1019 while (expansion
> 0) {
1020 // getting rid of the redundant ce, caused by setOffset.
1021 // since backward contraction/expansion may have extra ces if we
1022 // are in the normalization buffer, hasAccentsBeforeMatch would
1023 // have taken care of it.
1024 // E.g. the character \u01FA will have an expansion of 3, but if
1025 // we are only looking for acute and ring \u030A and \u0301, we'll
1026 // have to skip the first ce in the expansion buffer.
1027 ucol_next(coleiter
, status
);
1028 if (U_FAILURE(*status
)) {
1031 if (ucol_getOffset(coleiter
) != temp
) {
1033 temp
= ucol_getOffset(coleiter
);
1038 int32_t *patternce
= strsrch
->pattern
.CE
;
1039 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1041 while (count
< patterncelength
) {
1042 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1043 if (ce
== UCOL_IGNORABLE
) {
1046 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1048 temp
= ucol_getOffset(coleiter
);
1050 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1052 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1062 * Checks and sets the match information if found.
1065 * <li> the potential match does not repeat the previous match
1066 * <li> boundaries are correct
1067 * <li> exact matches have no extra accents
1068 * <li> identical matches
1069 * <li> potential match does not end in the middle of a contraction
1071 * Otherwise the offset will be shifted to the next character.
1072 * Internal method, status assumed to be success, caller has to check status
1073 * before calling this method.
1074 * @param strsrch string search data
1075 * @param textoffset offset in the collation element text. the returned value
1076 * will be the truncated end offset of the match or the new start
1078 * @param status output error status if any
1079 * @return TRUE if the match is valid, FALSE otherwise
1082 inline UBool
checkNextExactMatch(UStringSearch
*strsrch
,
1083 int32_t *textoffset
, UErrorCode
*status
)
1085 UCollationElements
*coleiter
= strsrch
->textIter
;
1086 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1088 if (!checkNextExactContractionMatch(strsrch
, &start
, textoffset
, status
)) {
1092 // this totally matches, however we need to check if it is repeating
1093 if (!isBreakUnit(strsrch
, start
, *textoffset
) ||
1094 checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1095 hasAccentsBeforeMatch(strsrch
, start
, *textoffset
) ||
1096 !checkIdentical(strsrch
, start
, *textoffset
) ||
1097 hasAccentsAfterMatch(strsrch
, start
, *textoffset
)) {
1100 *textoffset
= getNextUStringSearchBaseOffset(strsrch
, *textoffset
);
1104 // totally match, we will get rid of the ending ignorables.
1105 strsrch
->search
->matchedIndex
= start
;
1106 strsrch
->search
->matchedLength
= *textoffset
- start
;
1111 * Getting the previous base character offset, or the current offset if the
1112 * current character is a base character
1113 * @param text string
1114 * @param textoffset one offset after the current character
1115 * @return the offset of the next character after the base character or the first
1116 * composed character with accents
1119 inline int32_t getPreviousBaseOffset(const UChar
*text
,
1122 if (textoffset
> 0) {
1124 int32_t result
= textoffset
;
1125 UTF_BACK_1(text
, 0, textoffset
);
1126 int32_t temp
= textoffset
;
1127 uint16_t fcd
= getFCD(text
, &temp
, result
);
1128 if ((fcd
>> SECOND_LAST_BYTE_SHIFT_
) == 0) {
1129 if (fcd
& LAST_BYTE_MASK_
) {
1134 if (textoffset
== 0) {
1143 * Getting the indexes of the accents that are not blocked in the argument
1145 * @param accents array of accents in nfd terminated by a 0.
1146 * @param accentsindex array of indexes of the accents that are not blocked
1149 inline int getUnblockedAccentIndex(UChar
*accents
, int32_t *accentsindex
)
1152 int32_t length
= u_strlen(accents
);
1153 UChar32 codepoint
= 0;
1157 while (index
< length
) {
1159 UTF_NEXT_CHAR(accents
, index
, length
, codepoint
);
1160 if (u_getCombiningClass(codepoint
) != cclass
) {
1161 cclass
= u_getCombiningClass(codepoint
);
1162 accentsindex
[result
] = temp
;
1166 accentsindex
[result
] = length
;
1171 * Appends 3 UChar arrays to a destination array.
1172 * Creates a new array if we run out of space. The caller will have to
1173 * manually deallocate the newly allocated array.
1174 * Internal method, status assumed to be success, caller has to check status
1175 * before calling this method. destination not to be NULL and has at least
1176 * size destinationlength.
1177 * @param destination target array
1178 * @param destinationlength target array size, returning the appended length
1179 * @param source1 null-terminated first array
1180 * @param source2 second array
1181 * @param source2length length of second array
1182 * @param source3 null-terminated third array
1183 * @param status error status if any
1184 * @return new destination array, destination if there was no new allocation
1187 inline UChar
* addToUCharArray( UChar
*destination
,
1188 int32_t *destinationlength
,
1189 const UChar
*source1
,
1190 const UChar
*source2
,
1191 int32_t source2length
,
1192 const UChar
*source3
,
1195 int32_t source1length
= source1
? u_strlen(source1
) : 0;
1196 int32_t source3length
= source3
? u_strlen(source3
) : 0;
1197 if (*destinationlength
< source1length
+ source2length
+ source3length
+
1200 destination
= (UChar
*)allocateMemory(
1201 (source1length
+ source2length
+ source3length
+ 1) * sizeof(UChar
),
1203 // if error allocating memory, status will be
1204 // U_MEMORY_ALLOCATION_ERROR
1205 if (U_FAILURE(*status
)) {
1206 *destinationlength
= 0;
1210 if (source1length
!= 0) {
1211 uprv_memcpy(destination
, source1
, sizeof(UChar
) * source1length
);
1213 if (source2length
!= 0) {
1214 uprv_memcpy(destination
+ source1length
, source2
,
1215 sizeof(UChar
) * source2length
);
1217 if (source3length
!= 0) {
1218 uprv_memcpy(destination
+ source1length
+ source2length
, source3
,
1219 sizeof(UChar
) * source3length
);
1221 *destinationlength
= source1length
+ source2length
+ source3length
;
1226 * Runs through a collation element iterator to see if its contents match
1227 * the pattern in the string search data
1228 * @param strsrch string search data
1229 * @param coleiter collation element iterator
1230 * @return TRUE if a match is found, FALSE otherwise
1233 inline UBool
checkCollationMatch(const UStringSearch
*strsrch
,
1234 UCollationElements
*coleiter
)
1236 int patternceindex
= strsrch
->pattern
.CELength
;
1237 int32_t *patternce
= strsrch
->pattern
.CE
;
1238 UErrorCode status
= U_ZERO_ERROR
;
1239 while (patternceindex
> 0) {
1240 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, &status
));
1241 if (ce
== UCOL_IGNORABLE
) {
1244 if (U_FAILURE(status
) || ce
!= *patternce
) {
1254 * Rearranges the front accents to try matching.
1255 * Prefix accents in the text will be grouped according to their combining
1256 * class and the groups will be mixed and matched to try find the perfect
1257 * match with the pattern.
1258 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1259 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1260 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1262 * step 2: check if any of the generated substrings matches the pattern.
1263 * Internal method, status is assumed to be success, caller has to check status
1264 * before calling this method.
1265 * @param strsrch string search match
1266 * @param start first offset of the accents to start searching
1267 * @param end start of the last accent set
1268 * @param status output error status if any
1269 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1270 * offset of the match. Note this start includes all preceding accents.
1273 int32_t doNextCanonicalPrefixMatch(UStringSearch
*strsrch
,
1278 const UChar
*text
= strsrch
->search
->text
;
1279 int32_t textlength
= strsrch
->search
->textLength
;
1280 int32_t tempstart
= start
;
1282 if ((getFCD(text
, &tempstart
, textlength
) & LAST_BYTE_MASK_
) == 0) {
1283 // die... failed at a base character
1284 return USEARCH_DONE
;
1287 int32_t offset
= getNextBaseOffset(text
, tempstart
, textlength
);
1288 start
= getPreviousBaseOffset(text
, tempstart
);
1290 UChar accents
[INITIAL_ARRAY_SIZE_
];
1291 // normalizing the offensive string
1292 unorm_normalize(text
+ start
, offset
- start
, UNORM_NFD
, 0, accents
,
1293 INITIAL_ARRAY_SIZE_
, status
);
1294 if (U_FAILURE(*status
)) {
1295 return USEARCH_DONE
;
1298 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1299 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1301 int32_t count
= (2 << (accentsize
- 1)) - 1;
1302 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1303 UCollationElements
*coleiter
= strsrch
->utilIter
;
1304 while (U_SUCCESS(*status
) && count
> 0) {
1305 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
1306 // copy the base characters
1307 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1308 *rearrange
++ = accents
[k
];
1310 // forming all possible canonical rearrangement by dropping
1312 for (int i
= 0; i
<= accentsize
- 1; i
++) {
1313 int32_t mask
= 1 << (accentsize
- i
- 1);
1315 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1316 *rearrange
++ = accents
[j
];
1321 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
1322 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
1323 strsrch
->canonicalPrefixAccents
,
1324 strsrch
->search
->text
+ offset
,
1326 strsrch
->canonicalSuffixAccents
,
1329 // if status is a failure, ucol_setText does nothing.
1330 // run the collator iterator through this match
1331 ucol_setText(coleiter
, match
, matchsize
, status
);
1332 if (U_SUCCESS(*status
)) {
1333 if (checkCollationMatch(strsrch
, coleiter
)) {
1334 if (match
!= buffer
) {
1342 return USEARCH_DONE
;
1346 * Gets the offset to the safe point in text before textoffset.
1347 * ie. not the middle of a contraction, swappable characters or supplementary
1349 * @param collator collation data
1350 * @param text string to work with
1351 * @param textoffset offset in string
1352 * @param textlength length of text string
1353 * @return offset to the previous safe character
1356 inline uint32_t getPreviousSafeOffset(const UCollator
*collator
,
1360 int32_t result
= textoffset
; // first contraction character
1361 while (result
!= 0 && ucol_unsafeCP(text
[result
- 1], collator
)) {
1365 // the first contraction character is consider unsafe here
1372 * Cleaning up after we passed the safe zone
1373 * @param strsrch string search data
1374 * @param safetext safe text array
1375 * @param safebuffer safe text buffer
1376 * @param coleiter collation element iterator for safe text
1379 inline void cleanUpSafeText(const UStringSearch
*strsrch
, UChar
*safetext
,
1382 if (safetext
!= safebuffer
&& safetext
!= strsrch
->canonicalSuffixAccents
)
1384 uprv_free(safetext
);
1389 * Takes the rearranged end accents and tries matching. If the match failed at
1390 * a separate preceding set of accents (separated from the rearranged one by
1391 * at least a base character) then we rearrange the preceding accents and
1392 * try matching again.
1393 * We allow skipping of the ends of the accent set if the ces do not match.
1394 * However if the failure is found before the accent set, it fails.
1395 * Internal method, status assumed to be success, caller has to check status
1396 * before calling this method.
1397 * @param strsrch string search data
1398 * @param textoffset of the start of the rearranged accent
1399 * @param status output error status if any
1400 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1401 * offset of the match. Note this start includes all preceding accents.
1404 int32_t doNextCanonicalSuffixMatch(UStringSearch
*strsrch
,
1408 const UChar
*text
= strsrch
->search
->text
;
1409 const UCollator
*collator
= strsrch
->collator
;
1410 int32_t safelength
= 0;
1412 int32_t safetextlength
;
1413 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
1414 UCollationElements
*coleiter
= strsrch
->utilIter
;
1415 int32_t safeoffset
= textoffset
;
1417 if (textoffset
!= 0 && ucol_unsafeCP(strsrch
->canonicalSuffixAccents
[0],
1419 safeoffset
= getPreviousSafeOffset(collator
, text
, textoffset
);
1420 safelength
= textoffset
- safeoffset
;
1421 safetextlength
= INITIAL_ARRAY_SIZE_
;
1422 safetext
= addToUCharArray(safebuffer
, &safetextlength
, NULL
,
1423 text
+ safeoffset
, safelength
,
1424 strsrch
->canonicalSuffixAccents
,
1428 safetextlength
= u_strlen(strsrch
->canonicalSuffixAccents
);
1429 safetext
= strsrch
->canonicalSuffixAccents
;
1432 // if status is a failure, ucol_setText does nothing
1433 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
1434 // status checked in loop below
1436 int32_t *ce
= strsrch
->pattern
.CE
;
1437 int32_t celength
= strsrch
->pattern
.CELength
;
1438 int ceindex
= celength
- 1;
1439 UBool isSafe
= TRUE
; // indication flag for position in safe zone
1441 while (ceindex
>= 0) {
1442 int32_t textce
= ucol_previous(coleiter
, status
);
1443 if (U_FAILURE(*status
)) {
1445 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1447 return USEARCH_DONE
;
1449 if (textce
== UCOL_NULLORDER
) {
1450 // check if we have passed the safe buffer
1451 if (coleiter
== strsrch
->textIter
) {
1452 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1453 return USEARCH_DONE
;
1455 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1456 safetext
= safebuffer
;
1457 coleiter
= strsrch
->textIter
;
1458 setColEIterOffset(coleiter
, safeoffset
);
1459 // status checked at the start of the loop
1463 textce
= getCE(strsrch
, textce
);
1464 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
1465 // do the beginning stuff
1466 int32_t failedoffset
= getColElemIterOffset(coleiter
, FALSE
);
1467 if (isSafe
&& failedoffset
>= safelength
) {
1468 // alas... no hope. failed at rearranged accent set
1469 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1470 return USEARCH_DONE
;
1474 failedoffset
+= safeoffset
;
1475 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1478 // try rearranging the front accents
1479 int32_t result
= doNextCanonicalPrefixMatch(strsrch
,
1480 failedoffset
, textoffset
, status
);
1481 if (result
!= USEARCH_DONE
) {
1482 // if status is a failure, ucol_setOffset does nothing
1483 setColEIterOffset(strsrch
->textIter
, result
);
1485 if (U_FAILURE(*status
)) {
1486 return USEARCH_DONE
;
1491 if (textce
== ce
[ceindex
]) {
1497 int32_t result
= getColElemIterOffset(coleiter
, FALSE
);
1498 // sets the text iterator here with the correct expansion and offset
1499 int32_t leftoverces
= getExpansionPrefix(coleiter
);
1500 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
1501 if (result
>= safelength
) {
1502 result
= textoffset
;
1505 result
+= safeoffset
;
1507 setColEIterOffset(strsrch
->textIter
, result
);
1508 strsrch
->textIter
->iteratordata_
.toReturn
=
1509 setExpansionPrefix(strsrch
->textIter
, leftoverces
);
1513 return ucol_getOffset(coleiter
);
1517 * Tries out the substring and sees if it can be a canonical match.
1518 * This will try normalizing the end accents and arranging them into canonical
1519 * equivalents and check their corresponding ces with the pattern ce.
1520 * Suffix accents in the text will be grouped according to their combining
1521 * class and the groups will be mixed and matched to try find the perfect
1522 * match with the pattern.
1523 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1524 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1525 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1527 * step 2: check if any of the generated substrings matches the pattern.
1528 * Internal method, status assumed to be success, caller has to check status
1529 * before calling this method.
1530 * @param strsrch string search data
1531 * @param textoffset end offset in the collation element text that ends with
1532 * the accents to be rearranged
1533 * @param status error status if any
1534 * @return TRUE if the match is valid, FALSE otherwise
1537 UBool
doNextCanonicalMatch(UStringSearch
*strsrch
,
1541 const UChar
*text
= strsrch
->search
->text
;
1542 int32_t temp
= textoffset
;
1543 UTF_BACK_1(text
, 0, temp
);
1544 if ((getFCD(text
, &temp
, textoffset
) & LAST_BYTE_MASK_
) == 0) {
1545 UCollationElements
*coleiter
= strsrch
->textIter
;
1546 int32_t offset
= getColElemIterOffset(coleiter
, FALSE
);
1547 if (strsrch
->pattern
.hasPrefixAccents
) {
1548 offset
= doNextCanonicalPrefixMatch(strsrch
, offset
, textoffset
,
1550 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
1551 setColEIterOffset(coleiter
, offset
);
1558 if (!strsrch
->pattern
.hasSuffixAccents
) {
1562 UChar accents
[INITIAL_ARRAY_SIZE_
];
1563 // offset to the last base character in substring to search
1564 int32_t baseoffset
= getPreviousBaseOffset(text
, textoffset
);
1565 // normalizing the offensive string
1566 unorm_normalize(text
+ baseoffset
, textoffset
- baseoffset
, UNORM_NFD
,
1567 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
1568 // status checked in loop below
1570 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1571 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
1573 // 2 power n - 1 plus the full set of accents
1574 int32_t count
= (2 << (size
- 1)) - 1;
1575 while (U_SUCCESS(*status
) && count
> 0) {
1576 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
1577 // copy the base characters
1578 for (int k
= 0; k
< accentsindex
[0]; k
++) {
1579 *rearrange
++ = accents
[k
];
1581 // forming all possible canonical rearrangement by dropping
1583 for (int i
= 0; i
<= size
- 1; i
++) {
1584 int32_t mask
= 1 << (size
- i
- 1);
1586 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
1587 *rearrange
++ = accents
[j
];
1592 int32_t offset
= doNextCanonicalSuffixMatch(strsrch
, baseoffset
,
1594 if (offset
!= USEARCH_DONE
) {
1595 return TRUE
; // match found
1603 * Gets the previous base character offset depending on the string search
1605 * @param strsrch string search data
1606 * @param textoffset current offset, current character
1607 * @return the offset of the next character after this base character or itself
1608 * if it is a composed character with accents
1611 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch
*strsrch
,
1614 if (strsrch
->pattern
.hasPrefixAccents
&& textoffset
> 0) {
1615 const UChar
*text
= strsrch
->search
->text
;
1616 int32_t offset
= textoffset
;
1617 if (getFCD(text
, &offset
, strsrch
->search
->textLength
) >>
1618 SECOND_LAST_BYTE_SHIFT_
) {
1619 return getPreviousBaseOffset(text
, textoffset
);
1626 * Checks match for contraction.
1627 * If the match ends with a partial contraction we fail.
1628 * If the match starts too far off (because of backwards iteration) we try to
1629 * chip off the extra characters
1630 * Internal method, status assumed to be success, caller has to check status
1631 * before calling this method.
1632 * @param strsrch string search data
1633 * @param start offset of potential match, to be modified if necessary
1634 * @param end offset of potential match, to be modified if necessary
1635 * @param status output error status if any
1636 * @return TRUE if match passes the contraction test, FALSE otherwise
1639 UBool
checkNextCanonicalContractionMatch(UStringSearch
*strsrch
,
1644 UCollationElements
*coleiter
= strsrch
->textIter
;
1645 int32_t textlength
= strsrch
->search
->textLength
;
1646 int32_t temp
= *start
;
1647 const UCollator
*collator
= strsrch
->collator
;
1648 const UChar
*text
= strsrch
->search
->text
;
1649 // This part checks if either ends of the match contains potential
1650 // contraction. If so we'll have to iterate through them
1651 if ((*end
< textlength
&& ucol_unsafeCP(text
[*end
], collator
)) ||
1652 (*start
+ 1 < textlength
1653 && ucol_unsafeCP(text
[*start
+ 1], collator
))) {
1654 int32_t expansion
= getExpansionPrefix(coleiter
);
1655 UBool expandflag
= expansion
> 0;
1656 setColEIterOffset(coleiter
, *start
);
1657 while (expansion
> 0) {
1658 // getting rid of the redundant ce, caused by setOffset.
1659 // since backward contraction/expansion may have extra ces if we
1660 // are in the normalization buffer, hasAccentsBeforeMatch would
1661 // have taken care of it.
1662 // E.g. the character \u01FA will have an expansion of 3, but if
1663 // we are only looking for acute and ring \u030A and \u0301, we'll
1664 // have to skip the first ce in the expansion buffer.
1665 ucol_next(coleiter
, status
);
1666 if (U_FAILURE(*status
)) {
1669 if (ucol_getOffset(coleiter
) != temp
) {
1671 temp
= ucol_getOffset(coleiter
);
1676 int32_t *patternce
= strsrch
->pattern
.CE
;
1677 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1679 int32_t textlength
= strsrch
->search
->textLength
;
1680 while (count
< patterncelength
) {
1681 int32_t ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1682 // status checked below, note that if status is a failure
1683 // ucol_next returns UCOL_NULLORDER
1684 if (ce
== UCOL_IGNORABLE
) {
1687 if (expandflag
&& count
== 0 && ucol_getOffset(coleiter
) != temp
) {
1689 temp
= ucol_getOffset(coleiter
);
1692 if (count
== 0 && ce
!= patternce
[0]) {
1693 // accents may have extra starting ces, this occurs when a
1694 // pure accent pattern is matched without rearrangement
1695 // text \u0325\u0300 and looking for \u0300
1696 int32_t expected
= patternce
[0];
1697 if (getFCD(text
, start
, textlength
) & LAST_BYTE_MASK_
) {
1698 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1699 while (U_SUCCESS(*status
) && ce
!= expected
&&
1700 ce
!= UCOL_NULLORDER
&&
1701 ucol_getOffset(coleiter
) <= *end
) {
1702 ce
= getCE(strsrch
, ucol_next(coleiter
, status
));
1706 if (U_FAILURE(*status
) || ce
!= patternce
[count
]) {
1708 *end
= getNextUStringSearchBaseOffset(strsrch
, *end
);
1718 * Checks and sets the match information if found.
1721 * <li> the potential match does not repeat the previous match
1722 * <li> boundaries are correct
1723 * <li> potential match does not end in the middle of a contraction
1724 * <li> identical matches
1726 * Otherwise the offset will be shifted to the next character.
1727 * Internal method, status assumed to be success, caller has to check the
1728 * status before calling this method.
1729 * @param strsrch string search data
1730 * @param textoffset offset in the collation element text. the returned value
1731 * will be the truncated end offset of the match or the new start
1733 * @param status output error status if any
1734 * @return TRUE if the match is valid, FALSE otherwise
1737 inline UBool
checkNextCanonicalMatch(UStringSearch
*strsrch
,
1738 int32_t *textoffset
,
1741 // to ensure that the start and ends are not composite characters
1742 UCollationElements
*coleiter
= strsrch
->textIter
;
1743 // if we have a canonical accent match
1744 if ((strsrch
->pattern
.hasSuffixAccents
&&
1745 strsrch
->canonicalSuffixAccents
[0]) ||
1746 (strsrch
->pattern
.hasPrefixAccents
&&
1747 strsrch
->canonicalPrefixAccents
[0])) {
1748 strsrch
->search
->matchedIndex
= getPreviousUStringSearchBaseOffset(
1750 ucol_getOffset(coleiter
));
1751 strsrch
->search
->matchedLength
= *textoffset
-
1752 strsrch
->search
->matchedIndex
;
1756 int32_t start
= getColElemIterOffset(coleiter
, FALSE
);
1757 if (!checkNextCanonicalContractionMatch(strsrch
, &start
, textoffset
,
1758 status
) || U_FAILURE(*status
)) {
1762 start
= getPreviousUStringSearchBaseOffset(strsrch
, start
);
1763 // this totally matches, however we need to check if it is repeating
1764 if (checkRepeatedMatch(strsrch
, start
, *textoffset
) ||
1765 !isBreakUnit(strsrch
, start
, *textoffset
) ||
1766 !checkIdentical(strsrch
, start
, *textoffset
)) {
1768 *textoffset
= getNextBaseOffset(strsrch
->search
->text
, *textoffset
,
1769 strsrch
->search
->textLength
);
1773 strsrch
->search
->matchedIndex
= start
;
1774 strsrch
->search
->matchedLength
= *textoffset
- start
;
1779 * Shifting the collation element iterator position forward to prepare for
1780 * a preceding match. If the first character is an unsafe character, we'll only
1781 * shift by 1 to capture contractions, normalization etc.
1782 * Internal method, status assumed to be success, caller has to check status
1783 * before calling this method.
1784 * @param strsrch string search data
1785 * @param textoffset start text position to do search
1786 * @param ce the text ce which failed the match.
1787 * @param patternceindex index of the ce within the pattern ce buffer which
1789 * @return final offset
1792 inline int32_t reverseShift(UStringSearch
*strsrch
,
1795 int32_t patternceindex
)
1797 if (strsrch
->search
->isOverlap
) {
1798 if (textoffset
!= strsrch
->search
->textLength
) {
1802 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1806 if (ce
!= UCOL_NULLORDER
) {
1807 int32_t shift
= strsrch
->pattern
.backShift
[hash(ce
)];
1809 // this is to adjust for characters in the middle of the substring
1810 // for matching that failed.
1811 int32_t adjust
= patternceindex
;
1812 if (adjust
> 1 && shift
> adjust
) {
1813 shift
-= adjust
- 1;
1815 textoffset
-= shift
;
1818 textoffset
-= strsrch
->pattern
.defaultShiftSize
;
1821 textoffset
= getPreviousUStringSearchBaseOffset(strsrch
, textoffset
);
1826 * Checks match for contraction.
1827 * If the match starts with a partial contraction we fail.
1828 * Internal method, status assumed to be success, caller has to check status
1829 * before calling this method.
1830 * @param strsrch string search data
1831 * @param start offset of potential match, to be modified if necessary
1832 * @param end offset of potential match, to be modified if necessary
1833 * @param status output error status if any
1834 * @return TRUE if match passes the contraction test, FALSE otherwise
1837 UBool
checkPreviousExactContractionMatch(UStringSearch
*strsrch
,
1839 int32_t *end
, UErrorCode
*status
)
1841 UCollationElements
*coleiter
= strsrch
->textIter
;
1842 int32_t textlength
= strsrch
->search
->textLength
;
1843 int32_t temp
= *end
;
1844 const UCollator
*collator
= strsrch
->collator
;
1845 const UChar
*text
= strsrch
->search
->text
;
1846 // This part checks if either if the start of the match contains potential
1847 // contraction. If so we'll have to iterate through them
1848 // Since we used ucol_next while previously looking for the potential
1849 // match, this guarantees that our end will not be a partial contraction,
1850 // or a partial supplementary character.
1851 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
1852 int32_t expansion
= getExpansionSuffix(coleiter
);
1853 UBool expandflag
= expansion
> 0;
1854 setColEIterOffset(coleiter
, *end
);
1855 while (U_SUCCESS(*status
) && expansion
> 0) {
1856 // getting rid of the redundant ce
1857 // since forward contraction/expansion may have extra ces
1858 // if we are in the normalization buffer, hasAccentsBeforeMatch
1859 // would have taken care of it.
1860 // E.g. the character \u01FA will have an expansion of 3, but if
1861 // we are only looking for A ring A\u030A, we'll have to skip the
1862 // last ce in the expansion buffer
1863 ucol_previous(coleiter
, status
);
1864 if (U_FAILURE(*status
)) {
1867 if (ucol_getOffset(coleiter
) != temp
) {
1869 temp
= ucol_getOffset(coleiter
);
1874 int32_t *patternce
= strsrch
->pattern
.CE
;
1875 int32_t patterncelength
= strsrch
->pattern
.CELength
;
1876 int32_t count
= patterncelength
;
1878 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
1879 // status checked below, note that if status is a failure
1880 // ucol_previous returns UCOL_NULLORDER
1881 if (ce
== UCOL_IGNORABLE
) {
1884 if (expandflag
&& count
== 0 &&
1885 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
1887 temp
= ucol_getOffset(coleiter
);
1889 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
1891 *start
= getPreviousBaseOffset(text
, *start
);
1901 * Checks and sets the match information if found.
1904 * <li> the current match does not repeat the last match
1905 * <li> boundaries are correct
1906 * <li> exact matches have no extra accents
1907 * <li> identical matches
1909 * Otherwise the offset will be shifted to the preceding character.
1910 * Internal method, status assumed to be success, caller has to check status
1911 * before calling this method.
1912 * @param strsrch string search data
1914 * @param coleiter collation element iterator
1915 * @param text string
1916 * @param textoffset offset in the collation element text. the returned value
1917 * will be the truncated start offset of the match or the new start
1919 * @param status output error status if any
1920 * @return TRUE if the match is valid, FALSE otherwise
1923 inline UBool
checkPreviousExactMatch(UStringSearch
*strsrch
,
1924 int32_t *textoffset
,
1927 // to ensure that the start and ends are not composite characters
1928 int32_t end
= ucol_getOffset(strsrch
->textIter
);
1929 if (!checkPreviousExactContractionMatch(strsrch
, textoffset
, &end
, status
)
1930 || U_FAILURE(*status
)) {
1934 // this totally matches, however we need to check if it is repeating
1936 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
1937 !isBreakUnit(strsrch
, *textoffset
, end
) ||
1938 hasAccentsBeforeMatch(strsrch
, *textoffset
, end
) ||
1939 !checkIdentical(strsrch
, *textoffset
, end
) ||
1940 hasAccentsAfterMatch(strsrch
, *textoffset
, end
)) {
1942 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
1946 strsrch
->search
->matchedIndex
= *textoffset
;
1947 strsrch
->search
->matchedLength
= end
- *textoffset
;
1952 * Rearranges the end accents to try matching.
1953 * Suffix accents in the text will be grouped according to their combining
1954 * class and the groups will be mixed and matched to try find the perfect
1955 * match with the pattern.
1956 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1957 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1958 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1960 * step 2: check if any of the generated substrings matches the pattern.
1961 * Internal method, status assumed to be success, user has to check status
1962 * before calling this method.
1963 * @param strsrch string search match
1964 * @param start offset of the first base character
1965 * @param end start of the last accent set
1966 * @param status output error status if any
1967 * @return USEARCH_DONE if a match is not found, otherwise return the ending
1968 * offset of the match. Note this start includes all following accents.
1971 int32_t doPreviousCanonicalSuffixMatch(UStringSearch
*strsrch
,
1976 const UChar
*text
= strsrch
->search
->text
;
1977 int32_t tempend
= end
;
1979 UTF_BACK_1(text
, 0, tempend
);
1980 if (!(getFCD(text
, &tempend
, strsrch
->search
->textLength
) &
1982 // die... failed at a base character
1983 return USEARCH_DONE
;
1985 end
= getNextBaseOffset(text
, end
, strsrch
->search
->textLength
);
1987 if (U_SUCCESS(*status
)) {
1988 UChar accents
[INITIAL_ARRAY_SIZE_
];
1989 int32_t offset
= getPreviousBaseOffset(text
, end
);
1990 // normalizing the offensive string
1991 unorm_normalize(text
+ offset
, end
- offset
, UNORM_NFD
, 0, accents
,
1992 INITIAL_ARRAY_SIZE_
, status
);
1994 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
1995 int32_t accentsize
= getUnblockedAccentIndex(accents
,
1997 int32_t count
= (2 << (accentsize
- 1)) - 1;
1998 UChar buffer
[INITIAL_ARRAY_SIZE_
];
1999 UCollationElements
*coleiter
= strsrch
->utilIter
;
2000 while (U_SUCCESS(*status
) && count
> 0) {
2001 UChar
*rearrange
= strsrch
->canonicalSuffixAccents
;
2002 // copy the base characters
2003 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2004 *rearrange
++ = accents
[k
];
2006 // forming all possible canonical rearrangement by dropping
2008 for (int i
= 0; i
<= accentsize
- 1; i
++) {
2009 int32_t mask
= 1 << (accentsize
- i
- 1);
2011 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2012 *rearrange
++ = accents
[j
];
2017 int32_t matchsize
= INITIAL_ARRAY_SIZE_
;
2018 UChar
*match
= addToUCharArray(buffer
, &matchsize
,
2019 strsrch
->canonicalPrefixAccents
,
2020 strsrch
->search
->text
+ start
,
2022 strsrch
->canonicalSuffixAccents
,
2025 // run the collator iterator through this match
2026 // if status is a failure ucol_setText does nothing
2027 ucol_setText(coleiter
, match
, matchsize
, status
);
2028 if (U_SUCCESS(*status
)) {
2029 if (checkCollationMatch(strsrch
, coleiter
)) {
2030 if (match
!= buffer
) {
2039 return USEARCH_DONE
;
2043 * Takes the rearranged start accents and tries matching. If the match failed at
2044 * a separate following set of accents (separated from the rearranged one by
2045 * at least a base character) then we rearrange the preceding accents and
2046 * try matching again.
2047 * We allow skipping of the ends of the accent set if the ces do not match.
2048 * However if the failure is found before the accent set, it fails.
2049 * Internal method, status assumed to be success, caller has to check status
2050 * before calling this method.
2051 * @param strsrch string search data
2052 * @param textoffset of the ends of the rearranged accent
2053 * @param status output error status if any
2054 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2055 * offset of the match. Note this start includes all following accents.
// Matches the pattern CEs (strsrch->pattern.CE) against the rearranged prefix
// accents held in strsrch->canonicalPrefixAccents, iterating with
// strsrch->utilIter and, once that buffer is exhausted, continuing on
// strsrch->textIter from safeoffset. On a CE mismatch beyond the rearranged
// accents it falls back to doPreviousCanonicalSuffixMatch.
// NOTE(review): parts of the signature, the declarations of safetext and
// ceindex, and several braces/else lines are not visible in this extract;
// comments below only describe what the visible lines do.
2058 int32_t doPreviousCanonicalPrefixMatch(UStringSearch
*strsrch
,
2062 const UChar
*text
= strsrch
->search
->text
;
2063 const UCollator
*collator
= strsrch
->collator
;
2064 int32_t safelength
= 0;
2066 int32_t safetextlength
;
2067 UChar safebuffer
[INITIAL_ARRAY_SIZE_
];
2068 int32_t safeoffset
= textoffset
;
// Inspects the last code unit of the rearranged prefix accents with
// ucol_unsafeCP; the safe-offset extension that follows presumably runs only
// when that test succeeds (the end of the condition is not visible here).
2071 ucol_unsafeCP(strsrch
->canonicalPrefixAccents
[
2072 u_strlen(strsrch
->canonicalPrefixAccents
) - 1
2074 safeoffset
= getNextSafeOffset(collator
, text
, textoffset
,
2075 strsrch
->search
->textLength
);
2076 safelength
= safeoffset
- textoffset
;
2077 safetextlength
= INITIAL_ARRAY_SIZE_
;
// Build the working text: rearranged accents followed by the text between
// textoffset and safeoffset.
2078 safetext
= addToUCharArray(safebuffer
, &safetextlength
,
2079 strsrch
->canonicalPrefixAccents
,
2080 text
+ textoffset
, safelength
,
// Alternative setup (presumably the else branch): iterate over the rearranged
// accents alone, without copying.
2084 safetextlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
2085 safetext
= strsrch
->canonicalPrefixAccents
;
2088 UCollationElements
*coleiter
= strsrch
->utilIter
;
2089 // if status is a failure, ucol_setText does nothing
2090 ucol_setText(coleiter
, safetext
, safetextlength
, status
);
2091 // status checked in loop below
2093 int32_t *ce
= strsrch
->pattern
.CE
;
2094 int32_t celength
= strsrch
->pattern
.CELength
;
2096 UBool isSafe
= TRUE
; // safe zone indication flag for position
2097 int32_t prefixlength
= u_strlen(strsrch
->canonicalPrefixAccents
);
// Consume text CEs until every pattern CE has been matched.
2099 while (ceindex
< celength
) {
2100 int32_t textce
= ucol_next(coleiter
, status
);
2101 if (U_FAILURE(*status
)) {
2103 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2105 return USEARCH_DONE
;
2107 if (textce
== UCOL_NULLORDER
) {
2108 // check if we have passed the safe buffer
2109 if (coleiter
== strsrch
->textIter
) {
2110 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2111 return USEARCH_DONE
;
// The working buffer is exhausted: release it and continue on the text
// iterator, starting at safeoffset.
2113 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2114 safetext
= safebuffer
;
2115 coleiter
= strsrch
->textIter
;
2116 setColEIterOffset(coleiter
, safeoffset
);
2117 // status checked at the start of the loop
2121 textce
= getCE(strsrch
, textce
);
// A non-ignorable CE that differs from the expected pattern CE is a mismatch.
2122 if (textce
!= UCOL_IGNORABLE
&& textce
!= ce
[ceindex
]) {
2123 // do the beginning stuff
2124 int32_t failedoffset
= ucol_getOffset(coleiter
);
2125 if (isSafe
&& failedoffset
<= prefixlength
) {
2126 // alas... no hope. failed at rearranged accent set
2127 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2128 return USEARCH_DONE
;
2132 failedoffset
= safeoffset
- failedoffset
;
2133 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
2136 // try rearranging the end accents
2137 int32_t result
= doPreviousCanonicalSuffixMatch(strsrch
,
2138 textoffset
, failedoffset
, status
);
2139 if (result
!= USEARCH_DONE
) {
2140 // if status is a failure, ucol_setOffset does nothing
2141 setColEIterOffset(strsrch
->textIter
, result
);
2143 if (U_FAILURE(*status
)) {
2144 return USEARCH_DONE
;
// Text CE equals the current pattern CE; the statement that advances ceindex
// is presumably on a line not visible here.
2149 if (textce
== ce
[ceindex
]) {
// Pattern fully matched: capture the iterator offset and the pending
// expansion CEs, release the working buffer, then position textIter.
2155 int32_t result
= ucol_getOffset(coleiter
);
2156 // sets the text iterator here with the correct expansion and offset
2157 int32_t leftoverces
= getExpansionSuffix(coleiter
);
2158 cleanUpSafeText(strsrch
, safetext
, safebuffer
);
// An offset inside the rearranged accents maps back to textoffset itself.
2159 if (result
<= prefixlength
) {
2160 result
= textoffset
;
2163 result
= textoffset
+ (safeoffset
- result
);
2165 setColEIterOffset(strsrch
->textIter
, result
);
2166 setExpansionSuffix(strsrch
->textIter
, leftoverces
);
2170 return ucol_getOffset(coleiter
);
2174 * Trying out the substring and sees if it can be a canonical match.
2175 * This will try normalizing the starting accents and arranging them into
2176 * canonical equivalents and check their corresponding ces with the pattern ce.
2177 * Prefix accents in the text will be grouped according to their combining
2178 * class and the groups will be mixed and matched to try find the perfect
2179 * match with the pattern.
2180 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2181 * step 1: split "\u030A\u0301" into 6 other types of potential accent substrings
2182 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2184 * step 2: check if any of the generated substrings matches the pattern.
2185 * Internal method, status assumed to be success, caller has to check status
2186 * before calling this method.
2187 * @param strsrch string search data
2188 * @param textoffset start offset in the collation element text that starts
2189 * with the accents to be rearranged
2190 * @param status output error status if any
2191 * @return TRUE if the match is valid, FALSE otherwise
// Tries to find a canonically equivalent match for the accents that start at
// textoffset. The accents between textoffset and the next base offset are
// normalized to NFD, split into groups by getUnblockedAccentIndex, and every
// combination of groups is copied into strsrch->canonicalPrefixAccents and
// handed to doPreviousCanonicalPrefixMatch until one matches.
// NOTE(review): parts of the signature, several return statements and the
// test of `mask` against `count` are on lines not visible in this extract.
2194 UBool
doPreviousCanonicalMatch(UStringSearch
*strsrch
,
2198 const UChar
*text
= strsrch
->search
->text
;
2199 int32_t temp
= textoffset
;
2200 int32_t textlength
= strsrch
->search
->textLength
;
// When the upper byte of the FCD value at textoffset is zero, only the suffix
// accents are tried (presumably: the text does not start with a combining
// mark here -- TODO confirm against getFCD).
2201 if ((getFCD(text
, &temp
, textlength
) >> SECOND_LAST_BYTE_SHIFT_
) == 0) {
2202 UCollationElements
*coleiter
= strsrch
->textIter
;
2203 int32_t offset
= ucol_getOffset(coleiter
);
2204 if (strsrch
->pattern
.hasSuffixAccents
) {
2205 offset
= doPreviousCanonicalSuffixMatch(strsrch
, textoffset
,
2207 if (U_SUCCESS(*status
) && offset
!= USEARCH_DONE
) {
2208 setColEIterOffset(coleiter
, offset
);
// Rearranging prefix accents only makes sense for patterns that have them.
2215 if (!strsrch
->pattern
.hasPrefixAccents
) {
2219 UChar accents
[INITIAL_ARRAY_SIZE_
];
2220 // offset to the last base character in substring to search
2221 int32_t baseoffset
= getNextBaseOffset(text
, textoffset
, textlength
);
2222 // normalizing the offensive string
2223 unorm_normalize(text
+ textoffset
, baseoffset
- textoffset
, UNORM_NFD
,
2224 0, accents
, INITIAL_ARRAY_SIZE_
, status
);
2225 // status checked in loop
2227 int32_t accentsindex
[INITIAL_ARRAY_SIZE_
];
2228 int32_t size
= getUnblockedAccentIndex(accents
, accentsindex
);
2230 // 2 power n - 1 plus the full set of accents
2231 int32_t count
= (2 << (size
- 1)) - 1;
// Each value of count is one candidate combination of accent groups.
2232 while (U_SUCCESS(*status
) && count
> 0) {
2233 UChar
*rearrange
= strsrch
->canonicalPrefixAccents
;
2234 // copy the base characters
2235 for (int k
= 0; k
< accentsindex
[0]; k
++) {
2236 *rearrange
++ = accents
[k
];
2238 // forming all possible canonical rearrangement by dropping
2240 for (int i
= 0; i
<= size
- 1; i
++) {
// mask is the bit for accent group i, counted from the most significant end
2241 int32_t mask
= 1 << (size
- i
- 1);
// copy accent group i, i.e. accents[accentsindex[i] .. accentsindex[i + 1])
2243 for (int j
= accentsindex
[i
]; j
< accentsindex
[i
+ 1]; j
++) {
2244 *rearrange
++ = accents
[j
];
2249 int32_t offset
= doPreviousCanonicalPrefixMatch(strsrch
,
2250 baseoffset
, status
);
2251 if (offset
!= USEARCH_DONE
) {
2252 return TRUE
; // match found
2260 * Checks match for contraction.
2261 * If the match starts with a partial contraction we fail.
2262 * Internal method, status assumed to be success, caller has to check status
2263 * before calling this method.
2264 * @param strsrch string search data
2265 * @param start offset of potential match, to be modified if necessary
2266 * @param end offset of potential match, to be modified if necessary
2267 * @param status only error status if any
2268 * @return TRUE if match passes the contraction test, FALSE otherwise
// Re-validates a potential backward match between *start and *end when the
// code unit at *start is unsafe for the collator (ucol_unsafeCP): the text
// iterator is positioned at *end, pending expansion CEs are skipped, and the
// CEs are then read backwards with ucol_previous and compared with the
// pattern CEs.
// NOTE(review): part of the signature, the loop headers that count down the
// pattern CEs and all return statements are on lines not visible in this
// extract.
2271 UBool
checkPreviousCanonicalContractionMatch(UStringSearch
*strsrch
,
2273 int32_t *end
, UErrorCode
*status
)
2275 UCollationElements
*coleiter
= strsrch
->textIter
;
2276 int32_t textlength
= strsrch
->search
->textLength
;
2277 int32_t temp
= *end
;
2278 const UCollator
*collator
= strsrch
->collator
;
2279 const UChar
*text
= strsrch
->search
->text
;
2280 // This part checks if either if the start of the match contains potential
2281 // contraction. If so we'll have to iterate through them
2282 // Since we used ucol_next while previously looking for the potential
2283 // match, this guarantees that our end will not be a partial contraction,
2284 // or a partial supplementary character.
2285 if (*start
< textlength
&& ucol_unsafeCP(text
[*start
], collator
)) {
2286 int32_t expansion
= getExpansionSuffix(coleiter
);
// remembers whether any expansion CEs were pending before repositioning
2287 UBool expandflag
= expansion
> 0;
2288 setColEIterOffset(coleiter
, *end
);
2289 while (expansion
> 0) {
2290 // getting rid of the redundant ce
2291 // since forward contraction/expansion may have extra ces
2292 // if we are in the normalization buffer, hasAccentsBeforeMatch
2293 // would have taken care of it.
2294 // E.g. the character \u01FA will have an expansion of 3, but if
2295 // we are only looking for A ring A\u030A, we'll have to skip the
2296 // last ce in the expansion buffer
2297 ucol_previous(coleiter
, status
);
2298 if (U_FAILURE(*status
)) {
// temp tracks the iterator offset seen after each backward step
2301 if (ucol_getOffset(coleiter
) != temp
) {
2303 temp
= ucol_getOffset(coleiter
);
2308 int32_t *patternce
= strsrch
->pattern
.CE
;
2309 int32_t patterncelength
= strsrch
->pattern
.CELength
;
// count is the number of pattern CEs still to be matched, from the last one
2310 int32_t count
= patterncelength
;
2312 int32_t ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
2313 // status checked below, note that if status is a failure
2314 // ucol_previous returns UCOL_NULLORDER
2315 if (ce
== UCOL_IGNORABLE
) {
2318 if (expandflag
&& count
== 0 &&
2319 getColElemIterOffset(coleiter
, FALSE
) != temp
) {
2321 temp
= ucol_getOffset(coleiter
);
// First comparison only: the text CE differs from the last pattern CE.
2323 if (count
== patterncelength
&&
2324 ce
!= patternce
[patterncelength
- 1]) {
2325 // accents may have extra starting ces, this occurs when a
2326 // pure accent pattern is matched without rearrangement
2327 int32_t expected
= patternce
[patterncelength
- 1];
2328 UTF_BACK_1(text
, 0, *end
);
2329 if (getFCD(text
, end
, textlength
) & LAST_BYTE_MASK_
) {
2330 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
// keep stepping back until the expected CE is found, the iterator runs
// dry, or its offset moves past *start
2331 while (U_SUCCESS(*status
) && ce
!= expected
&&
2332 ce
!= UCOL_NULLORDER
&&
2333 ucol_getOffset(coleiter
) <= *start
) {
2334 ce
= getCE(strsrch
, ucol_previous(coleiter
, status
));
// Mismatch (or error): move *start back to the previous base offset.
2338 if (U_FAILURE(*status
) || ce
!= patternce
[count
- 1]) {
2340 *start
= getPreviousBaseOffset(text
, *start
);
2350 * Checks and sets the match information if found.
2353 * <li> the potential match does not repeat the previous match
2354 * <li> boundaries are correct
2355 * <li> potential match does not end in the middle of a contraction
2356 * <li> identical matches
2358 * Otherwise the offset will be shifted to the next character.
2359 * Internal method, status assumed to be success, caller has to check status
2360 * before calling this method.
2361 * @param strsrch string search data
2362 * @param textoffset offset in the collation element text. the returned value
2363 * will be the truncated start offset of the match or the new start
2365 * @param status only error status if any
2366 * @return TRUE if the match is valid, FALSE otherwise
// Decides whether the candidate that starts at *textoffset is a valid
// backward canonical match and, if so, records it in
// strsrch->search->matchedIndex / matchedLength.
// NOTE(review): part of the signature, the return statements and a few
// continuation lines are not visible in this extract.
2369 inline UBool
checkPreviousCanonicalMatch(UStringSearch
*strsrch
,
2370 int32_t *textoffset
,
2373 // to ensure that the start and ends are not composite characters
2374 UCollationElements
*coleiter
= strsrch
->textIter
;
2375 // if we have a canonical accent match
// (a non-empty rearranged suffix/prefix accent buffer means the match was
// produced by accent rearrangement; it is recorded without further checks)
2376 if ((strsrch
->pattern
.hasSuffixAccents
&&
2377 strsrch
->canonicalSuffixAccents
[0]) ||
2378 (strsrch
->pattern
.hasPrefixAccents
&&
2379 strsrch
->canonicalPrefixAccents
[0])) {
2380 strsrch
->search
->matchedIndex
= *textoffset
;
2381 strsrch
->search
->matchedLength
=
2382 getNextUStringSearchBaseOffset(strsrch
,
2383 getColElemIterOffset(coleiter
, FALSE
))
// Ordinary path: the end of the candidate is the iterator's current offset.
2388 int32_t end
= ucol_getOffset(coleiter
);
2389 if (!checkPreviousCanonicalContractionMatch(strsrch
, textoffset
, &end
,
2391 U_FAILURE(*status
)) {
2395 end
= getNextUStringSearchBaseOffset(strsrch
, end
);
2396 // this totally matches, however we need to check if it is repeating
2397 if (checkRepeatedMatch(strsrch
, *textoffset
, end
) ||
2398 !isBreakUnit(strsrch
, *textoffset
, end
) ||
2399 !checkIdentical(strsrch
, *textoffset
, end
)) {
// rejected: step the candidate start back to the previous base offset
2401 *textoffset
= getPreviousBaseOffset(strsrch
->search
->text
,
// accepted: publish the match boundaries
2406 strsrch
->search
->matchedIndex
= *textoffset
;
2407 strsrch
->search
->matchedLength
= end
- *textoffset
;
2411 // constructors and destructor -------------------------------------------
2413 U_CAPI UStringSearch
* U_EXPORT2
usearch_open(const UChar
*pattern
,
2414 int32_t patternlength
,
2418 UBreakIterator
*breakiter
,
2421 if (U_FAILURE(*status
)) {
2424 #if UCONFIG_NO_BREAK_ITERATION
2425 if (breakiter
!= NULL
) {
2426 *status
= U_UNSUPPORTED_ERROR
;
2431 // ucol_open internally checks for status
2432 UCollator
*collator
= ucol_open(locale
, status
);
2433 // pattern, text checks are done in usearch_openFromCollator
2434 UStringSearch
*result
= usearch_openFromCollator(pattern
,
2435 patternlength
, text
, textlength
,
2436 collator
, breakiter
, status
);
2438 if (result
== NULL
|| U_FAILURE(*status
)) {
2440 ucol_close(collator
);
2445 result
->ownCollator
= TRUE
;
2449 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2453 U_CAPI UStringSearch
* U_EXPORT2
usearch_openFromCollator(
2454 const UChar
*pattern
,
2455 int32_t patternlength
,
2458 const UCollator
*collator
,
2459 UBreakIterator
*breakiter
,
2462 if (U_FAILURE(*status
)) {
2465 #if UCONFIG_NO_BREAK_ITERATION
2466 if (breakiter
!= NULL
) {
2467 *status
= U_UNSUPPORTED_ERROR
;
2471 if (pattern
== NULL
|| text
== NULL
|| collator
== NULL
) {
2472 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2475 // string search does not really work when numeric collation is turned on
2476 if(ucol_getAttribute(collator
, UCOL_NUMERIC_COLLATION
, status
) == UCOL_ON
) {
2477 *status
= U_UNSUPPORTED_ERROR
;
2480 if (U_SUCCESS(*status
)) {
2481 initializeFCD(status
);
2482 if (U_FAILURE(*status
)) {
2486 UStringSearch
*result
;
2487 if (textlength
== -1) {
2488 textlength
= u_strlen(text
);
2490 if (patternlength
== -1) {
2491 patternlength
= u_strlen(pattern
);
2493 if (textlength
<= 0 || patternlength
<= 0) {
2494 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2498 result
= (UStringSearch
*)uprv_malloc(sizeof(UStringSearch
));
2499 if (result
== NULL
) {
2500 *status
= U_MEMORY_ALLOCATION_ERROR
;
2504 result
->collator
= collator
;
2505 result
->strength
= ucol_getStrength(collator
);
2506 result
->ceMask
= getMask(result
->strength
);
2508 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2510 result
->variableTop
= ucol_getVariableTop(collator
, status
);
2512 if (U_FAILURE(*status
)) {
2517 result
->search
= (USearch
*)uprv_malloc(sizeof(USearch
));
2518 if (result
->search
== NULL
) {
2519 *status
= U_MEMORY_ALLOCATION_ERROR
;
2524 result
->search
->text
= text
;
2525 result
->search
->textLength
= textlength
;
2527 result
->pattern
.text
= pattern
;
2528 result
->pattern
.textLength
= patternlength
;
2529 result
->pattern
.CE
= NULL
;
2531 result
->search
->breakIter
= breakiter
;
2532 #if !UCONFIG_NO_BREAK_ITERATION
2534 ubrk_setText(breakiter
, text
, textlength
, status
);
2538 result
->ownCollator
= FALSE
;
2539 result
->search
->matchedLength
= 0;
2540 result
->search
->matchedIndex
= USEARCH_DONE
;
2541 result
->textIter
= ucol_openElements(collator
, text
,
2542 textlength
, status
);
2543 if (U_FAILURE(*status
)) {
2544 usearch_close(result
);
2548 result
->utilIter
= NULL
;
2550 result
->search
->isOverlap
= FALSE
;
2551 result
->search
->isCanonicalMatch
= FALSE
;
2552 result
->search
->isForwardSearching
= TRUE
;
2553 result
->search
->reset
= TRUE
;
2555 initialize(result
, status
);
2557 if (U_FAILURE(*status
)) {
2558 usearch_close(result
);
2567 U_CAPI
void U_EXPORT2
usearch_close(UStringSearch
*strsrch
)
2570 if (strsrch
->pattern
.CE
!= strsrch
->pattern
.CEBuffer
&&
2571 strsrch
->pattern
.CE
) {
2572 uprv_free(strsrch
->pattern
.CE
);
2574 ucol_closeElements(strsrch
->textIter
);
2575 ucol_closeElements(strsrch
->utilIter
);
2576 if (strsrch
->ownCollator
&& strsrch
->collator
) {
2577 ucol_close((UCollator
*)strsrch
->collator
);
2579 uprv_free(strsrch
->search
);
2584 // set and get methods --------------------------------------------------
2586 U_CAPI
void U_EXPORT2
usearch_setOffset(UStringSearch
*strsrch
,
2590 if (U_SUCCESS(*status
) && strsrch
) {
2591 if (isOutOfBounds(strsrch
->search
->textLength
, position
)) {
2592 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
2595 setColEIterOffset(strsrch
->textIter
, position
);
2597 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2598 strsrch
->search
->matchedLength
= 0;
2599 strsrch
->search
->reset
= FALSE
;
2603 U_CAPI
int32_t U_EXPORT2
usearch_getOffset(const UStringSearch
*strsrch
)
2606 int32_t result
= ucol_getOffset(strsrch
->textIter
);
2607 if (isOutOfBounds(strsrch
->search
->textLength
, result
)) {
2608 return USEARCH_DONE
;
2612 return USEARCH_DONE
;
2615 U_CAPI
void U_EXPORT2
usearch_setAttribute(UStringSearch
*strsrch
,
2616 USearchAttribute attribute
,
2617 USearchAttributeValue value
,
2620 if (U_SUCCESS(*status
) && strsrch
) {
2623 case USEARCH_OVERLAP
:
2624 strsrch
->search
->isOverlap
= (value
== USEARCH_ON
? TRUE
: FALSE
);
2626 case USEARCH_CANONICAL_MATCH
:
2627 strsrch
->search
->isCanonicalMatch
= (value
== USEARCH_ON
? TRUE
:
2630 case USEARCH_ATTRIBUTE_COUNT
:
2632 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2635 if (value
== USEARCH_ATTRIBUTE_VALUE_COUNT
) {
2636 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2640 U_CAPI USearchAttributeValue U_EXPORT2
usearch_getAttribute(
2641 const UStringSearch
*strsrch
,
2642 USearchAttribute attribute
)
2645 switch (attribute
) {
2646 case USEARCH_OVERLAP
:
2647 return (strsrch
->search
->isOverlap
== TRUE
? USEARCH_ON
:
2649 case USEARCH_CANONICAL_MATCH
:
2650 return (strsrch
->search
->isCanonicalMatch
== TRUE
? USEARCH_ON
:
2652 case USEARCH_ATTRIBUTE_COUNT
:
2653 return USEARCH_DEFAULT
;
2656 return USEARCH_DEFAULT
;
2659 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedStart(
2660 const UStringSearch
*strsrch
)
2662 if (strsrch
== NULL
) {
2663 return USEARCH_DONE
;
2665 return strsrch
->search
->matchedIndex
;
2669 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedText(const UStringSearch
*strsrch
,
2671 int32_t resultCapacity
,
2674 if (U_FAILURE(*status
)) {
2675 return USEARCH_DONE
;
2677 if (strsrch
== NULL
|| resultCapacity
< 0 || (resultCapacity
> 0 &&
2679 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2680 return USEARCH_DONE
;
2683 int32_t copylength
= strsrch
->search
->matchedLength
;
2684 int32_t copyindex
= strsrch
->search
->matchedIndex
;
2685 if (copyindex
== USEARCH_DONE
) {
2686 u_terminateUChars(result
, resultCapacity
, 0, status
);
2687 return USEARCH_DONE
;
2690 if (resultCapacity
< copylength
) {
2691 copylength
= resultCapacity
;
2693 if (copylength
> 0) {
2694 uprv_memcpy(result
, strsrch
->search
->text
+ copyindex
,
2695 copylength
* sizeof(UChar
));
2697 return u_terminateUChars(result
, resultCapacity
,
2698 strsrch
->search
->matchedLength
, status
);
2701 U_CAPI
int32_t U_EXPORT2
usearch_getMatchedLength(
2702 const UStringSearch
*strsrch
)
2705 return strsrch
->search
->matchedLength
;
2707 return USEARCH_DONE
;
2710 #if !UCONFIG_NO_BREAK_ITERATION
2712 U_CAPI
void U_EXPORT2
usearch_setBreakIterator(UStringSearch
*strsrch
,
2713 UBreakIterator
*breakiter
,
2716 if (U_SUCCESS(*status
) && strsrch
) {
2717 strsrch
->search
->breakIter
= breakiter
;
2719 ubrk_setText(breakiter
, strsrch
->search
->text
,
2720 strsrch
->search
->textLength
, status
);
2725 U_CAPI
const UBreakIterator
* U_EXPORT2
2726 usearch_getBreakIterator(const UStringSearch
*strsrch
)
2729 return strsrch
->search
->breakIter
;
2736 U_CAPI
void U_EXPORT2
usearch_setText( UStringSearch
*strsrch
,
2741 if (U_SUCCESS(*status
)) {
2742 if (strsrch
== NULL
|| text
== NULL
|| textlength
< -1 ||
2744 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2747 if (textlength
== -1) {
2748 textlength
= u_strlen(text
);
2750 strsrch
->search
->text
= text
;
2751 strsrch
->search
->textLength
= textlength
;
2752 ucol_setText(strsrch
->textIter
, text
, textlength
, status
);
2753 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
2754 strsrch
->search
->matchedLength
= 0;
2755 strsrch
->search
->reset
= TRUE
;
2756 #if !UCONFIG_NO_BREAK_ITERATION
2757 if (strsrch
->search
->breakIter
!= NULL
) {
2758 ubrk_setText(strsrch
->search
->breakIter
, text
,
2759 textlength
, status
);
2766 U_CAPI
const UChar
* U_EXPORT2
usearch_getText(const UStringSearch
*strsrch
,
2770 *length
= strsrch
->search
->textLength
;
2771 return strsrch
->search
->text
;
2776 U_CAPI
void U_EXPORT2
usearch_setCollator( UStringSearch
*strsrch
,
2777 const UCollator
*collator
,
2780 if (U_SUCCESS(*status
)) {
2781 if (collator
== NULL
) {
2782 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2786 if (strsrch
->ownCollator
&& (strsrch
->collator
!= collator
)) {
2787 ucol_close((UCollator
*)strsrch
->collator
);
2788 strsrch
->ownCollator
= FALSE
;
2790 strsrch
->collator
= collator
;
2791 strsrch
->strength
= ucol_getStrength(collator
);
2792 strsrch
->ceMask
= getMask(strsrch
->strength
);
2793 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
2795 ucol_getAttribute(collator
, UCOL_ALTERNATE_HANDLING
, status
) ==
2797 // if status is a failure, ucol_getVariableTop returns 0
2798 strsrch
->variableTop
= ucol_getVariableTop(collator
, status
);
2799 if (U_SUCCESS(*status
)) {
2800 initialize(strsrch
, status
);
2801 if (U_SUCCESS(*status
)) {
2802 uprv_init_collIterate(collator
, strsrch
->search
->text
,
2803 strsrch
->search
->textLength
,
2804 &(strsrch
->textIter
->iteratordata_
));
2805 strsrch
->utilIter
->iteratordata_
.coll
= collator
;
2812 U_CAPI UCollator
* U_EXPORT2
usearch_getCollator(const UStringSearch
*strsrch
)
2815 return (UCollator
*)strsrch
->collator
;
2820 U_CAPI
void U_EXPORT2
usearch_setPattern( UStringSearch
*strsrch
,
2821 const UChar
*pattern
,
2822 int32_t patternlength
,
2825 if (U_SUCCESS(*status
)) {
2826 if (strsrch
== NULL
|| pattern
== NULL
) {
2827 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2830 if (patternlength
== -1) {
2831 patternlength
= u_strlen(pattern
);
2833 if (patternlength
== 0) {
2834 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2837 strsrch
->pattern
.text
= pattern
;
2838 strsrch
->pattern
.textLength
= patternlength
;
2839 initialize(strsrch
, status
);
2844 U_CAPI
const UChar
* U_EXPORT2
2845 usearch_getPattern(const UStringSearch
*strsrch
,
2849 *length
= strsrch
->pattern
.textLength
;
2850 return strsrch
->pattern
.text
;
2855 // miscellaneous methods --------------------------------------------------
2857 U_CAPI
int32_t U_EXPORT2
usearch_first(UStringSearch
*strsrch
,
2860 if (strsrch
&& U_SUCCESS(*status
)) {
2861 strsrch
->search
->isForwardSearching
= TRUE
;
2862 usearch_setOffset(strsrch
, 0, status
);
2863 if (U_SUCCESS(*status
)) {
2864 return usearch_next(strsrch
, status
);
2867 return USEARCH_DONE
;
2870 U_CAPI
int32_t U_EXPORT2
usearch_following(UStringSearch
*strsrch
,
2874 if (strsrch
&& U_SUCCESS(*status
)) {
2875 strsrch
->search
->isForwardSearching
= TRUE
;
2876 // position checked in usearch_setOffset
2877 usearch_setOffset(strsrch
, position
, status
);
2878 if (U_SUCCESS(*status
)) {
2879 return usearch_next(strsrch
, status
);
2882 return USEARCH_DONE
;
2885 U_CAPI
int32_t U_EXPORT2
usearch_last(UStringSearch
*strsrch
,
2888 if (strsrch
&& U_SUCCESS(*status
)) {
2889 strsrch
->search
->isForwardSearching
= FALSE
;
2890 usearch_setOffset(strsrch
, strsrch
->search
->textLength
, status
);
2891 if (U_SUCCESS(*status
)) {
2892 return usearch_previous(strsrch
, status
);
2895 return USEARCH_DONE
;
2898 U_CAPI
int32_t U_EXPORT2
usearch_preceding(UStringSearch
*strsrch
,
2902 if (strsrch
&& U_SUCCESS(*status
)) {
2903 strsrch
->search
->isForwardSearching
= FALSE
;
2904 // position checked in usearch_setOffset
2905 usearch_setOffset(strsrch
, position
, status
);
2906 if (U_SUCCESS(*status
)) {
2907 return usearch_previous(strsrch
, status
);
2910 return USEARCH_DONE
;
2914 * If a direction switch is required, we'll count the number of ces till the
2915 * beginning of the collation element iterator and iterate forwards that
2916 * number of times. This is so that we get to the correct point within the
2917 * string to continue the search in. Imagine being in the middle of the
2918 * normalization buffer when the change in direction is requested. arrrgghh....
2919 * After searching the offset within the collation element iterator will be
2920 * shifted to the start of the match. If a match is not found, the offset would
2921 * have been set to the end of the text string in the collation element
2923 * Okay, here's my take on the normalization buffer. The only time when there can
2924 * be 2 matches within the same normalization is when the pattern consists
2925 * of all accents. But since the offset returned is from the text string, we
2926 * should not confuse the caller by returning the second match within the
2927 * same normalization buffer. If we do, the 2 results will have the same match
2928 * offsets, and that'll be confusing. I'll return the next match that doesn't
2929 * fall within the same normalization buffer. Note this does not affect the
2930 * results of matches spanning the text and the normalization buffer.
2931 * The position to start searching is taken from the collation element
2932 * iterator. Callers of this API would have to set the offset in the collation
2933 * element iterator before using this method.
2935 U_CAPI
int32_t U_EXPORT2
usearch_next(UStringSearch
*strsrch
,
2938 if (U_SUCCESS(*status
) && strsrch
) {
2939 // note offset is either equivalent to the start of the previous match
2940 // or is set by the user
2941 int32_t offset
= usearch_getOffset(strsrch
);
2942 USearch
*search
= strsrch
->search
;
2943 search
->reset
= FALSE
;
2944 int32_t textlength
= search
->textLength
;
2945 if (search
->isForwardSearching
) {
2946 if (offset
== textlength
2947 || (!search
->isOverlap
&&
2948 (offset
+ strsrch
->pattern
.defaultShiftSize
> textlength
||
2949 (search
->matchedIndex
!= USEARCH_DONE
&&
2950 offset
+ search
->matchedLength
>= textlength
)))) {
2951 // not enough characters to match
2952 setMatchNotFound(strsrch
);
2953 return USEARCH_DONE
;
2957 // switching direction.
2958 // if matchedIndex == USEARCH_DONE, it means that either a
2959 // setOffset has been called or that previous ran off the text
2960 // string. the iterator would have been set to offset 0 if a
2961 // match is not found.
2962 search
->isForwardSearching
= TRUE
;
2963 if (search
->matchedIndex
!= USEARCH_DONE
) {
2964 // there's no need to set the collation element iterator
2965 // the next call to next will set the offset.
2966 return search
->matchedIndex
;
2970 if (U_SUCCESS(*status
)) {
2971 if (strsrch
->pattern
.CELength
== 0) {
2972 if (search
->matchedIndex
== USEARCH_DONE
) {
2973 search
->matchedIndex
= offset
;
2975 else { // moves by codepoints
2976 UTF_FWD_1(search
->text
, search
->matchedIndex
, textlength
);
2979 search
->matchedLength
= 0;
2980 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
2981 // status checked below
2982 if (search
->matchedIndex
== textlength
) {
2983 search
->matchedIndex
= USEARCH_DONE
;
2987 if (search
->matchedLength
> 0) {
2988 // if matchlength is 0 we are at the start of the iteration
2989 if (search
->isOverlap
) {
2990 ucol_setOffset(strsrch
->textIter
, offset
+ 1, status
);
2993 ucol_setOffset(strsrch
->textIter
,
2994 offset
+ search
->matchedLength
, status
);
2998 // for boundary check purposes. this will ensure that the
2999 // next match will not preceed the current offset
3000 // note search->matchedIndex will always be set to something
3002 search
->matchedIndex
= offset
- 1;
3005 if (search
->isCanonicalMatch
) {
3006 // can't use exact here since extra accents are allowed.
3007 usearch_handleNextCanonical(strsrch
, status
);
3010 usearch_handleNextExact(strsrch
, status
);
3014 if (U_FAILURE(*status
)) {
3015 return USEARCH_DONE
;
3018 return search
->matchedIndex
;
3021 return USEARCH_DONE
;
3024 U_CAPI
int32_t U_EXPORT2
usearch_previous(UStringSearch
*strsrch
,
3027 if (U_SUCCESS(*status
) && strsrch
) {
3029 USearch
*search
= strsrch
->search
;
3030 if (search
->reset
) {
3031 offset
= search
->textLength
;
3032 search
->isForwardSearching
= FALSE
;
3033 search
->reset
= FALSE
;
3034 setColEIterOffset(strsrch
->textIter
, offset
);
3037 offset
= usearch_getOffset(strsrch
);
3040 int32_t matchedindex
= search
->matchedIndex
;
3041 if (search
->isForwardSearching
== TRUE
) {
3042 // switching direction.
3043 // if matchedIndex == USEARCH_DONE, it means that either a
3044 // setOffset has been called or that next ran off the text
3045 // string. the iterator would have been set to offset textLength if
3046 // a match is not found.
3047 search
->isForwardSearching
= FALSE
;
3048 if (matchedindex
!= USEARCH_DONE
) {
3049 return matchedindex
;
3053 if (offset
== 0 || matchedindex
== 0 ||
3054 (!search
->isOverlap
&&
3055 (offset
< strsrch
->pattern
.defaultShiftSize
||
3056 (matchedindex
!= USEARCH_DONE
&&
3057 matchedindex
< strsrch
->pattern
.defaultShiftSize
)))) {
3058 // not enough characters to match
3059 setMatchNotFound(strsrch
);
3060 return USEARCH_DONE
;
3064 if (U_SUCCESS(*status
)) {
3065 if (strsrch
->pattern
.CELength
== 0) {
3066 search
->matchedIndex
=
3067 (matchedindex
== USEARCH_DONE
? offset
: matchedindex
);
3068 if (search
->matchedIndex
== 0) {
3069 setMatchNotFound(strsrch
);
3070 // status checked below
3072 else { // move by codepoints
3073 UTF_BACK_1(search
->text
, 0, search
->matchedIndex
);
3074 setColEIterOffset(strsrch
->textIter
, search
->matchedIndex
);
3075 // status checked below
3076 search
->matchedLength
= 0;
3080 if (strsrch
->search
->isCanonicalMatch
) {
3081 // can't use exact here since extra accents are allowed.
3082 usearch_handlePreviousCanonical(strsrch
, status
);
3083 // status checked below
3086 usearch_handlePreviousExact(strsrch
, status
);
3087 // status checked below
3091 if (U_FAILURE(*status
)) {
3092 return USEARCH_DONE
;
3095 return search
->matchedIndex
;
3098 return USEARCH_DONE
;
3103 U_CAPI
void U_EXPORT2
usearch_reset(UStringSearch
*strsrch
)
3106 reset is setting the attributes that are already in
3107 string search, hence all attributes in the collator should
3108 be retrieved without any problems
3111 UErrorCode status
= U_ZERO_ERROR
;
3112 UBool sameCollAttribute
= TRUE
;
3117 strsrch
->strength
= ucol_getStrength(strsrch
->collator
);
3118 ceMask
= getMask(strsrch
->strength
);
3119 if (strsrch
->ceMask
!= ceMask
) {
3120 strsrch
->ceMask
= ceMask
;
3121 sameCollAttribute
= FALSE
;
3123 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3124 shift
= ucol_getAttribute(strsrch
->collator
, UCOL_ALTERNATE_HANDLING
,
3125 &status
) == UCOL_SHIFTED
;
3126 if (strsrch
->toShift
!= shift
) {
3127 strsrch
->toShift
= shift
;
3128 sameCollAttribute
= FALSE
;
3131 // if status is a failure, ucol_getVariableTop returns 0
3132 varTop
= ucol_getVariableTop(strsrch
->collator
, &status
);
3133 if (strsrch
->variableTop
!= varTop
) {
3134 strsrch
->variableTop
= varTop
;
3135 sameCollAttribute
= FALSE
;
3137 if (!sameCollAttribute
) {
3138 initialize(strsrch
, &status
);
3140 uprv_init_collIterate(strsrch
->collator
, strsrch
->search
->text
,
3141 strsrch
->search
->textLength
,
3142 &(strsrch
->textIter
->iteratordata_
));
3143 strsrch
->search
->matchedLength
= 0;
3144 strsrch
->search
->matchedIndex
= USEARCH_DONE
;
3145 strsrch
->search
->isOverlap
= FALSE
;
3146 strsrch
->search
->isCanonicalMatch
= FALSE
;
3147 strsrch
->search
->isForwardSearching
= TRUE
;
3148 strsrch
->search
->reset
= TRUE
;
3152 // internal use methods declared in usrchimp.h -----------------------------
/**
 * Forward search for the next exact match of the pattern's collation
 * elements (CEs) in the text, starting from the text iterator's current
 * offset.
 * If *status is already a failure on entry, setMatchNotFound is called.
 * Candidate positions come from shiftForward; at each candidate the text's
 * CEs are read backwards with ucol_previous, mapped through getCE and
 * compared with the pattern's CEs, last pattern CE first.
 * When checkNextExactMatch accepts a candidate, the text iterator is moved
 * to search->matchedIndex. If the loop ends without an accepted candidate,
 * setMatchNotFound is called.
 * NOTE(review): the return statements and several branch bodies are not
 * visible in this view; presumably TRUE is returned for a match and FALSE
 * otherwise - confirm.
 * @param strsrch string search data
 * @param status error code; checked on entry and passed to the collation
 *               element iterator calls
 * @return see the note above
 */
3154 UBool
usearch_handleNextExact(UStringSearch
*strsrch
, UErrorCode
*status
)
3156 if (U_FAILURE(*status
)) {
3157 setMatchNotFound(strsrch
);
3161 UCollationElements
*coleiter
= strsrch
->textIter
;
3162 int32_t textlength
= strsrch
->search
->textLength
;
3163 int32_t *patternce
= strsrch
->pattern
.CE
;
3164 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3165 int32_t textoffset
= ucol_getOffset(coleiter
);
3167 // status used in setting coleiter offset, since offset is checked in
3168 // shiftForward before setting the coleiter offset, status never
// NOTE(review): the sentence above is cut short in this view (presumably
// "status never fails") - confirm.
3170 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
// examine candidate positions until one is accepted or textoffset passes
// the end of the text
3172 while (textoffset
<= textlength
)
// comparison starts with the last pattern CE
3174 uint32_t patternceindex
= patterncelength
- 1;
3176 UBool found
= FALSE
;
3177 int32_t lastce
= UCOL_NULLORDER
;
3179 setColEIterOffset(coleiter
, textoffset
);
3182 // finding the last pattern ce match, imagine composite characters
3183 // for example: search for pattern A in text \u00C0
3184 // we'll have to skip \u0300 the grave first before we get to A
3185 targetce
= ucol_previous(coleiter
, status
);
3186 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3190 targetce
= getCE(strsrch
, targetce
);
3191 if (targetce
== UCOL_IGNORABLE
&& inNormBuf(coleiter
)) {
3192 // this is for the text \u0315\u0300 that requires
3193 // normalization and pattern \u0300, where \u0315 is ignorable
3196 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
3199 if (targetce
== patternce
[patternceindex
]) {
3200 // the first ce can be a contraction
3204 if (!hasExpansion(coleiter
)) {
// keep reading text CEs backwards and compare them with the pattern CE at
// patternceindex while the candidate still matches
3212 while (found
&& patternceindex
> 0) {
3213 targetce
= ucol_previous(coleiter
, status
);
3214 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3218 targetce
= getCE(strsrch
, targetce
);
3219 if (targetce
== UCOL_IGNORABLE
) {
3224 found
= found
&& targetce
== patternce
[patternceindex
];
3228 if (U_FAILURE(*status
)) {
// next candidate position, computed from the recorded lastce
3231 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
3233 // status checked at loop.
3234 patternceindex
= patterncelength
;
// checkNextExactMatch decides whether the candidate is accepted; on
// acceptance the iterator is positioned at the recorded match index
3238 if (checkNextExactMatch(strsrch
, &textoffset
, status
)) {
3239 // status checked in ucol_setOffset
3240 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
3244 setMatchNotFound(strsrch
);
/**
 * Forward search for the next canonical match of the pattern's collation
 * elements (CEs) in the text, starting from the text iterator's current
 * offset.
 * If *status is already a failure on entry, setMatchNotFound is called.
 * Candidate positions come from shiftForward; at each candidate the text's
 * CEs are read backwards with ucol_previous, mapped through getCE and
 * compared with the pattern's CEs, last pattern CE first.
 * When that comparison fails and the pattern has prefix or suffix accents,
 * the canonicalPrefixAccents / canonicalSuffixAccents buffers are emptied
 * and doNextCanonicalMatch is tried at the same offset.
 * When checkNextCanonicalMatch accepts a candidate, the text iterator is
 * moved to search->matchedIndex. If the loop ends without an accepted
 * candidate, setMatchNotFound is called.
 * NOTE(review): the return statements and several branch bodies are not
 * visible in this view; presumably TRUE is returned for a match and FALSE
 * otherwise - confirm.
 * @param strsrch string search data
 * @param status error code; checked on entry and passed to the collation
 *               element iterator calls
 * @return see the note above
 */
3248 UBool
usearch_handleNextCanonical(UStringSearch
*strsrch
, UErrorCode
*status
)
3250 if (U_FAILURE(*status
)) {
3251 setMatchNotFound(strsrch
);
3255 UCollationElements
*coleiter
= strsrch
->textIter
;
3256 int32_t textlength
= strsrch
->search
->textLength
;
3257 int32_t *patternce
= strsrch
->pattern
.CE
;
3258 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3259 int32_t textoffset
= ucol_getOffset(coleiter
);
// true when the pattern carries accents at either end
3260 UBool hasPatternAccents
=
3261 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
3263 textoffset
= shiftForward(strsrch
, textoffset
, UCOL_NULLORDER
,
// start with empty accent buffers
3265 strsrch
->canonicalPrefixAccents
[0] = 0;
3266 strsrch
->canonicalSuffixAccents
[0] = 0;
// examine candidate positions until one is accepted or textoffset passes
// the end of the text
3268 while (textoffset
<= textlength
)
// comparison starts with the last pattern CE
3270 int32_t patternceindex
= patterncelength
- 1;
3272 UBool found
= FALSE
;
3273 int32_t lastce
= UCOL_NULLORDER
;
3275 setColEIterOffset(coleiter
, textoffset
);
3278 // finding the last pattern ce match, imagine composite characters
3279 // for example: search for pattern A in text \u00C0
3280 // we'll have to skip \u0300 the grave first before we get to A
3281 targetce
= ucol_previous(coleiter
, status
);
3282 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3286 targetce
= getCE(strsrch
, targetce
);
3287 if (lastce
== UCOL_NULLORDER
|| lastce
== UCOL_IGNORABLE
) {
3290 if (targetce
== patternce
[patternceindex
]) {
3291 // the first ce can be a contraction
3295 if (!hasExpansion(coleiter
)) {
// keep reading text CEs backwards and compare them with the pattern CE at
// patternceindex while the candidate still matches
3301 while (found
&& patternceindex
> 0) {
3302 targetce
= ucol_previous(coleiter
, status
);
3303 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3307 targetce
= getCE(strsrch
, targetce
);
3308 if (targetce
== UCOL_IGNORABLE
) {
3313 found
= found
&& targetce
== patternce
[patternceindex
];
3316 // initializing the rearranged accent array
3317 if (hasPatternAccents
&& !found
) {
3318 strsrch
->canonicalPrefixAccents
[0] = 0;
3319 strsrch
->canonicalSuffixAccents
[0] = 0;
3320 if (U_FAILURE(*status
)) {
// second attempt at the same offset through the canonical matcher
3323 found
= doNextCanonicalMatch(strsrch
, textoffset
, status
);
3327 if (U_FAILURE(*status
)) {
// next candidate position, computed from the recorded lastce
3330 textoffset
= shiftForward(strsrch
, textoffset
, lastce
,
3332 // status checked at loop
3333 patternceindex
= patterncelength
;
// checkNextCanonicalMatch decides whether the candidate is accepted; on
// acceptance the iterator is positioned at the recorded match index
3337 if (checkNextCanonicalMatch(strsrch
, &textoffset
, status
)) {
3338 setColEIterOffset(coleiter
, strsrch
->search
->matchedIndex
);
3342 setMatchNotFound(strsrch
);
/**
 * Backward search for the previous exact match of the pattern's collation
 * elements (CEs) in the text.
 * If *status is already a failure on entry, setMatchNotFound is called.
 * The starting offset is the text iterator's current offset, or
 * search->matchedIndex when that is not USEARCH_DONE. Candidate positions
 * come from reverseShift; at each candidate the text's CEs are read
 * forwards with ucol_next, mapped through getCE and compared with the
 * pattern's CEs, first pattern CE first.
 * When checkPreviousExactMatch accepts a candidate, the text iterator is
 * moved to the resulting textoffset. If the loop ends without an accepted
 * candidate, setMatchNotFound is called.
 * NOTE(review): the return statements and several branch bodies are not
 * visible in this view; presumably TRUE is returned for a match and FALSE
 * otherwise - confirm.
 * @param strsrch string search data
 * @param status error code; checked on entry and passed to the collation
 *               element iterator calls
 * @return see the note above
 */
3346 UBool
usearch_handlePreviousExact(UStringSearch
*strsrch
, UErrorCode
*status
)
3348 if (U_FAILURE(*status
)) {
3349 setMatchNotFound(strsrch
);
3353 UCollationElements
*coleiter
= strsrch
->textIter
;
3354 int32_t *patternce
= strsrch
->pattern
.CE
;
3355 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3356 int32_t textoffset
= ucol_getOffset(coleiter
);
3358 // shifting it check for setting offset
3359 // if setOffset is called previously or there was no previous match, we
3360 // leave the offset as it is.
3361 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
3362 textoffset
= strsrch
->search
->matchedIndex
;
3365 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
// examine candidate positions; the loop ends once textoffset drops below 0
3368 while (textoffset
>= 0)
// patternce[0] is covered by the first comparison below, the remaining
// pattern CEs are compared starting at index 1
3370 int32_t patternceindex
= 1;
3372 UBool found
= FALSE
;
3373 int32_t firstce
= UCOL_NULLORDER
;
3375 // if status is a failure, ucol_setOffset does nothing
3376 setColEIterOffset(coleiter
, textoffset
);
3379 // finding the first pattern ce match, imagine composite
3380 // characters. for example: search for pattern \u0300 in text
3381 // \u00C0, we'll have to skip A first before we get to
3382 // \u0300 the grave accent
3383 targetce
= ucol_next(coleiter
, status
);
3384 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3388 targetce
= getCE(strsrch
, targetce
);
3389 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
3392 if (targetce
== UCOL_IGNORABLE
) {
3395 if (targetce
== patternce
[0]) {
3399 if (!hasExpansion(coleiter
)) {
3400 // checking for accents in composite character
// keep reading text CEs forwards and compare them with the pattern CE at
// patternceindex while the candidate still matches
3408 while (found
&& (patternceindex
< patterncelength
)) {
3409 targetce
= ucol_next(coleiter
, status
);
3410 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3414 targetce
= getCE(strsrch
, targetce
);
3415 if (targetce
== UCOL_IGNORABLE
) {
3419 found
= found
&& targetce
== patternce
[patternceindex
];
3424 if (U_FAILURE(*status
)) {
// next candidate position, computed from the current targetce
3427 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
// checkPreviousExactMatch decides whether the candidate is accepted; on
// acceptance the iterator is positioned at textoffset
3433 if (checkPreviousExactMatch(strsrch
, &textoffset
, status
)) {
3434 setColEIterOffset(coleiter
, textoffset
);
3438 setMatchNotFound(strsrch
);
/**
 * Backward search for the previous canonical match of the pattern's
 * collation elements (CEs) in the text.
 * If *status is already a failure on entry, setMatchNotFound is called.
 * The starting offset is the text iterator's current offset, or
 * search->matchedIndex when that is not USEARCH_DONE. Candidate positions
 * come from reverseShift; at each candidate the text's CEs are read
 * forwards with ucol_next, mapped through getCE and compared with the
 * pattern's CEs, first pattern CE first.
 * When that comparison fails and the pattern has prefix or suffix accents,
 * the canonicalPrefixAccents / canonicalSuffixAccents buffers are emptied
 * and doPreviousCanonicalMatch is tried at the same offset.
 * When checkPreviousCanonicalMatch accepts a candidate, the text iterator
 * is moved to the resulting textoffset. If the loop ends without an
 * accepted candidate, setMatchNotFound is called.
 * NOTE(review): the second parameter's declaration, the return statements
 * and several branch bodies are not visible in this view; presumably TRUE
 * is returned for a match and FALSE otherwise - confirm.
 * @param strsrch string search data
 * @param status error code (dereferenced as *status in the body); checked
 *               on entry and passed to the collation element iterator calls
 * @return see the note above
 */
3442 UBool
usearch_handlePreviousCanonical(UStringSearch
*strsrch
,
3445 if (U_FAILURE(*status
)) {
3446 setMatchNotFound(strsrch
);
3450 UCollationElements
*coleiter
= strsrch
->textIter
;
3451 int32_t *patternce
= strsrch
->pattern
.CE
;
3452 int32_t patterncelength
= strsrch
->pattern
.CELength
;
3453 int32_t textoffset
= ucol_getOffset(coleiter
);
// true when the pattern carries accents at either end
3454 UBool hasPatternAccents
=
3455 strsrch
->pattern
.hasSuffixAccents
|| strsrch
->pattern
.hasPrefixAccents
;
3457 // shifting it check for setting offset
3458 // if setOffset is called previously or there was no previous match, we
3459 // leave the offset as it is.
3460 if (strsrch
->search
->matchedIndex
!= USEARCH_DONE
) {
3461 textoffset
= strsrch
->search
->matchedIndex
;
3464 textoffset
= reverseShift(strsrch
, textoffset
, UCOL_NULLORDER
,
// start with empty accent buffers
3466 strsrch
->canonicalPrefixAccents
[0] = 0;
3467 strsrch
->canonicalSuffixAccents
[0] = 0;
// examine candidate positions; the loop ends once textoffset drops below 0
3469 while (textoffset
>= 0)
// patternce[0] is covered by the first comparison below, the remaining
// pattern CEs are compared starting at index 1
3471 int32_t patternceindex
= 1;
3473 UBool found
= FALSE
;
3474 int32_t firstce
= UCOL_NULLORDER
;
3476 setColEIterOffset(coleiter
, textoffset
);
3478 // finding the first pattern ce match, imagine composite
3479 // characters. for example: search for pattern \u0300 in text
3480 // \u00C0, we'll have to skip A first before we get to
3481 // \u0300 the grave accent
3482 targetce
= ucol_next(coleiter
, status
);
3483 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3487 targetce
= getCE(strsrch
, targetce
);
3488 if (firstce
== UCOL_NULLORDER
|| firstce
== UCOL_IGNORABLE
) {
3492 if (targetce
== patternce
[0]) {
3493 // the first ce can be a contraction
3497 if (!hasExpansion(coleiter
)) {
3498 // checking for accents in composite character
// keep reading text CEs forwards and compare them with the pattern CE at
// patternceindex while the candidate still matches
3506 while (found
&& patternceindex
< patterncelength
) {
3507 targetce
= ucol_next(coleiter
, status
);
3508 if (U_FAILURE(*status
) || targetce
== UCOL_NULLORDER
) {
3512 targetce
= getCE(strsrch
, targetce
);
3513 if (targetce
== UCOL_IGNORABLE
) {
3517 found
= found
&& targetce
== patternce
[patternceindex
];
3521 // initializing the rearranged accent array
3522 if (hasPatternAccents
&& !found
) {
3523 strsrch
->canonicalPrefixAccents
[0] = 0;
3524 strsrch
->canonicalSuffixAccents
[0] = 0;
3525 if (U_FAILURE(*status
)) {
// second attempt at the same offset through the canonical matcher
3528 found
= doPreviousCanonicalMatch(strsrch
, textoffset
, status
);
3532 if (U_FAILURE(*status
)) {
// next candidate position, computed from the current targetce
3535 textoffset
= reverseShift(strsrch
, textoffset
, targetce
,
// checkPreviousCanonicalMatch decides whether the candidate is accepted;
// on acceptance the iterator is positioned at textoffset
3541 if (checkPreviousCanonicalMatch(strsrch
, &textoffset
, status
)) {
3542 setColEIterOffset(coleiter
, textoffset
);
3546 setMatchNotFound(strsrch
);
3550 #endif /* #if !UCONFIG_NO_COLLATION */