]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/usearch.cpp
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / i18n / usearch.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2001-2004 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_COLLATION
13
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unormimp.h"
18 #include "ucol_imp.h"
19 #include "usrchimp.h"
20 #include "cmemory.h"
21 #include "ucln_in.h"
22
23 // internal definition ---------------------------------------------------
24
25 #define LAST_BYTE_MASK_ 0xFF
26 #define SECOND_LAST_BYTE_SHIFT_ 8
27 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
28
29 static const uint16_t *FCD_ = NULL;
30
31 // internal methods -------------------------------------------------
32
33 /**
34 * Fast collation element iterator setOffset.
35 * This function does not check for bounds.
36 * @param coleiter collation element iterator
37 * @param offset to set
38 */
39 static
40 inline void setColEIterOffset(UCollationElements *elems,
41 int32_t offset)
42 {
43 collIterate *ci = &(elems->iteratordata_);
44 ci->pos = ci->string + offset;
45 ci->CEpos = ci->toReturn = ci->CEs;
46 if (ci->flags & UCOL_ITER_INNORMBUF) {
47 ci->flags = ci->origFlags;
48 }
49 ci->fcdPosition = NULL;
50 }
51
52 /**
53 * Getting the mask for collation strength
54 * @param strength collation strength
55 * @return collation element mask
56 */
57 static
58 inline uint32_t getMask(UCollationStrength strength)
59 {
60 switch (strength)
61 {
62 case UCOL_PRIMARY:
63 return UCOL_PRIMARYORDERMASK;
64 case UCOL_SECONDARY:
65 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
66 default:
67 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
68 UCOL_PRIMARYORDERMASK;
69 }
70 }
71
72 /**
73 * This is to squeeze the 21bit ces into a 256 table
74 * @param ce collation element
75 * @return collapsed version of the collation element
76 */
77 static
78 inline int hash(uint32_t ce)
79 {
80 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
81 // well with the new collation where most of the latin 1 characters
82 // are of the value xx000xxx. their hashes will most of the time be 0
83 // to be discussed on the hash algo.
84 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
85 }
86
87 U_CDECL_BEGIN
88 static UBool U_CALLCONV
89 usearch_cleanup(void) {
90 FCD_ = NULL;
91 return TRUE;
92 }
93 U_CDECL_END
94
95 /**
96 * Initializing the fcd tables.
97 * Internal method, status assumed to be a success.
98 * @param status output error if any, caller to check status before calling
99 * method, status assumed to be success when passed in.
100 */
101 static
102 inline void initializeFCD(UErrorCode *status)
103 {
104 if (FCD_ == NULL) {
105 FCD_ = unorm_getFCDTrie(status);
106 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
107 }
108 }
109
110 /**
111 * Gets the fcd value for a character at the argument index.
112 * This method takes into accounts of the supplementary characters.
113 * @param str UTF16 string where character for fcd retrieval resides
114 * @param offset position of the character whose fcd is to be retrieved, to be
115 * overwritten with the next character position, taking
116 * surrogate characters into consideration.
117 * @param strlength length of the argument string
118 * @return fcd value
119 */
120 static
121 inline uint16_t getFCD(const UChar *str, int32_t *offset,
122 int32_t strlength)
123 {
124 int32_t temp = *offset;
125 uint16_t result;
126 UChar ch = str[temp];
127 result = unorm_getFCD16(FCD_, ch);
128 temp ++;
129
130 if (result && temp != strlength && UTF_IS_FIRST_SURROGATE(ch)) {
131 ch = str[temp];
132 if (UTF_IS_SECOND_SURROGATE(ch)) {
133 result = unorm_getFCD16FromSurrogatePair(FCD_, result, ch);
134 temp ++;
135 } else {
136 result = 0;
137 }
138 }
139 *offset = temp;
140 return result;
141 }
142
143 /**
144 * Getting the modified collation elements taking into account the collation
145 * attributes
146 * @param strsrch string search data
147 * @param sourcece
148 * @return the modified collation element
149 */
150 static
151 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
152 {
153 // note for tertiary we can't use the collator->tertiaryMask, that
154 // is a preprocessed mask that takes into account case options. since
155 // we are only concerned with exact matches, we don't need that.
156 sourcece &= strsrch->ceMask;
157
158 if (strsrch->toShift) {
159 // alternate handling here, since only the 16 most significant digits
160 // is only used, we can safely do a compare without masking
161 // if the ce is a variable, we mask and get only the primary values
162 // no shifting to quartenary is required since all primary values
163 // less than variabletop will need to be masked off anyway.
164 if (strsrch->variableTop > sourcece) {
165 if (strsrch->strength == UCOL_QUATERNARY) {
166 sourcece &= UCOL_PRIMARYORDERMASK;
167 }
168 else {
169 sourcece = UCOL_IGNORABLE;
170 }
171 }
172 }
173
174 return sourcece;
175 }
176
177 /**
178 * Allocate a memory and returns NULL if it failed.
179 * Internal method, status assumed to be a success.
180 * @param size to allocate
181 * @param status output error if any, caller to check status before calling
182 * method, status assumed to be success when passed in.
183 * @return newly allocated array, NULL otherwise
184 */
185 static
186 inline void * allocateMemory(uint32_t size, UErrorCode *status)
187 {
188 uint32_t *result = (uint32_t *)uprv_malloc(size);
189 if (result == NULL) {
190 *status = U_MEMORY_ALLOCATION_ERROR;
191 }
192 return result;
193 }
194
195 /**
196 * Adds a uint32_t value to a destination array.
197 * Creates a new array if we run out of space. The caller will have to
198 * manually deallocate the newly allocated array.
199 * Internal method, status assumed to be success, caller has to check status
200 * before calling this method. destination not to be NULL and has at least
201 * size destinationlength.
202 * @param destination target array
203 * @param offset destination offset to add value
204 * @param destinationlength target array size, return value for the new size
205 * @param value to be added
206 * @param increments incremental size expected
207 * @param status output error if any, caller to check status before calling
208 * method, status assumed to be success when passed in.
209 * @return new destination array, destination if there was no new allocation
210 */
211 static
212 inline int32_t * addTouint32_tArray(int32_t *destination,
213 uint32_t offset,
214 uint32_t *destinationlength,
215 uint32_t value,
216 uint32_t increments,
217 UErrorCode *status)
218 {
219 uint32_t newlength = *destinationlength;
220 if (offset + 1 == newlength) {
221 newlength += increments;
222 int32_t *temp = (int32_t *)allocateMemory(
223 sizeof(int32_t) * newlength, status);
224 if (U_FAILURE(*status)) {
225 return NULL;
226 }
227 uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
228 *destinationlength = newlength;
229 destination = temp;
230 }
231 destination[offset] = value;
232 return destination;
233 }
234
235 /**
236 * Initializing the ce table for a pattern.
237 * Stores non-ignorable collation keys.
238 * Table size will be estimated by the size of the pattern text. Table
239 * expansion will be perform as we go along. Adding 1 to ensure that the table
240 * size definitely increases.
241 * Internal method, status assumed to be a success.
242 * @param strsrch string search data
243 * @param status output error if any, caller to check status before calling
244 * method, status assumed to be success when passed in.
245 * @return total number of expansions
246 */
247 static
248 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
249 UErrorCode *status)
250 {
251 UPattern *pattern = &(strsrch->pattern);
252 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
253 int32_t *cetable = pattern->CEBuffer;
254 uint32_t patternlength = pattern->textLength;
255 UCollationElements *coleiter = strsrch->utilIter;
256
257 if (coleiter == NULL) {
258 coleiter = ucol_openElements(strsrch->collator, pattern->text,
259 patternlength, status);
260 // status will be checked in ucol_next(..) later and if it is an
261 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
262 // returned.
263 strsrch->utilIter = coleiter;
264 }
265 else {
266 uprv_init_collIterate(strsrch->collator, pattern->text,
267 pattern->textLength,
268 &coleiter->iteratordata_);
269 }
270
271 if (pattern->CE != cetable && pattern->CE) {
272 uprv_free(pattern->CE);
273 }
274
275 uint16_t offset = 0;
276 uint16_t result = 0;
277 int32_t ce;
278
279 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
280 U_SUCCESS(*status)) {
281 uint32_t newce = getCE(strsrch, ce);
282 if (newce) {
283 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
284 newce,
285 patternlength - ucol_getOffset(coleiter) + 1,
286 status);
287 if (U_FAILURE(*status)) {
288 return 0;
289 }
290 offset ++;
291 if (cetable != temp && cetable != pattern->CEBuffer) {
292 uprv_free(cetable);
293 }
294 cetable = temp;
295 }
296 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
297 }
298
299 cetable[offset] = 0;
300 pattern->CE = cetable;
301 pattern->CELength = offset;
302
303 return result;
304 }
305
306 /**
307 * Initializes the pattern struct.
308 * Internal method, status assumed to be success.
309 * @param strsrch UStringSearch data storage
310 * @param status output error if any, caller to check status before calling
311 * method, status assumed to be success when passed in.
312 * @return expansionsize the total expansion size of the pattern
313 */
314 static
315 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
316 {
317 UPattern *pattern = &(strsrch->pattern);
318 const UChar *patterntext = pattern->text;
319 int32_t length = pattern->textLength;
320 int32_t index = 0;
321
322 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
323 SECOND_LAST_BYTE_SHIFT_;
324 index = length;
325 UTF_BACK_1(patterntext, 0, index);
326 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
327 LAST_BYTE_MASK_;
328 // since intializePattern is an internal method status is a success.
329 return initializePatternCETable(strsrch, status);
330 }
331
332 /**
333 * Initializing shift tables, with the default values.
334 * If a corresponding default value is 0, the shift table is not set.
335 * @param shift table for forwards shift
336 * @param backshift table for backwards shift
337 * @param cetable table containing pattern ce
338 * @param cesize size of the pattern ces
339 * @param expansionsize total size of the expansions
340 * @param defaultforward the default forward value
341 * @param defaultbackward the default backward value
342 */
343 static
344 inline void setShiftTable(int16_t shift[], int16_t backshift[],
345 int32_t *cetable, int32_t cesize,
346 int16_t expansionsize,
347 int16_t defaultforward,
348 int16_t defaultbackward)
349 {
350 // estimate the value to shift. to do that we estimate the smallest
351 // number of characters to give the relevant ces, ie approximately
352 // the number of ces minus their expansion, since expansions can come
353 // from a character.
354 int32_t count;
355 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
356 shift[count] = defaultforward;
357 }
358 cesize --; // down to the last index
359 for (count = 0; count < cesize; count ++) {
360 // number of ces from right of array to the count
361 int temp = defaultforward - count - 1;
362 shift[hash(cetable[count])] = temp > 1 ? temp : 1;
363 }
364 shift[hash(cetable[cesize])] = 1;
365 // for ignorables we just shift by one. see test examples.
366 shift[hash(0)] = 1;
367
368 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
369 backshift[count] = defaultbackward;
370 }
371 for (count = cesize; count > 0; count --) {
372 // the original value count does not seem to work
373 backshift[hash(cetable[count])] = count > expansionsize ?
374 (int16_t)(count - expansionsize) : 1;
375 }
376 backshift[hash(cetable[0])] = 1;
377 backshift[hash(0)] = 1;
378 }
379
380 /**
381 * Building of the pattern collation element list and the boyer moore strsrch
382 * table.
383 * The canonical match will only be performed after the default match fails.
384 * For both cases we need to remember the size of the composed and decomposed
385 * versions of the string. Since the Boyer-Moore shift calculations shifts by
386 * a number of characters in the text and tries to match the pattern from that
387 * offset, the shift value can not be too large in case we miss some
388 * characters. To choose a right shift size, we estimate the NFC form of the
389 * and use its size as a shift guide. The NFC form should be the small
390 * possible representation of the pattern. Anyways, we'll err on the smaller
391 * shift size. Hence the calculation for minlength.
392 * Canonical match will be performed slightly differently. We'll split the
393 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
394 * the first and last base character (MS), the ending accents (EA). Matches
395 * will be done on MS first, and only when we match MS then some processing
396 * will be required for the prefix and end accents in order to determine if
397 * they match PA and EA. Hence the default shift values
398 * for the canonical match will take the size of either end's accent into
399 * consideration. Forwards search will take the end accents into consideration
400 * for the default shift values and the backwards search will take the prefix
401 * accents into consideration.
402 * If pattern has no non-ignorable ce, we return a illegal argument error.
403 * Internal method, status assumed to be success.
404 * @param strsrch UStringSearch data storage
405 * @param status for output errors if it occurs, status is assumed to be a
406 * success when it is passed in.
407 */
408 static
409 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
410 {
411 int16_t expandlength = initializePattern(strsrch, status);
412 if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
413 UPattern *pattern = &strsrch->pattern;
414 int32_t cesize = pattern->CELength;
415
416 int16_t minlength = cesize > expandlength
417 ? (int16_t)cesize - expandlength : 1;
418 pattern->defaultShiftSize = minlength;
419 setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
420 cesize, expandlength, minlength, minlength);
421 return;
422 }
423 strsrch->pattern.defaultShiftSize = 0;
424 }
425
426 /**
427 * Determine whether the target text in UStringSearch bounded by the offset
428 * start and end is one or more whole units of text as
429 * determined by the breakiterator in UStringSearch.
430 * @param strsrch string search data
431 * @param start target text start offset
432 * @param end target text end offset
433 */
434 static
435 inline UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
436 int32_t end)
437 {
438 #if !UCONFIG_NO_BREAK_ITERATION
439 UBreakIterator *breakiterator = strsrch->search->breakIter;
440 if (breakiterator) {
441 int32_t startindex = ubrk_first(breakiterator);
442 int32_t endindex = ubrk_last(breakiterator);
443
444 // out-of-range indexes are never boundary positions
445 if (start < startindex || start > endindex ||
446 end < startindex || end > endindex) {
447 return FALSE;
448 }
449 // otherwise, we can use following() on the position before the
450 // specified one and return true of the position we get back is the
451 // one the user specified
452 UBool result = (start == startindex ||
453 ubrk_following(breakiterator, start - 1) == start) &&
454 (end == endindex ||
455 ubrk_following(breakiterator, end - 1) == end);
456 if (result) {
457 // iterates the individual ces
458 UCollationElements *coleiter = strsrch->utilIter;
459 const UChar *text = strsrch->search->text +
460 start;
461 UErrorCode status = U_ZERO_ERROR;
462 ucol_setText(coleiter, text, end - start, &status);
463 for (int32_t count = 0; count < strsrch->pattern.CELength;
464 count ++) {
465 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
466 if (ce == UCOL_IGNORABLE) {
467 count --;
468 continue;
469 }
470 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
471 return FALSE;
472 }
473 }
474 int32_t nextce = ucol_next(coleiter, &status);
475 while (ucol_getOffset(coleiter) == (end - start)
476 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
477 nextce = ucol_next(coleiter, &status);
478 }
479 if (ucol_getOffset(coleiter) == (end - start)
480 && nextce != UCOL_NULLORDER) {
481 // extra collation elements at the end of the match
482 return FALSE;
483 }
484 }
485 return result;
486 }
487 #endif
488 return TRUE;
489 }
490
491 /**
492 * Getting the next base character offset if current offset is an accent,
493 * or the current offset if the current character contains a base character.
494 * accents the following base character will be returned
495 * @param text string
496 * @param textoffset current offset
497 * @param textlength length of text string
498 * @return the next base character or the current offset
499 * if the current character is contains a base character.
500 */
501 static
502 inline int32_t getNextBaseOffset(const UChar *text,
503 int32_t textoffset,
504 int32_t textlength)
505 {
506 if (textoffset < textlength) {
507 int32_t temp = textoffset;
508 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
509 while (temp < textlength) {
510 int32_t result = temp;
511 if ((getFCD(text, &temp, textlength) >>
512 SECOND_LAST_BYTE_SHIFT_) == 0) {
513 return result;
514 }
515 }
516 return textlength;
517 }
518 }
519 return textoffset;
520 }
521
522 /**
523 * Gets the next base character offset depending on the string search pattern
524 * data
525 * @param strsrch string search data
526 * @param textoffset current offset, one offset away from the last character
527 * to search for.
528 * @return start index of the next base character or the current offset
529 * if the current character is contains a base character.
530 */
531 static
532 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
533 int32_t textoffset)
534 {
535 int32_t textlength = strsrch->search->textLength;
536 if (strsrch->pattern.hasSuffixAccents &&
537 textoffset < textlength) {
538 int32_t temp = textoffset;
539 const UChar *text = strsrch->search->text;
540 UTF_BACK_1(text, 0, temp);
541 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
542 return getNextBaseOffset(text, textoffset, textlength);
543 }
544 }
545 return textoffset;
546 }
547
548 /**
549 * Shifting the collation element iterator position forward to prepare for
550 * a following match. If the last character is a unsafe character, we'll only
551 * shift by 1 to capture contractions, normalization etc.
552 * Internal method, status assumed to be success.
553 * @param text strsrch string search data
554 * @param textoffset start text position to do search
555 * @param ce the text ce which failed the match.
556 * @param patternceindex index of the ce within the pattern ce buffer which
557 * failed the match
558 * @return final offset
559 */
560 static
561 inline int32_t shiftForward(UStringSearch *strsrch,
562 int32_t textoffset,
563 int32_t ce,
564 int32_t patternceindex)
565 {
566 UPattern *pattern = &(strsrch->pattern);
567 if (ce != UCOL_NULLORDER) {
568 int32_t shift = pattern->shift[hash(ce)];
569 // this is to adjust for characters in the middle of the
570 // substring for matching that failed.
571 int32_t adjust = pattern->CELength - patternceindex;
572 if (adjust > 1 && shift >= adjust) {
573 shift -= adjust - 1;
574 }
575 textoffset += shift;
576 }
577 else {
578 textoffset += pattern->defaultShiftSize;
579 }
580
581 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
582 // check for unsafe characters
583 // * if it is the start or middle of a contraction: to be done after
584 // a initial match is found
585 // * thai or lao base consonant character: similar to contraction
586 // * high surrogate character: similar to contraction
587 // * next character is a accent: shift to the next base character
588 return textoffset;
589 }
590
591 /**
592 * sets match not found
593 * @param strsrch string search data
594 */
595 static
596 inline void setMatchNotFound(UStringSearch *strsrch)
597 {
598 // this method resets the match result regardless of the error status.
599 strsrch->search->matchedIndex = USEARCH_DONE;
600 strsrch->search->matchedLength = 0;
601 if (strsrch->search->isForwardSearching) {
602 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
603 }
604 else {
605 setColEIterOffset(strsrch->textIter, 0);
606 }
607 }
608
609 /**
610 * Gets the offset to the next safe point in text.
611 * ie. not the middle of a contraction, swappable characters or supplementary
612 * characters.
613 * @param collator collation sata
614 * @param text string to work with
615 * @param textoffset offset in string
616 * @param textlength length of text string
617 * @return offset to the next safe character
618 */
619 static
620 inline int32_t getNextSafeOffset(const UCollator *collator,
621 const UChar *text,
622 int32_t textoffset,
623 int32_t textlength)
624 {
625 int32_t result = textoffset; // first contraction character
626 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
627 result ++;
628 }
629 return result;
630 }
631
632 /**
633 * This checks for accents in the potential match started with a .
634 * composite character.
635 * This is really painful... we have to check that composite character do not
636 * have any extra accents. We have to normalize the potential match and find
637 * the immediate decomposed character before the match.
638 * The first composite character would have been taken care of by the fcd
639 * checks in checkForwardExactMatch.
640 * This is the slow path after the fcd of the first character and
641 * the last character has been checked by checkForwardExactMatch and we
642 * determine that the potential match has extra non-ignorable preceding
643 * ces.
644 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
645 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
646 * Note here that accents checking are slow and cautioned in the API docs.
647 * Internal method, status assumed to be a success, caller should check status
648 * before calling this method
649 * @param strsrch string search data
650 * @param start index of the potential unfriendly composite character
651 * @param end index of the potential unfriendly composite character
652 * @param status output error status if any.
653 * @return TRUE if there is non-ignorable accents before at the beginning
654 * of the match, FALSE otherwise.
655 */
656
657 static
658 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
659 int32_t end,
660 UErrorCode *status)
661 {
662 UBool result = FALSE;
663 if (strsrch->pattern.hasPrefixAccents) {
664 int32_t length = end - start;
665 int32_t offset = 0;
666 const UChar *text = strsrch->search->text + start;
667
668 UTF_FWD_1(text, offset, length);
669 // we are only concerned with the first composite character
670 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
671 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
672 text, 0, length);
673 if (safeoffset != length) {
674 safeoffset ++;
675 }
676 UChar *norm = NULL;
677 UChar buffer[INITIAL_ARRAY_SIZE_];
678 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
679 buffer, INITIAL_ARRAY_SIZE_,
680 status);
681 if (U_FAILURE(*status)) {
682 return FALSE;
683 }
684 if (size >= INITIAL_ARRAY_SIZE_) {
685 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
686 status);
687 // if allocation failed, status will be set to
688 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
689 // checks for it.
690 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
691 size, status);
692 if (U_FAILURE(*status) && norm != NULL) {
693 uprv_free(norm);
694 return FALSE;
695 }
696 }
697 else {
698 norm = buffer;
699 }
700
701 UCollationElements *coleiter = strsrch->utilIter;
702 ucol_setText(coleiter, norm, size, status);
703 uint32_t firstce = strsrch->pattern.CE[0];
704 UBool ignorable = TRUE;
705 uint32_t ce = UCOL_IGNORABLE;
706 while (U_SUCCESS(*status) && ce != firstce) {
707 offset = ucol_getOffset(coleiter);
708 if (ce != firstce && ce != UCOL_IGNORABLE) {
709 ignorable = FALSE;
710 }
711 ce = ucol_next(coleiter, status);
712 }
713 UChar32 codepoint;
714 UTF_PREV_CHAR(norm, 0, offset, codepoint);
715 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
716
717 if (norm != buffer) {
718 uprv_free(norm);
719 }
720 }
721 }
722
723 return result;
724 }
725
726 /**
727 * Used by exact matches, checks if there are accents before the match.
728 * This is really painful... we have to check that composite characters at
729 * the start of the matches have to not have any extra accents.
730 * We check the FCD of the character first, if it starts with an accent and
731 * the first pattern ce does not match the first ce of the character, we bail.
732 * Otherwise we try normalizing the first composite
733 * character and find the immediate decomposed character before the match to
734 * see if it is an non-ignorable accent.
735 * Now normalizing the first composite character is enough because we ensure
736 * that when the match is passed in here with extra beginning ces, the
737 * first or last ce that match has to occur within the first character.
738 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
739 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
740 * Note here that accents checking are slow and cautioned in the API docs.
741 * @param strsrch string search data
742 * @param start offset
743 * @param end offset
744 * @return TRUE if there are accents on either side of the match,
745 * FALSE otherwise
746 */
747 static
748 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
749 int32_t end)
750 {
751 if (strsrch->pattern.hasPrefixAccents) {
752 UCollationElements *coleiter = strsrch->textIter;
753 UErrorCode status = U_ZERO_ERROR;
754 // we have been iterating forwards previously
755 uint32_t ignorable = TRUE;
756 int32_t firstce = strsrch->pattern.CE[0];
757
758 setColEIterOffset(coleiter, start);
759 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
760 if (U_FAILURE(status)) {
761 return TRUE;
762 }
763 while (ce != firstce) {
764 if (ce != UCOL_IGNORABLE) {
765 ignorable = FALSE;
766 }
767 ce = getCE(strsrch, ucol_next(coleiter, &status));
768 if (U_FAILURE(status)) {
769 return TRUE;
770 }
771 }
772 if (!ignorable && inNormBuf(coleiter)) {
773 // within normalization buffer, discontiguous handled here
774 return TRUE;
775 }
776
777 // within text
778 int32_t temp = start;
779 // original code
780 // accent = (getFCD(strsrch->search->text, &temp,
781 // strsrch->search->textLength)
782 // >> SECOND_LAST_BYTE_SHIFT_);
783 // however this code does not work well with VC7 .net in release mode.
784 // maybe the inlines for getFCD combined with shifting has bugs in
785 // VC7. anyways this is a work around.
786 UBool accent = getFCD(strsrch->search->text, &temp,
787 strsrch->search->textLength) > 0xFF;
788 if (!accent) {
789 return checkExtraMatchAccents(strsrch, start, end, &status);
790 }
791 if (!ignorable) {
792 return TRUE;
793 }
794 if (start > 0) {
795 temp = start;
796 UTF_BACK_1(strsrch->search->text, 0, temp);
797 if (getFCD(strsrch->search->text, &temp,
798 strsrch->search->textLength) & LAST_BYTE_MASK_) {
799 setColEIterOffset(coleiter, start);
800 ce = ucol_previous(coleiter, &status);
801 if (U_FAILURE(status) ||
802 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
803 return TRUE;
804 }
805 }
806 }
807 }
808
809 return FALSE;
810 }
811
812 /**
813 * Used by exact matches, checks if there are accents bounding the match.
814 * Note this is the initial boundary check. If the potential match
815 * starts or ends with composite characters, the accents in those
816 * characters will be determined later.
817 * Not doing backwards iteration here, since discontiguos contraction for
818 * backwards collation element iterator, use up too many characters.
819 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
820 * should fail since there is a acute at the end of \u01FA
821 * Note here that accents checking are slow and cautioned in the API docs.
822 * @param strsrch string search data
823 * @param start offset of match
824 * @param end end offset of the match
825 * @return TRUE if there are accents on either side of the match,
826 * FALSE otherwise
827 */
828 static
829 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
830 int32_t end)
831 {
832 if (strsrch->pattern.hasSuffixAccents) {
833 const UChar *text = strsrch->search->text;
834 int32_t temp = end;
835 int32_t textlength = strsrch->search->textLength;
836 UTF_BACK_1(text, 0, temp);
837 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
838 int32_t firstce = strsrch->pattern.CE[0];
839 UCollationElements *coleiter = strsrch->textIter;
840 UErrorCode status = U_ZERO_ERROR;
841 setColEIterOffset(coleiter, start);
842 while (getCE(strsrch, ucol_next(coleiter, &status)) != firstce) {
843 if (U_FAILURE(status)) {
844 return TRUE;
845 }
846 }
847 int32_t count = 1;
848 while (count < strsrch->pattern.CELength) {
849 if (getCE(strsrch, ucol_next(coleiter, &status))
850 == UCOL_IGNORABLE) {
851 // Thai can give an ignorable here.
852 count --;
853 }
854 if (U_FAILURE(status)) {
855 return TRUE;
856 }
857 count ++;
858 }
859 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
860 if (U_FAILURE(status)) {
861 return TRUE;
862 }
863 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
864 if (ucol_getOffset(coleiter) <= end) {
865 return TRUE;
866 }
867 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
868 return TRUE;
869 }
870 }
871 }
872 }
873 return FALSE;
874 }
875
876 /**
877 * Checks if the offset runs out of the text string
878 * @param offset
879 * @param textlength of the text string
880 * @return TRUE if offset is out of bounds, FALSE otherwise
881 */
882 static
883 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
884 {
885 return offset < 0 || offset > textlength;
886 }
887
888 /**
889 * Checks for identical match
890 * @param strsrch string search data
891 * @param start offset of possible match
892 * @param end offset of possible match
893 * @return TRUE if identical match is found
894 */
895 static
896 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
897 int32_t end)
898 {
899 int32_t length = end - start;
900 if (strsrch->strength != UCOL_IDENTICAL) {
901 return TRUE;
902 }
903
904 UErrorCode status = U_ZERO_ERROR;
905 int decomplength = unorm_decompose(NULL, -1,
906 strsrch->search->text + start, length,
907 FALSE, 0, &status);
908 if (decomplength != unorm_decompose(NULL, -1, strsrch->pattern.text,
909 strsrch->pattern.textLength,
910 FALSE, 0, &status)) {
911 return FALSE;
912 }
913 decomplength ++;
914 UChar *text = (UChar *)uprv_malloc(decomplength * sizeof(UChar));
915 UChar *pattern = (UChar *)uprv_malloc(decomplength * sizeof(UChar));
916 unorm_decompose(text, decomplength, strsrch->search->text + start,
917 length, FALSE, 0, &status);
918 unorm_decompose(pattern, decomplength, strsrch->pattern.text,
919 strsrch->pattern.textLength, FALSE, 0, &status);
920 UBool result = (uprv_memcmp(pattern, text, decomplength * sizeof(UChar))
921 == 0);
922 uprv_free(text);
923 uprv_free(pattern);
924 return result;
925 }
926
927 /**
928 * Checks to see if the match is repeated
929 * @param strsrch string search data
930 * @param start new match start index
931 * @param end new match end index
932 * @return TRUE if the the match is repeated, FALSE otherwise
933 */
934 static
935 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
936 int32_t start,
937 int32_t end)
938 {
939 int32_t lastmatchindex = strsrch->search->matchedIndex;
940 UBool result;
941 if (lastmatchindex == USEARCH_DONE) {
942 return FALSE;
943 }
944 if (strsrch->search->isForwardSearching) {
945 result = start <= lastmatchindex;
946 }
947 else {
948 result = start >= lastmatchindex;
949 }
950 if (!result && !strsrch->search->isOverlap) {
951 if (strsrch->search->isForwardSearching) {
952 result = start < lastmatchindex + strsrch->search->matchedLength;
953 }
954 else {
955 result = end > lastmatchindex;
956 }
957 }
958 return result;
959 }
960
961 /**
962 * Gets the collation element iterator's current offset.
963 * @param coleiter collation element iterator
964 * @param forwards flag TRUE if we are moving in th forwards direction
965 * @return current offset
966 */
967 static
968 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
969 UBool forwards)
970 {
971 int32_t result = ucol_getOffset(coleiter);
972 // intricacies of the the backwards collation element iterator
973 if (!forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
974 result ++;
975 }
976 return result;
977 }
978
979 /**
980 * Checks match for contraction.
981 * If the match ends with a partial contraction we fail.
982 * If the match starts too far off (because of backwards iteration) we try to
983 * chip off the extra characters depending on whether a breakiterator has
984 * been used.
985 * Internal method, error assumed to be success, caller has to check status
986 * before calling this method.
987 * @param strsrch string search data
988 * @param start offset of potential match, to be modified if necessary
989 * @param end offset of potential match, to be modified if necessary
990 * @param status output error status if any
991 * @return TRUE if match passes the contraction test, FALSE otherwise
992 */
993
994 static
995 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
996 int32_t *start,
997 int32_t *end, UErrorCode *status)
998 {
999 UCollationElements *coleiter = strsrch->textIter;
1000 int32_t textlength = strsrch->search->textLength;
1001 int32_t temp = *start;
1002 const UCollator *collator = strsrch->collator;
1003 const UChar *text = strsrch->search->text;
1004 // This part checks if either ends of the match contains potential
1005 // contraction. If so we'll have to iterate through them
1006 // The start contraction needs to be checked since ucol_previous dumps
1007 // all characters till the first safe character into the buffer.
1008 // *start + 1 is used to test for the unsafe characters instead of *start
1009 // because ucol_prev takes all unsafe characters till the first safe
1010 // character ie *start. so by testing *start + 1, we can estimate if
1011 // excess prefix characters has been included in the potential search
1012 // results.
1013 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1014 (*start + 1 < textlength
1015 && ucol_unsafeCP(text[*start + 1], collator))) {
1016 int32_t expansion = getExpansionPrefix(coleiter);
1017 UBool expandflag = expansion > 0;
1018 setColEIterOffset(coleiter, *start);
1019 while (expansion > 0) {
1020 // getting rid of the redundant ce, caused by setOffset.
1021 // since backward contraction/expansion may have extra ces if we
1022 // are in the normalization buffer, hasAccentsBeforeMatch would
1023 // have taken care of it.
1024 // E.g. the character \u01FA will have an expansion of 3, but if
1025 // we are only looking for acute and ring \u030A and \u0301, we'll
1026 // have to skip the first ce in the expansion buffer.
1027 ucol_next(coleiter, status);
1028 if (U_FAILURE(*status)) {
1029 return FALSE;
1030 }
1031 if (ucol_getOffset(coleiter) != temp) {
1032 *start = temp;
1033 temp = ucol_getOffset(coleiter);
1034 }
1035 expansion --;
1036 }
1037
1038 int32_t *patternce = strsrch->pattern.CE;
1039 int32_t patterncelength = strsrch->pattern.CELength;
1040 int32_t count = 0;
1041 while (count < patterncelength) {
1042 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1043 if (ce == UCOL_IGNORABLE) {
1044 continue;
1045 }
1046 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1047 *start = temp;
1048 temp = ucol_getOffset(coleiter);
1049 }
1050 if (U_FAILURE(*status) || ce != patternce[count]) {
1051 (*end) ++;
1052 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1053 return FALSE;
1054 }
1055 count ++;
1056 }
1057 }
1058 return TRUE;
1059 }
1060
1061 /**
1062 * Checks and sets the match information if found.
1063 * Checks
1064 * <ul>
1065 * <li> the potential match does not repeat the previous match
1066 * <li> boundaries are correct
1067 * <li> exact matches has no extra accents
1068 * <li> identical matchesb
1069 * <li> potential match does not end in the middle of a contraction
1070 * <\ul>
1071 * Otherwise the offset will be shifted to the next character.
1072 * Internal method, status assumed to be success, caller has to check status
1073 * before calling this method.
1074 * @param strsrch string search data
1075 * @param textoffset offset in the collation element text. the returned value
1076 * will be the truncated end offset of the match or the new start
1077 * search offset.
1078 * @param status output error status if any
1079 * @return TRUE if the match is valid, FALSE otherwise
1080 */
1081 static
1082 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1083 int32_t *textoffset, UErrorCode *status)
1084 {
1085 UCollationElements *coleiter = strsrch->textIter;
1086 int32_t start = getColElemIterOffset(coleiter, FALSE);
1087
1088 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1089 return FALSE;
1090 }
1091
1092 // this totally matches, however we need to check if it is repeating
1093 if (!isBreakUnit(strsrch, start, *textoffset) ||
1094 checkRepeatedMatch(strsrch, start, *textoffset) ||
1095 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1096 !checkIdentical(strsrch, start, *textoffset) ||
1097 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1098
1099 (*textoffset) ++;
1100 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1101 return FALSE;
1102 }
1103
1104 // totally match, we will get rid of the ending ignorables.
1105 strsrch->search->matchedIndex = start;
1106 strsrch->search->matchedLength = *textoffset - start;
1107 return TRUE;
1108 }
1109
1110 /**
1111 * Getting the previous base character offset, or the current offset if the
1112 * current character is a base character
1113 * @param text string
1114 * @param textoffset one offset after the current character
1115 * @return the offset of the next character after the base character or the first
1116 * composed character with accents
1117 */
1118 static
1119 inline int32_t getPreviousBaseOffset(const UChar *text,
1120 int32_t textoffset)
1121 {
1122 if (textoffset > 0) {
1123 while (TRUE) {
1124 int32_t result = textoffset;
1125 UTF_BACK_1(text, 0, textoffset);
1126 int32_t temp = textoffset;
1127 uint16_t fcd = getFCD(text, &temp, result);
1128 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1129 if (fcd & LAST_BYTE_MASK_) {
1130 return textoffset;
1131 }
1132 return result;
1133 }
1134 if (textoffset == 0) {
1135 return 0;
1136 }
1137 }
1138 }
1139 return textoffset;
1140 }
1141
1142 /**
1143 * Getting the indexes of the accents that are not blocked in the argument
1144 * accent array
1145 * @param accents array of accents in nfd terminated by a 0.
1146 * @param accentsindex array of indexes of the accents that are not blocked
1147 */
1148 static
1149 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1150 {
1151 int32_t index = 0;
1152 int32_t length = u_strlen(accents);
1153 UChar32 codepoint = 0;
1154 int cclass = 0;
1155 int result = 0;
1156 int32_t temp;
1157 while (index < length) {
1158 temp = index;
1159 UTF_NEXT_CHAR(accents, index, length, codepoint);
1160 if (u_getCombiningClass(codepoint) != cclass) {
1161 cclass = u_getCombiningClass(codepoint);
1162 accentsindex[result] = temp;
1163 result ++;
1164 }
1165 }
1166 accentsindex[result] = length;
1167 return result;
1168 }
1169
1170 /**
1171 * Appends 3 UChar arrays to a destination array.
1172 * Creates a new array if we run out of space. The caller will have to
1173 * manually deallocate the newly allocated array.
1174 * Internal method, status assumed to be success, caller has to check status
1175 * before calling this method. destination not to be NULL and has at least
1176 * size destinationlength.
1177 * @param destination target array
1178 * @param destinationlength target array size, returning the appended length
1179 * @param source1 null-terminated first array
1180 * @param source2 second array
1181 * @param source2length length of seond array
1182 * @param source3 null-terminated third array
1183 * @param status error status if any
1184 * @return new destination array, destination if there was no new allocation
1185 */
1186 static
1187 inline UChar * addToUCharArray( UChar *destination,
1188 int32_t *destinationlength,
1189 const UChar *source1,
1190 const UChar *source2,
1191 int32_t source2length,
1192 const UChar *source3,
1193 UErrorCode *status)
1194 {
1195 int32_t source1length = source1 ? u_strlen(source1) : 0;
1196 int32_t source3length = source3 ? u_strlen(source3) : 0;
1197 if (*destinationlength < source1length + source2length + source3length +
1198 1)
1199 {
1200 destination = (UChar *)allocateMemory(
1201 (source1length + source2length + source3length + 1) * sizeof(UChar),
1202 status);
1203 // if error allocating memory, status will be
1204 // U_MEMORY_ALLOCATION_ERROR
1205 if (U_FAILURE(*status)) {
1206 *destinationlength = 0;
1207 return NULL;
1208 }
1209 }
1210 if (source1length != 0) {
1211 uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1212 }
1213 if (source2length != 0) {
1214 uprv_memcpy(destination + source1length, source2,
1215 sizeof(UChar) * source2length);
1216 }
1217 if (source3length != 0) {
1218 uprv_memcpy(destination + source1length + source2length, source3,
1219 sizeof(UChar) * source3length);
1220 }
1221 *destinationlength = source1length + source2length + source3length;
1222 return destination;
1223 }
1224
1225 /**
1226 * Running through a collation element iterator to see if the contents matches
1227 * pattern in string search data
1228 * @param strsrch string search data
1229 * @param coleiter collation element iterator
1230 * @return TRUE if a match if found, FALSE otherwise
1231 */
1232 static
1233 inline UBool checkCollationMatch(const UStringSearch *strsrch,
1234 UCollationElements *coleiter)
1235 {
1236 int patternceindex = strsrch->pattern.CELength;
1237 int32_t *patternce = strsrch->pattern.CE;
1238 UErrorCode status = U_ZERO_ERROR;
1239 while (patternceindex > 0) {
1240 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1241 if (ce == UCOL_IGNORABLE) {
1242 continue;
1243 }
1244 if (U_FAILURE(status) || ce != *patternce) {
1245 return FALSE;
1246 }
1247 patternce ++;
1248 patternceindex --;
1249 }
1250 return TRUE;
1251 }
1252
1253 /**
1254 * Rearranges the front accents to try matching.
1255 * Prefix accents in the text will be grouped according to their combining
1256 * class and the groups will be mixed and matched to try find the perfect
1257 * match with the pattern.
1258 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1259 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1260 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1261 * "\u0301\u0325".
1262 * step 2: check if any of the generated substrings matches the pattern.
1263 * Internal method, status is assumed to be success, caller has to check status
1264 * before calling this method.
1265 * @param strsrch string search match
1266 * @param start first offset of the accents to start searching
1267 * @param end start of the last accent set
1268 * @param status output error status if any
1269 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1270 * offset of the match. Note this start includes all preceding accents.
1271 */
1272 static
1273 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1274 int32_t start,
1275 int32_t end,
1276 UErrorCode *status)
1277 {
1278 const UChar *text = strsrch->search->text;
1279 int32_t textlength = strsrch->search->textLength;
1280 int32_t tempstart = start;
1281
1282 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1283 // die... failed at a base character
1284 return USEARCH_DONE;
1285 }
1286
1287 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1288 start = getPreviousBaseOffset(text, tempstart);
1289
1290 UChar accents[INITIAL_ARRAY_SIZE_];
1291 // normalizing the offensive string
1292 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1293 INITIAL_ARRAY_SIZE_, status);
1294 if (U_FAILURE(*status)) {
1295 return USEARCH_DONE;
1296 }
1297
1298 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1299 int32_t accentsize = getUnblockedAccentIndex(accents,
1300 accentsindex);
1301 int32_t count = (2 << (accentsize - 1)) - 1;
1302 UChar buffer[INITIAL_ARRAY_SIZE_];
1303 UCollationElements *coleiter = strsrch->utilIter;
1304 while (U_SUCCESS(*status) && count > 0) {
1305 UChar *rearrange = strsrch->canonicalPrefixAccents;
1306 // copy the base characters
1307 for (int k = 0; k < accentsindex[0]; k ++) {
1308 *rearrange ++ = accents[k];
1309 }
1310 // forming all possible canonical rearrangement by dropping
1311 // sets of accents
1312 for (int i = 0; i <= accentsize - 1; i ++) {
1313 int32_t mask = 1 << (accentsize - i - 1);
1314 if (count & mask) {
1315 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1316 *rearrange ++ = accents[j];
1317 }
1318 }
1319 }
1320 *rearrange = 0;
1321 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1322 UChar *match = addToUCharArray(buffer, &matchsize,
1323 strsrch->canonicalPrefixAccents,
1324 strsrch->search->text + offset,
1325 end - offset,
1326 strsrch->canonicalSuffixAccents,
1327 status);
1328
1329 // if status is a failure, ucol_setText does nothing.
1330 // run the collator iterator through this match
1331 ucol_setText(coleiter, match, matchsize, status);
1332 if (U_SUCCESS(*status)) {
1333 if (checkCollationMatch(strsrch, coleiter)) {
1334 if (match != buffer) {
1335 uprv_free(match);
1336 }
1337 return start;
1338 }
1339 }
1340 count --;
1341 }
1342 return USEARCH_DONE;
1343 }
1344
1345 /**
1346 * Gets the offset to the safe point in text before textoffset.
1347 * ie. not the middle of a contraction, swappable characters or supplementary
1348 * characters.
1349 * @param collator collation sata
1350 * @param text string to work with
1351 * @param textoffset offset in string
1352 * @param textlength length of text string
1353 * @return offset to the previous safe character
1354 */
1355 static
1356 inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1357 const UChar *text,
1358 int32_t textoffset)
1359 {
1360 int32_t result = textoffset; // first contraction character
1361 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1362 result --;
1363 }
1364 if (result != 0) {
1365 // the first contraction character is consider unsafe here
1366 result --;
1367 }
1368 return result;
1369 }
1370
1371 /**
1372 * Cleaning up after we passed the safe zone
1373 * @param strsrch string search data
1374 * @param safetext safe text array
1375 * @param safebuffer safe text buffer
1376 * @param coleiter collation element iterator for safe text
1377 */
1378 static
1379 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1380 UChar *safebuffer)
1381 {
1382 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1383 {
1384 uprv_free(safetext);
1385 }
1386 }
1387
1388 /**
1389 * Take the rearranged end accents and tries matching. If match failed at
1390 * a seperate preceding set of accents (seperated from the rearranged on by
1391 * at least a base character) then we rearrange the preceding accents and
1392 * tries matching again.
1393 * We allow skipping of the ends of the accent set if the ces do not match.
1394 * However if the failure is found before the accent set, it fails.
1395 * Internal method, status assumed to be success, caller has to check status
1396 * before calling this method.
1397 * @param strsrch string search data
1398 * @param textoffset of the start of the rearranged accent
1399 * @param status output error status if any
1400 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1401 * offset of the match. Note this start includes all preceding accents.
1402 */
1403 static
1404 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1405 int32_t textoffset,
1406 UErrorCode *status)
1407 {
1408 const UChar *text = strsrch->search->text;
1409 const UCollator *collator = strsrch->collator;
1410 int32_t safelength = 0;
1411 UChar *safetext;
1412 int32_t safetextlength;
1413 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1414 UCollationElements *coleiter = strsrch->utilIter;
1415 int32_t safeoffset = textoffset;
1416
1417 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1418 collator)) {
1419 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1420 safelength = textoffset - safeoffset;
1421 safetextlength = INITIAL_ARRAY_SIZE_;
1422 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1423 text + safeoffset, safelength,
1424 strsrch->canonicalSuffixAccents,
1425 status);
1426 }
1427 else {
1428 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1429 safetext = strsrch->canonicalSuffixAccents;
1430 }
1431
1432 // if status is a failure, ucol_setText does nothing
1433 ucol_setText(coleiter, safetext, safetextlength, status);
1434 // status checked in loop below
1435
1436 int32_t *ce = strsrch->pattern.CE;
1437 int32_t celength = strsrch->pattern.CELength;
1438 int ceindex = celength - 1;
1439 UBool isSafe = TRUE; // indication flag for position in safe zone
1440
1441 while (ceindex >= 0) {
1442 int32_t textce = ucol_previous(coleiter, status);
1443 if (U_FAILURE(*status)) {
1444 if (isSafe) {
1445 cleanUpSafeText(strsrch, safetext, safebuffer);
1446 }
1447 return USEARCH_DONE;
1448 }
1449 if (textce == UCOL_NULLORDER) {
1450 // check if we have passed the safe buffer
1451 if (coleiter == strsrch->textIter) {
1452 cleanUpSafeText(strsrch, safetext, safebuffer);
1453 return USEARCH_DONE;
1454 }
1455 cleanUpSafeText(strsrch, safetext, safebuffer);
1456 safetext = safebuffer;
1457 coleiter = strsrch->textIter;
1458 setColEIterOffset(coleiter, safeoffset);
1459 // status checked at the start of the loop
1460 isSafe = FALSE;
1461 continue;
1462 }
1463 textce = getCE(strsrch, textce);
1464 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1465 // do the beginning stuff
1466 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1467 if (isSafe && failedoffset >= safelength) {
1468 // alas... no hope. failed at rearranged accent set
1469 cleanUpSafeText(strsrch, safetext, safebuffer);
1470 return USEARCH_DONE;
1471 }
1472 else {
1473 if (isSafe) {
1474 failedoffset += safeoffset;
1475 cleanUpSafeText(strsrch, safetext, safebuffer);
1476 }
1477
1478 // try rearranging the front accents
1479 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1480 failedoffset, textoffset, status);
1481 if (result != USEARCH_DONE) {
1482 // if status is a failure, ucol_setOffset does nothing
1483 setColEIterOffset(strsrch->textIter, result);
1484 }
1485 if (U_FAILURE(*status)) {
1486 return USEARCH_DONE;
1487 }
1488 return result;
1489 }
1490 }
1491 if (textce == ce[ceindex]) {
1492 ceindex --;
1493 }
1494 }
1495 // set offset here
1496 if (isSafe) {
1497 int32_t result = getColElemIterOffset(coleiter, FALSE);
1498 // sets the text iterator here with the correct expansion and offset
1499 int32_t leftoverces = getExpansionPrefix(coleiter);
1500 cleanUpSafeText(strsrch, safetext, safebuffer);
1501 if (result >= safelength) {
1502 result = textoffset;
1503 }
1504 else {
1505 result += safeoffset;
1506 }
1507 setColEIterOffset(strsrch->textIter, result);
1508 strsrch->textIter->iteratordata_.toReturn =
1509 setExpansionPrefix(strsrch->textIter, leftoverces);
1510 return result;
1511 }
1512
1513 return ucol_getOffset(coleiter);
1514 }
1515
1516 /**
1517 * Trying out the substring and sees if it can be a canonical match.
1518 * This will try normalizing the end accents and arranging them into canonical
1519 * equivalents and check their corresponding ces with the pattern ce.
1520 * Suffix accents in the text will be grouped according to their combining
1521 * class and the groups will be mixed and matched to try find the perfect
1522 * match with the pattern.
1523 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1524 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1525 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1526 * "\u0301\u0325".
1527 * step 2: check if any of the generated substrings matches the pattern.
1528 * Internal method, status assumed to be success, caller has to check status
1529 * before calling this method.
1530 * @param strsrch string search data
1531 * @param textoffset end offset in the collation element text that ends with
1532 * the accents to be rearranged
1533 * @param status error status if any
1534 * @return TRUE if the match is valid, FALSE otherwise
1535 */
1536 static
1537 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1538 int32_t textoffset,
1539 UErrorCode *status)
1540 {
1541 const UChar *text = strsrch->search->text;
1542 int32_t temp = textoffset;
1543 UTF_BACK_1(text, 0, temp);
1544 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1545 UCollationElements *coleiter = strsrch->textIter;
1546 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1547 if (strsrch->pattern.hasPrefixAccents) {
1548 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1549 status);
1550 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1551 setColEIterOffset(coleiter, offset);
1552 return TRUE;
1553 }
1554 }
1555 return FALSE;
1556 }
1557
1558 if (!strsrch->pattern.hasSuffixAccents) {
1559 return FALSE;
1560 }
1561
1562 UChar accents[INITIAL_ARRAY_SIZE_];
1563 // offset to the last base character in substring to search
1564 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1565 // normalizing the offensive string
1566 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1567 0, accents, INITIAL_ARRAY_SIZE_, status);
1568 // status checked in loop below
1569
1570 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1571 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1572
1573 // 2 power n - 1 plus the full set of accents
1574 int32_t count = (2 << (size - 1)) - 1;
1575 while (U_SUCCESS(*status) && count > 0) {
1576 UChar *rearrange = strsrch->canonicalSuffixAccents;
1577 // copy the base characters
1578 for (int k = 0; k < accentsindex[0]; k ++) {
1579 *rearrange ++ = accents[k];
1580 }
1581 // forming all possible canonical rearrangement by dropping
1582 // sets of accents
1583 for (int i = 0; i <= size - 1; i ++) {
1584 int32_t mask = 1 << (size - i - 1);
1585 if (count & mask) {
1586 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1587 *rearrange ++ = accents[j];
1588 }
1589 }
1590 }
1591 *rearrange = 0;
1592 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1593 status);
1594 if (offset != USEARCH_DONE) {
1595 return TRUE; // match found
1596 }
1597 count --;
1598 }
1599 return FALSE;
1600 }
1601
1602 /**
1603 * Gets the previous base character offset depending on the string search
1604 * pattern data
1605 * @param strsrch string search data
1606 * @param textoffset current offset, current character
1607 * @return the offset of the next character after this base character or itself
1608 * if it is a composed character with accents
1609 */
1610 static
1611 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1612 int32_t textoffset)
1613 {
1614 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1615 const UChar *text = strsrch->search->text;
1616 int32_t offset = textoffset;
1617 if (getFCD(text, &offset, strsrch->search->textLength) >>
1618 SECOND_LAST_BYTE_SHIFT_) {
1619 return getPreviousBaseOffset(text, textoffset);
1620 }
1621 }
1622 return textoffset;
1623 }
1624
1625 /**
1626 * Checks match for contraction.
1627 * If the match ends with a partial contraction we fail.
1628 * If the match starts too far off (because of backwards iteration) we try to
1629 * chip off the extra characters
1630 * Internal method, status assumed to be success, caller has to check status
1631 * before calling this method.
1632 * @param strsrch string search data
1633 * @param start offset of potential match, to be modified if necessary
1634 * @param end offset of potential match, to be modified if necessary
1635 * @param status output error status if any
1636 * @return TRUE if match passes the contraction test, FALSE otherwise
1637 */
1638 static
1639 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1640 int32_t *start,
1641 int32_t *end,
1642 UErrorCode *status)
1643 {
1644 UCollationElements *coleiter = strsrch->textIter;
1645 int32_t textlength = strsrch->search->textLength;
1646 int32_t temp = *start;
1647 const UCollator *collator = strsrch->collator;
1648 const UChar *text = strsrch->search->text;
1649 // This part checks if either ends of the match contains potential
1650 // contraction. If so we'll have to iterate through them
1651 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1652 (*start + 1 < textlength
1653 && ucol_unsafeCP(text[*start + 1], collator))) {
1654 int32_t expansion = getExpansionPrefix(coleiter);
1655 UBool expandflag = expansion > 0;
1656 setColEIterOffset(coleiter, *start);
1657 while (expansion > 0) {
1658 // getting rid of the redundant ce, caused by setOffset.
1659 // since backward contraction/expansion may have extra ces if we
1660 // are in the normalization buffer, hasAccentsBeforeMatch would
1661 // have taken care of it.
1662 // E.g. the character \u01FA will have an expansion of 3, but if
1663 // we are only looking for acute and ring \u030A and \u0301, we'll
1664 // have to skip the first ce in the expansion buffer.
1665 ucol_next(coleiter, status);
1666 if (U_FAILURE(*status)) {
1667 return FALSE;
1668 }
1669 if (ucol_getOffset(coleiter) != temp) {
1670 *start = temp;
1671 temp = ucol_getOffset(coleiter);
1672 }
1673 expansion --;
1674 }
1675
1676 int32_t *patternce = strsrch->pattern.CE;
1677 int32_t patterncelength = strsrch->pattern.CELength;
1678 int32_t count = 0;
1679 int32_t textlength = strsrch->search->textLength;
1680 while (count < patterncelength) {
1681 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1682 // status checked below, note that if status is a failure
1683 // ucol_next returns UCOL_NULLORDER
1684 if (ce == UCOL_IGNORABLE) {
1685 continue;
1686 }
1687 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1688 *start = temp;
1689 temp = ucol_getOffset(coleiter);
1690 }
1691
1692 if (count == 0 && ce != patternce[0]) {
1693 // accents may have extra starting ces, this occurs when a
1694 // pure accent pattern is matched without rearrangement
1695 // text \u0325\u0300 and looking for \u0300
1696 int32_t expected = patternce[0];
1697 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1698 ce = getCE(strsrch, ucol_next(coleiter, status));
1699 while (U_SUCCESS(*status) && ce != expected &&
1700 ce != UCOL_NULLORDER &&
1701 ucol_getOffset(coleiter) <= *end) {
1702 ce = getCE(strsrch, ucol_next(coleiter, status));
1703 }
1704 }
1705 }
1706 if (U_FAILURE(*status) || ce != patternce[count]) {
1707 (*end) ++;
1708 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1709 return FALSE;
1710 }
1711 count ++;
1712 }
1713 }
1714 return TRUE;
1715 }
1716
1717 /**
1718 * Checks and sets the match information if found.
1719 * Checks
1720 * <ul>
1721 * <li> the potential match does not repeat the previous match
1722 * <li> boundaries are correct
1723 * <li> potential match does not end in the middle of a contraction
1724 * <li> identical matches
1725 * <\ul>
1726 * Otherwise the offset will be shifted to the next character.
1727 * Internal method, status assumed to be success, caller has to check the
1728 * status before calling this method.
1729 * @param strsrch string search data
1730 * @param textoffset offset in the collation element text. the returned value
1731 * will be the truncated end offset of the match or the new start
1732 * search offset.
1733 * @param status output error status if any
1734 * @return TRUE if the match is valid, FALSE otherwise
1735 */
1736 static
1737 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1738 int32_t *textoffset,
1739 UErrorCode *status)
1740 {
1741 // to ensure that the start and ends are not composite characters
1742 UCollationElements *coleiter = strsrch->textIter;
1743 // if we have a canonical accent match
1744 if ((strsrch->pattern.hasSuffixAccents &&
1745 strsrch->canonicalSuffixAccents[0]) ||
1746 (strsrch->pattern.hasPrefixAccents &&
1747 strsrch->canonicalPrefixAccents[0])) {
1748 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1749 strsrch,
1750 ucol_getOffset(coleiter));
1751 strsrch->search->matchedLength = *textoffset -
1752 strsrch->search->matchedIndex;
1753 return TRUE;
1754 }
1755
1756 int32_t start = getColElemIterOffset(coleiter, FALSE);
1757 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1758 status) || U_FAILURE(*status)) {
1759 return FALSE;
1760 }
1761
1762 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1763 // this totally matches, however we need to check if it is repeating
1764 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1765 !isBreakUnit(strsrch, start, *textoffset) ||
1766 !checkIdentical(strsrch, start, *textoffset)) {
1767 (*textoffset) ++;
1768 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1769 strsrch->search->textLength);
1770 return FALSE;
1771 }
1772
1773 strsrch->search->matchedIndex = start;
1774 strsrch->search->matchedLength = *textoffset - start;
1775 return TRUE;
1776 }
1777
1778 /**
1779 * Shifting the collation element iterator position forward to prepare for
1780 * a preceding match. If the first character is a unsafe character, we'll only
1781 * shift by 1 to capture contractions, normalization etc.
1782 * Internal method, status assumed to be success, caller has to check status
1783 * before calling this method.
1784 * @param text strsrch string search data
1785 * @param textoffset start text position to do search
1786 * @param ce the text ce which failed the match.
1787 * @param patternceindex index of the ce within the pattern ce buffer which
1788 * failed the match
1789 * @return final offset
1790 */
1791 static
1792 inline int32_t reverseShift(UStringSearch *strsrch,
1793 int32_t textoffset,
1794 int32_t ce,
1795 int32_t patternceindex)
1796 {
1797 if (strsrch->search->isOverlap) {
1798 if (textoffset != strsrch->search->textLength) {
1799 textoffset --;
1800 }
1801 else {
1802 textoffset -= strsrch->pattern.defaultShiftSize;
1803 }
1804 }
1805 else {
1806 if (ce != UCOL_NULLORDER) {
1807 int32_t shift = strsrch->pattern.backShift[hash(ce)];
1808
1809 // this is to adjust for characters in the middle of the substring
1810 // for matching that failed.
1811 int32_t adjust = patternceindex;
1812 if (adjust > 1 && shift > adjust) {
1813 shift -= adjust - 1;
1814 }
1815 textoffset -= shift;
1816 }
1817 else {
1818 textoffset -= strsrch->pattern.defaultShiftSize;
1819 }
1820 }
1821 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1822 return textoffset;
1823 }
1824
/**
* Checks a potential backwards match for contractions at its start.
* If the match starts with a partial contraction we fail.
* Nothing is verified (and TRUE is returned) unless *start lies inside the
* text and the code unit at *start is reported as unsafe by ucol_unsafeCP.
* Otherwise the text iterator is repositioned at *end and walked backwards,
* first discarding the pending expansion ces and then comparing every
* non-ignorable ce against the pattern ces from last to first.
* Internal method, status assumed to be success, caller has to check status
* before calling this method.
* @param strsrch string search data
* @param start offset of potential match, to be modified if necessary. On a
*        ce mismatch it is decremented and then replaced by the result of
*        getPreviousBaseOffset.
* @param end offset of potential match, to be modified if necessary
* @param status output error status if any
* @return TRUE if match passes the contraction test, FALSE otherwise
*/
static
UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
                                         int32_t *start,
                                         int32_t *end, UErrorCode *status)
{
    UCollationElements *coleiter = strsrch->textIter;
    int32_t textlength = strsrch->search->textLength;
    // last iterator offset seen while walking backwards from *end
    int32_t temp = *end;
    const UCollator *collator = strsrch->collator;
    const UChar *text = strsrch->search->text;
    // This part checks if the start of the match contains a potential
    // contraction. If so we'll have to iterate through them.
    // Since we used ucol_next while previously looking for the potential
    // match, this guarantees that our end will not be a partial contraction,
    // or a partial supplementary character.
    if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
        // number of ces left over in the iterator before it is repositioned
        int32_t expansion = getExpansionSuffix(coleiter);
        UBool expandflag = expansion > 0;
        setColEIterOffset(coleiter, *end);
        while (U_SUCCESS(*status) && expansion > 0) {
            // getting rid of the redundant ce
            // since forward contraction/expansion may have extra ces
            // if we are in the normalization buffer, hasAccentsBeforeMatch
            // would have taken care of it.
            // E.g. the character \u01FA will have an expansion of 3, but if
            // we are only looking for A ring A\u030A, we'll have to skip the
            // last ce in the expansion buffer
            ucol_previous(coleiter, status);
            if (U_FAILURE(*status)) {
                return FALSE;
            }
            // the iterator moved to a different text offset: remember the
            // previous one as the end and track the new position
            if (ucol_getOffset(coleiter) != temp) {
                *end = temp;
                temp = ucol_getOffset(coleiter);
            }
            expansion --;
        }

        int32_t *patternce = strsrch->pattern.CE;
        int32_t patterncelength = strsrch->pattern.CELength;
        // index (+1) of the pattern ce expected next, counting down
        int32_t count = patterncelength;
        while (count > 0) {
            int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
            // status checked below, note that if status is a failure
            // ucol_previous returns UCOL_NULLORDER
            if (ce == UCOL_IGNORABLE) {
                continue;
            }
            // NOTE(review): count is always > 0 inside this loop, so the
            // "count == 0" test below can never be true as written and this
            // branch is unreachable - confirm the intended condition.
            if (expandflag && count == 0 &&
                getColElemIterOffset(coleiter, FALSE) != temp) {
                *end = temp;
                temp = ucol_getOffset(coleiter);
            }
            if (U_FAILURE(*status) || ce != patternce[count - 1]) {
                // mismatch (or error): move the start to the preceding base
                // offset so the caller resumes the search from there
                (*start) --;
                *start = getPreviousBaseOffset(text, *start);
                return FALSE;
            }
            count --;
        }
    }
    return TRUE;
}
1899
1900 /**
1901 * Checks and sets the match information if found.
1902 * Checks
1903 * <ul>
1904 * <li> the current match does not repeat the last match
1905 * <li> boundaries are correct
1906 * <li> exact matches has no extra accents
1907 * <li> identical matches
1908 * <\ul>
1909 * Otherwise the offset will be shifted to the preceding character.
1910 * Internal method, status assumed to be success, caller has to check status
1911 * before calling this method.
1912 * @param strsrch string search data
1913 * @param collator
1914 * @param coleiter collation element iterator
1915 * @param text string
1916 * @param textoffset offset in the collation element text. the returned value
1917 * will be the truncated start offset of the match or the new start
1918 * search offset.
1919 * @param status output error status if any
1920 * @return TRUE if the match is valid, FALSE otherwise
1921 */
1922 static
1923 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
1924 int32_t *textoffset,
1925 UErrorCode *status)
1926 {
1927 // to ensure that the start and ends are not composite characters
1928 int32_t end = ucol_getOffset(strsrch->textIter);
1929 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
1930 || U_FAILURE(*status)) {
1931 return FALSE;
1932 }
1933
1934 // this totally matches, however we need to check if it is repeating
1935 // the old match
1936 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
1937 !isBreakUnit(strsrch, *textoffset, end) ||
1938 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
1939 !checkIdentical(strsrch, *textoffset, end) ||
1940 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
1941 (*textoffset) --;
1942 *textoffset = getPreviousBaseOffset(strsrch->search->text,
1943 *textoffset);
1944 return FALSE;
1945 }
1946 strsrch->search->matchedIndex = *textoffset;
1947 strsrch->search->matchedLength = end - *textoffset;
1948 return TRUE;
1949 }
1950
1951 /**
1952 * Rearranges the end accents to try matching.
1953 * Suffix accents in the text will be grouped according to their combining
1954 * class and the groups will be mixed and matched to try find the perfect
1955 * match with the pattern.
1956 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1957 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1958 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1959 * "\u0301\u0325".
1960 * step 2: check if any of the generated substrings matches the pattern.
1961 * Internal method, status assumed to be success, user has to check status
1962 * before calling this method.
1963 * @param strsrch string search match
1964 * @param start offset of the first base character
1965 * @param end start of the last accent set
1966 * @param status only error status if any
1967 * @return USEARCH_DONE if a match is not found, otherwise return the ending
1968 * offset of the match. Note this start includes all following accents.
1969 */
1970 static
1971 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
1972 int32_t start,
1973 int32_t end,
1974 UErrorCode *status)
1975 {
1976 const UChar *text = strsrch->search->text;
1977 int32_t tempend = end;
1978
1979 UTF_BACK_1(text, 0, tempend);
1980 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
1981 LAST_BYTE_MASK_)) {
1982 // die... failed at a base character
1983 return USEARCH_DONE;
1984 }
1985 end = getNextBaseOffset(text, end, strsrch->search->textLength);
1986
1987 if (U_SUCCESS(*status)) {
1988 UChar accents[INITIAL_ARRAY_SIZE_];
1989 int32_t offset = getPreviousBaseOffset(text, end);
1990 // normalizing the offensive string
1991 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
1992 INITIAL_ARRAY_SIZE_, status);
1993
1994 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1995 int32_t accentsize = getUnblockedAccentIndex(accents,
1996 accentsindex);
1997 int32_t count = (2 << (accentsize - 1)) - 1;
1998 UChar buffer[INITIAL_ARRAY_SIZE_];
1999 UCollationElements *coleiter = strsrch->utilIter;
2000 while (U_SUCCESS(*status) && count > 0) {
2001 UChar *rearrange = strsrch->canonicalSuffixAccents;
2002 // copy the base characters
2003 for (int k = 0; k < accentsindex[0]; k ++) {
2004 *rearrange ++ = accents[k];
2005 }
2006 // forming all possible canonical rearrangement by dropping
2007 // sets of accents
2008 for (int i = 0; i <= accentsize - 1; i ++) {
2009 int32_t mask = 1 << (accentsize - i - 1);
2010 if (count & mask) {
2011 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2012 *rearrange ++ = accents[j];
2013 }
2014 }
2015 }
2016 *rearrange = 0;
2017 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2018 UChar *match = addToUCharArray(buffer, &matchsize,
2019 strsrch->canonicalPrefixAccents,
2020 strsrch->search->text + start,
2021 offset - start,
2022 strsrch->canonicalSuffixAccents,
2023 status);
2024
2025 // run the collator iterator through this match
2026 // if status is a failure ucol_setText does nothing
2027 ucol_setText(coleiter, match, matchsize, status);
2028 if (U_SUCCESS(*status)) {
2029 if (checkCollationMatch(strsrch, coleiter)) {
2030 if (match != buffer) {
2031 uprv_free(match);
2032 }
2033 return end;
2034 }
2035 }
2036 count --;
2037 }
2038 }
2039 return USEARCH_DONE;
2040 }
2041
/**
* Takes the rearranged start accents and tries matching. If the match fails
* at a separate following set of accents (separated from the rearranged ones
* by at least a base character) then we rearrange the following accents and
* try matching again.
* We allow skipping of the ends of the accent set if the ces do not match.
* However if the failure is found before the accent set, it fails.
* The comparison first runs over a temporary "safe" text (the rearranged
* prefix accents, optionally followed by the original text up to the next
* safe offset) using the utility iterator, and switches to the real text
* iterator once that temporary text is exhausted.
* Internal method, status assumed to be success, caller has to check status
* before calling this method.
* @param strsrch string search data
* @param textoffset of the ends of the rearranged accent
* @param status output error status if any
* @return USEARCH_DONE if a match is not found, otherwise return the ending
*         offset of the match. Note this start includes all following accents.
*/
static
int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
                                       int32_t textoffset,
                                       UErrorCode *status)
{
    const UChar *text = strsrch->search->text;
    const UCollator *collator = strsrch->collator;
    int32_t safelength = 0;
    UChar *safetext;
    int32_t safetextlength;
    UChar safebuffer[INITIAL_ARRAY_SIZE_];
    // offset in the real text at which the temporary safe text ends
    int32_t safeoffset = textoffset;

    // if the last rearranged prefix accent is unsafe, extend the temporary
    // text with the original text up to the next safe offset
    if (textoffset &&
        ucol_unsafeCP(strsrch->canonicalPrefixAccents[
                                 u_strlen(strsrch->canonicalPrefixAccents) - 1
                                         ], collator)) {
        safeoffset = getNextSafeOffset(collator, text, textoffset,
                                       strsrch->search->textLength);
        safelength = safeoffset - textoffset;
        safetextlength = INITIAL_ARRAY_SIZE_;
        safetext = addToUCharArray(safebuffer, &safetextlength,
                                   strsrch->canonicalPrefixAccents,
                                   text + textoffset, safelength,
                                   NULL, status);
    }
    else {
        // the rearranged accents alone form the temporary text
        safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
        safetext = strsrch->canonicalPrefixAccents;
    }

    UCollationElements *coleiter = strsrch->utilIter;
    // if status is a failure, ucol_setText does nothing
    ucol_setText(coleiter, safetext, safetextlength, status);
    // status checked in loop below

    int32_t *ce = strsrch->pattern.CE;
    int32_t celength = strsrch->pattern.CELength;
    int ceindex = 0;
    UBool isSafe = TRUE; // safe zone indication flag for position
    int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);

    while (ceindex < celength) {
        int32_t textce = ucol_next(coleiter, status);
        if (U_FAILURE(*status)) {
            if (isSafe) {
                cleanUpSafeText(strsrch, safetext, safebuffer);
            }
            return USEARCH_DONE;
        }
        if (textce == UCOL_NULLORDER) {
            // check if we have passed the safe buffer
            if (coleiter == strsrch->textIter) {
                // already iterating over the real text: it is exhausted too
                cleanUpSafeText(strsrch, safetext, safebuffer);
                return USEARCH_DONE;
            }
            // temporary text exhausted: release it and carry on in the real
            // text from the point where the temporary text ended
            cleanUpSafeText(strsrch, safetext, safebuffer);
            safetext = safebuffer;
            coleiter = strsrch->textIter;
            setColEIterOffset(coleiter, safeoffset);
            // status checked at the start of the loop
            isSafe = FALSE;
            continue;
        }
        textce = getCE(strsrch, textce);
        if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
            // do the beginning stuff
            int32_t failedoffset = ucol_getOffset(coleiter);
            if (isSafe && failedoffset <= prefixlength) {
                // alas... no hope. failed at rearranged accent set
                cleanUpSafeText(strsrch, safetext, safebuffer);
                return USEARCH_DONE;
            }
            else {
                if (isSafe) {
                    // NOTE(review): converts the offset inside the temporary
                    // text into a value relative to safeoffset - confirm the
                    // intended mapping back to real text offsets
                    failedoffset = safeoffset - failedoffset;
                    cleanUpSafeText(strsrch, safetext, safebuffer);
                }

                // try rearranging the end accents
                int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
                                        textoffset, failedoffset, status);
                if (result != USEARCH_DONE) {
                    // position the real text iterator at the reported offset
                    setColEIterOffset(strsrch->textIter, result);
                }
                if (U_FAILURE(*status)) {
                    return USEARCH_DONE;
                }
                return result;
            }
        }
        if (textce == ce[ceindex]) {
            ceindex ++;
        }
    }
    // set offset here
    if (isSafe) {
        // all pattern ces were matched inside the temporary text
        int32_t result = ucol_getOffset(coleiter);
        // sets the text iterator here with the correct expansion and offset
        int32_t leftoverces = getExpansionSuffix(coleiter);
        cleanUpSafeText(strsrch, safetext, safebuffer);
        if (result <= prefixlength) {
            result = textoffset;
        }
        else {
            result = textoffset + (safeoffset - result);
        }
        setColEIterOffset(strsrch->textIter, result);
        setExpansionSuffix(strsrch->textIter, leftoverces);
        return result;
    }

    // finished inside the real text: its iterator offset is the result
    return ucol_getOffset(coleiter);
}
2172
/**
* Trying out the substring and sees if it can be a canonical match.
* This will try normalizing the starting accents and arranging them into
* canonical equivalents and check their corresponding ces with the pattern ce.
* Prefix accents in the text will be grouped according to their combining
* class and the groups will be mixed and matched to try find the perfect
* match with the pattern.
* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
* step 1: split "\u030A\u0301\u0325" into 6 other type of potential accent
*         substrings
*         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
*         "\u0301\u0325".
* step 2: check if any of the generated substrings matches the pattern.
* The rearranged accents are written to strsrch->canonicalPrefixAccents.
* Internal method, status assumed to be success, caller has to check status
* before calling this method.
* @param strsrch string search data
* @param textoffset start offset in the collation element text that starts
*        with the accents to be rearranged
* @param status output error status if any
* @return TRUE if the match is valid, FALSE otherwise
*/
static
UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
                               int32_t textoffset,
                               UErrorCode *status)
{
    const UChar *text = strsrch->search->text;
    // scratch copy: getFCD takes the offset by pointer
    int32_t temp = textoffset;
    int32_t textlength = strsrch->search->textLength;
    // upper byte of the FCD value is zero: presumably the text at textoffset
    // does not start with an accent, so only the suffix accents can be
    // rearranged
    if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
        UCollationElements *coleiter = strsrch->textIter;
        int32_t offset = ucol_getOffset(coleiter);
        if (strsrch->pattern.hasSuffixAccents) {
            offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
                                                    offset, status);
            if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
                // leave the text iterator at the end of the match
                setColEIterOffset(coleiter, offset);
                return TRUE;
            }
        }
        return FALSE;
    }

    // the text starts with accents but the pattern has no prefix accents
    if (!strsrch->pattern.hasPrefixAccents) {
        return FALSE;
    }

    UChar accents[INITIAL_ARRAY_SIZE_];
    // offset to the last base character in substring to search
    int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
    // normalizing the offensive string
    unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
                    0, accents, INITIAL_ARRAY_SIZE_, status);
    // status checked in loop

    int32_t accentsindex[INITIAL_ARRAY_SIZE_];
    int32_t size = getUnblockedAccentIndex(accents, accentsindex);

    // 2 power n - 1 plus the full set of accents
    // each bit of count selects one accent group
    int32_t count = (2 << (size - 1)) - 1;
    while (U_SUCCESS(*status) && count > 0) {
        UChar *rearrange = strsrch->canonicalPrefixAccents;
        // copy the base characters
        for (int k = 0; k < accentsindex[0]; k ++) {
            *rearrange ++ = accents[k];
        }
        // forming all possible canonical rearrangement by dropping
        // sets of accents
        for (int i = 0; i <= size - 1; i ++) {
            int32_t mask = 1 << (size - i - 1);
            if (count & mask) {
                for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
                    *rearrange ++ = accents[j];
                }
            }
        }
        *rearrange = 0;
        int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
                                                        baseoffset, status);
        if (offset != USEARCH_DONE) {
            return TRUE; // match found
        }
        count --;
    }
    return FALSE;
}
2258
/**
* Checks a potential backwards canonical match for contractions at its start.
* If the match starts with a partial contraction we fail.
* Nothing is verified (and TRUE is returned) unless *start lies inside the
* text and the code unit at *start is reported as unsafe by ucol_unsafeCP.
* Otherwise the text iterator is repositioned at *end and walked backwards,
* first discarding the pending expansion ces and then comparing every
* non-ignorable ce against the pattern ces from last to first. Unlike the
* exact variant, extra ces in front of the last pattern ce may be skipped
* when the text ends with an accent.
* Internal method, status assumed to be success, caller has to check status
* before calling this method.
* @param strsrch string search data
* @param start offset of potential match, to be modified if necessary. On a
*        ce mismatch it is decremented and then replaced by the result of
*        getPreviousBaseOffset.
* @param end offset of potential match, to be modified if necessary
* @param status only error status if any
* @return TRUE if match passes the contraction test, FALSE otherwise
*/
static
UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
                                             int32_t *start,
                                             int32_t *end, UErrorCode *status)
{
    UCollationElements *coleiter = strsrch->textIter;
    int32_t textlength = strsrch->search->textLength;
    // last iterator offset seen while walking backwards from *end
    int32_t temp = *end;
    const UCollator *collator = strsrch->collator;
    const UChar *text = strsrch->search->text;
    // This part checks if the start of the match contains a potential
    // contraction. If so we'll have to iterate through them.
    // Since we used ucol_next while previously looking for the potential
    // match, this guarantees that our end will not be a partial contraction,
    // or a partial supplementary character.
    if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
        // number of ces left over in the iterator before it is repositioned
        int32_t expansion = getExpansionSuffix(coleiter);
        UBool expandflag = expansion > 0;
        setColEIterOffset(coleiter, *end);
        while (expansion > 0) {
            // getting rid of the redundant ce
            // since forward contraction/expansion may have extra ces
            // if we are in the normalization buffer, hasAccentsBeforeMatch
            // would have taken care of it.
            // E.g. the character \u01FA will have an expansion of 3, but if
            // we are only looking for A ring A\u030A, we'll have to skip the
            // last ce in the expansion buffer
            ucol_previous(coleiter, status);
            if (U_FAILURE(*status)) {
                return FALSE;
            }
            // the iterator moved to a different text offset: remember the
            // previous one as the end and track the new position
            if (ucol_getOffset(coleiter) != temp) {
                *end = temp;
                temp = ucol_getOffset(coleiter);
            }
            expansion --;
        }

        int32_t *patternce = strsrch->pattern.CE;
        int32_t patterncelength = strsrch->pattern.CELength;
        // index (+1) of the pattern ce expected next, counting down
        int32_t count = patterncelength;
        while (count > 0) {
            int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
            // status checked below, note that if status is a failure
            // ucol_previous returns UCOL_NULLORDER
            if (ce == UCOL_IGNORABLE) {
                continue;
            }
            // NOTE(review): count is always > 0 inside this loop, so the
            // "count == 0" test below can never be true as written and this
            // branch is unreachable - confirm the intended condition.
            if (expandflag && count == 0 &&
                getColElemIterOffset(coleiter, FALSE) != temp) {
                *end = temp;
                temp = ucol_getOffset(coleiter);
            }
            if (count == patterncelength &&
                ce != patternce[patterncelength - 1]) {
                // accents may have extra starting ces, this occurs when a
                // pure accent pattern is matched without rearrangement
                int32_t expected = patternce[patterncelength - 1];
                // note: this moves *end itself back by one code point, and
                // getFCD receives the same pointer
                UTF_BACK_1(text, 0, *end);
                if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
                    // keep stepping backwards until the expected last
                    // pattern ce shows up, the iterator runs out, or the
                    // iterator offset exceeds *start
                    ce = getCE(strsrch, ucol_previous(coleiter, status));
                    while (U_SUCCESS(*status) && ce != expected &&
                           ce != UCOL_NULLORDER &&
                           ucol_getOffset(coleiter) <= *start) {
                        ce = getCE(strsrch, ucol_previous(coleiter, status));
                    }
                }
            }
            if (U_FAILURE(*status) || ce != patternce[count - 1]) {
                // mismatch (or error): move the start to the preceding base
                // offset so the caller resumes the search from there
                (*start) --;
                *start = getPreviousBaseOffset(text, *start);
                return FALSE;
            }
            count --;
        }
    }
    return TRUE;
}
2348
2349 /**
2350 * Checks and sets the match information if found.
2351 * Checks
2352 * <ul>
2353 * <li> the potential match does not repeat the previous match
2354 * <li> boundaries are correct
2355 * <li> potential match does not end in the middle of a contraction
2356 * <li> identical matches
2357 * <\ul>
2358 * Otherwise the offset will be shifted to the next character.
2359 * Internal method, status assumed to be success, caller has to check status
2360 * before calling this method.
2361 * @param strsrch string search data
2362 * @param textoffset offset in the collation element text. the returned value
2363 * will be the truncated start offset of the match or the new start
2364 * search offset.
2365 * @param status only error status if any
2366 * @return TRUE if the match is valid, FALSE otherwise
2367 */
2368 static
2369 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2370 int32_t *textoffset,
2371 UErrorCode *status)
2372 {
2373 // to ensure that the start and ends are not composite characters
2374 UCollationElements *coleiter = strsrch->textIter;
2375 // if we have a canonical accent match
2376 if ((strsrch->pattern.hasSuffixAccents &&
2377 strsrch->canonicalSuffixAccents[0]) ||
2378 (strsrch->pattern.hasPrefixAccents &&
2379 strsrch->canonicalPrefixAccents[0])) {
2380 strsrch->search->matchedIndex = *textoffset;
2381 strsrch->search->matchedLength =
2382 getNextUStringSearchBaseOffset(strsrch,
2383 getColElemIterOffset(coleiter, FALSE))
2384 - *textoffset;
2385 return TRUE;
2386 }
2387
2388 int32_t end = ucol_getOffset(coleiter);
2389 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2390 status) ||
2391 U_FAILURE(*status)) {
2392 return FALSE;
2393 }
2394
2395 end = getNextUStringSearchBaseOffset(strsrch, end);
2396 // this totally matches, however we need to check if it is repeating
2397 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2398 !isBreakUnit(strsrch, *textoffset, end) ||
2399 !checkIdentical(strsrch, *textoffset, end)) {
2400 (*textoffset) --;
2401 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2402 *textoffset);
2403 return FALSE;
2404 }
2405
2406 strsrch->search->matchedIndex = *textoffset;
2407 strsrch->search->matchedLength = end - *textoffset;
2408 return TRUE;
2409 }
2410
2411 // constructors and destructor -------------------------------------------
2412
2413 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2414 int32_t patternlength,
2415 const UChar *text,
2416 int32_t textlength,
2417 const char *locale,
2418 UBreakIterator *breakiter,
2419 UErrorCode *status)
2420 {
2421 if (U_FAILURE(*status)) {
2422 return NULL;
2423 }
2424 #if UCONFIG_NO_BREAK_ITERATION
2425 if (breakiter != NULL) {
2426 *status = U_UNSUPPORTED_ERROR;
2427 return NULL;
2428 }
2429 #endif
2430 if (locale) {
2431 // ucol_open internally checks for status
2432 UCollator *collator = ucol_open(locale, status);
2433 // pattern, text checks are done in usearch_openFromCollator
2434 UStringSearch *result = usearch_openFromCollator(pattern,
2435 patternlength, text, textlength,
2436 collator, breakiter, status);
2437
2438 if (result == NULL || U_FAILURE(*status)) {
2439 if (collator) {
2440 ucol_close(collator);
2441 }
2442 return NULL;
2443 }
2444 else {
2445 result->ownCollator = TRUE;
2446 }
2447 return result;
2448 }
2449 *status = U_ILLEGAL_ARGUMENT_ERROR;
2450 return NULL;
2451 }
2452
2453 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2454 const UChar *pattern,
2455 int32_t patternlength,
2456 const UChar *text,
2457 int32_t textlength,
2458 const UCollator *collator,
2459 UBreakIterator *breakiter,
2460 UErrorCode *status)
2461 {
2462 if (U_FAILURE(*status)) {
2463 return NULL;
2464 }
2465 #if UCONFIG_NO_BREAK_ITERATION
2466 if (breakiter != NULL) {
2467 *status = U_UNSUPPORTED_ERROR;
2468 return NULL;
2469 }
2470 #endif
2471 if (pattern == NULL || text == NULL || collator == NULL) {
2472 *status = U_ILLEGAL_ARGUMENT_ERROR;
2473 }
2474
2475 // string search does not really work when numeric collation is turned on
2476 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2477 *status = U_UNSUPPORTED_ERROR;
2478 }
2479
2480 if (U_SUCCESS(*status)) {
2481 initializeFCD(status);
2482 if (U_FAILURE(*status)) {
2483 return NULL;
2484 }
2485
2486 UStringSearch *result;
2487 if (textlength == -1) {
2488 textlength = u_strlen(text);
2489 }
2490 if (patternlength == -1) {
2491 patternlength = u_strlen(pattern);
2492 }
2493 if (textlength <= 0 || patternlength <= 0) {
2494 *status = U_ILLEGAL_ARGUMENT_ERROR;
2495 return NULL;
2496 }
2497
2498 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2499 if (result == NULL) {
2500 *status = U_MEMORY_ALLOCATION_ERROR;
2501 return NULL;
2502 }
2503
2504 result->collator = collator;
2505 result->strength = ucol_getStrength(collator);
2506 result->ceMask = getMask(result->strength);
2507 result->toShift =
2508 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2509 UCOL_SHIFTED;
2510 result->variableTop = ucol_getVariableTop(collator, status);
2511
2512 if (U_FAILURE(*status)) {
2513 uprv_free(result);
2514 return NULL;
2515 }
2516
2517 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2518 if (result->search == NULL) {
2519 *status = U_MEMORY_ALLOCATION_ERROR;
2520 uprv_free(result);
2521 return NULL;
2522 }
2523
2524 result->search->text = text;
2525 result->search->textLength = textlength;
2526
2527 result->pattern.text = pattern;
2528 result->pattern.textLength = patternlength;
2529 result->pattern.CE = NULL;
2530
2531 result->search->breakIter = breakiter;
2532 #if !UCONFIG_NO_BREAK_ITERATION
2533 if (breakiter) {
2534 ubrk_setText(breakiter, text, textlength, status);
2535 }
2536 #endif
2537
2538 result->ownCollator = FALSE;
2539 result->search->matchedLength = 0;
2540 result->search->matchedIndex = USEARCH_DONE;
2541 result->textIter = ucol_openElements(collator, text,
2542 textlength, status);
2543 if (U_FAILURE(*status)) {
2544 usearch_close(result);
2545 return NULL;
2546 }
2547
2548 result->utilIter = NULL;
2549
2550 result->search->isOverlap = FALSE;
2551 result->search->isCanonicalMatch = FALSE;
2552 result->search->isForwardSearching = TRUE;
2553 result->search->reset = TRUE;
2554
2555 initialize(result, status);
2556
2557 if (U_FAILURE(*status)) {
2558 usearch_close(result);
2559 return NULL;
2560 }
2561
2562 return result;
2563 }
2564 return NULL;
2565 }
2566
2567 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2568 {
2569 if (strsrch) {
2570 if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2571 strsrch->pattern.CE) {
2572 uprv_free(strsrch->pattern.CE);
2573 }
2574 ucol_closeElements(strsrch->textIter);
2575 ucol_closeElements(strsrch->utilIter);
2576 if (strsrch->ownCollator && strsrch->collator) {
2577 ucol_close((UCollator *)strsrch->collator);
2578 }
2579 uprv_free(strsrch->search);
2580 uprv_free(strsrch);
2581 }
2582 }
2583
2584 // set and get methods --------------------------------------------------
2585
2586 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2587 int32_t position,
2588 UErrorCode *status)
2589 {
2590 if (U_SUCCESS(*status) && strsrch) {
2591 if (isOutOfBounds(strsrch->search->textLength, position)) {
2592 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2593 }
2594 else {
2595 setColEIterOffset(strsrch->textIter, position);
2596 }
2597 strsrch->search->matchedIndex = USEARCH_DONE;
2598 strsrch->search->matchedLength = 0;
2599 strsrch->search->reset = FALSE;
2600 }
2601 }
2602
2603 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2604 {
2605 if (strsrch) {
2606 int32_t result = ucol_getOffset(strsrch->textIter);
2607 if (isOutOfBounds(strsrch->search->textLength, result)) {
2608 return USEARCH_DONE;
2609 }
2610 return result;
2611 }
2612 return USEARCH_DONE;
2613 }
2614
2615 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2616 USearchAttribute attribute,
2617 USearchAttributeValue value,
2618 UErrorCode *status)
2619 {
2620 if (U_SUCCESS(*status) && strsrch) {
2621 switch (attribute)
2622 {
2623 case USEARCH_OVERLAP :
2624 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2625 break;
2626 case USEARCH_CANONICAL_MATCH :
2627 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2628 FALSE);
2629 break;
2630 case USEARCH_ATTRIBUTE_COUNT :
2631 default:
2632 *status = U_ILLEGAL_ARGUMENT_ERROR;
2633 }
2634 }
2635 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2636 *status = U_ILLEGAL_ARGUMENT_ERROR;
2637 }
2638 }
2639
2640 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2641 const UStringSearch *strsrch,
2642 USearchAttribute attribute)
2643 {
2644 if (strsrch) {
2645 switch (attribute) {
2646 case USEARCH_OVERLAP :
2647 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2648 USEARCH_OFF);
2649 case USEARCH_CANONICAL_MATCH :
2650 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2651 USEARCH_OFF);
2652 case USEARCH_ATTRIBUTE_COUNT :
2653 return USEARCH_DEFAULT;
2654 }
2655 }
2656 return USEARCH_DEFAULT;
2657 }
2658
2659 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2660 const UStringSearch *strsrch)
2661 {
2662 if (strsrch == NULL) {
2663 return USEARCH_DONE;
2664 }
2665 return strsrch->search->matchedIndex;
2666 }
2667
2668
2669 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2670 UChar *result,
2671 int32_t resultCapacity,
2672 UErrorCode *status)
2673 {
2674 if (U_FAILURE(*status)) {
2675 return USEARCH_DONE;
2676 }
2677 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2678 result == NULL)) {
2679 *status = U_ILLEGAL_ARGUMENT_ERROR;
2680 return USEARCH_DONE;
2681 }
2682
2683 int32_t copylength = strsrch->search->matchedLength;
2684 int32_t copyindex = strsrch->search->matchedIndex;
2685 if (copyindex == USEARCH_DONE) {
2686 u_terminateUChars(result, resultCapacity, 0, status);
2687 return USEARCH_DONE;
2688 }
2689
2690 if (resultCapacity < copylength) {
2691 copylength = resultCapacity;
2692 }
2693 if (copylength > 0) {
2694 uprv_memcpy(result, strsrch->search->text + copyindex,
2695 copylength * sizeof(UChar));
2696 }
2697 return u_terminateUChars(result, resultCapacity,
2698 strsrch->search->matchedLength, status);
2699 }
2700
2701 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2702 const UStringSearch *strsrch)
2703 {
2704 if (strsrch) {
2705 return strsrch->search->matchedLength;
2706 }
2707 return USEARCH_DONE;
2708 }
2709
2710 #if !UCONFIG_NO_BREAK_ITERATION
2711
2712 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2713 UBreakIterator *breakiter,
2714 UErrorCode *status)
2715 {
2716 if (U_SUCCESS(*status) && strsrch) {
2717 strsrch->search->breakIter = breakiter;
2718 if (breakiter) {
2719 ubrk_setText(breakiter, strsrch->search->text,
2720 strsrch->search->textLength, status);
2721 }
2722 }
2723 }
2724
2725 U_CAPI const UBreakIterator* U_EXPORT2
2726 usearch_getBreakIterator(const UStringSearch *strsrch)
2727 {
2728 if (strsrch) {
2729 return strsrch->search->breakIter;
2730 }
2731 return NULL;
2732 }
2733
2734 #endif
2735
2736 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2737 const UChar *text,
2738 int32_t textlength,
2739 UErrorCode *status)
2740 {
2741 if (U_SUCCESS(*status)) {
2742 if (strsrch == NULL || text == NULL || textlength < -1 ||
2743 textlength == 0) {
2744 *status = U_ILLEGAL_ARGUMENT_ERROR;
2745 }
2746 else {
2747 if (textlength == -1) {
2748 textlength = u_strlen(text);
2749 }
2750 strsrch->search->text = text;
2751 strsrch->search->textLength = textlength;
2752 ucol_setText(strsrch->textIter, text, textlength, status);
2753 strsrch->search->matchedIndex = USEARCH_DONE;
2754 strsrch->search->matchedLength = 0;
2755 strsrch->search->reset = TRUE;
2756 #if !UCONFIG_NO_BREAK_ITERATION
2757 if (strsrch->search->breakIter != NULL) {
2758 ubrk_setText(strsrch->search->breakIter, text,
2759 textlength, status);
2760 }
2761 #endif
2762 }
2763 }
2764 }
2765
2766 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2767 int32_t *length)
2768 {
2769 if (strsrch) {
2770 *length = strsrch->search->textLength;
2771 return strsrch->search->text;
2772 }
2773 return NULL;
2774 }
2775
2776 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
2777 const UCollator *collator,
2778 UErrorCode *status)
2779 {
2780 if (U_SUCCESS(*status)) {
2781 if (collator == NULL) {
2782 *status = U_ILLEGAL_ARGUMENT_ERROR;
2783 return;
2784 }
2785 if (strsrch) {
2786 if (strsrch->ownCollator && (strsrch->collator != collator)) {
2787 ucol_close((UCollator *)strsrch->collator);
2788 strsrch->ownCollator = FALSE;
2789 }
2790 strsrch->collator = collator;
2791 strsrch->strength = ucol_getStrength(collator);
2792 strsrch->ceMask = getMask(strsrch->strength);
2793 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
2794 strsrch->toShift =
2795 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2796 UCOL_SHIFTED;
2797 // if status is a failure, ucol_getVariableTop returns 0
2798 strsrch->variableTop = ucol_getVariableTop(collator, status);
2799 if (U_SUCCESS(*status)) {
2800 initialize(strsrch, status);
2801 if (U_SUCCESS(*status)) {
2802 uprv_init_collIterate(collator, strsrch->search->text,
2803 strsrch->search->textLength,
2804 &(strsrch->textIter->iteratordata_));
2805 strsrch->utilIter->iteratordata_.coll = collator;
2806 }
2807 }
2808 }
2809 }
2810 }
2811
2812 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
2813 {
2814 if (strsrch) {
2815 return (UCollator *)strsrch->collator;
2816 }
2817 return NULL;
2818 }
2819
2820 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
2821 const UChar *pattern,
2822 int32_t patternlength,
2823 UErrorCode *status)
2824 {
2825 if (U_SUCCESS(*status)) {
2826 if (strsrch == NULL || pattern == NULL) {
2827 *status = U_ILLEGAL_ARGUMENT_ERROR;
2828 }
2829 else {
2830 if (patternlength == -1) {
2831 patternlength = u_strlen(pattern);
2832 }
2833 if (patternlength == 0) {
2834 *status = U_ILLEGAL_ARGUMENT_ERROR;
2835 return;
2836 }
2837 strsrch->pattern.text = pattern;
2838 strsrch->pattern.textLength = patternlength;
2839 initialize(strsrch, status);
2840 }
2841 }
2842 }
2843
2844 U_CAPI const UChar* U_EXPORT2
2845 usearch_getPattern(const UStringSearch *strsrch,
2846 int32_t *length)
2847 {
2848 if (strsrch) {
2849 *length = strsrch->pattern.textLength;
2850 return strsrch->pattern.text;
2851 }
2852 return NULL;
2853 }
2854
// miscellaneous methods --------------------------------------------------
2856
2857 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
2858 UErrorCode *status)
2859 {
2860 if (strsrch && U_SUCCESS(*status)) {
2861 strsrch->search->isForwardSearching = TRUE;
2862 usearch_setOffset(strsrch, 0, status);
2863 if (U_SUCCESS(*status)) {
2864 return usearch_next(strsrch, status);
2865 }
2866 }
2867 return USEARCH_DONE;
2868 }
2869
2870 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
2871 int32_t position,
2872 UErrorCode *status)
2873 {
2874 if (strsrch && U_SUCCESS(*status)) {
2875 strsrch->search->isForwardSearching = TRUE;
2876 // position checked in usearch_setOffset
2877 usearch_setOffset(strsrch, position, status);
2878 if (U_SUCCESS(*status)) {
2879 return usearch_next(strsrch, status);
2880 }
2881 }
2882 return USEARCH_DONE;
2883 }
2884
2885 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
2886 UErrorCode *status)
2887 {
2888 if (strsrch && U_SUCCESS(*status)) {
2889 strsrch->search->isForwardSearching = FALSE;
2890 usearch_setOffset(strsrch, strsrch->search->textLength, status);
2891 if (U_SUCCESS(*status)) {
2892 return usearch_previous(strsrch, status);
2893 }
2894 }
2895 return USEARCH_DONE;
2896 }
2897
2898 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
2899 int32_t position,
2900 UErrorCode *status)
2901 {
2902 if (strsrch && U_SUCCESS(*status)) {
2903 strsrch->search->isForwardSearching = FALSE;
2904 // position checked in usearch_setOffset
2905 usearch_setOffset(strsrch, position, status);
2906 if (U_SUCCESS(*status)) {
2907 return usearch_previous(strsrch, status);
2908 }
2909 }
2910 return USEARCH_DONE;
2911 }
2912
2913 /**
2914 * If a direction switch is required, we'll count the number of ces till the
2915 * beginning of the collation element iterator and iterate forwards that
2916 * number of times. This is so that we get to the correct point within the
2917 * string to continue the search in. Imagine when we are in the middle of the
2918 * normalization buffer when the change in direction is request. arrrgghh....
2919 * After searching the offset within the collation element iterator will be
2920 * shifted to the start of the match. If a match is not found, the offset would
2921 * have been set to the end of the text string in the collation element
2922 * iterator.
2923 * Okay, here's my take on normalization buffer. The only time when there can
2924 * be 2 matches within the same normalization is when the pattern is consists
2925 * of all accents. But since the offset returned is from the text string, we
2926 * should not confuse the caller by returning the second match within the
2927 * same normalization buffer. If we do, the 2 results will have the same match
2928 * offsets, and that'll be confusing. I'll return the next match that doesn't
2929 * fall within the same normalization buffer. Note this does not affect the
2930 * results of matches spanning the text and the normalization buffer.
2931 * The position to start searching is taken from the collation element
2932 * iterator. Callers of this API would have to set the offset in the collation
2933 * element iterator before using this method.
2934 */
2935 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
2936 UErrorCode *status)
2937 {
2938 if (U_SUCCESS(*status) && strsrch) {
2939 // note offset is either equivalent to the start of the previous match
2940 // or is set by the user
2941 int32_t offset = usearch_getOffset(strsrch);
2942 USearch *search = strsrch->search;
2943 search->reset = FALSE;
2944 int32_t textlength = search->textLength;
2945 if (search->isForwardSearching) {
2946 if (offset == textlength
2947 || (!search->isOverlap &&
2948 (offset + strsrch->pattern.defaultShiftSize > textlength ||
2949 (search->matchedIndex != USEARCH_DONE &&
2950 offset + search->matchedLength >= textlength)))) {
2951 // not enough characters to match
2952 setMatchNotFound(strsrch);
2953 return USEARCH_DONE;
2954 }
2955 }
2956 else {
2957 // switching direction.
2958 // if matchedIndex == USEARCH_DONE, it means that either a
2959 // setOffset has been called or that previous ran off the text
2960 // string. the iterator would have been set to offset 0 if a
2961 // match is not found.
2962 search->isForwardSearching = TRUE;
2963 if (search->matchedIndex != USEARCH_DONE) {
2964 // there's no need to set the collation element iterator
2965 // the next call to next will set the offset.
2966 return search->matchedIndex;
2967 }
2968 }
2969
2970 if (U_SUCCESS(*status)) {
2971 if (strsrch->pattern.CELength == 0) {
2972 if (search->matchedIndex == USEARCH_DONE) {
2973 search->matchedIndex = offset;
2974 }
2975 else { // moves by codepoints
2976 UTF_FWD_1(search->text, search->matchedIndex, textlength);
2977 }
2978
2979 search->matchedLength = 0;
2980 setColEIterOffset(strsrch->textIter, search->matchedIndex);
2981 // status checked below
2982 if (search->matchedIndex == textlength) {
2983 search->matchedIndex = USEARCH_DONE;
2984 }
2985 }
2986 else {
2987 if (search->matchedLength > 0) {
2988 // if matchlength is 0 we are at the start of the iteration
2989 if (search->isOverlap) {
2990 ucol_setOffset(strsrch->textIter, offset + 1, status);
2991 }
2992 else {
2993 ucol_setOffset(strsrch->textIter,
2994 offset + search->matchedLength, status);
2995 }
2996 }
2997 else {
2998 // for boundary check purposes. this will ensure that the
2999 // next match will not preceed the current offset
3000 // note search->matchedIndex will always be set to something
3001 // in the code
3002 search->matchedIndex = offset - 1;
3003 }
3004
3005 if (search->isCanonicalMatch) {
3006 // can't use exact here since extra accents are allowed.
3007 usearch_handleNextCanonical(strsrch, status);
3008 }
3009 else {
3010 usearch_handleNextExact(strsrch, status);
3011 }
3012 }
3013
3014 if (U_FAILURE(*status)) {
3015 return USEARCH_DONE;
3016 }
3017
3018 return search->matchedIndex;
3019 }
3020 }
3021 return USEARCH_DONE;
3022 }
3023
3024 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3025 UErrorCode *status)
3026 {
3027 if (U_SUCCESS(*status) && strsrch) {
3028 int32_t offset;
3029 USearch *search = strsrch->search;
3030 if (search->reset) {
3031 offset = search->textLength;
3032 search->isForwardSearching = FALSE;
3033 search->reset = FALSE;
3034 setColEIterOffset(strsrch->textIter, offset);
3035 }
3036 else {
3037 offset = usearch_getOffset(strsrch);
3038 }
3039
3040 int32_t matchedindex = search->matchedIndex;
3041 if (search->isForwardSearching == TRUE) {
3042 // switching direction.
3043 // if matchedIndex == USEARCH_DONE, it means that either a
3044 // setOffset has been called or that next ran off the text
3045 // string. the iterator would have been set to offset textLength if
3046 // a match is not found.
3047 search->isForwardSearching = FALSE;
3048 if (matchedindex != USEARCH_DONE) {
3049 return matchedindex;
3050 }
3051 }
3052 else {
3053 if (offset == 0 || matchedindex == 0 ||
3054 (!search->isOverlap &&
3055 (offset < strsrch->pattern.defaultShiftSize ||
3056 (matchedindex != USEARCH_DONE &&
3057 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3058 // not enough characters to match
3059 setMatchNotFound(strsrch);
3060 return USEARCH_DONE;
3061 }
3062 }
3063
3064 if (U_SUCCESS(*status)) {
3065 if (strsrch->pattern.CELength == 0) {
3066 search->matchedIndex =
3067 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3068 if (search->matchedIndex == 0) {
3069 setMatchNotFound(strsrch);
3070 // status checked below
3071 }
3072 else { // move by codepoints
3073 UTF_BACK_1(search->text, 0, search->matchedIndex);
3074 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3075 // status checked below
3076 search->matchedLength = 0;
3077 }
3078 }
3079 else {
3080 if (strsrch->search->isCanonicalMatch) {
3081 // can't use exact here since extra accents are allowed.
3082 usearch_handlePreviousCanonical(strsrch, status);
3083 // status checked below
3084 }
3085 else {
3086 usearch_handlePreviousExact(strsrch, status);
3087 // status checked below
3088 }
3089 }
3090
3091 if (U_FAILURE(*status)) {
3092 return USEARCH_DONE;
3093 }
3094
3095 return search->matchedIndex;
3096 }
3097 }
3098 return USEARCH_DONE;
3099 }
3100
3101
3102
3103 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3104 {
3105 /*
3106 reset is setting the attributes that are already in
3107 string search, hence all attributes in the collator should
3108 be retrieved without any problems
3109 */
3110 if (strsrch) {
3111 UErrorCode status = U_ZERO_ERROR;
3112 UBool sameCollAttribute = TRUE;
3113 uint32_t ceMask;
3114 UBool shift;
3115 uint32_t varTop;
3116
3117 strsrch->strength = ucol_getStrength(strsrch->collator);
3118 ceMask = getMask(strsrch->strength);
3119 if (strsrch->ceMask != ceMask) {
3120 strsrch->ceMask = ceMask;
3121 sameCollAttribute = FALSE;
3122 }
3123 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3124 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3125 &status) == UCOL_SHIFTED;
3126 if (strsrch->toShift != shift) {
3127 strsrch->toShift = shift;
3128 sameCollAttribute = FALSE;
3129 }
3130
3131 // if status is a failure, ucol_getVariableTop returns 0
3132 varTop = ucol_getVariableTop(strsrch->collator, &status);
3133 if (strsrch->variableTop != varTop) {
3134 strsrch->variableTop = varTop;
3135 sameCollAttribute = FALSE;
3136 }
3137 if (!sameCollAttribute) {
3138 initialize(strsrch, &status);
3139 }
3140 uprv_init_collIterate(strsrch->collator, strsrch->search->text,
3141 strsrch->search->textLength,
3142 &(strsrch->textIter->iteratordata_));
3143 strsrch->search->matchedLength = 0;
3144 strsrch->search->matchedIndex = USEARCH_DONE;
3145 strsrch->search->isOverlap = FALSE;
3146 strsrch->search->isCanonicalMatch = FALSE;
3147 strsrch->search->isForwardSearching = TRUE;
3148 strsrch->search->reset = TRUE;
3149 }
3150 }
3151
3152 // internal use methods declared in usrchimp.h -----------------------------
3153
3154 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
3155 {
3156 if (U_FAILURE(*status)) {
3157 setMatchNotFound(strsrch);
3158 return FALSE;
3159 }
3160
3161 UCollationElements *coleiter = strsrch->textIter;
3162 int32_t textlength = strsrch->search->textLength;
3163 int32_t *patternce = strsrch->pattern.CE;
3164 int32_t patterncelength = strsrch->pattern.CELength;
3165 int32_t textoffset = ucol_getOffset(coleiter);
3166
3167 // status used in setting coleiter offset, since offset is checked in
3168 // shiftForward before setting the coleiter offset, status never
3169 // a failure
3170 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3171 patterncelength);
3172 while (textoffset <= textlength)
3173 {
3174 uint32_t patternceindex = patterncelength - 1;
3175 int32_t targetce;
3176 UBool found = FALSE;
3177 int32_t lastce = UCOL_NULLORDER;
3178
3179 setColEIterOffset(coleiter, textoffset);
3180
3181 while (TRUE) {
3182 // finding the last pattern ce match, imagine composite characters
3183 // for example: search for pattern A in text \u00C0
3184 // we'll have to skip \u0300 the grave first before we get to A
3185 targetce = ucol_previous(coleiter, status);
3186 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3187 found = FALSE;
3188 break;
3189 }
3190 targetce = getCE(strsrch, targetce);
3191 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
3192 // this is for the text \u0315\u0300 that requires
3193 // normalization and pattern \u0300, where \u0315 is ignorable
3194 continue;
3195 }
3196 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3197 lastce = targetce;
3198 }
3199 if (targetce == patternce[patternceindex]) {
3200 // the first ce can be a contraction
3201 found = TRUE;
3202 break;
3203 }
3204 if (!hasExpansion(coleiter)) {
3205 found = FALSE;
3206 break;
3207 }
3208 }
3209
3210 targetce = lastce;
3211
3212 while (found && patternceindex > 0) {
3213 targetce = ucol_previous(coleiter, status);
3214 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3215 found = FALSE;
3216 break;
3217 }
3218 targetce = getCE(strsrch, targetce);
3219 if (targetce == UCOL_IGNORABLE) {
3220 continue;
3221 }
3222
3223 patternceindex --;
3224 found = found && targetce == patternce[patternceindex];
3225 }
3226
3227 if (!found) {
3228 if (U_FAILURE(*status)) {
3229 break;
3230 }
3231 textoffset = shiftForward(strsrch, textoffset, lastce,
3232 patternceindex);
3233 // status checked at loop.
3234 patternceindex = patterncelength;
3235 continue;
3236 }
3237
3238 if (checkNextExactMatch(strsrch, &textoffset, status)) {
3239 // status checked in ucol_setOffset
3240 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3241 return TRUE;
3242 }
3243 }
3244 setMatchNotFound(strsrch);
3245 return FALSE;
3246 }
3247
3248 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
3249 {
3250 if (U_FAILURE(*status)) {
3251 setMatchNotFound(strsrch);
3252 return FALSE;
3253 }
3254
3255 UCollationElements *coleiter = strsrch->textIter;
3256 int32_t textlength = strsrch->search->textLength;
3257 int32_t *patternce = strsrch->pattern.CE;
3258 int32_t patterncelength = strsrch->pattern.CELength;
3259 int32_t textoffset = ucol_getOffset(coleiter);
3260 UBool hasPatternAccents =
3261 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3262
3263 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3264 patterncelength);
3265 strsrch->canonicalPrefixAccents[0] = 0;
3266 strsrch->canonicalSuffixAccents[0] = 0;
3267
3268 while (textoffset <= textlength)
3269 {
3270 int32_t patternceindex = patterncelength - 1;
3271 int32_t targetce;
3272 UBool found = FALSE;
3273 int32_t lastce = UCOL_NULLORDER;
3274
3275 setColEIterOffset(coleiter, textoffset);
3276
3277 for (;;) {
3278 // finding the last pattern ce match, imagine composite characters
3279 // for example: search for pattern A in text \u00C0
3280 // we'll have to skip \u0300 the grave first before we get to A
3281 targetce = ucol_previous(coleiter, status);
3282 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3283 found = FALSE;
3284 break;
3285 }
3286 targetce = getCE(strsrch, targetce);
3287 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3288 lastce = targetce;
3289 }
3290 if (targetce == patternce[patternceindex]) {
3291 // the first ce can be a contraction
3292 found = TRUE;
3293 break;
3294 }
3295 if (!hasExpansion(coleiter)) {
3296 found = FALSE;
3297 break;
3298 }
3299 }
3300
3301 while (found && patternceindex > 0) {
3302 targetce = ucol_previous(coleiter, status);
3303 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3304 found = FALSE;
3305 break;
3306 }
3307 targetce = getCE(strsrch, targetce);
3308 if (targetce == UCOL_IGNORABLE) {
3309 continue;
3310 }
3311
3312 patternceindex --;
3313 found = found && targetce == patternce[patternceindex];
3314 }
3315
3316 // initializing the rearranged accent array
3317 if (hasPatternAccents && !found) {
3318 strsrch->canonicalPrefixAccents[0] = 0;
3319 strsrch->canonicalSuffixAccents[0] = 0;
3320 if (U_FAILURE(*status)) {
3321 break;
3322 }
3323 found = doNextCanonicalMatch(strsrch, textoffset, status);
3324 }
3325
3326 if (!found) {
3327 if (U_FAILURE(*status)) {
3328 break;
3329 }
3330 textoffset = shiftForward(strsrch, textoffset, lastce,
3331 patternceindex);
3332 // status checked at loop
3333 patternceindex = patterncelength;
3334 continue;
3335 }
3336
3337 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
3338 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3339 return TRUE;
3340 }
3341 }
3342 setMatchNotFound(strsrch);
3343 return FALSE;
3344 }
3345
3346 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
3347 {
3348 if (U_FAILURE(*status)) {
3349 setMatchNotFound(strsrch);
3350 return FALSE;
3351 }
3352
3353 UCollationElements *coleiter = strsrch->textIter;
3354 int32_t *patternce = strsrch->pattern.CE;
3355 int32_t patterncelength = strsrch->pattern.CELength;
3356 int32_t textoffset = ucol_getOffset(coleiter);
3357
3358 // shifting it check for setting offset
3359 // if setOffset is called previously or there was no previous match, we
3360 // leave the offset as it is.
3361 if (strsrch->search->matchedIndex != USEARCH_DONE) {
3362 textoffset = strsrch->search->matchedIndex;
3363 }
3364
3365 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3366 patterncelength);
3367
3368 while (textoffset >= 0)
3369 {
3370 int32_t patternceindex = 1;
3371 int32_t targetce;
3372 UBool found = FALSE;
3373 int32_t firstce = UCOL_NULLORDER;
3374
3375 // if status is a failure, ucol_setOffset does nothing
3376 setColEIterOffset(coleiter, textoffset);
3377
3378 for (;;) {
3379 // finding the first pattern ce match, imagine composite
3380 // characters. for example: search for pattern \u0300 in text
3381 // \u00C0, we'll have to skip A first before we get to
3382 // \u0300 the grave accent
3383 targetce = ucol_next(coleiter, status);
3384 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3385 found = FALSE;
3386 break;
3387 }
3388 targetce = getCE(strsrch, targetce);
3389 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3390 firstce = targetce;
3391 }
3392 if (targetce == UCOL_IGNORABLE) {
3393 continue;
3394 }
3395 if (targetce == patternce[0]) {
3396 found = TRUE;
3397 break;
3398 }
3399 if (!hasExpansion(coleiter)) {
3400 // checking for accents in composite character
3401 found = FALSE;
3402 break;
3403 }
3404 }
3405
3406 targetce = firstce;
3407
3408 while (found && (patternceindex < patterncelength)) {
3409 targetce = ucol_next(coleiter, status);
3410 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3411 found = FALSE;
3412 break;
3413 }
3414 targetce = getCE(strsrch, targetce);
3415 if (targetce == UCOL_IGNORABLE) {
3416 continue;
3417 }
3418
3419 found = found && targetce == patternce[patternceindex];
3420 patternceindex ++;
3421 }
3422
3423 if (!found) {
3424 if (U_FAILURE(*status)) {
3425 break;
3426 }
3427 textoffset = reverseShift(strsrch, textoffset, targetce,
3428 patternceindex);
3429 patternceindex = 0;
3430 continue;
3431 }
3432
3433 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
3434 setColEIterOffset(coleiter, textoffset);
3435 return TRUE;
3436 }
3437 }
3438 setMatchNotFound(strsrch);
3439 return FALSE;
3440 }
3441
3442 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
3443 UErrorCode *status)
3444 {
3445 if (U_FAILURE(*status)) {
3446 setMatchNotFound(strsrch);
3447 return FALSE;
3448 }
3449
3450 UCollationElements *coleiter = strsrch->textIter;
3451 int32_t *patternce = strsrch->pattern.CE;
3452 int32_t patterncelength = strsrch->pattern.CELength;
3453 int32_t textoffset = ucol_getOffset(coleiter);
3454 UBool hasPatternAccents =
3455 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3456
3457 // shifting it check for setting offset
3458 // if setOffset is called previously or there was no previous match, we
3459 // leave the offset as it is.
3460 if (strsrch->search->matchedIndex != USEARCH_DONE) {
3461 textoffset = strsrch->search->matchedIndex;
3462 }
3463
3464 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3465 patterncelength);
3466 strsrch->canonicalPrefixAccents[0] = 0;
3467 strsrch->canonicalSuffixAccents[0] = 0;
3468
3469 while (textoffset >= 0)
3470 {
3471 int32_t patternceindex = 1;
3472 int32_t targetce;
3473 UBool found = FALSE;
3474 int32_t firstce = UCOL_NULLORDER;
3475
3476 setColEIterOffset(coleiter, textoffset);
3477 while (TRUE) {
3478 // finding the first pattern ce match, imagine composite
3479 // characters. for example: search for pattern \u0300 in text
3480 // \u00C0, we'll have to skip A first before we get to
3481 // \u0300 the grave accent
3482 targetce = ucol_next(coleiter, status);
3483 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3484 found = FALSE;
3485 break;
3486 }
3487 targetce = getCE(strsrch, targetce);
3488 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3489 firstce = targetce;
3490 }
3491
3492 if (targetce == patternce[0]) {
3493 // the first ce can be a contraction
3494 found = TRUE;
3495 break;
3496 }
3497 if (!hasExpansion(coleiter)) {
3498 // checking for accents in composite character
3499 found = FALSE;
3500 break;
3501 }
3502 }
3503
3504 targetce = firstce;
3505
3506 while (found && patternceindex < patterncelength) {
3507 targetce = ucol_next(coleiter, status);
3508 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3509 found = FALSE;
3510 break;
3511 }
3512 targetce = getCE(strsrch, targetce);
3513 if (targetce == UCOL_IGNORABLE) {
3514 continue;
3515 }
3516
3517 found = found && targetce == patternce[patternceindex];
3518 patternceindex ++;
3519 }
3520
3521 // initializing the rearranged accent array
3522 if (hasPatternAccents && !found) {
3523 strsrch->canonicalPrefixAccents[0] = 0;
3524 strsrch->canonicalSuffixAccents[0] = 0;
3525 if (U_FAILURE(*status)) {
3526 break;
3527 }
3528 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
3529 }
3530
3531 if (!found) {
3532 if (U_FAILURE(*status)) {
3533 break;
3534 }
3535 textoffset = reverseShift(strsrch, textoffset, targetce,
3536 patternceindex);
3537 patternceindex = 0;
3538 continue;
3539 }
3540
3541 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
3542 setColEIterOffset(coleiter, textoffset);
3543 return TRUE;
3544 }
3545 }
3546 setMatchNotFound(strsrch);
3547 return FALSE;
3548 }
3549
3550 #endif /* #if !UCONFIG_NO_COLLATION */