]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/usearch.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / i18n / usearch.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2001-2003 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_COLLATION
13
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unormimp.h"
18 #include "ucol_imp.h"
19 #include "usrchimp.h"
20 #include "cmemory.h"
21
22 // internal definition ---------------------------------------------------
23
// Mask applied to a getFCD() result to keep its low byte; in this file the
// masked value is used as the "ends with accents" indication.
24 #define LAST_BYTE_MASK_ 0xFF
// Right shift applied to a getFCD() result to obtain its high byte; in this
// file the shifted value is used as the "starts with accents" indication.
25 #define SECOND_LAST_BYTE_SHIFT_ 8
// NOTE(review): not referenced in the part of the file reviewed here;
// presumably the first supplementary code point (U+10000) - confirm.
26 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
27
// FCD lookup data shared by this file. It starts out NULL, is filled in by
// initializeFCD() from unorm_getFCDTrie() and is read by getFCD().
28 static const uint16_t *FCD_ = NULL;
29
30 // internal methods -------------------------------------------------
31
32 /**
33 * Fast collation element iterator setOffset.
34 * This function does not check for bounds.
35 * @param coleiter collation element iterator
36 * @param offset to set
37 */
38 static
39 inline void setColEIterOffset(UCollationElements *elems,
40 int32_t offset)
41 {
42 collIterate *ci = &(elems->iteratordata_);
43 ci->pos = ci->string + offset;
44 ci->CEpos = ci->toReturn = ci->CEs;
45 if (ci->flags & UCOL_ITER_INNORMBUF) {
46 ci->flags = ci->origFlags;
47 }
48 ci->fcdPosition = NULL;
49 }
50
51 /**
52 * Getting the mask for collation strength
53 * @param strength collation strength
54 * @return collation element mask
55 */
56 static
57 inline uint32_t getMask(UCollationStrength strength)
58 {
59 switch (strength)
60 {
61 case UCOL_PRIMARY:
62 return UCOL_PRIMARYORDERMASK;
63 case UCOL_SECONDARY:
64 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
65 default:
66 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
67 UCOL_PRIMARYORDERMASK;
68 }
69 }
70
71 /**
72 * This is to squeeze the 21bit ces into a 256 table
73 * @param ce collation element
74 * @return collapsed version of the collation element
75 */
76 static
77 inline int hash(uint32_t ce)
78 {
79 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
80 // well with the new collation where most of the latin 1 characters
81 // are of the value xx000xxx. their hashes will most of the time be 0
82 // to be discussed on the hash algo.
83 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
84 }
85
86 /**
87 * Initializing the fcd tables.
88 * Internal method, status assumed to be a success.
89 * @param status output error if any, caller to check status before calling
90 * method, status assumed to be success when passed in.
91 */
92 static
93 inline void initializeFCD(UErrorCode *status)
94 {
95 if (FCD_ == NULL) {
96 FCD_ = unorm_getFCDTrie(status);
97 }
98 }
99
100 /**
101 * Gets the fcd value for a character at the argument index.
102 * This method takes into accounts of the supplementary characters.
103 * @param str UTF16 string where character for fcd retrieval resides
104 * @param offset position of the character whose fcd is to be retrieved, to be
105 * overwritten with the next character position, taking
106 * surrogate characters into consideration.
107 * @param strlength length of the argument string
108 * @return fcd value
109 */
110 static
111 inline uint16_t getFCD(const UChar *str, int32_t *offset,
112 int32_t strlength)
113 {
114 int32_t temp = *offset;
115 uint16_t result;
116 UChar ch = str[temp];
117 result = unorm_getFCD16(FCD_, ch);
118 temp ++;
119
120 if (result && temp != strlength && UTF_IS_FIRST_SURROGATE(ch)) {
121 ch = str[temp];
122 if (UTF_IS_SECOND_SURROGATE(ch)) {
123 result = unorm_getFCD16FromSurrogatePair(FCD_, result, ch);
124 temp ++;
125 } else {
126 result = 0;
127 }
128 }
129 *offset = temp;
130 return result;
131 }
132
133 /**
134 * Getting the modified collation elements taking into account the collation
135 * attributes
136 * @param strsrch string search data
137 * @param sourcece
138 * @return the modified collation element
139 */
140 static
141 inline uint32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
142 {
143 // note for tertiary we can't use the collator->tertiaryMask, that
144 // is a preprocessed mask that takes into account case options. since
145 // we are only concerned with exact matches, we don't need that.
146 sourcece &= strsrch->ceMask;
147
148 if (strsrch->toShift) {
149 // alternate handling here, since only the 16 most significant digits
150 // is only used, we can safely do a compare without masking
151 // if the ce is a variable, we mask and get only the primary values
152 // no shifting to quartenary is required since all primary values
153 // less than variabletop will need to be masked off anyway.
154 if (strsrch->variableTop > sourcece) {
155 if (strsrch->strength == UCOL_QUATERNARY) {
156 sourcece &= UCOL_PRIMARYORDERMASK;
157 }
158 else {
159 sourcece = UCOL_IGNORABLE;
160 }
161 }
162 }
163
164 return sourcece;
165 }
166
167 /**
168 * Allocate a memory and returns NULL if it failed.
169 * Internal method, status assumed to be a success.
170 * @param size to allocate
171 * @param status output error if any, caller to check status before calling
172 * method, status assumed to be success when passed in.
173 * @return newly allocated array, NULL otherwise
174 */
175 static
176 inline void * allocateMemory(uint32_t size, UErrorCode *status)
177 {
178 uint32_t *result = (uint32_t *)uprv_malloc(size);
179 if (result == NULL) {
180 *status = U_MEMORY_ALLOCATION_ERROR;
181 }
182 return result;
183 }
184
185 /**
186 * Adds a uint32_t value to a destination array.
187 * Creates a new array if we run out of space. The caller will have to
188 * manually deallocate the newly allocated array.
189 * Internal method, status assumed to be success, caller has to check status
190 * before calling this method. destination not to be NULL and has at least
191 * size destinationlength.
192 * @param destination target array
193 * @param offset destination offset to add value
194 * @param destinationlength target array size, return value for the new size
195 * @param value to be added
196 * @param increments incremental size expected
197 * @param status output error if any, caller to check status before calling
198 * method, status assumed to be success when passed in.
199 * @return new destination array, destination if there was no new allocation
200 */
201 static
202 inline uint32_t * addTouint32_tArray(uint32_t *destination,
203 uint32_t offset,
204 uint32_t *destinationlength,
205 uint32_t value,
206 uint32_t increments,
207 UErrorCode *status)
208 {
209 uint32_t newlength = *destinationlength;
210 if (offset + 1 == newlength) {
211 newlength += increments;
212 uint32_t *temp = (uint32_t *)allocateMemory(
213 sizeof(uint32_t) * newlength, status);
214 if (U_FAILURE(*status)) {
215 return NULL;
216 }
217 uprv_memcpy(temp, destination, sizeof(uint32_t) * offset);
218 *destinationlength = newlength;
219 destination = temp;
220 }
221 destination[offset] = value;
222 return destination;
223 }
224
225 /**
226 * Initializing the ce table for a pattern.
227 * Stores non-ignorable collation keys.
228 * Table size will be estimated by the size of the pattern text. Table
229 * expansion will be perform as we go along. Adding 1 to ensure that the table
230 * size definitely increases.
231 * Internal method, status assumed to be a success.
232 * @param strsrch string search data
233 * @param status output error if any, caller to check status before calling
234 * method, status assumed to be success when passed in.
235 * @return total number of expansions
236 */
237 static
238 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
239 UErrorCode *status)
240 {
241 UPattern *pattern = &(strsrch->pattern);
242 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
243 uint32_t *cetable = pattern->CEBuffer;
244 uint32_t patternlength = pattern->textLength;
245 UCollationElements *coleiter = strsrch->utilIter;
246
247 if (coleiter == NULL) {
248 coleiter = ucol_openElements(strsrch->collator, pattern->text,
249 patternlength, status);
250 // status will be checked in ucol_next(..) later and if it is an
251 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
252 // returned.
253 strsrch->utilIter = coleiter;
254 }
255 else {
256 uprv_init_collIterate(strsrch->collator, pattern->text,
257 pattern->textLength,
258 &coleiter->iteratordata_);
259 }
260
261 if (pattern->CE != cetable && pattern->CE) {
262 uprv_free(pattern->CE);
263 }
264
265 uint16_t offset = 0;
266 uint16_t result = 0;
267 uint32_t ce;
268
269 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
270 U_SUCCESS(*status)) {
271 uint32_t newce = getCE(strsrch, ce);
272 if (newce) {
273 uint32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
274 newce,
275 patternlength - ucol_getOffset(coleiter) + 1,
276 status);
277 if (U_FAILURE(*status)) {
278 return 0;
279 }
280 offset ++;
281 if (cetable != temp && cetable != pattern->CEBuffer) {
282 uprv_free(cetable);
283 }
284 cetable = temp;
285 }
286 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
287 }
288
289 cetable[offset] = 0;
290 pattern->CE = cetable;
291 pattern->CELength = offset;
292
293 return result;
294 }
295
296 /**
297 * Initializes the pattern struct.
298 * Internal method, status assumed to be success.
299 * @param strsrch UStringSearch data storage
300 * @param status output error if any, caller to check status before calling
301 * method, status assumed to be success when passed in.
302 * @return expansionsize the total expansion size of the pattern
303 */
304 static
305 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
306 {
307 UPattern *pattern = &(strsrch->pattern);
308 const UChar *patterntext = pattern->text;
309 int32_t length = pattern->textLength;
310 int32_t index = 0;
311
312 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
313 SECOND_LAST_BYTE_SHIFT_;
314 index = length;
315 UTF_BACK_1(patterntext, 0, index);
316 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
317 LAST_BYTE_MASK_;
318 // since intializePattern is an internal method status is a success.
319 return initializePatternCETable(strsrch, status);
320 }
321
322 /**
323 * Initializing shift tables, with the default values.
324 * If a corresponding default value is 0, the shift table is not set.
325 * @param shift table for forwards shift
326 * @param backshift table for backwards shift
327 * @param cetable table containing pattern ce
328 * @param cesize size of the pattern ces
329 * @param expansionsize total size of the expansions
330 * @param defaultforward the default forward value
331 * @param defaultbackward the default backward value
332 */
333 static
334 inline void setShiftTable(int16_t shift[], int16_t backshift[],
335 uint32_t *cetable, int32_t cesize,
336 int16_t expansionsize,
337 int16_t defaultforward,
338 int16_t defaultbackward)
339 {
340 // estimate the value to shift. to do that we estimate the smallest
341 // number of characters to give the relevant ces, ie approximately
342 // the number of ces minus their expansion, since expansions can come
343 // from a character.
344 int32_t count;
345 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
346 shift[count] = defaultforward;
347 }
348 cesize --; // down to the last index
349 for (count = 0; count < cesize; count ++) {
350 // number of ces from right of array to the count
351 int temp = defaultforward - count - 1;
352 shift[hash(cetable[count])] = temp > 1 ? temp : 1;
353 }
354 shift[hash(cetable[cesize])] = 1;
355 // for ignorables we just shift by one. see test examples.
356 shift[hash(0)] = 1;
357
358 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
359 backshift[count] = defaultbackward;
360 }
361 for (count = cesize; count > 0; count --) {
362 // the original value count does not seem to work
363 backshift[hash(cetable[count])] = count > expansionsize ?
364 (int16_t)(count - expansionsize) : 1;
365 }
366 backshift[hash(cetable[0])] = 1;
367 backshift[hash(0)] = 1;
368 }
369
370 /**
371 * Building of the pattern collation element list and the boyer moore strsrch
372 * table.
373 * The canonical match will only be performed after the default match fails.
374 * For both cases we need to remember the size of the composed and decomposed
375 * versions of the string. Since the Boyer-Moore shift calculations shifts by
376 * a number of characters in the text and tries to match the pattern from that
377 * offset, the shift value can not be too large in case we miss some
378 * characters. To choose a right shift size, we estimate the NFC form of the
379 * and use its size as a shift guide. The NFC form should be the small
380 * possible representation of the pattern. Anyways, we'll err on the smaller
381 * shift size. Hence the calculation for minlength.
382 * Canonical match will be performed slightly differently. We'll split the
383 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
384 * the first and last base character (MS), the ending accents (EA). Matches
385 * will be done on MS first, and only when we match MS then some processing
386 * will be required for the prefix and end accents in order to determine if
387 * they match PA and EA. Hence the default shift values
388 * for the canonical match will take the size of either end's accent into
389 * consideration. Forwards search will take the end accents into consideration
390 * for the default shift values and the backwards search will take the prefix
391 * accents into consideration.
392 * If pattern has no non-ignorable ce, we return a illegal argument error.
393 * Internal method, status assumed to be success.
394 * @param strsrch UStringSearch data storage
395 * @param status for output errors if it occurs, status is assumed to be a
396 * success when it is passed in.
397 */
398 static
399 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
400 {
401 int16_t expandlength = initializePattern(strsrch, status);
402 if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
403 UPattern *pattern = &strsrch->pattern;
404 int32_t cesize = pattern->CELength;
405
406 int16_t minlength = cesize > expandlength
407 ? (int16_t)cesize - expandlength : 1;
408 pattern->defaultShiftSize = minlength;
409 setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
410 cesize, expandlength, minlength, minlength);
411 return;
412 }
413 strsrch->pattern.defaultShiftSize = 0;
414 }
415
416 /**
417 * Determine whether the target text in UStringSearch bounded by the offset
418 * start and end is one or more whole units of text as
419 * determined by the breakiterator in UStringSearch.
420 * @param strsrch string search data
421 * @param start target text start offset
422 * @param end target text end offset
423 */
424 static
425 inline UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
426 int32_t end)
427 {
428 #if !UCONFIG_NO_BREAK_ITERATION
429 UBreakIterator *breakiterator = strsrch->search->breakIter;
430 if (breakiterator) {
431 int32_t startindex = ubrk_first(breakiterator);
432 int32_t endindex = ubrk_last(breakiterator);
433
434 // out-of-range indexes are never boundary positions
435 if (start < startindex || start > endindex ||
436 end < startindex || end > endindex) {
437 return FALSE;
438 }
439 // otherwise, we can use following() on the position before the
440 // specified one and return true of the position we get back is the
441 // one the user specified
442 UBool result = (start == startindex ||
443 ubrk_following(breakiterator, start - 1) == start) &&
444 (end == endindex ||
445 ubrk_following(breakiterator, end - 1) == end);
446 if (result) {
447 // iterates the individual ces
448 UCollationElements *coleiter = strsrch->utilIter;
449 const UChar *text = strsrch->search->text +
450 start;
451 UErrorCode status = U_ZERO_ERROR;
452 ucol_setText(coleiter, text, end - start, &status);
453 for (int32_t count = 0; count < strsrch->pattern.CELength;
454 count ++) {
455 uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
456 if (ce == UCOL_IGNORABLE) {
457 count --;
458 continue;
459 }
460 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
461 return FALSE;
462 }
463 }
464 uint32_t nextce = ucol_next(coleiter, &status);
465 while (ucol_getOffset(coleiter) == (end - start)
466 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
467 nextce = ucol_next(coleiter, &status);
468 }
469 if (ucol_getOffset(coleiter) == (end - start)
470 && nextce != UCOL_NULLORDER) {
471 // extra collation elements at the end of the match
472 return FALSE;
473 }
474 }
475 return result;
476 }
477 #endif
478 return TRUE;
479 }
480
481 /**
482 * Getting the next base character offset if current offset is an accent,
483 * or the current offset if the current character contains a base character.
484 * accents the following base character will be returned
485 * @param text string
486 * @param textoffset current offset
487 * @param textlength length of text string
488 * @return the next base character or the current offset
489 * if the current character is contains a base character.
490 */
491 static
492 inline int32_t getNextBaseOffset(const UChar *text,
493 int32_t textoffset,
494 int32_t textlength)
495 {
496 if (textoffset < textlength) {
497 int32_t temp = textoffset;
498 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
499 while (temp < textlength) {
500 int32_t result = temp;
501 if ((getFCD(text, &temp, textlength) >>
502 SECOND_LAST_BYTE_SHIFT_) == 0) {
503 return result;
504 }
505 }
506 return textlength;
507 }
508 }
509 return textoffset;
510 }
511
512 /**
513 * Gets the next base character offset depending on the string search pattern
514 * data
515 * @param strsrch string search data
516 * @param textoffset current offset, one offset away from the last character
517 * to search for.
518 * @return start index of the next base character or the current offset
519 * if the current character is contains a base character.
520 */
521 static
522 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
523 int32_t textoffset)
524 {
525 int32_t textlength = strsrch->search->textLength;
526 if (strsrch->pattern.hasSuffixAccents &&
527 textoffset < textlength) {
528 int32_t temp = textoffset;
529 const UChar *text = strsrch->search->text;
530 UTF_BACK_1(text, 0, temp);
531 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
532 return getNextBaseOffset(text, textoffset, textlength);
533 }
534 }
535 return textoffset;
536 }
537
538 /**
539 * Shifting the collation element iterator position forward to prepare for
540 * a following match. If the last character is a unsafe character, we'll only
541 * shift by 1 to capture contractions, normalization etc.
542 * Internal method, status assumed to be success.
543 * @param text strsrch string search data
544 * @param textoffset start text position to do search
545 * @param ce the text ce which failed the match.
546 * @param patternceindex index of the ce within the pattern ce buffer which
547 * failed the match
548 * @return final offset
549 */
550 static
551 inline int32_t shiftForward(UStringSearch *strsrch,
552 int32_t textoffset,
553 uint32_t ce,
554 int32_t patternceindex)
555 {
556 UPattern *pattern = &(strsrch->pattern);
557 if (ce != UCOL_NULLORDER) {
558 int32_t shift = pattern->shift[hash(ce)];
559 // this is to adjust for characters in the middle of the
560 // substring for matching that failed.
561 int32_t adjust = pattern->CELength - patternceindex;
562 if (adjust > 1 && shift >= adjust) {
563 shift -= adjust - 1;
564 }
565 textoffset += shift;
566 }
567 else {
568 textoffset += pattern->defaultShiftSize;
569 }
570
571 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
572 // check for unsafe characters
573 // * if it is the start or middle of a contraction: to be done after
574 // a initial match is found
575 // * thai or lao base consonant character: similar to contraction
576 // * high surrogate character: similar to contraction
577 // * next character is a accent: shift to the next base character
578 return textoffset;
579 }
580
581 /**
582 * sets match not found
583 * @param strsrch string search data
584 */
585 static
586 inline void setMatchNotFound(UStringSearch *strsrch)
587 {
588 // this method resets the match result regardless of the error status.
589 strsrch->search->matchedIndex = USEARCH_DONE;
590 strsrch->search->matchedLength = 0;
591 if (strsrch->search->isForwardSearching) {
592 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
593 }
594 else {
595 setColEIterOffset(strsrch->textIter, 0);
596 }
597 }
598
599 /**
600 * Gets the offset to the next safe point in text.
601 * ie. not the middle of a contraction, swappable characters or supplementary
602 * characters.
603 * @param collator collation sata
604 * @param text string to work with
605 * @param textoffset offset in string
606 * @param textlength length of text string
607 * @return offset to the next safe character
608 */
609 static
610 inline int32_t getNextSafeOffset(const UCollator *collator,
611 const UChar *text,
612 int32_t textoffset,
613 int32_t textlength)
614 {
615 int32_t result = textoffset; // first contraction character
616 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
617 result ++;
618 }
619 return result;
620 }
621
622 /**
623 * This checks for accents in the potential match started with a .
624 * composite character.
625 * This is really painful... we have to check that composite character do not
626 * have any extra accents. We have to normalize the potential match and find
627 * the immediate decomposed character before the match.
628 * The first composite character would have been taken care of by the fcd
629 * checks in checkForwardExactMatch.
630 * This is the slow path after the fcd of the first character and
631 * the last character has been checked by checkForwardExactMatch and we
632 * determine that the potential match has extra non-ignorable preceding
633 * ces.
634 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
635 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
636 * Note here that accents checking are slow and cautioned in the API docs.
637 * Internal method, status assumed to be a success, caller should check status
638 * before calling this method
639 * @param strsrch string search data
640 * @param start index of the potential unfriendly composite character
641 * @param end index of the potential unfriendly composite character
642 * @param status output error status if any.
643 * @return TRUE if there is non-ignorable accents before at the beginning
644 * of the match, FALSE otherwise.
645 */
646
647 static
648 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
649 int32_t end,
650 UErrorCode *status)
651 {
652 UBool result = FALSE;
653 if (strsrch->pattern.hasPrefixAccents) {
654 int32_t length = end - start;
655 int32_t offset = 0;
656 const UChar *text = strsrch->search->text + start;
657
658 UTF_FWD_1(text, offset, length);
659 // we are only concerned with the first composite character
660 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
661 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
662 text, 0, length);
663 if (safeoffset != length) {
664 safeoffset ++;
665 }
666 UChar *norm = NULL;
667 UChar buffer[INITIAL_ARRAY_SIZE_];
668 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
669 buffer, INITIAL_ARRAY_SIZE_,
670 status);
671 if (U_FAILURE(*status)) {
672 return FALSE;
673 }
674 if (size >= INITIAL_ARRAY_SIZE_) {
675 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
676 status);
677 // if allocation failed, status will be set to
678 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
679 // checks for it.
680 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
681 size, status);
682 if (U_FAILURE(*status) && norm != NULL) {
683 uprv_free(norm);
684 return FALSE;
685 }
686 }
687 else {
688 norm = buffer;
689 }
690
691 UCollationElements *coleiter = strsrch->utilIter;
692 ucol_setText(coleiter, norm, size, status);
693 uint32_t firstce = strsrch->pattern.CE[0];
694 UBool ignorable = TRUE;
695 uint32_t ce = UCOL_IGNORABLE;
696 while (U_SUCCESS(*status) && ce != firstce) {
697 offset = ucol_getOffset(coleiter);
698 if (ce != firstce && ce != UCOL_IGNORABLE) {
699 ignorable = FALSE;
700 }
701 ce = ucol_next(coleiter, status);
702 }
703 UChar32 codepoint;
704 UTF_PREV_CHAR(norm, 0, offset, codepoint);
705 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
706
707 if (norm != buffer) {
708 uprv_free(norm);
709 }
710 }
711 }
712
713 return result;
714 }
715
716 /**
717 * Used by exact matches, checks if there are accents before the match.
718 * This is really painful... we have to check that composite characters at
719 * the start of the matches have to not have any extra accents.
720 * We check the FCD of the character first, if it starts with an accent and
721 * the first pattern ce does not match the first ce of the character, we bail.
722 * Otherwise we try normalizing the first composite
723 * character and find the immediate decomposed character before the match to
724 * see if it is an non-ignorable accent.
725 * Now normalizing the first composite character is enough because we ensure
726 * that when the match is passed in here with extra beginning ces, the
727 * first or last ce that match has to occur within the first character.
728 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
729 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
730 * Note here that accents checking are slow and cautioned in the API docs.
731 * @param strsrch string search data
732 * @param start offset
733 * @param end offset
734 * @return TRUE if there are accents on either side of the match,
735 * FALSE otherwise
736 */
737 static
738 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
739 int32_t end)
740 {
741 if (strsrch->pattern.hasPrefixAccents) {
742 UCollationElements *coleiter = strsrch->textIter;
743 UErrorCode status = U_ZERO_ERROR;
744 // we have been iterating forwards previously
745 uint32_t ignorable = TRUE;
746 uint32_t firstce = strsrch->pattern.CE[0];
747
748 setColEIterOffset(coleiter, start);
749 uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
750 if (U_FAILURE(status)) {
751 return TRUE;
752 }
753 while (ce != firstce) {
754 if (ce != UCOL_IGNORABLE) {
755 ignorable = FALSE;
756 }
757 ce = getCE(strsrch, ucol_next(coleiter, &status));
758 if (U_FAILURE(status)) {
759 return TRUE;
760 }
761 }
762 if (!ignorable && inNormBuf(coleiter)) {
763 // within normalization buffer, discontiguous handled here
764 return TRUE;
765 }
766
767 // within text
768 int32_t temp = start;
769 // original code
770 // accent = (getFCD(strsrch->search->text, &temp,
771 // strsrch->search->textLength)
772 // >> SECOND_LAST_BYTE_SHIFT_);
773 // however this code does not work well with VC7 .net in release mode.
774 // maybe the inlines for getFCD combined with shifting has bugs in
775 // VC7. anyways this is a work around.
776 UBool accent = getFCD(strsrch->search->text, &temp,
777 strsrch->search->textLength) > 0xFF;
778 if (!accent) {
779 return checkExtraMatchAccents(strsrch, start, end, &status);
780 }
781 if (!ignorable) {
782 return TRUE;
783 }
784 if (start > 0) {
785 temp = start;
786 UTF_BACK_1(strsrch->search->text, 0, temp);
787 if (getFCD(strsrch->search->text, &temp,
788 strsrch->search->textLength) & LAST_BYTE_MASK_) {
789 setColEIterOffset(coleiter, start);
790 ce = ucol_previous(coleiter, &status);
791 if (U_FAILURE(status) ||
792 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
793 return TRUE;
794 }
795 }
796 }
797 }
798
799 return FALSE;
800 }
801
802 /**
803 * Used by exact matches, checks if there are accents bounding the match.
804 * Note this is the initial boundary check. If the potential match
805 * starts or ends with composite characters, the accents in those
806 * characters will be determined later.
807 * Not doing backwards iteration here, since discontiguos contraction for
808 * backwards collation element iterator, use up too many characters.
809 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
810 * should fail since there is a acute at the end of \u01FA
811 * Note here that accents checking are slow and cautioned in the API docs.
812 * @param strsrch string search data
813 * @param start offset of match
814 * @param end end offset of the match
815 * @return TRUE if there are accents on either side of the match,
816 * FALSE otherwise
817 */
818 static
819 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
820 int32_t end)
821 {
822 if (strsrch->pattern.hasSuffixAccents) {
823 const UChar *text = strsrch->search->text;
824 int32_t temp = end;
825 int32_t textlength = strsrch->search->textLength;
826 UTF_BACK_1(text, 0, temp);
827 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
828 uint32_t firstce = strsrch->pattern.CE[0];
829 UCollationElements *coleiter = strsrch->textIter;
830 UErrorCode status = U_ZERO_ERROR;
831 setColEIterOffset(coleiter, start);
832 while (getCE(strsrch, ucol_next(coleiter, &status)) != firstce) {
833 if (U_FAILURE(status)) {
834 return TRUE;
835 }
836 }
837 int32_t count = 1;
838 while (count < strsrch->pattern.CELength) {
839 if (getCE(strsrch, ucol_next(coleiter, &status))
840 == UCOL_IGNORABLE) {
841 // Thai can give an ignorable here.
842 count --;
843 }
844 if (U_FAILURE(status)) {
845 return TRUE;
846 }
847 count ++;
848 }
849 uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
850 if (U_FAILURE(status)) {
851 return TRUE;
852 }
853 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
854 if (ucol_getOffset(coleiter) <= end) {
855 return TRUE;
856 }
857 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
858 return TRUE;
859 }
860 }
861 }
862 }
863 return FALSE;
864 }
865
866 /**
867 * Checks if the offset runs out of the text string
868 * @param offset
869 * @param textlength of the text string
870 * @return TRUE if offset is out of bounds, FALSE otherwise
871 */
872 static
873 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
874 {
875 return offset < 0 || offset > textlength;
876 }
877
878 /**
879 * Checks for identical match
880 * @param strsrch string search data
881 * @param start offset of possible match
882 * @param end offset of possible match
883 * @return TRUE if identical match is found
884 */
885 static
886 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
887 int32_t end)
888 {
889 int32_t length = end - start;
890 if (strsrch->strength != UCOL_IDENTICAL) {
891 return TRUE;
892 }
893
894 UErrorCode status = U_ZERO_ERROR;
895 int decomplength = unorm_decompose(NULL, -1,
896 strsrch->search->text + start, length,
897 FALSE, 0, &status);
898 if (decomplength != unorm_decompose(NULL, -1, strsrch->pattern.text,
899 strsrch->pattern.textLength,
900 FALSE, 0, &status)) {
901 return FALSE;
902 }
903 decomplength ++;
904 UChar *text = (UChar *)uprv_malloc(decomplength * sizeof(UChar));
905 UChar *pattern = (UChar *)uprv_malloc(decomplength * sizeof(UChar));
906 unorm_decompose(text, decomplength, strsrch->search->text + start,
907 length, FALSE, 0, &status);
908 unorm_decompose(pattern, decomplength, strsrch->pattern.text,
909 strsrch->pattern.textLength, FALSE, 0, &status);
910 UBool result = (uprv_memcmp(pattern, text, decomplength * sizeof(UChar))
911 == 0);
912 uprv_free(text);
913 uprv_free(pattern);
914 return result;
915 }
916
917 /**
918 * Checks to see if the match is repeated
919 * @param strsrch string search data
920 * @param start new match start index
921 * @param end new match end index
922 * @return TRUE if the the match is repeated, FALSE otherwise
923 */
924 static
925 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
926 int32_t start,
927 int32_t end)
928 {
929 int32_t lastmatchindex = strsrch->search->matchedIndex;
930 UBool result;
931 if (lastmatchindex == USEARCH_DONE) {
932 return FALSE;
933 }
934 if (strsrch->search->isForwardSearching) {
935 result = start <= lastmatchindex;
936 }
937 else {
938 result = start >= lastmatchindex;
939 }
940 if (!strsrch->search->isOverlap) {
941 if (strsrch->search->isForwardSearching) {
942 result = start < lastmatchindex + strsrch->search->matchedLength;
943 }
944 else {
945 result = end > lastmatchindex;
946 }
947 }
948 return result;
949 }
950
951 /**
952 * Gets the collation element iterator's current offset.
953 * @param coleiter collation element iterator
954 * @param forwards flag TRUE if we are moving in th forwards direction
955 * @return current offset
956 */
957 static
958 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
959 UBool forwards)
960 {
961 int32_t result = ucol_getOffset(coleiter);
962 // intricacies of the the backwards collation element iterator
963 if (!forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
964 result ++;
965 }
966 return result;
967 }
968
969 /**
970 * Checks match for contraction.
971 * If the match ends with a partial contraction we fail.
972 * If the match starts too far off (because of backwards iteration) we try to
973 * chip off the extra characters depending on whether a breakiterator has
974 * been used.
 * The candidate is only re-iterated when the code unit at *end, or the one
 * at *start + 1, is reported unsafe by ucol_unsafeCP; otherwise it is
 * accepted without further checks.
975 * Internal method, error assumed to be success, caller has to check status
976 * before calling this method.
977 * @param strsrch string search data
978 * @param start offset of potential match, to be modified if necessary
979 * @param end offset of potential match, to be modified if necessary. When a
 * collation element mismatch (or an error during the comparison) is
 * found, *end is incremented and then moved on by
 * getNextUStringSearchBaseOffset before FALSE is returned.
980 * @param status output error status if any
981 * @return TRUE if match passes the contraction test, FALSE otherwise
982 */
983
984 static
985 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
986 int32_t *start,
987 int32_t *end, UErrorCode *status)
988 {
989 UCollationElements *coleiter = strsrch->textIter;
990 int32_t textlength = strsrch->search->textLength;
991 int32_t temp = *start; // last iterator offset seen while trimming the start
992 const UCollator *collator = strsrch->collator;
993 const UChar *text = strsrch->search->text;
994 // This part checks if either end of the match contains a potential
995 // contraction. If so we'll have to iterate through them.
996 // The start contraction needs to be checked since ucol_previous dumps
997 // all characters till the first safe character into the buffer.
998 // *start + 1 is used to test for the unsafe characters instead of *start
999 // because ucol_prev takes all unsafe characters till the first safe
1000 // character ie *start. so by testing *start + 1, we can estimate if
1001 // excess prefix characters have been included in the potential search
1002 // results.
1003 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1004 (*start + 1 < textlength
1005 && ucol_unsafeCP(text[*start + 1], collator))) {
// the expansion count must be read before the iterator is repositioned
1006 int32_t expansion = getExpansionPrefix(coleiter);
1007 UBool expandflag = expansion > 0;
1008 setColEIterOffset(coleiter, *start);
1009 while (expansion > 0) {
1010 // getting rid of the redundant ce, caused by setOffset.
1011 // since backward contraction/expansion may have extra ces if we
1012 // are in the normalization buffer, hasAccentsBeforeMatch would
1013 // have taken care of it.
1014 // E.g. the character \u01FA will have an expansion of 3, but if
1015 // we are only looking for acute and ring \u030A and \u0301, we'll
1016 // have to skip the first ce in the expansion buffer.
1017 ucol_next(coleiter, status);
1018 if (U_FAILURE(*status)) {
1019 return FALSE; // note: *end is left untouched on this path
1020 }
// the iterator moved on to another character: the match start
// becomes the previously remembered offset
1021 if (ucol_getOffset(coleiter) != temp) {
1022 *start = temp;
1023 temp = ucol_getOffset(coleiter);
1024 }
1025 expansion --;
1026 }
1027
1028 uint32_t *patternce = strsrch->pattern.CE;
1029 int32_t patterncelength = strsrch->pattern.CELength;
1030 int32_t count = 0; // number of pattern ces matched so far
// walk forwards from the (possibly trimmed) start and require every
// non-ignorable text ce to equal the pattern ce at the same position
1031 while (count < patterncelength) {
1032 uint32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1033 if (ce == UCOL_IGNORABLE) {
1034 continue;
1035 }
// same start adjustment as above, applied once at the first
// non-ignorable ce when leading expansion ces were skipped
1036 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1037 *start = temp;
1038 temp = ucol_getOffset(coleiter);
1039 }
1040 if (U_FAILURE(*status) || ce != patternce[count]) {
// reject: advance *end so the caller resumes after this candidate
1041 (*end) ++;
1042 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1043 return FALSE;
1044 }
1045 count ++;
1046 }
1047 }
1048 return TRUE;
1049 }
1050
1051 /**
1052 * Checks and sets the match information if found.
1053 * Checks
1054 * <ul>
1055 * <li> the potential match does not repeat the previous match
1056 * <li> boundaries are correct
1057 * <li> exact matches has no extra accents
1058 * <li> identical matchesb
1059 * <li> potential match does not end in the middle of a contraction
1060 * <\ul>
1061 * Otherwise the offset will be shifted to the next character.
1062 * Internal method, status assumed to be success, caller has to check status
1063 * before calling this method.
1064 * @param strsrch string search data
1065 * @param textoffset offset in the collation element text. the returned value
1066 * will be the truncated end offset of the match or the new start
1067 * search offset.
1068 * @param status output error status if any
1069 * @return TRUE if the match is valid, FALSE otherwise
1070 */
1071 static
1072 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1073 int32_t *textoffset, UErrorCode *status)
1074 {
1075 UCollationElements *coleiter = strsrch->textIter;
1076 int32_t start = getColElemIterOffset(coleiter, FALSE);
1077
1078 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1079 return FALSE;
1080 }
1081
1082 // this totally matches, however we need to check if it is repeating
1083 if (!isBreakUnit(strsrch, start, *textoffset) ||
1084 checkRepeatedMatch(strsrch, start, *textoffset) ||
1085 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1086 !checkIdentical(strsrch, start, *textoffset) ||
1087 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1088
1089 (*textoffset) ++;
1090 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1091 return FALSE;
1092 }
1093
1094 // totally match, we will get rid of the ending ignorables.
1095 strsrch->search->matchedIndex = start;
1096 strsrch->search->matchedLength = *textoffset - start;
1097 return TRUE;
1098 }
1099
1100 /**
1101 * Getting the previous base character offset, or the current offset if the
1102 * current character is a base character
1103 * @param text string
1104 * @param textoffset one offset after the current character
1105 * @return the offset of the next character after the base character or the first
1106 * composed character with accents
1107 */
1108 static
1109 inline int32_t getPreviousBaseOffset(const UChar *text,
1110 int32_t textoffset)
1111 {
1112 if (textoffset > 0) {
1113 while (TRUE) {
1114 int32_t result = textoffset;
1115 UTF_BACK_1(text, 0, textoffset);
1116 int32_t temp = textoffset;
1117 uint16_t fcd = getFCD(text, &temp, result);
1118 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1119 if (fcd & LAST_BYTE_MASK_) {
1120 return textoffset;
1121 }
1122 return result;
1123 }
1124 if (textoffset == 0) {
1125 return 0;
1126 }
1127 }
1128 }
1129 return textoffset;
1130 }
1131
1132 /**
1133 * Getting the indexes of the accents that are not blocked in the argument
1134 * accent array
1135 * @param accents array of accents in nfd terminated by a 0.
1136 * @param accentsindex array of indexes of the accents that are not blocked
1137 */
1138 static
1139 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1140 {
1141 int32_t index = 0;
1142 int32_t length = u_strlen(accents);
1143 UChar32 codepoint = 0;
1144 int cclass = 0;
1145 int result = 0;
1146 int32_t temp;
1147 while (index < length) {
1148 temp = index;
1149 UTF_NEXT_CHAR(accents, index, length, codepoint);
1150 if (u_getCombiningClass(codepoint) != cclass) {
1151 cclass = u_getCombiningClass(codepoint);
1152 accentsindex[result] = temp;
1153 result ++;
1154 }
1155 }
1156 accentsindex[result] = length;
1157 return result;
1158 }
1159
1160 /**
1161 * Appends 3 UChar arrays to a destination array.
1162 * Creates a new array if we run out of space. The caller will have to
1163 * manually deallocate the newly allocated array.
1164 * Internal method, status assumed to be success, caller has to check status
1165 * before calling this method. destination not to be NULL and has at least
1166 * size destinationlength.
1167 * @param destination target array
1168 * @param destinationlength target array size, returning the appended length
1169 * @param source1 null-terminated first array
1170 * @param source2 second array
1171 * @param source2length length of seond array
1172 * @param source3 null-terminated third array
1173 * @param status error status if any
1174 * @return new destination array, destination if there was no new allocation
1175 */
1176 static
1177 inline UChar * addToUCharArray( UChar *destination,
1178 int32_t *destinationlength,
1179 const UChar *source1,
1180 const UChar *source2,
1181 int32_t source2length,
1182 const UChar *source3,
1183 UErrorCode *status)
1184 {
1185 int32_t source1length = source1 ? u_strlen(source1) : 0;
1186 int32_t source3length = source3 ? u_strlen(source3) : 0;
1187 if (*destinationlength < source1length + source2length + source3length +
1188 1)
1189 {
1190 destination = (UChar *)allocateMemory(
1191 (source1length + source2length + source3length + 1) * sizeof(UChar),
1192 status);
1193 // if error allocating memory, status will be
1194 // U_MEMORY_ALLOCATION_ERROR
1195 if (U_FAILURE(*status)) {
1196 *destinationlength = 0;
1197 return NULL;
1198 }
1199 }
1200 if (source1length != 0) {
1201 uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1202 }
1203 if (source2length != 0) {
1204 uprv_memcpy(destination + source1length, source2,
1205 sizeof(UChar) * source2length);
1206 }
1207 if (source3length != 0) {
1208 uprv_memcpy(destination + source1length + source2length, source3,
1209 sizeof(UChar) * source3length);
1210 }
1211 *destinationlength = source1length + source2length + source3length;
1212 return destination;
1213 }
1214
1215 /**
1216 * Running through a collation element iterator to see if the contents matches
1217 * pattern in string search data
1218 * @param strsrch string search data
1219 * @param coleiter collation element iterator
1220 * @return TRUE if a match if found, FALSE otherwise
1221 */
1222 static
1223 inline UBool checkCollationMatch(const UStringSearch *strsrch,
1224 UCollationElements *coleiter)
1225 {
1226 int patternceindex = strsrch->pattern.CELength;
1227 uint32_t *patternce = strsrch->pattern.CE;
1228 UErrorCode status = U_ZERO_ERROR;
1229 while (patternceindex > 0) {
1230 uint32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1231 if (ce == UCOL_IGNORABLE) {
1232 continue;
1233 }
1234 if (U_FAILURE(status) || ce != *patternce) {
1235 return FALSE;
1236 }
1237 patternce ++;
1238 patternceindex --;
1239 }
1240 return TRUE;
1241 }
1242
1243 /**
1244 * Rearranges the front accents to try matching.
1245 * Prefix accents in the text will be grouped according to their combining
1246 * class and the groups will be mixed and matched to try find the perfect
1247 * match with the pattern.
1248 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1249 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1250 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1251 * "\u0301\u0325".
1252 * step 2: check if any of the generated substrings matches the pattern.
1253 * Internal method, status is assumed to be success, caller has to check status
1254 * before calling this method.
1255 * @param strsrch string search match
1256 * @param start first offset of the accents to start searching
1257 * @param end start of the last accent set
1258 * @param status output error status if any
1259 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1260 * offset of the match. Note this start includes all preceding accents.
1261 */
1262 static
1263 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1264 int32_t start,
1265 int32_t end,
1266 UErrorCode *status)
1267 {
1268 const UChar *text = strsrch->search->text;
1269 int32_t textlength = strsrch->search->textLength;
1270 int32_t tempstart = start;
1271
1272 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1273 // die... failed at a base character
1274 return USEARCH_DONE;
1275 }
1276
1277 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1278 start = getPreviousBaseOffset(text, tempstart);
1279
1280 UChar accents[INITIAL_ARRAY_SIZE_];
1281 // normalizing the offensive string
1282 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1283 INITIAL_ARRAY_SIZE_, status);
1284 if (U_FAILURE(*status)) {
1285 return USEARCH_DONE;
1286 }
1287
1288 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1289 int32_t accentsize = getUnblockedAccentIndex(accents,
1290 accentsindex);
1291 int32_t count = (2 << (accentsize - 1)) - 2;
1292 UChar buffer[INITIAL_ARRAY_SIZE_];
1293 UCollationElements *coleiter = strsrch->utilIter;
1294 while (U_SUCCESS(*status) && count > 0) {
1295 UChar *rearrange = strsrch->canonicalPrefixAccents;
1296 // copy the base characters
1297 for (int k = 0; k < accentsindex[0]; k ++) {
1298 *rearrange ++ = accents[k];
1299 }
1300 // forming all possible canonical rearrangement by dropping
1301 // sets of accents
1302 for (int i = 0; i <= accentsize - 1; i ++) {
1303 int32_t mask = 1 << (accentsize - i - 1);
1304 if (count & mask) {
1305 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1306 *rearrange ++ = accents[j];
1307 }
1308 }
1309 }
1310 *rearrange = 0;
1311 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1312 UChar *match = addToUCharArray(buffer, &matchsize,
1313 strsrch->canonicalPrefixAccents,
1314 strsrch->search->text + offset,
1315 end - offset,
1316 strsrch->canonicalSuffixAccents,
1317 status);
1318
1319 // if status is a failure, ucol_setText does nothing.
1320 // run the collator iterator through this match
1321 ucol_setText(coleiter, match, matchsize, status);
1322 if (U_SUCCESS(*status)) {
1323 if (checkCollationMatch(strsrch, coleiter)) {
1324 if (match != buffer) {
1325 uprv_free(match);
1326 }
1327 return start;
1328 }
1329 }
1330 count --;
1331 }
1332 return USEARCH_DONE;
1333 }
1334
1335 /**
1336 * Gets the offset to the safe point in text before textoffset.
1337 * ie. not the middle of a contraction, swappable characters or supplementary
1338 * characters.
1339 * @param collator collation sata
1340 * @param text string to work with
1341 * @param textoffset offset in string
1342 * @param textlength length of text string
1343 * @return offset to the previous safe character
1344 */
1345 static
1346 inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1347 const UChar *text,
1348 int32_t textoffset)
1349 {
1350 int32_t result = textoffset; // first contraction character
1351 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1352 result --;
1353 }
1354 if (result != 0) {
1355 // the first contraction character is consider unsafe here
1356 result --;
1357 }
1358 return result;
1359 }
1360
1361 /**
1362 * Cleaning up after we passed the safe zone
1363 * @param strsrch string search data
1364 * @param safetext safe text array
1365 * @param safebuffer safe text buffer
1366 * @param coleiter collation element iterator for safe text
1367 */
1368 static
1369 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1370 UChar *safebuffer)
1371 {
1372 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1373 {
1374 uprv_free(safetext);
1375 }
1376 }
1377
1378 /**
1379 * Takes the rearranged end accents and tries matching. If the match failed at
1380 * a separate preceding set of accents (separated from the rearranged one by
1381 * at least a base character) then we rearrange the preceding accents and
1382 * try matching again.
1383 * We allow skipping of the ends of the accent set if the ces do not match.
1384 * However if the failure is found before the accent set, it fails.
 * The pattern ces are compared from last to first while iterating
 * backwards, first over a "safe text" (the rearranged suffix accents,
 * optionally preceded by the text from the previous safe offset) using the
 * utility iterator, then, once that is exhausted, over the real text using
 * the text iterator.
1385 * Internal method, status assumed to be success, caller has to check status
1386 * before calling this method.
1387 * @param strsrch string search data
1388 * @param textoffset of the start of the rearranged accent
1389 * @param status output error status if any
1390 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1391 * offset of the match. Note this start includes all preceding accents.
1392 */
1393 static
1394 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1395 int32_t textoffset,
1396 UErrorCode *status)
1397 {
1398 const UChar *text = strsrch->search->text;
1399 const UCollator *collator = strsrch->collator;
1400 int32_t safelength = 0; // number of text code units copied in front of the accents
1401 UChar *safetext;
1402 int32_t safetextlength;
1403 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1404 UCollationElements *coleiter = strsrch->utilIter;
1405 int32_t safeoffset = textoffset;
1406
// if the first rearranged accent is unsafe, prepend the text from the
// previous safe offset so the iterator sees it in context; otherwise
// iterate over the rearranged accents alone
1407 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1408 collator)) {
1409 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1410 safelength = textoffset - safeoffset;
1411 safetextlength = INITIAL_ARRAY_SIZE_;
1412 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1413 text + safeoffset, safelength,
1414 strsrch->canonicalSuffixAccents,
1415 status);
1416 }
1417 else {
1418 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1419 safetext = strsrch->canonicalSuffixAccents;
1420 }
1421
1422 // if status is a failure, ucol_setText does nothing
1423 ucol_setText(coleiter, safetext, safetextlength, status);
1424 // status checked in loop below
1425
1426 uint32_t *ce = strsrch->pattern.CE;
1427 uint32_t celength = strsrch->pattern.CELength;
1428 int ceindex = celength - 1; // pattern ces are consumed from the last one backwards
1429 UBool isSafe = TRUE; // indication flag for position in safe zone
1430
1431 while (ceindex >= 0) {
1432 uint32_t textce = ucol_previous(coleiter, status);
1433 if (U_FAILURE(*status)) {
// safetext has already been cleaned up once isSafe is FALSE
1434 if (isSafe) {
1435 cleanUpSafeText(strsrch, safetext, safebuffer);
1436 }
1437 return USEARCH_DONE;
1438 }
1439 if (textce == UCOL_NULLORDER) {
1440 // check if we have passed the safe buffer
1441 if (coleiter == strsrch->textIter) {
// already on the real text: nothing left to iterate over
1442 cleanUpSafeText(strsrch, safetext, safebuffer);
1443 return USEARCH_DONE;
1444 }
// safe text exhausted: release it and continue backwards on the
// real text from safeoffset
1445 cleanUpSafeText(strsrch, safetext, safebuffer);
1446 safetext = safebuffer; // makes any later cleanUpSafeText call a no-op
1447 coleiter = strsrch->textIter;
1448 setColEIterOffset(coleiter, safeoffset);
1449 // status checked at the start of the loop
1450 isSafe = FALSE;
1451 continue;
1452 }
1453 textce = getCE(strsrch, textce);
1454 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1455 // do the beginning stuff
1456 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1457 if (isSafe && failedoffset >= safelength) {
1458 // alas... no hope. failed at rearranged accent set
1459 cleanUpSafeText(strsrch, safetext, safebuffer);
1460 return USEARCH_DONE;
1461 }
1462 else {
1463 if (isSafe) {
// translate the offset within the safe text into a text offset
1464 failedoffset += safeoffset;
1465 cleanUpSafeText(strsrch, safetext, safebuffer);
1466 }
1467
1468 // try rearranging the front accents
1469 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1470 failedoffset, textoffset, status);
1471 if (result != USEARCH_DONE) {
1472 // position the text iterator at the start of the prefix match
1473 setColEIterOffset(strsrch->textIter, result);
1474 }
1475 if (U_FAILURE(*status)) {
1476 return USEARCH_DONE;
1477 }
1478 return result;
1479 }
1480 }
// ignorable text ces fall through here without consuming a pattern ce
1481 if (textce == ce[ceindex]) {
1482 ceindex --;
1483 }
1484 }
1485 // all pattern ces matched; set offset here
1486 if (isSafe) {
// the match completed inside the safe text: map the position back
// onto the real text and carry over the iterator's leftover ces
1487 int32_t result = getColElemIterOffset(coleiter, FALSE);
1488 // sets the text iterator here with the correct expansion and offset
1489 int32_t leftoverces = getExpansionPrefix(coleiter);
1490 cleanUpSafeText(strsrch, safetext, safebuffer);
1491 if (result >= safelength) {
1492 result = textoffset;
1493 }
1494 else {
1495 result += safeoffset;
1496 }
1497 setColEIterOffset(strsrch->textIter, result);
1498 strsrch->textIter->iteratordata_.toReturn =
1499 setExpansionPrefix(strsrch->textIter, leftoverces);
1500 return result;
1501 }
1502
// the match completed on the real text iterator
1503 return ucol_getOffset(coleiter);
1504 }
1505
1506 /**
1507 * Trying out the substring and sees if it can be a canonical match.
1508 * This will try normalizing the end accents and arranging them into canonical
1509 * equivalents and check their corresponding ces with the pattern ce.
1510 * Suffix accents in the text will be grouped according to their combining
1511 * class and the groups will be mixed and matched to try find the perfect
1512 * match with the pattern.
1513 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1514 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1515 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1516 * "\u0301\u0325".
1517 * step 2: check if any of the generated substrings matches the pattern.
1518 * Internal method, status assumed to be success, caller has to check status
1519 * before calling this method.
1520 * @param strsrch string search data
1521 * @param textoffset end offset in the collation element text that ends with
1522 * the accents to be rearranged
1523 * @param status error status if any
1524 * @return TRUE if the match is valid, FALSE otherwise
1525 */
1526 static
1527 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1528 int32_t textoffset,
1529 UErrorCode *status)
1530 {
1531 const UChar *text = strsrch->search->text;
1532 int32_t temp = textoffset;
1533 UTF_BACK_1(text, 0, temp);
1534 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1535 UCollationElements *coleiter = strsrch->textIter;
1536 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1537 if (strsrch->pattern.hasPrefixAccents) {
1538 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1539 status);
1540 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1541 setColEIterOffset(coleiter, offset);
1542 return TRUE;
1543 }
1544 }
1545 return FALSE;
1546 }
1547
1548 if (!strsrch->pattern.hasSuffixAccents) {
1549 return FALSE;
1550 }
1551
1552 UChar accents[INITIAL_ARRAY_SIZE_];
1553 // offset to the last base character in substring to search
1554 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1555 // normalizing the offensive string
1556 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1557 0, accents, INITIAL_ARRAY_SIZE_, status);
1558 // status checked in loop below
1559
1560 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1561 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1562
1563 // 2 power n - 1 minus the full set of accents
1564 int32_t count = (2 << (size - 1)) - 2;
1565 while (U_SUCCESS(*status) && count > 0) {
1566 UChar *rearrange = strsrch->canonicalSuffixAccents;
1567 // copy the base characters
1568 for (int k = 0; k < accentsindex[0]; k ++) {
1569 *rearrange ++ = accents[k];
1570 }
1571 // forming all possible canonical rearrangement by dropping
1572 // sets of accents
1573 for (int i = 0; i <= size - 1; i ++) {
1574 int32_t mask = 1 << (size - i - 1);
1575 if (count & mask) {
1576 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1577 *rearrange ++ = accents[j];
1578 }
1579 }
1580 }
1581 *rearrange = 0;
1582 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1583 status);
1584 if (offset != USEARCH_DONE) {
1585 return TRUE; // match found
1586 }
1587 count --;
1588 }
1589 return FALSE;
1590 }
1591
1592 /**
1593 * Gets the previous base character offset depending on the string search
1594 * pattern data
1595 * @param strsrch string search data
1596 * @param textoffset current offset, current character
1597 * @return the offset of the next character after this base character or itself
1598 * if it is a composed character with accents
1599 */
1600 static
1601 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1602 int32_t textoffset)
1603 {
1604 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1605 const UChar *text = strsrch->search->text;
1606 int32_t offset = textoffset;
1607 if (getFCD(text, &offset, strsrch->search->textLength) >>
1608 SECOND_LAST_BYTE_SHIFT_) {
1609 return getPreviousBaseOffset(text, textoffset);
1610 }
1611 }
1612 return textoffset;
1613 }
1614
1615 /**
1616 * Checks match for contraction.
1617 * If the match ends with a partial contraction we fail.
1618 * If the match starts too far off (because of backwards iteration) we try to
1619 * chip off the extra characters.
 * The candidate is only re-iterated when the code unit at *end, or the one
 * at *start + 1, is reported unsafe by ucol_unsafeCP; otherwise it is
 * accepted without further checks.
 * Unlike the exact variant, a mismatch on the very first pattern ce is
 * tolerated when the text at *start has a non-zero lower FCD byte: further
 * text ces are then skipped until the first pattern ce is found, the
 * iterator is exhausted, or the iterator moves past *end.
1620 * Internal method, status assumed to be success, caller has to check status
1621 * before calling this method.
1622 * @param strsrch string search data
1623 * @param start offset of potential match, to be modified if necessary
1624 * @param end offset of potential match, to be modified if necessary. When a
 * collation element mismatch (or an error during the comparison) is
 * found, *end is incremented and then moved on by
 * getNextUStringSearchBaseOffset before FALSE is returned.
1625 * @param status output error status if any
1626 * @return TRUE if match passes the contraction test, FALSE otherwise
1627 */
1628 static
1629 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1630 int32_t *start,
1631 int32_t *end,
1632 UErrorCode *status)
1633 {
1634 UCollationElements *coleiter = strsrch->textIter;
1635 int32_t textlength = strsrch->search->textLength;
1636 int32_t temp = *start; // last iterator offset seen while trimming the start
1637 const UCollator *collator = strsrch->collator;
1638 const UChar *text = strsrch->search->text;
1639 // This part checks if either end of the match contains a potential
1640 // contraction. If so we'll have to iterate through them
1641 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1642 (*start + 1 < textlength
1643 && ucol_unsafeCP(text[*start + 1], collator))) {
// the expansion count must be read before the iterator is repositioned
1644 int32_t expansion = getExpansionPrefix(coleiter);
1645 UBool expandflag = expansion > 0;
1646 setColEIterOffset(coleiter, *start);
1647 while (expansion > 0) {
1648 // getting rid of the redundant ce, caused by setOffset.
1649 // since backward contraction/expansion may have extra ces if we
1650 // are in the normalization buffer, hasAccentsBeforeMatch would
1651 // have taken care of it.
1652 // E.g. the character \u01FA will have an expansion of 3, but if
1653 // we are only looking for acute and ring \u030A and \u0301, we'll
1654 // have to skip the first ce in the expansion buffer.
1655 ucol_next(coleiter, status);
1656 if (U_FAILURE(*status)) {
1657 return FALSE; // note: *end is left untouched on this path
1658 }
// the iterator moved on to another character: the match start
// becomes the previously remembered offset
1659 if (ucol_getOffset(coleiter) != temp) {
1660 *start = temp;
1661 temp = ucol_getOffset(coleiter);
1662 }
1663 expansion --;
1664 }
1665
1666 uint32_t *patternce = strsrch->pattern.CE;
1667 int32_t patterncelength = strsrch->pattern.CELength;
1668 int32_t count = 0; // number of pattern ces matched so far
// NOTE(review): this redeclaration shadows the textlength declared at
// the top of the function; both are initialized from the same field
1669 int32_t textlength = strsrch->search->textLength;
1670 while (count < patterncelength) {
1671 uint32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1672 // status checked below, note that if status is a failure
1673 // ucol_next returns UCOL_NULLORDER
1674 if (ce == UCOL_IGNORABLE) {
1675 continue;
1676 }
// same start adjustment as above, applied once at the first
// non-ignorable ce when leading expansion ces were skipped
1677 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1678 *start = temp;
1679 temp = ucol_getOffset(coleiter);
1680 }
1681
1682 if (count == 0 && ce != patternce[0]) {
1683 // accents may have extra starting ces, this occurs when a
1684 // pure accent pattern is matched without rearrangement
1685 // text \u0325\u0300 and looking for \u0300
1686 uint32_t expected = patternce[0];
// NOTE(review): start itself is handed to getFCD here, whereas other
// callers in this file pass a scratch copy of the offset; confirm that
// any change getFCD makes to *start is intended
1687 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1688 ce = getCE(strsrch, ucol_next(coleiter, status));
// skip text ces until the first pattern ce shows up, the iterator
// runs dry or fails, or we step past the end of the candidate
1689 while (U_SUCCESS(*status) && ce != expected &&
1690 ce != UCOL_NULLORDER &&
1691 ucol_getOffset(coleiter) <= *end) {
1692 ce = getCE(strsrch, ucol_next(coleiter, status));
1693 }
1694 }
1695 }
1696 if (U_FAILURE(*status) || ce != patternce[count]) {
// reject: advance *end so the caller resumes after this candidate
1697 (*end) ++;
1698 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1699 return FALSE;
1700 }
1701 count ++;
1702 }
1703 }
1704 return TRUE;
1705 }
1706
1707 /**
1708 * Checks and sets the match information if found.
1709 * Checks
1710 * <ul>
1711 * <li> the potential match does not repeat the previous match
1712 * <li> boundaries are correct
1713 * <li> potential match does not end in the middle of a contraction
1714 * <li> identical matches
1715 * <\ul>
1716 * Otherwise the offset will be shifted to the next character.
1717 * Internal method, status assumed to be success, caller has to check the
1718 * status before calling this method.
1719 * @param strsrch string search data
1720 * @param textoffset offset in the collation element text. the returned value
1721 * will be the truncated end offset of the match or the new start
1722 * search offset.
1723 * @param status output error status if any
1724 * @return TRUE if the match is valid, FALSE otherwise
1725 */
1726 static
1727 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1728 int32_t *textoffset,
1729 UErrorCode *status)
1730 {
1731 // to ensure that the start and ends are not composite characters
1732 UCollationElements *coleiter = strsrch->textIter;
1733 // if we have a canonical accent match
1734 if ((strsrch->pattern.hasSuffixAccents &&
1735 strsrch->canonicalSuffixAccents[0]) ||
1736 (strsrch->pattern.hasPrefixAccents &&
1737 strsrch->canonicalPrefixAccents[0])) {
1738 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1739 strsrch,
1740 ucol_getOffset(coleiter));
1741 strsrch->search->matchedLength = *textoffset -
1742 strsrch->search->matchedIndex;
1743 return TRUE;
1744 }
1745
1746 int32_t start = getColElemIterOffset(coleiter, FALSE);
1747 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1748 status) || U_FAILURE(*status)) {
1749 return FALSE;
1750 }
1751
1752 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1753 // this totally matches, however we need to check if it is repeating
1754 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1755 !isBreakUnit(strsrch, start, *textoffset) ||
1756 !checkIdentical(strsrch, start, *textoffset)) {
1757 (*textoffset) ++;
1758 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1759 strsrch->search->textLength);
1760 return FALSE;
1761 }
1762
1763 strsrch->search->matchedIndex = start;
1764 strsrch->search->matchedLength = *textoffset - start;
1765 return TRUE;
1766 }
1767
1768 /**
1769 * Shifting the collation element iterator position forward to prepare for
1770 * a preceding match. If the first character is a unsafe character, we'll only
1771 * shift by 1 to capture contractions, normalization etc.
1772 * Internal method, status assumed to be success, caller has to check status
1773 * before calling this method.
1774 * @param text strsrch string search data
1775 * @param textoffset start text position to do search
1776 * @param ce the text ce which failed the match.
1777 * @param patternceindex index of the ce within the pattern ce buffer which
1778 * failed the match
1779 * @return final offset
1780 */
1781 static
1782 inline int32_t reverseShift(UStringSearch *strsrch,
1783 int32_t textoffset,
1784 uint32_t ce,
1785 int32_t patternceindex)
1786 {
1787 if (strsrch->search->isOverlap) {
1788 if (textoffset != strsrch->search->textLength) {
1789 textoffset --;
1790 }
1791 else {
1792 textoffset -= strsrch->pattern.defaultShiftSize;
1793 }
1794 }
1795 else {
1796 if (ce != UCOL_NULLORDER) {
1797 int32_t shift = strsrch->pattern.backShift[hash(ce)];
1798
1799 // this is to adjust for characters in the middle of the substring
1800 // for matching that failed.
1801 int32_t adjust = patternceindex;
1802 if (adjust > 1 && shift > adjust) {
1803 shift -= adjust - 1;
1804 }
1805 textoffset -= shift;
1806 }
1807 else {
1808 textoffset -= strsrch->pattern.defaultShiftSize;
1809 }
1810 }
1811 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1812 return textoffset;
1813 }
1814
1815 /**
1816 * Checks match for contraction.
1817 * If the match starts with a partial contraction we fail.
1818 * Internal method, status assumed to be success, caller has to check status
1819 * before calling this method.
1820 * @param strsrch string search data
1821 * @param start offset of potential match, to be modified if necessary
1822 * @param end offset of potential match, to be modified if necessary
1823 * @param status output error status if any
1824 * @return TRUE if match passes the contraction test, FALSE otherwise
1825 */
1826 static
1827 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
1828 int32_t *start,
1829 int32_t *end, UErrorCode *status)
1830 {
1831 UCollationElements *coleiter = strsrch->textIter;
1832 int32_t textlength = strsrch->search->textLength;
1833 int32_t temp = *end;
1834 const UCollator *collator = strsrch->collator;
1835 const UChar *text = strsrch->search->text;
1836 // This part checks if either if the start of the match contains potential
1837 // contraction. If so we'll have to iterate through them
1838 // Since we used ucol_next while previously looking for the potential
1839 // match, this guarantees that our end will not be a partial contraction,
1840 // or a partial supplementary character.
1841 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
1842 int32_t expansion = getExpansionSuffix(coleiter);
1843 UBool expandflag = expansion > 0;
1844 setColEIterOffset(coleiter, *end);
1845 while (U_SUCCESS(*status) && expansion > 0) {
1846 // getting rid of the redundant ce
1847 // since forward contraction/expansion may have extra ces
1848 // if we are in the normalization buffer, hasAccentsBeforeMatch
1849 // would have taken care of it.
1850 // E.g. the character \u01FA will have an expansion of 3, but if
1851 // we are only looking for A ring A\u030A, we'll have to skip the
1852 // last ce in the expansion buffer
1853 ucol_previous(coleiter, status);
1854 if (U_FAILURE(*status)) {
1855 return FALSE;
1856 }
1857 if (ucol_getOffset(coleiter) != temp) {
1858 *end = temp;
1859 temp = ucol_getOffset(coleiter);
1860 }
1861 expansion --;
1862 }
1863
1864 uint32_t *patternce = strsrch->pattern.CE;
1865 int32_t patterncelength = strsrch->pattern.CELength;
1866 int32_t count = patterncelength;
1867 while (count > 0) {
1868 uint32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
1869 // status checked below, note that if status is a failure
1870 // ucol_previous returns UCOL_NULLORDER
1871 if (ce == UCOL_IGNORABLE) {
1872 continue;
1873 }
1874 if (expandflag && count == 0 &&
1875 getColElemIterOffset(coleiter, FALSE) != temp) {
1876 *end = temp;
1877 temp = ucol_getOffset(coleiter);
1878 }
1879 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
1880 (*start) --;
1881 *start = getPreviousBaseOffset(text, *start);
1882 return FALSE;
1883 }
1884 count --;
1885 }
1886 }
1887 return TRUE;
1888 }
1889
1890 /**
1891 * Checks and sets the match information if found.
1892 * Checks
1893 * <ul>
1894 * <li> the current match does not repeat the last match
1895 * <li> boundaries are correct
1896 * <li> exact matches has no extra accents
1897 * <li> identical matches
1898 * <\ul>
1899 * Otherwise the offset will be shifted to the preceding character.
1900 * Internal method, status assumed to be success, caller has to check status
1901 * before calling this method.
1902 * @param strsrch string search data
1903 * @param collator
1904 * @param coleiter collation element iterator
1905 * @param text string
1906 * @param textoffset offset in the collation element text. the returned value
1907 * will be the truncated start offset of the match or the new start
1908 * search offset.
1909 * @param status output error status if any
1910 * @return TRUE if the match is valid, FALSE otherwise
1911 */
1912 static
1913 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
1914 int32_t *textoffset,
1915 UErrorCode *status)
1916 {
1917 // to ensure that the start and ends are not composite characters
1918 int32_t end = ucol_getOffset(strsrch->textIter);
1919 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
1920 || U_FAILURE(*status)) {
1921 return FALSE;
1922 }
1923
1924 // this totally matches, however we need to check if it is repeating
1925 // the old match
1926 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
1927 !isBreakUnit(strsrch, *textoffset, end) ||
1928 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
1929 !checkIdentical(strsrch, *textoffset, end) ||
1930 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
1931 (*textoffset) --;
1932 *textoffset = getPreviousBaseOffset(strsrch->search->text,
1933 *textoffset);
1934 return FALSE;
1935 }
1936 strsrch->search->matchedIndex = *textoffset;
1937 strsrch->search->matchedLength = end - *textoffset;
1938 return TRUE;
1939 }
1940
1941 /**
1942 * Rearranges the end accents to try matching.
1943 * Suffix accents in the text will be grouped according to their combining
1944 * class and the groups will be mixed and matched to try find the perfect
1945 * match with the pattern.
1946 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1947 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1948 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1949 * "\u0301\u0325".
1950 * step 2: check if any of the generated substrings matches the pattern.
1951 * Internal method, status assumed to be success, user has to check status
1952 * before calling this method.
1953 * @param strsrch string search match
1954 * @param start offset of the first base character
1955 * @param end start of the last accent set
1956 * @param status only error status if any
1957 * @return USEARCH_DONE if a match is not found, otherwise return the ending
1958 * offset of the match. Note this start includes all following accents.
1959 */
1960 static
1961 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
1962 int32_t start,
1963 int32_t end,
1964 UErrorCode *status)
1965 {
1966 const UChar *text = strsrch->search->text;
1967 int32_t tempend = end;
1968
1969 UTF_BACK_1(text, 0, tempend);
1970 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
1971 LAST_BYTE_MASK_)) {
1972 // die... failed at a base character
1973 return USEARCH_DONE;
1974 }
1975 end = getNextBaseOffset(text, end, strsrch->search->textLength);
1976
1977 if (U_SUCCESS(*status)) {
1978 UChar accents[INITIAL_ARRAY_SIZE_];
1979 int32_t offset = getPreviousBaseOffset(text, end);
1980 // normalizing the offensive string
1981 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
1982 INITIAL_ARRAY_SIZE_, status);
1983
1984 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1985 int32_t accentsize = getUnblockedAccentIndex(accents,
1986 accentsindex);
1987 int32_t count = (2 << (accentsize - 1)) - 2;
1988 UChar buffer[INITIAL_ARRAY_SIZE_];
1989 UCollationElements *coleiter = strsrch->utilIter;
1990 while (U_SUCCESS(*status) && count > 0) {
1991 UChar *rearrange = strsrch->canonicalSuffixAccents;
1992 // copy the base characters
1993 for (int k = 0; k < accentsindex[0]; k ++) {
1994 *rearrange ++ = accents[k];
1995 }
1996 // forming all possible canonical rearrangement by dropping
1997 // sets of accents
1998 for (int i = 0; i <= accentsize - 1; i ++) {
1999 int32_t mask = 1 << (accentsize - i - 1);
2000 if (count & mask) {
2001 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2002 *rearrange ++ = accents[j];
2003 }
2004 }
2005 }
2006 *rearrange = 0;
2007 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2008 UChar *match = addToUCharArray(buffer, &matchsize,
2009 strsrch->canonicalPrefixAccents,
2010 strsrch->search->text + start,
2011 offset - start,
2012 strsrch->canonicalSuffixAccents,
2013 status);
2014
2015 // run the collator iterator through this match
2016 // if status is a failure ucol_setText does nothing
2017 ucol_setText(coleiter, match, matchsize, status);
2018 if (U_SUCCESS(*status)) {
2019 if (checkCollationMatch(strsrch, coleiter)) {
2020 if (match != buffer) {
2021 uprv_free(match);
2022 }
2023 return end;
2024 }
2025 }
2026 count --;
2027 }
2028 }
2029 return USEARCH_DONE;
2030 }
2031
2032 /**
2033 * Take the rearranged start accents and tries matching. If match failed at
2034 * a seperate following set of accents (seperated from the rearranged on by
2035 * at least a base character) then we rearrange the preceding accents and
2036 * tries matching again.
2037 * We allow skipping of the ends of the accent set if the ces do not match.
2038 * However if the failure is found before the accent set, it fails.
2039 * Internal method, status assumed to be success, caller has to check status
2040 * before calling this method.
2041 * @param strsrch string search data
2042 * @param textoffset of the ends of the rearranged accent
2043 * @param status output error status if any
2044 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2045 * offset of the match. Note this start includes all following accents.
2046 */
2047 static
2048 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2049 int32_t textoffset,
2050 UErrorCode *status)
2051 {
2052 const UChar *text = strsrch->search->text;
2053 const UCollator *collator = strsrch->collator;
2054 int32_t safelength = 0;
2055 UChar *safetext;
2056 int32_t safetextlength;
2057 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2058 int32_t safeoffset = textoffset;
2059
2060 if (textoffset &&
2061 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2062 u_strlen(strsrch->canonicalPrefixAccents) - 1
2063 ], collator)) {
2064 safeoffset = getNextSafeOffset(collator, text, textoffset,
2065 strsrch->search->textLength);
2066 safelength = safeoffset - textoffset;
2067 safetextlength = INITIAL_ARRAY_SIZE_;
2068 safetext = addToUCharArray(safebuffer, &safetextlength,
2069 strsrch->canonicalPrefixAccents,
2070 text + textoffset, safelength,
2071 NULL, status);
2072 }
2073 else {
2074 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2075 safetext = strsrch->canonicalPrefixAccents;
2076 }
2077
2078 UCollationElements *coleiter = strsrch->utilIter;
2079 // if status is a failure, ucol_setText does nothing
2080 ucol_setText(coleiter, safetext, safetextlength, status);
2081 // status checked in loop below
2082
2083 uint32_t *ce = strsrch->pattern.CE;
2084 int32_t celength = strsrch->pattern.CELength;
2085 int ceindex = 0;
2086 UBool isSafe = TRUE; // safe zone indication flag for position
2087 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2088
2089 while (ceindex < celength) {
2090 uint32_t textce = ucol_next(coleiter, status);
2091 if (U_FAILURE(*status)) {
2092 if (isSafe) {
2093 cleanUpSafeText(strsrch, safetext, safebuffer);
2094 }
2095 return USEARCH_DONE;
2096 }
2097 if (textce == UCOL_NULLORDER) {
2098 // check if we have passed the safe buffer
2099 if (coleiter == strsrch->textIter) {
2100 cleanUpSafeText(strsrch, safetext, safebuffer);
2101 return USEARCH_DONE;
2102 }
2103 cleanUpSafeText(strsrch, safetext, safebuffer);
2104 safetext = safebuffer;
2105 coleiter = strsrch->textIter;
2106 setColEIterOffset(coleiter, safeoffset);
2107 // status checked at the start of the loop
2108 isSafe = FALSE;
2109 continue;
2110 }
2111 textce = getCE(strsrch, textce);
2112 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2113 // do the beginning stuff
2114 int32_t failedoffset = ucol_getOffset(coleiter);
2115 if (isSafe && failedoffset <= prefixlength) {
2116 // alas... no hope. failed at rearranged accent set
2117 cleanUpSafeText(strsrch, safetext, safebuffer);
2118 return USEARCH_DONE;
2119 }
2120 else {
2121 if (isSafe) {
2122 failedoffset = safeoffset - failedoffset;
2123 cleanUpSafeText(strsrch, safetext, safebuffer);
2124 }
2125
2126 // try rearranging the end accents
2127 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2128 textoffset, failedoffset, status);
2129 if (result != USEARCH_DONE) {
2130 // if status is a failure, ucol_setOffset does nothing
2131 setColEIterOffset(strsrch->textIter, result);
2132 }
2133 if (U_FAILURE(*status)) {
2134 return USEARCH_DONE;
2135 }
2136 return result;
2137 }
2138 }
2139 if (textce == ce[ceindex]) {
2140 ceindex ++;
2141 }
2142 }
2143 // set offset here
2144 if (isSafe) {
2145 int32_t result = ucol_getOffset(coleiter);
2146 // sets the text iterator here with the correct expansion and offset
2147 int32_t leftoverces = getExpansionSuffix(coleiter);
2148 cleanUpSafeText(strsrch, safetext, safebuffer);
2149 if (result <= prefixlength) {
2150 result = textoffset;
2151 }
2152 else {
2153 result = textoffset + (safeoffset - result);
2154 }
2155 setColEIterOffset(strsrch->textIter, result);
2156 setExpansionSuffix(strsrch->textIter, leftoverces);
2157 return result;
2158 }
2159
2160 return ucol_getOffset(coleiter);
2161 }
2162
2163 /**
2164 * Trying out the substring and sees if it can be a canonical match.
2165 * This will try normalizing the starting accents and arranging them into
2166 * canonical equivalents and check their corresponding ces with the pattern ce.
2167 * Prefix accents in the text will be grouped according to their combining
2168 * class and the groups will be mixed and matched to try find the perfect
2169 * match with the pattern.
2170 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2171 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2172 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2173 * "\u0301\u0325".
2174 * step 2: check if any of the generated substrings matches the pattern.
2175 * Internal method, status assumed to be success, caller has to check status
2176 * before calling this method.
2177 * @param strsrch string search data
2178 * @param textoffset start offset in the collation element text that starts
2179 * with the accents to be rearranged
2180 * @param status output error status if any
2181 * @return TRUE if the match is valid, FALSE otherwise
2182 */
2183 static
2184 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2185 int32_t textoffset,
2186 UErrorCode *status)
2187 {
2188 const UChar *text = strsrch->search->text;
2189 int32_t temp = textoffset;
2190 int32_t textlength = strsrch->search->textLength;
2191 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2192 UCollationElements *coleiter = strsrch->textIter;
2193 int32_t offset = ucol_getOffset(coleiter);
2194 if (strsrch->pattern.hasSuffixAccents) {
2195 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2196 offset, status);
2197 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2198 setColEIterOffset(coleiter, offset);
2199 return TRUE;
2200 }
2201 }
2202 return FALSE;
2203 }
2204
2205 if (!strsrch->pattern.hasPrefixAccents) {
2206 return FALSE;
2207 }
2208
2209 UChar accents[INITIAL_ARRAY_SIZE_];
2210 // offset to the last base character in substring to search
2211 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2212 // normalizing the offensive string
2213 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2214 0, accents, INITIAL_ARRAY_SIZE_, status);
2215 // status checked in loop
2216
2217 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2218 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2219
2220 // 2 power n - 1 minus the full set of accents
2221 int32_t count = (2 << (size - 1)) - 2;
2222 while (U_SUCCESS(*status) && count > 0) {
2223 UChar *rearrange = strsrch->canonicalPrefixAccents;
2224 // copy the base characters
2225 for (int k = 0; k < accentsindex[0]; k ++) {
2226 *rearrange ++ = accents[k];
2227 }
2228 // forming all possible canonical rearrangement by dropping
2229 // sets of accents
2230 for (int i = 0; i <= size - 1; i ++) {
2231 int32_t mask = 1 << (size - i - 1);
2232 if (count & mask) {
2233 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2234 *rearrange ++ = accents[j];
2235 }
2236 }
2237 }
2238 *rearrange = 0;
2239 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2240 baseoffset, status);
2241 if (offset != USEARCH_DONE) {
2242 return TRUE; // match found
2243 }
2244 count --;
2245 }
2246 return FALSE;
2247 }
2248
2249 /**
2250 * Checks match for contraction.
2251 * If the match starts with a partial contraction we fail.
2252 * Internal method, status assumed to be success, caller has to check status
2253 * before calling this method.
2254 * @param strsrch string search data
2255 * @param start offset of potential match, to be modified if necessary
2256 * @param end offset of potential match, to be modified if necessary
2257 * @param status only error status if any
2258 * @return TRUE if match passes the contraction test, FALSE otherwise
2259 */
2260 static
2261 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2262 int32_t *start,
2263 int32_t *end, UErrorCode *status)
2264 {
2265 UCollationElements *coleiter = strsrch->textIter;
2266 int32_t textlength = strsrch->search->textLength;
2267 int32_t temp = *end;
2268 const UCollator *collator = strsrch->collator;
2269 const UChar *text = strsrch->search->text;
2270 // This part checks if either if the start of the match contains potential
2271 // contraction. If so we'll have to iterate through them
2272 // Since we used ucol_next while previously looking for the potential
2273 // match, this guarantees that our end will not be a partial contraction,
2274 // or a partial supplementary character.
2275 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2276 int32_t expansion = getExpansionSuffix(coleiter);
2277 UBool expandflag = expansion > 0;
2278 setColEIterOffset(coleiter, *end);
2279 while (expansion > 0) {
2280 // getting rid of the redundant ce
2281 // since forward contraction/expansion may have extra ces
2282 // if we are in the normalization buffer, hasAccentsBeforeMatch
2283 // would have taken care of it.
2284 // E.g. the character \u01FA will have an expansion of 3, but if
2285 // we are only looking for A ring A\u030A, we'll have to skip the
2286 // last ce in the expansion buffer
2287 ucol_previous(coleiter, status);
2288 if (U_FAILURE(*status)) {
2289 return FALSE;
2290 }
2291 if (ucol_getOffset(coleiter) != temp) {
2292 *end = temp;
2293 temp = ucol_getOffset(coleiter);
2294 }
2295 expansion --;
2296 }
2297
2298 uint32_t *patternce = strsrch->pattern.CE;
2299 int32_t patterncelength = strsrch->pattern.CELength;
2300 int32_t count = patterncelength;
2301 while (count > 0) {
2302 uint32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2303 // status checked below, note that if status is a failure
2304 // ucol_previous returns UCOL_NULLORDER
2305 if (ce == UCOL_IGNORABLE) {
2306 continue;
2307 }
2308 if (expandflag && count == 0 &&
2309 getColElemIterOffset(coleiter, FALSE) != temp) {
2310 *end = temp;
2311 temp = ucol_getOffset(coleiter);
2312 }
2313 if (count == patterncelength &&
2314 ce != patternce[patterncelength - 1]) {
2315 // accents may have extra starting ces, this occurs when a
2316 // pure accent pattern is matched without rearrangement
2317 uint32_t expected = patternce[patterncelength - 1];
2318 UTF_BACK_1(text, 0, *end);
2319 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2320 ce = getCE(strsrch, ucol_previous(coleiter, status));
2321 while (U_SUCCESS(*status) && ce != expected &&
2322 ce != UCOL_NULLORDER &&
2323 ucol_getOffset(coleiter) <= *start) {
2324 ce = getCE(strsrch, ucol_previous(coleiter, status));
2325 }
2326 }
2327 }
2328 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2329 (*start) --;
2330 *start = getPreviousBaseOffset(text, *start);
2331 return FALSE;
2332 }
2333 count --;
2334 }
2335 }
2336 return TRUE;
2337 }
2338
2339 /**
2340 * Checks and sets the match information if found.
2341 * Checks
2342 * <ul>
2343 * <li> the potential match does not repeat the previous match
2344 * <li> boundaries are correct
2345 * <li> potential match does not end in the middle of a contraction
2346 * <li> identical matches
2347 * <\ul>
2348 * Otherwise the offset will be shifted to the next character.
2349 * Internal method, status assumed to be success, caller has to check status
2350 * before calling this method.
2351 * @param strsrch string search data
2352 * @param textoffset offset in the collation element text. the returned value
2353 * will be the truncated start offset of the match or the new start
2354 * search offset.
2355 * @param status only error status if any
2356 * @return TRUE if the match is valid, FALSE otherwise
2357 */
2358 static
2359 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2360 int32_t *textoffset,
2361 UErrorCode *status)
2362 {
2363 // to ensure that the start and ends are not composite characters
2364 UCollationElements *coleiter = strsrch->textIter;
2365 // if we have a canonical accent match
2366 if ((strsrch->pattern.hasSuffixAccents &&
2367 strsrch->canonicalSuffixAccents[0]) ||
2368 (strsrch->pattern.hasPrefixAccents &&
2369 strsrch->canonicalPrefixAccents[0])) {
2370 strsrch->search->matchedIndex = *textoffset;
2371 strsrch->search->matchedLength =
2372 getNextUStringSearchBaseOffset(strsrch,
2373 getColElemIterOffset(coleiter, FALSE))
2374 - *textoffset;
2375 return TRUE;
2376 }
2377
2378 int32_t end = ucol_getOffset(coleiter);
2379 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2380 status) ||
2381 U_FAILURE(*status)) {
2382 return FALSE;
2383 }
2384
2385 end = getNextUStringSearchBaseOffset(strsrch, end);
2386 // this totally matches, however we need to check if it is repeating
2387 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2388 !isBreakUnit(strsrch, *textoffset, end) ||
2389 !checkIdentical(strsrch, *textoffset, end)) {
2390 (*textoffset) --;
2391 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2392 *textoffset);
2393 return FALSE;
2394 }
2395
2396 strsrch->search->matchedIndex = *textoffset;
2397 strsrch->search->matchedLength = end - *textoffset;
2398 return TRUE;
2399 }
2400
2401 // constructors and destructor -------------------------------------------
2402
2403 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2404 int32_t patternlength,
2405 const UChar *text,
2406 int32_t textlength,
2407 const char *locale,
2408 UBreakIterator *breakiter,
2409 UErrorCode *status)
2410 {
2411 if (U_FAILURE(*status)) {
2412 return NULL;
2413 }
2414 #if UCONFIG_NO_BREAK_ITERATION
2415 if (breakiter != NULL) {
2416 *status = U_UNSUPPORTED_ERROR;
2417 return NULL;
2418 }
2419 #endif
2420 if (locale) {
2421 // ucol_open internally checks for status
2422 UCollator *collator = ucol_open(locale, status);
2423 // pattern, text checks are done in usearch_openFromCollator
2424 UStringSearch *result = usearch_openFromCollator(pattern,
2425 patternlength, text, textlength,
2426 collator, breakiter, status);
2427
2428 if (result == NULL || U_FAILURE(*status)) {
2429 if (collator) {
2430 ucol_close(collator);
2431 }
2432 return NULL;
2433 }
2434 else {
2435 result->ownCollator = TRUE;
2436 }
2437 return result;
2438 }
2439 *status = U_ILLEGAL_ARGUMENT_ERROR;
2440 return NULL;
2441 }
2442
2443 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2444 const UChar *pattern,
2445 int32_t patternlength,
2446 const UChar *text,
2447 int32_t textlength,
2448 const UCollator *collator,
2449 UBreakIterator *breakiter,
2450 UErrorCode *status)
2451 {
2452 if (U_FAILURE(*status)) {
2453 return NULL;
2454 }
2455 #if UCONFIG_NO_BREAK_ITERATION
2456 if (breakiter != NULL) {
2457 *status = U_UNSUPPORTED_ERROR;
2458 return NULL;
2459 }
2460 #endif
2461 if (pattern == NULL || text == NULL || collator == NULL) {
2462 *status = U_ILLEGAL_ARGUMENT_ERROR;
2463 }
2464
2465 if (U_SUCCESS(*status)) {
2466 initializeFCD(status);
2467 if (U_FAILURE(*status)) {
2468 return NULL;
2469 }
2470
2471 UStringSearch *result;
2472 if (textlength == -1) {
2473 textlength = u_strlen(text);
2474 }
2475 if (patternlength == -1) {
2476 patternlength = u_strlen(pattern);
2477 }
2478 if (textlength <= 0 || patternlength <= 0) {
2479 *status = U_ILLEGAL_ARGUMENT_ERROR;
2480 return NULL;
2481 }
2482
2483 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2484 if (result == NULL) {
2485 *status = U_MEMORY_ALLOCATION_ERROR;
2486 return NULL;
2487 }
2488
2489 result->collator = collator;
2490 result->strength = ucol_getStrength(collator);
2491 result->ceMask = getMask(result->strength);
2492 result->toShift =
2493 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2494 UCOL_SHIFTED;
2495 result->variableTop = ucol_getVariableTop(collator, status);
2496
2497 if (U_FAILURE(*status)) {
2498 uprv_free(result);
2499 return NULL;
2500 }
2501
2502 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2503 if (result->search == NULL) {
2504 *status = U_MEMORY_ALLOCATION_ERROR;
2505 uprv_free(result);
2506 return NULL;
2507 }
2508
2509 result->search->text = text;
2510 result->search->textLength = textlength;
2511
2512 result->pattern.text = pattern;
2513 result->pattern.textLength = patternlength;
2514 result->pattern.CE = NULL;
2515
2516 result->search->breakIter = breakiter;
2517 #if !UCONFIG_NO_BREAK_ITERATION
2518 if (breakiter) {
2519 ubrk_setText(breakiter, text, textlength, status);
2520 }
2521 #endif
2522
2523 result->ownCollator = FALSE;
2524 result->search->matchedLength = 0;
2525 result->search->matchedIndex = USEARCH_DONE;
2526 result->textIter = ucol_openElements(collator, text,
2527 textlength, status);
2528 if (U_FAILURE(*status)) {
2529 usearch_close(result);
2530 return NULL;
2531 }
2532
2533 result->utilIter = NULL;
2534
2535 result->search->isOverlap = FALSE;
2536 result->search->isCanonicalMatch = FALSE;
2537 result->search->isForwardSearching = TRUE;
2538 result->search->reset = TRUE;
2539
2540 initialize(result, status);
2541
2542 if (U_FAILURE(*status)) {
2543 usearch_close(result);
2544 return NULL;
2545 }
2546
2547 return result;
2548 }
2549 return NULL;
2550 }
2551
2552 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2553 {
2554 if (strsrch) {
2555 if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2556 strsrch->pattern.CE) {
2557 uprv_free(strsrch->pattern.CE);
2558 }
2559 ucol_closeElements(strsrch->textIter);
2560 ucol_closeElements(strsrch->utilIter);
2561 if (strsrch->ownCollator && strsrch->collator) {
2562 ucol_close((UCollator *)strsrch->collator);
2563 }
2564 uprv_free(strsrch->search);
2565 uprv_free(strsrch);
2566 }
2567 }
2568
2569 // set and get methods --------------------------------------------------
2570
2571 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2572 int32_t position,
2573 UErrorCode *status)
2574 {
2575 if (U_SUCCESS(*status) && strsrch) {
2576 if (isOutOfBounds(strsrch->search->textLength, position)) {
2577 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2578 }
2579 else {
2580 setColEIterOffset(strsrch->textIter, position);
2581 }
2582 strsrch->search->matchedIndex = USEARCH_DONE;
2583 strsrch->search->matchedLength = 0;
2584 strsrch->search->reset = FALSE;
2585 }
2586 }
2587
2588 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2589 {
2590 if (strsrch) {
2591 int32_t result = ucol_getOffset(strsrch->textIter);
2592 if (isOutOfBounds(strsrch->search->textLength, result)) {
2593 return USEARCH_DONE;
2594 }
2595 return result;
2596 }
2597 return USEARCH_DONE;
2598 }
2599
2600 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2601 USearchAttribute attribute,
2602 USearchAttributeValue value,
2603 UErrorCode *status)
2604 {
2605 if (U_SUCCESS(*status) && strsrch) {
2606 switch (attribute)
2607 {
2608 case USEARCH_OVERLAP :
2609 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2610 break;
2611 case USEARCH_CANONICAL_MATCH :
2612 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2613 FALSE);
2614 break;
2615 case USEARCH_ATTRIBUTE_COUNT :
2616 default:
2617 *status = U_ILLEGAL_ARGUMENT_ERROR;
2618 }
2619 }
2620 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2621 *status = U_ILLEGAL_ARGUMENT_ERROR;
2622 }
2623 }
2624
2625 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2626 const UStringSearch *strsrch,
2627 USearchAttribute attribute)
2628 {
2629 if (strsrch) {
2630 switch (attribute) {
2631 case USEARCH_OVERLAP :
2632 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2633 USEARCH_OFF);
2634 case USEARCH_CANONICAL_MATCH :
2635 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2636 USEARCH_OFF);
2637 case USEARCH_ATTRIBUTE_COUNT :
2638 return USEARCH_DEFAULT;
2639 }
2640 }
2641 return USEARCH_DEFAULT;
2642 }
2643
2644 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2645 const UStringSearch *strsrch)
2646 {
2647 if (strsrch == NULL) {
2648 return USEARCH_DONE;
2649 }
2650 return strsrch->search->matchedIndex;
2651 }
2652
2653
2654 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2655 UChar *result,
2656 int32_t resultCapacity,
2657 UErrorCode *status)
2658 {
2659 if (U_FAILURE(*status)) {
2660 return USEARCH_DONE;
2661 }
2662 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2663 result == NULL)) {
2664 *status = U_ILLEGAL_ARGUMENT_ERROR;
2665 return USEARCH_DONE;
2666 }
2667
2668 int32_t copylength = strsrch->search->matchedLength;
2669 int32_t copyindex = strsrch->search->matchedIndex;
2670 if (copyindex == USEARCH_DONE) {
2671 u_terminateUChars(result, resultCapacity, 0, status);
2672 return USEARCH_DONE;
2673 }
2674
2675 if (resultCapacity < copylength) {
2676 copylength = resultCapacity;
2677 }
2678 if (copylength > 0) {
2679 uprv_memcpy(result, strsrch->search->text + copyindex,
2680 copylength * sizeof(UChar));
2681 }
2682 return u_terminateUChars(result, resultCapacity,
2683 strsrch->search->matchedLength, status);
2684 }
2685
2686 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2687 const UStringSearch *strsrch)
2688 {
2689 if (strsrch) {
2690 return strsrch->search->matchedLength;
2691 }
2692 return USEARCH_DONE;
2693 }
2694
2695 #if !UCONFIG_NO_BREAK_ITERATION
2696
2697 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2698 UBreakIterator *breakiter,
2699 UErrorCode *status)
2700 {
2701 if (U_SUCCESS(*status) && strsrch) {
2702 strsrch->search->breakIter = breakiter;
2703 if (breakiter) {
2704 ubrk_setText(breakiter, strsrch->search->text,
2705 strsrch->search->textLength, status);
2706 }
2707 }
2708 }
2709
2710 U_CAPI const UBreakIterator* U_EXPORT2
2711 usearch_getBreakIterator(const UStringSearch *strsrch)
2712 {
2713 if (strsrch) {
2714 return strsrch->search->breakIter;
2715 }
2716 return NULL;
2717 }
2718
2719 #endif
2720
2721 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2722 const UChar *text,
2723 int32_t textlength,
2724 UErrorCode *status)
2725 {
2726 if (U_SUCCESS(*status)) {
2727 if (strsrch == NULL || text == NULL || textlength < -1 ||
2728 textlength == 0) {
2729 *status = U_ILLEGAL_ARGUMENT_ERROR;
2730 }
2731 else {
2732 if (textlength == -1) {
2733 textlength = u_strlen(text);
2734 }
2735 strsrch->search->text = text;
2736 strsrch->search->textLength = textlength;
2737 ucol_setText(strsrch->textIter, text, textlength, status);
2738 strsrch->search->matchedIndex = USEARCH_DONE;
2739 strsrch->search->matchedLength = 0;
2740 strsrch->search->reset = TRUE;
2741 #if !UCONFIG_NO_BREAK_ITERATION
2742 if (strsrch->search->breakIter != NULL) {
2743 ubrk_setText(strsrch->search->breakIter, text,
2744 textlength, status);
2745 }
2746 #endif
2747 }
2748 }
2749 }
2750
2751 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2752 int32_t *length)
2753 {
2754 if (strsrch) {
2755 *length = strsrch->search->textLength;
2756 return strsrch->search->text;
2757 }
2758 return NULL;
2759 }
2760
2761 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
2762 const UCollator *collator,
2763 UErrorCode *status)
2764 {
2765 if (U_SUCCESS(*status)) {
2766 if (collator == NULL) {
2767 *status = U_ILLEGAL_ARGUMENT_ERROR;
2768 return;
2769 }
2770 if (strsrch) {
2771 if (strsrch->ownCollator && (strsrch->collator != collator)) {
2772 ucol_close((UCollator *)strsrch->collator);
2773 strsrch->ownCollator = FALSE;
2774 }
2775 strsrch->collator = collator;
2776 strsrch->strength = ucol_getStrength(collator);
2777 strsrch->ceMask = getMask(strsrch->strength);
2778 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
2779 strsrch->toShift =
2780 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2781 UCOL_SHIFTED;
2782 // if status is a failure, ucol_getVariableTop returns 0
2783 strsrch->variableTop = ucol_getVariableTop(collator, status);
2784 if (U_SUCCESS(*status)) {
2785 initialize(strsrch, status);
2786 if (U_SUCCESS(*status)) {
2787 uprv_init_collIterate(collator, strsrch->search->text,
2788 strsrch->search->textLength,
2789 &(strsrch->textIter->iteratordata_));
2790 strsrch->utilIter->iteratordata_.coll = collator;
2791 }
2792 }
2793 }
2794 }
2795 }
2796
2797 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
2798 {
2799 if (strsrch) {
2800 return (UCollator *)strsrch->collator;
2801 }
2802 return NULL;
2803 }
2804
2805 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
2806 const UChar *pattern,
2807 int32_t patternlength,
2808 UErrorCode *status)
2809 {
2810 if (U_SUCCESS(*status)) {
2811 if (strsrch == NULL || pattern == NULL) {
2812 *status = U_ILLEGAL_ARGUMENT_ERROR;
2813 }
2814 else {
2815 if (patternlength == -1) {
2816 patternlength = u_strlen(pattern);
2817 }
2818 if (patternlength == 0) {
2819 *status = U_ILLEGAL_ARGUMENT_ERROR;
2820 return;
2821 }
2822 strsrch->pattern.text = pattern;
2823 strsrch->pattern.textLength = patternlength;
2824 initialize(strsrch, status);
2825 }
2826 }
2827 }
2828
2829 U_CAPI const UChar* U_EXPORT2
2830 usearch_getPattern(const UStringSearch *strsrch,
2831 int32_t *length)
2832 {
2833 if (strsrch) {
2834 *length = strsrch->pattern.textLength;
2835 return strsrch->pattern.text;
2836 }
2837 return NULL;
2838 }
2839
2840 // miscellaneous methods --------------------------------------------------
2841
2842 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
2843 UErrorCode *status)
2844 {
2845 if (strsrch && U_SUCCESS(*status)) {
2846 strsrch->search->isForwardSearching = TRUE;
2847 usearch_setOffset(strsrch, 0, status);
2848 if (U_SUCCESS(*status)) {
2849 return usearch_next(strsrch, status);
2850 }
2851 }
2852 return USEARCH_DONE;
2853 }
2854
2855 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
2856 int32_t position,
2857 UErrorCode *status)
2858 {
2859 if (strsrch && U_SUCCESS(*status)) {
2860 strsrch->search->isForwardSearching = TRUE;
2861 // position checked in usearch_setOffset
2862 usearch_setOffset(strsrch, position, status);
2863 if (U_SUCCESS(*status)) {
2864 return usearch_next(strsrch, status);
2865 }
2866 }
2867 return USEARCH_DONE;
2868 }
2869
2870 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
2871 UErrorCode *status)
2872 {
2873 if (strsrch && U_SUCCESS(*status)) {
2874 strsrch->search->isForwardSearching = FALSE;
2875 usearch_setOffset(strsrch, strsrch->search->textLength, status);
2876 if (U_SUCCESS(*status)) {
2877 return usearch_previous(strsrch, status);
2878 }
2879 }
2880 return USEARCH_DONE;
2881 }
2882
2883 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
2884 int32_t position,
2885 UErrorCode *status)
2886 {
2887 if (strsrch && U_SUCCESS(*status)) {
2888 strsrch->search->isForwardSearching = FALSE;
2889 // position checked in usearch_setOffset
2890 usearch_setOffset(strsrch, position, status);
2891 if (U_SUCCESS(*status)) {
2892 return usearch_previous(strsrch, status);
2893 }
2894 }
2895 return USEARCH_DONE;
2896 }
2897
2898 /**
2899 * If a direction switch is required, we'll count the number of ces till the
2900 * beginning of the collation element iterator and iterate forwards that
2901 * number of times. This is so that we get to the correct point within the
2902 * string to continue the search in. Imagine when we are in the middle of the
2903 * normalization buffer when the change in direction is request. arrrgghh....
2904 * After searching the offset within the collation element iterator will be
2905 * shifted to the start of the match. If a match is not found, the offset would
2906 * have been set to the end of the text string in the collation element
2907 * iterator.
2908 * Okay, here's my take on normalization buffer. The only time when there can
2909 * be 2 matches within the same normalization is when the pattern is consists
2910 * of all accents. But since the offset returned is from the text string, we
2911 * should not confuse the caller by returning the second match within the
2912 * same normalization buffer. If we do, the 2 results will have the same match
2913 * offsets, and that'll be confusing. I'll return the next match that doesn't
2914 * fall within the same normalization buffer. Note this does not affect the
2915 * results of matches spanning the text and the normalization buffer.
2916 * The position to start searching is taken from the collation element
2917 * iterator. Callers of this API would have to set the offset in the collation
2918 * element iterator before using this method.
2919 */
2920 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
2921 UErrorCode *status)
2922 {
2923 if (U_SUCCESS(*status) && strsrch) {
2924 int32_t offset = usearch_getOffset(strsrch);
2925 USearch *search = strsrch->search;
2926 search->reset = FALSE;
2927 int32_t textlength = search->textLength;
2928 int32_t matchedindex = search->matchedIndex;
2929 if (search->isForwardSearching) {
2930 if (offset == textlength || matchedindex == textlength ||
2931 (!search->isOverlap &&
2932 (offset + strsrch->pattern.defaultShiftSize > textlength ||
2933 (matchedindex != USEARCH_DONE &&
2934 matchedindex + search->matchedLength >= textlength)))) {
2935 // not enough characters to match
2936 setMatchNotFound(strsrch);
2937 return USEARCH_DONE;
2938 }
2939 }
2940 else {
2941 // switching direction.
2942 // if matchedIndex == USEARCH_DONE, it means that either a
2943 // setOffset has been called or that previous ran off the text
2944 // string. the iterator would have been set to offset 0 if a
2945 // match is not found.
2946 search->isForwardSearching = TRUE;
2947 if (matchedindex != USEARCH_DONE) {
2948 // there's no need to set the collation element iterator
2949 // the next call to next will set the offset.
2950 return matchedindex;
2951 }
2952 }
2953
2954 if (U_SUCCESS(*status)) {
2955 if (strsrch->pattern.CELength == 0) {
2956 if (matchedindex == USEARCH_DONE) {
2957 search->matchedIndex = offset;
2958 }
2959 else { // moves by codepoints
2960 UTF_FWD_1(search->text, search->matchedIndex, textlength);
2961 }
2962
2963 search->matchedLength = 0;
2964 setColEIterOffset(strsrch->textIter, search->matchedIndex);
2965 // status checked below
2966 if (search->matchedIndex == textlength) {
2967 search->matchedIndex = USEARCH_DONE;
2968 }
2969 }
2970 else {
2971 if (search->matchedLength > 0) {
2972 // if matchlength is 0 we are at the start of the iteration
2973 int offset = ucol_getOffset(strsrch->textIter);
2974 if (search->isOverlap) {
2975 ucol_setOffset(strsrch->textIter, offset + 1, status);
2976 }
2977 else {
2978 ucol_setOffset(strsrch->textIter,
2979 offset + search->matchedLength, status);
2980 }
2981 }
2982 if (search->isCanonicalMatch) {
2983 // can't use exact here since extra accents are allowed.
2984 usearch_handleNextCanonical(strsrch, status);
2985 }
2986 else {
2987 usearch_handleNextExact(strsrch, status);
2988 }
2989 }
2990
2991 if (U_FAILURE(*status)) {
2992 return USEARCH_DONE;
2993 }
2994
2995 return search->matchedIndex;
2996 }
2997 }
2998 return USEARCH_DONE;
2999 }
3000
3001 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3002 UErrorCode *status)
3003 {
3004 if (U_SUCCESS(*status) && strsrch) {
3005 int32_t offset;
3006 USearch *search = strsrch->search;
3007 if (search->reset) {
3008 offset = search->textLength;
3009 search->isForwardSearching = FALSE;
3010 search->reset = FALSE;
3011 setColEIterOffset(strsrch->textIter, offset);
3012 }
3013 else {
3014 offset = usearch_getOffset(strsrch);
3015 }
3016
3017 int32_t matchedindex = search->matchedIndex;
3018 if (search->isForwardSearching == TRUE) {
3019 // switching direction.
3020 // if matchedIndex == USEARCH_DONE, it means that either a
3021 // setOffset has been called or that next ran off the text
3022 // string. the iterator would have been set to offset textLength if
3023 // a match is not found.
3024 search->isForwardSearching = FALSE;
3025 if (matchedindex != USEARCH_DONE) {
3026 return matchedindex;
3027 }
3028 }
3029 else {
3030 if (offset == 0 || matchedindex == 0 ||
3031 (!search->isOverlap &&
3032 (offset < strsrch->pattern.defaultShiftSize ||
3033 (matchedindex != USEARCH_DONE &&
3034 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3035 // not enough characters to match
3036 setMatchNotFound(strsrch);
3037 return USEARCH_DONE;
3038 }
3039 }
3040
3041 if (U_SUCCESS(*status)) {
3042 if (strsrch->pattern.CELength == 0) {
3043 search->matchedIndex =
3044 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3045 if (search->matchedIndex == 0) {
3046 setMatchNotFound(strsrch);
3047 // status checked below
3048 }
3049 else { // move by codepoints
3050 UTF_BACK_1(search->text, 0, search->matchedIndex);
3051 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3052 // status checked below
3053 search->matchedLength = 0;
3054 }
3055 }
3056 else {
3057 if (strsrch->search->isCanonicalMatch) {
3058 // can't use exact here since extra accents are allowed.
3059 usearch_handlePreviousCanonical(strsrch, status);
3060 // status checked below
3061 }
3062 else {
3063 usearch_handlePreviousExact(strsrch, status);
3064 // status checked below
3065 }
3066 }
3067
3068 if (U_FAILURE(*status)) {
3069 return USEARCH_DONE;
3070 }
3071
3072 return search->matchedIndex;
3073 }
3074 }
3075 return USEARCH_DONE;
3076 }
3077
3078
3079
3080 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3081 {
3082 /*
3083 reset is setting the attributes that are already in
3084 string search, hence all attributes in the collator should
3085 be retrieved without any problems
3086 */
3087 if (strsrch) {
3088 UErrorCode status = U_ZERO_ERROR;
3089 UBool sameCollAttribute = TRUE;
3090 uint32_t ceMask;
3091 UBool shift;
3092 uint32_t varTop;
3093
3094 strsrch->strength = ucol_getStrength(strsrch->collator);
3095 ceMask = getMask(strsrch->strength);
3096 if (strsrch->ceMask != ceMask) {
3097 strsrch->ceMask = ceMask;
3098 sameCollAttribute = FALSE;
3099 }
3100 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3101 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3102 &status) == UCOL_SHIFTED;
3103 if (strsrch->toShift != shift) {
3104 strsrch->toShift = shift;
3105 sameCollAttribute = FALSE;
3106 }
3107
3108 // if status is a failure, ucol_getVariableTop returns 0
3109 varTop = ucol_getVariableTop(strsrch->collator, &status);
3110 if (strsrch->variableTop != varTop) {
3111 strsrch->variableTop = varTop;
3112 sameCollAttribute = FALSE;
3113 }
3114 if (!sameCollAttribute) {
3115 initialize(strsrch, &status);
3116 }
3117 uprv_init_collIterate(strsrch->collator, strsrch->search->text,
3118 strsrch->search->textLength,
3119 &(strsrch->textIter->iteratordata_));
3120 strsrch->search->matchedLength = 0;
3121 strsrch->search->matchedIndex = USEARCH_DONE;
3122 strsrch->search->isOverlap = FALSE;
3123 strsrch->search->isCanonicalMatch = FALSE;
3124 strsrch->search->isForwardSearching = TRUE;
3125 strsrch->search->reset = TRUE;
3126 }
3127 }
3128
3129 // internal use methods declared in usrchimp.h -----------------------------
3130
3131 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
3132 {
3133 if (U_FAILURE(*status)) {
3134 setMatchNotFound(strsrch);
3135 return FALSE;
3136 }
3137
3138 UCollationElements *coleiter = strsrch->textIter;
3139 int32_t textlength = strsrch->search->textLength;
3140 uint32_t *patternce = strsrch->pattern.CE;
3141 int32_t patterncelength = strsrch->pattern.CELength;
3142 int32_t textoffset = ucol_getOffset(coleiter);
3143
3144 // status used in setting coleiter offset, since offset is checked in
3145 // shiftForward before setting the coleiter offset, status never
3146 // a failure
3147 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3148 patterncelength);
3149 while (textoffset <= textlength)
3150 {
3151 uint32_t patternceindex = patterncelength - 1;
3152 uint32_t targetce;
3153 UBool found = FALSE;
3154 uint32_t lastce = UCOL_NULLORDER;
3155
3156 setColEIterOffset(coleiter, textoffset);
3157
3158 while (TRUE) {
3159 // finding the last pattern ce match, imagine composite characters
3160 // for example: search for pattern A in text \u00C0
3161 // we'll have to skip \u0300 the grave first before we get to A
3162 targetce = ucol_previous(coleiter, status);
3163 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3164 found = FALSE;
3165 break;
3166 }
3167 targetce = getCE(strsrch, targetce);
3168 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
3169 // this is for the text \u0315\u0300 that requires
3170 // normalization and pattern \u0300, where \u0315 is ignorable
3171 continue;
3172 }
3173 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3174 lastce = targetce;
3175 }
3176 if (targetce == patternce[patternceindex]) {
3177 // the first ce can be a contraction
3178 found = TRUE;
3179 break;
3180 }
3181 if (!hasExpansion(coleiter)) {
3182 found = FALSE;
3183 break;
3184 }
3185 }
3186
3187 targetce = lastce;
3188
3189 while (found && patternceindex > 0) {
3190 targetce = ucol_previous(coleiter, status);
3191 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3192 found = FALSE;
3193 break;
3194 }
3195 targetce = getCE(strsrch, targetce);
3196 if (targetce == UCOL_IGNORABLE) {
3197 continue;
3198 }
3199
3200 patternceindex --;
3201 found = found && targetce == patternce[patternceindex];
3202 }
3203
3204 if (!found) {
3205 if (U_FAILURE(*status)) {
3206 break;
3207 }
3208 textoffset = shiftForward(strsrch, textoffset, targetce,
3209 patternceindex);
3210 // status checked at loop.
3211 patternceindex = patterncelength;
3212 continue;
3213 }
3214
3215 if (checkNextExactMatch(strsrch, &textoffset, status)) {
3216 // status checked in ucol_setOffset
3217 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3218 return TRUE;
3219 }
3220 }
3221 setMatchNotFound(strsrch);
3222 return FALSE;
3223 }
3224
3225 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
3226 {
3227 if (U_FAILURE(*status)) {
3228 setMatchNotFound(strsrch);
3229 return FALSE;
3230 }
3231
3232 UCollationElements *coleiter = strsrch->textIter;
3233 int32_t textlength = strsrch->search->textLength;
3234 uint32_t *patternce = strsrch->pattern.CE;
3235 int32_t patterncelength = strsrch->pattern.CELength;
3236 int32_t textoffset = ucol_getOffset(coleiter);
3237 UBool hasPatternAccents =
3238 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3239
3240 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
3241 patterncelength);
3242 strsrch->canonicalPrefixAccents[0] = 0;
3243 strsrch->canonicalSuffixAccents[0] = 0;
3244
3245 while (textoffset <= textlength)
3246 {
3247 int32_t patternceindex = patterncelength - 1;
3248 uint32_t targetce;
3249 UBool found = FALSE;
3250 uint32_t lastce = UCOL_NULLORDER;
3251
3252 setColEIterOffset(coleiter, textoffset);
3253
3254 while (TRUE) {
3255 // finding the last pattern ce match, imagine composite characters
3256 // for example: search for pattern A in text \u00C0
3257 // we'll have to skip \u0300 the grave first before we get to A
3258 targetce = ucol_previous(coleiter, status);
3259 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3260 found = FALSE;
3261 break;
3262 }
3263 targetce = getCE(strsrch, targetce);
3264 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
3265 lastce = targetce;
3266 }
3267 if (targetce == patternce[patternceindex]) {
3268 // the first ce can be a contraction
3269 found = TRUE;
3270 break;
3271 }
3272 if (!hasExpansion(coleiter)) {
3273 found = FALSE;
3274 break;
3275 }
3276 }
3277 targetce = lastce;
3278
3279 while (found && patternceindex > 0) {
3280 targetce = ucol_previous(coleiter, status);
3281 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3282 found = FALSE;
3283 break;
3284 }
3285 targetce = getCE(strsrch, targetce);
3286 if (targetce == UCOL_IGNORABLE) {
3287 continue;
3288 }
3289
3290 patternceindex --;
3291 found = found && targetce == patternce[patternceindex];
3292 }
3293
3294 // initializing the rearranged accent array
3295 if (hasPatternAccents && !found) {
3296 strsrch->canonicalPrefixAccents[0] = 0;
3297 strsrch->canonicalSuffixAccents[0] = 0;
3298 if (U_FAILURE(*status)) {
3299 break;
3300 }
3301 found = doNextCanonicalMatch(strsrch, textoffset, status);
3302 }
3303
3304 if (!found) {
3305 if (U_FAILURE(*status)) {
3306 break;
3307 }
3308 textoffset = shiftForward(strsrch, textoffset, targetce,
3309 patternceindex);
3310 // status checked at loop
3311 patternceindex = patterncelength;
3312 continue;
3313 }
3314
3315 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
3316 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
3317 return TRUE;
3318 }
3319 }
3320 setMatchNotFound(strsrch);
3321 return FALSE;
3322 }
3323
3324 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
3325 {
3326 if (U_FAILURE(*status)) {
3327 setMatchNotFound(strsrch);
3328 return FALSE;
3329 }
3330
3331 UCollationElements *coleiter = strsrch->textIter;
3332 uint32_t *patternce = strsrch->pattern.CE;
3333 int32_t patterncelength = strsrch->pattern.CELength;
3334 int32_t textoffset = ucol_getOffset(coleiter);
3335
3336 // shifting it check for setting offset
3337 // if setOffset is called previously or there was no previous match, we
3338 // leave the offset as it is.
3339 if (strsrch->search->matchedIndex != USEARCH_DONE) {
3340 textoffset = strsrch->search->matchedIndex;
3341 }
3342
3343 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3344 patterncelength);
3345
3346 while (textoffset >= 0)
3347 {
3348 int32_t patternceindex = 1;
3349 uint32_t targetce;
3350 UBool found = FALSE;
3351 uint32_t firstce = UCOL_NULLORDER;
3352
3353 // if status is a failure, ucol_setOffset does nothing
3354 setColEIterOffset(coleiter, textoffset);
3355
3356 while (TRUE) {
3357 // finding the first pattern ce match, imagine composite
3358 // characters. for example: search for pattern \u0300 in text
3359 // \u00C0, we'll have to skip A first before we get to
3360 // \u0300 the grave accent
3361 targetce = ucol_next(coleiter, status);
3362 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3363 found = FALSE;
3364 break;
3365 }
3366 targetce = getCE(strsrch, targetce);
3367 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3368 firstce = targetce;
3369 }
3370 if (targetce == UCOL_IGNORABLE) {
3371 continue;
3372 }
3373 if (targetce == patternce[0]) {
3374 found = TRUE;
3375 break;
3376 }
3377 if (!hasExpansion(coleiter)) {
3378 // checking for accents in composite character
3379 found = FALSE;
3380 break;
3381 }
3382 }
3383
3384 targetce = firstce;
3385
3386 while (found && (patternceindex < patterncelength)) {
3387 targetce = ucol_next(coleiter, status);
3388 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3389 found = FALSE;
3390 break;
3391 }
3392 targetce = getCE(strsrch, targetce);
3393 if (targetce == UCOL_IGNORABLE) {
3394 continue;
3395 }
3396
3397 found = found && targetce == patternce[patternceindex];
3398 patternceindex ++;
3399 }
3400
3401 if (!found) {
3402 if (U_FAILURE(*status)) {
3403 break;
3404 }
3405 textoffset = reverseShift(strsrch, textoffset, targetce,
3406 patternceindex);
3407 patternceindex = 0;
3408 continue;
3409 }
3410
3411 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
3412 setColEIterOffset(coleiter, textoffset);
3413 return TRUE;
3414 }
3415 }
3416 setMatchNotFound(strsrch);
3417 return FALSE;
3418 }
3419
3420 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
3421 UErrorCode *status)
3422 {
3423 if (U_FAILURE(*status)) {
3424 setMatchNotFound(strsrch);
3425 return FALSE;
3426 }
3427
3428 UCollationElements *coleiter = strsrch->textIter;
3429 uint32_t *patternce = strsrch->pattern.CE;
3430 int32_t patterncelength = strsrch->pattern.CELength;
3431 int32_t textoffset = ucol_getOffset(coleiter);
3432 UBool hasPatternAccents =
3433 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
3434
3435 // shifting it check for setting offset
3436 // if setOffset is called previously or there was no previous match, we
3437 // leave the offset as it is.
3438 if (strsrch->search->matchedIndex != USEARCH_DONE) {
3439 textoffset = strsrch->search->matchedIndex;
3440 }
3441
3442 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
3443 patterncelength);
3444 strsrch->canonicalPrefixAccents[0] = 0;
3445 strsrch->canonicalSuffixAccents[0] = 0;
3446
3447 while (textoffset >= 0)
3448 {
3449 int32_t patternceindex = 1;
3450 uint32_t targetce;
3451 UBool found = FALSE;
3452 uint32_t firstce = UCOL_NULLORDER;
3453
3454 setColEIterOffset(coleiter, textoffset);
3455 while (TRUE) {
3456 // finding the first pattern ce match, imagine composite
3457 // characters. for example: search for pattern \u0300 in text
3458 // \u00C0, we'll have to skip A first before we get to
3459 // \u0300 the grave accent
3460 targetce = ucol_next(coleiter, status);
3461 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3462 found = FALSE;
3463 break;
3464 }
3465 targetce = getCE(strsrch, targetce);
3466 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
3467 firstce = targetce;
3468 }
3469
3470 if (targetce == patternce[0]) {
3471 // the first ce can be a contraction
3472 found = TRUE;
3473 break;
3474 }
3475 if (!hasExpansion(coleiter)) {
3476 // checking for accents in composite character
3477 found = FALSE;
3478 break;
3479 }
3480 }
3481
3482 targetce = firstce;
3483
3484 while (found && patternceindex < patterncelength) {
3485 targetce = ucol_next(coleiter, status);
3486 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
3487 found = FALSE;
3488 break;
3489 }
3490 targetce = getCE(strsrch, targetce);
3491 if (targetce == UCOL_IGNORABLE) {
3492 continue;
3493 }
3494
3495 found = found && targetce == patternce[patternceindex];
3496 patternceindex ++;
3497 }
3498
3499 // initializing the rearranged accent array
3500 if (hasPatternAccents && !found) {
3501 strsrch->canonicalPrefixAccents[0] = 0;
3502 strsrch->canonicalSuffixAccents[0] = 0;
3503 if (U_FAILURE(*status)) {
3504 break;
3505 }
3506 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
3507 }
3508
3509 if (!found) {
3510 if (U_FAILURE(*status)) {
3511 break;
3512 }
3513 textoffset = reverseShift(strsrch, textoffset, targetce,
3514 patternceindex);
3515 patternceindex = 0;
3516 continue;
3517 }
3518
3519 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
3520 setColEIterOffset(coleiter, textoffset);
3521 return TRUE;
3522 }
3523 }
3524 setMatchNotFound(strsrch);
3525 return FALSE;
3526 }
3527
3528 #endif /* #if !UCONFIG_NO_COLLATION */