]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/usearch.cpp
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / usearch.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2001-2015 IBM and others. All rights reserved.
6 **********************************************************************
7 * Date Name Description
8 * 07/02/2001 synwee Creation.
9 **********************************************************************
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
15
16 #include "unicode/usearch.h"
17 #include "unicode/ustring.h"
18 #include "unicode/uchar.h"
19 #include "unicode/utf16.h"
20 #include "normalizer2impl.h"
21 #include "usrchimp.h"
22 #include "cmemory.h"
23 #include "ucln_in.h"
24 #include "uassert.h"
25 #include "ustr_imp.h"
26 #if U_PLATFORM_IS_DARWIN_BASED
27 // Add logging for error conditions as in <rdar://51554495>
28 #include <os/log.h>
29 #endif
30
31 U_NAMESPACE_USE
32
33 // don't use Boyer-Moore
34 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
35 #define BOYER_MOORE 0
36
37 // internal definition ---------------------------------------------------
38
39 #define LAST_BYTE_MASK_ 0xFF
40 #define SECOND_LAST_BYTE_SHIFT_ 8
41 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
42
43 static const Normalizer2Impl *g_nfcImpl = NULL;
44
45 // internal methods -------------------------------------------------
46
47 /**
48 * Fast collation element iterator setOffset.
49 * This function does not check for bounds.
50 * @param coleiter collation element iterator
51 * @param offset to set
52 */
53 static
54 inline void setColEIterOffset(UCollationElements *elems,
55 int32_t offset)
56 {
57 // Note: Not "fast" any more after the 2013 collation rewrite.
58 // We do not want to expose more internals than necessary.
59 UErrorCode status = U_ZERO_ERROR;
60 ucol_setOffset(elems, offset, &status);
61 }
62
63 /**
64 * Getting the mask for collation strength
65 * @param strength collation strength
66 * @return collation element mask
67 */
68 static
69 inline uint32_t getMask(UCollationStrength strength)
70 {
71 switch (strength)
72 {
73 case UCOL_PRIMARY:
74 return UCOL_PRIMARYORDERMASK;
75 case UCOL_SECONDARY:
76 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
77 default:
78 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
79 UCOL_PRIMARYORDERMASK;
80 }
81 }
82
83 /**
84 * @param ce 32-bit collation element
85 * @return hash code
86 */
87 static
88 inline int hashFromCE32(uint32_t ce)
89 {
90 int hc = (int)(
91 ((((((ce >> 24) * 37) +
92 (ce >> 16)) * 37) +
93 (ce >> 8)) * 37) +
94 ce);
95 hc %= MAX_TABLE_SIZE_;
96 if (hc < 0) {
97 hc += MAX_TABLE_SIZE_;
98 }
99 return hc;
100 }
101
102 U_CDECL_BEGIN
103 static UBool U_CALLCONV
104 usearch_cleanup(void) {
105 g_nfcImpl = NULL;
106 return TRUE;
107 }
108 U_CDECL_END
109
110 /**
111 * Initializing the fcd tables.
112 * Internal method, status assumed to be a success.
113 * @param status output error if any, caller to check status before calling
114 * method, status assumed to be success when passed in.
115 */
116 static
117 inline void initializeFCD(UErrorCode *status)
118 {
119 if (g_nfcImpl == NULL) {
120 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
121 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
122 }
123 }
124
125 /**
126 * Gets the fcd value for a character at the argument index.
127 * This method takes into accounts of the supplementary characters.
128 * @param str UTF16 string where character for fcd retrieval resides
129 * @param offset position of the character whose fcd is to be retrieved, to be
130 * overwritten with the next character position, taking
131 * surrogate characters into consideration.
132 * @param strlength length of the argument string
133 * @return fcd value
134 */
135 static
136 uint16_t getFCD(const UChar *str, int32_t *offset,
137 int32_t strlength)
138 {
139 const UChar *temp = str + *offset;
140 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
141 *offset = (int32_t)(temp - str);
142 return result;
143 }
144
145 /**
146 * Getting the modified collation elements taking into account the collation
147 * attributes
148 * @param strsrch string search data
149 * @param sourcece
150 * @return the modified collation element
151 */
152 static
153 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
154 {
155 // note for tertiary we can't use the collator->tertiaryMask, that
156 // is a preprocessed mask that takes into account case options. since
157 // we are only concerned with exact matches, we don't need that.
158 sourcece &= strsrch->ceMask;
159
160 if (strsrch->toShift) {
161 // alternate handling here, since only the 16 most significant digits
162 // is only used, we can safely do a compare without masking
163 // if the ce is a variable, we mask and get only the primary values
164 // no shifting to quartenary is required since all primary values
165 // less than variabletop will need to be masked off anyway.
166 if (strsrch->variableTop > sourcece) {
167 if (strsrch->strength >= UCOL_QUATERNARY) {
168 sourcece &= UCOL_PRIMARYORDERMASK;
169 }
170 else {
171 sourcece = UCOL_IGNORABLE;
172 }
173 }
174 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
175 sourcece = 0xFFFF;
176 }
177
178 return sourcece;
179 }
180
181 /**
182 * Allocate a memory and returns NULL if it failed.
183 * Internal method, status assumed to be a success.
184 * @param size to allocate
185 * @param status output error if any, caller to check status before calling
186 * method, status assumed to be success when passed in.
187 * @return newly allocated array, NULL otherwise
188 */
189 static
190 inline void * allocateMemory(uint32_t size, UErrorCode *status)
191 {
192 uint32_t *result = (uint32_t *)uprv_malloc(size);
193 if (result == NULL) {
194 *status = U_MEMORY_ALLOCATION_ERROR;
195 }
196 return result;
197 }
198
199 /**
200 * Adds a uint32_t value to a destination array.
201 * Creates a new array if we run out of space. The caller will have to
202 * manually deallocate the newly allocated array.
203 * Internal method, status assumed to be success, caller has to check status
204 * before calling this method. destination not to be NULL and has at least
205 * size destinationlength.
206 * @param destination target array
207 * @param offset destination offset to add value
208 * @param destinationlength target array size, return value for the new size
209 * @param value to be added
210 * @param increments incremental size expected
211 * @param status output error if any, caller to check status before calling
212 * method, status assumed to be success when passed in.
213 * @return new destination array, destination if there was no new allocation
214 */
215 static
216 inline int32_t * addTouint32_tArray(int32_t *destination,
217 uint32_t offset,
218 uint32_t *destinationlength,
219 uint32_t value,
220 uint32_t increments,
221 UErrorCode *status)
222 {
223 uint32_t newlength = *destinationlength;
224 if (offset + 1 == newlength) {
225 newlength += increments;
226 int32_t *temp = (int32_t *)allocateMemory(
227 sizeof(int32_t) * newlength, status);
228 if (U_FAILURE(*status)) {
229 return NULL;
230 }
231 uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
232 *destinationlength = newlength;
233 destination = temp;
234 }
235 destination[offset] = value;
236 return destination;
237 }
238
239 /**
240 * Adds a uint64_t value to a destination array.
241 * Creates a new array if we run out of space. The caller will have to
242 * manually deallocate the newly allocated array.
243 * Internal method, status assumed to be success, caller has to check status
244 * before calling this method. destination not to be NULL and has at least
245 * size destinationlength.
246 * @param destination target array
247 * @param offset destination offset to add value
248 * @param destinationlength target array size, return value for the new size
249 * @param value to be added
250 * @param increments incremental size expected
251 * @param status output error if any, caller to check status before calling
252 * method, status assumed to be success when passed in.
253 * @return new destination array, destination if there was no new allocation
254 */
255 static
256 inline int64_t * addTouint64_tArray(int64_t *destination,
257 uint32_t offset,
258 uint32_t *destinationlength,
259 uint64_t value,
260 uint32_t increments,
261 UErrorCode *status)
262 {
263 uint32_t newlength = *destinationlength;
264 if (offset + 1 == newlength) {
265 newlength += increments;
266 int64_t *temp = (int64_t *)allocateMemory(
267 sizeof(int64_t) * newlength, status);
268
269 if (U_FAILURE(*status)) {
270 return NULL;
271 }
272
273 uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
274 *destinationlength = newlength;
275 destination = temp;
276 }
277
278 destination[offset] = value;
279
280 return destination;
281 }
282
283 /**
284 * Initializing the ce table for a pattern.
285 * Stores non-ignorable collation keys.
286 * Table size will be estimated by the size of the pattern text. Table
287 * expansion will be perform as we go along. Adding 1 to ensure that the table
288 * size definitely increases.
289 * Internal method, status assumed to be a success.
290 * @param strsrch string search data
291 * @param status output error if any, caller to check status before calling
292 * method, status assumed to be success when passed in.
293 * @return total number of expansions
294 */
295 static
296 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
297 UErrorCode *status)
298 {
299 UPattern *pattern = &(strsrch->pattern);
300 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
301 int32_t *cetable = pattern->cesBuffer;
302 uint32_t patternlength = pattern->textLength;
303 UCollationElements *coleiter = strsrch->utilIter;
304
305 if (coleiter == NULL) {
306 coleiter = ucol_openElements(strsrch->collator, pattern->text,
307 patternlength, status);
308 // status will be checked in ucol_next(..) later and if it is an
309 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
310 // returned.
311 strsrch->utilIter = coleiter;
312 }
313 else {
314 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
315 }
316 if(U_FAILURE(*status)) {
317 return 0;
318 }
319
320 if (pattern->ces != cetable && pattern->ces) {
321 uprv_free(pattern->ces);
322 }
323
324 uint16_t offset = 0;
325 uint16_t result = 0;
326 int32_t ce;
327
328 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
329 U_SUCCESS(*status)) {
330 uint32_t newce = getCE(strsrch, ce);
331 if (newce) {
332 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
333 newce,
334 patternlength - ucol_getOffset(coleiter) + 1,
335 status);
336 if (U_FAILURE(*status)) {
337 return 0;
338 }
339 offset ++;
340 if (cetable != temp && cetable != pattern->cesBuffer) {
341 uprv_free(cetable);
342 }
343 cetable = temp;
344 }
345 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
346 }
347
348 cetable[offset] = 0;
349 pattern->ces = cetable;
350 pattern->cesLength = offset;
351
352 return result;
353 }
354
355 /**
356 * Initializing the pce table for a pattern.
357 * Stores non-ignorable collation keys.
358 * Table size will be estimated by the size of the pattern text. Table
359 * expansion will be perform as we go along. Adding 1 to ensure that the table
360 * size definitely increases.
361 * Internal method, status assumed to be a success.
362 * @param strsrch string search data
363 * @param status output error if any, caller to check status before calling
364 * method, status assumed to be success when passed in.
365 * @return total number of expansions
366 */
367 static
368 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
369 UErrorCode *status)
370 {
371 UPattern *pattern = &(strsrch->pattern);
372 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
373 int64_t *pcetable = pattern->pcesBuffer;
374 uint32_t patternlength = pattern->textLength;
375 UCollationElements *coleiter = strsrch->utilIter;
376
377 if (coleiter == NULL) {
378 coleiter = ucol_openElements(strsrch->collator, pattern->text,
379 patternlength, status);
380 // status will be checked in ucol_next(..) later and if it is an
381 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
382 // returned.
383 strsrch->utilIter = coleiter;
384 } else {
385 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
386 }
387 if(U_FAILURE(*status)) {
388 return 0;
389 }
390
391 if (pattern->pces != pcetable && pattern->pces != NULL) {
392 uprv_free(pattern->pces);
393 }
394
395 uint16_t offset = 0;
396 uint16_t result = 0;
397 int64_t pce;
398
399 icu::UCollationPCE iter(coleiter);
400
401 // ** Should processed CEs be signed or unsigned?
402 // ** (the rest of the code in this file seems to play fast-and-loose with
403 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
404 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
405 U_SUCCESS(*status)) {
406 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
407 pce,
408 patternlength - ucol_getOffset(coleiter) + 1,
409 status);
410
411 if (U_FAILURE(*status)) {
412 return 0;
413 }
414
415 offset += 1;
416
417 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
418 uprv_free(pcetable);
419 }
420
421 pcetable = temp;
422 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
423 }
424
425 pcetable[offset] = 0;
426 pattern->pces = pcetable;
427 pattern->pcesLength = offset;
428
429 return result;
430 }
431
432 /**
433 * Initializes the pattern struct.
434 * Internal method, status assumed to be success.
435 * @param strsrch UStringSearch data storage
436 * @param status output error if any, caller to check status before calling
437 * method, status assumed to be success when passed in.
438 * @return expansionsize the total expansion size of the pattern
439 */
440 static
441 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
442 {
443 if (U_FAILURE(*status)) { return 0; }
444 UPattern *pattern = &(strsrch->pattern);
445 const UChar *patterntext = pattern->text;
446 int32_t length = pattern->textLength;
447 int32_t index = 0;
448
449 // Since the strength is primary, accents are ignored in the pattern.
450 if (strsrch->strength == UCOL_PRIMARY) {
451 pattern->hasPrefixAccents = 0;
452 pattern->hasSuffixAccents = 0;
453 } else {
454 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
455 SECOND_LAST_BYTE_SHIFT_;
456 index = length;
457 U16_BACK_1(patterntext, 0, index);
458 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
459 LAST_BYTE_MASK_;
460 }
461
462 // ** HACK **
463 if (strsrch->pattern.pces != NULL) {
464 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
465 uprv_free(strsrch->pattern.pces);
466 }
467
468 strsrch->pattern.pces = NULL;
469 }
470
471 // since intializePattern is an internal method status is a success.
472 return initializePatternCETable(strsrch, status);
473 }
474
475 /**
476 * Initializing shift tables, with the default values.
477 * If a corresponding default value is 0, the shift table is not set.
478 * @param shift table for forwards shift
479 * @param backshift table for backwards shift
480 * @param cetable table containing pattern ce
481 * @param cesize size of the pattern ces
482 * @param expansionsize total size of the expansions
483 * @param defaultforward the default forward value
484 * @param defaultbackward the default backward value
485 */
486 static
487 inline void setShiftTable(int16_t shift[], int16_t backshift[],
488 int32_t *cetable, int32_t cesize,
489 int16_t expansionsize,
490 int16_t defaultforward,
491 int16_t defaultbackward)
492 {
493 // estimate the value to shift. to do that we estimate the smallest
494 // number of characters to give the relevant ces, ie approximately
495 // the number of ces minus their expansion, since expansions can come
496 // from a character.
497 int32_t count;
498 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
499 shift[count] = defaultforward;
500 }
501 cesize --; // down to the last index
502 for (count = 0; count < cesize; count ++) {
503 // number of ces from right of array to the count
504 int temp = defaultforward - count - 1;
505 shift[hashFromCE32(cetable[count])] = temp > 1 ? static_cast<int16_t>(temp) : 1;
506 }
507 shift[hashFromCE32(cetable[cesize])] = 1;
508 // for ignorables we just shift by one. see test examples.
509 shift[hashFromCE32(0)] = 1;
510
511 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
512 backshift[count] = defaultbackward;
513 }
514 for (count = cesize; count > 0; count --) {
515 // the original value count does not seem to work
516 backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
517 (int16_t)(count - expansionsize) : 1;
518 }
519 backshift[hashFromCE32(cetable[0])] = 1;
520 backshift[hashFromCE32(0)] = 1;
521 }
522
523 /**
524 * Building of the pattern collation element list and the boyer moore strsrch
525 * table.
526 * The canonical match will only be performed after the default match fails.
527 * For both cases we need to remember the size of the composed and decomposed
528 * versions of the string. Since the Boyer-Moore shift calculations shifts by
529 * a number of characters in the text and tries to match the pattern from that
530 * offset, the shift value can not be too large in case we miss some
531 * characters. To choose a right shift size, we estimate the NFC form of the
532 * and use its size as a shift guide. The NFC form should be the small
533 * possible representation of the pattern. Anyways, we'll err on the smaller
534 * shift size. Hence the calculation for minlength.
535 * Canonical match will be performed slightly differently. We'll split the
536 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
537 * the first and last base character (MS), the ending accents (EA). Matches
538 * will be done on MS first, and only when we match MS then some processing
539 * will be required for the prefix and end accents in order to determine if
540 * they match PA and EA. Hence the default shift values
541 * for the canonical match will take the size of either end's accent into
542 * consideration. Forwards search will take the end accents into consideration
543 * for the default shift values and the backwards search will take the prefix
544 * accents into consideration.
545 * If pattern has no non-ignorable ce, we return a illegal argument error.
546 * Internal method, status assumed to be success.
547 * @param strsrch UStringSearch data storage
548 * @param status for output errors if it occurs, status is assumed to be a
549 * success when it is passed in.
550 */
551 static
552 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
553 {
554 int16_t expandlength = initializePattern(strsrch, status);
555 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
556 UPattern *pattern = &strsrch->pattern;
557 int32_t cesize = pattern->cesLength;
558
559 int16_t minlength = cesize > expandlength
560 ? (int16_t)cesize - expandlength : 1;
561 pattern->defaultShiftSize = minlength;
562 setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
563 cesize, expandlength, minlength, minlength);
564 return;
565 }
566 strsrch->pattern.defaultShiftSize = 0;
567 }
568
569 #if BOYER_MOORE
570 /**
571 * Check to make sure that the match length is at the end of the character by
572 * using the breakiterator.
573 * @param strsrch string search data
574 * @param start target text start offset
575 * @param end target text end offset
576 */
577 static
578 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
579 int32_t *end)
580 {
581 #if !UCONFIG_NO_BREAK_ITERATION
582 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
583 if (breakiterator) {
584 int32_t matchend = *end;
585 //int32_t matchstart = *start;
586
587 if (!ubrk_isBoundary(breakiterator, matchend)) {
588 *end = ubrk_following(breakiterator, matchend);
589 }
590
591 /* Check the start of the matched text to make sure it doesn't have any accents
592 * before it. This code may not be necessary and so it is commented out */
593 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
594 *start = ubrk_preceding(breakiterator, matchstart);
595 }*/
596 }
597 #endif
598 }
599
600 /**
601 * Determine whether the target text in UStringSearch bounded by the offset
602 * start and end is one or more whole units of text as
603 * determined by the breakiterator in UStringSearch.
604 * @param strsrch string search data
605 * @param start target text start offset
606 * @param end target text end offset
607 */
608 static
609 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
610 int32_t end)
611 {
612 #if !UCONFIG_NO_BREAK_ITERATION
613 UBreakIterator *breakiterator = strsrch->search->breakIter;
614 //TODO: Add here.
615 if (breakiterator) {
616 int32_t startindex = ubrk_first(breakiterator);
617 int32_t endindex = ubrk_last(breakiterator);
618
619 // out-of-range indexes are never boundary positions
620 if (start < startindex || start > endindex ||
621 end < startindex || end > endindex) {
622 return FALSE;
623 }
624 // otherwise, we can use following() on the position before the
625 // specified one and return true of the position we get back is the
626 // one the user specified
627 UBool result = (start == startindex ||
628 ubrk_following(breakiterator, start - 1) == start) &&
629 (end == endindex ||
630 ubrk_following(breakiterator, end - 1) == end);
631 if (result) {
632 // iterates the individual ces
633 UCollationElements *coleiter = strsrch->utilIter;
634 const UChar *text = strsrch->search->text +
635 start;
636 UErrorCode status = U_ZERO_ERROR;
637 ucol_setText(coleiter, text, end - start, &status);
638 for (int32_t count = 0; count < strsrch->pattern.cesLength;
639 count ++) {
640 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
641 if (ce == UCOL_IGNORABLE) {
642 count --;
643 continue;
644 }
645 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
646 return FALSE;
647 }
648 }
649 int32_t nextce = ucol_next(coleiter, &status);
650 while (ucol_getOffset(coleiter) == (end - start)
651 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
652 nextce = ucol_next(coleiter, &status);
653 }
654 if (ucol_getOffset(coleiter) == (end - start)
655 && nextce != UCOL_NULLORDER) {
656 // extra collation elements at the end of the match
657 return FALSE;
658 }
659 }
660 return result;
661 }
662 #endif
663 return TRUE;
664 }
665
666 /**
667 * Getting the next base character offset if current offset is an accent,
668 * or the current offset if the current character contains a base character.
669 * accents the following base character will be returned
670 * @param text string
671 * @param textoffset current offset
672 * @param textlength length of text string
673 * @return the next base character or the current offset
674 * if the current character is contains a base character.
675 */
676 static
677 inline int32_t getNextBaseOffset(const UChar *text,
678 int32_t textoffset,
679 int32_t textlength)
680 {
681 if (textoffset < textlength) {
682 int32_t temp = textoffset;
683 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
684 while (temp < textlength) {
685 int32_t result = temp;
686 if ((getFCD(text, &temp, textlength) >>
687 SECOND_LAST_BYTE_SHIFT_) == 0) {
688 return result;
689 }
690 }
691 return textlength;
692 }
693 }
694 return textoffset;
695 }
696
697 /**
698 * Gets the next base character offset depending on the string search pattern
699 * data
700 * @param strsrch string search data
701 * @param textoffset current offset, one offset away from the last character
702 * to search for.
703 * @return start index of the next base character or the current offset
704 * if the current character is contains a base character.
705 */
706 static
707 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
708 int32_t textoffset)
709 {
710 int32_t textlength = strsrch->search->textLength;
711 if (strsrch->pattern.hasSuffixAccents &&
712 textoffset < textlength) {
713 int32_t temp = textoffset;
714 const UChar *text = strsrch->search->text;
715 U16_BACK_1(text, 0, temp);
716 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
717 return getNextBaseOffset(text, textoffset, textlength);
718 }
719 }
720 return textoffset;
721 }
722
723 /**
724 * Shifting the collation element iterator position forward to prepare for
725 * a following match. If the last character is a unsafe character, we'll only
726 * shift by 1 to capture contractions, normalization etc.
727 * Internal method, status assumed to be success.
728 * @param text strsrch string search data
729 * @param textoffset start text position to do search
730 * @param ce the text ce which failed the match.
731 * @param patternceindex index of the ce within the pattern ce buffer which
732 * failed the match
733 * @return final offset
734 */
735 static
736 inline int32_t shiftForward(UStringSearch *strsrch,
737 int32_t textoffset,
738 int32_t ce,
739 int32_t patternceindex)
740 {
741 UPattern *pattern = &(strsrch->pattern);
742 if (ce != UCOL_NULLORDER) {
743 int32_t shift = pattern->shift[hashFromCE32(ce)];
744 // this is to adjust for characters in the middle of the
745 // substring for matching that failed.
746 int32_t adjust = pattern->cesLength - patternceindex;
747 if (adjust > 1 && shift >= adjust) {
748 shift -= adjust - 1;
749 }
750 textoffset += shift;
751 }
752 else {
753 textoffset += pattern->defaultShiftSize;
754 }
755
756 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
757 // check for unsafe characters
758 // * if it is the start or middle of a contraction: to be done after
759 // a initial match is found
760 // * thai or lao base consonant character: similar to contraction
761 // * high surrogate character: similar to contraction
762 // * next character is a accent: shift to the next base character
763 return textoffset;
764 }
765 #endif // #if BOYER_MOORE
766
767 /**
768 * sets match not found
769 * @param strsrch string search data
770 */
771 static
772 inline void setMatchNotFound(UStringSearch *strsrch)
773 {
774 // this method resets the match result regardless of the error status.
775 strsrch->search->matchedIndex = USEARCH_DONE;
776 strsrch->search->matchedLength = 0;
777 if (strsrch->search->isForwardSearching) {
778 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
779 }
780 else {
781 setColEIterOffset(strsrch->textIter, 0);
782 }
783 }
784
785 #if BOYER_MOORE
786 /**
787 * Gets the offset to the next safe point in text.
788 * ie. not the middle of a contraction, swappable characters or supplementary
789 * characters.
790 * @param collator collation sata
791 * @param text string to work with
792 * @param textoffset offset in string
793 * @param textlength length of text string
794 * @return offset to the next safe character
795 */
796 static
797 inline int32_t getNextSafeOffset(const UCollator *collator,
798 const UChar *text,
799 int32_t textoffset,
800 int32_t textlength)
801 {
802 int32_t result = textoffset; // first contraction character
803 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
804 result ++;
805 }
806 return result;
807 }
808
809 /**
810 * This checks for accents in the potential match started with a .
811 * composite character.
812 * This is really painful... we have to check that composite character do not
813 * have any extra accents. We have to normalize the potential match and find
814 * the immediate decomposed character before the match.
815 * The first composite character would have been taken care of by the fcd
816 * checks in checkForwardExactMatch.
817 * This is the slow path after the fcd of the first character and
818 * the last character has been checked by checkForwardExactMatch and we
819 * determine that the potential match has extra non-ignorable preceding
820 * ces.
821 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
822 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
823 * Note here that accents checking are slow and cautioned in the API docs.
824 * Internal method, status assumed to be a success, caller should check status
825 * before calling this method
826 * @param strsrch string search data
827 * @param start index of the potential unfriendly composite character
828 * @param end index of the potential unfriendly composite character
829 * @param status output error status if any.
830 * @return TRUE if there is non-ignorable accents before at the beginning
831 * of the match, FALSE otherwise.
832 */
833
834 static
835 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
836 int32_t end,
837 UErrorCode *status)
838 {
839 UBool result = FALSE;
840 if (strsrch->pattern.hasPrefixAccents) {
841 int32_t length = end - start;
842 int32_t offset = 0;
843 const UChar *text = strsrch->search->text + start;
844
845 U16_FWD_1(text, offset, length);
846 // we are only concerned with the first composite character
847 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
848 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
849 text, 0, length);
850 if (safeoffset != length) {
851 safeoffset ++;
852 }
853 UChar *norm = NULL;
854 UChar buffer[INITIAL_ARRAY_SIZE_];
855 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
856 buffer, INITIAL_ARRAY_SIZE_,
857 status);
858 if (U_FAILURE(*status)) {
859 return FALSE;
860 }
861 if (size >= INITIAL_ARRAY_SIZE_) {
862 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
863 status);
864 // if allocation failed, status will be set to
865 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
866 // checks for it.
867 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
868 size, status);
869 if (U_FAILURE(*status) && norm != NULL) {
870 uprv_free(norm);
871 return FALSE;
872 }
873 }
874 else {
875 norm = buffer;
876 }
877
878 UCollationElements *coleiter = strsrch->utilIter;
879 ucol_setText(coleiter, norm, size, status);
880 uint32_t firstce = strsrch->pattern.ces[0];
881 UBool ignorable = TRUE;
882 uint32_t ce = UCOL_IGNORABLE;
883 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
884 offset = ucol_getOffset(coleiter);
885 if (ce != firstce && ce != UCOL_IGNORABLE) {
886 ignorable = FALSE;
887 }
888 ce = ucol_next(coleiter, status);
889 }
890 UChar32 codepoint;
891 U16_PREV(norm, 0, offset, codepoint);
892 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
893
894 if (norm != buffer) {
895 uprv_free(norm);
896 }
897 }
898 }
899
900 return result;
901 }
902
903 /**
904 * Used by exact matches, checks if there are accents before the match.
905 * This is really painful... we have to check that composite characters at
906 * the start of the matches have to not have any extra accents.
907 * We check the FCD of the character first, if it starts with an accent and
908 * the first pattern ce does not match the first ce of the character, we bail.
909 * Otherwise we try normalizing the first composite
910 * character and find the immediate decomposed character before the match to
911 * see if it is an non-ignorable accent.
912 * Now normalizing the first composite character is enough because we ensure
913 * that when the match is passed in here with extra beginning ces, the
914 * first or last ce that match has to occur within the first character.
915 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
916 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
917 * Note here that accents checking are slow and cautioned in the API docs.
918 * @param strsrch string search data
919 * @param start offset
920 * @param end offset
921 * @return TRUE if there are accents on either side of the match,
922 * FALSE otherwise
923 */
924 static
925 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
926 int32_t end)
927 {
928 if (strsrch->pattern.hasPrefixAccents) {
929 UCollationElements *coleiter = strsrch->textIter;
930 UErrorCode status = U_ZERO_ERROR;
931 // we have been iterating forwards previously
932 uint32_t ignorable = TRUE;
933 int32_t firstce = strsrch->pattern.ces[0];
934
935 setColEIterOffset(coleiter, start);
936 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
937 if (U_FAILURE(status)) {
938 return TRUE;
939 }
940 while (ce != firstce) {
941 if (ce != UCOL_IGNORABLE) {
942 ignorable = FALSE;
943 }
944 ce = getCE(strsrch, ucol_next(coleiter, &status));
945 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
946 return TRUE;
947 }
948 }
949 if (!ignorable && inNormBuf(coleiter)) {
950 // within normalization buffer, discontiguous handled here
951 return TRUE;
952 }
953
954 // within text
955 int32_t temp = start;
956 // original code
957 // accent = (getFCD(strsrch->search->text, &temp,
958 // strsrch->search->textLength)
959 // >> SECOND_LAST_BYTE_SHIFT_);
960 // however this code does not work well with VC7 .net in release mode.
961 // maybe the inlines for getFCD combined with shifting has bugs in
962 // VC7. anyways this is a work around.
963 UBool accent = getFCD(strsrch->search->text, &temp,
964 strsrch->search->textLength) > 0xFF;
965 if (!accent) {
966 return checkExtraMatchAccents(strsrch, start, end, &status);
967 }
968 if (!ignorable) {
969 return TRUE;
970 }
971 if (start > 0) {
972 temp = start;
973 U16_BACK_1(strsrch->search->text, 0, temp);
974 if (getFCD(strsrch->search->text, &temp,
975 strsrch->search->textLength) & LAST_BYTE_MASK_) {
976 setColEIterOffset(coleiter, start);
977 ce = ucol_previous(coleiter, &status);
978 if (U_FAILURE(status) ||
979 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
980 return TRUE;
981 }
982 }
983 }
984 }
985
986 return FALSE;
987 }
988
989 /**
990 * Used by exact matches, checks if there are accents bounding the match.
991 * Note this is the initial boundary check. If the potential match
992 * starts or ends with composite characters, the accents in those
993 * characters will be determined later.
994 * Not doing backwards iteration here, since discontiguos contraction for
995 * backwards collation element iterator, use up too many characters.
996 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
997 * should fail since there is a acute at the end of \u01FA
998 * Note here that accents checking are slow and cautioned in the API docs.
999 * @param strsrch string search data
1000 * @param start offset of match
1001 * @param end end offset of the match
1002 * @return TRUE if there are accents on either side of the match,
1003 * FALSE otherwise
1004 */
1005 static
1006 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1007 int32_t end)
1008 {
1009 if (strsrch->pattern.hasSuffixAccents) {
1010 const UChar *text = strsrch->search->text;
1011 int32_t temp = end;
1012 int32_t textlength = strsrch->search->textLength;
1013 U16_BACK_1(text, 0, temp);
1014 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1015 int32_t firstce = strsrch->pattern.ces[0];
1016 UCollationElements *coleiter = strsrch->textIter;
1017 UErrorCode status = U_ZERO_ERROR;
1018 int32_t ce;
1019 setColEIterOffset(coleiter, start);
1020 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1021 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1022 return TRUE;
1023 }
1024 }
1025 int32_t count = 1;
1026 while (count < strsrch->pattern.cesLength) {
1027 if (getCE(strsrch, ucol_next(coleiter, &status))
1028 == UCOL_IGNORABLE) {
1029 // Thai can give an ignorable here.
1030 count --;
1031 }
1032 if (U_FAILURE(status)) {
1033 return TRUE;
1034 }
1035 count ++;
1036 }
1037
1038 ce = ucol_next(coleiter, &status);
1039 if (U_FAILURE(status)) {
1040 return TRUE;
1041 }
1042 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1043 ce = getCE(strsrch, ce);
1044 }
1045 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1046 if (ucol_getOffset(coleiter) <= end) {
1047 return TRUE;
1048 }
1049 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1050 return TRUE;
1051 }
1052 }
1053 }
1054 }
1055 return FALSE;
1056 }
1057 #endif // #if BOYER_MOORE
1058
1059 /**
1060 * Checks if the offset runs out of the text string
1061 * @param offset
1062 * @param textlength of the text string
1063 * @return TRUE if offset is out of bounds, FALSE otherwise
1064 */
1065 static
1066 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1067 {
1068 return offset < 0 || offset > textlength;
1069 }
1070
1071 /**
1072 * Checks for identical match
1073 * @param strsrch string search data
1074 * @param start offset of possible match
1075 * @param end offset of possible match
1076 * @return TRUE if identical match is found
1077 */
1078 static
1079 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1080 int32_t end)
1081 {
1082 if (strsrch->strength != UCOL_IDENTICAL) {
1083 return TRUE;
1084 }
1085
1086 // Note: We could use Normalizer::compare() or similar, but for short strings
1087 // which may not be in FCD it might be faster to just NFD them.
1088 UErrorCode status = U_ZERO_ERROR;
1089 UnicodeString t2, p2;
1090 strsrch->nfd->normalize(
1091 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1092 strsrch->nfd->normalize(
1093 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1094 // return FALSE if NFD failed
1095 return U_SUCCESS(status) && t2 == p2;
1096 }
1097
1098 #if BOYER_MOORE
1099 /**
1100 * Checks to see if the match is repeated
1101 * @param strsrch string search data
1102 * @param start new match start index
1103 * @param end new match end index
1104 * @return TRUE if the the match is repeated, FALSE otherwise
1105 */
1106 static
1107 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1108 int32_t start,
1109 int32_t end)
1110 {
1111 int32_t lastmatchindex = strsrch->search->matchedIndex;
1112 UBool result;
1113 if (lastmatchindex == USEARCH_DONE) {
1114 return FALSE;
1115 }
1116 if (strsrch->search->isForwardSearching) {
1117 result = start <= lastmatchindex;
1118 }
1119 else {
1120 result = start >= lastmatchindex;
1121 }
1122 if (!result && !strsrch->search->isOverlap) {
1123 if (strsrch->search->isForwardSearching) {
1124 result = start < lastmatchindex + strsrch->search->matchedLength;
1125 }
1126 else {
1127 result = end > lastmatchindex;
1128 }
1129 }
1130 return result;
1131 }
1132
1133 /**
1134 * Gets the collation element iterator's current offset.
1135 * @param coleiter collation element iterator
1136 * @param forwards flag TRUE if we are moving in th forwards direction
1137 * @return current offset
1138 */
1139 static
1140 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1141 UBool forwards)
1142 {
1143 int32_t result = ucol_getOffset(coleiter);
1144 // intricacies of the the backwards collation element iterator
1145 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1146 result ++;
1147 }
1148 return result;
1149 }
1150
1151 /**
1152 * Checks match for contraction.
1153 * If the match ends with a partial contraction we fail.
1154 * If the match starts too far off (because of backwards iteration) we try to
1155 * chip off the extra characters depending on whether a breakiterator has
1156 * been used.
1157 * Internal method, error assumed to be success, caller has to check status
1158 * before calling this method.
1159 * @param strsrch string search data
1160 * @param start offset of potential match, to be modified if necessary
1161 * @param end offset of potential match, to be modified if necessary
1162 * @param status output error status if any
1163 * @return TRUE if match passes the contraction test, FALSE otherwise
1164 */
1165
1166 static
1167 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1168 int32_t *start,
1169 int32_t *end, UErrorCode *status)
1170 {
1171 UCollationElements *coleiter = strsrch->textIter;
1172 int32_t textlength = strsrch->search->textLength;
1173 int32_t temp = *start;
1174 const UCollator *collator = strsrch->collator;
1175 const UChar *text = strsrch->search->text;
1176 // This part checks if either ends of the match contains potential
1177 // contraction. If so we'll have to iterate through them
1178 // The start contraction needs to be checked since ucol_previous dumps
1179 // all characters till the first safe character into the buffer.
1180 // *start + 1 is used to test for the unsafe characters instead of *start
1181 // because ucol_prev takes all unsafe characters till the first safe
1182 // character ie *start. so by testing *start + 1, we can estimate if
1183 // excess prefix characters has been included in the potential search
1184 // results.
1185 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1186 (*start + 1 < textlength
1187 && ucol_unsafeCP(text[*start + 1], collator))) {
1188 int32_t expansion = getExpansionPrefix(coleiter);
1189 UBool expandflag = expansion > 0;
1190 setColEIterOffset(coleiter, *start);
1191 while (expansion > 0) {
1192 // getting rid of the redundant ce, caused by setOffset.
1193 // since backward contraction/expansion may have extra ces if we
1194 // are in the normalization buffer, hasAccentsBeforeMatch would
1195 // have taken care of it.
1196 // E.g. the character \u01FA will have an expansion of 3, but if
1197 // we are only looking for acute and ring \u030A and \u0301, we'll
1198 // have to skip the first ce in the expansion buffer.
1199 ucol_next(coleiter, status);
1200 if (U_FAILURE(*status)) {
1201 return FALSE;
1202 }
1203 if (ucol_getOffset(coleiter) != temp) {
1204 *start = temp;
1205 temp = ucol_getOffset(coleiter);
1206 }
1207 expansion --;
1208 }
1209
1210 int32_t *patternce = strsrch->pattern.ces;
1211 int32_t patterncelength = strsrch->pattern.cesLength;
1212 int32_t count = 0;
1213 while (count < patterncelength) {
1214 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1215 if (ce == UCOL_IGNORABLE) {
1216 continue;
1217 }
1218 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1219 *start = temp;
1220 temp = ucol_getOffset(coleiter);
1221 }
1222 if (U_FAILURE(*status) || ce != patternce[count]) {
1223 (*end) ++;
1224 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1225 return FALSE;
1226 }
1227 count ++;
1228 }
1229 }
1230 return TRUE;
1231 }
1232
1233 /**
1234 * Checks and sets the match information if found.
1235 * Checks
1236 * <ul>
1237 * <li> the potential match does not repeat the previous match
1238 * <li> boundaries are correct
1239 * <li> exact matches has no extra accents
1240 * <li> identical matchesb
1241 * <li> potential match does not end in the middle of a contraction
1242 * <\ul>
1243 * Otherwise the offset will be shifted to the next character.
1244 * Internal method, status assumed to be success, caller has to check status
1245 * before calling this method.
1246 * @param strsrch string search data
1247 * @param textoffset offset in the collation element text. the returned value
1248 * will be the truncated end offset of the match or the new start
1249 * search offset.
1250 * @param status output error status if any
1251 * @return TRUE if the match is valid, FALSE otherwise
1252 */
1253 static
1254 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1255 int32_t *textoffset, UErrorCode *status)
1256 {
1257 UCollationElements *coleiter = strsrch->textIter;
1258 int32_t start = getColElemIterOffset(coleiter, FALSE);
1259
1260 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1261 return FALSE;
1262 }
1263
1264 // this totally matches, however we need to check if it is repeating
1265 if (!isBreakUnit(strsrch, start, *textoffset) ||
1266 checkRepeatedMatch(strsrch, start, *textoffset) ||
1267 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1268 !checkIdentical(strsrch, start, *textoffset) ||
1269 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1270
1271 (*textoffset) ++;
1272 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1273 return FALSE;
1274 }
1275
1276 //Add breakiterator boundary check for primary strength search.
1277 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1278 checkBreakBoundary(strsrch, &start, textoffset);
1279 }
1280
1281 // totally match, we will get rid of the ending ignorables.
1282 strsrch->search->matchedIndex = start;
1283 strsrch->search->matchedLength = *textoffset - start;
1284 return TRUE;
1285 }
1286
1287 /**
1288 * Getting the previous base character offset, or the current offset if the
1289 * current character is a base character
1290 * @param text string
1291 * @param textoffset one offset after the current character
1292 * @return the offset of the next character after the base character or the first
1293 * composed character with accents
1294 */
1295 static
1296 inline int32_t getPreviousBaseOffset(const UChar *text,
1297 int32_t textoffset)
1298 {
1299 if (textoffset > 0) {
1300 for (;;) {
1301 int32_t result = textoffset;
1302 U16_BACK_1(text, 0, textoffset);
1303 int32_t temp = textoffset;
1304 uint16_t fcd = getFCD(text, &temp, result);
1305 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1306 if (fcd & LAST_BYTE_MASK_) {
1307 return textoffset;
1308 }
1309 return result;
1310 }
1311 if (textoffset == 0) {
1312 return 0;
1313 }
1314 }
1315 }
1316 return textoffset;
1317 }
1318
1319 /**
1320 * Getting the indexes of the accents that are not blocked in the argument
1321 * accent array
1322 * @param accents array of accents in nfd terminated by a 0.
1323 * @param accentsindex array of indexes of the accents that are not blocked
1324 */
1325 static
1326 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1327 {
1328 int32_t index = 0;
1329 int32_t length = u_strlen(accents);
1330 UChar32 codepoint = 0;
1331 int cclass = 0;
1332 int result = 0;
1333 int32_t temp;
1334 while (index < length) {
1335 temp = index;
1336 U16_NEXT(accents, index, length, codepoint);
1337 if (u_getCombiningClass(codepoint) != cclass) {
1338 cclass = u_getCombiningClass(codepoint);
1339 accentsindex[result] = temp;
1340 result ++;
1341 }
1342 }
1343 accentsindex[result] = length;
1344 return result;
1345 }
1346
1347 /**
1348 * Appends 3 UChar arrays to a destination array.
1349 * Creates a new array if we run out of space. The caller will have to
1350 * manually deallocate the newly allocated array.
1351 * Internal method, status assumed to be success, caller has to check status
1352 * before calling this method. destination not to be NULL and has at least
1353 * size destinationlength.
1354 * @param destination target array
1355 * @param destinationlength target array size, returning the appended length
1356 * @param source1 null-terminated first array
1357 * @param source2 second array
1358 * @param source2length length of seond array
1359 * @param source3 null-terminated third array
1360 * @param status error status if any
1361 * @return new destination array, destination if there was no new allocation
1362 */
1363 static
1364 inline UChar * addToUCharArray( UChar *destination,
1365 int32_t *destinationlength,
1366 const UChar *source1,
1367 const UChar *source2,
1368 int32_t source2length,
1369 const UChar *source3,
1370 UErrorCode *status)
1371 {
1372 int32_t source1length = source1 ? u_strlen(source1) : 0;
1373 int32_t source3length = source3 ? u_strlen(source3) : 0;
1374 if (*destinationlength < source1length + source2length + source3length +
1375 1)
1376 {
1377 destination = (UChar *)allocateMemory(
1378 (source1length + source2length + source3length + 1) * sizeof(UChar),
1379 status);
1380 // if error allocating memory, status will be
1381 // U_MEMORY_ALLOCATION_ERROR
1382 if (U_FAILURE(*status)) {
1383 *destinationlength = 0;
1384 return NULL;
1385 }
1386 }
1387 if (source1length != 0) {
1388 u_memcpy(destination, source1, source1length);
1389 }
1390 if (source2length != 0) {
1391 uprv_memcpy(destination + source1length, source2,
1392 sizeof(UChar) * source2length);
1393 }
1394 if (source3length != 0) {
1395 uprv_memcpy(destination + source1length + source2length, source3,
1396 sizeof(UChar) * source3length);
1397 }
1398 *destinationlength = source1length + source2length + source3length;
1399 return destination;
1400 }
1401
1402 /**
1403 * Running through a collation element iterator to see if the contents matches
1404 * pattern in string search data
1405 * @param strsrch string search data
1406 * @param coleiter collation element iterator
1407 * @return TRUE if a match if found, FALSE otherwise
1408 */
1409 static
1410 inline UBool checkCollationMatch(const UStringSearch *strsrch,
1411 UCollationElements *coleiter)
1412 {
1413 int patternceindex = strsrch->pattern.cesLength;
1414 int32_t *patternce = strsrch->pattern.ces;
1415 UErrorCode status = U_ZERO_ERROR;
1416 while (patternceindex > 0) {
1417 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1418 if (ce == UCOL_IGNORABLE) {
1419 continue;
1420 }
1421 if (U_FAILURE(status) || ce != *patternce) {
1422 return FALSE;
1423 }
1424 patternce ++;
1425 patternceindex --;
1426 }
1427 return TRUE;
1428 }
1429
1430 /**
1431 * Rearranges the front accents to try matching.
1432 * Prefix accents in the text will be grouped according to their combining
1433 * class and the groups will be mixed and matched to try find the perfect
1434 * match with the pattern.
1435 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1436 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1437 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1438 * "\u0301\u0325".
1439 * step 2: check if any of the generated substrings matches the pattern.
1440 * Internal method, status is assumed to be success, caller has to check status
1441 * before calling this method.
1442 * @param strsrch string search match
1443 * @param start first offset of the accents to start searching
1444 * @param end start of the last accent set
1445 * @param status output error status if any
1446 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1447 * offset of the match. Note this start includes all preceding accents.
1448 */
1449 static
1450 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1451 int32_t start,
1452 int32_t end,
1453 UErrorCode *status)
1454 {
1455 const UChar *text = strsrch->search->text;
1456 int32_t textlength = strsrch->search->textLength;
1457 int32_t tempstart = start;
1458
1459 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1460 // die... failed at a base character
1461 return USEARCH_DONE;
1462 }
1463
1464 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1465 start = getPreviousBaseOffset(text, tempstart);
1466
1467 UChar accents[INITIAL_ARRAY_SIZE_];
1468 // normalizing the offensive string
1469 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1470 INITIAL_ARRAY_SIZE_, status);
1471 if (U_FAILURE(*status)) {
1472 return USEARCH_DONE;
1473 }
1474
1475 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1476 int32_t accentsize = getUnblockedAccentIndex(accents,
1477 accentsindex);
1478 int32_t count = (2 << (accentsize - 1)) - 1;
1479 UChar buffer[INITIAL_ARRAY_SIZE_];
1480 UCollationElements *coleiter = strsrch->utilIter;
1481 while (U_SUCCESS(*status) && count > 0) {
1482 UChar *rearrange = strsrch->canonicalPrefixAccents;
1483 // copy the base characters
1484 for (int k = 0; k < accentsindex[0]; k ++) {
1485 *rearrange ++ = accents[k];
1486 }
1487 // forming all possible canonical rearrangement by dropping
1488 // sets of accents
1489 for (int i = 0; i <= accentsize - 1; i ++) {
1490 int32_t mask = 1 << (accentsize - i - 1);
1491 if (count & mask) {
1492 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1493 *rearrange ++ = accents[j];
1494 }
1495 }
1496 }
1497 *rearrange = 0;
1498 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1499 UChar *match = addToUCharArray(buffer, &matchsize,
1500 strsrch->canonicalPrefixAccents,
1501 strsrch->search->text + offset,
1502 end - offset,
1503 strsrch->canonicalSuffixAccents,
1504 status);
1505
1506 // if status is a failure, ucol_setText does nothing.
1507 // run the collator iterator through this match
1508 ucol_setText(coleiter, match, matchsize, status);
1509 if (U_SUCCESS(*status)) {
1510 if (checkCollationMatch(strsrch, coleiter)) {
1511 if (match != buffer) {
1512 uprv_free(match);
1513 }
1514 return start;
1515 }
1516 }
1517 count --;
1518 }
1519 return USEARCH_DONE;
1520 }
1521
1522 /**
1523 * Gets the offset to the safe point in text before textoffset.
1524 * ie. not the middle of a contraction, swappable characters or supplementary
1525 * characters.
1526 * @param collator collation sata
1527 * @param text string to work with
1528 * @param textoffset offset in string
1529 * @param textlength length of text string
1530 * @return offset to the previous safe character
1531 */
1532 static
1533 inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1534 const UChar *text,
1535 int32_t textoffset)
1536 {
1537 int32_t result = textoffset; // first contraction character
1538 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1539 result --;
1540 }
1541 if (result != 0) {
1542 // the first contraction character is consider unsafe here
1543 result --;
1544 }
1545 return result;
1546 }
1547
1548 /**
1549 * Cleaning up after we passed the safe zone
1550 * @param strsrch string search data
1551 * @param safetext safe text array
1552 * @param safebuffer safe text buffer
1553 * @param coleiter collation element iterator for safe text
1554 */
1555 static
1556 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1557 UChar *safebuffer)
1558 {
1559 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1560 {
1561 uprv_free(safetext);
1562 }
1563 }
1564
1565 /**
1566 * Take the rearranged end accents and tries matching. If match failed at
1567 * a seperate preceding set of accents (seperated from the rearranged on by
1568 * at least a base character) then we rearrange the preceding accents and
1569 * tries matching again.
1570 * We allow skipping of the ends of the accent set if the ces do not match.
1571 * However if the failure is found before the accent set, it fails.
1572 * Internal method, status assumed to be success, caller has to check status
1573 * before calling this method.
1574 * @param strsrch string search data
1575 * @param textoffset of the start of the rearranged accent
1576 * @param status output error status if any
1577 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1578 * offset of the match. Note this start includes all preceding accents.
1579 */
1580 static
1581 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1582 int32_t textoffset,
1583 UErrorCode *status)
1584 {
1585 const UChar *text = strsrch->search->text;
1586 const UCollator *collator = strsrch->collator;
1587 int32_t safelength = 0;
1588 UChar *safetext;
1589 int32_t safetextlength;
1590 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1591 UCollationElements *coleiter = strsrch->utilIter;
1592 int32_t safeoffset = textoffset;
1593
1594 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1595 collator)) {
1596 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1597 safelength = textoffset - safeoffset;
1598 safetextlength = INITIAL_ARRAY_SIZE_;
1599 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1600 text + safeoffset, safelength,
1601 strsrch->canonicalSuffixAccents,
1602 status);
1603 }
1604 else {
1605 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1606 safetext = strsrch->canonicalSuffixAccents;
1607 }
1608
1609 // if status is a failure, ucol_setText does nothing
1610 ucol_setText(coleiter, safetext, safetextlength, status);
1611 // status checked in loop below
1612
1613 int32_t *ce = strsrch->pattern.ces;
1614 int32_t celength = strsrch->pattern.cesLength;
1615 int ceindex = celength - 1;
1616 UBool isSafe = TRUE; // indication flag for position in safe zone
1617
1618 while (ceindex >= 0) {
1619 int32_t textce = ucol_previous(coleiter, status);
1620 if (U_FAILURE(*status)) {
1621 if (isSafe) {
1622 cleanUpSafeText(strsrch, safetext, safebuffer);
1623 }
1624 return USEARCH_DONE;
1625 }
1626 if (textce == UCOL_NULLORDER) {
1627 // check if we have passed the safe buffer
1628 if (coleiter == strsrch->textIter) {
1629 cleanUpSafeText(strsrch, safetext, safebuffer);
1630 return USEARCH_DONE;
1631 }
1632 cleanUpSafeText(strsrch, safetext, safebuffer);
1633 safetext = safebuffer;
1634 coleiter = strsrch->textIter;
1635 setColEIterOffset(coleiter, safeoffset);
1636 // status checked at the start of the loop
1637 isSafe = FALSE;
1638 continue;
1639 }
1640 textce = getCE(strsrch, textce);
1641 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1642 // do the beginning stuff
1643 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1644 if (isSafe && failedoffset >= safelength) {
1645 // alas... no hope. failed at rearranged accent set
1646 cleanUpSafeText(strsrch, safetext, safebuffer);
1647 return USEARCH_DONE;
1648 }
1649 else {
1650 if (isSafe) {
1651 failedoffset += safeoffset;
1652 cleanUpSafeText(strsrch, safetext, safebuffer);
1653 }
1654
1655 // try rearranging the front accents
1656 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1657 failedoffset, textoffset, status);
1658 if (result != USEARCH_DONE) {
1659 // if status is a failure, ucol_setOffset does nothing
1660 setColEIterOffset(strsrch->textIter, result);
1661 }
1662 if (U_FAILURE(*status)) {
1663 return USEARCH_DONE;
1664 }
1665 return result;
1666 }
1667 }
1668 if (textce == ce[ceindex]) {
1669 ceindex --;
1670 }
1671 }
1672 // set offset here
1673 if (isSafe) {
1674 int32_t result = getColElemIterOffset(coleiter, FALSE);
1675 // sets the text iterator here with the correct expansion and offset
1676 int32_t leftoverces = getExpansionPrefix(coleiter);
1677 cleanUpSafeText(strsrch, safetext, safebuffer);
1678 if (result >= safelength) {
1679 result = textoffset;
1680 }
1681 else {
1682 result += safeoffset;
1683 }
1684 setColEIterOffset(strsrch->textIter, result);
1685 strsrch->textIter->iteratordata_.toReturn =
1686 setExpansionPrefix(strsrch->textIter, leftoverces);
1687 return result;
1688 }
1689
1690 return ucol_getOffset(coleiter);
1691 }
1692
1693 /**
1694 * Trying out the substring and sees if it can be a canonical match.
1695 * This will try normalizing the end accents and arranging them into canonical
1696 * equivalents and check their corresponding ces with the pattern ce.
1697 * Suffix accents in the text will be grouped according to their combining
1698 * class and the groups will be mixed and matched to try find the perfect
1699 * match with the pattern.
1700 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1701 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1702 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1703 * "\u0301\u0325".
1704 * step 2: check if any of the generated substrings matches the pattern.
1705 * Internal method, status assumed to be success, caller has to check status
1706 * before calling this method.
1707 * @param strsrch string search data
1708 * @param textoffset end offset in the collation element text that ends with
1709 * the accents to be rearranged
1710 * @param status error status if any
1711 * @return TRUE if the match is valid, FALSE otherwise
1712 */
1713 static
1714 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1715 int32_t textoffset,
1716 UErrorCode *status)
1717 {
1718 const UChar *text = strsrch->search->text;
1719 int32_t temp = textoffset;
1720 U16_BACK_1(text, 0, temp);
1721 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1722 UCollationElements *coleiter = strsrch->textIter;
1723 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1724 if (strsrch->pattern.hasPrefixAccents) {
1725 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1726 status);
1727 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1728 setColEIterOffset(coleiter, offset);
1729 return TRUE;
1730 }
1731 }
1732 return FALSE;
1733 }
1734
1735 if (!strsrch->pattern.hasSuffixAccents) {
1736 return FALSE;
1737 }
1738
1739 UChar accents[INITIAL_ARRAY_SIZE_];
1740 // offset to the last base character in substring to search
1741 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1742 // normalizing the offensive string
1743 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1744 0, accents, INITIAL_ARRAY_SIZE_, status);
1745 // status checked in loop below
1746
1747 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1748 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1749
1750 // 2 power n - 1 plus the full set of accents
1751 int32_t count = (2 << (size - 1)) - 1;
1752 while (U_SUCCESS(*status) && count > 0) {
1753 UChar *rearrange = strsrch->canonicalSuffixAccents;
1754 // copy the base characters
1755 for (int k = 0; k < accentsindex[0]; k ++) {
1756 *rearrange ++ = accents[k];
1757 }
1758 // forming all possible canonical rearrangement by dropping
1759 // sets of accents
1760 for (int i = 0; i <= size - 1; i ++) {
1761 int32_t mask = 1 << (size - i - 1);
1762 if (count & mask) {
1763 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1764 *rearrange ++ = accents[j];
1765 }
1766 }
1767 }
1768 *rearrange = 0;
1769 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1770 status);
1771 if (offset != USEARCH_DONE) {
1772 return TRUE; // match found
1773 }
1774 count --;
1775 }
1776 return FALSE;
1777 }
1778
1779 /**
1780 * Gets the previous base character offset depending on the string search
1781 * pattern data
1782 * @param strsrch string search data
1783 * @param textoffset current offset, current character
1784 * @return the offset of the next character after this base character or itself
1785 * if it is a composed character with accents
1786 */
1787 static
1788 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1789 int32_t textoffset)
1790 {
1791 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1792 const UChar *text = strsrch->search->text;
1793 int32_t offset = textoffset;
1794 if (getFCD(text, &offset, strsrch->search->textLength) >>
1795 SECOND_LAST_BYTE_SHIFT_) {
1796 return getPreviousBaseOffset(text, textoffset);
1797 }
1798 }
1799 return textoffset;
1800 }
1801
1802 /**
1803 * Checks match for contraction.
1804 * If the match ends with a partial contraction we fail.
1805 * If the match starts too far off (because of backwards iteration) we try to
1806 * chip off the extra characters
1807 * Internal method, status assumed to be success, caller has to check status
1808 * before calling this method.
1809 * @param strsrch string search data
1810 * @param start offset of potential match, to be modified if necessary
1811 * @param end offset of potential match, to be modified if necessary
1812 * @param status output error status if any
1813 * @return TRUE if match passes the contraction test, FALSE otherwise
1814 */
1815 static
1816 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1817 int32_t *start,
1818 int32_t *end,
1819 UErrorCode *status)
1820 {
1821 UCollationElements *coleiter = strsrch->textIter;
1822 int32_t textlength = strsrch->search->textLength;
1823 int32_t temp = *start;
1824 const UCollator *collator = strsrch->collator;
1825 const UChar *text = strsrch->search->text;
1826 // This part checks if either ends of the match contains potential
1827 // contraction. If so we'll have to iterate through them
1828 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1829 (*start + 1 < textlength
1830 && ucol_unsafeCP(text[*start + 1], collator))) {
1831 int32_t expansion = getExpansionPrefix(coleiter);
1832 UBool expandflag = expansion > 0;
1833 setColEIterOffset(coleiter, *start);
1834 while (expansion > 0) {
1835 // getting rid of the redundant ce, caused by setOffset.
1836 // since backward contraction/expansion may have extra ces if we
1837 // are in the normalization buffer, hasAccentsBeforeMatch would
1838 // have taken care of it.
1839 // E.g. the character \u01FA will have an expansion of 3, but if
1840 // we are only looking for acute and ring \u030A and \u0301, we'll
1841 // have to skip the first ce in the expansion buffer.
1842 ucol_next(coleiter, status);
1843 if (U_FAILURE(*status)) {
1844 return FALSE;
1845 }
1846 if (ucol_getOffset(coleiter) != temp) {
1847 *start = temp;
1848 temp = ucol_getOffset(coleiter);
1849 }
1850 expansion --;
1851 }
1852
1853 int32_t *patternce = strsrch->pattern.ces;
1854 int32_t patterncelength = strsrch->pattern.cesLength;
1855 int32_t count = 0;
1856 int32_t textlength = strsrch->search->textLength;
1857 while (count < patterncelength) {
1858 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1859 // status checked below, note that if status is a failure
1860 // ucol_next returns UCOL_NULLORDER
1861 if (ce == UCOL_IGNORABLE) {
1862 continue;
1863 }
1864 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1865 *start = temp;
1866 temp = ucol_getOffset(coleiter);
1867 }
1868
1869 if (count == 0 && ce != patternce[0]) {
1870 // accents may have extra starting ces, this occurs when a
1871 // pure accent pattern is matched without rearrangement
1872 // text \u0325\u0300 and looking for \u0300
1873 int32_t expected = patternce[0];
1874 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1875 ce = getCE(strsrch, ucol_next(coleiter, status));
1876 while (U_SUCCESS(*status) && ce != expected &&
1877 ce != UCOL_NULLORDER &&
1878 ucol_getOffset(coleiter) <= *end) {
1879 ce = getCE(strsrch, ucol_next(coleiter, status));
1880 }
1881 }
1882 }
1883 if (U_FAILURE(*status) || ce != patternce[count]) {
1884 (*end) ++;
1885 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1886 return FALSE;
1887 }
1888 count ++;
1889 }
1890 }
1891 return TRUE;
1892 }
1893
1894 /**
1895 * Checks and sets the match information if found.
1896 * Checks
1897 * <ul>
1898 * <li> the potential match does not repeat the previous match
1899 * <li> boundaries are correct
1900 * <li> potential match does not end in the middle of a contraction
1901 * <li> identical matches
1902 * <\ul>
1903 * Otherwise the offset will be shifted to the next character.
1904 * Internal method, status assumed to be success, caller has to check the
1905 * status before calling this method.
1906 * @param strsrch string search data
1907 * @param textoffset offset in the collation element text. the returned value
1908 * will be the truncated end offset of the match or the new start
1909 * search offset.
1910 * @param status output error status if any
1911 * @return TRUE if the match is valid, FALSE otherwise
1912 */
1913 static
1914 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1915 int32_t *textoffset,
1916 UErrorCode *status)
1917 {
1918 // to ensure that the start and ends are not composite characters
1919 UCollationElements *coleiter = strsrch->textIter;
1920 // if we have a canonical accent match
1921 if ((strsrch->pattern.hasSuffixAccents &&
1922 strsrch->canonicalSuffixAccents[0]) ||
1923 (strsrch->pattern.hasPrefixAccents &&
1924 strsrch->canonicalPrefixAccents[0])) {
1925 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1926 strsrch,
1927 ucol_getOffset(coleiter));
1928 strsrch->search->matchedLength = *textoffset -
1929 strsrch->search->matchedIndex;
1930 return TRUE;
1931 }
1932
1933 int32_t start = getColElemIterOffset(coleiter, FALSE);
1934 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1935 status) || U_FAILURE(*status)) {
1936 return FALSE;
1937 }
1938
1939 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1940 // this totally matches, however we need to check if it is repeating
1941 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1942 !isBreakUnit(strsrch, start, *textoffset) ||
1943 !checkIdentical(strsrch, start, *textoffset)) {
1944 (*textoffset) ++;
1945 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1946 strsrch->search->textLength);
1947 return FALSE;
1948 }
1949
1950 strsrch->search->matchedIndex = start;
1951 strsrch->search->matchedLength = *textoffset - start;
1952 return TRUE;
1953 }
1954
1955 /**
1956 * Shifting the collation element iterator position forward to prepare for
1957 * a preceding match. If the first character is a unsafe character, we'll only
1958 * shift by 1 to capture contractions, normalization etc.
1959 * Internal method, status assumed to be success, caller has to check status
1960 * before calling this method.
1961 * @param text strsrch string search data
1962 * @param textoffset start text position to do search
1963 * @param ce the text ce which failed the match.
1964 * @param patternceindex index of the ce within the pattern ce buffer which
1965 * failed the match
1966 * @return final offset
1967 */
1968 static
1969 inline int32_t reverseShift(UStringSearch *strsrch,
1970 int32_t textoffset,
1971 int32_t ce,
1972 int32_t patternceindex)
1973 {
1974 if (strsrch->search->isOverlap) {
1975 if (textoffset != strsrch->search->textLength) {
1976 textoffset --;
1977 }
1978 else {
1979 textoffset -= strsrch->pattern.defaultShiftSize;
1980 }
1981 }
1982 else {
1983 if (ce != UCOL_NULLORDER) {
1984 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1985
1986 // this is to adjust for characters in the middle of the substring
1987 // for matching that failed.
1988 int32_t adjust = patternceindex;
1989 if (adjust > 1 && shift > adjust) {
1990 shift -= adjust - 1;
1991 }
1992 textoffset -= shift;
1993 }
1994 else {
1995 textoffset -= strsrch->pattern.defaultShiftSize;
1996 }
1997 }
1998 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1999 return textoffset;
2000 }
2001
2002 /**
2003 * Checks match for contraction.
2004 * If the match starts with a partial contraction we fail.
2005 * Internal method, status assumed to be success, caller has to check status
2006 * before calling this method.
2007 * @param strsrch string search data
2008 * @param start offset of potential match, to be modified if necessary
2009 * @param end offset of potential match, to be modified if necessary
2010 * @param status output error status if any
2011 * @return TRUE if match passes the contraction test, FALSE otherwise
2012 */
2013 static
2014 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2015 int32_t *start,
2016 int32_t *end, UErrorCode *status)
2017 {
2018 UCollationElements *coleiter = strsrch->textIter;
2019 int32_t textlength = strsrch->search->textLength;
2020 int32_t temp = *end;
2021 const UCollator *collator = strsrch->collator;
2022 const UChar *text = strsrch->search->text;
2023 // This part checks if either if the start of the match contains potential
2024 // contraction. If so we'll have to iterate through them
2025 // Since we used ucol_next while previously looking for the potential
2026 // match, this guarantees that our end will not be a partial contraction,
2027 // or a partial supplementary character.
2028 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2029 int32_t expansion = getExpansionSuffix(coleiter);
2030 UBool expandflag = expansion > 0;
2031 setColEIterOffset(coleiter, *end);
2032 while (U_SUCCESS(*status) && expansion > 0) {
2033 // getting rid of the redundant ce
2034 // since forward contraction/expansion may have extra ces
2035 // if we are in the normalization buffer, hasAccentsBeforeMatch
2036 // would have taken care of it.
2037 // E.g. the character \u01FA will have an expansion of 3, but if
2038 // we are only looking for A ring A\u030A, we'll have to skip the
2039 // last ce in the expansion buffer
2040 ucol_previous(coleiter, status);
2041 if (U_FAILURE(*status)) {
2042 return FALSE;
2043 }
2044 if (ucol_getOffset(coleiter) != temp) {
2045 *end = temp;
2046 temp = ucol_getOffset(coleiter);
2047 }
2048 expansion --;
2049 }
2050
2051 int32_t *patternce = strsrch->pattern.ces;
2052 int32_t patterncelength = strsrch->pattern.cesLength;
2053 int32_t count = patterncelength;
2054 while (count > 0) {
2055 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2056 // status checked below, note that if status is a failure
2057 // ucol_previous returns UCOL_NULLORDER
2058 if (ce == UCOL_IGNORABLE) {
2059 continue;
2060 }
2061 if (expandflag && count == 0 &&
2062 getColElemIterOffset(coleiter, FALSE) != temp) {
2063 *end = temp;
2064 temp = ucol_getOffset(coleiter);
2065 }
2066 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2067 (*start) --;
2068 *start = getPreviousBaseOffset(text, *start);
2069 return FALSE;
2070 }
2071 count --;
2072 }
2073 }
2074 return TRUE;
2075 }
2076
2077 /**
2078 * Checks and sets the match information if found.
2079 * Checks
2080 * <ul>
2081 * <li> the current match does not repeat the last match
2082 * <li> boundaries are correct
2083 * <li> exact matches has no extra accents
2084 * <li> identical matches
2085 * <\ul>
2086 * Otherwise the offset will be shifted to the preceding character.
2087 * Internal method, status assumed to be success, caller has to check status
2088 * before calling this method.
2089 * @param strsrch string search data
2090 * @param collator
2091 * @param coleiter collation element iterator
2092 * @param text string
2093 * @param textoffset offset in the collation element text. the returned value
2094 * will be the truncated start offset of the match or the new start
2095 * search offset.
2096 * @param status output error status if any
2097 * @return TRUE if the match is valid, FALSE otherwise
2098 */
2099 static
2100 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2101 int32_t *textoffset,
2102 UErrorCode *status)
2103 {
2104 // to ensure that the start and ends are not composite characters
2105 int32_t end = ucol_getOffset(strsrch->textIter);
2106 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2107 || U_FAILURE(*status)) {
2108 return FALSE;
2109 }
2110
2111 // this totally matches, however we need to check if it is repeating
2112 // the old match
2113 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2114 !isBreakUnit(strsrch, *textoffset, end) ||
2115 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2116 !checkIdentical(strsrch, *textoffset, end) ||
2117 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2118 (*textoffset) --;
2119 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2120 *textoffset);
2121 return FALSE;
2122 }
2123
2124 //Add breakiterator boundary check for primary strength search.
2125 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2126 checkBreakBoundary(strsrch, textoffset, &end);
2127 }
2128
2129 strsrch->search->matchedIndex = *textoffset;
2130 strsrch->search->matchedLength = end - *textoffset;
2131 return TRUE;
2132 }
2133
2134 /**
2135 * Rearranges the end accents to try matching.
2136 * Suffix accents in the text will be grouped according to their combining
2137 * class and the groups will be mixed and matched to try find the perfect
2138 * match with the pattern.
2139 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2140 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2141 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2142 * "\u0301\u0325".
2143 * step 2: check if any of the generated substrings matches the pattern.
2144 * Internal method, status assumed to be success, user has to check status
2145 * before calling this method.
2146 * @param strsrch string search match
2147 * @param start offset of the first base character
2148 * @param end start of the last accent set
2149 * @param status only error status if any
2150 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2151 * offset of the match. Note this start includes all following accents.
2152 */
2153 static
2154 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2155 int32_t start,
2156 int32_t end,
2157 UErrorCode *status)
2158 {
2159 const UChar *text = strsrch->search->text;
2160 int32_t tempend = end;
2161
2162 U16_BACK_1(text, 0, tempend);
2163 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2164 LAST_BYTE_MASK_)) {
2165 // die... failed at a base character
2166 return USEARCH_DONE;
2167 }
2168 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2169
2170 if (U_SUCCESS(*status)) {
2171 UChar accents[INITIAL_ARRAY_SIZE_];
2172 int32_t offset = getPreviousBaseOffset(text, end);
2173 // normalizing the offensive string
2174 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2175 INITIAL_ARRAY_SIZE_, status);
2176
2177 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2178 int32_t accentsize = getUnblockedAccentIndex(accents,
2179 accentsindex);
2180 int32_t count = (2 << (accentsize - 1)) - 1;
2181 UChar buffer[INITIAL_ARRAY_SIZE_];
2182 UCollationElements *coleiter = strsrch->utilIter;
2183 while (U_SUCCESS(*status) && count > 0) {
2184 UChar *rearrange = strsrch->canonicalSuffixAccents;
2185 // copy the base characters
2186 for (int k = 0; k < accentsindex[0]; k ++) {
2187 *rearrange ++ = accents[k];
2188 }
2189 // forming all possible canonical rearrangement by dropping
2190 // sets of accents
2191 for (int i = 0; i <= accentsize - 1; i ++) {
2192 int32_t mask = 1 << (accentsize - i - 1);
2193 if (count & mask) {
2194 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2195 *rearrange ++ = accents[j];
2196 }
2197 }
2198 }
2199 *rearrange = 0;
2200 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2201 UChar *match = addToUCharArray(buffer, &matchsize,
2202 strsrch->canonicalPrefixAccents,
2203 strsrch->search->text + start,
2204 offset - start,
2205 strsrch->canonicalSuffixAccents,
2206 status);
2207
2208 // run the collator iterator through this match
2209 // if status is a failure ucol_setText does nothing
2210 ucol_setText(coleiter, match, matchsize, status);
2211 if (U_SUCCESS(*status)) {
2212 if (checkCollationMatch(strsrch, coleiter)) {
2213 if (match != buffer) {
2214 uprv_free(match);
2215 }
2216 return end;
2217 }
2218 }
2219 count --;
2220 }
2221 }
2222 return USEARCH_DONE;
2223 }
2224
2225 /**
2226 * Take the rearranged start accents and tries matching. If match failed at
2227 * a seperate following set of accents (seperated from the rearranged on by
2228 * at least a base character) then we rearrange the preceding accents and
2229 * tries matching again.
2230 * We allow skipping of the ends of the accent set if the ces do not match.
2231 * However if the failure is found before the accent set, it fails.
2232 * Internal method, status assumed to be success, caller has to check status
2233 * before calling this method.
2234 * @param strsrch string search data
2235 * @param textoffset of the ends of the rearranged accent
2236 * @param status output error status if any
2237 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2238 * offset of the match. Note this start includes all following accents.
2239 */
2240 static
2241 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2242 int32_t textoffset,
2243 UErrorCode *status)
2244 {
2245 const UChar *text = strsrch->search->text;
2246 const UCollator *collator = strsrch->collator;
2247 int32_t safelength = 0;
2248 UChar *safetext;
2249 int32_t safetextlength;
2250 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2251 int32_t safeoffset = textoffset;
2252
2253 if (textoffset &&
2254 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2255 u_strlen(strsrch->canonicalPrefixAccents) - 1
2256 ], collator)) {
2257 safeoffset = getNextSafeOffset(collator, text, textoffset,
2258 strsrch->search->textLength);
2259 safelength = safeoffset - textoffset;
2260 safetextlength = INITIAL_ARRAY_SIZE_;
2261 safetext = addToUCharArray(safebuffer, &safetextlength,
2262 strsrch->canonicalPrefixAccents,
2263 text + textoffset, safelength,
2264 NULL, status);
2265 }
2266 else {
2267 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2268 safetext = strsrch->canonicalPrefixAccents;
2269 }
2270
2271 UCollationElements *coleiter = strsrch->utilIter;
2272 // if status is a failure, ucol_setText does nothing
2273 ucol_setText(coleiter, safetext, safetextlength, status);
2274 // status checked in loop below
2275
2276 int32_t *ce = strsrch->pattern.ces;
2277 int32_t celength = strsrch->pattern.cesLength;
2278 int ceindex = 0;
2279 UBool isSafe = TRUE; // safe zone indication flag for position
2280 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2281
2282 while (ceindex < celength) {
2283 int32_t textce = ucol_next(coleiter, status);
2284 if (U_FAILURE(*status)) {
2285 if (isSafe) {
2286 cleanUpSafeText(strsrch, safetext, safebuffer);
2287 }
2288 return USEARCH_DONE;
2289 }
2290 if (textce == UCOL_NULLORDER) {
2291 // check if we have passed the safe buffer
2292 if (coleiter == strsrch->textIter) {
2293 cleanUpSafeText(strsrch, safetext, safebuffer);
2294 return USEARCH_DONE;
2295 }
2296 cleanUpSafeText(strsrch, safetext, safebuffer);
2297 safetext = safebuffer;
2298 coleiter = strsrch->textIter;
2299 setColEIterOffset(coleiter, safeoffset);
2300 // status checked at the start of the loop
2301 isSafe = FALSE;
2302 continue;
2303 }
2304 textce = getCE(strsrch, textce);
2305 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2306 // do the beginning stuff
2307 int32_t failedoffset = ucol_getOffset(coleiter);
2308 if (isSafe && failedoffset <= prefixlength) {
2309 // alas... no hope. failed at rearranged accent set
2310 cleanUpSafeText(strsrch, safetext, safebuffer);
2311 return USEARCH_DONE;
2312 }
2313 else {
2314 if (isSafe) {
2315 failedoffset = safeoffset - failedoffset;
2316 cleanUpSafeText(strsrch, safetext, safebuffer);
2317 }
2318
2319 // try rearranging the end accents
2320 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2321 textoffset, failedoffset, status);
2322 if (result != USEARCH_DONE) {
2323 // if status is a failure, ucol_setOffset does nothing
2324 setColEIterOffset(strsrch->textIter, result);
2325 }
2326 if (U_FAILURE(*status)) {
2327 return USEARCH_DONE;
2328 }
2329 return result;
2330 }
2331 }
2332 if (textce == ce[ceindex]) {
2333 ceindex ++;
2334 }
2335 }
2336 // set offset here
2337 if (isSafe) {
2338 int32_t result = ucol_getOffset(coleiter);
2339 // sets the text iterator here with the correct expansion and offset
2340 int32_t leftoverces = getExpansionSuffix(coleiter);
2341 cleanUpSafeText(strsrch, safetext, safebuffer);
2342 if (result <= prefixlength) {
2343 result = textoffset;
2344 }
2345 else {
2346 result = textoffset + (safeoffset - result);
2347 }
2348 setColEIterOffset(strsrch->textIter, result);
2349 setExpansionSuffix(strsrch->textIter, leftoverces);
2350 return result;
2351 }
2352
2353 return ucol_getOffset(coleiter);
2354 }
2355
2356 /**
2357 * Trying out the substring and sees if it can be a canonical match.
2358 * This will try normalizing the starting accents and arranging them into
2359 * canonical equivalents and check their corresponding ces with the pattern ce.
2360 * Prefix accents in the text will be grouped according to their combining
2361 * class and the groups will be mixed and matched to try find the perfect
2362 * match with the pattern.
2363 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2364 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2365 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2366 * "\u0301\u0325".
2367 * step 2: check if any of the generated substrings matches the pattern.
2368 * Internal method, status assumed to be success, caller has to check status
2369 * before calling this method.
2370 * @param strsrch string search data
2371 * @param textoffset start offset in the collation element text that starts
2372 * with the accents to be rearranged
2373 * @param status output error status if any
2374 * @return TRUE if the match is valid, FALSE otherwise
2375 */
2376 static
2377 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2378 int32_t textoffset,
2379 UErrorCode *status)
2380 {
2381 const UChar *text = strsrch->search->text;
2382 int32_t temp = textoffset;
2383 int32_t textlength = strsrch->search->textLength;
2384 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2385 UCollationElements *coleiter = strsrch->textIter;
2386 int32_t offset = ucol_getOffset(coleiter);
2387 if (strsrch->pattern.hasSuffixAccents) {
2388 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2389 offset, status);
2390 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2391 setColEIterOffset(coleiter, offset);
2392 return TRUE;
2393 }
2394 }
2395 return FALSE;
2396 }
2397
2398 if (!strsrch->pattern.hasPrefixAccents) {
2399 return FALSE;
2400 }
2401
2402 UChar accents[INITIAL_ARRAY_SIZE_];
2403 // offset to the last base character in substring to search
2404 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2405 // normalizing the offensive string
2406 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2407 0, accents, INITIAL_ARRAY_SIZE_, status);
2408 // status checked in loop
2409
2410 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2411 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2412
2413 // 2 power n - 1 plus the full set of accents
2414 int32_t count = (2 << (size - 1)) - 1;
2415 while (U_SUCCESS(*status) && count > 0) {
2416 UChar *rearrange = strsrch->canonicalPrefixAccents;
2417 // copy the base characters
2418 for (int k = 0; k < accentsindex[0]; k ++) {
2419 *rearrange ++ = accents[k];
2420 }
2421 // forming all possible canonical rearrangement by dropping
2422 // sets of accents
2423 for (int i = 0; i <= size - 1; i ++) {
2424 int32_t mask = 1 << (size - i - 1);
2425 if (count & mask) {
2426 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2427 *rearrange ++ = accents[j];
2428 }
2429 }
2430 }
2431 *rearrange = 0;
2432 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2433 baseoffset, status);
2434 if (offset != USEARCH_DONE) {
2435 return TRUE; // match found
2436 }
2437 count --;
2438 }
2439 return FALSE;
2440 }
2441
2442 /**
2443 * Checks match for contraction.
2444 * If the match starts with a partial contraction we fail.
2445 * Internal method, status assumed to be success, caller has to check status
2446 * before calling this method.
2447 * @param strsrch string search data
2448 * @param start offset of potential match, to be modified if necessary
2449 * @param end offset of potential match, to be modified if necessary
2450 * @param status only error status if any
2451 * @return TRUE if match passes the contraction test, FALSE otherwise
2452 */
2453 static
2454 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2455 int32_t *start,
2456 int32_t *end, UErrorCode *status)
2457 {
2458 UCollationElements *coleiter = strsrch->textIter;
2459 int32_t textlength = strsrch->search->textLength;
2460 int32_t temp = *end;
2461 const UCollator *collator = strsrch->collator;
2462 const UChar *text = strsrch->search->text;
2463 // This part checks if either if the start of the match contains potential
2464 // contraction. If so we'll have to iterate through them
2465 // Since we used ucol_next while previously looking for the potential
2466 // match, this guarantees that our end will not be a partial contraction,
2467 // or a partial supplementary character.
2468 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2469 int32_t expansion = getExpansionSuffix(coleiter);
2470 UBool expandflag = expansion > 0;
2471 setColEIterOffset(coleiter, *end);
2472 while (expansion > 0) {
2473 // getting rid of the redundant ce
2474 // since forward contraction/expansion may have extra ces
2475 // if we are in the normalization buffer, hasAccentsBeforeMatch
2476 // would have taken care of it.
2477 // E.g. the character \u01FA will have an expansion of 3, but if
2478 // we are only looking for A ring A\u030A, we'll have to skip the
2479 // last ce in the expansion buffer
2480 ucol_previous(coleiter, status);
2481 if (U_FAILURE(*status)) {
2482 return FALSE;
2483 }
2484 if (ucol_getOffset(coleiter) != temp) {
2485 *end = temp;
2486 temp = ucol_getOffset(coleiter);
2487 }
2488 expansion --;
2489 }
2490
2491 int32_t *patternce = strsrch->pattern.ces;
2492 int32_t patterncelength = strsrch->pattern.cesLength;
2493 int32_t count = patterncelength;
2494 while (count > 0) {
2495 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2496 // status checked below, note that if status is a failure
2497 // ucol_previous returns UCOL_NULLORDER
2498 if (ce == UCOL_IGNORABLE) {
2499 continue;
2500 }
2501 if (expandflag && count == 0 &&
2502 getColElemIterOffset(coleiter, FALSE) != temp) {
2503 *end = temp;
2504 temp = ucol_getOffset(coleiter);
2505 }
2506 if (count == patterncelength &&
2507 ce != patternce[patterncelength - 1]) {
2508 // accents may have extra starting ces, this occurs when a
2509 // pure accent pattern is matched without rearrangement
2510 int32_t expected = patternce[patterncelength - 1];
2511 U16_BACK_1(text, 0, *end);
2512 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2513 ce = getCE(strsrch, ucol_previous(coleiter, status));
2514 while (U_SUCCESS(*status) && ce != expected &&
2515 ce != UCOL_NULLORDER &&
2516 ucol_getOffset(coleiter) <= *start) {
2517 ce = getCE(strsrch, ucol_previous(coleiter, status));
2518 }
2519 }
2520 }
2521 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2522 (*start) --;
2523 *start = getPreviousBaseOffset(text, *start);
2524 return FALSE;
2525 }
2526 count --;
2527 }
2528 }
2529 return TRUE;
2530 }
2531
2532 /**
2533 * Checks and sets the match information if found.
2534 * Checks
2535 * <ul>
2536 * <li> the potential match does not repeat the previous match
2537 * <li> boundaries are correct
2538 * <li> potential match does not end in the middle of a contraction
2539 * <li> identical matches
2540 * <\ul>
2541 * Otherwise the offset will be shifted to the next character.
2542 * Internal method, status assumed to be success, caller has to check status
2543 * before calling this method.
2544 * @param strsrch string search data
2545 * @param textoffset offset in the collation element text. the returned value
2546 * will be the truncated start offset of the match or the new start
2547 * search offset.
2548 * @param status only error status if any
2549 * @return TRUE if the match is valid, FALSE otherwise
2550 */
2551 static
2552 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2553 int32_t *textoffset,
2554 UErrorCode *status)
2555 {
2556 // to ensure that the start and ends are not composite characters
2557 UCollationElements *coleiter = strsrch->textIter;
2558 // if we have a canonical accent match
2559 if ((strsrch->pattern.hasSuffixAccents &&
2560 strsrch->canonicalSuffixAccents[0]) ||
2561 (strsrch->pattern.hasPrefixAccents &&
2562 strsrch->canonicalPrefixAccents[0])) {
2563 strsrch->search->matchedIndex = *textoffset;
2564 strsrch->search->matchedLength =
2565 getNextUStringSearchBaseOffset(strsrch,
2566 getColElemIterOffset(coleiter, FALSE))
2567 - *textoffset;
2568 return TRUE;
2569 }
2570
2571 int32_t end = ucol_getOffset(coleiter);
2572 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2573 status) ||
2574 U_FAILURE(*status)) {
2575 return FALSE;
2576 }
2577
2578 end = getNextUStringSearchBaseOffset(strsrch, end);
2579 // this totally matches, however we need to check if it is repeating
2580 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2581 !isBreakUnit(strsrch, *textoffset, end) ||
2582 !checkIdentical(strsrch, *textoffset, end)) {
2583 (*textoffset) --;
2584 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2585 *textoffset);
2586 return FALSE;
2587 }
2588
2589 strsrch->search->matchedIndex = *textoffset;
2590 strsrch->search->matchedLength = end - *textoffset;
2591 return TRUE;
2592 }
2593 #endif // #if BOYER_MOORE
2594
2595 // constructors and destructor -------------------------------------------
2596
2597 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2598 int32_t patternlength,
2599 const UChar *text,
2600 int32_t textlength,
2601 const char *locale,
2602 UBreakIterator *breakiter,
2603 UErrorCode *status)
2604 {
2605 if (U_FAILURE(*status)) {
2606 return NULL;
2607 }
2608 #if UCONFIG_NO_BREAK_ITERATION
2609 if (breakiter != NULL) {
2610 *status = U_UNSUPPORTED_ERROR;
2611 return NULL;
2612 }
2613 #endif
2614 if (locale) {
2615 // ucol_open internally checks for status
2616 UCollator *collator = ucol_open(locale, status);
2617 // pattern, text checks are done in usearch_openFromCollator
2618 UStringSearch *result = usearch_openFromCollator(pattern,
2619 patternlength, text, textlength,
2620 collator, breakiter, status);
2621
2622 if (result == NULL || U_FAILURE(*status)) {
2623 if (collator) {
2624 ucol_close(collator);
2625 }
2626 return NULL;
2627 }
2628 else {
2629 result->ownCollator = TRUE;
2630 }
2631 return result;
2632 }
2633 *status = U_ILLEGAL_ARGUMENT_ERROR;
2634 return NULL;
2635 }
2636
2637 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2638 const UChar *pattern,
2639 int32_t patternlength,
2640 const UChar *text,
2641 int32_t textlength,
2642 const UCollator *collator,
2643 UBreakIterator *breakiter,
2644 UErrorCode *status)
2645 {
2646 if (U_FAILURE(*status)) {
2647 return NULL;
2648 }
2649 #if UCONFIG_NO_BREAK_ITERATION
2650 if (breakiter != NULL) {
2651 *status = U_UNSUPPORTED_ERROR;
2652 return NULL;
2653 }
2654 #endif
2655 if (pattern == NULL || text == NULL || collator == NULL) {
2656 *status = U_ILLEGAL_ARGUMENT_ERROR;
2657 return NULL;
2658 }
2659
2660 // string search does not really work when numeric collation is turned on
2661 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2662 *status = U_UNSUPPORTED_ERROR;
2663 return NULL;
2664 }
2665
2666 if (U_SUCCESS(*status)) {
2667 initializeFCD(status);
2668 if (U_FAILURE(*status)) {
2669 return NULL;
2670 }
2671
2672 UStringSearch *result;
2673 if (textlength == -1) {
2674 textlength = u_strlen(text);
2675 }
2676 if (patternlength == -1) {
2677 patternlength = u_strlen(pattern);
2678 }
2679 if (textlength <= 0 || patternlength <= 0) {
2680 *status = U_ILLEGAL_ARGUMENT_ERROR;
2681 return NULL;
2682 }
2683
2684 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2685 if (result == NULL) {
2686 *status = U_MEMORY_ALLOCATION_ERROR;
2687 return NULL;
2688 }
2689
2690 result->collator = collator;
2691 result->strength = ucol_getStrength(collator);
2692 result->ceMask = getMask(result->strength);
2693 result->toShift =
2694 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2695 UCOL_SHIFTED;
2696 result->variableTop = ucol_getVariableTop(collator, status);
2697
2698 result->nfd = Normalizer2::getNFDInstance(*status);
2699
2700 if (U_FAILURE(*status)) {
2701 uprv_free(result);
2702 return NULL;
2703 }
2704
2705 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2706 if (result->search == NULL) {
2707 *status = U_MEMORY_ALLOCATION_ERROR;
2708 uprv_free(result);
2709 return NULL;
2710 }
2711
2712 result->search->text = text;
2713 result->search->textLength = textlength;
2714
2715 result->pattern.text = pattern;
2716 result->pattern.textLength = patternlength;
2717 result->pattern.ces = NULL;
2718 result->pattern.pces = NULL;
2719
2720 result->search->breakIter = breakiter;
2721 #if !UCONFIG_NO_BREAK_ITERATION
2722 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2723 if (breakiter) {
2724 ubrk_setText(breakiter, text, textlength, status);
2725 }
2726 #endif
2727
2728 result->ownCollator = FALSE;
2729 result->search->matchedLength = 0;
2730 result->search->matchedIndex = USEARCH_DONE;
2731 result->utilIter = NULL;
2732 result->textIter = ucol_openElements(collator, text,
2733 textlength, status);
2734 result->textProcessedIter = NULL;
2735 if (U_FAILURE(*status)) {
2736 usearch_close(result);
2737 return NULL;
2738 }
2739
2740 result->search->isOverlap = FALSE;
2741 result->search->isCanonicalMatch = FALSE;
2742 result->search->elementComparisonType = 0;
2743 result->search->isForwardSearching = TRUE;
2744 result->search->reset = TRUE;
2745
2746 initialize(result, status);
2747
2748 if (U_FAILURE(*status)) {
2749 usearch_close(result);
2750 return NULL;
2751 }
2752
2753 return result;
2754 }
2755 return NULL;
2756 }
2757
2758 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2759 {
2760 if (strsrch) {
2761 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2762 strsrch->pattern.ces) {
2763 uprv_free(strsrch->pattern.ces);
2764 }
2765
2766 if (strsrch->pattern.pces != NULL &&
2767 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2768 uprv_free(strsrch->pattern.pces);
2769 }
2770
2771 delete strsrch->textProcessedIter;
2772 ucol_closeElements(strsrch->textIter);
2773 ucol_closeElements(strsrch->utilIter);
2774
2775 if (strsrch->ownCollator && strsrch->collator) {
2776 ucol_close((UCollator *)strsrch->collator);
2777 }
2778
2779 #if !UCONFIG_NO_BREAK_ITERATION
2780 if (strsrch->search->internalBreakIter) {
2781 ubrk_close(strsrch->search->internalBreakIter);
2782 }
2783 #endif
2784
2785 uprv_free(strsrch->search);
2786 uprv_free(strsrch);
2787 }
2788 }
2789
2790 namespace {
2791
2792 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2793 if (U_FAILURE(*status)) { return FALSE; }
2794 if (strsrch->textProcessedIter == NULL) {
2795 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2796 if (strsrch->textProcessedIter == NULL) {
2797 *status = U_MEMORY_ALLOCATION_ERROR;
2798 return FALSE;
2799 }
2800 } else {
2801 strsrch->textProcessedIter->init(strsrch->textIter);
2802 }
2803 return TRUE;
2804 }
2805
2806 }
2807
2808 // set and get methods --------------------------------------------------
2809
2810 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2811 int32_t position,
2812 UErrorCode *status)
2813 {
2814 if (U_SUCCESS(*status) && strsrch) {
2815 if (isOutOfBounds(strsrch->search->textLength, position)) {
2816 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2817 }
2818 else {
2819 setColEIterOffset(strsrch->textIter, position);
2820 }
2821 strsrch->search->matchedIndex = USEARCH_DONE;
2822 strsrch->search->matchedLength = 0;
2823 strsrch->search->reset = FALSE;
2824 }
2825 }
2826
2827 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2828 {
2829 if (strsrch) {
2830 int32_t result = ucol_getOffset(strsrch->textIter);
2831 if (isOutOfBounds(strsrch->search->textLength, result)) {
2832 return USEARCH_DONE;
2833 }
2834 return result;
2835 }
2836 return USEARCH_DONE;
2837 }
2838
2839 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2840 USearchAttribute attribute,
2841 USearchAttributeValue value,
2842 UErrorCode *status)
2843 {
2844 if (U_SUCCESS(*status) && strsrch) {
2845 switch (attribute)
2846 {
2847 case USEARCH_OVERLAP :
2848 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2849 break;
2850 case USEARCH_CANONICAL_MATCH :
2851 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2852 FALSE);
2853 break;
2854 case USEARCH_ELEMENT_COMPARISON :
2855 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2856 strsrch->search->elementComparisonType = (int16_t)value;
2857 } else {
2858 strsrch->search->elementComparisonType = 0;
2859 }
2860 break;
2861 case USEARCH_ATTRIBUTE_COUNT :
2862 default:
2863 *status = U_ILLEGAL_ARGUMENT_ERROR;
2864 }
2865 }
2866 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2867 *status = U_ILLEGAL_ARGUMENT_ERROR;
2868 }
2869 }
2870
2871 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2872 const UStringSearch *strsrch,
2873 USearchAttribute attribute)
2874 {
2875 if (strsrch) {
2876 switch (attribute) {
2877 case USEARCH_OVERLAP :
2878 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2879 USEARCH_OFF);
2880 case USEARCH_CANONICAL_MATCH :
2881 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2882 USEARCH_OFF);
2883 case USEARCH_ELEMENT_COMPARISON :
2884 {
2885 int16_t value = strsrch->search->elementComparisonType;
2886 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2887 return (USearchAttributeValue)value;
2888 } else {
2889 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2890 }
2891 }
2892 case USEARCH_ATTRIBUTE_COUNT :
2893 return USEARCH_DEFAULT;
2894 }
2895 }
2896 return USEARCH_DEFAULT;
2897 }
2898
2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2900 const UStringSearch *strsrch)
2901 {
2902 if (strsrch == NULL) {
2903 return USEARCH_DONE;
2904 }
2905 return strsrch->search->matchedIndex;
2906 }
2907
2908
2909 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2910 UChar *result,
2911 int32_t resultCapacity,
2912 UErrorCode *status)
2913 {
2914 if (U_FAILURE(*status)) {
2915 return USEARCH_DONE;
2916 }
2917 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2918 result == NULL)) {
2919 *status = U_ILLEGAL_ARGUMENT_ERROR;
2920 return USEARCH_DONE;
2921 }
2922
2923 int32_t copylength = strsrch->search->matchedLength;
2924 int32_t copyindex = strsrch->search->matchedIndex;
2925 if (copyindex == USEARCH_DONE) {
2926 u_terminateUChars(result, resultCapacity, 0, status);
2927 return USEARCH_DONE;
2928 }
2929
2930 if (resultCapacity < copylength) {
2931 copylength = resultCapacity;
2932 }
2933 if (copylength > 0) {
2934 uprv_memcpy(result, strsrch->search->text + copyindex,
2935 copylength * sizeof(UChar));
2936 }
2937 return u_terminateUChars(result, resultCapacity,
2938 strsrch->search->matchedLength, status);
2939 }
2940
2941 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2942 const UStringSearch *strsrch)
2943 {
2944 if (strsrch) {
2945 return strsrch->search->matchedLength;
2946 }
2947 return USEARCH_DONE;
2948 }
2949
2950 #if !UCONFIG_NO_BREAK_ITERATION
2951
2952 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2953 UBreakIterator *breakiter,
2954 UErrorCode *status)
2955 {
2956 if (U_SUCCESS(*status) && strsrch) {
2957 strsrch->search->breakIter = breakiter;
2958 if (breakiter) {
2959 ubrk_setText(breakiter, strsrch->search->text,
2960 strsrch->search->textLength, status);
2961 }
2962 }
2963 }
2964
2965 U_CAPI const UBreakIterator* U_EXPORT2
2966 usearch_getBreakIterator(const UStringSearch *strsrch)
2967 {
2968 if (strsrch) {
2969 return strsrch->search->breakIter;
2970 }
2971 return NULL;
2972 }
2973
2974 #endif
2975
2976 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2977 const UChar *text,
2978 int32_t textlength,
2979 UErrorCode *status)
2980 {
2981 if (U_SUCCESS(*status)) {
2982 if (strsrch == NULL || text == NULL || textlength < -1 ||
2983 textlength == 0) {
2984 *status = U_ILLEGAL_ARGUMENT_ERROR;
2985 }
2986 else {
2987 if (textlength == -1) {
2988 textlength = u_strlen(text);
2989 }
2990 strsrch->search->text = text;
2991 strsrch->search->textLength = textlength;
2992 ucol_setText(strsrch->textIter, text, textlength, status);
2993 strsrch->search->matchedIndex = USEARCH_DONE;
2994 strsrch->search->matchedLength = 0;
2995 strsrch->search->reset = TRUE;
2996 #if !UCONFIG_NO_BREAK_ITERATION
2997 if (strsrch->search->breakIter != NULL) {
2998 ubrk_setText(strsrch->search->breakIter, text,
2999 textlength, status);
3000 }
3001 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
3002 #endif
3003 }
3004 }
3005 }
3006
3007 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3008 int32_t *length)
3009 {
3010 if (strsrch) {
3011 *length = strsrch->search->textLength;
3012 return strsrch->search->text;
3013 }
3014 return NULL;
3015 }
3016
3017 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
3018 const UCollator *collator,
3019 UErrorCode *status)
3020 {
3021 if (U_SUCCESS(*status)) {
3022 if (collator == NULL) {
3023 *status = U_ILLEGAL_ARGUMENT_ERROR;
3024 return;
3025 }
3026
3027 if (strsrch) {
3028 delete strsrch->textProcessedIter;
3029 strsrch->textProcessedIter = NULL;
3030 ucol_closeElements(strsrch->textIter);
3031 ucol_closeElements(strsrch->utilIter);
3032 strsrch->textIter = strsrch->utilIter = NULL;
3033 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3034 ucol_close((UCollator *)strsrch->collator);
3035 strsrch->ownCollator = FALSE;
3036 }
3037 strsrch->collator = collator;
3038 strsrch->strength = ucol_getStrength(collator);
3039 strsrch->ceMask = getMask(strsrch->strength);
3040 #if !UCONFIG_NO_BREAK_ITERATION
3041 ubrk_close(strsrch->search->internalBreakIter);
3042 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3043 strsrch->search->text, strsrch->search->textLength, status);
3044 #endif
3045 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3046 strsrch->toShift =
3047 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3048 UCOL_SHIFTED;
3049 // if status is a failure, ucol_getVariableTop returns 0
3050 strsrch->variableTop = ucol_getVariableTop(collator, status);
3051 strsrch->textIter = ucol_openElements(collator,
3052 strsrch->search->text,
3053 strsrch->search->textLength,
3054 status);
3055 strsrch->utilIter = ucol_openElements(
3056 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3057 // initialize() _after_ setting the iterators for the new collator.
3058 initialize(strsrch, status);
3059 }
3060
3061 // **** are these calls needed?
3062 // **** we call uprv_init_pce in initializePatternPCETable
3063 // **** and the CEIBuffer constructor...
3064 #if 0
3065 uprv_init_pce(strsrch->textIter);
3066 uprv_init_pce(strsrch->utilIter);
3067 #endif
3068 }
3069 }
3070
3071 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3072 {
3073 if (strsrch) {
3074 return (UCollator *)strsrch->collator;
3075 }
3076 return NULL;
3077 }
3078
3079 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
3080 const UChar *pattern,
3081 int32_t patternlength,
3082 UErrorCode *status)
3083 {
3084 if (U_SUCCESS(*status)) {
3085 if (strsrch == NULL || pattern == NULL) {
3086 *status = U_ILLEGAL_ARGUMENT_ERROR;
3087 }
3088 else {
3089 if (patternlength == -1) {
3090 patternlength = u_strlen(pattern);
3091 }
3092 if (patternlength == 0) {
3093 *status = U_ILLEGAL_ARGUMENT_ERROR;
3094 return;
3095 }
3096 strsrch->pattern.text = pattern;
3097 strsrch->pattern.textLength = patternlength;
3098 initialize(strsrch, status);
3099 }
3100 }
3101 }
3102
3103 U_CAPI const UChar* U_EXPORT2
3104 usearch_getPattern(const UStringSearch *strsrch,
3105 int32_t *length)
3106 {
3107 if (strsrch) {
3108 *length = strsrch->pattern.textLength;
3109 return strsrch->pattern.text;
3110 }
3111 return NULL;
3112 }
3113
3114 // miscellanous methods --------------------------------------------------
3115
3116 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3117 UErrorCode *status)
3118 {
3119 if (strsrch && U_SUCCESS(*status)) {
3120 strsrch->search->isForwardSearching = TRUE;
3121 usearch_setOffset(strsrch, 0, status);
3122 if (U_SUCCESS(*status)) {
3123 return usearch_next(strsrch, status);
3124 }
3125 }
3126 return USEARCH_DONE;
3127 }
3128
3129 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3130 int32_t position,
3131 UErrorCode *status)
3132 {
3133 if (strsrch && U_SUCCESS(*status)) {
3134 strsrch->search->isForwardSearching = TRUE;
3135 // position checked in usearch_setOffset
3136 usearch_setOffset(strsrch, position, status);
3137 if (U_SUCCESS(*status)) {
3138 return usearch_next(strsrch, status);
3139 }
3140 }
3141 return USEARCH_DONE;
3142 }
3143
3144 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3145 UErrorCode *status)
3146 {
3147 if (strsrch && U_SUCCESS(*status)) {
3148 strsrch->search->isForwardSearching = FALSE;
3149 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3150 if (U_SUCCESS(*status)) {
3151 return usearch_previous(strsrch, status);
3152 }
3153 }
3154 return USEARCH_DONE;
3155 }
3156
3157 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3158 int32_t position,
3159 UErrorCode *status)
3160 {
3161 if (strsrch && U_SUCCESS(*status)) {
3162 strsrch->search->isForwardSearching = FALSE;
3163 // position checked in usearch_setOffset
3164 usearch_setOffset(strsrch, position, status);
3165 if (U_SUCCESS(*status)) {
3166 return usearch_previous(strsrch, status);
3167 }
3168 }
3169 return USEARCH_DONE;
3170 }
3171
3172 /**
3173 * If a direction switch is required, we'll count the number of ces till the
3174 * beginning of the collation element iterator and iterate forwards that
3175 * number of times. This is so that we get to the correct point within the
3176 * string to continue the search in. Imagine when we are in the middle of the
3177 * normalization buffer when the change in direction is request. arrrgghh....
3178 * After searching the offset within the collation element iterator will be
3179 * shifted to the start of the match. If a match is not found, the offset would
3180 * have been set to the end of the text string in the collation element
3181 * iterator.
3182 * Okay, here's my take on normalization buffer. The only time when there can
3183 * be 2 matches within the same normalization is when the pattern is consists
3184 * of all accents. But since the offset returned is from the text string, we
3185 * should not confuse the caller by returning the second match within the
3186 * same normalization buffer. If we do, the 2 results will have the same match
3187 * offsets, and that'll be confusing. I'll return the next match that doesn't
3188 * fall within the same normalization buffer. Note this does not affect the
3189 * results of matches spanning the text and the normalization buffer.
3190 * The position to start searching is taken from the collation element
3191 * iterator. Callers of this API would have to set the offset in the collation
3192 * element iterator before using this method.
3193 */
3194 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3195 UErrorCode *status)
3196 {
3197 if (U_SUCCESS(*status) && strsrch) {
3198 // note offset is either equivalent to the start of the previous match
3199 // or is set by the user
3200 int32_t offset = usearch_getOffset(strsrch);
3201 USearch *search = strsrch->search;
3202 search->reset = FALSE;
3203 int32_t textlength = search->textLength;
3204 if (search->isForwardSearching) {
3205 #if BOYER_MOORE
3206 if (offset == textlength
3207 || (!search->isOverlap &&
3208 (offset + strsrch->pattern.defaultShiftSize > textlength ||
3209 (search->matchedIndex != USEARCH_DONE &&
3210 offset + search->matchedLength >= textlength)))) {
3211 // not enough characters to match
3212 setMatchNotFound(strsrch);
3213 return USEARCH_DONE;
3214 }
3215 #else
3216 if (offset == textlength ||
3217 (! search->isOverlap &&
3218 (search->matchedIndex != USEARCH_DONE &&
3219 offset + search->matchedLength > textlength))) {
3220 // not enough characters to match
3221 setMatchNotFound(strsrch);
3222 return USEARCH_DONE;
3223 }
3224 #endif
3225 }
3226 else {
3227 // switching direction.
3228 // if matchedIndex == USEARCH_DONE, it means that either a
3229 // setOffset has been called or that previous ran off the text
3230 // string. the iterator would have been set to offset 0 if a
3231 // match is not found.
3232 search->isForwardSearching = TRUE;
3233 if (search->matchedIndex != USEARCH_DONE) {
3234 // there's no need to set the collation element iterator
3235 // the next call to next will set the offset.
3236 return search->matchedIndex;
3237 }
3238 }
3239
3240 if (U_SUCCESS(*status)) {
3241 if (strsrch->pattern.cesLength == 0) {
3242 if (search->matchedIndex == USEARCH_DONE) {
3243 search->matchedIndex = offset;
3244 }
3245 else { // moves by codepoints
3246 U16_FWD_1(search->text, search->matchedIndex, textlength);
3247 }
3248
3249 search->matchedLength = 0;
3250 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3251 // status checked below
3252 if (search->matchedIndex == textlength) {
3253 search->matchedIndex = USEARCH_DONE;
3254 }
3255 }
3256 else {
3257 if (search->matchedLength > 0) {
3258 // if matchlength is 0 we are at the start of the iteration
3259 if (search->isOverlap) {
3260 ucol_setOffset(strsrch->textIter, offset + 1, status);
3261 }
3262 else {
3263 ucol_setOffset(strsrch->textIter,
3264 offset + search->matchedLength, status);
3265 }
3266 }
3267 else {
3268 // for boundary check purposes. this will ensure that the
3269 // next match will not preceed the current offset
3270 // note search->matchedIndex will always be set to something
3271 // in the code
3272 search->matchedIndex = offset - 1;
3273 }
3274
3275 if (search->isCanonicalMatch) {
3276 // can't use exact here since extra accents are allowed.
3277 usearch_handleNextCanonical(strsrch, status);
3278 }
3279 else {
3280 usearch_handleNextExact(strsrch, status);
3281 }
3282 }
3283
3284 if (U_FAILURE(*status)) {
3285 return USEARCH_DONE;
3286 }
3287
3288 #if !BOYER_MOORE
3289 if (search->matchedIndex == USEARCH_DONE) {
3290 ucol_setOffset(strsrch->textIter, search->textLength, status);
3291 } else {
3292 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3293 }
3294 #endif
3295
3296 return search->matchedIndex;
3297 }
3298 }
3299 return USEARCH_DONE;
3300 }
3301
3302 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3303 UErrorCode *status)
3304 {
3305 if (U_SUCCESS(*status) && strsrch) {
3306 int32_t offset;
3307 USearch *search = strsrch->search;
3308 if (search->reset) {
3309 offset = search->textLength;
3310 search->isForwardSearching = FALSE;
3311 search->reset = FALSE;
3312 setColEIterOffset(strsrch->textIter, offset);
3313 }
3314 else {
3315 offset = usearch_getOffset(strsrch);
3316 }
3317
3318 int32_t matchedindex = search->matchedIndex;
3319 if (search->isForwardSearching == TRUE) {
3320 // switching direction.
3321 // if matchedIndex == USEARCH_DONE, it means that either a
3322 // setOffset has been called or that next ran off the text
3323 // string. the iterator would have been set to offset textLength if
3324 // a match is not found.
3325 search->isForwardSearching = FALSE;
3326 if (matchedindex != USEARCH_DONE) {
3327 return matchedindex;
3328 }
3329 }
3330 else {
3331 #if BOYER_MOORE
3332 if (offset == 0 || matchedindex == 0 ||
3333 (!search->isOverlap &&
3334 (offset < strsrch->pattern.defaultShiftSize ||
3335 (matchedindex != USEARCH_DONE &&
3336 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3337 // not enough characters to match
3338 setMatchNotFound(strsrch);
3339 return USEARCH_DONE;
3340 }
3341 #else
3342 // Could check pattern length, but the
3343 // linear search will do the right thing
3344 if (offset == 0 || matchedindex == 0) {
3345 setMatchNotFound(strsrch);
3346 return USEARCH_DONE;
3347 }
3348 #endif
3349 }
3350
3351 if (U_SUCCESS(*status)) {
3352 if (strsrch->pattern.cesLength == 0) {
3353 search->matchedIndex =
3354 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3355 if (search->matchedIndex == 0) {
3356 setMatchNotFound(strsrch);
3357 // status checked below
3358 }
3359 else { // move by codepoints
3360 U16_BACK_1(search->text, 0, search->matchedIndex);
3361 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3362 // status checked below
3363 search->matchedLength = 0;
3364 }
3365 }
3366 else {
3367 if (strsrch->search->isCanonicalMatch) {
3368 // can't use exact here since extra accents are allowed.
3369 usearch_handlePreviousCanonical(strsrch, status);
3370 // status checked below
3371 }
3372 else {
3373 usearch_handlePreviousExact(strsrch, status);
3374 // status checked below
3375 }
3376 }
3377
3378 if (U_FAILURE(*status)) {
3379 return USEARCH_DONE;
3380 }
3381
3382 return search->matchedIndex;
3383 }
3384 }
3385 return USEARCH_DONE;
3386 }
3387
3388
3389
3390 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3391 {
3392 /*
3393 reset is setting the attributes that are already in
3394 string search, hence all attributes in the collator should
3395 be retrieved without any problems
3396 */
3397 if (strsrch) {
3398 UErrorCode status = U_ZERO_ERROR;
3399 UBool sameCollAttribute = TRUE;
3400 uint32_t ceMask;
3401 UBool shift;
3402 uint32_t varTop;
3403
3404 // **** hack to deal w/ how processed CEs encode quaternary ****
3405 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3406 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3407 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3408 sameCollAttribute = FALSE;
3409 }
3410
3411 strsrch->strength = ucol_getStrength(strsrch->collator);
3412 ceMask = getMask(strsrch->strength);
3413 if (strsrch->ceMask != ceMask) {
3414 strsrch->ceMask = ceMask;
3415 sameCollAttribute = FALSE;
3416 }
3417
3418 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3419 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3420 &status) == UCOL_SHIFTED;
3421 if (strsrch->toShift != shift) {
3422 strsrch->toShift = shift;
3423 sameCollAttribute = FALSE;
3424 }
3425
3426 // if status is a failure, ucol_getVariableTop returns 0
3427 varTop = ucol_getVariableTop(strsrch->collator, &status);
3428 if (strsrch->variableTop != varTop) {
3429 strsrch->variableTop = varTop;
3430 sameCollAttribute = FALSE;
3431 }
3432 if (!sameCollAttribute) {
3433 initialize(strsrch, &status);
3434 }
3435 ucol_setText(strsrch->textIter, strsrch->search->text,
3436 strsrch->search->textLength,
3437 &status);
3438 strsrch->search->matchedLength = 0;
3439 strsrch->search->matchedIndex = USEARCH_DONE;
3440 strsrch->search->isOverlap = FALSE;
3441 strsrch->search->isCanonicalMatch = FALSE;
3442 strsrch->search->elementComparisonType = 0;
3443 strsrch->search->isForwardSearching = TRUE;
3444 strsrch->search->reset = TRUE;
3445 }
3446 }
3447
3448 //
3449 // CEI Collation Element + source text index.
3450 // These structs are kept in the circular buffer.
3451 //
3452 struct CEI {
3453 int64_t ce;
3454 int32_t lowIndex;
3455 int32_t highIndex;
3456 };
3457
3458 U_NAMESPACE_BEGIN
3459
3460 namespace {
3461 //
3462 // CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3463 //
3464 #define DEFAULT_CEBUFFER_SIZE 96
3465 #define CEBUFFER_EXTRA 32
3466 // Some typical max values to make buffer size more reasonable for asymmetric search.
3467 // #8694 is for a better long-term solution to allocation of this buffer.
3468 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3469 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3470 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3471 struct CEIBuffer {
3472 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3473 CEI *buf;
3474 int32_t bufSize;
3475 int32_t firstIx;
3476 int32_t limitIx;
3477 UCollationElements *ceIter;
3478 UStringSearch *strSearch;
3479
3480
3481
3482 CEIBuffer(UStringSearch *ss, UErrorCode *status);
3483 ~CEIBuffer();
3484 const CEI *get(int32_t index);
3485 const CEI *getPrevious(int32_t index);
3486 };
3487
3488
3489 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3490 buf = defBuf;
3491 strSearch = ss;
3492 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3493 if (ss->search->elementComparisonType != 0) {
3494 const UChar * patText = ss->pattern.text;
3495 if (patText) {
3496 const UChar * patTextLimit = patText + ss->pattern.textLength;
3497 while ( patText < patTextLimit ) {
3498 UChar c = *patText++;
3499 if (MIGHT_BE_JAMO_L(c)) {
3500 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3501 } else {
3502 // No check for surrogates, we might allocate slightly more buffer than necessary.
3503 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3504 }
3505 }
3506 }
3507 }
3508 ceIter = ss->textIter;
3509 firstIx = 0;
3510 limitIx = 0;
3511
3512 if (!initTextProcessedIter(ss, status)) { return; }
3513
3514 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3515 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3516 if (buf == NULL) {
3517 *status = U_MEMORY_ALLOCATION_ERROR;
3518 }
3519 }
3520 }
3521
3522 // TODO: add a reset or init function so that allocated
3523 // buffers can be retained & reused.
3524
3525 CEIBuffer::~CEIBuffer() {
3526 if (buf != defBuf) {
3527 uprv_free(buf);
3528 }
3529 }
3530
3531
3532 // Get the CE with the specified index.
3533 // Index must be in the range
3534 // n-history_size < index < n+1
3535 // where n is the largest index to have been fetched by some previous call to this function.
3536 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3537 //
3538 const CEI *CEIBuffer::get(int32_t index) {
3539 int i = index % bufSize;
3540
3541 if (index>=firstIx && index<limitIx) {
3542 // The request was for an entry already in our buffer.
3543 // Just return it.
3544 return &buf[i];
3545 }
3546
3547 // Caller is requesting a new, never accessed before, CE.
3548 // Verify that it is the next one in sequence, which is all
3549 // that is allowed.
3550 if (index != limitIx) {
3551 #if U_PLATFORM_IS_DARWIN_BASED
3552 os_log(OS_LOG_DEFAULT, "# CEIBuffer::get param err, index %d limitIx %d", index, limitIx); // <rdar://51554495>
3553 #endif
3554 U_ASSERT(FALSE);
3555
3556 return NULL;
3557 }
3558
3559 // Manage the circular CE buffer indexing
3560 limitIx++;
3561
3562 if (limitIx - firstIx >= bufSize) {
3563 // The buffer is full, knock out the lowest-indexed entry.
3564 firstIx++;
3565 }
3566
3567 UErrorCode status = U_ZERO_ERROR;
3568
3569 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3570
3571 return &buf[i];
3572 }
3573
3574 // Get the CE with the specified index.
3575 // Index must be in the range
3576 // n-history_size < index < n+1
3577 // where n is the largest index to have been fetched by some previous call to this function.
3578 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3579 //
3580 const CEI *CEIBuffer::getPrevious(int32_t index) {
3581 int i = index % bufSize;
3582
3583 if (index>=firstIx && index<limitIx) {
3584 // The request was for an entry already in our buffer.
3585 // Just return it.
3586 return &buf[i];
3587 }
3588
3589 // Caller is requesting a new, never accessed before, CE.
3590 // Verify that it is the next one in sequence, which is all
3591 // that is allowed.
3592 if (index != limitIx) {
3593 #if U_PLATFORM_IS_DARWIN_BASED
3594 os_log(OS_LOG_DEFAULT, "# CEIBuffer::getPrevious param err, index %d limitIx %d", index, limitIx); // <rdar://51554495>
3595 #endif
3596 U_ASSERT(FALSE);
3597
3598 return NULL;
3599 }
3600
3601 // Manage the circular CE buffer indexing
3602 limitIx++;
3603
3604 if (limitIx - firstIx >= bufSize) {
3605 // The buffer is full, knock out the lowest-indexed entry.
3606 firstIx++;
3607 }
3608
3609 UErrorCode status = U_ZERO_ERROR;
3610
3611 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3612
3613 return &buf[i];
3614 }
3615
3616 }
3617
3618 U_NAMESPACE_END
3619
3620
3621 // #define USEARCH_DEBUG
3622
3623 #ifdef USEARCH_DEBUG
3624 #include <stdio.h>
3625 #include <stdlib.h>
3626 #endif
3627
3628 /*
3629 * Find the next break boundary after startIndex. If the UStringSearch object
3630 * has an external break iterator, use that. Otherwise use the internal character
3631 * break iterator.
3632 */
3633 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3634 #if 0
3635 const UChar *text = strsrch->search->text;
3636 int32_t textLen = strsrch->search->textLength;
3637
3638 U_ASSERT(startIndex>=0);
3639 U_ASSERT(startIndex<=textLen);
3640
3641 if (startIndex >= textLen) {
3642 return startIndex;
3643 }
3644
3645 UChar32 c;
3646 int32_t i = startIndex;
3647 U16_NEXT(text, i, textLen, c);
3648
3649 // If we are on a control character, stop without looking for combining marks.
3650 // Control characters do not combine.
3651 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3652 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3653 return i;
3654 }
3655
3656 // The initial character was not a control, and can thus accept trailing
3657 // combining characters. Advance over however many of them there are.
3658 int32_t indexOfLastCharChecked;
3659 for (;;) {
3660 indexOfLastCharChecked = i;
3661 if (i>=textLen) {
3662 break;
3663 }
3664 U16_NEXT(text, i, textLen, c);
3665 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3666 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3667 break;
3668 }
3669 }
3670 return indexOfLastCharChecked;
3671 #elif !UCONFIG_NO_BREAK_ITERATION
3672 UBreakIterator *breakiterator = strsrch->search->breakIter;
3673
3674 if (breakiterator == NULL) {
3675 breakiterator = strsrch->search->internalBreakIter;
3676 }
3677
3678 if (breakiterator != NULL) {
3679 return ubrk_following(breakiterator, startIndex);
3680 }
3681
3682 return startIndex;
3683 #else
3684 // **** or should we use the original code? ****
3685 return startIndex;
3686 #endif
3687
3688 }
3689
3690 /*
3691 * Returns TRUE if index is on a break boundary. If the UStringSearch
3692 * has an external break iterator, test using that, otherwise test
3693 * using the internal character break iterator.
3694 */
3695 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3696 #if 0
3697 const UChar *text = strsrch->search->text;
3698 int32_t textLen = strsrch->search->textLength;
3699
3700 U_ASSERT(index>=0);
3701 U_ASSERT(index<=textLen);
3702
3703 if (index>=textLen || index<=0) {
3704 return TRUE;
3705 }
3706
3707 // If the character at the current index is not a GRAPHEME_EXTEND
3708 // then we can not be within a combining sequence.
3709 UChar32 c;
3710 U16_GET(text, 0, index, textLen, c);
3711 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3712 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3713 return TRUE;
3714 }
3715
3716 // We are at a combining mark. If the preceding character is anything
3717 // except a CONTROL, CR or LF, we are in a combining sequence.
3718 U16_PREV(text, 0, index, c);
3719 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3720 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3721 return !combining;
3722 #elif !UCONFIG_NO_BREAK_ITERATION
3723 UBreakIterator *breakiterator = strsrch->search->breakIter;
3724
3725 if (breakiterator == NULL) {
3726 breakiterator = strsrch->search->internalBreakIter;
3727 }
3728
3729 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3730 #else
3731 // **** or use the original code? ****
3732 return TRUE;
3733 #endif
3734 }
3735
3736 #if 0
3737 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3738 {
3739 #if !UCONFIG_NO_BREAK_ITERATION
3740 UBreakIterator *breakiterator = strsrch->search->breakIter;
3741
3742 if (breakiterator != NULL) {
3743 int32_t startindex = ubrk_first(breakiterator);
3744 int32_t endindex = ubrk_last(breakiterator);
3745
3746 // out-of-range indexes are never boundary positions
3747 if (start < startindex || start > endindex ||
3748 end < startindex || end > endindex) {
3749 return FALSE;
3750 }
3751
3752 return ubrk_isBoundary(breakiterator, start) &&
3753 ubrk_isBoundary(breakiterator, end);
3754 }
3755 #endif
3756
3757 return TRUE;
3758 }
3759 #endif
3760
3761 typedef enum {
3762 U_CE_MATCH = -1,
3763 U_CE_NO_MATCH = 0,
3764 U_CE_SKIP_TARG,
3765 U_CE_SKIP_PATN
3766 } UCompareCEsResult;
3767 #define U_CE_LEVEL2_BASE 0x00000005
3768 #define U_CE_LEVEL3_BASE 0x00050000
3769
3770 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3771 if (targCE == patCE) {
3772 return U_CE_MATCH;
3773 }
3774 if (compareType == 0) {
3775 return U_CE_NO_MATCH;
3776 }
3777
3778 int64_t targCEshifted = targCE >> 32;
3779 int64_t patCEshifted = patCE >> 32;
3780 int64_t mask;
3781
3782 mask = 0xFFFF0000;
3783 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3784 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3785 if ( targLev1 != patLev1 ) {
3786 if ( targLev1 == 0 ) {
3787 return U_CE_SKIP_TARG;
3788 }
3789 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3790 return U_CE_SKIP_PATN;
3791 }
3792 return U_CE_NO_MATCH;
3793 }
3794
3795 mask = 0x0000FFFF;
3796 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3797 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3798 if ( targLev2 != patLev2 ) {
3799 if ( targLev2 == 0 ) {
3800 return U_CE_SKIP_TARG;
3801 }
3802 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3803 return U_CE_SKIP_PATN;
3804 }
3805 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3806 U_CE_MATCH: U_CE_NO_MATCH;
3807 }
3808
3809 mask = 0xFFFF0000;
3810 int32_t targLev3 = (int32_t)(targCE & mask);
3811 int32_t patLev3 = (int32_t)(patCE & mask);
3812 if ( targLev3 != patLev3 ) {
3813 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3814 U_CE_MATCH: U_CE_NO_MATCH;
3815 }
3816
3817 return U_CE_MATCH;
3818 }
3819
3820 #if BOYER_MOORE
3821 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3822 #endif
3823
3824 namespace {
3825
3826 UChar32 codePointAt(const USearch &search, int32_t index) {
3827 if (index < search.textLength) {
3828 UChar32 c;
3829 U16_NEXT(search.text, index, search.textLength, c);
3830 return c;
3831 }
3832 return U_SENTINEL;
3833 }
3834
3835 UChar32 codePointBefore(const USearch &search, int32_t index) {
3836 if (0 < index) {
3837 UChar32 c;
3838 U16_PREV(search.text, 0, index, c);
3839 return c;
3840 }
3841 return U_SENTINEL;
3842 }
3843
3844 } // namespace
3845
3846 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3847 int32_t startIdx,
3848 int32_t *matchStart,
3849 int32_t *matchLimit,
3850 UErrorCode *status)
3851 {
3852 if (U_FAILURE(*status)) {
3853 return FALSE;
3854 }
3855
3856 // TODO: reject search patterns beginning with a combining char.
3857
3858 #ifdef USEARCH_DEBUG
3859 if (getenv("USEARCH_DEBUG") != NULL) {
3860 printf("Pattern CEs\n");
3861 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3862 printf(" %8x", strsrch->pattern.ces[ii]);
3863 }
3864 printf("\n");
3865 }
3866
3867 #endif
3868 // Input parameter sanity check.
3869 // TODO: should input indicies clip to the text length
3870 // in the same way that UText does.
3871 if(strsrch->pattern.cesLength == 0 ||
3872 startIdx < 0 ||
3873 startIdx > strsrch->search->textLength ||
3874 strsrch->pattern.ces == NULL) {
3875 *status = U_ILLEGAL_ARGUMENT_ERROR;
3876 return FALSE;
3877 }
3878
3879 if (strsrch->pattern.pces == NULL) {
3880 initializePatternPCETable(strsrch, status);
3881 }
3882
3883 ucol_setOffset(strsrch->textIter, startIdx, status);
3884 CEIBuffer ceb(strsrch, status);
3885
3886
3887 int32_t targetIx = 0;
3888 const CEI *targetCEI = NULL;
3889 int32_t patIx;
3890 UBool found;
3891
3892 int32_t mStart = -1;
3893 int32_t mLimit = -1;
3894 int32_t minLimit;
3895 int32_t maxLimit;
3896
3897
3898
3899 // Outer loop moves over match starting positions in the
3900 // target CE space.
3901 // Here we see the target as a sequence of collation elements, resulting from the following:
3902 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3903 // (for example, digraphs such as IJ may be broken into two characters).
3904 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3905 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3906 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3907 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3908 // then the CE is deleted, so the following code sees only CEs that are relevant.
3909 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3910 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3911 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3912 //
3913 for(targetIx=0; ; targetIx++)
3914 {
3915 found = TRUE;
3916 // Inner loop checks for a match beginning at each
3917 // position from the outer loop.
3918 int32_t targetIxOffset = 0;
3919 int64_t patCE = 0;
3920 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3921 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3922 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3923 const CEI *firstCEI = ceb.get(targetIx);
3924 if (firstCEI == NULL) {
3925 *status = U_INTERNAL_PROGRAM_ERROR;
3926 found = FALSE;
3927 break;
3928 }
3929
3930 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3931 patCE = strsrch->pattern.pces[patIx];
3932 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3933 // Compare CE from target string with CE from the pattern.
3934 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3935 // which will fail the compare, below.
3936 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3937 if ( ceMatch == U_CE_NO_MATCH ) {
3938 found = FALSE;
3939 break;
3940 } else if ( ceMatch > U_CE_NO_MATCH ) {
3941 if ( ceMatch == U_CE_SKIP_TARG ) {
3942 // redo with same patCE, next targCE
3943 patIx--;
3944 targetIxOffset++;
3945 } else { // ceMatch == U_CE_SKIP_PATN
3946 // redo with same targCE, next patCE
3947 targetIxOffset--;
3948 }
3949 }
3950 }
3951 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3952
3953 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3954 // No match at this targetIx. Try again at the next.
3955 continue;
3956 }
3957
3958 if (!found) {
3959 // No match at all, we have run off the end of the target text.
3960 break;
3961 }
3962
3963
3964 // We have found a match in CE space.
3965 // Now determine the bounds in string index space.
3966 // There still is a chance of match failure if the CE range not correspond to
3967 // an acceptable character range.
3968 //
3969 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
3970
3971 mStart = firstCEI->lowIndex;
3972 minLimit = lastCEI->lowIndex;
3973
3974 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3975 // extended to the end of input, and the match is good.
3976
3977 // Look at the high and low indices of the CE following the match. If
3978 // they are the same it means one of two things:
3979 // 1. The match extended to the last CE from the target text, which is OK, or
3980 // 2. The last CE that was part of the match is in an expansion that extends
3981 // to the first CE after the match. In this case, we reject the match.
3982 const CEI *nextCEI = 0;
3983 if (strsrch->search->elementComparisonType == 0) {
3984 nextCEI = ceb.get(targetIx + targetIxOffset);
3985 maxLimit = nextCEI->lowIndex;
3986 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3987 found = FALSE;
3988 }
3989 } else {
3990 for ( ; ; ++targetIxOffset ) {
3991 nextCEI = ceb.get(targetIx + targetIxOffset);
3992 maxLimit = nextCEI->lowIndex;
3993 // If we are at the end of the target too, match succeeds
3994 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3995 break;
3996 }
3997 // As long as the next CE has primary weight of 0,
3998 // it is part of the last target element matched by the pattern;
3999 // make sure it can be part of a match with the last patCE
4000 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
4001 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
4002 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
4003 found = FALSE;
4004 break;
4005 }
4006 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
4007 // target element, but it has non-zero primary weight => match fails
4008 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
4009 found = false;
4010 break;
4011 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4012 } else {
4013 break;
4014 }
4015 }
4016 }
4017
4018
4019 // Check for the start of the match being within a combining sequence.
4020 // This can happen if the pattern itself begins with a combining char, and
4021 // the match found combining marks in the target text that were attached
4022 // to something else.
4023 // This type of match should be rejected for not completely consuming a
4024 // combining sequence.
4025 if (!isBreakBoundary(strsrch, mStart)) {
4026 found = FALSE;
4027 }
4028
4029 // Check for the start of the match being within an Collation Element Expansion,
4030 // meaning that the first char of the match is only partially matched.
4031 // With exapnsions, the first CE will report the index of the source
4032 // character, and all subsequent (expansions) CEs will report the source index of the
4033 // _following_ character.
4034 int32_t secondIx = firstCEI->highIndex;
4035 if (mStart == secondIx) {
4036 found = FALSE;
4037 }
4038
4039 // Allow matches to end in the middle of a grapheme cluster if the following
4040 // conditions are met; this is needed to make prefix search work properly in
4041 // Indic, see #11750
4042 // * the default breakIter is being used
4043 // * the next collation element after this combining sequence
4044 // - has non-zero primary weight
4045 // - corresponds to a separate character following the one at end of the current match
4046 // (the second of these conditions, and perhaps both, may be redundant given the
4047 // subsequent check for normalization boundary; however they are likely much faster
4048 // tests in any case)
4049 // * the match limit is a normalization boundary
4050 UBool allowMidclusterMatch = FALSE;
4051 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4052 allowMidclusterMatch =
4053 strsrch->search->breakIter == NULL &&
4054 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4055 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4056 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4057 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4058 }
4059 // If those conditions are met, then:
4060 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4061 // the match limit may be backed off to a previous break boundary. This handles
4062 // cases in which mLimit includes target characters that are ignorable with current
4063 // settings (such as space) and which extend beyond the pattern match.
4064 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4065 // * do NOT require that match limit be on a breakIter boundary
4066
4067 // Advance the match end position to the first acceptable match boundary.
4068 // This advances the index over any combining charcters.
4069 mLimit = maxLimit;
4070 if (minLimit < maxLimit) {
4071 // When the last CE's low index is same with its high index, the CE is likely
4072 // a part of expansion. In this case, the index is located just after the
4073 // character corresponding to the CEs compared above. If the index is right
4074 // at the break boundary, move the position to the next boundary will result
4075 // incorrect match length when there are ignorable characters exist between
4076 // the position and the next character produces CE(s). See ticket#8482.
4077 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4078 mLimit = minLimit;
4079 } else {
4080 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4081 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4082 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4083 // (i.e. we back off mLimit to the previous breakIterator boundary).
4084 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4085 mLimit = nba;
4086 }
4087 }
4088 }
4089
4090 #ifdef USEARCH_DEBUG
4091 if (getenv("USEARCH_DEBUG") != NULL) {
4092 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4093 }
4094 #endif
4095
4096 if (!allowMidclusterMatch) {
4097 // If advancing to the end of a combining sequence in character indexing space
4098 // advanced us beyond the end of the match in CE space, reject this match.
4099 if (mLimit > maxLimit) {
4100 found = FALSE;
4101 }
4102
4103 if (!isBreakBoundary(strsrch, mLimit)) {
4104 found = FALSE;
4105 }
4106 }
4107
4108 if (! checkIdentical(strsrch, mStart, mLimit)) {
4109 found = FALSE;
4110 }
4111
4112 if (found) {
4113 break;
4114 }
4115 }
4116
4117 #ifdef USEARCH_DEBUG
4118 if (getenv("USEARCH_DEBUG") != NULL) {
4119 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4120 int32_t lastToPrint = ceb.limitIx+2;
4121 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4122 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4123 }
4124 printf("\n%s\n", found? "match found" : "no match");
4125 }
4126 #endif
4127
4128 // All Done. Store back the match bounds to the caller.
4129 //
4130 if (found==FALSE) {
4131 mLimit = -1;
4132 mStart = -1;
4133 }
4134
4135 if (matchStart != NULL) {
4136 *matchStart= mStart;
4137 }
4138
4139 if (matchLimit != NULL) {
4140 *matchLimit = mLimit;
4141 }
4142
4143 return found;
4144 }
4145
4146 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4147 int32_t startIdx,
4148 int32_t *matchStart,
4149 int32_t *matchLimit,
4150 UErrorCode *status)
4151 {
4152 if (U_FAILURE(*status)) {
4153 return FALSE;
4154 }
4155
4156 // TODO: reject search patterns beginning with a combining char.
4157
4158 #ifdef USEARCH_DEBUG
4159 if (getenv("USEARCH_DEBUG") != NULL) {
4160 printf("Pattern CEs\n");
4161 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4162 printf(" %8x", strsrch->pattern.ces[ii]);
4163 }
4164 printf("\n");
4165 }
4166
4167 #endif
4168 // Input parameter sanity check.
4169 // TODO: should input indicies clip to the text length
4170 // in the same way that UText does.
4171 if(strsrch->pattern.cesLength == 0 ||
4172 startIdx < 0 ||
4173 startIdx > strsrch->search->textLength ||
4174 strsrch->pattern.ces == NULL) {
4175 *status = U_ILLEGAL_ARGUMENT_ERROR;
4176 return FALSE;
4177 }
4178
4179 if (strsrch->pattern.pces == NULL) {
4180 initializePatternPCETable(strsrch, status);
4181 }
4182
4183 CEIBuffer ceb(strsrch, status);
4184 int32_t targetIx = 0;
4185
4186 /*
4187 * Pre-load the buffer with the CE's for the grapheme
4188 * after our starting position so that we're sure that
4189 * we can look at the CE following the match when we
4190 * check the match boundaries.
4191 *
4192 * This will also pre-fetch the first CE that we'll
4193 * consider for the match.
4194 */
4195 if (startIdx < strsrch->search->textLength) {
4196 UBreakIterator *bi = strsrch->search->internalBreakIter;
4197 int32_t next = ubrk_following(bi, startIdx);
4198
4199 ucol_setOffset(strsrch->textIter, next, status);
4200
4201 for (targetIx = 0; ; targetIx += 1) {
4202 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4203 break;
4204 }
4205 }
4206 } else {
4207 ucol_setOffset(strsrch->textIter, startIdx, status);
4208 }
4209
4210
4211 const CEI *targetCEI = NULL;
4212 int32_t patIx;
4213 UBool found;
4214
4215 int32_t limitIx = targetIx;
4216 int32_t mStart = -1;
4217 int32_t mLimit = -1;
4218 int32_t minLimit;
4219 int32_t maxLimit;
4220
4221
4222
4223 // Outer loop moves over match starting positions in the
4224 // target CE space.
4225 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4226 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4227 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4228 // and the beginning of the base text.
4229 for(targetIx = limitIx; ; targetIx += 1)
4230 {
4231 found = TRUE;
4232 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4233 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4234 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4235 const CEI *lastCEI = ceb.getPrevious(targetIx);
4236 if (lastCEI == NULL) {
4237 *status = U_INTERNAL_PROGRAM_ERROR;
4238 found = FALSE;
4239 break;
4240 }
4241 // Inner loop checks for a match beginning at each
4242 // position from the outer loop.
4243 int32_t targetIxOffset = 0;
4244 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4245 int64_t patCE = strsrch->pattern.pces[patIx];
4246
4247 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4248 // Compare CE from target string with CE from the pattern.
4249 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4250 // which will fail the compare, below.
4251 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4252 if ( ceMatch == U_CE_NO_MATCH ) {
4253 found = FALSE;
4254 break;
4255 } else if ( ceMatch > U_CE_NO_MATCH ) {
4256 if ( ceMatch == U_CE_SKIP_TARG ) {
4257 // redo with same patCE, next targCE
4258 patIx++;
4259 targetIxOffset++;
4260 } else { // ceMatch == U_CE_SKIP_PATN
4261 // redo with same targCE, next patCE
4262 targetIxOffset--;
4263 }
4264 }
4265 }
4266
4267 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4268 // No match at this targetIx. Try again at the next.
4269 continue;
4270 }
4271
4272 if (!found) {
4273 // No match at all, we have run off the end of the target text.
4274 break;
4275 }
4276
4277
4278 // We have found a match in CE space.
4279 // Now determine the bounds in string index space.
4280 // There still is a chance of match failure if the CE range not correspond to
4281 // an acceptable character range.
4282 //
4283 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4284 mStart = firstCEI->lowIndex;
4285
4286 // Check for the start of the match being within a combining sequence.
4287 // This can happen if the pattern itself begins with a combining char, and
4288 // the match found combining marks in the target text that were attached
4289 // to something else.
4290 // This type of match should be rejected for not completely consuming a
4291 // combining sequence.
4292 if (!isBreakBoundary(strsrch, mStart)) {
4293 found = FALSE;
4294 }
4295
4296 // Look at the high index of the first CE in the match. If it's the same as the
4297 // low index, the first CE in the match is in the middle of an expansion.
4298 if (mStart == firstCEI->highIndex) {
4299 found = FALSE;
4300 }
4301
4302
4303 minLimit = lastCEI->lowIndex;
4304
4305 if (targetIx > 0) {
4306 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4307 // extended to the end of input, and the match is good.
4308
4309 // Look at the high and low indices of the CE following the match. If
4310 // they are the same it means one of two things:
4311 // 1. The match extended to the last CE from the target text, which is OK, or
4312 // 2. The last CE that was part of the match is in an expansion that extends
4313 // to the first CE after the match. In this case, we reject the match.
4314 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4315
4316 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4317 found = FALSE;
4318 }
4319
4320 mLimit = maxLimit = nextCEI->lowIndex;
4321
4322 // Allow matches to end in the middle of a grapheme cluster if the following
4323 // conditions are met; this is needed to make prefix search work properly in
4324 // Indic, see #11750
4325 // * the default breakIter is being used
4326 // * the next collation element after this combining sequence
4327 // - has non-zero primary weight
4328 // - corresponds to a separate character following the one at end of the current match
4329 // (the second of these conditions, and perhaps both, may be redundant given the
4330 // subsequent check for normalization boundary; however they are likely much faster
4331 // tests in any case)
4332 // * the match limit is a normalization boundary
4333 UBool allowMidclusterMatch = FALSE;
4334 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4335 allowMidclusterMatch =
4336 strsrch->search->breakIter == NULL &&
4337 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4338 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4339 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4340 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4341 }
4342 // If those conditions are met, then:
4343 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4344 // the match limit may be backed off to a previous break boundary. This handles
4345 // cases in which mLimit includes target characters that are ignorable with current
4346 // settings (such as space) and which extend beyond the pattern match.
4347 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4348 // * do NOT require that match limit be on a breakIter boundary
4349
4350 // Advance the match end position to the first acceptable match boundary.
4351 // This advances the index over any combining characters.
4352 if (minLimit < maxLimit) {
4353 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4354 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4355 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4356 // (i.e. we back off mLimit to the previous breakIterator boundary).
4357 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4358 mLimit = nba;
4359 }
4360 }
4361
4362 if (!allowMidclusterMatch) {
4363 // If advancing to the end of a combining sequence in character indexing space
4364 // advanced us beyond the end of the match in CE space, reject this match.
4365 if (mLimit > maxLimit) {
4366 found = FALSE;
4367 }
4368
4369 // Make sure the end of the match is on a break boundary
4370 if (!isBreakBoundary(strsrch, mLimit)) {
4371 found = FALSE;
4372 }
4373 }
4374
4375 } else {
4376 // No non-ignorable CEs after this point.
4377 // The maximum position is detected by boundary after
4378 // the last non-ignorable CE. Combining sequence
4379 // across the start index will be truncated.
4380 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4381 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4382 }
4383
4384 #ifdef USEARCH_DEBUG
4385 if (getenv("USEARCH_DEBUG") != NULL) {
4386 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4387 }
4388 #endif
4389
4390
4391 if (! checkIdentical(strsrch, mStart, mLimit)) {
4392 found = FALSE;
4393 }
4394
4395 if (found) {
4396 break;
4397 }
4398 }
4399
4400 #ifdef USEARCH_DEBUG
4401 if (getenv("USEARCH_DEBUG") != NULL) {
4402 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4403 int32_t lastToPrint = ceb.limitIx+2;
4404 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4405 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4406 }
4407 printf("\n%s\n", found? "match found" : "no match");
4408 }
4409 #endif
4410
4411 // All Done. Store back the match bounds to the caller.
4412 //
4413 if (found==FALSE) {
4414 mLimit = -1;
4415 mStart = -1;
4416 }
4417
4418 if (matchStart != NULL) {
4419 *matchStart= mStart;
4420 }
4421
4422 if (matchLimit != NULL) {
4423 *matchLimit = mLimit;
4424 }
4425
4426 return found;
4427 }
4428
4429 // internal use methods declared in usrchimp.h -----------------------------
4430
4431 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4432 {
4433 if (U_FAILURE(*status)) {
4434 setMatchNotFound(strsrch);
4435 return FALSE;
4436 }
4437
4438 #if BOYER_MOORE
4439 UCollationElements *coleiter = strsrch->textIter;
4440 int32_t textlength = strsrch->search->textLength;
4441 int32_t *patternce = strsrch->pattern.ces;
4442 int32_t patterncelength = strsrch->pattern.cesLength;
4443 int32_t textoffset = ucol_getOffset(coleiter);
4444
4445 // status used in setting coleiter offset, since offset is checked in
4446 // shiftForward before setting the coleiter offset, status never
4447 // a failure
4448 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4449 patterncelength);
4450 while (textoffset <= textlength)
4451 {
4452 uint32_t patternceindex = patterncelength - 1;
4453 int32_t targetce;
4454 UBool found = FALSE;
4455 int32_t lastce = UCOL_NULLORDER;
4456
4457 setColEIterOffset(coleiter, textoffset);
4458
4459 for (;;) {
4460 // finding the last pattern ce match, imagine composite characters
4461 // for example: search for pattern A in text \u00C0
4462 // we'll have to skip \u0300 the grave first before we get to A
4463 targetce = ucol_previous(coleiter, status);
4464 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4465 found = FALSE;
4466 break;
4467 }
4468 targetce = getCE(strsrch, targetce);
4469 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4470 // this is for the text \u0315\u0300 that requires
4471 // normalization and pattern \u0300, where \u0315 is ignorable
4472 continue;
4473 }
4474 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4475 lastce = targetce;
4476 }
4477 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4478 if (targetce == patternce[patternceindex]) {
4479 // the first ce can be a contraction
4480 found = TRUE;
4481 break;
4482 }
4483 if (!hasExpansion(coleiter)) {
4484 found = FALSE;
4485 break;
4486 }
4487 }
4488
4489 //targetce = lastce;
4490
4491 while (found && patternceindex > 0) {
4492 lastce = targetce;
4493 targetce = ucol_previous(coleiter, status);
4494 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4495 found = FALSE;
4496 break;
4497 }
4498 targetce = getCE(strsrch, targetce);
4499 if (targetce == UCOL_IGNORABLE) {
4500 continue;
4501 }
4502
4503 patternceindex --;
4504 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4505 found = found && targetce == patternce[patternceindex];
4506 }
4507
4508 targetce = lastce;
4509
4510 if (!found) {
4511 if (U_FAILURE(*status)) {
4512 break;
4513 }
4514 textoffset = shiftForward(strsrch, textoffset, lastce,
4515 patternceindex);
4516 // status checked at loop.
4517 patternceindex = patterncelength;
4518 continue;
4519 }
4520
4521 if (checkNextExactMatch(strsrch, &textoffset, status)) {
4522 // status checked in ucol_setOffset
4523 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4524 return TRUE;
4525 }
4526 }
4527 setMatchNotFound(strsrch);
4528 return FALSE;
4529 #else
4530 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4531 int32_t start = -1;
4532 int32_t end = -1;
4533
4534 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4535 strsrch->search->matchedIndex = start;
4536 strsrch->search->matchedLength = end - start;
4537 return TRUE;
4538 } else {
4539 setMatchNotFound(strsrch);
4540 return FALSE;
4541 }
4542 #endif
4543 }
4544
4545 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4546 {
4547 if (U_FAILURE(*status)) {
4548 setMatchNotFound(strsrch);
4549 return FALSE;
4550 }
4551
4552 #if BOYER_MOORE
4553 UCollationElements *coleiter = strsrch->textIter;
4554 int32_t textlength = strsrch->search->textLength;
4555 int32_t *patternce = strsrch->pattern.ces;
4556 int32_t patterncelength = strsrch->pattern.cesLength;
4557 int32_t textoffset = ucol_getOffset(coleiter);
4558 UBool hasPatternAccents =
4559 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4560
4561 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4562 patterncelength);
4563 strsrch->canonicalPrefixAccents[0] = 0;
4564 strsrch->canonicalSuffixAccents[0] = 0;
4565
4566 while (textoffset <= textlength)
4567 {
4568 int32_t patternceindex = patterncelength - 1;
4569 int32_t targetce;
4570 UBool found = FALSE;
4571 int32_t lastce = UCOL_NULLORDER;
4572
4573 setColEIterOffset(coleiter, textoffset);
4574
4575 for (;;) {
4576 // finding the last pattern ce match, imagine composite characters
4577 // for example: search for pattern A in text \u00C0
4578 // we'll have to skip \u0300 the grave first before we get to A
4579 targetce = ucol_previous(coleiter, status);
4580 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4581 found = FALSE;
4582 break;
4583 }
4584 targetce = getCE(strsrch, targetce);
4585 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4586 lastce = targetce;
4587 }
4588 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4589 if (targetce == patternce[patternceindex]) {
4590 // the first ce can be a contraction
4591 found = TRUE;
4592 break;
4593 }
4594 if (!hasExpansion(coleiter)) {
4595 found = FALSE;
4596 break;
4597 }
4598 }
4599
4600 while (found && patternceindex > 0) {
4601 targetce = ucol_previous(coleiter, status);
4602 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4603 found = FALSE;
4604 break;
4605 }
4606 targetce = getCE(strsrch, targetce);
4607 if (targetce == UCOL_IGNORABLE) {
4608 continue;
4609 }
4610
4611 patternceindex --;
4612 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4613 found = found && targetce == patternce[patternceindex];
4614 }
4615
4616 // initializing the rearranged accent array
4617 if (hasPatternAccents && !found) {
4618 strsrch->canonicalPrefixAccents[0] = 0;
4619 strsrch->canonicalSuffixAccents[0] = 0;
4620 if (U_FAILURE(*status)) {
4621 break;
4622 }
4623 found = doNextCanonicalMatch(strsrch, textoffset, status);
4624 }
4625
4626 if (!found) {
4627 if (U_FAILURE(*status)) {
4628 break;
4629 }
4630 textoffset = shiftForward(strsrch, textoffset, lastce,
4631 patternceindex);
4632 // status checked at loop
4633 patternceindex = patterncelength;
4634 continue;
4635 }
4636
4637 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4638 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4639 return TRUE;
4640 }
4641 }
4642 setMatchNotFound(strsrch);
4643 return FALSE;
4644 #else
4645 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4646 int32_t start = -1;
4647 int32_t end = -1;
4648
4649 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4650 strsrch->search->matchedIndex = start;
4651 strsrch->search->matchedLength = end - start;
4652 return TRUE;
4653 } else {
4654 setMatchNotFound(strsrch);
4655 return FALSE;
4656 }
4657 #endif
4658 }
4659
4660 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4661 {
4662 if (U_FAILURE(*status)) {
4663 setMatchNotFound(strsrch);
4664 return FALSE;
4665 }
4666
4667 #if BOYER_MOORE
4668 UCollationElements *coleiter = strsrch->textIter;
4669 int32_t *patternce = strsrch->pattern.ces;
4670 int32_t patterncelength = strsrch->pattern.cesLength;
4671 int32_t textoffset = ucol_getOffset(coleiter);
4672
4673 // shifting it check for setting offset
4674 // if setOffset is called previously or there was no previous match, we
4675 // leave the offset as it is.
4676 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4677 textoffset = strsrch->search->matchedIndex;
4678 }
4679
4680 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4681 patterncelength);
4682
4683 while (textoffset >= 0)
4684 {
4685 int32_t patternceindex = 1;
4686 int32_t targetce;
4687 UBool found = FALSE;
4688 int32_t firstce = UCOL_NULLORDER;
4689
4690 // if status is a failure, ucol_setOffset does nothing
4691 setColEIterOffset(coleiter, textoffset);
4692
4693 for (;;) {
4694 // finding the first pattern ce match, imagine composite
4695 // characters. for example: search for pattern \u0300 in text
4696 // \u00C0, we'll have to skip A first before we get to
4697 // \u0300 the grave accent
4698 targetce = ucol_next(coleiter, status);
4699 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4700 found = FALSE;
4701 break;
4702 }
4703 targetce = getCE(strsrch, targetce);
4704 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4705 firstce = targetce;
4706 }
4707 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4708 continue;
4709 }
4710 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4711 if (targetce == patternce[0]) {
4712 found = TRUE;
4713 break;
4714 }
4715 if (!hasExpansion(coleiter)) {
4716 // checking for accents in composite character
4717 found = FALSE;
4718 break;
4719 }
4720 }
4721
4722 //targetce = firstce;
4723
4724 while (found && (patternceindex < patterncelength)) {
4725 firstce = targetce;
4726 targetce = ucol_next(coleiter, status);
4727 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4728 found = FALSE;
4729 break;
4730 }
4731 targetce = getCE(strsrch, targetce);
4732 if (targetce == UCOL_IGNORABLE) {
4733 continue;
4734 }
4735
4736 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4737 found = found && targetce == patternce[patternceindex];
4738 patternceindex ++;
4739 }
4740
4741 targetce = firstce;
4742
4743 if (!found) {
4744 if (U_FAILURE(*status)) {
4745 break;
4746 }
4747
4748 textoffset = reverseShift(strsrch, textoffset, targetce,
4749 patternceindex);
4750 patternceindex = 0;
4751 continue;
4752 }
4753
4754 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4755 setColEIterOffset(coleiter, textoffset);
4756 return TRUE;
4757 }
4758 }
4759 setMatchNotFound(strsrch);
4760 return FALSE;
4761 #else
4762 int32_t textOffset;
4763
4764 if (strsrch->search->isOverlap) {
4765 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4766 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4767 } else {
4768 // move the start position at the end of possible match
4769 initializePatternPCETable(strsrch, status);
4770 if (!initTextProcessedIter(strsrch, status)) {
4771 setMatchNotFound(strsrch);
4772 return FALSE;
4773 }
4774 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4775 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4776 if (pce == UCOL_PROCESSED_NULLORDER) {
4777 // at the end of the text
4778 break;
4779 }
4780 }
4781 if (U_FAILURE(*status)) {
4782 setMatchNotFound(strsrch);
4783 return FALSE;
4784 }
4785 textOffset = ucol_getOffset(strsrch->textIter);
4786 }
4787 } else {
4788 textOffset = ucol_getOffset(strsrch->textIter);
4789 }
4790
4791 int32_t start = -1;
4792 int32_t end = -1;
4793
4794 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4795 strsrch->search->matchedIndex = start;
4796 strsrch->search->matchedLength = end - start;
4797 return TRUE;
4798 } else {
4799 setMatchNotFound(strsrch);
4800 return FALSE;
4801 }
4802 #endif
4803 }
4804
4805 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4806 UErrorCode *status)
4807 {
4808 if (U_FAILURE(*status)) {
4809 setMatchNotFound(strsrch);
4810 return FALSE;
4811 }
4812
4813 #if BOYER_MOORE
4814 UCollationElements *coleiter = strsrch->textIter;
4815 int32_t *patternce = strsrch->pattern.ces;
4816 int32_t patterncelength = strsrch->pattern.cesLength;
4817 int32_t textoffset = ucol_getOffset(coleiter);
4818 UBool hasPatternAccents =
4819 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4820
4821 // shifting it check for setting offset
4822 // if setOffset is called previously or there was no previous match, we
4823 // leave the offset as it is.
4824 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4825 textoffset = strsrch->search->matchedIndex;
4826 }
4827
4828 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4829 patterncelength);
4830 strsrch->canonicalPrefixAccents[0] = 0;
4831 strsrch->canonicalSuffixAccents[0] = 0;
4832
4833 while (textoffset >= 0)
4834 {
4835 int32_t patternceindex = 1;
4836 int32_t targetce;
4837 UBool found = FALSE;
4838 int32_t firstce = UCOL_NULLORDER;
4839
4840 setColEIterOffset(coleiter, textoffset);
4841 for (;;) {
4842 // finding the first pattern ce match, imagine composite
4843 // characters. for example: search for pattern \u0300 in text
4844 // \u00C0, we'll have to skip A first before we get to
4845 // \u0300 the grave accent
4846 targetce = ucol_next(coleiter, status);
4847 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4848 found = FALSE;
4849 break;
4850 }
4851 targetce = getCE(strsrch, targetce);
4852 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4853 firstce = targetce;
4854 }
4855
4856 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4857 if (targetce == patternce[0]) {
4858 // the first ce can be a contraction
4859 found = TRUE;
4860 break;
4861 }
4862 if (!hasExpansion(coleiter)) {
4863 // checking for accents in composite character
4864 found = FALSE;
4865 break;
4866 }
4867 }
4868
4869 targetce = firstce;
4870
4871 while (found && patternceindex < patterncelength) {
4872 targetce = ucol_next(coleiter, status);
4873 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4874 found = FALSE;
4875 break;
4876 }
4877 targetce = getCE(strsrch, targetce);
4878 if (targetce == UCOL_IGNORABLE) {
4879 continue;
4880 }
4881
4882 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4883 found = found && targetce == patternce[patternceindex];
4884 patternceindex ++;
4885 }
4886
4887 // initializing the rearranged accent array
4888 if (hasPatternAccents && !found) {
4889 strsrch->canonicalPrefixAccents[0] = 0;
4890 strsrch->canonicalSuffixAccents[0] = 0;
4891 if (U_FAILURE(*status)) {
4892 break;
4893 }
4894 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4895 }
4896
4897 if (!found) {
4898 if (U_FAILURE(*status)) {
4899 break;
4900 }
4901 textoffset = reverseShift(strsrch, textoffset, targetce,
4902 patternceindex);
4903 patternceindex = 0;
4904 continue;
4905 }
4906
4907 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4908 setColEIterOffset(coleiter, textoffset);
4909 return TRUE;
4910 }
4911 }
4912 setMatchNotFound(strsrch);
4913 return FALSE;
4914 #else
4915 int32_t textOffset;
4916
4917 if (strsrch->search->isOverlap) {
4918 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4919 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4920 } else {
4921 // move the start position at the end of possible match
4922 initializePatternPCETable(strsrch, status);
4923 if (!initTextProcessedIter(strsrch, status)) {
4924 setMatchNotFound(strsrch);
4925 return FALSE;
4926 }
4927 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4928 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4929 if (pce == UCOL_PROCESSED_NULLORDER) {
4930 // at the end of the text
4931 break;
4932 }
4933 }
4934 if (U_FAILURE(*status)) {
4935 setMatchNotFound(strsrch);
4936 return FALSE;
4937 }
4938 textOffset = ucol_getOffset(strsrch->textIter);
4939 }
4940 } else {
4941 textOffset = ucol_getOffset(strsrch->textIter);
4942 }
4943
4944 int32_t start = -1;
4945 int32_t end = -1;
4946
4947 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4948 strsrch->search->matchedIndex = start;
4949 strsrch->search->matchedLength = end - start;
4950 return TRUE;
4951 } else {
4952 setMatchNotFound(strsrch);
4953 return FALSE;
4954 }
4955 #endif
4956 }
4957
4958 #endif /* #if !UCONFIG_NO_COLLATION */