]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/usearch.cpp
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / i18n / usearch.cpp
... / ...
CommitLineData
1/*
2**********************************************************************
3* Copyright (C) 2001-2015 IBM and others. All rights reserved.
4**********************************************************************
5* Date Name Description
6* 07/02/2001 synwee Creation.
7**********************************************************************
8*/
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
13
14#include "unicode/usearch.h"
15#include "unicode/ustring.h"
16#include "unicode/uchar.h"
17#include "unicode/utf16.h"
18#include "normalizer2impl.h"
19#include "usrchimp.h"
20#include "cmemory.h"
21#include "ucln_in.h"
22#include "uassert.h"
23#include "ustr_imp.h"
24
25U_NAMESPACE_USE
26
27// don't use Boyer-Moore
28// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
29#define BOYER_MOORE 0
30
31// internal definition ---------------------------------------------------
32
33#define LAST_BYTE_MASK_ 0xFF
34#define SECOND_LAST_BYTE_SHIFT_ 8
35#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
36
37static const Normalizer2Impl *g_nfcImpl = NULL;
38
39// internal methods -------------------------------------------------
40
41/**
42* Fast collation element iterator setOffset.
43* This function does not check for bounds.
44* @param coleiter collation element iterator
45* @param offset to set
46*/
47static
48inline void setColEIterOffset(UCollationElements *elems,
49 int32_t offset)
50{
51 // Note: Not "fast" any more after the 2013 collation rewrite.
52 // We do not want to expose more internals than necessary.
53 UErrorCode status = U_ZERO_ERROR;
54 ucol_setOffset(elems, offset, &status);
55}
56
57/**
58* Getting the mask for collation strength
59* @param strength collation strength
60* @return collation element mask
61*/
62static
63inline uint32_t getMask(UCollationStrength strength)
64{
65 switch (strength)
66 {
67 case UCOL_PRIMARY:
68 return UCOL_PRIMARYORDERMASK;
69 case UCOL_SECONDARY:
70 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
71 default:
72 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
73 UCOL_PRIMARYORDERMASK;
74 }
75}
76
77/**
78* This is to squeeze the 21bit ces into a 256 table
79* @param ce collation element
80* @return collapsed version of the collation element
81*/
82static
83inline int hash(uint32_t ce)
84{
85 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
86 // well with the new collation where most of the latin 1 characters
87 // are of the value xx000xxx. their hashes will most of the time be 0
88 // to be discussed on the hash algo.
89 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
90}
91
92U_CDECL_BEGIN
93static UBool U_CALLCONV
94usearch_cleanup(void) {
95 g_nfcImpl = NULL;
96 return TRUE;
97}
98U_CDECL_END
99
100/**
101* Initializing the fcd tables.
102* Internal method, status assumed to be a success.
103* @param status output error if any, caller to check status before calling
104* method, status assumed to be success when passed in.
105*/
106static
107inline void initializeFCD(UErrorCode *status)
108{
109 if (g_nfcImpl == NULL) {
110 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
111 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
112 }
113}
114
115/**
116* Gets the fcd value for a character at the argument index.
117* This method takes into accounts of the supplementary characters.
118* @param str UTF16 string where character for fcd retrieval resides
119* @param offset position of the character whose fcd is to be retrieved, to be
120* overwritten with the next character position, taking
121* surrogate characters into consideration.
122* @param strlength length of the argument string
123* @return fcd value
124*/
125static
126uint16_t getFCD(const UChar *str, int32_t *offset,
127 int32_t strlength)
128{
129 const UChar *temp = str + *offset;
130 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
131 *offset = (int32_t)(temp - str);
132 return result;
133}
134
135/**
136* Getting the modified collation elements taking into account the collation
137* attributes
138* @param strsrch string search data
139* @param sourcece
140* @return the modified collation element
141*/
142static
143inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
144{
145 // note for tertiary we can't use the collator->tertiaryMask, that
146 // is a preprocessed mask that takes into account case options. since
147 // we are only concerned with exact matches, we don't need that.
148 sourcece &= strsrch->ceMask;
149
150 if (strsrch->toShift) {
151 // alternate handling here, since only the 16 most significant digits
152 // is only used, we can safely do a compare without masking
153 // if the ce is a variable, we mask and get only the primary values
154 // no shifting to quartenary is required since all primary values
155 // less than variabletop will need to be masked off anyway.
156 if (strsrch->variableTop > sourcece) {
157 if (strsrch->strength >= UCOL_QUATERNARY) {
158 sourcece &= UCOL_PRIMARYORDERMASK;
159 }
160 else {
161 sourcece = UCOL_IGNORABLE;
162 }
163 }
164 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
165 sourcece = 0xFFFF;
166 }
167
168 return sourcece;
169}
170
171/**
172* Allocate a memory and returns NULL if it failed.
173* Internal method, status assumed to be a success.
174* @param size to allocate
175* @param status output error if any, caller to check status before calling
176* method, status assumed to be success when passed in.
177* @return newly allocated array, NULL otherwise
178*/
179static
180inline void * allocateMemory(uint32_t size, UErrorCode *status)
181{
182 uint32_t *result = (uint32_t *)uprv_malloc(size);
183 if (result == NULL) {
184 *status = U_MEMORY_ALLOCATION_ERROR;
185 }
186 return result;
187}
188
189/**
190* Adds a uint32_t value to a destination array.
191* Creates a new array if we run out of space. The caller will have to
192* manually deallocate the newly allocated array.
193* Internal method, status assumed to be success, caller has to check status
194* before calling this method. destination not to be NULL and has at least
195* size destinationlength.
196* @param destination target array
197* @param offset destination offset to add value
198* @param destinationlength target array size, return value for the new size
199* @param value to be added
200* @param increments incremental size expected
201* @param status output error if any, caller to check status before calling
202* method, status assumed to be success when passed in.
203* @return new destination array, destination if there was no new allocation
204*/
205static
206inline int32_t * addTouint32_tArray(int32_t *destination,
207 uint32_t offset,
208 uint32_t *destinationlength,
209 uint32_t value,
210 uint32_t increments,
211 UErrorCode *status)
212{
213 uint32_t newlength = *destinationlength;
214 if (offset + 1 == newlength) {
215 newlength += increments;
216 int32_t *temp = (int32_t *)allocateMemory(
217 sizeof(int32_t) * newlength, status);
218 if (U_FAILURE(*status)) {
219 return NULL;
220 }
221 uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
222 *destinationlength = newlength;
223 destination = temp;
224 }
225 destination[offset] = value;
226 return destination;
227}
228
229/**
230* Adds a uint64_t value to a destination array.
231* Creates a new array if we run out of space. The caller will have to
232* manually deallocate the newly allocated array.
233* Internal method, status assumed to be success, caller has to check status
234* before calling this method. destination not to be NULL and has at least
235* size destinationlength.
236* @param destination target array
237* @param offset destination offset to add value
238* @param destinationlength target array size, return value for the new size
239* @param value to be added
240* @param increments incremental size expected
241* @param status output error if any, caller to check status before calling
242* method, status assumed to be success when passed in.
243* @return new destination array, destination if there was no new allocation
244*/
245static
246inline int64_t * addTouint64_tArray(int64_t *destination,
247 uint32_t offset,
248 uint32_t *destinationlength,
249 uint64_t value,
250 uint32_t increments,
251 UErrorCode *status)
252{
253 uint32_t newlength = *destinationlength;
254 if (offset + 1 == newlength) {
255 newlength += increments;
256 int64_t *temp = (int64_t *)allocateMemory(
257 sizeof(int64_t) * newlength, status);
258
259 if (U_FAILURE(*status)) {
260 return NULL;
261 }
262
263 uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
264 *destinationlength = newlength;
265 destination = temp;
266 }
267
268 destination[offset] = value;
269
270 return destination;
271}
272
273/**
274* Initializing the ce table for a pattern.
275* Stores non-ignorable collation keys.
276* Table size will be estimated by the size of the pattern text. Table
277* expansion will be perform as we go along. Adding 1 to ensure that the table
278* size definitely increases.
279* Internal method, status assumed to be a success.
280* @param strsrch string search data
281* @param status output error if any, caller to check status before calling
282* method, status assumed to be success when passed in.
283* @return total number of expansions
284*/
285static
286inline uint16_t initializePatternCETable(UStringSearch *strsrch,
287 UErrorCode *status)
288{
289 UPattern *pattern = &(strsrch->pattern);
290 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
291 int32_t *cetable = pattern->cesBuffer;
292 uint32_t patternlength = pattern->textLength;
293 UCollationElements *coleiter = strsrch->utilIter;
294
295 if (coleiter == NULL) {
296 coleiter = ucol_openElements(strsrch->collator, pattern->text,
297 patternlength, status);
298 // status will be checked in ucol_next(..) later and if it is an
299 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
300 // returned.
301 strsrch->utilIter = coleiter;
302 }
303 else {
304 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
305 }
306 if(U_FAILURE(*status)) {
307 return 0;
308 }
309
310 if (pattern->ces != cetable && pattern->ces) {
311 uprv_free(pattern->ces);
312 }
313
314 uint16_t offset = 0;
315 uint16_t result = 0;
316 int32_t ce;
317
318 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
319 U_SUCCESS(*status)) {
320 uint32_t newce = getCE(strsrch, ce);
321 if (newce) {
322 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
323 newce,
324 patternlength - ucol_getOffset(coleiter) + 1,
325 status);
326 if (U_FAILURE(*status)) {
327 return 0;
328 }
329 offset ++;
330 if (cetable != temp && cetable != pattern->cesBuffer) {
331 uprv_free(cetable);
332 }
333 cetable = temp;
334 }
335 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
336 }
337
338 cetable[offset] = 0;
339 pattern->ces = cetable;
340 pattern->cesLength = offset;
341
342 return result;
343}
344
345/**
346* Initializing the pce table for a pattern.
347* Stores non-ignorable collation keys.
348* Table size will be estimated by the size of the pattern text. Table
349* expansion will be perform as we go along. Adding 1 to ensure that the table
350* size definitely increases.
351* Internal method, status assumed to be a success.
352* @param strsrch string search data
353* @param status output error if any, caller to check status before calling
354* method, status assumed to be success when passed in.
355* @return total number of expansions
356*/
357static
358inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
359 UErrorCode *status)
360{
361 UPattern *pattern = &(strsrch->pattern);
362 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
363 int64_t *pcetable = pattern->pcesBuffer;
364 uint32_t patternlength = pattern->textLength;
365 UCollationElements *coleiter = strsrch->utilIter;
366
367 if (coleiter == NULL) {
368 coleiter = ucol_openElements(strsrch->collator, pattern->text,
369 patternlength, status);
370 // status will be checked in ucol_next(..) later and if it is an
371 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
372 // returned.
373 strsrch->utilIter = coleiter;
374 } else {
375 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
376 }
377 if(U_FAILURE(*status)) {
378 return 0;
379 }
380
381 if (pattern->pces != pcetable && pattern->pces != NULL) {
382 uprv_free(pattern->pces);
383 }
384
385 uint16_t offset = 0;
386 uint16_t result = 0;
387 int64_t pce;
388
389 icu::UCollationPCE iter(coleiter);
390
391 // ** Should processed CEs be signed or unsigned?
392 // ** (the rest of the code in this file seems to play fast-and-loose with
393 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
394 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
395 U_SUCCESS(*status)) {
396 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
397 pce,
398 patternlength - ucol_getOffset(coleiter) + 1,
399 status);
400
401 if (U_FAILURE(*status)) {
402 return 0;
403 }
404
405 offset += 1;
406
407 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
408 uprv_free(pcetable);
409 }
410
411 pcetable = temp;
412 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
413 }
414
415 pcetable[offset] = 0;
416 pattern->pces = pcetable;
417 pattern->pcesLength = offset;
418
419 return result;
420}
421
422/**
423* Initializes the pattern struct.
424* Internal method, status assumed to be success.
425* @param strsrch UStringSearch data storage
426* @param status output error if any, caller to check status before calling
427* method, status assumed to be success when passed in.
428* @return expansionsize the total expansion size of the pattern
429*/
430static
431inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
432{
433 if (U_FAILURE(*status)) { return 0; }
434 UPattern *pattern = &(strsrch->pattern);
435 const UChar *patterntext = pattern->text;
436 int32_t length = pattern->textLength;
437 int32_t index = 0;
438
439 // Since the strength is primary, accents are ignored in the pattern.
440 if (strsrch->strength == UCOL_PRIMARY) {
441 pattern->hasPrefixAccents = 0;
442 pattern->hasSuffixAccents = 0;
443 } else {
444 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
445 SECOND_LAST_BYTE_SHIFT_;
446 index = length;
447 U16_BACK_1(patterntext, 0, index);
448 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
449 LAST_BYTE_MASK_;
450 }
451
452 // ** HACK **
453 if (strsrch->pattern.pces != NULL) {
454 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
455 uprv_free(strsrch->pattern.pces);
456 }
457
458 strsrch->pattern.pces = NULL;
459 }
460
461 // since intializePattern is an internal method status is a success.
462 return initializePatternCETable(strsrch, status);
463}
464
465/**
466* Initializing shift tables, with the default values.
467* If a corresponding default value is 0, the shift table is not set.
468* @param shift table for forwards shift
469* @param backshift table for backwards shift
470* @param cetable table containing pattern ce
471* @param cesize size of the pattern ces
472* @param expansionsize total size of the expansions
473* @param defaultforward the default forward value
474* @param defaultbackward the default backward value
475*/
476static
477inline void setShiftTable(int16_t shift[], int16_t backshift[],
478 int32_t *cetable, int32_t cesize,
479 int16_t expansionsize,
480 int16_t defaultforward,
481 int16_t defaultbackward)
482{
483 // estimate the value to shift. to do that we estimate the smallest
484 // number of characters to give the relevant ces, ie approximately
485 // the number of ces minus their expansion, since expansions can come
486 // from a character.
487 int32_t count;
488 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
489 shift[count] = defaultforward;
490 }
491 cesize --; // down to the last index
492 for (count = 0; count < cesize; count ++) {
493 // number of ces from right of array to the count
494 int temp = defaultforward - count - 1;
495 shift[hash(cetable[count])] = temp > 1 ? temp : 1;
496 }
497 shift[hash(cetable[cesize])] = 1;
498 // for ignorables we just shift by one. see test examples.
499 shift[hash(0)] = 1;
500
501 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
502 backshift[count] = defaultbackward;
503 }
504 for (count = cesize; count > 0; count --) {
505 // the original value count does not seem to work
506 backshift[hash(cetable[count])] = count > expansionsize ?
507 (int16_t)(count - expansionsize) : 1;
508 }
509 backshift[hash(cetable[0])] = 1;
510 backshift[hash(0)] = 1;
511}
512
513/**
514* Building of the pattern collation element list and the boyer moore strsrch
515* table.
516* The canonical match will only be performed after the default match fails.
517* For both cases we need to remember the size of the composed and decomposed
518* versions of the string. Since the Boyer-Moore shift calculations shifts by
519* a number of characters in the text and tries to match the pattern from that
520* offset, the shift value can not be too large in case we miss some
521* characters. To choose a right shift size, we estimate the NFC form of the
522* and use its size as a shift guide. The NFC form should be the small
523* possible representation of the pattern. Anyways, we'll err on the smaller
524* shift size. Hence the calculation for minlength.
525* Canonical match will be performed slightly differently. We'll split the
526* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
527* the first and last base character (MS), the ending accents (EA). Matches
528* will be done on MS first, and only when we match MS then some processing
529* will be required for the prefix and end accents in order to determine if
530* they match PA and EA. Hence the default shift values
531* for the canonical match will take the size of either end's accent into
532* consideration. Forwards search will take the end accents into consideration
533* for the default shift values and the backwards search will take the prefix
534* accents into consideration.
535* If pattern has no non-ignorable ce, we return a illegal argument error.
536* Internal method, status assumed to be success.
537* @param strsrch UStringSearch data storage
538* @param status for output errors if it occurs, status is assumed to be a
539* success when it is passed in.
540*/
541static
542inline void initialize(UStringSearch *strsrch, UErrorCode *status)
543{
544 int16_t expandlength = initializePattern(strsrch, status);
545 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
546 UPattern *pattern = &strsrch->pattern;
547 int32_t cesize = pattern->cesLength;
548
549 int16_t minlength = cesize > expandlength
550 ? (int16_t)cesize - expandlength : 1;
551 pattern->defaultShiftSize = minlength;
552 setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
553 cesize, expandlength, minlength, minlength);
554 return;
555 }
556 strsrch->pattern.defaultShiftSize = 0;
557}
558
559#if BOYER_MOORE
560/**
561* Check to make sure that the match length is at the end of the character by
562* using the breakiterator.
563* @param strsrch string search data
564* @param start target text start offset
565* @param end target text end offset
566*/
567static
568void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
569 int32_t *end)
570{
571#if !UCONFIG_NO_BREAK_ITERATION
572 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
573 if (breakiterator) {
574 int32_t matchend = *end;
575 //int32_t matchstart = *start;
576
577 if (!ubrk_isBoundary(breakiterator, matchend)) {
578 *end = ubrk_following(breakiterator, matchend);
579 }
580
581 /* Check the start of the matched text to make sure it doesn't have any accents
582 * before it. This code may not be necessary and so it is commented out */
583 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
584 *start = ubrk_preceding(breakiterator, matchstart);
585 }*/
586 }
587#endif
588}
589
590/**
591* Determine whether the target text in UStringSearch bounded by the offset
592* start and end is one or more whole units of text as
593* determined by the breakiterator in UStringSearch.
594* @param strsrch string search data
595* @param start target text start offset
596* @param end target text end offset
597*/
598static
599UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
600 int32_t end)
601{
602#if !UCONFIG_NO_BREAK_ITERATION
603 UBreakIterator *breakiterator = strsrch->search->breakIter;
604 //TODO: Add here.
605 if (breakiterator) {
606 int32_t startindex = ubrk_first(breakiterator);
607 int32_t endindex = ubrk_last(breakiterator);
608
609 // out-of-range indexes are never boundary positions
610 if (start < startindex || start > endindex ||
611 end < startindex || end > endindex) {
612 return FALSE;
613 }
614 // otherwise, we can use following() on the position before the
615 // specified one and return true of the position we get back is the
616 // one the user specified
617 UBool result = (start == startindex ||
618 ubrk_following(breakiterator, start - 1) == start) &&
619 (end == endindex ||
620 ubrk_following(breakiterator, end - 1) == end);
621 if (result) {
622 // iterates the individual ces
623 UCollationElements *coleiter = strsrch->utilIter;
624 const UChar *text = strsrch->search->text +
625 start;
626 UErrorCode status = U_ZERO_ERROR;
627 ucol_setText(coleiter, text, end - start, &status);
628 for (int32_t count = 0; count < strsrch->pattern.cesLength;
629 count ++) {
630 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
631 if (ce == UCOL_IGNORABLE) {
632 count --;
633 continue;
634 }
635 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
636 return FALSE;
637 }
638 }
639 int32_t nextce = ucol_next(coleiter, &status);
640 while (ucol_getOffset(coleiter) == (end - start)
641 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
642 nextce = ucol_next(coleiter, &status);
643 }
644 if (ucol_getOffset(coleiter) == (end - start)
645 && nextce != UCOL_NULLORDER) {
646 // extra collation elements at the end of the match
647 return FALSE;
648 }
649 }
650 return result;
651 }
652#endif
653 return TRUE;
654}
655
656/**
657* Getting the next base character offset if current offset is an accent,
658* or the current offset if the current character contains a base character.
659* accents the following base character will be returned
660* @param text string
661* @param textoffset current offset
662* @param textlength length of text string
663* @return the next base character or the current offset
664* if the current character is contains a base character.
665*/
666static
667inline int32_t getNextBaseOffset(const UChar *text,
668 int32_t textoffset,
669 int32_t textlength)
670{
671 if (textoffset < textlength) {
672 int32_t temp = textoffset;
673 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
674 while (temp < textlength) {
675 int32_t result = temp;
676 if ((getFCD(text, &temp, textlength) >>
677 SECOND_LAST_BYTE_SHIFT_) == 0) {
678 return result;
679 }
680 }
681 return textlength;
682 }
683 }
684 return textoffset;
685}
686
687/**
688* Gets the next base character offset depending on the string search pattern
689* data
690* @param strsrch string search data
691* @param textoffset current offset, one offset away from the last character
692* to search for.
693* @return start index of the next base character or the current offset
694* if the current character is contains a base character.
695*/
696static
697inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
698 int32_t textoffset)
699{
700 int32_t textlength = strsrch->search->textLength;
701 if (strsrch->pattern.hasSuffixAccents &&
702 textoffset < textlength) {
703 int32_t temp = textoffset;
704 const UChar *text = strsrch->search->text;
705 U16_BACK_1(text, 0, temp);
706 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
707 return getNextBaseOffset(text, textoffset, textlength);
708 }
709 }
710 return textoffset;
711}
712
713/**
714* Shifting the collation element iterator position forward to prepare for
715* a following match. If the last character is a unsafe character, we'll only
716* shift by 1 to capture contractions, normalization etc.
717* Internal method, status assumed to be success.
718* @param text strsrch string search data
719* @param textoffset start text position to do search
720* @param ce the text ce which failed the match.
721* @param patternceindex index of the ce within the pattern ce buffer which
722* failed the match
723* @return final offset
724*/
725static
726inline int32_t shiftForward(UStringSearch *strsrch,
727 int32_t textoffset,
728 int32_t ce,
729 int32_t patternceindex)
730{
731 UPattern *pattern = &(strsrch->pattern);
732 if (ce != UCOL_NULLORDER) {
733 int32_t shift = pattern->shift[hash(ce)];
734 // this is to adjust for characters in the middle of the
735 // substring for matching that failed.
736 int32_t adjust = pattern->cesLength - patternceindex;
737 if (adjust > 1 && shift >= adjust) {
738 shift -= adjust - 1;
739 }
740 textoffset += shift;
741 }
742 else {
743 textoffset += pattern->defaultShiftSize;
744 }
745
746 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
747 // check for unsafe characters
748 // * if it is the start or middle of a contraction: to be done after
749 // a initial match is found
750 // * thai or lao base consonant character: similar to contraction
751 // * high surrogate character: similar to contraction
752 // * next character is a accent: shift to the next base character
753 return textoffset;
754}
755#endif // #if BOYER_MOORE
756
757/**
758* sets match not found
759* @param strsrch string search data
760*/
761static
762inline void setMatchNotFound(UStringSearch *strsrch)
763{
764 // this method resets the match result regardless of the error status.
765 strsrch->search->matchedIndex = USEARCH_DONE;
766 strsrch->search->matchedLength = 0;
767 if (strsrch->search->isForwardSearching) {
768 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
769 }
770 else {
771 setColEIterOffset(strsrch->textIter, 0);
772 }
773}
774
775#if BOYER_MOORE
776/**
777* Gets the offset to the next safe point in text.
778* ie. not the middle of a contraction, swappable characters or supplementary
779* characters.
780* @param collator collation sata
781* @param text string to work with
782* @param textoffset offset in string
783* @param textlength length of text string
784* @return offset to the next safe character
785*/
786static
787inline int32_t getNextSafeOffset(const UCollator *collator,
788 const UChar *text,
789 int32_t textoffset,
790 int32_t textlength)
791{
792 int32_t result = textoffset; // first contraction character
793 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
794 result ++;
795 }
796 return result;
797}
798
799/**
800* This checks for accents in the potential match started with a .
801* composite character.
802* This is really painful... we have to check that composite character do not
803* have any extra accents. We have to normalize the potential match and find
804* the immediate decomposed character before the match.
805* The first composite character would have been taken care of by the fcd
806* checks in checkForwardExactMatch.
807* This is the slow path after the fcd of the first character and
808* the last character has been checked by checkForwardExactMatch and we
809* determine that the potential match has extra non-ignorable preceding
810* ces.
811* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
812* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
813* Note here that accents checking are slow and cautioned in the API docs.
814* Internal method, status assumed to be a success, caller should check status
815* before calling this method
816* @param strsrch string search data
817* @param start index of the potential unfriendly composite character
818* @param end index of the potential unfriendly composite character
819* @param status output error status if any.
820* @return TRUE if there is non-ignorable accents before at the beginning
821* of the match, FALSE otherwise.
822*/
823
824static
825UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
826 int32_t end,
827 UErrorCode *status)
828{
829 UBool result = FALSE;
830 if (strsrch->pattern.hasPrefixAccents) {
831 int32_t length = end - start;
832 int32_t offset = 0;
833 const UChar *text = strsrch->search->text + start;
834
835 U16_FWD_1(text, offset, length);
836 // we are only concerned with the first composite character
837 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
838 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
839 text, 0, length);
840 if (safeoffset != length) {
841 safeoffset ++;
842 }
843 UChar *norm = NULL;
844 UChar buffer[INITIAL_ARRAY_SIZE_];
845 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
846 buffer, INITIAL_ARRAY_SIZE_,
847 status);
848 if (U_FAILURE(*status)) {
849 return FALSE;
850 }
851 if (size >= INITIAL_ARRAY_SIZE_) {
852 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
853 status);
854 // if allocation failed, status will be set to
855 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
856 // checks for it.
857 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
858 size, status);
859 if (U_FAILURE(*status) && norm != NULL) {
860 uprv_free(norm);
861 return FALSE;
862 }
863 }
864 else {
865 norm = buffer;
866 }
867
868 UCollationElements *coleiter = strsrch->utilIter;
869 ucol_setText(coleiter, norm, size, status);
870 uint32_t firstce = strsrch->pattern.ces[0];
871 UBool ignorable = TRUE;
872 uint32_t ce = UCOL_IGNORABLE;
873 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
874 offset = ucol_getOffset(coleiter);
875 if (ce != firstce && ce != UCOL_IGNORABLE) {
876 ignorable = FALSE;
877 }
878 ce = ucol_next(coleiter, status);
879 }
880 UChar32 codepoint;
881 U16_PREV(norm, 0, offset, codepoint);
882 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
883
884 if (norm != buffer) {
885 uprv_free(norm);
886 }
887 }
888 }
889
890 return result;
891}
892
893/**
894* Used by exact matches, checks if there are accents before the match.
895* This is really painful... we have to check that composite characters at
896* the start of the matches have to not have any extra accents.
897* We check the FCD of the character first, if it starts with an accent and
898* the first pattern ce does not match the first ce of the character, we bail.
899* Otherwise we try normalizing the first composite
900* character and find the immediate decomposed character before the match to
901* see if it is an non-ignorable accent.
902* Now normalizing the first composite character is enough because we ensure
903* that when the match is passed in here with extra beginning ces, the
904* first or last ce that match has to occur within the first character.
905* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
906* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
907* Note here that accents checking are slow and cautioned in the API docs.
908* @param strsrch string search data
909* @param start offset
910* @param end offset
911* @return TRUE if there are accents on either side of the match,
912* FALSE otherwise
913*/
914static
915UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
916 int32_t end)
917{
918 if (strsrch->pattern.hasPrefixAccents) {
919 UCollationElements *coleiter = strsrch->textIter;
920 UErrorCode status = U_ZERO_ERROR;
921 // we have been iterating forwards previously
922 uint32_t ignorable = TRUE;
923 int32_t firstce = strsrch->pattern.ces[0];
924
925 setColEIterOffset(coleiter, start);
926 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
927 if (U_FAILURE(status)) {
928 return TRUE;
929 }
930 while (ce != firstce) {
931 if (ce != UCOL_IGNORABLE) {
932 ignorable = FALSE;
933 }
934 ce = getCE(strsrch, ucol_next(coleiter, &status));
935 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
936 return TRUE;
937 }
938 }
939 if (!ignorable && inNormBuf(coleiter)) {
940 // within normalization buffer, discontiguous handled here
941 return TRUE;
942 }
943
944 // within text
945 int32_t temp = start;
946 // original code
947 // accent = (getFCD(strsrch->search->text, &temp,
948 // strsrch->search->textLength)
949 // >> SECOND_LAST_BYTE_SHIFT_);
950 // however this code does not work well with VC7 .net in release mode.
951 // maybe the inlines for getFCD combined with shifting has bugs in
952 // VC7. anyways this is a work around.
953 UBool accent = getFCD(strsrch->search->text, &temp,
954 strsrch->search->textLength) > 0xFF;
955 if (!accent) {
956 return checkExtraMatchAccents(strsrch, start, end, &status);
957 }
958 if (!ignorable) {
959 return TRUE;
960 }
961 if (start > 0) {
962 temp = start;
963 U16_BACK_1(strsrch->search->text, 0, temp);
964 if (getFCD(strsrch->search->text, &temp,
965 strsrch->search->textLength) & LAST_BYTE_MASK_) {
966 setColEIterOffset(coleiter, start);
967 ce = ucol_previous(coleiter, &status);
968 if (U_FAILURE(status) ||
969 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
970 return TRUE;
971 }
972 }
973 }
974 }
975
976 return FALSE;
977}
978
979/**
980* Used by exact matches, checks if there are accents bounding the match.
981* Note this is the initial boundary check. If the potential match
982* starts or ends with composite characters, the accents in those
983* characters will be determined later.
984* Not doing backwards iteration here, since discontiguos contraction for
985* backwards collation element iterator, use up too many characters.
986* E.g. looking for \u030A ring in \u01FA A ring above and acute,
987* should fail since there is a acute at the end of \u01FA
988* Note here that accents checking are slow and cautioned in the API docs.
989* @param strsrch string search data
990* @param start offset of match
991* @param end end offset of the match
992* @return TRUE if there are accents on either side of the match,
993* FALSE otherwise
994*/
995static
996UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
997 int32_t end)
998{
999 if (strsrch->pattern.hasSuffixAccents) {
1000 const UChar *text = strsrch->search->text;
1001 int32_t temp = end;
1002 int32_t textlength = strsrch->search->textLength;
1003 U16_BACK_1(text, 0, temp);
1004 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1005 int32_t firstce = strsrch->pattern.ces[0];
1006 UCollationElements *coleiter = strsrch->textIter;
1007 UErrorCode status = U_ZERO_ERROR;
1008 int32_t ce;
1009 setColEIterOffset(coleiter, start);
1010 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1011 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1012 return TRUE;
1013 }
1014 }
1015 int32_t count = 1;
1016 while (count < strsrch->pattern.cesLength) {
1017 if (getCE(strsrch, ucol_next(coleiter, &status))
1018 == UCOL_IGNORABLE) {
1019 // Thai can give an ignorable here.
1020 count --;
1021 }
1022 if (U_FAILURE(status)) {
1023 return TRUE;
1024 }
1025 count ++;
1026 }
1027
1028 ce = ucol_next(coleiter, &status);
1029 if (U_FAILURE(status)) {
1030 return TRUE;
1031 }
1032 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1033 ce = getCE(strsrch, ce);
1034 }
1035 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1036 if (ucol_getOffset(coleiter) <= end) {
1037 return TRUE;
1038 }
1039 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1040 return TRUE;
1041 }
1042 }
1043 }
1044 }
1045 return FALSE;
1046}
1047#endif // #if BOYER_MOORE
1048
1049/**
1050* Checks if the offset runs out of the text string
1051* @param offset
1052* @param textlength of the text string
1053* @return TRUE if offset is out of bounds, FALSE otherwise
1054*/
1055static
1056inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1057{
1058 return offset < 0 || offset > textlength;
1059}
1060
1061/**
1062* Checks for identical match
1063* @param strsrch string search data
1064* @param start offset of possible match
1065* @param end offset of possible match
1066* @return TRUE if identical match is found
1067*/
1068static
1069inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1070 int32_t end)
1071{
1072 if (strsrch->strength != UCOL_IDENTICAL) {
1073 return TRUE;
1074 }
1075
1076 // Note: We could use Normalizer::compare() or similar, but for short strings
1077 // which may not be in FCD it might be faster to just NFD them.
1078 UErrorCode status = U_ZERO_ERROR;
1079 UnicodeString t2, p2;
1080 strsrch->nfd->normalize(
1081 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1082 strsrch->nfd->normalize(
1083 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1084 // return FALSE if NFD failed
1085 return U_SUCCESS(status) && t2 == p2;
1086}
1087
1088#if BOYER_MOORE
1089/**
1090* Checks to see if the match is repeated
1091* @param strsrch string search data
1092* @param start new match start index
1093* @param end new match end index
1094* @return TRUE if the the match is repeated, FALSE otherwise
1095*/
1096static
1097inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1098 int32_t start,
1099 int32_t end)
1100{
1101 int32_t lastmatchindex = strsrch->search->matchedIndex;
1102 UBool result;
1103 if (lastmatchindex == USEARCH_DONE) {
1104 return FALSE;
1105 }
1106 if (strsrch->search->isForwardSearching) {
1107 result = start <= lastmatchindex;
1108 }
1109 else {
1110 result = start >= lastmatchindex;
1111 }
1112 if (!result && !strsrch->search->isOverlap) {
1113 if (strsrch->search->isForwardSearching) {
1114 result = start < lastmatchindex + strsrch->search->matchedLength;
1115 }
1116 else {
1117 result = end > lastmatchindex;
1118 }
1119 }
1120 return result;
1121}
1122
1123/**
1124* Gets the collation element iterator's current offset.
1125* @param coleiter collation element iterator
1126* @param forwards flag TRUE if we are moving in th forwards direction
1127* @return current offset
1128*/
1129static
1130inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1131 UBool forwards)
1132{
1133 int32_t result = ucol_getOffset(coleiter);
1134 // intricacies of the the backwards collation element iterator
1135 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1136 result ++;
1137 }
1138 return result;
1139}
1140
1141/**
1142* Checks match for contraction.
1143* If the match ends with a partial contraction we fail.
1144* If the match starts too far off (because of backwards iteration) we try to
1145* chip off the extra characters depending on whether a breakiterator has
1146* been used.
1147* Internal method, error assumed to be success, caller has to check status
1148* before calling this method.
1149* @param strsrch string search data
1150* @param start offset of potential match, to be modified if necessary
1151* @param end offset of potential match, to be modified if necessary
1152* @param status output error status if any
1153* @return TRUE if match passes the contraction test, FALSE otherwise
1154*/
1155
1156static
1157UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1158 int32_t *start,
1159 int32_t *end, UErrorCode *status)
1160{
1161 UCollationElements *coleiter = strsrch->textIter;
1162 int32_t textlength = strsrch->search->textLength;
1163 int32_t temp = *start;
1164 const UCollator *collator = strsrch->collator;
1165 const UChar *text = strsrch->search->text;
1166 // This part checks if either ends of the match contains potential
1167 // contraction. If so we'll have to iterate through them
1168 // The start contraction needs to be checked since ucol_previous dumps
1169 // all characters till the first safe character into the buffer.
1170 // *start + 1 is used to test for the unsafe characters instead of *start
1171 // because ucol_prev takes all unsafe characters till the first safe
1172 // character ie *start. so by testing *start + 1, we can estimate if
1173 // excess prefix characters has been included in the potential search
1174 // results.
1175 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1176 (*start + 1 < textlength
1177 && ucol_unsafeCP(text[*start + 1], collator))) {
1178 int32_t expansion = getExpansionPrefix(coleiter);
1179 UBool expandflag = expansion > 0;
1180 setColEIterOffset(coleiter, *start);
1181 while (expansion > 0) {
1182 // getting rid of the redundant ce, caused by setOffset.
1183 // since backward contraction/expansion may have extra ces if we
1184 // are in the normalization buffer, hasAccentsBeforeMatch would
1185 // have taken care of it.
1186 // E.g. the character \u01FA will have an expansion of 3, but if
1187 // we are only looking for acute and ring \u030A and \u0301, we'll
1188 // have to skip the first ce in the expansion buffer.
1189 ucol_next(coleiter, status);
1190 if (U_FAILURE(*status)) {
1191 return FALSE;
1192 }
1193 if (ucol_getOffset(coleiter) != temp) {
1194 *start = temp;
1195 temp = ucol_getOffset(coleiter);
1196 }
1197 expansion --;
1198 }
1199
1200 int32_t *patternce = strsrch->pattern.ces;
1201 int32_t patterncelength = strsrch->pattern.cesLength;
1202 int32_t count = 0;
1203 while (count < patterncelength) {
1204 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1205 if (ce == UCOL_IGNORABLE) {
1206 continue;
1207 }
1208 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1209 *start = temp;
1210 temp = ucol_getOffset(coleiter);
1211 }
1212 if (U_FAILURE(*status) || ce != patternce[count]) {
1213 (*end) ++;
1214 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1215 return FALSE;
1216 }
1217 count ++;
1218 }
1219 }
1220 return TRUE;
1221}
1222
1223/**
1224* Checks and sets the match information if found.
1225* Checks
1226* <ul>
1227* <li> the potential match does not repeat the previous match
1228* <li> boundaries are correct
1229* <li> exact matches has no extra accents
1230* <li> identical matchesb
1231* <li> potential match does not end in the middle of a contraction
1232* <\ul>
1233* Otherwise the offset will be shifted to the next character.
1234* Internal method, status assumed to be success, caller has to check status
1235* before calling this method.
1236* @param strsrch string search data
1237* @param textoffset offset in the collation element text. the returned value
1238* will be the truncated end offset of the match or the new start
1239* search offset.
1240* @param status output error status if any
1241* @return TRUE if the match is valid, FALSE otherwise
1242*/
1243static
1244inline UBool checkNextExactMatch(UStringSearch *strsrch,
1245 int32_t *textoffset, UErrorCode *status)
1246{
1247 UCollationElements *coleiter = strsrch->textIter;
1248 int32_t start = getColElemIterOffset(coleiter, FALSE);
1249
1250 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1251 return FALSE;
1252 }
1253
1254 // this totally matches, however we need to check if it is repeating
1255 if (!isBreakUnit(strsrch, start, *textoffset) ||
1256 checkRepeatedMatch(strsrch, start, *textoffset) ||
1257 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1258 !checkIdentical(strsrch, start, *textoffset) ||
1259 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1260
1261 (*textoffset) ++;
1262 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1263 return FALSE;
1264 }
1265
1266 //Add breakiterator boundary check for primary strength search.
1267 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1268 checkBreakBoundary(strsrch, &start, textoffset);
1269 }
1270
1271 // totally match, we will get rid of the ending ignorables.
1272 strsrch->search->matchedIndex = start;
1273 strsrch->search->matchedLength = *textoffset - start;
1274 return TRUE;
1275}
1276
1277/**
1278* Getting the previous base character offset, or the current offset if the
1279* current character is a base character
1280* @param text string
1281* @param textoffset one offset after the current character
1282* @return the offset of the next character after the base character or the first
1283* composed character with accents
1284*/
1285static
1286inline int32_t getPreviousBaseOffset(const UChar *text,
1287 int32_t textoffset)
1288{
1289 if (textoffset > 0) {
1290 for (;;) {
1291 int32_t result = textoffset;
1292 U16_BACK_1(text, 0, textoffset);
1293 int32_t temp = textoffset;
1294 uint16_t fcd = getFCD(text, &temp, result);
1295 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1296 if (fcd & LAST_BYTE_MASK_) {
1297 return textoffset;
1298 }
1299 return result;
1300 }
1301 if (textoffset == 0) {
1302 return 0;
1303 }
1304 }
1305 }
1306 return textoffset;
1307}
1308
1309/**
1310* Getting the indexes of the accents that are not blocked in the argument
1311* accent array
1312* @param accents array of accents in nfd terminated by a 0.
1313* @param accentsindex array of indexes of the accents that are not blocked
1314*/
1315static
1316inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1317{
1318 int32_t index = 0;
1319 int32_t length = u_strlen(accents);
1320 UChar32 codepoint = 0;
1321 int cclass = 0;
1322 int result = 0;
1323 int32_t temp;
1324 while (index < length) {
1325 temp = index;
1326 U16_NEXT(accents, index, length, codepoint);
1327 if (u_getCombiningClass(codepoint) != cclass) {
1328 cclass = u_getCombiningClass(codepoint);
1329 accentsindex[result] = temp;
1330 result ++;
1331 }
1332 }
1333 accentsindex[result] = length;
1334 return result;
1335}
1336
1337/**
1338* Appends 3 UChar arrays to a destination array.
1339* Creates a new array if we run out of space. The caller will have to
1340* manually deallocate the newly allocated array.
1341* Internal method, status assumed to be success, caller has to check status
1342* before calling this method. destination not to be NULL and has at least
1343* size destinationlength.
1344* @param destination target array
1345* @param destinationlength target array size, returning the appended length
1346* @param source1 null-terminated first array
1347* @param source2 second array
1348* @param source2length length of seond array
1349* @param source3 null-terminated third array
1350* @param status error status if any
1351* @return new destination array, destination if there was no new allocation
1352*/
1353static
1354inline UChar * addToUCharArray( UChar *destination,
1355 int32_t *destinationlength,
1356 const UChar *source1,
1357 const UChar *source2,
1358 int32_t source2length,
1359 const UChar *source3,
1360 UErrorCode *status)
1361{
1362 int32_t source1length = source1 ? u_strlen(source1) : 0;
1363 int32_t source3length = source3 ? u_strlen(source3) : 0;
1364 if (*destinationlength < source1length + source2length + source3length +
1365 1)
1366 {
1367 destination = (UChar *)allocateMemory(
1368 (source1length + source2length + source3length + 1) * sizeof(UChar),
1369 status);
1370 // if error allocating memory, status will be
1371 // U_MEMORY_ALLOCATION_ERROR
1372 if (U_FAILURE(*status)) {
1373 *destinationlength = 0;
1374 return NULL;
1375 }
1376 }
1377 if (source1length != 0) {
1378 uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1379 }
1380 if (source2length != 0) {
1381 uprv_memcpy(destination + source1length, source2,
1382 sizeof(UChar) * source2length);
1383 }
1384 if (source3length != 0) {
1385 uprv_memcpy(destination + source1length + source2length, source3,
1386 sizeof(UChar) * source3length);
1387 }
1388 *destinationlength = source1length + source2length + source3length;
1389 return destination;
1390}
1391
1392/**
1393* Running through a collation element iterator to see if the contents matches
1394* pattern in string search data
1395* @param strsrch string search data
1396* @param coleiter collation element iterator
1397* @return TRUE if a match if found, FALSE otherwise
1398*/
1399static
1400inline UBool checkCollationMatch(const UStringSearch *strsrch,
1401 UCollationElements *coleiter)
1402{
1403 int patternceindex = strsrch->pattern.cesLength;
1404 int32_t *patternce = strsrch->pattern.ces;
1405 UErrorCode status = U_ZERO_ERROR;
1406 while (patternceindex > 0) {
1407 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1408 if (ce == UCOL_IGNORABLE) {
1409 continue;
1410 }
1411 if (U_FAILURE(status) || ce != *patternce) {
1412 return FALSE;
1413 }
1414 patternce ++;
1415 patternceindex --;
1416 }
1417 return TRUE;
1418}
1419
1420/**
1421* Rearranges the front accents to try matching.
1422* Prefix accents in the text will be grouped according to their combining
1423* class and the groups will be mixed and matched to try find the perfect
1424* match with the pattern.
1425* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1426* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1427* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1428* "\u0301\u0325".
1429* step 2: check if any of the generated substrings matches the pattern.
1430* Internal method, status is assumed to be success, caller has to check status
1431* before calling this method.
1432* @param strsrch string search match
1433* @param start first offset of the accents to start searching
1434* @param end start of the last accent set
1435* @param status output error status if any
1436* @return USEARCH_DONE if a match is not found, otherwise return the starting
1437* offset of the match. Note this start includes all preceding accents.
1438*/
1439static
1440int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1441 int32_t start,
1442 int32_t end,
1443 UErrorCode *status)
1444{
1445 const UChar *text = strsrch->search->text;
1446 int32_t textlength = strsrch->search->textLength;
1447 int32_t tempstart = start;
1448
1449 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1450 // die... failed at a base character
1451 return USEARCH_DONE;
1452 }
1453
1454 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1455 start = getPreviousBaseOffset(text, tempstart);
1456
1457 UChar accents[INITIAL_ARRAY_SIZE_];
1458 // normalizing the offensive string
1459 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1460 INITIAL_ARRAY_SIZE_, status);
1461 if (U_FAILURE(*status)) {
1462 return USEARCH_DONE;
1463 }
1464
1465 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1466 int32_t accentsize = getUnblockedAccentIndex(accents,
1467 accentsindex);
1468 int32_t count = (2 << (accentsize - 1)) - 1;
1469 UChar buffer[INITIAL_ARRAY_SIZE_];
1470 UCollationElements *coleiter = strsrch->utilIter;
1471 while (U_SUCCESS(*status) && count > 0) {
1472 UChar *rearrange = strsrch->canonicalPrefixAccents;
1473 // copy the base characters
1474 for (int k = 0; k < accentsindex[0]; k ++) {
1475 *rearrange ++ = accents[k];
1476 }
1477 // forming all possible canonical rearrangement by dropping
1478 // sets of accents
1479 for (int i = 0; i <= accentsize - 1; i ++) {
1480 int32_t mask = 1 << (accentsize - i - 1);
1481 if (count & mask) {
1482 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1483 *rearrange ++ = accents[j];
1484 }
1485 }
1486 }
1487 *rearrange = 0;
1488 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1489 UChar *match = addToUCharArray(buffer, &matchsize,
1490 strsrch->canonicalPrefixAccents,
1491 strsrch->search->text + offset,
1492 end - offset,
1493 strsrch->canonicalSuffixAccents,
1494 status);
1495
1496 // if status is a failure, ucol_setText does nothing.
1497 // run the collator iterator through this match
1498 ucol_setText(coleiter, match, matchsize, status);
1499 if (U_SUCCESS(*status)) {
1500 if (checkCollationMatch(strsrch, coleiter)) {
1501 if (match != buffer) {
1502 uprv_free(match);
1503 }
1504 return start;
1505 }
1506 }
1507 count --;
1508 }
1509 return USEARCH_DONE;
1510}
1511
1512/**
1513* Gets the offset to the safe point in text before textoffset.
1514* ie. not the middle of a contraction, swappable characters or supplementary
1515* characters.
1516* @param collator collation sata
1517* @param text string to work with
1518* @param textoffset offset in string
1519* @param textlength length of text string
1520* @return offset to the previous safe character
1521*/
1522static
1523inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1524 const UChar *text,
1525 int32_t textoffset)
1526{
1527 int32_t result = textoffset; // first contraction character
1528 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1529 result --;
1530 }
1531 if (result != 0) {
1532 // the first contraction character is consider unsafe here
1533 result --;
1534 }
1535 return result;
1536}
1537
1538/**
1539* Cleaning up after we passed the safe zone
1540* @param strsrch string search data
1541* @param safetext safe text array
1542* @param safebuffer safe text buffer
1543* @param coleiter collation element iterator for safe text
1544*/
1545static
1546inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1547 UChar *safebuffer)
1548{
1549 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1550 {
1551 uprv_free(safetext);
1552 }
1553}
1554
1555/**
1556* Take the rearranged end accents and tries matching. If match failed at
1557* a seperate preceding set of accents (seperated from the rearranged on by
1558* at least a base character) then we rearrange the preceding accents and
1559* tries matching again.
1560* We allow skipping of the ends of the accent set if the ces do not match.
1561* However if the failure is found before the accent set, it fails.
1562* Internal method, status assumed to be success, caller has to check status
1563* before calling this method.
1564* @param strsrch string search data
1565* @param textoffset of the start of the rearranged accent
1566* @param status output error status if any
1567* @return USEARCH_DONE if a match is not found, otherwise return the starting
1568* offset of the match. Note this start includes all preceding accents.
1569*/
1570static
1571int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1572 int32_t textoffset,
1573 UErrorCode *status)
1574{
1575 const UChar *text = strsrch->search->text;
1576 const UCollator *collator = strsrch->collator;
1577 int32_t safelength = 0;
1578 UChar *safetext;
1579 int32_t safetextlength;
1580 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1581 UCollationElements *coleiter = strsrch->utilIter;
1582 int32_t safeoffset = textoffset;
1583
1584 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1585 collator)) {
1586 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1587 safelength = textoffset - safeoffset;
1588 safetextlength = INITIAL_ARRAY_SIZE_;
1589 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1590 text + safeoffset, safelength,
1591 strsrch->canonicalSuffixAccents,
1592 status);
1593 }
1594 else {
1595 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1596 safetext = strsrch->canonicalSuffixAccents;
1597 }
1598
1599 // if status is a failure, ucol_setText does nothing
1600 ucol_setText(coleiter, safetext, safetextlength, status);
1601 // status checked in loop below
1602
1603 int32_t *ce = strsrch->pattern.ces;
1604 int32_t celength = strsrch->pattern.cesLength;
1605 int ceindex = celength - 1;
1606 UBool isSafe = TRUE; // indication flag for position in safe zone
1607
1608 while (ceindex >= 0) {
1609 int32_t textce = ucol_previous(coleiter, status);
1610 if (U_FAILURE(*status)) {
1611 if (isSafe) {
1612 cleanUpSafeText(strsrch, safetext, safebuffer);
1613 }
1614 return USEARCH_DONE;
1615 }
1616 if (textce == UCOL_NULLORDER) {
1617 // check if we have passed the safe buffer
1618 if (coleiter == strsrch->textIter) {
1619 cleanUpSafeText(strsrch, safetext, safebuffer);
1620 return USEARCH_DONE;
1621 }
1622 cleanUpSafeText(strsrch, safetext, safebuffer);
1623 safetext = safebuffer;
1624 coleiter = strsrch->textIter;
1625 setColEIterOffset(coleiter, safeoffset);
1626 // status checked at the start of the loop
1627 isSafe = FALSE;
1628 continue;
1629 }
1630 textce = getCE(strsrch, textce);
1631 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1632 // do the beginning stuff
1633 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1634 if (isSafe && failedoffset >= safelength) {
1635 // alas... no hope. failed at rearranged accent set
1636 cleanUpSafeText(strsrch, safetext, safebuffer);
1637 return USEARCH_DONE;
1638 }
1639 else {
1640 if (isSafe) {
1641 failedoffset += safeoffset;
1642 cleanUpSafeText(strsrch, safetext, safebuffer);
1643 }
1644
1645 // try rearranging the front accents
1646 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1647 failedoffset, textoffset, status);
1648 if (result != USEARCH_DONE) {
1649 // if status is a failure, ucol_setOffset does nothing
1650 setColEIterOffset(strsrch->textIter, result);
1651 }
1652 if (U_FAILURE(*status)) {
1653 return USEARCH_DONE;
1654 }
1655 return result;
1656 }
1657 }
1658 if (textce == ce[ceindex]) {
1659 ceindex --;
1660 }
1661 }
1662 // set offset here
1663 if (isSafe) {
1664 int32_t result = getColElemIterOffset(coleiter, FALSE);
1665 // sets the text iterator here with the correct expansion and offset
1666 int32_t leftoverces = getExpansionPrefix(coleiter);
1667 cleanUpSafeText(strsrch, safetext, safebuffer);
1668 if (result >= safelength) {
1669 result = textoffset;
1670 }
1671 else {
1672 result += safeoffset;
1673 }
1674 setColEIterOffset(strsrch->textIter, result);
1675 strsrch->textIter->iteratordata_.toReturn =
1676 setExpansionPrefix(strsrch->textIter, leftoverces);
1677 return result;
1678 }
1679
1680 return ucol_getOffset(coleiter);
1681}
1682
1683/**
1684* Trying out the substring and sees if it can be a canonical match.
1685* This will try normalizing the end accents and arranging them into canonical
1686* equivalents and check their corresponding ces with the pattern ce.
1687* Suffix accents in the text will be grouped according to their combining
1688* class and the groups will be mixed and matched to try find the perfect
1689* match with the pattern.
1690* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1691* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1692* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1693* "\u0301\u0325".
1694* step 2: check if any of the generated substrings matches the pattern.
1695* Internal method, status assumed to be success, caller has to check status
1696* before calling this method.
1697* @param strsrch string search data
1698* @param textoffset end offset in the collation element text that ends with
1699* the accents to be rearranged
1700* @param status error status if any
1701* @return TRUE if the match is valid, FALSE otherwise
1702*/
1703static
1704UBool doNextCanonicalMatch(UStringSearch *strsrch,
1705 int32_t textoffset,
1706 UErrorCode *status)
1707{
1708 const UChar *text = strsrch->search->text;
1709 int32_t temp = textoffset;
1710 U16_BACK_1(text, 0, temp);
1711 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1712 UCollationElements *coleiter = strsrch->textIter;
1713 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1714 if (strsrch->pattern.hasPrefixAccents) {
1715 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1716 status);
1717 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1718 setColEIterOffset(coleiter, offset);
1719 return TRUE;
1720 }
1721 }
1722 return FALSE;
1723 }
1724
1725 if (!strsrch->pattern.hasSuffixAccents) {
1726 return FALSE;
1727 }
1728
1729 UChar accents[INITIAL_ARRAY_SIZE_];
1730 // offset to the last base character in substring to search
1731 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1732 // normalizing the offensive string
1733 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1734 0, accents, INITIAL_ARRAY_SIZE_, status);
1735 // status checked in loop below
1736
1737 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1738 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1739
1740 // 2 power n - 1 plus the full set of accents
1741 int32_t count = (2 << (size - 1)) - 1;
1742 while (U_SUCCESS(*status) && count > 0) {
1743 UChar *rearrange = strsrch->canonicalSuffixAccents;
1744 // copy the base characters
1745 for (int k = 0; k < accentsindex[0]; k ++) {
1746 *rearrange ++ = accents[k];
1747 }
1748 // forming all possible canonical rearrangement by dropping
1749 // sets of accents
1750 for (int i = 0; i <= size - 1; i ++) {
1751 int32_t mask = 1 << (size - i - 1);
1752 if (count & mask) {
1753 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1754 *rearrange ++ = accents[j];
1755 }
1756 }
1757 }
1758 *rearrange = 0;
1759 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1760 status);
1761 if (offset != USEARCH_DONE) {
1762 return TRUE; // match found
1763 }
1764 count --;
1765 }
1766 return FALSE;
1767}
1768
1769/**
1770* Gets the previous base character offset depending on the string search
1771* pattern data
1772* @param strsrch string search data
1773* @param textoffset current offset, current character
1774* @return the offset of the next character after this base character or itself
1775* if it is a composed character with accents
1776*/
1777static
1778inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1779 int32_t textoffset)
1780{
1781 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1782 const UChar *text = strsrch->search->text;
1783 int32_t offset = textoffset;
1784 if (getFCD(text, &offset, strsrch->search->textLength) >>
1785 SECOND_LAST_BYTE_SHIFT_) {
1786 return getPreviousBaseOffset(text, textoffset);
1787 }
1788 }
1789 return textoffset;
1790}
1791
1792/**
1793* Checks match for contraction.
1794* If the match ends with a partial contraction we fail.
1795* If the match starts too far off (because of backwards iteration) we try to
1796* chip off the extra characters
1797* Internal method, status assumed to be success, caller has to check status
1798* before calling this method.
1799* @param strsrch string search data
1800* @param start offset of potential match, to be modified if necessary
1801* @param end offset of potential match, to be modified if necessary
1802* @param status output error status if any
1803* @return TRUE if match passes the contraction test, FALSE otherwise
1804*/
1805static
1806UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1807 int32_t *start,
1808 int32_t *end,
1809 UErrorCode *status)
1810{
1811 UCollationElements *coleiter = strsrch->textIter;
1812 int32_t textlength = strsrch->search->textLength;
1813 int32_t temp = *start;
1814 const UCollator *collator = strsrch->collator;
1815 const UChar *text = strsrch->search->text;
1816 // This part checks if either ends of the match contains potential
1817 // contraction. If so we'll have to iterate through them
1818 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1819 (*start + 1 < textlength
1820 && ucol_unsafeCP(text[*start + 1], collator))) {
1821 int32_t expansion = getExpansionPrefix(coleiter);
1822 UBool expandflag = expansion > 0;
1823 setColEIterOffset(coleiter, *start);
1824 while (expansion > 0) {
1825 // getting rid of the redundant ce, caused by setOffset.
1826 // since backward contraction/expansion may have extra ces if we
1827 // are in the normalization buffer, hasAccentsBeforeMatch would
1828 // have taken care of it.
1829 // E.g. the character \u01FA will have an expansion of 3, but if
1830 // we are only looking for acute and ring \u030A and \u0301, we'll
1831 // have to skip the first ce in the expansion buffer.
1832 ucol_next(coleiter, status);
1833 if (U_FAILURE(*status)) {
1834 return FALSE;
1835 }
1836 if (ucol_getOffset(coleiter) != temp) {
1837 *start = temp;
1838 temp = ucol_getOffset(coleiter);
1839 }
1840 expansion --;
1841 }
1842
1843 int32_t *patternce = strsrch->pattern.ces;
1844 int32_t patterncelength = strsrch->pattern.cesLength;
1845 int32_t count = 0;
1846 int32_t textlength = strsrch->search->textLength;
1847 while (count < patterncelength) {
1848 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1849 // status checked below, note that if status is a failure
1850 // ucol_next returns UCOL_NULLORDER
1851 if (ce == UCOL_IGNORABLE) {
1852 continue;
1853 }
1854 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1855 *start = temp;
1856 temp = ucol_getOffset(coleiter);
1857 }
1858
1859 if (count == 0 && ce != patternce[0]) {
1860 // accents may have extra starting ces, this occurs when a
1861 // pure accent pattern is matched without rearrangement
1862 // text \u0325\u0300 and looking for \u0300
1863 int32_t expected = patternce[0];
1864 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1865 ce = getCE(strsrch, ucol_next(coleiter, status));
1866 while (U_SUCCESS(*status) && ce != expected &&
1867 ce != UCOL_NULLORDER &&
1868 ucol_getOffset(coleiter) <= *end) {
1869 ce = getCE(strsrch, ucol_next(coleiter, status));
1870 }
1871 }
1872 }
1873 if (U_FAILURE(*status) || ce != patternce[count]) {
1874 (*end) ++;
1875 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1876 return FALSE;
1877 }
1878 count ++;
1879 }
1880 }
1881 return TRUE;
1882}
1883
1884/**
1885* Checks and sets the match information if found.
1886* Checks
1887* <ul>
1888* <li> the potential match does not repeat the previous match
1889* <li> boundaries are correct
1890* <li> potential match does not end in the middle of a contraction
1891* <li> identical matches
1892* <\ul>
1893* Otherwise the offset will be shifted to the next character.
1894* Internal method, status assumed to be success, caller has to check the
1895* status before calling this method.
1896* @param strsrch string search data
1897* @param textoffset offset in the collation element text. the returned value
1898* will be the truncated end offset of the match or the new start
1899* search offset.
1900* @param status output error status if any
1901* @return TRUE if the match is valid, FALSE otherwise
1902*/
1903static
1904inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1905 int32_t *textoffset,
1906 UErrorCode *status)
1907{
1908 // to ensure that the start and ends are not composite characters
1909 UCollationElements *coleiter = strsrch->textIter;
1910 // if we have a canonical accent match
1911 if ((strsrch->pattern.hasSuffixAccents &&
1912 strsrch->canonicalSuffixAccents[0]) ||
1913 (strsrch->pattern.hasPrefixAccents &&
1914 strsrch->canonicalPrefixAccents[0])) {
1915 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1916 strsrch,
1917 ucol_getOffset(coleiter));
1918 strsrch->search->matchedLength = *textoffset -
1919 strsrch->search->matchedIndex;
1920 return TRUE;
1921 }
1922
1923 int32_t start = getColElemIterOffset(coleiter, FALSE);
1924 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1925 status) || U_FAILURE(*status)) {
1926 return FALSE;
1927 }
1928
1929 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1930 // this totally matches, however we need to check if it is repeating
1931 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1932 !isBreakUnit(strsrch, start, *textoffset) ||
1933 !checkIdentical(strsrch, start, *textoffset)) {
1934 (*textoffset) ++;
1935 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1936 strsrch->search->textLength);
1937 return FALSE;
1938 }
1939
1940 strsrch->search->matchedIndex = start;
1941 strsrch->search->matchedLength = *textoffset - start;
1942 return TRUE;
1943}
1944
1945/**
1946* Shifting the collation element iterator position forward to prepare for
1947* a preceding match. If the first character is a unsafe character, we'll only
1948* shift by 1 to capture contractions, normalization etc.
1949* Internal method, status assumed to be success, caller has to check status
1950* before calling this method.
1951* @param text strsrch string search data
1952* @param textoffset start text position to do search
1953* @param ce the text ce which failed the match.
1954* @param patternceindex index of the ce within the pattern ce buffer which
1955* failed the match
1956* @return final offset
1957*/
1958static
1959inline int32_t reverseShift(UStringSearch *strsrch,
1960 int32_t textoffset,
1961 int32_t ce,
1962 int32_t patternceindex)
1963{
1964 if (strsrch->search->isOverlap) {
1965 if (textoffset != strsrch->search->textLength) {
1966 textoffset --;
1967 }
1968 else {
1969 textoffset -= strsrch->pattern.defaultShiftSize;
1970 }
1971 }
1972 else {
1973 if (ce != UCOL_NULLORDER) {
1974 int32_t shift = strsrch->pattern.backShift[hash(ce)];
1975
1976 // this is to adjust for characters in the middle of the substring
1977 // for matching that failed.
1978 int32_t adjust = patternceindex;
1979 if (adjust > 1 && shift > adjust) {
1980 shift -= adjust - 1;
1981 }
1982 textoffset -= shift;
1983 }
1984 else {
1985 textoffset -= strsrch->pattern.defaultShiftSize;
1986 }
1987 }
1988 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1989 return textoffset;
1990}
1991
1992/**
1993* Checks match for contraction.
1994* If the match starts with a partial contraction we fail.
1995* Internal method, status assumed to be success, caller has to check status
1996* before calling this method.
1997* @param strsrch string search data
1998* @param start offset of potential match, to be modified if necessary
1999* @param end offset of potential match, to be modified if necessary
2000* @param status output error status if any
2001* @return TRUE if match passes the contraction test, FALSE otherwise
2002*/
2003static
2004UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2005 int32_t *start,
2006 int32_t *end, UErrorCode *status)
2007{
2008 UCollationElements *coleiter = strsrch->textIter;
2009 int32_t textlength = strsrch->search->textLength;
2010 int32_t temp = *end;
2011 const UCollator *collator = strsrch->collator;
2012 const UChar *text = strsrch->search->text;
2013 // This part checks if either if the start of the match contains potential
2014 // contraction. If so we'll have to iterate through them
2015 // Since we used ucol_next while previously looking for the potential
2016 // match, this guarantees that our end will not be a partial contraction,
2017 // or a partial supplementary character.
2018 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2019 int32_t expansion = getExpansionSuffix(coleiter);
2020 UBool expandflag = expansion > 0;
2021 setColEIterOffset(coleiter, *end);
2022 while (U_SUCCESS(*status) && expansion > 0) {
2023 // getting rid of the redundant ce
2024 // since forward contraction/expansion may have extra ces
2025 // if we are in the normalization buffer, hasAccentsBeforeMatch
2026 // would have taken care of it.
2027 // E.g. the character \u01FA will have an expansion of 3, but if
2028 // we are only looking for A ring A\u030A, we'll have to skip the
2029 // last ce in the expansion buffer
2030 ucol_previous(coleiter, status);
2031 if (U_FAILURE(*status)) {
2032 return FALSE;
2033 }
2034 if (ucol_getOffset(coleiter) != temp) {
2035 *end = temp;
2036 temp = ucol_getOffset(coleiter);
2037 }
2038 expansion --;
2039 }
2040
2041 int32_t *patternce = strsrch->pattern.ces;
2042 int32_t patterncelength = strsrch->pattern.cesLength;
2043 int32_t count = patterncelength;
2044 while (count > 0) {
2045 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2046 // status checked below, note that if status is a failure
2047 // ucol_previous returns UCOL_NULLORDER
2048 if (ce == UCOL_IGNORABLE) {
2049 continue;
2050 }
2051 if (expandflag && count == 0 &&
2052 getColElemIterOffset(coleiter, FALSE) != temp) {
2053 *end = temp;
2054 temp = ucol_getOffset(coleiter);
2055 }
2056 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2057 (*start) --;
2058 *start = getPreviousBaseOffset(text, *start);
2059 return FALSE;
2060 }
2061 count --;
2062 }
2063 }
2064 return TRUE;
2065}
2066
2067/**
2068* Checks and sets the match information if found.
2069* Checks
2070* <ul>
2071* <li> the current match does not repeat the last match
2072* <li> boundaries are correct
2073* <li> exact matches has no extra accents
2074* <li> identical matches
2075* <\ul>
2076* Otherwise the offset will be shifted to the preceding character.
2077* Internal method, status assumed to be success, caller has to check status
2078* before calling this method.
2079* @param strsrch string search data
2080* @param collator
2081* @param coleiter collation element iterator
2082* @param text string
2083* @param textoffset offset in the collation element text. the returned value
2084* will be the truncated start offset of the match or the new start
2085* search offset.
2086* @param status output error status if any
2087* @return TRUE if the match is valid, FALSE otherwise
2088*/
2089static
2090inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2091 int32_t *textoffset,
2092 UErrorCode *status)
2093{
2094 // to ensure that the start and ends are not composite characters
2095 int32_t end = ucol_getOffset(strsrch->textIter);
2096 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2097 || U_FAILURE(*status)) {
2098 return FALSE;
2099 }
2100
2101 // this totally matches, however we need to check if it is repeating
2102 // the old match
2103 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2104 !isBreakUnit(strsrch, *textoffset, end) ||
2105 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2106 !checkIdentical(strsrch, *textoffset, end) ||
2107 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2108 (*textoffset) --;
2109 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2110 *textoffset);
2111 return FALSE;
2112 }
2113
2114 //Add breakiterator boundary check for primary strength search.
2115 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2116 checkBreakBoundary(strsrch, textoffset, &end);
2117 }
2118
2119 strsrch->search->matchedIndex = *textoffset;
2120 strsrch->search->matchedLength = end - *textoffset;
2121 return TRUE;
2122}
2123
2124/**
2125* Rearranges the end accents to try matching.
2126* Suffix accents in the text will be grouped according to their combining
2127* class and the groups will be mixed and matched to try find the perfect
2128* match with the pattern.
2129* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2130* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2131* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2132* "\u0301\u0325".
2133* step 2: check if any of the generated substrings matches the pattern.
2134* Internal method, status assumed to be success, user has to check status
2135* before calling this method.
2136* @param strsrch string search match
2137* @param start offset of the first base character
2138* @param end start of the last accent set
2139* @param status only error status if any
2140* @return USEARCH_DONE if a match is not found, otherwise return the ending
2141* offset of the match. Note this start includes all following accents.
2142*/
2143static
2144int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2145 int32_t start,
2146 int32_t end,
2147 UErrorCode *status)
2148{
2149 const UChar *text = strsrch->search->text;
2150 int32_t tempend = end;
2151
2152 U16_BACK_1(text, 0, tempend);
2153 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2154 LAST_BYTE_MASK_)) {
2155 // die... failed at a base character
2156 return USEARCH_DONE;
2157 }
2158 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2159
2160 if (U_SUCCESS(*status)) {
2161 UChar accents[INITIAL_ARRAY_SIZE_];
2162 int32_t offset = getPreviousBaseOffset(text, end);
2163 // normalizing the offensive string
2164 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2165 INITIAL_ARRAY_SIZE_, status);
2166
2167 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2168 int32_t accentsize = getUnblockedAccentIndex(accents,
2169 accentsindex);
2170 int32_t count = (2 << (accentsize - 1)) - 1;
2171 UChar buffer[INITIAL_ARRAY_SIZE_];
2172 UCollationElements *coleiter = strsrch->utilIter;
2173 while (U_SUCCESS(*status) && count > 0) {
2174 UChar *rearrange = strsrch->canonicalSuffixAccents;
2175 // copy the base characters
2176 for (int k = 0; k < accentsindex[0]; k ++) {
2177 *rearrange ++ = accents[k];
2178 }
2179 // forming all possible canonical rearrangement by dropping
2180 // sets of accents
2181 for (int i = 0; i <= accentsize - 1; i ++) {
2182 int32_t mask = 1 << (accentsize - i - 1);
2183 if (count & mask) {
2184 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2185 *rearrange ++ = accents[j];
2186 }
2187 }
2188 }
2189 *rearrange = 0;
2190 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2191 UChar *match = addToUCharArray(buffer, &matchsize,
2192 strsrch->canonicalPrefixAccents,
2193 strsrch->search->text + start,
2194 offset - start,
2195 strsrch->canonicalSuffixAccents,
2196 status);
2197
2198 // run the collator iterator through this match
2199 // if status is a failure ucol_setText does nothing
2200 ucol_setText(coleiter, match, matchsize, status);
2201 if (U_SUCCESS(*status)) {
2202 if (checkCollationMatch(strsrch, coleiter)) {
2203 if (match != buffer) {
2204 uprv_free(match);
2205 }
2206 return end;
2207 }
2208 }
2209 count --;
2210 }
2211 }
2212 return USEARCH_DONE;
2213}
2214
2215/**
2216* Take the rearranged start accents and tries matching. If match failed at
2217* a seperate following set of accents (seperated from the rearranged on by
2218* at least a base character) then we rearrange the preceding accents and
2219* tries matching again.
2220* We allow skipping of the ends of the accent set if the ces do not match.
2221* However if the failure is found before the accent set, it fails.
2222* Internal method, status assumed to be success, caller has to check status
2223* before calling this method.
2224* @param strsrch string search data
2225* @param textoffset of the ends of the rearranged accent
2226* @param status output error status if any
2227* @return USEARCH_DONE if a match is not found, otherwise return the ending
2228* offset of the match. Note this start includes all following accents.
2229*/
2230static
2231int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2232 int32_t textoffset,
2233 UErrorCode *status)
2234{
2235 const UChar *text = strsrch->search->text;
2236 const UCollator *collator = strsrch->collator;
2237 int32_t safelength = 0;
2238 UChar *safetext;
2239 int32_t safetextlength;
2240 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2241 int32_t safeoffset = textoffset;
2242
2243 if (textoffset &&
2244 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2245 u_strlen(strsrch->canonicalPrefixAccents) - 1
2246 ], collator)) {
2247 safeoffset = getNextSafeOffset(collator, text, textoffset,
2248 strsrch->search->textLength);
2249 safelength = safeoffset - textoffset;
2250 safetextlength = INITIAL_ARRAY_SIZE_;
2251 safetext = addToUCharArray(safebuffer, &safetextlength,
2252 strsrch->canonicalPrefixAccents,
2253 text + textoffset, safelength,
2254 NULL, status);
2255 }
2256 else {
2257 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2258 safetext = strsrch->canonicalPrefixAccents;
2259 }
2260
2261 UCollationElements *coleiter = strsrch->utilIter;
2262 // if status is a failure, ucol_setText does nothing
2263 ucol_setText(coleiter, safetext, safetextlength, status);
2264 // status checked in loop below
2265
2266 int32_t *ce = strsrch->pattern.ces;
2267 int32_t celength = strsrch->pattern.cesLength;
2268 int ceindex = 0;
2269 UBool isSafe = TRUE; // safe zone indication flag for position
2270 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2271
2272 while (ceindex < celength) {
2273 int32_t textce = ucol_next(coleiter, status);
2274 if (U_FAILURE(*status)) {
2275 if (isSafe) {
2276 cleanUpSafeText(strsrch, safetext, safebuffer);
2277 }
2278 return USEARCH_DONE;
2279 }
2280 if (textce == UCOL_NULLORDER) {
2281 // check if we have passed the safe buffer
2282 if (coleiter == strsrch->textIter) {
2283 cleanUpSafeText(strsrch, safetext, safebuffer);
2284 return USEARCH_DONE;
2285 }
2286 cleanUpSafeText(strsrch, safetext, safebuffer);
2287 safetext = safebuffer;
2288 coleiter = strsrch->textIter;
2289 setColEIterOffset(coleiter, safeoffset);
2290 // status checked at the start of the loop
2291 isSafe = FALSE;
2292 continue;
2293 }
2294 textce = getCE(strsrch, textce);
2295 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2296 // do the beginning stuff
2297 int32_t failedoffset = ucol_getOffset(coleiter);
2298 if (isSafe && failedoffset <= prefixlength) {
2299 // alas... no hope. failed at rearranged accent set
2300 cleanUpSafeText(strsrch, safetext, safebuffer);
2301 return USEARCH_DONE;
2302 }
2303 else {
2304 if (isSafe) {
2305 failedoffset = safeoffset - failedoffset;
2306 cleanUpSafeText(strsrch, safetext, safebuffer);
2307 }
2308
2309 // try rearranging the end accents
2310 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2311 textoffset, failedoffset, status);
2312 if (result != USEARCH_DONE) {
2313 // if status is a failure, ucol_setOffset does nothing
2314 setColEIterOffset(strsrch->textIter, result);
2315 }
2316 if (U_FAILURE(*status)) {
2317 return USEARCH_DONE;
2318 }
2319 return result;
2320 }
2321 }
2322 if (textce == ce[ceindex]) {
2323 ceindex ++;
2324 }
2325 }
2326 // set offset here
2327 if (isSafe) {
2328 int32_t result = ucol_getOffset(coleiter);
2329 // sets the text iterator here with the correct expansion and offset
2330 int32_t leftoverces = getExpansionSuffix(coleiter);
2331 cleanUpSafeText(strsrch, safetext, safebuffer);
2332 if (result <= prefixlength) {
2333 result = textoffset;
2334 }
2335 else {
2336 result = textoffset + (safeoffset - result);
2337 }
2338 setColEIterOffset(strsrch->textIter, result);
2339 setExpansionSuffix(strsrch->textIter, leftoverces);
2340 return result;
2341 }
2342
2343 return ucol_getOffset(coleiter);
2344}
2345
2346/**
2347* Trying out the substring and sees if it can be a canonical match.
2348* This will try normalizing the starting accents and arranging them into
2349* canonical equivalents and check their corresponding ces with the pattern ce.
2350* Prefix accents in the text will be grouped according to their combining
2351* class and the groups will be mixed and matched to try find the perfect
2352* match with the pattern.
2353* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2354* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2355* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2356* "\u0301\u0325".
2357* step 2: check if any of the generated substrings matches the pattern.
2358* Internal method, status assumed to be success, caller has to check status
2359* before calling this method.
2360* @param strsrch string search data
2361* @param textoffset start offset in the collation element text that starts
2362* with the accents to be rearranged
2363* @param status output error status if any
2364* @return TRUE if the match is valid, FALSE otherwise
2365*/
2366static
2367UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2368 int32_t textoffset,
2369 UErrorCode *status)
2370{
2371 const UChar *text = strsrch->search->text;
2372 int32_t temp = textoffset;
2373 int32_t textlength = strsrch->search->textLength;
2374 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2375 UCollationElements *coleiter = strsrch->textIter;
2376 int32_t offset = ucol_getOffset(coleiter);
2377 if (strsrch->pattern.hasSuffixAccents) {
2378 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2379 offset, status);
2380 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2381 setColEIterOffset(coleiter, offset);
2382 return TRUE;
2383 }
2384 }
2385 return FALSE;
2386 }
2387
2388 if (!strsrch->pattern.hasPrefixAccents) {
2389 return FALSE;
2390 }
2391
2392 UChar accents[INITIAL_ARRAY_SIZE_];
2393 // offset to the last base character in substring to search
2394 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2395 // normalizing the offensive string
2396 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2397 0, accents, INITIAL_ARRAY_SIZE_, status);
2398 // status checked in loop
2399
2400 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2401 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2402
2403 // 2 power n - 1 plus the full set of accents
2404 int32_t count = (2 << (size - 1)) - 1;
2405 while (U_SUCCESS(*status) && count > 0) {
2406 UChar *rearrange = strsrch->canonicalPrefixAccents;
2407 // copy the base characters
2408 for (int k = 0; k < accentsindex[0]; k ++) {
2409 *rearrange ++ = accents[k];
2410 }
2411 // forming all possible canonical rearrangement by dropping
2412 // sets of accents
2413 for (int i = 0; i <= size - 1; i ++) {
2414 int32_t mask = 1 << (size - i - 1);
2415 if (count & mask) {
2416 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2417 *rearrange ++ = accents[j];
2418 }
2419 }
2420 }
2421 *rearrange = 0;
2422 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2423 baseoffset, status);
2424 if (offset != USEARCH_DONE) {
2425 return TRUE; // match found
2426 }
2427 count --;
2428 }
2429 return FALSE;
2430}
2431
2432/**
2433* Checks match for contraction.
2434* If the match starts with a partial contraction we fail.
2435* Internal method, status assumed to be success, caller has to check status
2436* before calling this method.
2437* @param strsrch string search data
2438* @param start offset of potential match, to be modified if necessary
2439* @param end offset of potential match, to be modified if necessary
2440* @param status only error status if any
2441* @return TRUE if match passes the contraction test, FALSE otherwise
2442*/
2443static
2444UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2445 int32_t *start,
2446 int32_t *end, UErrorCode *status)
2447{
2448 UCollationElements *coleiter = strsrch->textIter;
2449 int32_t textlength = strsrch->search->textLength;
2450 int32_t temp = *end;
2451 const UCollator *collator = strsrch->collator;
2452 const UChar *text = strsrch->search->text;
2453 // This part checks if either if the start of the match contains potential
2454 // contraction. If so we'll have to iterate through them
2455 // Since we used ucol_next while previously looking for the potential
2456 // match, this guarantees that our end will not be a partial contraction,
2457 // or a partial supplementary character.
2458 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2459 int32_t expansion = getExpansionSuffix(coleiter);
2460 UBool expandflag = expansion > 0;
2461 setColEIterOffset(coleiter, *end);
2462 while (expansion > 0) {
2463 // getting rid of the redundant ce
2464 // since forward contraction/expansion may have extra ces
2465 // if we are in the normalization buffer, hasAccentsBeforeMatch
2466 // would have taken care of it.
2467 // E.g. the character \u01FA will have an expansion of 3, but if
2468 // we are only looking for A ring A\u030A, we'll have to skip the
2469 // last ce in the expansion buffer
2470 ucol_previous(coleiter, status);
2471 if (U_FAILURE(*status)) {
2472 return FALSE;
2473 }
2474 if (ucol_getOffset(coleiter) != temp) {
2475 *end = temp;
2476 temp = ucol_getOffset(coleiter);
2477 }
2478 expansion --;
2479 }
2480
2481 int32_t *patternce = strsrch->pattern.ces;
2482 int32_t patterncelength = strsrch->pattern.cesLength;
2483 int32_t count = patterncelength;
2484 while (count > 0) {
2485 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2486 // status checked below, note that if status is a failure
2487 // ucol_previous returns UCOL_NULLORDER
2488 if (ce == UCOL_IGNORABLE) {
2489 continue;
2490 }
2491 if (expandflag && count == 0 &&
2492 getColElemIterOffset(coleiter, FALSE) != temp) {
2493 *end = temp;
2494 temp = ucol_getOffset(coleiter);
2495 }
2496 if (count == patterncelength &&
2497 ce != patternce[patterncelength - 1]) {
2498 // accents may have extra starting ces, this occurs when a
2499 // pure accent pattern is matched without rearrangement
2500 int32_t expected = patternce[patterncelength - 1];
2501 U16_BACK_1(text, 0, *end);
2502 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2503 ce = getCE(strsrch, ucol_previous(coleiter, status));
2504 while (U_SUCCESS(*status) && ce != expected &&
2505 ce != UCOL_NULLORDER &&
2506 ucol_getOffset(coleiter) <= *start) {
2507 ce = getCE(strsrch, ucol_previous(coleiter, status));
2508 }
2509 }
2510 }
2511 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2512 (*start) --;
2513 *start = getPreviousBaseOffset(text, *start);
2514 return FALSE;
2515 }
2516 count --;
2517 }
2518 }
2519 return TRUE;
2520}
2521
2522/**
2523* Checks and sets the match information if found.
2524* Checks
2525* <ul>
2526* <li> the potential match does not repeat the previous match
2527* <li> boundaries are correct
2528* <li> potential match does not end in the middle of a contraction
2529* <li> identical matches
2530* <\ul>
2531* Otherwise the offset will be shifted to the next character.
2532* Internal method, status assumed to be success, caller has to check status
2533* before calling this method.
2534* @param strsrch string search data
2535* @param textoffset offset in the collation element text. the returned value
2536* will be the truncated start offset of the match or the new start
2537* search offset.
2538* @param status only error status if any
2539* @return TRUE if the match is valid, FALSE otherwise
2540*/
2541static
2542inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2543 int32_t *textoffset,
2544 UErrorCode *status)
2545{
2546 // to ensure that the start and ends are not composite characters
2547 UCollationElements *coleiter = strsrch->textIter;
2548 // if we have a canonical accent match
2549 if ((strsrch->pattern.hasSuffixAccents &&
2550 strsrch->canonicalSuffixAccents[0]) ||
2551 (strsrch->pattern.hasPrefixAccents &&
2552 strsrch->canonicalPrefixAccents[0])) {
2553 strsrch->search->matchedIndex = *textoffset;
2554 strsrch->search->matchedLength =
2555 getNextUStringSearchBaseOffset(strsrch,
2556 getColElemIterOffset(coleiter, FALSE))
2557 - *textoffset;
2558 return TRUE;
2559 }
2560
2561 int32_t end = ucol_getOffset(coleiter);
2562 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2563 status) ||
2564 U_FAILURE(*status)) {
2565 return FALSE;
2566 }
2567
2568 end = getNextUStringSearchBaseOffset(strsrch, end);
2569 // this totally matches, however we need to check if it is repeating
2570 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2571 !isBreakUnit(strsrch, *textoffset, end) ||
2572 !checkIdentical(strsrch, *textoffset, end)) {
2573 (*textoffset) --;
2574 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2575 *textoffset);
2576 return FALSE;
2577 }
2578
2579 strsrch->search->matchedIndex = *textoffset;
2580 strsrch->search->matchedLength = end - *textoffset;
2581 return TRUE;
2582}
2583#endif // #if BOYER_MOORE
2584
2585// constructors and destructor -------------------------------------------
2586
2587U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2588 int32_t patternlength,
2589 const UChar *text,
2590 int32_t textlength,
2591 const char *locale,
2592 UBreakIterator *breakiter,
2593 UErrorCode *status)
2594{
2595 if (U_FAILURE(*status)) {
2596 return NULL;
2597 }
2598#if UCONFIG_NO_BREAK_ITERATION
2599 if (breakiter != NULL) {
2600 *status = U_UNSUPPORTED_ERROR;
2601 return NULL;
2602 }
2603#endif
2604 if (locale) {
2605 // ucol_open internally checks for status
2606 UCollator *collator = ucol_open(locale, status);
2607 // pattern, text checks are done in usearch_openFromCollator
2608 UStringSearch *result = usearch_openFromCollator(pattern,
2609 patternlength, text, textlength,
2610 collator, breakiter, status);
2611
2612 if (result == NULL || U_FAILURE(*status)) {
2613 if (collator) {
2614 ucol_close(collator);
2615 }
2616 return NULL;
2617 }
2618 else {
2619 result->ownCollator = TRUE;
2620 }
2621 return result;
2622 }
2623 *status = U_ILLEGAL_ARGUMENT_ERROR;
2624 return NULL;
2625}
2626
2627U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2628 const UChar *pattern,
2629 int32_t patternlength,
2630 const UChar *text,
2631 int32_t textlength,
2632 const UCollator *collator,
2633 UBreakIterator *breakiter,
2634 UErrorCode *status)
2635{
2636 if (U_FAILURE(*status)) {
2637 return NULL;
2638 }
2639#if UCONFIG_NO_BREAK_ITERATION
2640 if (breakiter != NULL) {
2641 *status = U_UNSUPPORTED_ERROR;
2642 return NULL;
2643 }
2644#endif
2645 if (pattern == NULL || text == NULL || collator == NULL) {
2646 *status = U_ILLEGAL_ARGUMENT_ERROR;
2647 return NULL;
2648 }
2649
2650 // string search does not really work when numeric collation is turned on
2651 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2652 *status = U_UNSUPPORTED_ERROR;
2653 return NULL;
2654 }
2655
2656 if (U_SUCCESS(*status)) {
2657 initializeFCD(status);
2658 if (U_FAILURE(*status)) {
2659 return NULL;
2660 }
2661
2662 UStringSearch *result;
2663 if (textlength == -1) {
2664 textlength = u_strlen(text);
2665 }
2666 if (patternlength == -1) {
2667 patternlength = u_strlen(pattern);
2668 }
2669 if (textlength <= 0 || patternlength <= 0) {
2670 *status = U_ILLEGAL_ARGUMENT_ERROR;
2671 return NULL;
2672 }
2673
2674 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2675 if (result == NULL) {
2676 *status = U_MEMORY_ALLOCATION_ERROR;
2677 return NULL;
2678 }
2679
2680 result->collator = collator;
2681 result->strength = ucol_getStrength(collator);
2682 result->ceMask = getMask(result->strength);
2683 result->toShift =
2684 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2685 UCOL_SHIFTED;
2686 result->variableTop = ucol_getVariableTop(collator, status);
2687
2688 result->nfd = Normalizer2::getNFDInstance(*status);
2689
2690 if (U_FAILURE(*status)) {
2691 uprv_free(result);
2692 return NULL;
2693 }
2694
2695 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2696 if (result->search == NULL) {
2697 *status = U_MEMORY_ALLOCATION_ERROR;
2698 uprv_free(result);
2699 return NULL;
2700 }
2701
2702 result->search->text = text;
2703 result->search->textLength = textlength;
2704
2705 result->pattern.text = pattern;
2706 result->pattern.textLength = patternlength;
2707 result->pattern.ces = NULL;
2708 result->pattern.pces = NULL;
2709
2710 result->search->breakIter = breakiter;
2711#if !UCONFIG_NO_BREAK_ITERATION
2712 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2713 if (breakiter) {
2714 ubrk_setText(breakiter, text, textlength, status);
2715 }
2716#endif
2717
2718 result->ownCollator = FALSE;
2719 result->search->matchedLength = 0;
2720 result->search->matchedIndex = USEARCH_DONE;
2721 result->utilIter = NULL;
2722 result->textIter = ucol_openElements(collator, text,
2723 textlength, status);
2724 result->textProcessedIter = NULL;
2725 if (U_FAILURE(*status)) {
2726 usearch_close(result);
2727 return NULL;
2728 }
2729
2730 result->search->isOverlap = FALSE;
2731 result->search->isCanonicalMatch = FALSE;
2732 result->search->elementComparisonType = 0;
2733 result->search->isForwardSearching = TRUE;
2734 result->search->reset = TRUE;
2735
2736 initialize(result, status);
2737
2738 if (U_FAILURE(*status)) {
2739 usearch_close(result);
2740 return NULL;
2741 }
2742
2743 return result;
2744 }
2745 return NULL;
2746}
2747
2748U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2749{
2750 if (strsrch) {
2751 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2752 strsrch->pattern.ces) {
2753 uprv_free(strsrch->pattern.ces);
2754 }
2755
2756 if (strsrch->pattern.pces != NULL &&
2757 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2758 uprv_free(strsrch->pattern.pces);
2759 }
2760
2761 delete strsrch->textProcessedIter;
2762 ucol_closeElements(strsrch->textIter);
2763 ucol_closeElements(strsrch->utilIter);
2764
2765 if (strsrch->ownCollator && strsrch->collator) {
2766 ucol_close((UCollator *)strsrch->collator);
2767 }
2768
2769#if !UCONFIG_NO_BREAK_ITERATION
2770 if (strsrch->search->internalBreakIter) {
2771 ubrk_close(strsrch->search->internalBreakIter);
2772 }
2773#endif
2774
2775 uprv_free(strsrch->search);
2776 uprv_free(strsrch);
2777 }
2778}
2779
2780namespace {
2781
2782UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2783 if (U_FAILURE(*status)) { return FALSE; }
2784 if (strsrch->textProcessedIter == NULL) {
2785 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2786 if (strsrch->textProcessedIter == NULL) {
2787 *status = U_MEMORY_ALLOCATION_ERROR;
2788 return FALSE;
2789 }
2790 } else {
2791 strsrch->textProcessedIter->init(strsrch->textIter);
2792 }
2793 return TRUE;
2794}
2795
2796}
2797
2798// set and get methods --------------------------------------------------
2799
2800U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2801 int32_t position,
2802 UErrorCode *status)
2803{
2804 if (U_SUCCESS(*status) && strsrch) {
2805 if (isOutOfBounds(strsrch->search->textLength, position)) {
2806 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2807 }
2808 else {
2809 setColEIterOffset(strsrch->textIter, position);
2810 }
2811 strsrch->search->matchedIndex = USEARCH_DONE;
2812 strsrch->search->matchedLength = 0;
2813 strsrch->search->reset = FALSE;
2814 }
2815}
2816
2817U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2818{
2819 if (strsrch) {
2820 int32_t result = ucol_getOffset(strsrch->textIter);
2821 if (isOutOfBounds(strsrch->search->textLength, result)) {
2822 return USEARCH_DONE;
2823 }
2824 return result;
2825 }
2826 return USEARCH_DONE;
2827}
2828
2829U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2830 USearchAttribute attribute,
2831 USearchAttributeValue value,
2832 UErrorCode *status)
2833{
2834 if (U_SUCCESS(*status) && strsrch) {
2835 switch (attribute)
2836 {
2837 case USEARCH_OVERLAP :
2838 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2839 break;
2840 case USEARCH_CANONICAL_MATCH :
2841 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2842 FALSE);
2843 break;
2844 case USEARCH_ELEMENT_COMPARISON :
2845 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2846 strsrch->search->elementComparisonType = (int16_t)value;
2847 } else {
2848 strsrch->search->elementComparisonType = 0;
2849 }
2850 break;
2851 case USEARCH_ATTRIBUTE_COUNT :
2852 default:
2853 *status = U_ILLEGAL_ARGUMENT_ERROR;
2854 }
2855 }
2856 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2857 *status = U_ILLEGAL_ARGUMENT_ERROR;
2858 }
2859}
2860
2861U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2862 const UStringSearch *strsrch,
2863 USearchAttribute attribute)
2864{
2865 if (strsrch) {
2866 switch (attribute) {
2867 case USEARCH_OVERLAP :
2868 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2869 USEARCH_OFF);
2870 case USEARCH_CANONICAL_MATCH :
2871 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2872 USEARCH_OFF);
2873 case USEARCH_ELEMENT_COMPARISON :
2874 {
2875 int16_t value = strsrch->search->elementComparisonType;
2876 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2877 return (USearchAttributeValue)value;
2878 } else {
2879 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2880 }
2881 }
2882 case USEARCH_ATTRIBUTE_COUNT :
2883 return USEARCH_DEFAULT;
2884 }
2885 }
2886 return USEARCH_DEFAULT;
2887}
2888
2889U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2890 const UStringSearch *strsrch)
2891{
2892 if (strsrch == NULL) {
2893 return USEARCH_DONE;
2894 }
2895 return strsrch->search->matchedIndex;
2896}
2897
2898
2899U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2900 UChar *result,
2901 int32_t resultCapacity,
2902 UErrorCode *status)
2903{
2904 if (U_FAILURE(*status)) {
2905 return USEARCH_DONE;
2906 }
2907 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2908 result == NULL)) {
2909 *status = U_ILLEGAL_ARGUMENT_ERROR;
2910 return USEARCH_DONE;
2911 }
2912
2913 int32_t copylength = strsrch->search->matchedLength;
2914 int32_t copyindex = strsrch->search->matchedIndex;
2915 if (copyindex == USEARCH_DONE) {
2916 u_terminateUChars(result, resultCapacity, 0, status);
2917 return USEARCH_DONE;
2918 }
2919
2920 if (resultCapacity < copylength) {
2921 copylength = resultCapacity;
2922 }
2923 if (copylength > 0) {
2924 uprv_memcpy(result, strsrch->search->text + copyindex,
2925 copylength * sizeof(UChar));
2926 }
2927 return u_terminateUChars(result, resultCapacity,
2928 strsrch->search->matchedLength, status);
2929}
2930
2931U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2932 const UStringSearch *strsrch)
2933{
2934 if (strsrch) {
2935 return strsrch->search->matchedLength;
2936 }
2937 return USEARCH_DONE;
2938}
2939
2940#if !UCONFIG_NO_BREAK_ITERATION
2941
2942U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2943 UBreakIterator *breakiter,
2944 UErrorCode *status)
2945{
2946 if (U_SUCCESS(*status) && strsrch) {
2947 strsrch->search->breakIter = breakiter;
2948 if (breakiter) {
2949 ubrk_setText(breakiter, strsrch->search->text,
2950 strsrch->search->textLength, status);
2951 }
2952 }
2953}
2954
2955U_CAPI const UBreakIterator* U_EXPORT2
2956usearch_getBreakIterator(const UStringSearch *strsrch)
2957{
2958 if (strsrch) {
2959 return strsrch->search->breakIter;
2960 }
2961 return NULL;
2962}
2963
2964#endif
2965
2966U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2967 const UChar *text,
2968 int32_t textlength,
2969 UErrorCode *status)
2970{
2971 if (U_SUCCESS(*status)) {
2972 if (strsrch == NULL || text == NULL || textlength < -1 ||
2973 textlength == 0) {
2974 *status = U_ILLEGAL_ARGUMENT_ERROR;
2975 }
2976 else {
2977 if (textlength == -1) {
2978 textlength = u_strlen(text);
2979 }
2980 strsrch->search->text = text;
2981 strsrch->search->textLength = textlength;
2982 ucol_setText(strsrch->textIter, text, textlength, status);
2983 strsrch->search->matchedIndex = USEARCH_DONE;
2984 strsrch->search->matchedLength = 0;
2985 strsrch->search->reset = TRUE;
2986#if !UCONFIG_NO_BREAK_ITERATION
2987 if (strsrch->search->breakIter != NULL) {
2988 ubrk_setText(strsrch->search->breakIter, text,
2989 textlength, status);
2990 }
2991 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2992#endif
2993 }
2994 }
2995}
2996
2997U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2998 int32_t *length)
2999{
3000 if (strsrch) {
3001 *length = strsrch->search->textLength;
3002 return strsrch->search->text;
3003 }
3004 return NULL;
3005}
3006
3007U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
3008 const UCollator *collator,
3009 UErrorCode *status)
3010{
3011 if (U_SUCCESS(*status)) {
3012 if (collator == NULL) {
3013 *status = U_ILLEGAL_ARGUMENT_ERROR;
3014 return;
3015 }
3016
3017 if (strsrch) {
3018 delete strsrch->textProcessedIter;
3019 strsrch->textProcessedIter = NULL;
3020 ucol_closeElements(strsrch->textIter);
3021 ucol_closeElements(strsrch->utilIter);
3022 strsrch->textIter = strsrch->utilIter = NULL;
3023 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3024 ucol_close((UCollator *)strsrch->collator);
3025 strsrch->ownCollator = FALSE;
3026 }
3027 strsrch->collator = collator;
3028 strsrch->strength = ucol_getStrength(collator);
3029 strsrch->ceMask = getMask(strsrch->strength);
3030#if !UCONFIG_NO_BREAK_ITERATION
3031 ubrk_close(strsrch->search->internalBreakIter);
3032 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3033 strsrch->search->text, strsrch->search->textLength, status);
3034#endif
3035 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3036 strsrch->toShift =
3037 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3038 UCOL_SHIFTED;
3039 // if status is a failure, ucol_getVariableTop returns 0
3040 strsrch->variableTop = ucol_getVariableTop(collator, status);
3041 strsrch->textIter = ucol_openElements(collator,
3042 strsrch->search->text,
3043 strsrch->search->textLength,
3044 status);
3045 strsrch->utilIter = ucol_openElements(
3046 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3047 // initialize() _after_ setting the iterators for the new collator.
3048 initialize(strsrch, status);
3049 }
3050
3051 // **** are these calls needed?
3052 // **** we call uprv_init_pce in initializePatternPCETable
3053 // **** and the CEIBuffer constructor...
3054#if 0
3055 uprv_init_pce(strsrch->textIter);
3056 uprv_init_pce(strsrch->utilIter);
3057#endif
3058 }
3059}
3060
3061U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3062{
3063 if (strsrch) {
3064 return (UCollator *)strsrch->collator;
3065 }
3066 return NULL;
3067}
3068
3069U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
3070 const UChar *pattern,
3071 int32_t patternlength,
3072 UErrorCode *status)
3073{
3074 if (U_SUCCESS(*status)) {
3075 if (strsrch == NULL || pattern == NULL) {
3076 *status = U_ILLEGAL_ARGUMENT_ERROR;
3077 }
3078 else {
3079 if (patternlength == -1) {
3080 patternlength = u_strlen(pattern);
3081 }
3082 if (patternlength == 0) {
3083 *status = U_ILLEGAL_ARGUMENT_ERROR;
3084 return;
3085 }
3086 strsrch->pattern.text = pattern;
3087 strsrch->pattern.textLength = patternlength;
3088 initialize(strsrch, status);
3089 }
3090 }
3091}
3092
3093U_CAPI const UChar* U_EXPORT2
3094usearch_getPattern(const UStringSearch *strsrch,
3095 int32_t *length)
3096{
3097 if (strsrch) {
3098 *length = strsrch->pattern.textLength;
3099 return strsrch->pattern.text;
3100 }
3101 return NULL;
3102}
3103
3104// miscellanous methods --------------------------------------------------
3105
3106U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3107 UErrorCode *status)
3108{
3109 if (strsrch && U_SUCCESS(*status)) {
3110 strsrch->search->isForwardSearching = TRUE;
3111 usearch_setOffset(strsrch, 0, status);
3112 if (U_SUCCESS(*status)) {
3113 return usearch_next(strsrch, status);
3114 }
3115 }
3116 return USEARCH_DONE;
3117}
3118
3119U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3120 int32_t position,
3121 UErrorCode *status)
3122{
3123 if (strsrch && U_SUCCESS(*status)) {
3124 strsrch->search->isForwardSearching = TRUE;
3125 // position checked in usearch_setOffset
3126 usearch_setOffset(strsrch, position, status);
3127 if (U_SUCCESS(*status)) {
3128 return usearch_next(strsrch, status);
3129 }
3130 }
3131 return USEARCH_DONE;
3132}
3133
3134U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3135 UErrorCode *status)
3136{
3137 if (strsrch && U_SUCCESS(*status)) {
3138 strsrch->search->isForwardSearching = FALSE;
3139 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3140 if (U_SUCCESS(*status)) {
3141 return usearch_previous(strsrch, status);
3142 }
3143 }
3144 return USEARCH_DONE;
3145}
3146
3147U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3148 int32_t position,
3149 UErrorCode *status)
3150{
3151 if (strsrch && U_SUCCESS(*status)) {
3152 strsrch->search->isForwardSearching = FALSE;
3153 // position checked in usearch_setOffset
3154 usearch_setOffset(strsrch, position, status);
3155 if (U_SUCCESS(*status)) {
3156 return usearch_previous(strsrch, status);
3157 }
3158 }
3159 return USEARCH_DONE;
3160}
3161
3162/**
3163* If a direction switch is required, we'll count the number of ces till the
3164* beginning of the collation element iterator and iterate forwards that
3165* number of times. This is so that we get to the correct point within the
3166* string to continue the search in. Imagine when we are in the middle of the
3167* normalization buffer when the change in direction is request. arrrgghh....
3168* After searching the offset within the collation element iterator will be
3169* shifted to the start of the match. If a match is not found, the offset would
3170* have been set to the end of the text string in the collation element
3171* iterator.
3172* Okay, here's my take on normalization buffer. The only time when there can
3173* be 2 matches within the same normalization is when the pattern is consists
3174* of all accents. But since the offset returned is from the text string, we
3175* should not confuse the caller by returning the second match within the
3176* same normalization buffer. If we do, the 2 results will have the same match
3177* offsets, and that'll be confusing. I'll return the next match that doesn't
3178* fall within the same normalization buffer. Note this does not affect the
3179* results of matches spanning the text and the normalization buffer.
3180* The position to start searching is taken from the collation element
3181* iterator. Callers of this API would have to set the offset in the collation
3182* element iterator before using this method.
3183*/
3184U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3185 UErrorCode *status)
3186{
3187 if (U_SUCCESS(*status) && strsrch) {
3188 // note offset is either equivalent to the start of the previous match
3189 // or is set by the user
3190 int32_t offset = usearch_getOffset(strsrch);
3191 USearch *search = strsrch->search;
3192 search->reset = FALSE;
3193 int32_t textlength = search->textLength;
3194 if (search->isForwardSearching) {
3195#if BOYER_MOORE
3196 if (offset == textlength
3197 || (!search->isOverlap &&
3198 (offset + strsrch->pattern.defaultShiftSize > textlength ||
3199 (search->matchedIndex != USEARCH_DONE &&
3200 offset + search->matchedLength >= textlength)))) {
3201 // not enough characters to match
3202 setMatchNotFound(strsrch);
3203 return USEARCH_DONE;
3204 }
3205#else
3206 if (offset == textlength ||
3207 (! search->isOverlap &&
3208 (search->matchedIndex != USEARCH_DONE &&
3209 offset + search->matchedLength > textlength))) {
3210 // not enough characters to match
3211 setMatchNotFound(strsrch);
3212 return USEARCH_DONE;
3213 }
3214#endif
3215 }
3216 else {
3217 // switching direction.
3218 // if matchedIndex == USEARCH_DONE, it means that either a
3219 // setOffset has been called or that previous ran off the text
3220 // string. the iterator would have been set to offset 0 if a
3221 // match is not found.
3222 search->isForwardSearching = TRUE;
3223 if (search->matchedIndex != USEARCH_DONE) {
3224 // there's no need to set the collation element iterator
3225 // the next call to next will set the offset.
3226 return search->matchedIndex;
3227 }
3228 }
3229
3230 if (U_SUCCESS(*status)) {
3231 if (strsrch->pattern.cesLength == 0) {
3232 if (search->matchedIndex == USEARCH_DONE) {
3233 search->matchedIndex = offset;
3234 }
3235 else { // moves by codepoints
3236 U16_FWD_1(search->text, search->matchedIndex, textlength);
3237 }
3238
3239 search->matchedLength = 0;
3240 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3241 // status checked below
3242 if (search->matchedIndex == textlength) {
3243 search->matchedIndex = USEARCH_DONE;
3244 }
3245 }
3246 else {
3247 if (search->matchedLength > 0) {
3248 // if matchlength is 0 we are at the start of the iteration
3249 if (search->isOverlap) {
3250 ucol_setOffset(strsrch->textIter, offset + 1, status);
3251 }
3252 else {
3253 ucol_setOffset(strsrch->textIter,
3254 offset + search->matchedLength, status);
3255 }
3256 }
3257 else {
3258 // for boundary check purposes. this will ensure that the
3259 // next match will not preceed the current offset
3260 // note search->matchedIndex will always be set to something
3261 // in the code
3262 search->matchedIndex = offset - 1;
3263 }
3264
3265 if (search->isCanonicalMatch) {
3266 // can't use exact here since extra accents are allowed.
3267 usearch_handleNextCanonical(strsrch, status);
3268 }
3269 else {
3270 usearch_handleNextExact(strsrch, status);
3271 }
3272 }
3273
3274 if (U_FAILURE(*status)) {
3275 return USEARCH_DONE;
3276 }
3277
3278#if !BOYER_MOORE
3279 if (search->matchedIndex == USEARCH_DONE) {
3280 ucol_setOffset(strsrch->textIter, search->textLength, status);
3281 } else {
3282 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3283 }
3284#endif
3285
3286 return search->matchedIndex;
3287 }
3288 }
3289 return USEARCH_DONE;
3290}
3291
3292U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3293 UErrorCode *status)
3294{
3295 if (U_SUCCESS(*status) && strsrch) {
3296 int32_t offset;
3297 USearch *search = strsrch->search;
3298 if (search->reset) {
3299 offset = search->textLength;
3300 search->isForwardSearching = FALSE;
3301 search->reset = FALSE;
3302 setColEIterOffset(strsrch->textIter, offset);
3303 }
3304 else {
3305 offset = usearch_getOffset(strsrch);
3306 }
3307
3308 int32_t matchedindex = search->matchedIndex;
3309 if (search->isForwardSearching == TRUE) {
3310 // switching direction.
3311 // if matchedIndex == USEARCH_DONE, it means that either a
3312 // setOffset has been called or that next ran off the text
3313 // string. the iterator would have been set to offset textLength if
3314 // a match is not found.
3315 search->isForwardSearching = FALSE;
3316 if (matchedindex != USEARCH_DONE) {
3317 return matchedindex;
3318 }
3319 }
3320 else {
3321#if BOYER_MOORE
3322 if (offset == 0 || matchedindex == 0 ||
3323 (!search->isOverlap &&
3324 (offset < strsrch->pattern.defaultShiftSize ||
3325 (matchedindex != USEARCH_DONE &&
3326 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3327 // not enough characters to match
3328 setMatchNotFound(strsrch);
3329 return USEARCH_DONE;
3330 }
3331#else
3332 // Could check pattern length, but the
3333 // linear search will do the right thing
3334 if (offset == 0 || matchedindex == 0) {
3335 setMatchNotFound(strsrch);
3336 return USEARCH_DONE;
3337 }
3338#endif
3339 }
3340
3341 if (U_SUCCESS(*status)) {
3342 if (strsrch->pattern.cesLength == 0) {
3343 search->matchedIndex =
3344 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3345 if (search->matchedIndex == 0) {
3346 setMatchNotFound(strsrch);
3347 // status checked below
3348 }
3349 else { // move by codepoints
3350 U16_BACK_1(search->text, 0, search->matchedIndex);
3351 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3352 // status checked below
3353 search->matchedLength = 0;
3354 }
3355 }
3356 else {
3357 if (strsrch->search->isCanonicalMatch) {
3358 // can't use exact here since extra accents are allowed.
3359 usearch_handlePreviousCanonical(strsrch, status);
3360 // status checked below
3361 }
3362 else {
3363 usearch_handlePreviousExact(strsrch, status);
3364 // status checked below
3365 }
3366 }
3367
3368 if (U_FAILURE(*status)) {
3369 return USEARCH_DONE;
3370 }
3371
3372 return search->matchedIndex;
3373 }
3374 }
3375 return USEARCH_DONE;
3376}
3377
3378
3379
3380U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3381{
3382 /*
3383 reset is setting the attributes that are already in
3384 string search, hence all attributes in the collator should
3385 be retrieved without any problems
3386 */
3387 if (strsrch) {
3388 UErrorCode status = U_ZERO_ERROR;
3389 UBool sameCollAttribute = TRUE;
3390 uint32_t ceMask;
3391 UBool shift;
3392 uint32_t varTop;
3393
3394 // **** hack to deal w/ how processed CEs encode quaternary ****
3395 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3396 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3397 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3398 sameCollAttribute = FALSE;
3399 }
3400
3401 strsrch->strength = ucol_getStrength(strsrch->collator);
3402 ceMask = getMask(strsrch->strength);
3403 if (strsrch->ceMask != ceMask) {
3404 strsrch->ceMask = ceMask;
3405 sameCollAttribute = FALSE;
3406 }
3407
3408 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3409 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3410 &status) == UCOL_SHIFTED;
3411 if (strsrch->toShift != shift) {
3412 strsrch->toShift = shift;
3413 sameCollAttribute = FALSE;
3414 }
3415
3416 // if status is a failure, ucol_getVariableTop returns 0
3417 varTop = ucol_getVariableTop(strsrch->collator, &status);
3418 if (strsrch->variableTop != varTop) {
3419 strsrch->variableTop = varTop;
3420 sameCollAttribute = FALSE;
3421 }
3422 if (!sameCollAttribute) {
3423 initialize(strsrch, &status);
3424 }
3425 ucol_setText(strsrch->textIter, strsrch->search->text,
3426 strsrch->search->textLength,
3427 &status);
3428 strsrch->search->matchedLength = 0;
3429 strsrch->search->matchedIndex = USEARCH_DONE;
3430 strsrch->search->isOverlap = FALSE;
3431 strsrch->search->isCanonicalMatch = FALSE;
3432 strsrch->search->elementComparisonType = 0;
3433 strsrch->search->isForwardSearching = TRUE;
3434 strsrch->search->reset = TRUE;
3435 }
3436}
3437
3438//
3439// CEI Collation Element + source text index.
3440// These structs are kept in the circular buffer.
3441//
3442struct CEI {
3443 int64_t ce;
3444 int32_t lowIndex;
3445 int32_t highIndex;
3446};
3447
3448U_NAMESPACE_BEGIN
3449
3450namespace {
3451//
3452// CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3453//
3454#define DEFAULT_CEBUFFER_SIZE 96
3455#define CEBUFFER_EXTRA 32
3456// Some typical max values to make buffer size more reasonable for asymmetric search.
3457// #8694 is for a better long-term solution to allocation of this buffer.
3458#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3459#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3460#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3461struct CEIBuffer {
3462 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3463 CEI *buf;
3464 int32_t bufSize;
3465 int32_t firstIx;
3466 int32_t limitIx;
3467 UCollationElements *ceIter;
3468 UStringSearch *strSearch;
3469
3470
3471
3472 CEIBuffer(UStringSearch *ss, UErrorCode *status);
3473 ~CEIBuffer();
3474 const CEI *get(int32_t index);
3475 const CEI *getPrevious(int32_t index);
3476};
3477
3478
3479CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3480 buf = defBuf;
3481 strSearch = ss;
3482 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3483 if (ss->search->elementComparisonType != 0) {
3484 const UChar * patText = ss->pattern.text;
3485 if (patText) {
3486 const UChar * patTextLimit = patText + ss->pattern.textLength;
3487 while ( patText < patTextLimit ) {
3488 UChar c = *patText++;
3489 if (MIGHT_BE_JAMO_L(c)) {
3490 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3491 } else {
3492 // No check for surrogates, we might allocate slightly more buffer than necessary.
3493 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3494 }
3495 }
3496 }
3497 }
3498 ceIter = ss->textIter;
3499 firstIx = 0;
3500 limitIx = 0;
3501
3502 if (!initTextProcessedIter(ss, status)) { return; }
3503
3504 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3505 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3506 if (buf == NULL) {
3507 *status = U_MEMORY_ALLOCATION_ERROR;
3508 }
3509 }
3510}
3511
3512// TODO: add a reset or init function so that allocated
3513// buffers can be retained & reused.
3514
3515CEIBuffer::~CEIBuffer() {
3516 if (buf != defBuf) {
3517 uprv_free(buf);
3518 }
3519}
3520
3521
3522// Get the CE with the specified index.
3523// Index must be in the range
3524// n-history_size < index < n+1
3525// where n is the largest index to have been fetched by some previous call to this function.
3526// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3527//
3528const CEI *CEIBuffer::get(int32_t index) {
3529 int i = index % bufSize;
3530
3531 if (index>=firstIx && index<limitIx) {
3532 // The request was for an entry already in our buffer.
3533 // Just return it.
3534 return &buf[i];
3535 }
3536
3537 // Caller is requesting a new, never accessed before, CE.
3538 // Verify that it is the next one in sequence, which is all
3539 // that is allowed.
3540 if (index != limitIx) {
3541 U_ASSERT(FALSE);
3542
3543 return NULL;
3544 }
3545
3546 // Manage the circular CE buffer indexing
3547 limitIx++;
3548
3549 if (limitIx - firstIx >= bufSize) {
3550 // The buffer is full, knock out the lowest-indexed entry.
3551 firstIx++;
3552 }
3553
3554 UErrorCode status = U_ZERO_ERROR;
3555
3556 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3557
3558 return &buf[i];
3559}
3560
3561// Get the CE with the specified index.
3562// Index must be in the range
3563// n-history_size < index < n+1
3564// where n is the largest index to have been fetched by some previous call to this function.
3565// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3566//
3567const CEI *CEIBuffer::getPrevious(int32_t index) {
3568 int i = index % bufSize;
3569
3570 if (index>=firstIx && index<limitIx) {
3571 // The request was for an entry already in our buffer.
3572 // Just return it.
3573 return &buf[i];
3574 }
3575
3576 // Caller is requesting a new, never accessed before, CE.
3577 // Verify that it is the next one in sequence, which is all
3578 // that is allowed.
3579 if (index != limitIx) {
3580 U_ASSERT(FALSE);
3581
3582 return NULL;
3583 }
3584
3585 // Manage the circular CE buffer indexing
3586 limitIx++;
3587
3588 if (limitIx - firstIx >= bufSize) {
3589 // The buffer is full, knock out the lowest-indexed entry.
3590 firstIx++;
3591 }
3592
3593 UErrorCode status = U_ZERO_ERROR;
3594
3595 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3596
3597 return &buf[i];
3598}
3599
3600}
3601
3602U_NAMESPACE_END
3603
3604
3605// #define USEARCH_DEBUG
3606
3607#ifdef USEARCH_DEBUG
3608#include <stdio.h>
3609#include <stdlib.h>
3610#endif
3611
3612/*
3613 * Find the next break boundary after startIndex. If the UStringSearch object
3614 * has an external break iterator, use that. Otherwise use the internal character
3615 * break iterator.
3616 */
3617static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3618#if 0
3619 const UChar *text = strsrch->search->text;
3620 int32_t textLen = strsrch->search->textLength;
3621
3622 U_ASSERT(startIndex>=0);
3623 U_ASSERT(startIndex<=textLen);
3624
3625 if (startIndex >= textLen) {
3626 return startIndex;
3627 }
3628
3629 UChar32 c;
3630 int32_t i = startIndex;
3631 U16_NEXT(text, i, textLen, c);
3632
3633 // If we are on a control character, stop without looking for combining marks.
3634 // Control characters do not combine.
3635 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3636 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3637 return i;
3638 }
3639
3640 // The initial character was not a control, and can thus accept trailing
3641 // combining characters. Advance over however many of them there are.
3642 int32_t indexOfLastCharChecked;
3643 for (;;) {
3644 indexOfLastCharChecked = i;
3645 if (i>=textLen) {
3646 break;
3647 }
3648 U16_NEXT(text, i, textLen, c);
3649 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3650 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3651 break;
3652 }
3653 }
3654 return indexOfLastCharChecked;
3655#elif !UCONFIG_NO_BREAK_ITERATION
3656 UBreakIterator *breakiterator = strsrch->search->breakIter;
3657
3658 if (breakiterator == NULL) {
3659 breakiterator = strsrch->search->internalBreakIter;
3660 }
3661
3662 if (breakiterator != NULL) {
3663 return ubrk_following(breakiterator, startIndex);
3664 }
3665
3666 return startIndex;
3667#else
3668 // **** or should we use the original code? ****
3669 return startIndex;
3670#endif
3671
3672}
3673
3674/*
3675 * Returns TRUE if index is on a break boundary. If the UStringSearch
3676 * has an external break iterator, test using that, otherwise test
3677 * using the internal character break iterator.
3678 */
3679static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3680#if 0
3681 const UChar *text = strsrch->search->text;
3682 int32_t textLen = strsrch->search->textLength;
3683
3684 U_ASSERT(index>=0);
3685 U_ASSERT(index<=textLen);
3686
3687 if (index>=textLen || index<=0) {
3688 return TRUE;
3689 }
3690
3691 // If the character at the current index is not a GRAPHEME_EXTEND
3692 // then we can not be within a combining sequence.
3693 UChar32 c;
3694 U16_GET(text, 0, index, textLen, c);
3695 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3696 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3697 return TRUE;
3698 }
3699
3700 // We are at a combining mark. If the preceding character is anything
3701 // except a CONTROL, CR or LF, we are in a combining sequence.
3702 U16_PREV(text, 0, index, c);
3703 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3704 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3705 return !combining;
3706#elif !UCONFIG_NO_BREAK_ITERATION
3707 UBreakIterator *breakiterator = strsrch->search->breakIter;
3708
3709 if (breakiterator == NULL) {
3710 breakiterator = strsrch->search->internalBreakIter;
3711 }
3712
3713 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3714#else
3715 // **** or use the original code? ****
3716 return TRUE;
3717#endif
3718}
3719
3720#if 0
3721static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3722{
3723#if !UCONFIG_NO_BREAK_ITERATION
3724 UBreakIterator *breakiterator = strsrch->search->breakIter;
3725
3726 if (breakiterator != NULL) {
3727 int32_t startindex = ubrk_first(breakiterator);
3728 int32_t endindex = ubrk_last(breakiterator);
3729
3730 // out-of-range indexes are never boundary positions
3731 if (start < startindex || start > endindex ||
3732 end < startindex || end > endindex) {
3733 return FALSE;
3734 }
3735
3736 return ubrk_isBoundary(breakiterator, start) &&
3737 ubrk_isBoundary(breakiterator, end);
3738 }
3739#endif
3740
3741 return TRUE;
3742}
3743#endif
3744
3745typedef enum {
3746 U_CE_MATCH = -1,
3747 U_CE_NO_MATCH = 0,
3748 U_CE_SKIP_TARG,
3749 U_CE_SKIP_PATN
3750} UCompareCEsResult;
3751#define U_CE_LEVEL2_BASE 0x00000005
3752#define U_CE_LEVEL3_BASE 0x00050000
3753
3754static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3755 if (targCE == patCE) {
3756 return U_CE_MATCH;
3757 }
3758 if (compareType == 0) {
3759 return U_CE_NO_MATCH;
3760 }
3761
3762 int64_t targCEshifted = targCE >> 32;
3763 int64_t patCEshifted = patCE >> 32;
3764 int64_t mask;
3765
3766 mask = 0xFFFF0000;
3767 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3768 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3769 if ( targLev1 != patLev1 ) {
3770 if ( targLev1 == 0 ) {
3771 return U_CE_SKIP_TARG;
3772 }
3773 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3774 return U_CE_SKIP_PATN;
3775 }
3776 return U_CE_NO_MATCH;
3777 }
3778
3779 mask = 0x0000FFFF;
3780 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3781 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3782 if ( targLev2 != patLev2 ) {
3783 if ( targLev2 == 0 ) {
3784 return U_CE_SKIP_TARG;
3785 }
3786 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3787 return U_CE_SKIP_PATN;
3788 }
3789 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3790 U_CE_MATCH: U_CE_NO_MATCH;
3791 }
3792
3793 mask = 0xFFFF0000;
3794 int32_t targLev3 = (int32_t)(targCE & mask);
3795 int32_t patLev3 = (int32_t)(patCE & mask);
3796 if ( targLev3 != patLev3 ) {
3797 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3798 U_CE_MATCH: U_CE_NO_MATCH;
3799 }
3800
3801 return U_CE_MATCH;
3802}
3803
3804#if BOYER_MOORE
3805// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3806#endif
3807
3808U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3809 int32_t startIdx,
3810 int32_t *matchStart,
3811 int32_t *matchLimit,
3812 UErrorCode *status)
3813{
3814 if (U_FAILURE(*status)) {
3815 return FALSE;
3816 }
3817
3818 // TODO: reject search patterns beginning with a combining char.
3819
3820#ifdef USEARCH_DEBUG
3821 if (getenv("USEARCH_DEBUG") != NULL) {
3822 printf("Pattern CEs\n");
3823 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3824 printf(" %8x", strsrch->pattern.ces[ii]);
3825 }
3826 printf("\n");
3827 }
3828
3829#endif
3830 // Input parameter sanity check.
3831 // TODO: should input indicies clip to the text length
3832 // in the same way that UText does.
3833 if(strsrch->pattern.cesLength == 0 ||
3834 startIdx < 0 ||
3835 startIdx > strsrch->search->textLength ||
3836 strsrch->pattern.ces == NULL) {
3837 *status = U_ILLEGAL_ARGUMENT_ERROR;
3838 return FALSE;
3839 }
3840
3841 if (strsrch->pattern.pces == NULL) {
3842 initializePatternPCETable(strsrch, status);
3843 }
3844
3845 ucol_setOffset(strsrch->textIter, startIdx, status);
3846 CEIBuffer ceb(strsrch, status);
3847
3848
3849 int32_t targetIx = 0;
3850 const CEI *targetCEI = NULL;
3851 int32_t patIx;
3852 UBool found;
3853
3854 int32_t mStart = -1;
3855 int32_t mLimit = -1;
3856 int32_t minLimit;
3857 int32_t maxLimit;
3858
3859
3860
3861 // Outer loop moves over match starting positions in the
3862 // target CE space.
3863 // Here we see the target as a sequence of collation elements, resulting from the following:
3864 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3865 // (for example, digraphs such as IJ may be broken into two characters).
3866 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3867 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3868 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3869 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3870 // then the CE is deleted, so the following code sees only CEs that are relevant.
3871 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3872 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3873 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3874 //
3875 for(targetIx=0; ; targetIx++)
3876 {
3877 found = TRUE;
3878 // Inner loop checks for a match beginning at each
3879 // position from the outer loop.
3880 int32_t targetIxOffset = 0;
3881 int64_t patCE = 0;
3882 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3883 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3884 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3885 const CEI *firstCEI = ceb.get(targetIx);
3886 if (firstCEI == NULL) {
3887 *status = U_INTERNAL_PROGRAM_ERROR;
3888 found = FALSE;
3889 break;
3890 }
3891
3892 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3893 patCE = strsrch->pattern.pces[patIx];
3894 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3895 // Compare CE from target string with CE from the pattern.
3896 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3897 // which will fail the compare, below.
3898 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3899 if ( ceMatch == U_CE_NO_MATCH ) {
3900 found = FALSE;
3901 break;
3902 } else if ( ceMatch > U_CE_NO_MATCH ) {
3903 if ( ceMatch == U_CE_SKIP_TARG ) {
3904 // redo with same patCE, next targCE
3905 patIx--;
3906 targetIxOffset++;
3907 } else { // ceMatch == U_CE_SKIP_PATN
3908 // redo with same targCE, next patCE
3909 targetIxOffset--;
3910 }
3911 }
3912 }
3913 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3914
3915 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3916 // No match at this targetIx. Try again at the next.
3917 continue;
3918 }
3919
3920 if (!found) {
3921 // No match at all, we have run off the end of the target text.
3922 break;
3923 }
3924
3925
3926 // We have found a match in CE space.
3927 // Now determine the bounds in string index space.
3928 // There still is a chance of match failure if the CE range not correspond to
3929 // an acceptable character range.
3930 //
3931 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
3932
3933 mStart = firstCEI->lowIndex;
3934 minLimit = lastCEI->lowIndex;
3935
3936 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3937 // extended to the end of input, and the match is good.
3938
3939 // Look at the high and low indices of the CE following the match. If
3940 // they are the same it means one of two things:
3941 // 1. The match extended to the last CE from the target text, which is OK, or
3942 // 2. The last CE that was part of the match is in an expansion that extends
3943 // to the first CE after the match. In this case, we reject the match.
3944 const CEI *nextCEI = 0;
3945 if (strsrch->search->elementComparisonType == 0) {
3946 nextCEI = ceb.get(targetIx + targetIxOffset);
3947 maxLimit = nextCEI->lowIndex;
3948 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3949 found = FALSE;
3950 }
3951 } else {
3952 for ( ; ; ++targetIxOffset ) {
3953 nextCEI = ceb.get(targetIx + targetIxOffset);
3954 maxLimit = nextCEI->lowIndex;
3955 // If we are at the end of the target too, match succeeds
3956 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3957 break;
3958 }
3959 // As long as the next CE has primary weight of 0,
3960 // it is part of the last target element matched by the pattern;
3961 // make sure it can be part of a match with the last patCE
3962 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3963 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3964 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3965 found = FALSE;
3966 break;
3967 }
3968 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3969 // target element, but it has non-zero primary weight => match fails
3970 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3971 found = false;
3972 break;
3973 // Else the target CE is not part of an expansion of the last matched element, match succeeds
3974 } else {
3975 break;
3976 }
3977 }
3978 }
3979
3980
3981 // Check for the start of the match being within a combining sequence.
3982 // This can happen if the pattern itself begins with a combining char, and
3983 // the match found combining marks in the target text that were attached
3984 // to something else.
3985 // This type of match should be rejected for not completely consuming a
3986 // combining sequence.
3987 if (!isBreakBoundary(strsrch, mStart)) {
3988 found = FALSE;
3989 }
3990
3991 // Check for the start of the match being within an Collation Element Expansion,
3992 // meaning that the first char of the match is only partially matched.
3993 // With exapnsions, the first CE will report the index of the source
3994 // character, and all subsequent (expansions) CEs will report the source index of the
3995 // _following_ character.
3996 int32_t secondIx = firstCEI->highIndex;
3997 if (mStart == secondIx) {
3998 found = FALSE;
3999 }
4000
4001 // Advance the match end position to the first acceptable match boundary.
4002 // This advances the index over any combining charcters.
4003 mLimit = maxLimit;
4004 if (minLimit < maxLimit) {
4005 // When the last CE's low index is same with its high index, the CE is likely
4006 // a part of expansion. In this case, the index is located just after the
4007 // character corresponding to the CEs compared above. If the index is right
4008 // at the break boundary, move the position to the next boundary will result
4009 // incorrect match length when there are ignorable characters exist between
4010 // the position and the next character produces CE(s). See ticket#8482.
4011 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4012 mLimit = minLimit;
4013 } else {
4014 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4015 if (nba >= lastCEI->highIndex) {
4016 mLimit = nba;
4017 }
4018 }
4019 }
4020
4021 #ifdef USEARCH_DEBUG
4022 if (getenv("USEARCH_DEBUG") != NULL) {
4023 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4024 }
4025 #endif
4026
4027 // If default breakIter is being used, and next collation element belonging to this
4028 // combining sequence has non-zero primary weight and corresponds to a separate
4029 // character following the one at end of the current match, then do NOT require
4030 // that match end position be on a breakIter boundary, or that end of the
4031 // combining sequence not extend beyond the match in CE space. Only do those
4032 // tests if the conditions above are not met. Added this to make prefix search
4033 // work in Indic scripts per <rdar://problem/18063262>.
4034 UBool doLimitTests = !(strsrch->search->breakIter == NULL &&
4035 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4036 nextCEI->lowIndex >= lastCEI->highIndex && nextCEI->highIndex > nextCEI->lowIndex);
4037
4038 if (doLimitTests) { // <rdar://problem/18063262>
4039 // If advancing to the end of a combining sequence in character indexing space
4040 // advanced us beyond the end of the match in CE space, reject this match.
4041 if (mLimit > maxLimit) {
4042 found = FALSE;
4043 }
4044
4045 if (!isBreakBoundary(strsrch, mLimit)) {
4046 found = FALSE;
4047 }
4048 }
4049
4050 if (! checkIdentical(strsrch, mStart, mLimit)) {
4051 found = FALSE;
4052 }
4053
4054 if (found) {
4055 break;
4056 }
4057 }
4058
4059 #ifdef USEARCH_DEBUG
4060 if (getenv("USEARCH_DEBUG") != NULL) {
4061 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4062 int32_t lastToPrint = ceb.limitIx+2;
4063 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4064 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4065 }
4066 printf("\n%s\n", found? "match found" : "no match");
4067 }
4068 #endif
4069
4070 // All Done. Store back the match bounds to the caller.
4071 //
4072 if (found==FALSE) {
4073 mLimit = -1;
4074 mStart = -1;
4075 }
4076
4077 if (matchStart != NULL) {
4078 *matchStart= mStart;
4079 }
4080
4081 if (matchLimit != NULL) {
4082 *matchLimit = mLimit;
4083 }
4084
4085 return found;
4086}
4087
4088U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4089 int32_t startIdx,
4090 int32_t *matchStart,
4091 int32_t *matchLimit,
4092 UErrorCode *status)
4093{
4094 if (U_FAILURE(*status)) {
4095 return FALSE;
4096 }
4097
4098 // TODO: reject search patterns beginning with a combining char.
4099
4100#ifdef USEARCH_DEBUG
4101 if (getenv("USEARCH_DEBUG") != NULL) {
4102 printf("Pattern CEs\n");
4103 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4104 printf(" %8x", strsrch->pattern.ces[ii]);
4105 }
4106 printf("\n");
4107 }
4108
4109#endif
4110 // Input parameter sanity check.
4111 // TODO: should input indicies clip to the text length
4112 // in the same way that UText does.
4113 if(strsrch->pattern.cesLength == 0 ||
4114 startIdx < 0 ||
4115 startIdx > strsrch->search->textLength ||
4116 strsrch->pattern.ces == NULL) {
4117 *status = U_ILLEGAL_ARGUMENT_ERROR;
4118 return FALSE;
4119 }
4120
4121 if (strsrch->pattern.pces == NULL) {
4122 initializePatternPCETable(strsrch, status);
4123 }
4124
4125 CEIBuffer ceb(strsrch, status);
4126 int32_t targetIx = 0;
4127
4128 /*
4129 * Pre-load the buffer with the CE's for the grapheme
4130 * after our starting position so that we're sure that
4131 * we can look at the CE following the match when we
4132 * check the match boundaries.
4133 *
4134 * This will also pre-fetch the first CE that we'll
4135 * consider for the match.
4136 */
4137 if (startIdx < strsrch->search->textLength) {
4138 UBreakIterator *bi = strsrch->search->internalBreakIter;
4139 int32_t next = ubrk_following(bi, startIdx);
4140
4141 ucol_setOffset(strsrch->textIter, next, status);
4142
4143 for (targetIx = 0; ; targetIx += 1) {
4144 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4145 break;
4146 }
4147 }
4148 } else {
4149 ucol_setOffset(strsrch->textIter, startIdx, status);
4150 }
4151
4152
4153 const CEI *targetCEI = NULL;
4154 int32_t patIx;
4155 UBool found;
4156
4157 int32_t limitIx = targetIx;
4158 int32_t mStart = -1;
4159 int32_t mLimit = -1;
4160 int32_t minLimit;
4161 int32_t maxLimit;
4162
4163
4164
4165 // Outer loop moves over match starting positions in the
4166 // target CE space.
4167 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4168 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4169 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4170 // and the beginning of the base text.
4171 for(targetIx = limitIx; ; targetIx += 1)
4172 {
4173 found = TRUE;
4174 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4175 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4176 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4177 const CEI *lastCEI = ceb.getPrevious(targetIx);
4178 if (lastCEI == NULL) {
4179 *status = U_INTERNAL_PROGRAM_ERROR;
4180 found = FALSE;
4181 break;
4182 }
4183 // Inner loop checks for a match beginning at each
4184 // position from the outer loop.
4185 int32_t targetIxOffset = 0;
4186 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4187 int64_t patCE = strsrch->pattern.pces[patIx];
4188
4189 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4190 // Compare CE from target string with CE from the pattern.
4191 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4192 // which will fail the compare, below.
4193 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4194 if ( ceMatch == U_CE_NO_MATCH ) {
4195 found = FALSE;
4196 break;
4197 } else if ( ceMatch > U_CE_NO_MATCH ) {
4198 if ( ceMatch == U_CE_SKIP_TARG ) {
4199 // redo with same patCE, next targCE
4200 patIx++;
4201 targetIxOffset++;
4202 } else { // ceMatch == U_CE_SKIP_PATN
4203 // redo with same targCE, next patCE
4204 targetIxOffset--;
4205 }
4206 }
4207 }
4208
4209 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4210 // No match at this targetIx. Try again at the next.
4211 continue;
4212 }
4213
4214 if (!found) {
4215 // No match at all, we have run off the end of the target text.
4216 break;
4217 }
4218
4219
4220 // We have found a match in CE space.
4221 // Now determine the bounds in string index space.
4222 // There still is a chance of match failure if the CE range not correspond to
4223 // an acceptable character range.
4224 //
4225 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4226 mStart = firstCEI->lowIndex;
4227
4228 // Check for the start of the match being within a combining sequence.
4229 // This can happen if the pattern itself begins with a combining char, and
4230 // the match found combining marks in the target text that were attached
4231 // to something else.
4232 // This type of match should be rejected for not completely consuming a
4233 // combining sequence.
4234 if (!isBreakBoundary(strsrch, mStart)) {
4235 found = FALSE;
4236 }
4237
4238 // Look at the high index of the first CE in the match. If it's the same as the
4239 // low index, the first CE in the match is in the middle of an expansion.
4240 if (mStart == firstCEI->highIndex) {
4241 found = FALSE;
4242 }
4243
4244
4245 minLimit = lastCEI->lowIndex;
4246
4247 if (targetIx > 0) {
4248 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4249 // extended to the end of input, and the match is good.
4250
4251 // Look at the high and low indices of the CE following the match. If
4252 // they are the same it means one of two things:
4253 // 1. The match extended to the last CE from the target text, which is OK, or
4254 // 2. The last CE that was part of the match is in an expansion that extends
4255 // to the first CE after the match. In this case, we reject the match.
4256 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4257
4258 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4259 found = FALSE;
4260 }
4261
4262 mLimit = maxLimit = nextCEI->lowIndex;
4263
4264 // Advance the match end position to the first acceptable match boundary.
4265 // This advances the index over any combining characters.
4266 if (minLimit < maxLimit) {
4267 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4268
4269 if (nba >= lastCEI->highIndex) {
4270 mLimit = nba;
4271 }
4272 }
4273
4274 // If default breakIter is being used, and next collation element belonging to this
4275 // combining sequence has non-zero primary weight and corresponds to a separate
4276 // character following the one at end of the current match, then do NOT require
4277 // that match end position be on a breakIter boundary, or that end of the
4278 // combining sequence not extend beyond the match in CE space. Only do those
4279 // tests if the conditions above are not met. Added this to make prefix search
4280 // work in Indic scripts per <rdar://problem/18063262>.
4281 UBool doLimitTests = !(strsrch->search->breakIter == NULL &&
4282 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4283 nextCEI->lowIndex >= lastCEI->highIndex && nextCEI->highIndex > nextCEI->lowIndex);
4284
4285 if (doLimitTests) { // <rdar://problem/18063262>
4286 // If advancing to the end of a combining sequence in character indexing space
4287 // advanced us beyond the end of the match in CE space, reject this match.
4288 if (mLimit > maxLimit) {
4289 found = FALSE;
4290 }
4291
4292 // Make sure the end of the match is on a break boundary
4293 if (!isBreakBoundary(strsrch, mLimit)) {
4294 found = FALSE;
4295 }
4296 }
4297
4298 } else {
4299 // No non-ignorable CEs after this point.
4300 // The maximum position is detected by boundary after
4301 // the last non-ignorable CE. Combining sequence
4302 // across the start index will be truncated.
4303 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4304 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4305 }
4306
4307 #ifdef USEARCH_DEBUG
4308 if (getenv("USEARCH_DEBUG") != NULL) {
4309 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4310 }
4311 #endif
4312
4313
4314 if (! checkIdentical(strsrch, mStart, mLimit)) {
4315 found = FALSE;
4316 }
4317
4318 if (found) {
4319 break;
4320 }
4321 }
4322
4323 #ifdef USEARCH_DEBUG
4324 if (getenv("USEARCH_DEBUG") != NULL) {
4325 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4326 int32_t lastToPrint = ceb.limitIx+2;
4327 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4328 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4329 }
4330 printf("\n%s\n", found? "match found" : "no match");
4331 }
4332 #endif
4333
4334 // All Done. Store back the match bounds to the caller.
4335 //
4336 if (found==FALSE) {
4337 mLimit = -1;
4338 mStart = -1;
4339 }
4340
4341 if (matchStart != NULL) {
4342 *matchStart= mStart;
4343 }
4344
4345 if (matchLimit != NULL) {
4346 *matchLimit = mLimit;
4347 }
4348
4349 return found;
4350}
4351
4352// internal use methods declared in usrchimp.h -----------------------------
4353
4354UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4355{
4356 if (U_FAILURE(*status)) {
4357 setMatchNotFound(strsrch);
4358 return FALSE;
4359 }
4360
4361#if BOYER_MOORE
4362 UCollationElements *coleiter = strsrch->textIter;
4363 int32_t textlength = strsrch->search->textLength;
4364 int32_t *patternce = strsrch->pattern.ces;
4365 int32_t patterncelength = strsrch->pattern.cesLength;
4366 int32_t textoffset = ucol_getOffset(coleiter);
4367
4368 // status used in setting coleiter offset, since offset is checked in
4369 // shiftForward before setting the coleiter offset, status never
4370 // a failure
4371 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4372 patterncelength);
4373 while (textoffset <= textlength)
4374 {
4375 uint32_t patternceindex = patterncelength - 1;
4376 int32_t targetce;
4377 UBool found = FALSE;
4378 int32_t lastce = UCOL_NULLORDER;
4379
4380 setColEIterOffset(coleiter, textoffset);
4381
4382 for (;;) {
4383 // finding the last pattern ce match, imagine composite characters
4384 // for example: search for pattern A in text \u00C0
4385 // we'll have to skip \u0300 the grave first before we get to A
4386 targetce = ucol_previous(coleiter, status);
4387 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4388 found = FALSE;
4389 break;
4390 }
4391 targetce = getCE(strsrch, targetce);
4392 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4393 // this is for the text \u0315\u0300 that requires
4394 // normalization and pattern \u0300, where \u0315 is ignorable
4395 continue;
4396 }
4397 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4398 lastce = targetce;
4399 }
4400 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4401 if (targetce == patternce[patternceindex]) {
4402 // the first ce can be a contraction
4403 found = TRUE;
4404 break;
4405 }
4406 if (!hasExpansion(coleiter)) {
4407 found = FALSE;
4408 break;
4409 }
4410 }
4411
4412 //targetce = lastce;
4413
4414 while (found && patternceindex > 0) {
4415 lastce = targetce;
4416 targetce = ucol_previous(coleiter, status);
4417 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4418 found = FALSE;
4419 break;
4420 }
4421 targetce = getCE(strsrch, targetce);
4422 if (targetce == UCOL_IGNORABLE) {
4423 continue;
4424 }
4425
4426 patternceindex --;
4427 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4428 found = found && targetce == patternce[patternceindex];
4429 }
4430
4431 targetce = lastce;
4432
4433 if (!found) {
4434 if (U_FAILURE(*status)) {
4435 break;
4436 }
4437 textoffset = shiftForward(strsrch, textoffset, lastce,
4438 patternceindex);
4439 // status checked at loop.
4440 patternceindex = patterncelength;
4441 continue;
4442 }
4443
4444 if (checkNextExactMatch(strsrch, &textoffset, status)) {
4445 // status checked in ucol_setOffset
4446 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4447 return TRUE;
4448 }
4449 }
4450 setMatchNotFound(strsrch);
4451 return FALSE;
4452#else
4453 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4454 int32_t start = -1;
4455 int32_t end = -1;
4456
4457 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4458 strsrch->search->matchedIndex = start;
4459 strsrch->search->matchedLength = end - start;
4460 return TRUE;
4461 } else {
4462 setMatchNotFound(strsrch);
4463 return FALSE;
4464 }
4465#endif
4466}
4467
4468UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4469{
4470 if (U_FAILURE(*status)) {
4471 setMatchNotFound(strsrch);
4472 return FALSE;
4473 }
4474
4475#if BOYER_MOORE
4476 UCollationElements *coleiter = strsrch->textIter;
4477 int32_t textlength = strsrch->search->textLength;
4478 int32_t *patternce = strsrch->pattern.ces;
4479 int32_t patterncelength = strsrch->pattern.cesLength;
4480 int32_t textoffset = ucol_getOffset(coleiter);
4481 UBool hasPatternAccents =
4482 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4483
4484 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4485 patterncelength);
4486 strsrch->canonicalPrefixAccents[0] = 0;
4487 strsrch->canonicalSuffixAccents[0] = 0;
4488
4489 while (textoffset <= textlength)
4490 {
4491 int32_t patternceindex = patterncelength - 1;
4492 int32_t targetce;
4493 UBool found = FALSE;
4494 int32_t lastce = UCOL_NULLORDER;
4495
4496 setColEIterOffset(coleiter, textoffset);
4497
4498 for (;;) {
4499 // finding the last pattern ce match, imagine composite characters
4500 // for example: search for pattern A in text \u00C0
4501 // we'll have to skip \u0300 the grave first before we get to A
4502 targetce = ucol_previous(coleiter, status);
4503 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4504 found = FALSE;
4505 break;
4506 }
4507 targetce = getCE(strsrch, targetce);
4508 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4509 lastce = targetce;
4510 }
4511 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4512 if (targetce == patternce[patternceindex]) {
4513 // the first ce can be a contraction
4514 found = TRUE;
4515 break;
4516 }
4517 if (!hasExpansion(coleiter)) {
4518 found = FALSE;
4519 break;
4520 }
4521 }
4522
4523 while (found && patternceindex > 0) {
4524 targetce = ucol_previous(coleiter, status);
4525 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4526 found = FALSE;
4527 break;
4528 }
4529 targetce = getCE(strsrch, targetce);
4530 if (targetce == UCOL_IGNORABLE) {
4531 continue;
4532 }
4533
4534 patternceindex --;
4535 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4536 found = found && targetce == patternce[patternceindex];
4537 }
4538
4539 // initializing the rearranged accent array
4540 if (hasPatternAccents && !found) {
4541 strsrch->canonicalPrefixAccents[0] = 0;
4542 strsrch->canonicalSuffixAccents[0] = 0;
4543 if (U_FAILURE(*status)) {
4544 break;
4545 }
4546 found = doNextCanonicalMatch(strsrch, textoffset, status);
4547 }
4548
4549 if (!found) {
4550 if (U_FAILURE(*status)) {
4551 break;
4552 }
4553 textoffset = shiftForward(strsrch, textoffset, lastce,
4554 patternceindex);
4555 // status checked at loop
4556 patternceindex = patterncelength;
4557 continue;
4558 }
4559
4560 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4561 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4562 return TRUE;
4563 }
4564 }
4565 setMatchNotFound(strsrch);
4566 return FALSE;
4567#else
4568 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4569 int32_t start = -1;
4570 int32_t end = -1;
4571
4572 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4573 strsrch->search->matchedIndex = start;
4574 strsrch->search->matchedLength = end - start;
4575 return TRUE;
4576 } else {
4577 setMatchNotFound(strsrch);
4578 return FALSE;
4579 }
4580#endif
4581}
4582
4583UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4584{
4585 if (U_FAILURE(*status)) {
4586 setMatchNotFound(strsrch);
4587 return FALSE;
4588 }
4589
4590#if BOYER_MOORE
4591 UCollationElements *coleiter = strsrch->textIter;
4592 int32_t *patternce = strsrch->pattern.ces;
4593 int32_t patterncelength = strsrch->pattern.cesLength;
4594 int32_t textoffset = ucol_getOffset(coleiter);
4595
4596 // shifting it check for setting offset
4597 // if setOffset is called previously or there was no previous match, we
4598 // leave the offset as it is.
4599 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4600 textoffset = strsrch->search->matchedIndex;
4601 }
4602
4603 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4604 patterncelength);
4605
4606 while (textoffset >= 0)
4607 {
4608 int32_t patternceindex = 1;
4609 int32_t targetce;
4610 UBool found = FALSE;
4611 int32_t firstce = UCOL_NULLORDER;
4612
4613 // if status is a failure, ucol_setOffset does nothing
4614 setColEIterOffset(coleiter, textoffset);
4615
4616 for (;;) {
4617 // finding the first pattern ce match, imagine composite
4618 // characters. for example: search for pattern \u0300 in text
4619 // \u00C0, we'll have to skip A first before we get to
4620 // \u0300 the grave accent
4621 targetce = ucol_next(coleiter, status);
4622 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4623 found = FALSE;
4624 break;
4625 }
4626 targetce = getCE(strsrch, targetce);
4627 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4628 firstce = targetce;
4629 }
4630 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4631 continue;
4632 }
4633 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4634 if (targetce == patternce[0]) {
4635 found = TRUE;
4636 break;
4637 }
4638 if (!hasExpansion(coleiter)) {
4639 // checking for accents in composite character
4640 found = FALSE;
4641 break;
4642 }
4643 }
4644
4645 //targetce = firstce;
4646
4647 while (found && (patternceindex < patterncelength)) {
4648 firstce = targetce;
4649 targetce = ucol_next(coleiter, status);
4650 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4651 found = FALSE;
4652 break;
4653 }
4654 targetce = getCE(strsrch, targetce);
4655 if (targetce == UCOL_IGNORABLE) {
4656 continue;
4657 }
4658
4659 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4660 found = found && targetce == patternce[patternceindex];
4661 patternceindex ++;
4662 }
4663
4664 targetce = firstce;
4665
4666 if (!found) {
4667 if (U_FAILURE(*status)) {
4668 break;
4669 }
4670
4671 textoffset = reverseShift(strsrch, textoffset, targetce,
4672 patternceindex);
4673 patternceindex = 0;
4674 continue;
4675 }
4676
4677 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4678 setColEIterOffset(coleiter, textoffset);
4679 return TRUE;
4680 }
4681 }
4682 setMatchNotFound(strsrch);
4683 return FALSE;
4684#else
4685 int32_t textOffset;
4686
4687 if (strsrch->search->isOverlap) {
4688 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4689 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4690 } else {
4691 // move the start position at the end of possible match
4692 initializePatternPCETable(strsrch, status);
4693 if (!initTextProcessedIter(strsrch, status)) {
4694 setMatchNotFound(strsrch);
4695 return FALSE;
4696 }
4697 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4698 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4699 if (pce == UCOL_PROCESSED_NULLORDER) {
4700 // at the end of the text
4701 break;
4702 }
4703 }
4704 if (U_FAILURE(*status)) {
4705 setMatchNotFound(strsrch);
4706 return FALSE;
4707 }
4708 textOffset = ucol_getOffset(strsrch->textIter);
4709 }
4710 } else {
4711 textOffset = ucol_getOffset(strsrch->textIter);
4712 }
4713
4714 int32_t start = -1;
4715 int32_t end = -1;
4716
4717 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4718 strsrch->search->matchedIndex = start;
4719 strsrch->search->matchedLength = end - start;
4720 return TRUE;
4721 } else {
4722 setMatchNotFound(strsrch);
4723 return FALSE;
4724 }
4725#endif
4726}
4727
4728UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4729 UErrorCode *status)
4730{
4731 if (U_FAILURE(*status)) {
4732 setMatchNotFound(strsrch);
4733 return FALSE;
4734 }
4735
4736#if BOYER_MOORE
4737 UCollationElements *coleiter = strsrch->textIter;
4738 int32_t *patternce = strsrch->pattern.ces;
4739 int32_t patterncelength = strsrch->pattern.cesLength;
4740 int32_t textoffset = ucol_getOffset(coleiter);
4741 UBool hasPatternAccents =
4742 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4743
4744 // shifting it check for setting offset
4745 // if setOffset is called previously or there was no previous match, we
4746 // leave the offset as it is.
4747 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4748 textoffset = strsrch->search->matchedIndex;
4749 }
4750
4751 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4752 patterncelength);
4753 strsrch->canonicalPrefixAccents[0] = 0;
4754 strsrch->canonicalSuffixAccents[0] = 0;
4755
4756 while (textoffset >= 0)
4757 {
4758 int32_t patternceindex = 1;
4759 int32_t targetce;
4760 UBool found = FALSE;
4761 int32_t firstce = UCOL_NULLORDER;
4762
4763 setColEIterOffset(coleiter, textoffset);
4764 for (;;) {
4765 // finding the first pattern ce match, imagine composite
4766 // characters. for example: search for pattern \u0300 in text
4767 // \u00C0, we'll have to skip A first before we get to
4768 // \u0300 the grave accent
4769 targetce = ucol_next(coleiter, status);
4770 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4771 found = FALSE;
4772 break;
4773 }
4774 targetce = getCE(strsrch, targetce);
4775 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4776 firstce = targetce;
4777 }
4778
4779 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4780 if (targetce == patternce[0]) {
4781 // the first ce can be a contraction
4782 found = TRUE;
4783 break;
4784 }
4785 if (!hasExpansion(coleiter)) {
4786 // checking for accents in composite character
4787 found = FALSE;
4788 break;
4789 }
4790 }
4791
4792 targetce = firstce;
4793
4794 while (found && patternceindex < patterncelength) {
4795 targetce = ucol_next(coleiter, status);
4796 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4797 found = FALSE;
4798 break;
4799 }
4800 targetce = getCE(strsrch, targetce);
4801 if (targetce == UCOL_IGNORABLE) {
4802 continue;
4803 }
4804
4805 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4806 found = found && targetce == patternce[patternceindex];
4807 patternceindex ++;
4808 }
4809
4810 // initializing the rearranged accent array
4811 if (hasPatternAccents && !found) {
4812 strsrch->canonicalPrefixAccents[0] = 0;
4813 strsrch->canonicalSuffixAccents[0] = 0;
4814 if (U_FAILURE(*status)) {
4815 break;
4816 }
4817 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4818 }
4819
4820 if (!found) {
4821 if (U_FAILURE(*status)) {
4822 break;
4823 }
4824 textoffset = reverseShift(strsrch, textoffset, targetce,
4825 patternceindex);
4826 patternceindex = 0;
4827 continue;
4828 }
4829
4830 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4831 setColEIterOffset(coleiter, textoffset);
4832 return TRUE;
4833 }
4834 }
4835 setMatchNotFound(strsrch);
4836 return FALSE;
4837#else
4838 int32_t textOffset;
4839
4840 if (strsrch->search->isOverlap) {
4841 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4842 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4843 } else {
4844 // move the start position at the end of possible match
4845 initializePatternPCETable(strsrch, status);
4846 if (!initTextProcessedIter(strsrch, status)) {
4847 setMatchNotFound(strsrch);
4848 return FALSE;
4849 }
4850 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4851 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4852 if (pce == UCOL_PROCESSED_NULLORDER) {
4853 // at the end of the text
4854 break;
4855 }
4856 }
4857 if (U_FAILURE(*status)) {
4858 setMatchNotFound(strsrch);
4859 return FALSE;
4860 }
4861 textOffset = ucol_getOffset(strsrch->textIter);
4862 }
4863 } else {
4864 textOffset = ucol_getOffset(strsrch->textIter);
4865 }
4866
4867 int32_t start = -1;
4868 int32_t end = -1;
4869
4870 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4871 strsrch->search->matchedIndex = start;
4872 strsrch->search->matchedLength = end - start;
4873 return TRUE;
4874 } else {
4875 setMatchNotFound(strsrch);
4876 return FALSE;
4877 }
4878#endif
4879}
4880
4881#endif /* #if !UCONFIG_NO_COLLATION */