]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/usearch.cpp
ICU-57149.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / usearch.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
b331163b 3* Copyright (C) 2001-2015 IBM and others. All rights reserved.
b75a7d8f
A
4**********************************************************************
5* Date Name Description
6* 07/02/2001 synwee Creation.
7**********************************************************************
8*/
9
10#include "unicode/utypes.h"
11
46f4442e 12#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
b75a7d8f
A
13
14#include "unicode/usearch.h"
15#include "unicode/ustring.h"
16#include "unicode/uchar.h"
4388f060 17#include "unicode/utf16.h"
729e4ab9 18#include "normalizer2impl.h"
b75a7d8f
A
19#include "usrchimp.h"
20#include "cmemory.h"
374ca955 21#include "ucln_in.h"
46f4442e 22#include "uassert.h"
729e4ab9 23#include "ustr_imp.h"
46f4442e
A
24
25U_NAMESPACE_USE
26
27// don't use Boyer-Moore
729e4ab9 28// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
46f4442e 29#define BOYER_MOORE 0
b75a7d8f
A
30
31// internal definition ---------------------------------------------------
32
33#define LAST_BYTE_MASK_ 0xFF
34#define SECOND_LAST_BYTE_SHIFT_ 8
35#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
36
4388f060 37static const Normalizer2Impl *g_nfcImpl = NULL;
b75a7d8f
A
38
39// internal methods -------------------------------------------------
40
41/**
42* Fast collation element iterator setOffset.
43* This function does not check for bounds.
44* @param coleiter collation element iterator
45* @param offset to set
46*/
729e4ab9 47static
b75a7d8f
A
48inline void setColEIterOffset(UCollationElements *elems,
49 int32_t offset)
50{
57a6839d
A
51 // Note: Not "fast" any more after the 2013 collation rewrite.
52 // We do not want to expose more internals than necessary.
53 UErrorCode status = U_ZERO_ERROR;
54 ucol_setOffset(elems, offset, &status);
b75a7d8f
A
55}
56
57/**
58* Getting the mask for collation strength
59* @param strength collation strength
60* @return collation element mask
61*/
62static
729e4ab9 63inline uint32_t getMask(UCollationStrength strength)
b75a7d8f 64{
729e4ab9 65 switch (strength)
b75a7d8f
A
66 {
67 case UCOL_PRIMARY:
68 return UCOL_PRIMARYORDERMASK;
69 case UCOL_SECONDARY:
70 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
71 default:
729e4ab9 72 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
b75a7d8f
A
73 UCOL_PRIMARYORDERMASK;
74 }
75}
76
77/**
2ca993e8
A
78* @param ce 32-bit collation element
79* @return hash code
b75a7d8f
A
80*/
81static
2ca993e8 82inline int hashFromCE32(uint32_t ce)
b75a7d8f 83{
2ca993e8
A
84 int hc = (int)(
85 ((((((ce >> 24) * 37) +
86 (ce >> 16)) * 37) +
87 (ce >> 8)) * 37) +
88 ce);
89 hc %= MAX_TABLE_SIZE_;
90 if (hc < 0) {
91 hc += MAX_TABLE_SIZE_;
92 }
93 return hc;
b75a7d8f
A
94}
95
374ca955
A
96U_CDECL_BEGIN
97static UBool U_CALLCONV
98usearch_cleanup(void) {
4388f060 99 g_nfcImpl = NULL;
374ca955
A
100 return TRUE;
101}
102U_CDECL_END
103
b75a7d8f
A
104/**
105* Initializing the fcd tables.
106* Internal method, status assumed to be a success.
729e4ab9 107* @param status output error if any, caller to check status before calling
b75a7d8f
A
108* method, status assumed to be success when passed in.
109*/
110static
729e4ab9 111inline void initializeFCD(UErrorCode *status)
b75a7d8f 112{
4388f060
A
113 if (g_nfcImpl == NULL) {
114 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
374ca955 115 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
b75a7d8f
A
116 }
117}
118
119/**
120* Gets the fcd value for a character at the argument index.
121* This method takes into accounts of the supplementary characters.
122* @param str UTF16 string where character for fcd retrieval resides
729e4ab9
A
123* @param offset position of the character whose fcd is to be retrieved, to be
124* overwritten with the next character position, taking
b75a7d8f
A
125* surrogate characters into consideration.
126* @param strlength length of the argument string
127* @return fcd value
128*/
129static
729e4ab9 130uint16_t getFCD(const UChar *str, int32_t *offset,
b75a7d8f
A
131 int32_t strlength)
132{
729e4ab9 133 const UChar *temp = str + *offset;
4388f060 134 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
729e4ab9 135 *offset = (int32_t)(temp - str);
b75a7d8f
A
136 return result;
137}
138
139/**
729e4ab9 140* Getting the modified collation elements taking into account the collation
b75a7d8f
A
141* attributes
142* @param strsrch string search data
729e4ab9 143* @param sourcece
b75a7d8f
A
144* @return the modified collation element
145*/
146static
374ca955 147inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
b75a7d8f
A
148{
149 // note for tertiary we can't use the collator->tertiaryMask, that
150 // is a preprocessed mask that takes into account case options. since
151 // we are only concerned with exact matches, we don't need that.
152 sourcece &= strsrch->ceMask;
729e4ab9 153
b75a7d8f
A
154 if (strsrch->toShift) {
155 // alternate handling here, since only the 16 most significant digits
156 // is only used, we can safely do a compare without masking
157 // if the ce is a variable, we mask and get only the primary values
158 // no shifting to quartenary is required since all primary values
159 // less than variabletop will need to be masked off anyway.
160 if (strsrch->variableTop > sourcece) {
729e4ab9 161 if (strsrch->strength >= UCOL_QUATERNARY) {
b75a7d8f
A
162 sourcece &= UCOL_PRIMARYORDERMASK;
163 }
729e4ab9 164 else {
b75a7d8f
A
165 sourcece = UCOL_IGNORABLE;
166 }
167 }
729e4ab9
A
168 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
169 sourcece = 0xFFFF;
b75a7d8f
A
170 }
171
172 return sourcece;
173}
174
729e4ab9 175/**
b75a7d8f
A
176* Allocate a memory and returns NULL if it failed.
177* Internal method, status assumed to be a success.
178* @param size to allocate
729e4ab9 179* @param status output error if any, caller to check status before calling
b75a7d8f
A
180* method, status assumed to be success when passed in.
181* @return newly allocated array, NULL otherwise
182*/
183static
729e4ab9 184inline void * allocateMemory(uint32_t size, UErrorCode *status)
b75a7d8f
A
185{
186 uint32_t *result = (uint32_t *)uprv_malloc(size);
187 if (result == NULL) {
188 *status = U_MEMORY_ALLOCATION_ERROR;
189 }
190 return result;
191}
192
193/**
194* Adds a uint32_t value to a destination array.
729e4ab9 195* Creates a new array if we run out of space. The caller will have to
b75a7d8f 196* manually deallocate the newly allocated array.
729e4ab9
A
197* Internal method, status assumed to be success, caller has to check status
198* before calling this method. destination not to be NULL and has at least
b75a7d8f
A
199* size destinationlength.
200* @param destination target array
201* @param offset destination offset to add value
202* @param destinationlength target array size, return value for the new size
203* @param value to be added
204* @param increments incremental size expected
729e4ab9 205* @param status output error if any, caller to check status before calling
b75a7d8f
A
206* method, status assumed to be success when passed in.
207* @return new destination array, destination if there was no new allocation
208*/
209static
729e4ab9
A
210inline int32_t * addTouint32_tArray(int32_t *destination,
211 uint32_t offset,
212 uint32_t *destinationlength,
374ca955 213 uint32_t value,
729e4ab9
A
214 uint32_t increments,
215 UErrorCode *status)
b75a7d8f
A
216{
217 uint32_t newlength = *destinationlength;
218 if (offset + 1 == newlength) {
219 newlength += increments;
374ca955
A
220 int32_t *temp = (int32_t *)allocateMemory(
221 sizeof(int32_t) * newlength, status);
b75a7d8f
A
222 if (U_FAILURE(*status)) {
223 return NULL;
224 }
a62d09fc 225 uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
b75a7d8f
A
226 *destinationlength = newlength;
227 destination = temp;
228 }
229 destination[offset] = value;
230 return destination;
231}
232
46f4442e
A
233/**
234* Adds a uint64_t value to a destination array.
729e4ab9 235* Creates a new array if we run out of space. The caller will have to
46f4442e 236* manually deallocate the newly allocated array.
729e4ab9
A
237* Internal method, status assumed to be success, caller has to check status
238* before calling this method. destination not to be NULL and has at least
46f4442e
A
239* size destinationlength.
240* @param destination target array
241* @param offset destination offset to add value
242* @param destinationlength target array size, return value for the new size
243* @param value to be added
244* @param increments incremental size expected
729e4ab9 245* @param status output error if any, caller to check status before calling
46f4442e
A
246* method, status assumed to be success when passed in.
247* @return new destination array, destination if there was no new allocation
248*/
249static
729e4ab9
A
250inline int64_t * addTouint64_tArray(int64_t *destination,
251 uint32_t offset,
252 uint32_t *destinationlength,
46f4442e 253 uint64_t value,
729e4ab9
A
254 uint32_t increments,
255 UErrorCode *status)
46f4442e
A
256{
257 uint32_t newlength = *destinationlength;
258 if (offset + 1 == newlength) {
259 newlength += increments;
260 int64_t *temp = (int64_t *)allocateMemory(
261 sizeof(int64_t) * newlength, status);
729e4ab9 262
46f4442e
A
263 if (U_FAILURE(*status)) {
264 return NULL;
265 }
266
a62d09fc 267 uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
46f4442e
A
268 *destinationlength = newlength;
269 destination = temp;
270 }
271
272 destination[offset] = value;
273
274 return destination;
275}
276
b75a7d8f
A
277/**
278* Initializing the ce table for a pattern.
279* Stores non-ignorable collation keys.
729e4ab9
A
280* Table size will be estimated by the size of the pattern text. Table
281* expansion will be perform as we go along. Adding 1 to ensure that the table
b75a7d8f
A
282* size definitely increases.
283* Internal method, status assumed to be a success.
284* @param strsrch string search data
729e4ab9 285* @param status output error if any, caller to check status before calling
b75a7d8f 286* method, status assumed to be success when passed in.
729e4ab9 287* @return total number of expansions
b75a7d8f
A
288*/
289static
729e4ab9 290inline uint16_t initializePatternCETable(UStringSearch *strsrch,
b75a7d8f
A
291 UErrorCode *status)
292{
293 UPattern *pattern = &(strsrch->pattern);
294 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
b331163b 295 int32_t *cetable = pattern->cesBuffer;
b75a7d8f
A
296 uint32_t patternlength = pattern->textLength;
297 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 298
b75a7d8f 299 if (coleiter == NULL) {
729e4ab9 300 coleiter = ucol_openElements(strsrch->collator, pattern->text,
b75a7d8f 301 patternlength, status);
729e4ab9
A
302 // status will be checked in ucol_next(..) later and if it is an
303 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
b75a7d8f
A
304 // returned.
305 strsrch->utilIter = coleiter;
306 }
307 else {
57a6839d 308 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
729e4ab9
A
309 }
310 if(U_FAILURE(*status)) {
311 return 0;
b75a7d8f 312 }
729e4ab9 313
b331163b
A
314 if (pattern->ces != cetable && pattern->ces) {
315 uprv_free(pattern->ces);
b75a7d8f 316 }
729e4ab9 317
b75a7d8f
A
318 uint16_t offset = 0;
319 uint16_t result = 0;
374ca955 320 int32_t ce;
b75a7d8f
A
321
322 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
323 U_SUCCESS(*status)) {
324 uint32_t newce = getCE(strsrch, ce);
325 if (newce) {
729e4ab9 326 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
b75a7d8f 327 newce,
729e4ab9 328 patternlength - ucol_getOffset(coleiter) + 1,
b75a7d8f
A
329 status);
330 if (U_FAILURE(*status)) {
331 return 0;
332 }
333 offset ++;
b331163b 334 if (cetable != temp && cetable != pattern->cesBuffer) {
b75a7d8f
A
335 uprv_free(cetable);
336 }
337 cetable = temp;
338 }
339 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
340 }
341
342 cetable[offset] = 0;
b331163b
A
343 pattern->ces = cetable;
344 pattern->cesLength = offset;
b75a7d8f
A
345
346 return result;
347}
348
46f4442e
A
349/**
350* Initializing the pce table for a pattern.
351* Stores non-ignorable collation keys.
729e4ab9
A
352* Table size will be estimated by the size of the pattern text. Table
353* expansion will be perform as we go along. Adding 1 to ensure that the table
46f4442e
A
354* size definitely increases.
355* Internal method, status assumed to be a success.
356* @param strsrch string search data
729e4ab9 357* @param status output error if any, caller to check status before calling
46f4442e 358* method, status assumed to be success when passed in.
729e4ab9 359* @return total number of expansions
46f4442e
A
360*/
361static
729e4ab9 362inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
46f4442e
A
363 UErrorCode *status)
364{
365 UPattern *pattern = &(strsrch->pattern);
366 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
b331163b 367 int64_t *pcetable = pattern->pcesBuffer;
46f4442e
A
368 uint32_t patternlength = pattern->textLength;
369 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 370
46f4442e 371 if (coleiter == NULL) {
729e4ab9 372 coleiter = ucol_openElements(strsrch->collator, pattern->text,
46f4442e 373 patternlength, status);
729e4ab9
A
374 // status will be checked in ucol_next(..) later and if it is an
375 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
46f4442e
A
376 // returned.
377 strsrch->utilIter = coleiter;
378 } else {
57a6839d 379 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
729e4ab9
A
380 }
381 if(U_FAILURE(*status)) {
382 return 0;
46f4442e 383 }
729e4ab9 384
b331163b
A
385 if (pattern->pces != pcetable && pattern->pces != NULL) {
386 uprv_free(pattern->pces);
46f4442e 387 }
729e4ab9 388
46f4442e
A
389 uint16_t offset = 0;
390 uint16_t result = 0;
391 int64_t pce;
392
57a6839d 393 icu::UCollationPCE iter(coleiter);
46f4442e
A
394
395 // ** Should processed CEs be signed or unsigned?
729e4ab9 396 // ** (the rest of the code in this file seems to play fast-and-loose with
46f4442e 397 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
57a6839d 398 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
46f4442e 399 U_SUCCESS(*status)) {
729e4ab9 400 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
46f4442e 401 pce,
729e4ab9 402 patternlength - ucol_getOffset(coleiter) + 1,
46f4442e
A
403 status);
404
405 if (U_FAILURE(*status)) {
406 return 0;
407 }
408
409 offset += 1;
410
b331163b 411 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
46f4442e
A
412 uprv_free(pcetable);
413 }
414
415 pcetable = temp;
416 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
417 }
418
419 pcetable[offset] = 0;
b331163b
A
420 pattern->pces = pcetable;
421 pattern->pcesLength = offset;
46f4442e
A
422
423 return result;
424}
425
b75a7d8f
A
426/**
427* Initializes the pattern struct.
428* Internal method, status assumed to be success.
429* @param strsrch UStringSearch data storage
729e4ab9 430* @param status output error if any, caller to check status before calling
b75a7d8f
A
431* method, status assumed to be success when passed in.
432* @return expansionsize the total expansion size of the pattern
729e4ab9 433*/
b75a7d8f 434static
729e4ab9 435inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
b75a7d8f 436{
57a6839d 437 if (U_FAILURE(*status)) { return 0; }
b75a7d8f
A
438 UPattern *pattern = &(strsrch->pattern);
439 const UChar *patterntext = pattern->text;
440 int32_t length = pattern->textLength;
441 int32_t index = 0;
729e4ab9 442
46f4442e
A
443 // Since the strength is primary, accents are ignored in the pattern.
444 if (strsrch->strength == UCOL_PRIMARY) {
4388f060
A
445 pattern->hasPrefixAccents = 0;
446 pattern->hasSuffixAccents = 0;
46f4442e 447 } else {
4388f060
A
448 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
449 SECOND_LAST_BYTE_SHIFT_;
450 index = length;
451 U16_BACK_1(patterntext, 0, index);
452 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
453 LAST_BYTE_MASK_;
46f4442e
A
454 }
455
456 // ** HACK **
b331163b
A
457 if (strsrch->pattern.pces != NULL) {
458 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
459 uprv_free(strsrch->pattern.pces);
46f4442e
A
460 }
461
b331163b 462 strsrch->pattern.pces = NULL;
46f4442e 463 }
b75a7d8f 464
b75a7d8f 465 // since intializePattern is an internal method status is a success.
729e4ab9 466 return initializePatternCETable(strsrch, status);
b75a7d8f
A
467}
468
469/**
470* Initializing shift tables, with the default values.
471* If a corresponding default value is 0, the shift table is not set.
729e4ab9 472* @param shift table for forwards shift
b75a7d8f
A
473* @param backshift table for backwards shift
474* @param cetable table containing pattern ce
475* @param cesize size of the pattern ces
476* @param expansionsize total size of the expansions
477* @param defaultforward the default forward value
478* @param defaultbackward the default backward value
479*/
480static
729e4ab9
A
481inline void setShiftTable(int16_t shift[], int16_t backshift[],
482 int32_t *cetable, int32_t cesize,
b75a7d8f
A
483 int16_t expansionsize,
484 int16_t defaultforward,
485 int16_t defaultbackward)
486{
729e4ab9 487 // estimate the value to shift. to do that we estimate the smallest
b75a7d8f 488 // number of characters to give the relevant ces, ie approximately
729e4ab9 489 // the number of ces minus their expansion, since expansions can come
b75a7d8f
A
490 // from a character.
491 int32_t count;
492 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
493 shift[count] = defaultforward;
494 }
495 cesize --; // down to the last index
496 for (count = 0; count < cesize; count ++) {
497 // number of ces from right of array to the count
498 int temp = defaultforward - count - 1;
2ca993e8 499 shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
b75a7d8f 500 }
2ca993e8 501 shift[hashFromCE32(cetable[cesize])] = 1;
b75a7d8f 502 // for ignorables we just shift by one. see test examples.
2ca993e8 503 shift[hashFromCE32(0)] = 1;
729e4ab9 504
b75a7d8f
A
505 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
506 backshift[count] = defaultbackward;
507 }
508 for (count = cesize; count > 0; count --) {
509 // the original value count does not seem to work
2ca993e8 510 backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
b75a7d8f
A
511 (int16_t)(count - expansionsize) : 1;
512 }
2ca993e8
A
513 backshift[hashFromCE32(cetable[0])] = 1;
514 backshift[hashFromCE32(0)] = 1;
b75a7d8f
A
515}
516
517/**
518* Building of the pattern collation element list and the boyer moore strsrch
519* table.
520* The canonical match will only be performed after the default match fails.
521* For both cases we need to remember the size of the composed and decomposed
522* versions of the string. Since the Boyer-Moore shift calculations shifts by
729e4ab9
A
523* a number of characters in the text and tries to match the pattern from that
524* offset, the shift value can not be too large in case we miss some
525* characters. To choose a right shift size, we estimate the NFC form of the
526* and use its size as a shift guide. The NFC form should be the small
b75a7d8f
A
527* possible representation of the pattern. Anyways, we'll err on the smaller
528* shift size. Hence the calculation for minlength.
729e4ab9
A
529* Canonical match will be performed slightly differently. We'll split the
530* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
531* the first and last base character (MS), the ending accents (EA). Matches
b75a7d8f
A
532* will be done on MS first, and only when we match MS then some processing
533* will be required for the prefix and end accents in order to determine if
729e4ab9
A
534* they match PA and EA. Hence the default shift values
535* for the canonical match will take the size of either end's accent into
b75a7d8f
A
536* consideration. Forwards search will take the end accents into consideration
537* for the default shift values and the backwards search will take the prefix
538* accents into consideration.
539* If pattern has no non-ignorable ce, we return a illegal argument error.
540* Internal method, status assumed to be success.
541* @param strsrch UStringSearch data storage
542* @param status for output errors if it occurs, status is assumed to be a
543* success when it is passed in.
729e4ab9 544*/
b75a7d8f 545static
729e4ab9 546inline void initialize(UStringSearch *strsrch, UErrorCode *status)
b75a7d8f 547{
729e4ab9 548 int16_t expandlength = initializePattern(strsrch, status);
b331163b 549 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
b75a7d8f 550 UPattern *pattern = &strsrch->pattern;
b331163b 551 int32_t cesize = pattern->cesLength;
b75a7d8f 552
729e4ab9 553 int16_t minlength = cesize > expandlength
374ca955 554 ? (int16_t)cesize - expandlength : 1;
b75a7d8f 555 pattern->defaultShiftSize = minlength;
b331163b 556 setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
b75a7d8f
A
557 cesize, expandlength, minlength, minlength);
558 return;
559 }
560 strsrch->pattern.defaultShiftSize = 0;
561}
562
46f4442e
A
563#if BOYER_MOORE
564/**
729e4ab9 565* Check to make sure that the match length is at the end of the character by
46f4442e 566* using the breakiterator.
729e4ab9 567* @param strsrch string search data
46f4442e
A
568* @param start target text start offset
569* @param end target text end offset
570*/
571static
729e4ab9 572void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
46f4442e
A
573 int32_t *end)
574{
575#if !UCONFIG_NO_BREAK_ITERATION
576 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
577 if (breakiterator) {
4388f060
A
578 int32_t matchend = *end;
579 //int32_t matchstart = *start;
729e4ab9 580
4388f060
A
581 if (!ubrk_isBoundary(breakiterator, matchend)) {
582 *end = ubrk_following(breakiterator, matchend);
46f4442e 583 }
729e4ab9 584
4388f060
A
585 /* Check the start of the matched text to make sure it doesn't have any accents
586 * before it. This code may not be necessary and so it is commented out */
587 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
588 *start = ubrk_preceding(breakiterator, matchstart);
589 }*/
46f4442e
A
590 }
591#endif
592}
593
b75a7d8f 594/**
729e4ab9
A
595* Determine whether the target text in UStringSearch bounded by the offset
596* start and end is one or more whole units of text as
b75a7d8f 597* determined by the breakiterator in UStringSearch.
729e4ab9 598* @param strsrch string search data
b75a7d8f
A
599* @param start target text start offset
600* @param end target text end offset
601*/
602static
729e4ab9 603UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
b75a7d8f
A
604 int32_t end)
605{
606#if !UCONFIG_NO_BREAK_ITERATION
607 UBreakIterator *breakiterator = strsrch->search->breakIter;
46f4442e 608 //TODO: Add here.
b75a7d8f
A
609 if (breakiterator) {
610 int32_t startindex = ubrk_first(breakiterator);
611 int32_t endindex = ubrk_last(breakiterator);
729e4ab9 612
b75a7d8f
A
613 // out-of-range indexes are never boundary positions
614 if (start < startindex || start > endindex ||
615 end < startindex || end > endindex) {
616 return FALSE;
617 }
729e4ab9
A
618 // otherwise, we can use following() on the position before the
619 // specified one and return true of the position we get back is the
b75a7d8f 620 // one the user specified
729e4ab9
A
621 UBool result = (start == startindex ||
622 ubrk_following(breakiterator, start - 1) == start) &&
623 (end == endindex ||
b75a7d8f
A
624 ubrk_following(breakiterator, end - 1) == end);
625 if (result) {
626 // iterates the individual ces
627 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 628 const UChar *text = strsrch->search->text +
b75a7d8f
A
629 start;
630 UErrorCode status = U_ZERO_ERROR;
631 ucol_setText(coleiter, text, end - start, &status);
b331163b 632 for (int32_t count = 0; count < strsrch->pattern.cesLength;
b75a7d8f 633 count ++) {
374ca955 634 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
635 if (ce == UCOL_IGNORABLE) {
636 count --;
637 continue;
638 }
b331163b 639 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
b75a7d8f
A
640 return FALSE;
641 }
642 }
374ca955 643 int32_t nextce = ucol_next(coleiter, &status);
b75a7d8f
A
644 while (ucol_getOffset(coleiter) == (end - start)
645 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
646 nextce = ucol_next(coleiter, &status);
647 }
648 if (ucol_getOffset(coleiter) == (end - start)
649 && nextce != UCOL_NULLORDER) {
650 // extra collation elements at the end of the match
651 return FALSE;
652 }
653 }
654 return result;
655 }
656#endif
657 return TRUE;
658}
659
660/**
729e4ab9
A
661* Getting the next base character offset if current offset is an accent,
662* or the current offset if the current character contains a base character.
b75a7d8f
A
663* accents the following base character will be returned
664* @param text string
665* @param textoffset current offset
666* @param textlength length of text string
667* @return the next base character or the current offset
668* if the current character is contains a base character.
669*/
670static
729e4ab9 671inline int32_t getNextBaseOffset(const UChar *text,
b75a7d8f
A
672 int32_t textoffset,
673 int32_t textlength)
674{
675 if (textoffset < textlength) {
676 int32_t temp = textoffset;
677 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
729e4ab9 678 while (temp < textlength) {
b75a7d8f 679 int32_t result = temp;
729e4ab9 680 if ((getFCD(text, &temp, textlength) >>
b75a7d8f
A
681 SECOND_LAST_BYTE_SHIFT_) == 0) {
682 return result;
683 }
684 }
685 return textlength;
686 }
687 }
688 return textoffset;
689}
690
691/**
692* Gets the next base character offset depending on the string search pattern
693* data
694* @param strsrch string search data
695* @param textoffset current offset, one offset away from the last character
696* to search for.
697* @return start index of the next base character or the current offset
698* if the current character is contains a base character.
699*/
700static
729e4ab9 701inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
b75a7d8f
A
702 int32_t textoffset)
703{
374ca955 704 int32_t textlength = strsrch->search->textLength;
729e4ab9 705 if (strsrch->pattern.hasSuffixAccents &&
b75a7d8f
A
706 textoffset < textlength) {
707 int32_t temp = textoffset;
708 const UChar *text = strsrch->search->text;
4388f060 709 U16_BACK_1(text, 0, temp);
b75a7d8f
A
710 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
711 return getNextBaseOffset(text, textoffset, textlength);
712 }
713 }
714 return textoffset;
715}
716
717/**
718* Shifting the collation element iterator position forward to prepare for
719* a following match. If the last character is a unsafe character, we'll only
720* shift by 1 to capture contractions, normalization etc.
721* Internal method, status assumed to be success.
722* @param text strsrch string search data
723* @param textoffset start text position to do search
724* @param ce the text ce which failed the match.
725* @param patternceindex index of the ce within the pattern ce buffer which
726* failed the match
727* @return final offset
728*/
729static
730inline int32_t shiftForward(UStringSearch *strsrch,
731 int32_t textoffset,
374ca955 732 int32_t ce,
b75a7d8f
A
733 int32_t patternceindex)
734{
374ca955 735 UPattern *pattern = &(strsrch->pattern);
b75a7d8f 736 if (ce != UCOL_NULLORDER) {
2ca993e8 737 int32_t shift = pattern->shift[hashFromCE32(ce)];
729e4ab9 738 // this is to adjust for characters in the middle of the
b75a7d8f 739 // substring for matching that failed.
b331163b 740 int32_t adjust = pattern->cesLength - patternceindex;
b75a7d8f
A
741 if (adjust > 1 && shift >= adjust) {
742 shift -= adjust - 1;
743 }
744 textoffset += shift;
745 }
746 else {
747 textoffset += pattern->defaultShiftSize;
748 }
729e4ab9 749
b75a7d8f
A
750 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
751 // check for unsafe characters
729e4ab9 752 // * if it is the start or middle of a contraction: to be done after
b75a7d8f
A
753 // a initial match is found
754 // * thai or lao base consonant character: similar to contraction
755 // * high surrogate character: similar to contraction
756 // * next character is a accent: shift to the next base character
757 return textoffset;
758}
46f4442e 759#endif // #if BOYER_MOORE
b75a7d8f
A
760
761/**
729e4ab9 762* sets match not found
b75a7d8f
A
763* @param strsrch string search data
764*/
765static
729e4ab9 766inline void setMatchNotFound(UStringSearch *strsrch)
b75a7d8f
A
767{
768 // this method resets the match result regardless of the error status.
769 strsrch->search->matchedIndex = USEARCH_DONE;
770 strsrch->search->matchedLength = 0;
771 if (strsrch->search->isForwardSearching) {
772 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
773 }
774 else {
775 setColEIterOffset(strsrch->textIter, 0);
776 }
777}
778
46f4442e 779#if BOYER_MOORE
b75a7d8f
A
780/**
781* Gets the offset to the next safe point in text.
782* ie. not the middle of a contraction, swappable characters or supplementary
783* characters.
784* @param collator collation sata
785* @param text string to work with
786* @param textoffset offset in string
787* @param textlength length of text string
788* @return offset to the next safe character
789*/
790static
729e4ab9 791inline int32_t getNextSafeOffset(const UCollator *collator,
b75a7d8f
A
792 const UChar *text,
793 int32_t textoffset,
794 int32_t textlength)
795{
796 int32_t result = textoffset; // first contraction character
797 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
798 result ++;
799 }
729e4ab9 800 return result;
b75a7d8f
A
801}
802
729e4ab9 803/**
b75a7d8f
A
804* This checks for accents in the potential match started with a .
805* composite character.
729e4ab9
A
806* This is really painful... we have to check that composite character do not
807* have any extra accents. We have to normalize the potential match and find
b75a7d8f 808* the immediate decomposed character before the match.
729e4ab9 809* The first composite character would have been taken care of by the fcd
b75a7d8f 810* checks in checkForwardExactMatch.
729e4ab9
A
811* This is the slow path after the fcd of the first character and
812* the last character has been checked by checkForwardExactMatch and we
b75a7d8f
A
813* determine that the potential match has extra non-ignorable preceding
814* ces.
729e4ab9 815* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
b75a7d8f
A
816* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
817* Note here that accents checking are slow and cautioned in the API docs.
818* Internal method, status assumed to be a success, caller should check status
819* before calling this method
820* @param strsrch string search data
821* @param start index of the potential unfriendly composite character
822* @param end index of the potential unfriendly composite character
823* @param status output error status if any.
824* @return TRUE if there is non-ignorable accents before at the beginning
825* of the match, FALSE otherwise.
826*/
827
828static
829UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
729e4ab9 830 int32_t end,
b75a7d8f
A
831 UErrorCode *status)
832{
833 UBool result = FALSE;
834 if (strsrch->pattern.hasPrefixAccents) {
835 int32_t length = end - start;
836 int32_t offset = 0;
837 const UChar *text = strsrch->search->text + start;
729e4ab9 838
4388f060 839 U16_FWD_1(text, offset, length);
b75a7d8f
A
840 // we are only concerned with the first composite character
841 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
729e4ab9 842 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
b75a7d8f
A
843 text, 0, length);
844 if (safeoffset != length) {
845 safeoffset ++;
846 }
847 UChar *norm = NULL;
848 UChar buffer[INITIAL_ARRAY_SIZE_];
729e4ab9
A
849 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
850 buffer, INITIAL_ARRAY_SIZE_,
851 status);
b75a7d8f
A
852 if (U_FAILURE(*status)) {
853 return FALSE;
854 }
855 if (size >= INITIAL_ARRAY_SIZE_) {
856 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
857 status);
729e4ab9 858 // if allocation failed, status will be set to
b75a7d8f
A
859 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
860 // checks for it.
729e4ab9 861 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
b75a7d8f
A
862 size, status);
863 if (U_FAILURE(*status) && norm != NULL) {
864 uprv_free(norm);
865 return FALSE;
866 }
867 }
868 else {
869 norm = buffer;
870 }
871
872 UCollationElements *coleiter = strsrch->utilIter;
873 ucol_setText(coleiter, norm, size, status);
b331163b 874 uint32_t firstce = strsrch->pattern.ces[0];
b75a7d8f
A
875 UBool ignorable = TRUE;
876 uint32_t ce = UCOL_IGNORABLE;
46f4442e 877 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
b75a7d8f
A
878 offset = ucol_getOffset(coleiter);
879 if (ce != firstce && ce != UCOL_IGNORABLE) {
880 ignorable = FALSE;
881 }
882 ce = ucol_next(coleiter, status);
883 }
884 UChar32 codepoint;
4388f060 885 U16_PREV(norm, 0, offset, codepoint);
b75a7d8f
A
886 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
887
888 if (norm != buffer) {
889 uprv_free(norm);
890 }
891 }
892 }
893
894 return result;
895}
896
897/**
729e4ab9 898* Used by exact matches, checks if there are accents before the match.
b75a7d8f 899* This is really painful... we have to check that composite characters at
729e4ab9
A
900* the start of the matches have to not have any extra accents.
901* We check the FCD of the character first, if it starts with an accent and
b75a7d8f 902* the first pattern ce does not match the first ce of the character, we bail.
729e4ab9
A
903* Otherwise we try normalizing the first composite
904* character and find the immediate decomposed character before the match to
b75a7d8f 905* see if it is an non-ignorable accent.
729e4ab9
A
906* Now normalizing the first composite character is enough because we ensure
907* that when the match is passed in here with extra beginning ces, the
b75a7d8f 908* first or last ce that match has to occur within the first character.
729e4ab9 909* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
b75a7d8f
A
910* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
911* Note here that accents checking are slow and cautioned in the API docs.
912* @param strsrch string search data
729e4ab9 913* @param start offset
b75a7d8f 914* @param end offset
729e4ab9 915* @return TRUE if there are accents on either side of the match,
b75a7d8f
A
916* FALSE otherwise
917*/
918static
919UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
729e4ab9 920 int32_t end)
b75a7d8f
A
921{
922 if (strsrch->pattern.hasPrefixAccents) {
923 UCollationElements *coleiter = strsrch->textIter;
924 UErrorCode status = U_ZERO_ERROR;
925 // we have been iterating forwards previously
926 uint32_t ignorable = TRUE;
b331163b 927 int32_t firstce = strsrch->pattern.ces[0];
b75a7d8f 928
374ca955
A
929 setColEIterOffset(coleiter, start);
930 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
931 if (U_FAILURE(status)) {
b75a7d8f
A
932 return TRUE;
933 }
934 while (ce != firstce) {
935 if (ce != UCOL_IGNORABLE) {
936 ignorable = FALSE;
937 }
938 ce = getCE(strsrch, ucol_next(coleiter, &status));
46f4442e 939 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
b75a7d8f
A
940 return TRUE;
941 }
942 }
374ca955 943 if (!ignorable && inNormBuf(coleiter)) {
b75a7d8f 944 // within normalization buffer, discontiguous handled here
374ca955 945 return TRUE;
b75a7d8f
A
946 }
947
374ca955 948 // within text
b75a7d8f 949 int32_t temp = start;
374ca955 950 // original code
729e4ab9
A
951 // accent = (getFCD(strsrch->search->text, &temp,
952 // strsrch->search->textLength)
953 // >> SECOND_LAST_BYTE_SHIFT_);
374ca955 954 // however this code does not work well with VC7 .net in release mode.
729e4ab9 955 // maybe the inlines for getFCD combined with shifting has bugs in
374ca955 956 // VC7. anyways this is a work around.
729e4ab9 957 UBool accent = getFCD(strsrch->search->text, &temp,
b75a7d8f
A
958 strsrch->search->textLength) > 0xFF;
959 if (!accent) {
374ca955 960 return checkExtraMatchAccents(strsrch, start, end, &status);
b75a7d8f 961 }
374ca955 962 if (!ignorable) {
b75a7d8f
A
963 return TRUE;
964 }
965 if (start > 0) {
966 temp = start;
4388f060 967 U16_BACK_1(strsrch->search->text, 0, temp);
729e4ab9 968 if (getFCD(strsrch->search->text, &temp,
b75a7d8f
A
969 strsrch->search->textLength) & LAST_BYTE_MASK_) {
970 setColEIterOffset(coleiter, start);
971 ce = ucol_previous(coleiter, &status);
729e4ab9 972 if (U_FAILURE(status) ||
b75a7d8f
A
973 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
974 return TRUE;
975 }
976 }
977 }
978 }
729e4ab9 979
b75a7d8f
A
980 return FALSE;
981}
982
983/**
984* Used by exact matches, checks if there are accents bounding the match.
985* Note this is the initial boundary check. If the potential match
986* starts or ends with composite characters, the accents in those
987* characters will be determined later.
729e4ab9 988* Not doing backwards iteration here, since discontiguos contraction for
b75a7d8f 989* backwards collation element iterator, use up too many characters.
729e4ab9 990* E.g. looking for \u030A ring in \u01FA A ring above and acute,
b75a7d8f
A
991* should fail since there is a acute at the end of \u01FA
992* Note here that accents checking are slow and cautioned in the API docs.
993* @param strsrch string search data
994* @param start offset of match
995* @param end end offset of the match
729e4ab9 996* @return TRUE if there are accents on either side of the match,
b75a7d8f
A
997* FALSE otherwise
998*/
999static
729e4ab9
A
1000UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1001 int32_t end)
b75a7d8f
A
1002{
1003 if (strsrch->pattern.hasSuffixAccents) {
1004 const UChar *text = strsrch->search->text;
1005 int32_t temp = end;
1006 int32_t textlength = strsrch->search->textLength;
4388f060 1007 U16_BACK_1(text, 0, temp);
b75a7d8f 1008 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
b331163b 1009 int32_t firstce = strsrch->pattern.ces[0];
b75a7d8f
A
1010 UCollationElements *coleiter = strsrch->textIter;
1011 UErrorCode status = U_ZERO_ERROR;
4388f060 1012 int32_t ce;
b75a7d8f 1013 setColEIterOffset(coleiter, start);
46f4442e
A
1014 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1015 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
b75a7d8f
A
1016 return TRUE;
1017 }
1018 }
1019 int32_t count = 1;
b331163b 1020 while (count < strsrch->pattern.cesLength) {
729e4ab9 1021 if (getCE(strsrch, ucol_next(coleiter, &status))
b75a7d8f
A
1022 == UCOL_IGNORABLE) {
1023 // Thai can give an ignorable here.
1024 count --;
1025 }
1026 if (U_FAILURE(status)) {
1027 return TRUE;
1028 }
1029 count ++;
1030 }
729e4ab9 1031
4388f060 1032 ce = ucol_next(coleiter, &status);
b75a7d8f
A
1033 if (U_FAILURE(status)) {
1034 return TRUE;
1035 }
46f4442e 1036 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
4388f060 1037 ce = getCE(strsrch, ce);
46f4442e 1038 }
b75a7d8f
A
1039 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1040 if (ucol_getOffset(coleiter) <= end) {
1041 return TRUE;
1042 }
1043 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1044 return TRUE;
1045 }
1046 }
1047 }
1048 }
1049 return FALSE;
1050}
46f4442e 1051#endif // #if BOYER_MOORE
b75a7d8f
A
1052
1053/**
1054* Checks if the offset runs out of the text string
729e4ab9 1055* @param offset
b75a7d8f
A
1056* @param textlength of the text string
1057* @return TRUE if offset is out of bounds, FALSE otherwise
1058*/
1059static
1060inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1061{
1062 return offset < 0 || offset > textlength;
1063}
1064
1065/**
1066* Checks for identical match
1067* @param strsrch string search data
1068* @param start offset of possible match
1069* @param end offset of possible match
1070* @return TRUE if identical match is found
1071*/
1072static
729e4ab9
A
1073inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1074 int32_t end)
b75a7d8f 1075{
b75a7d8f
A
1076 if (strsrch->strength != UCOL_IDENTICAL) {
1077 return TRUE;
1078 }
1079
729e4ab9
A
1080 // Note: We could use Normalizer::compare() or similar, but for short strings
1081 // which may not be in FCD it might be faster to just NFD them.
1082 UErrorCode status = U_ZERO_ERROR;
1083 UnicodeString t2, p2;
1084 strsrch->nfd->normalize(
1085 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1086 strsrch->nfd->normalize(
1087 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
73c04bcf 1088 // return FALSE if NFD failed
729e4ab9 1089 return U_SUCCESS(status) && t2 == p2;
b75a7d8f
A
1090}
1091
729e4ab9 1092#if BOYER_MOORE
b75a7d8f
A
1093/**
1094* Checks to see if the match is repeated
1095* @param strsrch string search data
1096* @param start new match start index
1097* @param end new match end index
1098* @return TRUE if the the match is repeated, FALSE otherwise
1099*/
1100static
1101inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1102 int32_t start,
1103 int32_t end)
1104{
1105 int32_t lastmatchindex = strsrch->search->matchedIndex;
1106 UBool result;
1107 if (lastmatchindex == USEARCH_DONE) {
1108 return FALSE;
1109 }
1110 if (strsrch->search->isForwardSearching) {
1111 result = start <= lastmatchindex;
1112 }
1113 else {
1114 result = start >= lastmatchindex;
1115 }
374ca955 1116 if (!result && !strsrch->search->isOverlap) {
b75a7d8f
A
1117 if (strsrch->search->isForwardSearching) {
1118 result = start < lastmatchindex + strsrch->search->matchedLength;
1119 }
1120 else {
1121 result = end > lastmatchindex;
1122 }
1123 }
1124 return result;
1125}
1126
1127/**
1128* Gets the collation element iterator's current offset.
1129* @param coleiter collation element iterator
1130* @param forwards flag TRUE if we are moving in th forwards direction
729e4ab9 1131* @return current offset
b75a7d8f
A
1132*/
1133static
1134inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1135 UBool forwards)
1136{
1137 int32_t result = ucol_getOffset(coleiter);
1138 // intricacies of the the backwards collation element iterator
46f4442e 1139 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
b75a7d8f
A
1140 result ++;
1141 }
1142 return result;
1143}
1144
1145/**
729e4ab9 1146* Checks match for contraction.
b75a7d8f
A
1147* If the match ends with a partial contraction we fail.
1148* If the match starts too far off (because of backwards iteration) we try to
1149* chip off the extra characters depending on whether a breakiterator has
1150* been used.
729e4ab9 1151* Internal method, error assumed to be success, caller has to check status
b75a7d8f
A
1152* before calling this method.
1153* @param strsrch string search data
1154* @param start offset of potential match, to be modified if necessary
1155* @param end offset of potential match, to be modified if necessary
1156* @param status output error status if any
1157* @return TRUE if match passes the contraction test, FALSE otherwise
1158*/
1159
1160static
729e4ab9
A
1161UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1162 int32_t *start,
1163 int32_t *end, UErrorCode *status)
b75a7d8f
A
1164{
1165 UCollationElements *coleiter = strsrch->textIter;
1166 int32_t textlength = strsrch->search->textLength;
46f4442e 1167 int32_t temp = *start;
b75a7d8f
A
1168 const UCollator *collator = strsrch->collator;
1169 const UChar *text = strsrch->search->text;
729e4ab9 1170 // This part checks if either ends of the match contains potential
b75a7d8f 1171 // contraction. If so we'll have to iterate through them
374ca955
A
1172 // The start contraction needs to be checked since ucol_previous dumps
1173 // all characters till the first safe character into the buffer.
729e4ab9
A
1174 // *start + 1 is used to test for the unsafe characters instead of *start
1175 // because ucol_prev takes all unsafe characters till the first safe
1176 // character ie *start. so by testing *start + 1, we can estimate if
1177 // excess prefix characters has been included in the potential search
374ca955 1178 // results.
729e4ab9
A
1179 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1180 (*start + 1 < textlength
b75a7d8f
A
1181 && ucol_unsafeCP(text[*start + 1], collator))) {
1182 int32_t expansion = getExpansionPrefix(coleiter);
1183 UBool expandflag = expansion > 0;
1184 setColEIterOffset(coleiter, *start);
1185 while (expansion > 0) {
1186 // getting rid of the redundant ce, caused by setOffset.
729e4ab9
A
1187 // since backward contraction/expansion may have extra ces if we
1188 // are in the normalization buffer, hasAccentsBeforeMatch would
b75a7d8f
A
1189 // have taken care of it.
1190 // E.g. the character \u01FA will have an expansion of 3, but if
1191 // we are only looking for acute and ring \u030A and \u0301, we'll
1192 // have to skip the first ce in the expansion buffer.
1193 ucol_next(coleiter, status);
374ca955
A
1194 if (U_FAILURE(*status)) {
1195 return FALSE;
1196 }
b75a7d8f
A
1197 if (ucol_getOffset(coleiter) != temp) {
1198 *start = temp;
1199 temp = ucol_getOffset(coleiter);
1200 }
1201 expansion --;
1202 }
1203
b331163b
A
1204 int32_t *patternce = strsrch->pattern.ces;
1205 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
1206 int32_t count = 0;
1207 while (count < patterncelength) {
374ca955 1208 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
b75a7d8f
A
1209 if (ce == UCOL_IGNORABLE) {
1210 continue;
1211 }
1212 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1213 *start = temp;
1214 temp = ucol_getOffset(coleiter);
1215 }
1216 if (U_FAILURE(*status) || ce != patternce[count]) {
1217 (*end) ++;
729e4ab9 1218 *end = getNextUStringSearchBaseOffset(strsrch, *end);
b75a7d8f
A
1219 return FALSE;
1220 }
1221 count ++;
1222 }
729e4ab9 1223 }
b75a7d8f
A
1224 return TRUE;
1225}
1226
1227/**
1228* Checks and sets the match information if found.
729e4ab9 1229* Checks
b75a7d8f
A
1230* <ul>
1231* <li> the potential match does not repeat the previous match
1232* <li> boundaries are correct
1233* <li> exact matches has no extra accents
1234* <li> identical matchesb
1235* <li> potential match does not end in the middle of a contraction
1236* <\ul>
1237* Otherwise the offset will be shifted to the next character.
729e4ab9 1238* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1239* before calling this method.
1240* @param strsrch string search data
1241* @param textoffset offset in the collation element text. the returned value
729e4ab9 1242* will be the truncated end offset of the match or the new start
b75a7d8f
A
1243* search offset.
1244* @param status output error status if any
1245* @return TRUE if the match is valid, FALSE otherwise
1246*/
1247static
729e4ab9 1248inline UBool checkNextExactMatch(UStringSearch *strsrch,
b75a7d8f
A
1249 int32_t *textoffset, UErrorCode *status)
1250{
1251 UCollationElements *coleiter = strsrch->textIter;
729e4ab9
A
1252 int32_t start = getColElemIterOffset(coleiter, FALSE);
1253
374ca955
A
1254 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1255 return FALSE;
b75a7d8f
A
1256 }
1257
1258 // this totally matches, however we need to check if it is repeating
1259 if (!isBreakUnit(strsrch, start, *textoffset) ||
729e4ab9
A
1260 checkRepeatedMatch(strsrch, start, *textoffset) ||
1261 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
b75a7d8f
A
1262 !checkIdentical(strsrch, start, *textoffset) ||
1263 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
374ca955
A
1264
1265 (*textoffset) ++;
729e4ab9 1266 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
374ca955 1267 return FALSE;
b75a7d8f 1268 }
46f4442e
A
1269
1270 //Add breakiterator boundary check for primary strength search.
1271 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
4388f060 1272 checkBreakBoundary(strsrch, &start, textoffset);
46f4442e 1273 }
729e4ab9 1274
b75a7d8f
A
1275 // totally match, we will get rid of the ending ignorables.
1276 strsrch->search->matchedIndex = start;
1277 strsrch->search->matchedLength = *textoffset - start;
374ca955 1278 return TRUE;
b75a7d8f
A
1279}
1280
1281/**
729e4ab9 1282* Getting the previous base character offset, or the current offset if the
b75a7d8f
A
1283* current character is a base character
1284* @param text string
1285* @param textoffset one offset after the current character
729e4ab9 1286* @return the offset of the next character after the base character or the first
b75a7d8f
A
1287* composed character with accents
1288*/
1289static
729e4ab9 1290inline int32_t getPreviousBaseOffset(const UChar *text,
b75a7d8f
A
1291 int32_t textoffset)
1292{
1293 if (textoffset > 0) {
46f4442e 1294 for (;;) {
b75a7d8f 1295 int32_t result = textoffset;
4388f060 1296 U16_BACK_1(text, 0, textoffset);
b75a7d8f
A
1297 int32_t temp = textoffset;
1298 uint16_t fcd = getFCD(text, &temp, result);
1299 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1300 if (fcd & LAST_BYTE_MASK_) {
1301 return textoffset;
1302 }
1303 return result;
1304 }
1305 if (textoffset == 0) {
1306 return 0;
1307 }
1308 }
1309 }
1310 return textoffset;
1311}
1312
1313/**
1314* Getting the indexes of the accents that are not blocked in the argument
1315* accent array
1316* @param accents array of accents in nfd terminated by a 0.
1317* @param accentsindex array of indexes of the accents that are not blocked
1318*/
1319static
1320inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1321{
1322 int32_t index = 0;
1323 int32_t length = u_strlen(accents);
1324 UChar32 codepoint = 0;
1325 int cclass = 0;
1326 int result = 0;
1327 int32_t temp;
1328 while (index < length) {
1329 temp = index;
4388f060 1330 U16_NEXT(accents, index, length, codepoint);
b75a7d8f
A
1331 if (u_getCombiningClass(codepoint) != cclass) {
1332 cclass = u_getCombiningClass(codepoint);
1333 accentsindex[result] = temp;
1334 result ++;
1335 }
1336 }
1337 accentsindex[result] = length;
1338 return result;
1339}
1340
1341/**
1342* Appends 3 UChar arrays to a destination array.
729e4ab9 1343* Creates a new array if we run out of space. The caller will have to
b75a7d8f 1344* manually deallocate the newly allocated array.
729e4ab9
A
1345* Internal method, status assumed to be success, caller has to check status
1346* before calling this method. destination not to be NULL and has at least
b75a7d8f
A
1347* size destinationlength.
1348* @param destination target array
1349* @param destinationlength target array size, returning the appended length
1350* @param source1 null-terminated first array
1351* @param source2 second array
1352* @param source2length length of seond array
1353* @param source3 null-terminated third array
1354* @param status error status if any
1355* @return new destination array, destination if there was no new allocation
1356*/
1357static
729e4ab9
A
1358inline UChar * addToUCharArray( UChar *destination,
1359 int32_t *destinationlength,
1360 const UChar *source1,
b75a7d8f 1361 const UChar *source2,
729e4ab9
A
1362 int32_t source2length,
1363 const UChar *source3,
1364 UErrorCode *status)
b75a7d8f
A
1365{
1366 int32_t source1length = source1 ? u_strlen(source1) : 0;
729e4ab9
A
1367 int32_t source3length = source3 ? u_strlen(source3) : 0;
1368 if (*destinationlength < source1length + source2length + source3length +
1369 1)
b75a7d8f
A
1370 {
1371 destination = (UChar *)allocateMemory(
1372 (source1length + source2length + source3length + 1) * sizeof(UChar),
1373 status);
729e4ab9 1374 // if error allocating memory, status will be
b75a7d8f
A
1375 // U_MEMORY_ALLOCATION_ERROR
1376 if (U_FAILURE(*status)) {
1377 *destinationlength = 0;
1378 return NULL;
1379 }
1380 }
1381 if (source1length != 0) {
a62d09fc 1382 u_memcpy(destination, source1, source1length);
b75a7d8f
A
1383 }
1384 if (source2length != 0) {
729e4ab9 1385 uprv_memcpy(destination + source1length, source2,
b75a7d8f
A
1386 sizeof(UChar) * source2length);
1387 }
1388 if (source3length != 0) {
729e4ab9 1389 uprv_memcpy(destination + source1length + source2length, source3,
b75a7d8f
A
1390 sizeof(UChar) * source3length);
1391 }
1392 *destinationlength = source1length + source2length + source3length;
1393 return destination;
1394}
1395
1396/**
1397* Running through a collation element iterator to see if the contents matches
1398* pattern in string search data
1399* @param strsrch string search data
1400* @param coleiter collation element iterator
1401* @return TRUE if a match if found, FALSE otherwise
1402*/
1403static
729e4ab9 1404inline UBool checkCollationMatch(const UStringSearch *strsrch,
b75a7d8f
A
1405 UCollationElements *coleiter)
1406{
b331163b
A
1407 int patternceindex = strsrch->pattern.cesLength;
1408 int32_t *patternce = strsrch->pattern.ces;
b75a7d8f
A
1409 UErrorCode status = U_ZERO_ERROR;
1410 while (patternceindex > 0) {
374ca955 1411 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
1412 if (ce == UCOL_IGNORABLE) {
1413 continue;
1414 }
1415 if (U_FAILURE(status) || ce != *patternce) {
1416 return FALSE;
1417 }
1418 patternce ++;
1419 patternceindex --;
1420 }
1421 return TRUE;
1422}
1423
1424/**
1425* Rearranges the front accents to try matching.
729e4ab9
A
1426* Prefix accents in the text will be grouped according to their combining
1427* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
1428* match with the pattern.
1429* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1430* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 1431* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
1432* "\u0301\u0325".
1433* step 2: check if any of the generated substrings matches the pattern.
1434* Internal method, status is assumed to be success, caller has to check status
1435* before calling this method.
1436* @param strsrch string search match
1437* @param start first offset of the accents to start searching
1438* @param end start of the last accent set
1439* @param status output error status if any
1440* @return USEARCH_DONE if a match is not found, otherwise return the starting
1441* offset of the match. Note this start includes all preceding accents.
1442*/
1443static
729e4ab9 1444int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
b75a7d8f 1445 int32_t start,
729e4ab9 1446 int32_t end,
b75a7d8f
A
1447 UErrorCode *status)
1448{
1449 const UChar *text = strsrch->search->text;
1450 int32_t textlength = strsrch->search->textLength;
1451 int32_t tempstart = start;
1452
1453 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1454 // die... failed at a base character
1455 return USEARCH_DONE;
1456 }
1457
1458 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1459 start = getPreviousBaseOffset(text, tempstart);
1460
1461 UChar accents[INITIAL_ARRAY_SIZE_];
1462 // normalizing the offensive string
729e4ab9
A
1463 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1464 INITIAL_ARRAY_SIZE_, status);
b75a7d8f
A
1465 if (U_FAILURE(*status)) {
1466 return USEARCH_DONE;
1467 }
729e4ab9
A
1468
1469 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1470 int32_t accentsize = getUnblockedAccentIndex(accents,
b75a7d8f 1471 accentsindex);
729e4ab9 1472 int32_t count = (2 << (accentsize - 1)) - 1;
b75a7d8f
A
1473 UChar buffer[INITIAL_ARRAY_SIZE_];
1474 UCollationElements *coleiter = strsrch->utilIter;
1475 while (U_SUCCESS(*status) && count > 0) {
1476 UChar *rearrange = strsrch->canonicalPrefixAccents;
1477 // copy the base characters
1478 for (int k = 0; k < accentsindex[0]; k ++) {
1479 *rearrange ++ = accents[k];
1480 }
1481 // forming all possible canonical rearrangement by dropping
1482 // sets of accents
1483 for (int i = 0; i <= accentsize - 1; i ++) {
1484 int32_t mask = 1 << (accentsize - i - 1);
1485 if (count & mask) {
1486 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1487 *rearrange ++ = accents[j];
1488 }
1489 }
1490 }
1491 *rearrange = 0;
1492 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1493 UChar *match = addToUCharArray(buffer, &matchsize,
1494 strsrch->canonicalPrefixAccents,
1495 strsrch->search->text + offset,
1496 end - offset,
1497 strsrch->canonicalSuffixAccents,
1498 status);
729e4ab9 1499
b75a7d8f
A
1500 // if status is a failure, ucol_setText does nothing.
1501 // run the collator iterator through this match
1502 ucol_setText(coleiter, match, matchsize, status);
1503 if (U_SUCCESS(*status)) {
1504 if (checkCollationMatch(strsrch, coleiter)) {
1505 if (match != buffer) {
1506 uprv_free(match);
1507 }
1508 return start;
1509 }
1510 }
1511 count --;
1512 }
1513 return USEARCH_DONE;
1514}
1515
1516/**
1517* Gets the offset to the safe point in text before textoffset.
1518* ie. not the middle of a contraction, swappable characters or supplementary
1519* characters.
1520* @param collator collation sata
1521* @param text string to work with
1522* @param textoffset offset in string
1523* @param textlength length of text string
1524* @return offset to the previous safe character
1525*/
1526static
729e4ab9 1527inline uint32_t getPreviousSafeOffset(const UCollator *collator,
b75a7d8f
A
1528 const UChar *text,
1529 int32_t textoffset)
1530{
1531 int32_t result = textoffset; // first contraction character
1532 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1533 result --;
1534 }
1535 if (result != 0) {
1536 // the first contraction character is consider unsafe here
1537 result --;
1538 }
729e4ab9 1539 return result;
b75a7d8f
A
1540}
1541
1542/**
1543* Cleaning up after we passed the safe zone
1544* @param strsrch string search data
1545* @param safetext safe text array
1546* @param safebuffer safe text buffer
1547* @param coleiter collation element iterator for safe text
1548*/
1549static
1550inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1551 UChar *safebuffer)
1552{
729e4ab9 1553 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
b75a7d8f
A
1554 {
1555 uprv_free(safetext);
1556 }
1557}
1558
1559/**
1560* Take the rearranged end accents and tries matching. If match failed at
1561* a seperate preceding set of accents (seperated from the rearranged on by
729e4ab9 1562* at least a base character) then we rearrange the preceding accents and
b75a7d8f 1563* tries matching again.
729e4ab9 1564* We allow skipping of the ends of the accent set if the ces do not match.
b75a7d8f
A
1565* However if the failure is found before the accent set, it fails.
1566* Internal method, status assumed to be success, caller has to check status
1567* before calling this method.
1568* @param strsrch string search data
1569* @param textoffset of the start of the rearranged accent
1570* @param status output error status if any
1571* @return USEARCH_DONE if a match is not found, otherwise return the starting
1572* offset of the match. Note this start includes all preceding accents.
1573*/
1574static
729e4ab9 1575int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
b75a7d8f
A
1576 int32_t textoffset,
1577 UErrorCode *status)
1578{
1579 const UChar *text = strsrch->search->text;
1580 const UCollator *collator = strsrch->collator;
1581 int32_t safelength = 0;
1582 UChar *safetext;
1583 int32_t safetextlength;
1584 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1585 UCollationElements *coleiter = strsrch->utilIter;
1586 int32_t safeoffset = textoffset;
1587
729e4ab9 1588 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
b75a7d8f
A
1589 collator)) {
1590 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1591 safelength = textoffset - safeoffset;
1592 safetextlength = INITIAL_ARRAY_SIZE_;
729e4ab9
A
1593 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1594 text + safeoffset, safelength,
1595 strsrch->canonicalSuffixAccents,
b75a7d8f
A
1596 status);
1597 }
1598 else {
1599 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1600 safetext = strsrch->canonicalSuffixAccents;
1601 }
1602
1603 // if status is a failure, ucol_setText does nothing
1604 ucol_setText(coleiter, safetext, safetextlength, status);
1605 // status checked in loop below
1606
b331163b
A
1607 int32_t *ce = strsrch->pattern.ces;
1608 int32_t celength = strsrch->pattern.cesLength;
b75a7d8f
A
1609 int ceindex = celength - 1;
1610 UBool isSafe = TRUE; // indication flag for position in safe zone
729e4ab9 1611
b75a7d8f 1612 while (ceindex >= 0) {
374ca955 1613 int32_t textce = ucol_previous(coleiter, status);
b75a7d8f
A
1614 if (U_FAILURE(*status)) {
1615 if (isSafe) {
1616 cleanUpSafeText(strsrch, safetext, safebuffer);
1617 }
1618 return USEARCH_DONE;
1619 }
1620 if (textce == UCOL_NULLORDER) {
1621 // check if we have passed the safe buffer
1622 if (coleiter == strsrch->textIter) {
1623 cleanUpSafeText(strsrch, safetext, safebuffer);
1624 return USEARCH_DONE;
1625 }
1626 cleanUpSafeText(strsrch, safetext, safebuffer);
1627 safetext = safebuffer;
1628 coleiter = strsrch->textIter;
1629 setColEIterOffset(coleiter, safeoffset);
1630 // status checked at the start of the loop
1631 isSafe = FALSE;
1632 continue;
1633 }
1634 textce = getCE(strsrch, textce);
1635 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1636 // do the beginning stuff
1637 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1638 if (isSafe && failedoffset >= safelength) {
1639 // alas... no hope. failed at rearranged accent set
1640 cleanUpSafeText(strsrch, safetext, safebuffer);
1641 return USEARCH_DONE;
1642 }
1643 else {
1644 if (isSafe) {
1645 failedoffset += safeoffset;
1646 cleanUpSafeText(strsrch, safetext, safebuffer);
1647 }
729e4ab9 1648
b75a7d8f 1649 // try rearranging the front accents
729e4ab9 1650 int32_t result = doNextCanonicalPrefixMatch(strsrch,
b75a7d8f
A
1651 failedoffset, textoffset, status);
1652 if (result != USEARCH_DONE) {
1653 // if status is a failure, ucol_setOffset does nothing
1654 setColEIterOffset(strsrch->textIter, result);
1655 }
1656 if (U_FAILURE(*status)) {
1657 return USEARCH_DONE;
1658 }
1659 return result;
1660 }
1661 }
1662 if (textce == ce[ceindex]) {
1663 ceindex --;
1664 }
1665 }
1666 // set offset here
1667 if (isSafe) {
1668 int32_t result = getColElemIterOffset(coleiter, FALSE);
1669 // sets the text iterator here with the correct expansion and offset
1670 int32_t leftoverces = getExpansionPrefix(coleiter);
1671 cleanUpSafeText(strsrch, safetext, safebuffer);
729e4ab9 1672 if (result >= safelength) {
b75a7d8f
A
1673 result = textoffset;
1674 }
1675 else {
1676 result += safeoffset;
1677 }
1678 setColEIterOffset(strsrch->textIter, result);
729e4ab9 1679 strsrch->textIter->iteratordata_.toReturn =
b75a7d8f
A
1680 setExpansionPrefix(strsrch->textIter, leftoverces);
1681 return result;
1682 }
729e4ab9
A
1683
1684 return ucol_getOffset(coleiter);
b75a7d8f
A
1685}
1686
1687/**
1688* Trying out the substring and sees if it can be a canonical match.
1689* This will try normalizing the end accents and arranging them into canonical
1690* equivalents and check their corresponding ces with the pattern ce.
729e4ab9
A
1691* Suffix accents in the text will be grouped according to their combining
1692* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
1693* match with the pattern.
1694* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1695* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 1696* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
1697* "\u0301\u0325".
1698* step 2: check if any of the generated substrings matches the pattern.
729e4ab9 1699* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1700* before calling this method.
1701* @param strsrch string search data
729e4ab9 1702* @param textoffset end offset in the collation element text that ends with
b75a7d8f
A
1703* the accents to be rearranged
1704* @param status error status if any
1705* @return TRUE if the match is valid, FALSE otherwise
1706*/
1707static
729e4ab9
A
1708UBool doNextCanonicalMatch(UStringSearch *strsrch,
1709 int32_t textoffset,
b75a7d8f
A
1710 UErrorCode *status)
1711{
1712 const UChar *text = strsrch->search->text;
1713 int32_t temp = textoffset;
4388f060 1714 U16_BACK_1(text, 0, temp);
b75a7d8f
A
1715 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1716 UCollationElements *coleiter = strsrch->textIter;
1717 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1718 if (strsrch->pattern.hasPrefixAccents) {
729e4ab9 1719 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
b75a7d8f
A
1720 status);
1721 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1722 setColEIterOffset(coleiter, offset);
1723 return TRUE;
1724 }
1725 }
1726 return FALSE;
1727 }
1728
1729 if (!strsrch->pattern.hasSuffixAccents) {
1730 return FALSE;
1731 }
1732
1733 UChar accents[INITIAL_ARRAY_SIZE_];
1734 // offset to the last base character in substring to search
1735 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1736 // normalizing the offensive string
729e4ab9
A
1737 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1738 0, accents, INITIAL_ARRAY_SIZE_, status);
b75a7d8f 1739 // status checked in loop below
729e4ab9 1740
b75a7d8f
A
1741 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1742 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1743
374ca955
A
1744 // 2 power n - 1 plus the full set of accents
1745 int32_t count = (2 << (size - 1)) - 1;
b75a7d8f
A
1746 while (U_SUCCESS(*status) && count > 0) {
1747 UChar *rearrange = strsrch->canonicalSuffixAccents;
1748 // copy the base characters
1749 for (int k = 0; k < accentsindex[0]; k ++) {
1750 *rearrange ++ = accents[k];
1751 }
1752 // forming all possible canonical rearrangement by dropping
1753 // sets of accents
1754 for (int i = 0; i <= size - 1; i ++) {
1755 int32_t mask = 1 << (size - i - 1);
1756 if (count & mask) {
1757 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1758 *rearrange ++ = accents[j];
1759 }
1760 }
1761 }
1762 *rearrange = 0;
729e4ab9 1763 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
b75a7d8f
A
1764 status);
1765 if (offset != USEARCH_DONE) {
1766 return TRUE; // match found
1767 }
1768 count --;
1769 }
1770 return FALSE;
1771}
1772
1773/**
729e4ab9 1774* Gets the previous base character offset depending on the string search
b75a7d8f
A
1775* pattern data
1776* @param strsrch string search data
1777* @param textoffset current offset, current character
1778* @return the offset of the next character after this base character or itself
1779* if it is a composed character with accents
1780*/
1781static
729e4ab9 1782inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
b75a7d8f
A
1783 int32_t textoffset)
1784{
1785 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1786 const UChar *text = strsrch->search->text;
1787 int32_t offset = textoffset;
729e4ab9 1788 if (getFCD(text, &offset, strsrch->search->textLength) >>
b75a7d8f
A
1789 SECOND_LAST_BYTE_SHIFT_) {
1790 return getPreviousBaseOffset(text, textoffset);
1791 }
1792 }
1793 return textoffset;
1794}
1795
1796/**
729e4ab9 1797* Checks match for contraction.
b75a7d8f
A
1798* If the match ends with a partial contraction we fail.
1799* If the match starts too far off (because of backwards iteration) we try to
1800* chip off the extra characters
729e4ab9 1801* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1802* before calling this method.
1803* @param strsrch string search data
1804* @param start offset of potential match, to be modified if necessary
1805* @param end offset of potential match, to be modified if necessary
1806* @param status output error status if any
1807* @return TRUE if match passes the contraction test, FALSE otherwise
1808*/
1809static
729e4ab9
A
1810UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1811 int32_t *start,
1812 int32_t *end,
1813 UErrorCode *status)
b75a7d8f
A
1814{
1815 UCollationElements *coleiter = strsrch->textIter;
1816 int32_t textlength = strsrch->search->textLength;
1817 int32_t temp = *start;
1818 const UCollator *collator = strsrch->collator;
1819 const UChar *text = strsrch->search->text;
729e4ab9 1820 // This part checks if either ends of the match contains potential
b75a7d8f 1821 // contraction. If so we'll have to iterate through them
729e4ab9
A
1822 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1823 (*start + 1 < textlength
b75a7d8f
A
1824 && ucol_unsafeCP(text[*start + 1], collator))) {
1825 int32_t expansion = getExpansionPrefix(coleiter);
1826 UBool expandflag = expansion > 0;
1827 setColEIterOffset(coleiter, *start);
1828 while (expansion > 0) {
1829 // getting rid of the redundant ce, caused by setOffset.
729e4ab9
A
1830 // since backward contraction/expansion may have extra ces if we
1831 // are in the normalization buffer, hasAccentsBeforeMatch would
b75a7d8f
A
1832 // have taken care of it.
1833 // E.g. the character \u01FA will have an expansion of 3, but if
1834 // we are only looking for acute and ring \u030A and \u0301, we'll
1835 // have to skip the first ce in the expansion buffer.
1836 ucol_next(coleiter, status);
374ca955
A
1837 if (U_FAILURE(*status)) {
1838 return FALSE;
1839 }
b75a7d8f
A
1840 if (ucol_getOffset(coleiter) != temp) {
1841 *start = temp;
1842 temp = ucol_getOffset(coleiter);
1843 }
1844 expansion --;
1845 }
1846
b331163b
A
1847 int32_t *patternce = strsrch->pattern.ces;
1848 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
1849 int32_t count = 0;
1850 int32_t textlength = strsrch->search->textLength;
1851 while (count < patterncelength) {
374ca955 1852 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
b75a7d8f
A
1853 // status checked below, note that if status is a failure
1854 // ucol_next returns UCOL_NULLORDER
1855 if (ce == UCOL_IGNORABLE) {
1856 continue;
1857 }
1858 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1859 *start = temp;
1860 temp = ucol_getOffset(coleiter);
1861 }
1862
1863 if (count == 0 && ce != patternce[0]) {
729e4ab9 1864 // accents may have extra starting ces, this occurs when a
b75a7d8f
A
1865 // pure accent pattern is matched without rearrangement
1866 // text \u0325\u0300 and looking for \u0300
729e4ab9 1867 int32_t expected = patternce[0];
b75a7d8f
A
1868 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1869 ce = getCE(strsrch, ucol_next(coleiter, status));
729e4ab9 1870 while (U_SUCCESS(*status) && ce != expected &&
b75a7d8f
A
1871 ce != UCOL_NULLORDER &&
1872 ucol_getOffset(coleiter) <= *end) {
1873 ce = getCE(strsrch, ucol_next(coleiter, status));
1874 }
1875 }
1876 }
1877 if (U_FAILURE(*status) || ce != patternce[count]) {
1878 (*end) ++;
729e4ab9 1879 *end = getNextUStringSearchBaseOffset(strsrch, *end);
b75a7d8f
A
1880 return FALSE;
1881 }
1882 count ++;
1883 }
729e4ab9 1884 }
b75a7d8f
A
1885 return TRUE;
1886}
1887
1888/**
1889* Checks and sets the match information if found.
729e4ab9 1890* Checks
b75a7d8f
A
1891* <ul>
1892* <li> the potential match does not repeat the previous match
1893* <li> boundaries are correct
1894* <li> potential match does not end in the middle of a contraction
1895* <li> identical matches
1896* <\ul>
1897* Otherwise the offset will be shifted to the next character.
729e4ab9 1898* Internal method, status assumed to be success, caller has to check the
b75a7d8f
A
1899* status before calling this method.
1900* @param strsrch string search data
1901* @param textoffset offset in the collation element text. the returned value
729e4ab9 1902* will be the truncated end offset of the match or the new start
b75a7d8f
A
1903* search offset.
1904* @param status output error status if any
1905* @return TRUE if the match is valid, FALSE otherwise
1906*/
1907static
729e4ab9
A
1908inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1909 int32_t *textoffset,
b75a7d8f
A
1910 UErrorCode *status)
1911{
1912 // to ensure that the start and ends are not composite characters
1913 UCollationElements *coleiter = strsrch->textIter;
1914 // if we have a canonical accent match
729e4ab9
A
1915 if ((strsrch->pattern.hasSuffixAccents &&
1916 strsrch->canonicalSuffixAccents[0]) ||
1917 (strsrch->pattern.hasPrefixAccents &&
b75a7d8f
A
1918 strsrch->canonicalPrefixAccents[0])) {
1919 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1920 strsrch,
1921 ucol_getOffset(coleiter));
729e4ab9 1922 strsrch->search->matchedLength = *textoffset -
b75a7d8f
A
1923 strsrch->search->matchedIndex;
1924 return TRUE;
1925 }
1926
1927 int32_t start = getColElemIterOffset(coleiter, FALSE);
729e4ab9 1928 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
b75a7d8f
A
1929 status) || U_FAILURE(*status)) {
1930 return FALSE;
1931 }
729e4ab9 1932
b75a7d8f
A
1933 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1934 // this totally matches, however we need to check if it is repeating
729e4ab9
A
1935 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1936 !isBreakUnit(strsrch, start, *textoffset) ||
b75a7d8f
A
1937 !checkIdentical(strsrch, start, *textoffset)) {
1938 (*textoffset) ++;
729e4ab9 1939 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
b75a7d8f
A
1940 strsrch->search->textLength);
1941 return FALSE;
1942 }
729e4ab9 1943
b75a7d8f
A
1944 strsrch->search->matchedIndex = start;
1945 strsrch->search->matchedLength = *textoffset - start;
1946 return TRUE;
1947}
1948
1949/**
1950* Shifting the collation element iterator position forward to prepare for
1951* a preceding match. If the first character is a unsafe character, we'll only
1952* shift by 1 to capture contractions, normalization etc.
729e4ab9 1953* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1954* before calling this method.
1955* @param text strsrch string search data
1956* @param textoffset start text position to do search
1957* @param ce the text ce which failed the match.
1958* @param patternceindex index of the ce within the pattern ce buffer which
1959* failed the match
1960* @return final offset
1961*/
1962static
1963inline int32_t reverseShift(UStringSearch *strsrch,
1964 int32_t textoffset,
374ca955 1965 int32_t ce,
b75a7d8f 1966 int32_t patternceindex)
729e4ab9 1967{
b75a7d8f
A
1968 if (strsrch->search->isOverlap) {
1969 if (textoffset != strsrch->search->textLength) {
1970 textoffset --;
1971 }
1972 else {
1973 textoffset -= strsrch->pattern.defaultShiftSize;
1974 }
1975 }
1976 else {
1977 if (ce != UCOL_NULLORDER) {
2ca993e8 1978 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
729e4ab9
A
1979
1980 // this is to adjust for characters in the middle of the substring
b75a7d8f
A
1981 // for matching that failed.
1982 int32_t adjust = patternceindex;
1983 if (adjust > 1 && shift > adjust) {
1984 shift -= adjust - 1;
1985 }
1986 textoffset -= shift;
1987 }
1988 else {
1989 textoffset -= strsrch->pattern.defaultShiftSize;
1990 }
729e4ab9 1991 }
b75a7d8f
A
1992 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1993 return textoffset;
1994}
1995
1996/**
729e4ab9 1997* Checks match for contraction.
b75a7d8f 1998* If the match starts with a partial contraction we fail.
729e4ab9 1999* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
2000* before calling this method.
2001* @param strsrch string search data
2002* @param start offset of potential match, to be modified if necessary
2003* @param end offset of potential match, to be modified if necessary
2004* @param status output error status if any
2005* @return TRUE if match passes the contraction test, FALSE otherwise
2006*/
2007static
729e4ab9
A
2008UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2009 int32_t *start,
2010 int32_t *end, UErrorCode *status)
b75a7d8f
A
2011{
2012 UCollationElements *coleiter = strsrch->textIter;
2013 int32_t textlength = strsrch->search->textLength;
2014 int32_t temp = *end;
2015 const UCollator *collator = strsrch->collator;
2016 const UChar *text = strsrch->search->text;
729e4ab9 2017 // This part checks if either if the start of the match contains potential
b75a7d8f 2018 // contraction. If so we'll have to iterate through them
729e4ab9 2019 // Since we used ucol_next while previously looking for the potential
374ca955
A
2020 // match, this guarantees that our end will not be a partial contraction,
2021 // or a partial supplementary character.
b75a7d8f
A
2022 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2023 int32_t expansion = getExpansionSuffix(coleiter);
2024 UBool expandflag = expansion > 0;
2025 setColEIterOffset(coleiter, *end);
2026 while (U_SUCCESS(*status) && expansion > 0) {
2027 // getting rid of the redundant ce
2028 // since forward contraction/expansion may have extra ces
2029 // if we are in the normalization buffer, hasAccentsBeforeMatch
2030 // would have taken care of it.
2031 // E.g. the character \u01FA will have an expansion of 3, but if
729e4ab9 2032 // we are only looking for A ring A\u030A, we'll have to skip the
b75a7d8f
A
2033 // last ce in the expansion buffer
2034 ucol_previous(coleiter, status);
374ca955
A
2035 if (U_FAILURE(*status)) {
2036 return FALSE;
2037 }
b75a7d8f
A
2038 if (ucol_getOffset(coleiter) != temp) {
2039 *end = temp;
2040 temp = ucol_getOffset(coleiter);
2041 }
2042 expansion --;
2043 }
2044
b331163b
A
2045 int32_t *patternce = strsrch->pattern.ces;
2046 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
2047 int32_t count = patterncelength;
2048 while (count > 0) {
374ca955 2049 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
b75a7d8f
A
2050 // status checked below, note that if status is a failure
2051 // ucol_previous returns UCOL_NULLORDER
2052 if (ce == UCOL_IGNORABLE) {
2053 continue;
2054 }
729e4ab9 2055 if (expandflag && count == 0 &&
b75a7d8f
A
2056 getColElemIterOffset(coleiter, FALSE) != temp) {
2057 *end = temp;
2058 temp = ucol_getOffset(coleiter);
2059 }
2060 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2061 (*start) --;
2062 *start = getPreviousBaseOffset(text, *start);
2063 return FALSE;
2064 }
2065 count --;
2066 }
729e4ab9 2067 }
b75a7d8f
A
2068 return TRUE;
2069}
2070
2071/**
2072* Checks and sets the match information if found.
729e4ab9 2073* Checks
b75a7d8f
A
2074* <ul>
2075* <li> the current match does not repeat the last match
2076* <li> boundaries are correct
2077* <li> exact matches has no extra accents
2078* <li> identical matches
2079* <\ul>
2080* Otherwise the offset will be shifted to the preceding character.
2081* Internal method, status assumed to be success, caller has to check status
2082* before calling this method.
2083* @param strsrch string search data
729e4ab9 2084* @param collator
b75a7d8f
A
2085* @param coleiter collation element iterator
2086* @param text string
2087* @param textoffset offset in the collation element text. the returned value
729e4ab9 2088* will be the truncated start offset of the match or the new start
b75a7d8f
A
2089* search offset.
2090* @param status output error status if any
2091* @return TRUE if the match is valid, FALSE otherwise
2092*/
2093static
729e4ab9
A
2094inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2095 int32_t *textoffset,
b75a7d8f
A
2096 UErrorCode *status)
2097{
2098 // to ensure that the start and ends are not composite characters
729e4ab9 2099 int32_t end = ucol_getOffset(strsrch->textIter);
b75a7d8f
A
2100 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2101 || U_FAILURE(*status)) {
2102 return FALSE;
2103 }
729e4ab9 2104
b75a7d8f
A
2105 // this totally matches, however we need to check if it is repeating
2106 // the old match
729e4ab9 2107 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
b75a7d8f
A
2108 !isBreakUnit(strsrch, *textoffset, end) ||
2109 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
729e4ab9 2110 !checkIdentical(strsrch, *textoffset, end) ||
b75a7d8f
A
2111 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2112 (*textoffset) --;
729e4ab9 2113 *textoffset = getPreviousBaseOffset(strsrch->search->text,
b75a7d8f
A
2114 *textoffset);
2115 return FALSE;
2116 }
729e4ab9 2117
46f4442e
A
2118 //Add breakiterator boundary check for primary strength search.
2119 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
4388f060 2120 checkBreakBoundary(strsrch, textoffset, &end);
46f4442e 2121 }
729e4ab9 2122
b75a7d8f
A
2123 strsrch->search->matchedIndex = *textoffset;
2124 strsrch->search->matchedLength = end - *textoffset;
2125 return TRUE;
2126}
2127
2128/**
2129* Rearranges the end accents to try matching.
729e4ab9
A
2130* Suffix accents in the text will be grouped according to their combining
2131* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
2132* match with the pattern.
2133* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2134* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 2135* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
2136* "\u0301\u0325".
2137* step 2: check if any of the generated substrings matches the pattern.
729e4ab9 2138* Internal method, status assumed to be success, user has to check status
b75a7d8f
A
2139* before calling this method.
2140* @param strsrch string search match
2141* @param start offset of the first base character
2142* @param end start of the last accent set
2143* @param status only error status if any
2144* @return USEARCH_DONE if a match is not found, otherwise return the ending
2145* offset of the match. Note this start includes all following accents.
2146*/
2147static
729e4ab9 2148int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
b75a7d8f 2149 int32_t start,
729e4ab9 2150 int32_t end,
b75a7d8f
A
2151 UErrorCode *status)
2152{
2153 const UChar *text = strsrch->search->text;
2154 int32_t tempend = end;
2155
4388f060 2156 U16_BACK_1(text, 0, tempend);
729e4ab9 2157 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
b75a7d8f
A
2158 LAST_BYTE_MASK_)) {
2159 // die... failed at a base character
2160 return USEARCH_DONE;
2161 }
2162 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2163
2164 if (U_SUCCESS(*status)) {
2165 UChar accents[INITIAL_ARRAY_SIZE_];
2166 int32_t offset = getPreviousBaseOffset(text, end);
2167 // normalizing the offensive string
729e4ab9
A
2168 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2169 INITIAL_ARRAY_SIZE_, status);
2170
2171 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2172 int32_t accentsize = getUnblockedAccentIndex(accents,
b75a7d8f 2173 accentsindex);
729e4ab9 2174 int32_t count = (2 << (accentsize - 1)) - 1;
b75a7d8f
A
2175 UChar buffer[INITIAL_ARRAY_SIZE_];
2176 UCollationElements *coleiter = strsrch->utilIter;
2177 while (U_SUCCESS(*status) && count > 0) {
2178 UChar *rearrange = strsrch->canonicalSuffixAccents;
2179 // copy the base characters
2180 for (int k = 0; k < accentsindex[0]; k ++) {
2181 *rearrange ++ = accents[k];
2182 }
2183 // forming all possible canonical rearrangement by dropping
2184 // sets of accents
2185 for (int i = 0; i <= accentsize - 1; i ++) {
2186 int32_t mask = 1 << (accentsize - i - 1);
2187 if (count & mask) {
2188 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2189 *rearrange ++ = accents[j];
2190 }
2191 }
2192 }
2193 *rearrange = 0;
2194 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2195 UChar *match = addToUCharArray(buffer, &matchsize,
2196 strsrch->canonicalPrefixAccents,
2197 strsrch->search->text + start,
2198 offset - start,
2199 strsrch->canonicalSuffixAccents,
2200 status);
729e4ab9 2201
b75a7d8f
A
2202 // run the collator iterator through this match
2203 // if status is a failure ucol_setText does nothing
2204 ucol_setText(coleiter, match, matchsize, status);
2205 if (U_SUCCESS(*status)) {
2206 if (checkCollationMatch(strsrch, coleiter)) {
2207 if (match != buffer) {
2208 uprv_free(match);
2209 }
2210 return end;
2211 }
2212 }
2213 count --;
2214 }
2215 }
2216 return USEARCH_DONE;
2217}
2218
2219/**
2220* Take the rearranged start accents and tries matching. If match failed at
2221* a seperate following set of accents (seperated from the rearranged on by
729e4ab9 2222* at least a base character) then we rearrange the preceding accents and
b75a7d8f 2223* tries matching again.
729e4ab9 2224* We allow skipping of the ends of the accent set if the ces do not match.
b75a7d8f 2225* However if the failure is found before the accent set, it fails.
729e4ab9 2226* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
2227* before calling this method.
2228* @param strsrch string search data
2229* @param textoffset of the ends of the rearranged accent
2230* @param status output error status if any
2231* @return USEARCH_DONE if a match is not found, otherwise return the ending
2232* offset of the match. Note this start includes all following accents.
2233*/
2234static
729e4ab9 2235int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
b75a7d8f
A
2236 int32_t textoffset,
2237 UErrorCode *status)
2238{
2239 const UChar *text = strsrch->search->text;
2240 const UCollator *collator = strsrch->collator;
2241 int32_t safelength = 0;
2242 UChar *safetext;
2243 int32_t safetextlength;
2244 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2245 int32_t safeoffset = textoffset;
2246
729e4ab9 2247 if (textoffset &&
b75a7d8f
A
2248 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2249 u_strlen(strsrch->canonicalPrefixAccents) - 1
2250 ], collator)) {
729e4ab9 2251 safeoffset = getNextSafeOffset(collator, text, textoffset,
b75a7d8f
A
2252 strsrch->search->textLength);
2253 safelength = safeoffset - textoffset;
2254 safetextlength = INITIAL_ARRAY_SIZE_;
729e4ab9
A
2255 safetext = addToUCharArray(safebuffer, &safetextlength,
2256 strsrch->canonicalPrefixAccents,
2257 text + textoffset, safelength,
b75a7d8f
A
2258 NULL, status);
2259 }
2260 else {
2261 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2262 safetext = strsrch->canonicalPrefixAccents;
2263 }
2264
2265 UCollationElements *coleiter = strsrch->utilIter;
2266 // if status is a failure, ucol_setText does nothing
2267 ucol_setText(coleiter, safetext, safetextlength, status);
2268 // status checked in loop below
729e4ab9 2269
b331163b
A
2270 int32_t *ce = strsrch->pattern.ces;
2271 int32_t celength = strsrch->pattern.cesLength;
b75a7d8f
A
2272 int ceindex = 0;
2273 UBool isSafe = TRUE; // safe zone indication flag for position
2274 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
729e4ab9 2275
b75a7d8f 2276 while (ceindex < celength) {
374ca955 2277 int32_t textce = ucol_next(coleiter, status);
b75a7d8f
A
2278 if (U_FAILURE(*status)) {
2279 if (isSafe) {
2280 cleanUpSafeText(strsrch, safetext, safebuffer);
2281 }
2282 return USEARCH_DONE;
2283 }
2284 if (textce == UCOL_NULLORDER) {
2285 // check if we have passed the safe buffer
2286 if (coleiter == strsrch->textIter) {
2287 cleanUpSafeText(strsrch, safetext, safebuffer);
2288 return USEARCH_DONE;
2289 }
2290 cleanUpSafeText(strsrch, safetext, safebuffer);
2291 safetext = safebuffer;
2292 coleiter = strsrch->textIter;
2293 setColEIterOffset(coleiter, safeoffset);
2294 // status checked at the start of the loop
2295 isSafe = FALSE;
2296 continue;
2297 }
2298 textce = getCE(strsrch, textce);
2299 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2300 // do the beginning stuff
2301 int32_t failedoffset = ucol_getOffset(coleiter);
2302 if (isSafe && failedoffset <= prefixlength) {
2303 // alas... no hope. failed at rearranged accent set
2304 cleanUpSafeText(strsrch, safetext, safebuffer);
2305 return USEARCH_DONE;
2306 }
2307 else {
2308 if (isSafe) {
2309 failedoffset = safeoffset - failedoffset;
2310 cleanUpSafeText(strsrch, safetext, safebuffer);
2311 }
729e4ab9 2312
b75a7d8f 2313 // try rearranging the end accents
729e4ab9 2314 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
b75a7d8f
A
2315 textoffset, failedoffset, status);
2316 if (result != USEARCH_DONE) {
2317 // if status is a failure, ucol_setOffset does nothing
2318 setColEIterOffset(strsrch->textIter, result);
2319 }
2320 if (U_FAILURE(*status)) {
2321 return USEARCH_DONE;
2322 }
2323 return result;
2324 }
2325 }
2326 if (textce == ce[ceindex]) {
2327 ceindex ++;
2328 }
2329 }
2330 // set offset here
2331 if (isSafe) {
2332 int32_t result = ucol_getOffset(coleiter);
2333 // sets the text iterator here with the correct expansion and offset
2334 int32_t leftoverces = getExpansionSuffix(coleiter);
2335 cleanUpSafeText(strsrch, safetext, safebuffer);
729e4ab9 2336 if (result <= prefixlength) {
b75a7d8f
A
2337 result = textoffset;
2338 }
2339 else {
2340 result = textoffset + (safeoffset - result);
2341 }
2342 setColEIterOffset(strsrch->textIter, result);
2343 setExpansionSuffix(strsrch->textIter, leftoverces);
2344 return result;
2345 }
729e4ab9
A
2346
2347 return ucol_getOffset(coleiter);
b75a7d8f
A
2348}
2349
2350/**
2351* Trying out the substring and sees if it can be a canonical match.
729e4ab9 2352* This will try normalizing the starting accents and arranging them into
b75a7d8f 2353* canonical equivalents and check their corresponding ces with the pattern ce.
729e4ab9
A
2354* Prefix accents in the text will be grouped according to their combining
2355* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
2356* match with the pattern.
2357* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2358* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 2359* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
2360* "\u0301\u0325".
2361* step 2: check if any of the generated substrings matches the pattern.
2362* Internal method, status assumed to be success, caller has to check status
2363* before calling this method.
2364* @param strsrch string search data
729e4ab9 2365* @param textoffset start offset in the collation element text that starts
b75a7d8f
A
2366* with the accents to be rearranged
2367* @param status output error status if any
2368* @return TRUE if the match is valid, FALSE otherwise
2369*/
2370static
729e4ab9
A
2371UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2372 int32_t textoffset,
b75a7d8f
A
2373 UErrorCode *status)
2374{
2375 const UChar *text = strsrch->search->text;
2376 int32_t temp = textoffset;
2377 int32_t textlength = strsrch->search->textLength;
2378 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2379 UCollationElements *coleiter = strsrch->textIter;
2380 int32_t offset = ucol_getOffset(coleiter);
2381 if (strsrch->pattern.hasSuffixAccents) {
729e4ab9 2382 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
b75a7d8f
A
2383 offset, status);
2384 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2385 setColEIterOffset(coleiter, offset);
2386 return TRUE;
2387 }
2388 }
2389 return FALSE;
2390 }
2391
2392 if (!strsrch->pattern.hasPrefixAccents) {
2393 return FALSE;
2394 }
2395
2396 UChar accents[INITIAL_ARRAY_SIZE_];
2397 // offset to the last base character in substring to search
2398 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2399 // normalizing the offensive string
729e4ab9
A
2400 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2401 0, accents, INITIAL_ARRAY_SIZE_, status);
b75a7d8f 2402 // status checked in loop
729e4ab9 2403
b75a7d8f
A
2404 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2405 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2406
374ca955 2407 // 2 power n - 1 plus the full set of accents
729e4ab9 2408 int32_t count = (2 << (size - 1)) - 1;
b75a7d8f
A
2409 while (U_SUCCESS(*status) && count > 0) {
2410 UChar *rearrange = strsrch->canonicalPrefixAccents;
2411 // copy the base characters
2412 for (int k = 0; k < accentsindex[0]; k ++) {
2413 *rearrange ++ = accents[k];
2414 }
2415 // forming all possible canonical rearrangement by dropping
2416 // sets of accents
2417 for (int i = 0; i <= size - 1; i ++) {
2418 int32_t mask = 1 << (size - i - 1);
2419 if (count & mask) {
2420 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2421 *rearrange ++ = accents[j];
2422 }
2423 }
2424 }
2425 *rearrange = 0;
729e4ab9 2426 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
b75a7d8f
A
2427 baseoffset, status);
2428 if (offset != USEARCH_DONE) {
2429 return TRUE; // match found
2430 }
2431 count --;
2432 }
2433 return FALSE;
2434}
2435
2436/**
729e4ab9 2437* Checks match for contraction.
b75a7d8f
A
2438* If the match starts with a partial contraction we fail.
2439* Internal method, status assumed to be success, caller has to check status
2440* before calling this method.
2441* @param strsrch string search data
2442* @param start offset of potential match, to be modified if necessary
2443* @param end offset of potential match, to be modified if necessary
2444* @param status only error status if any
2445* @return TRUE if match passes the contraction test, FALSE otherwise
2446*/
2447static
729e4ab9
A
2448UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2449 int32_t *start,
2450 int32_t *end, UErrorCode *status)
b75a7d8f
A
2451{
2452 UCollationElements *coleiter = strsrch->textIter;
2453 int32_t textlength = strsrch->search->textLength;
2454 int32_t temp = *end;
2455 const UCollator *collator = strsrch->collator;
2456 const UChar *text = strsrch->search->text;
729e4ab9 2457 // This part checks if either if the start of the match contains potential
b75a7d8f 2458 // contraction. If so we'll have to iterate through them
729e4ab9 2459 // Since we used ucol_next while previously looking for the potential
374ca955
A
2460 // match, this guarantees that our end will not be a partial contraction,
2461 // or a partial supplementary character.
b75a7d8f
A
2462 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2463 int32_t expansion = getExpansionSuffix(coleiter);
2464 UBool expandflag = expansion > 0;
2465 setColEIterOffset(coleiter, *end);
2466 while (expansion > 0) {
2467 // getting rid of the redundant ce
2468 // since forward contraction/expansion may have extra ces
2469 // if we are in the normalization buffer, hasAccentsBeforeMatch
2470 // would have taken care of it.
2471 // E.g. the character \u01FA will have an expansion of 3, but if
729e4ab9 2472 // we are only looking for A ring A\u030A, we'll have to skip the
b75a7d8f
A
2473 // last ce in the expansion buffer
2474 ucol_previous(coleiter, status);
374ca955
A
2475 if (U_FAILURE(*status)) {
2476 return FALSE;
2477 }
b75a7d8f
A
2478 if (ucol_getOffset(coleiter) != temp) {
2479 *end = temp;
2480 temp = ucol_getOffset(coleiter);
2481 }
2482 expansion --;
2483 }
2484
b331163b
A
2485 int32_t *patternce = strsrch->pattern.ces;
2486 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
2487 int32_t count = patterncelength;
2488 while (count > 0) {
374ca955 2489 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
b75a7d8f
A
2490 // status checked below, note that if status is a failure
2491 // ucol_previous returns UCOL_NULLORDER
2492 if (ce == UCOL_IGNORABLE) {
2493 continue;
2494 }
729e4ab9 2495 if (expandflag && count == 0 &&
b75a7d8f
A
2496 getColElemIterOffset(coleiter, FALSE) != temp) {
2497 *end = temp;
2498 temp = ucol_getOffset(coleiter);
2499 }
729e4ab9 2500 if (count == patterncelength &&
b75a7d8f 2501 ce != patternce[patterncelength - 1]) {
729e4ab9 2502 // accents may have extra starting ces, this occurs when a
b75a7d8f 2503 // pure accent pattern is matched without rearrangement
374ca955 2504 int32_t expected = patternce[patterncelength - 1];
4388f060 2505 U16_BACK_1(text, 0, *end);
b75a7d8f
A
2506 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2507 ce = getCE(strsrch, ucol_previous(coleiter, status));
729e4ab9 2508 while (U_SUCCESS(*status) && ce != expected &&
b75a7d8f
A
2509 ce != UCOL_NULLORDER &&
2510 ucol_getOffset(coleiter) <= *start) {
2511 ce = getCE(strsrch, ucol_previous(coleiter, status));
2512 }
2513 }
2514 }
2515 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2516 (*start) --;
2517 *start = getPreviousBaseOffset(text, *start);
2518 return FALSE;
2519 }
2520 count --;
2521 }
729e4ab9 2522 }
b75a7d8f
A
2523 return TRUE;
2524}
2525
2526/**
2527* Checks and sets the match information if found.
729e4ab9 2528* Checks
b75a7d8f
A
2529* <ul>
2530* <li> the potential match does not repeat the previous match
2531* <li> boundaries are correct
2532* <li> potential match does not end in the middle of a contraction
2533* <li> identical matches
2534* <\ul>
2535* Otherwise the offset will be shifted to the next character.
2536* Internal method, status assumed to be success, caller has to check status
2537* before calling this method.
2538* @param strsrch string search data
2539* @param textoffset offset in the collation element text. the returned value
729e4ab9 2540* will be the truncated start offset of the match or the new start
b75a7d8f
A
2541* search offset.
2542* @param status only error status if any
2543* @return TRUE if the match is valid, FALSE otherwise
2544*/
2545static
729e4ab9
A
2546inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2547 int32_t *textoffset,
b75a7d8f
A
2548 UErrorCode *status)
2549{
2550 // to ensure that the start and ends are not composite characters
2551 UCollationElements *coleiter = strsrch->textIter;
2552 // if we have a canonical accent match
729e4ab9
A
2553 if ((strsrch->pattern.hasSuffixAccents &&
2554 strsrch->canonicalSuffixAccents[0]) ||
2555 (strsrch->pattern.hasPrefixAccents &&
b75a7d8f
A
2556 strsrch->canonicalPrefixAccents[0])) {
2557 strsrch->search->matchedIndex = *textoffset;
729e4ab9
A
2558 strsrch->search->matchedLength =
2559 getNextUStringSearchBaseOffset(strsrch,
b75a7d8f
A
2560 getColElemIterOffset(coleiter, FALSE))
2561 - *textoffset;
2562 return TRUE;
2563 }
2564
2565 int32_t end = ucol_getOffset(coleiter);
2566 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
729e4ab9 2567 status) ||
b75a7d8f
A
2568 U_FAILURE(*status)) {
2569 return FALSE;
2570 }
2571
2572 end = getNextUStringSearchBaseOffset(strsrch, end);
2573 // this totally matches, however we need to check if it is repeating
729e4ab9
A
2574 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2575 !isBreakUnit(strsrch, *textoffset, end) ||
b75a7d8f
A
2576 !checkIdentical(strsrch, *textoffset, end)) {
2577 (*textoffset) --;
729e4ab9 2578 *textoffset = getPreviousBaseOffset(strsrch->search->text,
b75a7d8f
A
2579 *textoffset);
2580 return FALSE;
2581 }
729e4ab9 2582
b75a7d8f
A
2583 strsrch->search->matchedIndex = *textoffset;
2584 strsrch->search->matchedLength = end - *textoffset;
2585 return TRUE;
2586}
46f4442e 2587#endif // #if BOYER_MOORE
b75a7d8f
A
2588
2589// constructors and destructor -------------------------------------------
2590
729e4ab9
A
2591U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2592 int32_t patternlength,
2593 const UChar *text,
b75a7d8f
A
2594 int32_t textlength,
2595 const char *locale,
2596 UBreakIterator *breakiter,
729e4ab9 2597 UErrorCode *status)
b75a7d8f
A
2598{
2599 if (U_FAILURE(*status)) {
2600 return NULL;
2601 }
2602#if UCONFIG_NO_BREAK_ITERATION
2603 if (breakiter != NULL) {
2604 *status = U_UNSUPPORTED_ERROR;
2605 return NULL;
2606 }
2607#endif
2608 if (locale) {
2609 // ucol_open internally checks for status
2610 UCollator *collator = ucol_open(locale, status);
2611 // pattern, text checks are done in usearch_openFromCollator
729e4ab9
A
2612 UStringSearch *result = usearch_openFromCollator(pattern,
2613 patternlength, text, textlength,
b75a7d8f
A
2614 collator, breakiter, status);
2615
2616 if (result == NULL || U_FAILURE(*status)) {
2617 if (collator) {
2618 ucol_close(collator);
2619 }
2620 return NULL;
2621 }
2622 else {
2623 result->ownCollator = TRUE;
2624 }
2625 return result;
2626 }
2627 *status = U_ILLEGAL_ARGUMENT_ERROR;
2628 return NULL;
2629}
2630
2631U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
729e4ab9 2632 const UChar *pattern,
b75a7d8f 2633 int32_t patternlength,
729e4ab9 2634 const UChar *text,
b75a7d8f
A
2635 int32_t textlength,
2636 const UCollator *collator,
2637 UBreakIterator *breakiter,
729e4ab9 2638 UErrorCode *status)
b75a7d8f
A
2639{
2640 if (U_FAILURE(*status)) {
2641 return NULL;
2642 }
2643#if UCONFIG_NO_BREAK_ITERATION
2644 if (breakiter != NULL) {
2645 *status = U_UNSUPPORTED_ERROR;
2646 return NULL;
2647 }
2648#endif
2649 if (pattern == NULL || text == NULL || collator == NULL) {
2650 *status = U_ILLEGAL_ARGUMENT_ERROR;
73c04bcf 2651 return NULL;
b75a7d8f
A
2652 }
2653
374ca955
A
2654 // string search does not really work when numeric collation is turned on
2655 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2656 *status = U_UNSUPPORTED_ERROR;
73c04bcf 2657 return NULL;
374ca955
A
2658 }
2659
b75a7d8f
A
2660 if (U_SUCCESS(*status)) {
2661 initializeFCD(status);
2662 if (U_FAILURE(*status)) {
2663 return NULL;
2664 }
2665
2666 UStringSearch *result;
2667 if (textlength == -1) {
2668 textlength = u_strlen(text);
2669 }
2670 if (patternlength == -1) {
2671 patternlength = u_strlen(pattern);
2672 }
2673 if (textlength <= 0 || patternlength <= 0) {
2674 *status = U_ILLEGAL_ARGUMENT_ERROR;
2675 return NULL;
2676 }
729e4ab9 2677
b75a7d8f
A
2678 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2679 if (result == NULL) {
2680 *status = U_MEMORY_ALLOCATION_ERROR;
2681 return NULL;
2682 }
2683
2684 result->collator = collator;
2685 result->strength = ucol_getStrength(collator);
2686 result->ceMask = getMask(result->strength);
729e4ab9
A
2687 result->toShift =
2688 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
b75a7d8f
A
2689 UCOL_SHIFTED;
2690 result->variableTop = ucol_getVariableTop(collator, status);
2691
b331163b 2692 result->nfd = Normalizer2::getNFDInstance(*status);
729e4ab9 2693
b75a7d8f
A
2694 if (U_FAILURE(*status)) {
2695 uprv_free(result);
2696 return NULL;
2697 }
2698
2699 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2700 if (result->search == NULL) {
2701 *status = U_MEMORY_ALLOCATION_ERROR;
2702 uprv_free(result);
2703 return NULL;
2704 }
2705
2706 result->search->text = text;
2707 result->search->textLength = textlength;
2708
2709 result->pattern.text = pattern;
2710 result->pattern.textLength = patternlength;
b331163b
A
2711 result->pattern.ces = NULL;
2712 result->pattern.pces = NULL;
729e4ab9 2713
b75a7d8f
A
2714 result->search->breakIter = breakiter;
2715#if !UCONFIG_NO_BREAK_ITERATION
729e4ab9 2716 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
b75a7d8f 2717 if (breakiter) {
4388f060 2718 ubrk_setText(breakiter, text, textlength, status);
b75a7d8f
A
2719 }
2720#endif
2721
2722 result->ownCollator = FALSE;
2723 result->search->matchedLength = 0;
2724 result->search->matchedIndex = USEARCH_DONE;
46f4442e 2725 result->utilIter = NULL;
729e4ab9 2726 result->textIter = ucol_openElements(collator, text,
b75a7d8f 2727 textlength, status);
57a6839d 2728 result->textProcessedIter = NULL;
b75a7d8f
A
2729 if (U_FAILURE(*status)) {
2730 usearch_close(result);
2731 return NULL;
2732 }
2733
b75a7d8f
A
2734 result->search->isOverlap = FALSE;
2735 result->search->isCanonicalMatch = FALSE;
729e4ab9 2736 result->search->elementComparisonType = 0;
b75a7d8f
A
2737 result->search->isForwardSearching = TRUE;
2738 result->search->reset = TRUE;
729e4ab9 2739
b75a7d8f
A
2740 initialize(result, status);
2741
2742 if (U_FAILURE(*status)) {
2743 usearch_close(result);
2744 return NULL;
2745 }
2746
2747 return result;
2748 }
2749 return NULL;
2750}
2751
2752U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2753{
2754 if (strsrch) {
b331163b
A
2755 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2756 strsrch->pattern.ces) {
2757 uprv_free(strsrch->pattern.ces);
b75a7d8f 2758 }
46f4442e 2759
b331163b
A
2760 if (strsrch->pattern.pces != NULL &&
2761 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2762 uprv_free(strsrch->pattern.pces);
46f4442e
A
2763 }
2764
57a6839d 2765 delete strsrch->textProcessedIter;
b75a7d8f
A
2766 ucol_closeElements(strsrch->textIter);
2767 ucol_closeElements(strsrch->utilIter);
46f4442e 2768
b75a7d8f
A
2769 if (strsrch->ownCollator && strsrch->collator) {
2770 ucol_close((UCollator *)strsrch->collator);
2771 }
46f4442e
A
2772
2773#if !UCONFIG_NO_BREAK_ITERATION
2774 if (strsrch->search->internalBreakIter) {
4388f060 2775 ubrk_close(strsrch->search->internalBreakIter);
46f4442e
A
2776 }
2777#endif
2778
b75a7d8f
A
2779 uprv_free(strsrch->search);
2780 uprv_free(strsrch);
2781 }
2782}
2783
57a6839d
A
2784namespace {
2785
2786UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2787 if (U_FAILURE(*status)) { return FALSE; }
2788 if (strsrch->textProcessedIter == NULL) {
2789 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2790 if (strsrch->textProcessedIter == NULL) {
2791 *status = U_MEMORY_ALLOCATION_ERROR;
2792 return FALSE;
2793 }
2794 } else {
2795 strsrch->textProcessedIter->init(strsrch->textIter);
2796 }
2797 return TRUE;
2798}
2799
2800}
2801
b75a7d8f
A
2802// set and get methods --------------------------------------------------
2803
729e4ab9 2804U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
b75a7d8f
A
2805 int32_t position,
2806 UErrorCode *status)
2807{
2808 if (U_SUCCESS(*status) && strsrch) {
2809 if (isOutOfBounds(strsrch->search->textLength, position)) {
2810 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2811 }
2812 else {
2813 setColEIterOffset(strsrch->textIter, position);
2814 }
2815 strsrch->search->matchedIndex = USEARCH_DONE;
2816 strsrch->search->matchedLength = 0;
729e4ab9 2817 strsrch->search->reset = FALSE;
b75a7d8f
A
2818 }
2819}
2820
2821U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2822{
2823 if (strsrch) {
2824 int32_t result = ucol_getOffset(strsrch->textIter);
2825 if (isOutOfBounds(strsrch->search->textLength, result)) {
2826 return USEARCH_DONE;
2827 }
2828 return result;
2829 }
2830 return USEARCH_DONE;
2831}
729e4ab9
A
2832
2833U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
b75a7d8f
A
2834 USearchAttribute attribute,
2835 USearchAttributeValue value,
2836 UErrorCode *status)
2837{
2838 if (U_SUCCESS(*status) && strsrch) {
2839 switch (attribute)
2840 {
2841 case USEARCH_OVERLAP :
2842 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2843 break;
2844 case USEARCH_CANONICAL_MATCH :
729e4ab9 2845 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
b75a7d8f
A
2846 FALSE);
2847 break;
729e4ab9
A
2848 case USEARCH_ELEMENT_COMPARISON :
2849 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2850 strsrch->search->elementComparisonType = (int16_t)value;
2851 } else {
2852 strsrch->search->elementComparisonType = 0;
2853 }
2854 break;
b75a7d8f
A
2855 case USEARCH_ATTRIBUTE_COUNT :
2856 default:
2857 *status = U_ILLEGAL_ARGUMENT_ERROR;
2858 }
2859 }
2860 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2861 *status = U_ILLEGAL_ARGUMENT_ERROR;
2862 }
2863}
729e4ab9 2864
b75a7d8f
A
2865U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2866 const UStringSearch *strsrch,
2867 USearchAttribute attribute)
2868{
2869 if (strsrch) {
2870 switch (attribute) {
2871 case USEARCH_OVERLAP :
729e4ab9 2872 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
b75a7d8f
A
2873 USEARCH_OFF);
2874 case USEARCH_CANONICAL_MATCH :
729e4ab9 2875 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
b75a7d8f 2876 USEARCH_OFF);
729e4ab9
A
2877 case USEARCH_ELEMENT_COMPARISON :
2878 {
2879 int16_t value = strsrch->search->elementComparisonType;
2880 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2881 return (USearchAttributeValue)value;
2882 } else {
2883 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2884 }
2885 }
b75a7d8f
A
2886 case USEARCH_ATTRIBUTE_COUNT :
2887 return USEARCH_DEFAULT;
2888 }
2889 }
2890 return USEARCH_DEFAULT;
2891}
2892
2893U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2894 const UStringSearch *strsrch)
2895{
2896 if (strsrch == NULL) {
2897 return USEARCH_DONE;
2898 }
2899 return strsrch->search->matchedIndex;
2900}
2901
2902
729e4ab9
A
2903U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2904 UChar *result,
2905 int32_t resultCapacity,
b75a7d8f
A
2906 UErrorCode *status)
2907{
2908 if (U_FAILURE(*status)) {
2909 return USEARCH_DONE;
2910 }
729e4ab9 2911 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
b75a7d8f
A
2912 result == NULL)) {
2913 *status = U_ILLEGAL_ARGUMENT_ERROR;
2914 return USEARCH_DONE;
2915 }
2916
2917 int32_t copylength = strsrch->search->matchedLength;
2918 int32_t copyindex = strsrch->search->matchedIndex;
2919 if (copyindex == USEARCH_DONE) {
2920 u_terminateUChars(result, resultCapacity, 0, status);
2921 return USEARCH_DONE;
2922 }
2923
2924 if (resultCapacity < copylength) {
2925 copylength = resultCapacity;
2926 }
2927 if (copylength > 0) {
729e4ab9 2928 uprv_memcpy(result, strsrch->search->text + copyindex,
b75a7d8f
A
2929 copylength * sizeof(UChar));
2930 }
729e4ab9 2931 return u_terminateUChars(result, resultCapacity,
b75a7d8f
A
2932 strsrch->search->matchedLength, status);
2933}
729e4ab9 2934
b75a7d8f
A
2935U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2936 const UStringSearch *strsrch)
2937{
2938 if (strsrch) {
2939 return strsrch->search->matchedLength;
2940 }
2941 return USEARCH_DONE;
2942}
2943
2944#if !UCONFIG_NO_BREAK_ITERATION
2945
729e4ab9 2946U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
b75a7d8f
A
2947 UBreakIterator *breakiter,
2948 UErrorCode *status)
2949{
2950 if (U_SUCCESS(*status) && strsrch) {
4388f060 2951 strsrch->search->breakIter = breakiter;
b75a7d8f 2952 if (breakiter) {
729e4ab9 2953 ubrk_setText(breakiter, strsrch->search->text,
b75a7d8f
A
2954 strsrch->search->textLength, status);
2955 }
2956 }
2957}
2958
729e4ab9 2959U_CAPI const UBreakIterator* U_EXPORT2
b75a7d8f
A
2960usearch_getBreakIterator(const UStringSearch *strsrch)
2961{
2962 if (strsrch) {
2963 return strsrch->search->breakIter;
2964 }
2965 return NULL;
2966}
729e4ab9 2967
b75a7d8f 2968#endif
729e4ab9
A
2969
2970U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
b75a7d8f
A
2971 const UChar *text,
2972 int32_t textlength,
2973 UErrorCode *status)
2974{
2975 if (U_SUCCESS(*status)) {
729e4ab9 2976 if (strsrch == NULL || text == NULL || textlength < -1 ||
b75a7d8f
A
2977 textlength == 0) {
2978 *status = U_ILLEGAL_ARGUMENT_ERROR;
2979 }
2980 else {
2981 if (textlength == -1) {
2982 textlength = u_strlen(text);
2983 }
2984 strsrch->search->text = text;
2985 strsrch->search->textLength = textlength;
2986 ucol_setText(strsrch->textIter, text, textlength, status);
2987 strsrch->search->matchedIndex = USEARCH_DONE;
2988 strsrch->search->matchedLength = 0;
2989 strsrch->search->reset = TRUE;
2990#if !UCONFIG_NO_BREAK_ITERATION
374ca955 2991 if (strsrch->search->breakIter != NULL) {
729e4ab9 2992 ubrk_setText(strsrch->search->breakIter, text,
374ca955
A
2993 textlength, status);
2994 }
46f4442e 2995 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
b75a7d8f
A
2996#endif
2997 }
2998 }
2999}
3000
729e4ab9 3001U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
b75a7d8f
A
3002 int32_t *length)
3003{
3004 if (strsrch) {
3005 *length = strsrch->search->textLength;
3006 return strsrch->search->text;
3007 }
3008 return NULL;
3009}
3010
729e4ab9 3011U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
b75a7d8f
A
3012 const UCollator *collator,
3013 UErrorCode *status)
3014{
3015 if (U_SUCCESS(*status)) {
3016 if (collator == NULL) {
3017 *status = U_ILLEGAL_ARGUMENT_ERROR;
3018 return;
3019 }
46f4442e 3020
b75a7d8f 3021 if (strsrch) {
57a6839d
A
3022 delete strsrch->textProcessedIter;
3023 strsrch->textProcessedIter = NULL;
3024 ucol_closeElements(strsrch->textIter);
3025 ucol_closeElements(strsrch->utilIter);
3026 strsrch->textIter = strsrch->utilIter = NULL;
b75a7d8f
A
3027 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3028 ucol_close((UCollator *)strsrch->collator);
3029 strsrch->ownCollator = FALSE;
3030 }
3031 strsrch->collator = collator;
3032 strsrch->strength = ucol_getStrength(collator);
3033 strsrch->ceMask = getMask(strsrch->strength);
46f4442e 3034#if !UCONFIG_NO_BREAK_ITERATION
4388f060
A
3035 ubrk_close(strsrch->search->internalBreakIter);
3036 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3037 strsrch->search->text, strsrch->search->textLength, status);
46f4442e 3038#endif
b75a7d8f 3039 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
729e4ab9
A
3040 strsrch->toShift =
3041 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
b75a7d8f
A
3042 UCOL_SHIFTED;
3043 // if status is a failure, ucol_getVariableTop returns 0
3044 strsrch->variableTop = ucol_getVariableTop(collator, status);
57a6839d
A
3045 strsrch->textIter = ucol_openElements(collator,
3046 strsrch->search->text,
3047 strsrch->search->textLength,
3048 status);
3049 strsrch->utilIter = ucol_openElements(
3050 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3051 // initialize() _after_ setting the iterators for the new collator.
3052 initialize(strsrch, status);
b75a7d8f 3053 }
46f4442e
A
3054
3055 // **** are these calls needed?
3056 // **** we call uprv_init_pce in initializePatternPCETable
b331163b 3057 // **** and the CEIBuffer constructor...
46f4442e
A
3058#if 0
3059 uprv_init_pce(strsrch->textIter);
3060 uprv_init_pce(strsrch->utilIter);
3061#endif
b75a7d8f
A
3062 }
3063}
3064
3065U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3066{
3067 if (strsrch) {
3068 return (UCollator *)strsrch->collator;
3069 }
3070 return NULL;
3071}
3072
729e4ab9 3073U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
b75a7d8f
A
3074 const UChar *pattern,
3075 int32_t patternlength,
3076 UErrorCode *status)
3077{
3078 if (U_SUCCESS(*status)) {
3079 if (strsrch == NULL || pattern == NULL) {
3080 *status = U_ILLEGAL_ARGUMENT_ERROR;
3081 }
3082 else {
3083 if (patternlength == -1) {
3084 patternlength = u_strlen(pattern);
3085 }
3086 if (patternlength == 0) {
3087 *status = U_ILLEGAL_ARGUMENT_ERROR;
3088 return;
3089 }
3090 strsrch->pattern.text = pattern;
3091 strsrch->pattern.textLength = patternlength;
3092 initialize(strsrch, status);
3093 }
3094 }
3095}
3096
729e4ab9
A
3097U_CAPI const UChar* U_EXPORT2
3098usearch_getPattern(const UStringSearch *strsrch,
b75a7d8f
A
3099 int32_t *length)
3100{
3101 if (strsrch) {
3102 *length = strsrch->pattern.textLength;
3103 return strsrch->pattern.text;
3104 }
3105 return NULL;
3106}
3107
3108// miscellanous methods --------------------------------------------------
3109
729e4ab9
A
3110U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3111 UErrorCode *status)
b75a7d8f
A
3112{
3113 if (strsrch && U_SUCCESS(*status)) {
3114 strsrch->search->isForwardSearching = TRUE;
3115 usearch_setOffset(strsrch, 0, status);
3116 if (U_SUCCESS(*status)) {
3117 return usearch_next(strsrch, status);
3118 }
3119 }
3120 return USEARCH_DONE;
3121}
3122
729e4ab9 3123U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
b75a7d8f
A
3124 int32_t position,
3125 UErrorCode *status)
3126{
3127 if (strsrch && U_SUCCESS(*status)) {
3128 strsrch->search->isForwardSearching = TRUE;
3129 // position checked in usearch_setOffset
3130 usearch_setOffset(strsrch, position, status);
3131 if (U_SUCCESS(*status)) {
729e4ab9 3132 return usearch_next(strsrch, status);
b75a7d8f
A
3133 }
3134 }
3135 return USEARCH_DONE;
3136}
729e4ab9
A
3137
3138U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
b75a7d8f
A
3139 UErrorCode *status)
3140{
3141 if (strsrch && U_SUCCESS(*status)) {
3142 strsrch->search->isForwardSearching = FALSE;
3143 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3144 if (U_SUCCESS(*status)) {
3145 return usearch_previous(strsrch, status);
3146 }
3147 }
3148 return USEARCH_DONE;
3149}
3150
729e4ab9 3151U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
b75a7d8f
A
3152 int32_t position,
3153 UErrorCode *status)
3154{
3155 if (strsrch && U_SUCCESS(*status)) {
3156 strsrch->search->isForwardSearching = FALSE;
3157 // position checked in usearch_setOffset
3158 usearch_setOffset(strsrch, position, status);
3159 if (U_SUCCESS(*status)) {
729e4ab9 3160 return usearch_previous(strsrch, status);
b75a7d8f
A
3161 }
3162 }
3163 return USEARCH_DONE;
3164}
729e4ab9 3165
b75a7d8f 3166/**
729e4ab9
A
3167* If a direction switch is required, we'll count the number of ces till the
3168* beginning of the collation element iterator and iterate forwards that
3169* number of times. This is so that we get to the correct point within the
b75a7d8f
A
3170* string to continue the search in. Imagine when we are in the middle of the
3171* normalization buffer when the change in direction is request. arrrgghh....
3172* After searching the offset within the collation element iterator will be
3173* shifted to the start of the match. If a match is not found, the offset would
729e4ab9 3174* have been set to the end of the text string in the collation element
b75a7d8f
A
3175* iterator.
3176* Okay, here's my take on normalization buffer. The only time when there can
3177* be 2 matches within the same normalization is when the pattern is consists
3178* of all accents. But since the offset returned is from the text string, we
729e4ab9 3179* should not confuse the caller by returning the second match within the
b75a7d8f
A
3180* same normalization buffer. If we do, the 2 results will have the same match
3181* offsets, and that'll be confusing. I'll return the next match that doesn't
729e4ab9 3182* fall within the same normalization buffer. Note this does not affect the
b75a7d8f
A
3183* results of matches spanning the text and the normalization buffer.
3184* The position to start searching is taken from the collation element
3185* iterator. Callers of this API would have to set the offset in the collation
3186* element iterator before using this method.
3187*/
3188U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3189 UErrorCode *status)
729e4ab9 3190{
b75a7d8f 3191 if (U_SUCCESS(*status) && strsrch) {
374ca955
A
3192 // note offset is either equivalent to the start of the previous match
3193 // or is set by the user
3194 int32_t offset = usearch_getOffset(strsrch);
3195 USearch *search = strsrch->search;
3196 search->reset = FALSE;
3197 int32_t textlength = search->textLength;
b75a7d8f 3198 if (search->isForwardSearching) {
46f4442e 3199#if BOYER_MOORE
374ca955 3200 if (offset == textlength
729e4ab9 3201 || (!search->isOverlap &&
b75a7d8f 3202 (offset + strsrch->pattern.defaultShiftSize > textlength ||
729e4ab9 3203 (search->matchedIndex != USEARCH_DONE &&
374ca955 3204 offset + search->matchedLength >= textlength)))) {
b75a7d8f
A
3205 // not enough characters to match
3206 setMatchNotFound(strsrch);
729e4ab9 3207 return USEARCH_DONE;
b75a7d8f 3208 }
46f4442e
A
3209#else
3210 if (offset == textlength ||
3211 (! search->isOverlap &&
3212 (search->matchedIndex != USEARCH_DONE &&
3213 offset + search->matchedLength > textlength))) {
3214 // not enough characters to match
3215 setMatchNotFound(strsrch);
3216 return USEARCH_DONE;
3217 }
3218#endif
b75a7d8f
A
3219 }
3220 else {
729e4ab9
A
3221 // switching direction.
3222 // if matchedIndex == USEARCH_DONE, it means that either a
b75a7d8f 3223 // setOffset has been called or that previous ran off the text
729e4ab9 3224 // string. the iterator would have been set to offset 0 if a
b75a7d8f
A
3225 // match is not found.
3226 search->isForwardSearching = TRUE;
374ca955 3227 if (search->matchedIndex != USEARCH_DONE) {
b75a7d8f
A
3228 // there's no need to set the collation element iterator
3229 // the next call to next will set the offset.
374ca955 3230 return search->matchedIndex;
b75a7d8f
A
3231 }
3232 }
3233
3234 if (U_SUCCESS(*status)) {
b331163b 3235 if (strsrch->pattern.cesLength == 0) {
374ca955 3236 if (search->matchedIndex == USEARCH_DONE) {
b75a7d8f
A
3237 search->matchedIndex = offset;
3238 }
3239 else { // moves by codepoints
4388f060 3240 U16_FWD_1(search->text, search->matchedIndex, textlength);
b75a7d8f 3241 }
729e4ab9 3242
b75a7d8f
A
3243 search->matchedLength = 0;
3244 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3245 // status checked below
3246 if (search->matchedIndex == textlength) {
3247 search->matchedIndex = USEARCH_DONE;
3248 }
3249 }
3250 else {
374ca955
A
3251 if (search->matchedLength > 0) {
3252 // if matchlength is 0 we are at the start of the iteration
3253 if (search->isOverlap) {
3254 ucol_setOffset(strsrch->textIter, offset + 1, status);
3255 }
3256 else {
729e4ab9 3257 ucol_setOffset(strsrch->textIter,
374ca955
A
3258 offset + search->matchedLength, status);
3259 }
3260 }
3261 else {
3262 // for boundary check purposes. this will ensure that the
3263 // next match will not preceed the current offset
3264 // note search->matchedIndex will always be set to something
3265 // in the code
3266 search->matchedIndex = offset - 1;
3267 }
3268
3269 if (search->isCanonicalMatch) {
3270 // can't use exact here since extra accents are allowed.
3271 usearch_handleNextCanonical(strsrch, status);
3272 }
3273 else {
3274 usearch_handleNextExact(strsrch, status);
3275 }
3276 }
3277
b75a7d8f
A
3278 if (U_FAILURE(*status)) {
3279 return USEARCH_DONE;
3280 }
374ca955 3281
46f4442e
A
3282#if !BOYER_MOORE
3283 if (search->matchedIndex == USEARCH_DONE) {
3284 ucol_setOffset(strsrch->textIter, search->textLength, status);
3285 } else {
3286 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3287 }
3288#endif
3289
b75a7d8f
A
3290 return search->matchedIndex;
3291 }
3292 }
3293 return USEARCH_DONE;
3294}
3295
3296U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3297 UErrorCode *status)
3298{
3299 if (U_SUCCESS(*status) && strsrch) {
3300 int32_t offset;
3301 USearch *search = strsrch->search;
3302 if (search->reset) {
3303 offset = search->textLength;
3304 search->isForwardSearching = FALSE;
3305 search->reset = FALSE;
3306 setColEIterOffset(strsrch->textIter, offset);
3307 }
3308 else {
3309 offset = usearch_getOffset(strsrch);
3310 }
729e4ab9 3311
b75a7d8f
A
3312 int32_t matchedindex = search->matchedIndex;
3313 if (search->isForwardSearching == TRUE) {
729e4ab9
A
3314 // switching direction.
3315 // if matchedIndex == USEARCH_DONE, it means that either a
b75a7d8f 3316 // setOffset has been called or that next ran off the text
729e4ab9 3317 // string. the iterator would have been set to offset textLength if
b75a7d8f
A
3318 // a match is not found.
3319 search->isForwardSearching = FALSE;
3320 if (matchedindex != USEARCH_DONE) {
3321 return matchedindex;
3322 }
3323 }
3324 else {
46f4442e 3325#if BOYER_MOORE
b75a7d8f 3326 if (offset == 0 || matchedindex == 0 ||
729e4ab9 3327 (!search->isOverlap &&
b75a7d8f 3328 (offset < strsrch->pattern.defaultShiftSize ||
729e4ab9 3329 (matchedindex != USEARCH_DONE &&
b75a7d8f
A
3330 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3331 // not enough characters to match
3332 setMatchNotFound(strsrch);
729e4ab9 3333 return USEARCH_DONE;
b75a7d8f 3334 }
46f4442e
A
3335#else
3336 // Could check pattern length, but the
3337 // linear search will do the right thing
3338 if (offset == 0 || matchedindex == 0) {
3339 setMatchNotFound(strsrch);
3340 return USEARCH_DONE;
3341 }
3342#endif
b75a7d8f
A
3343 }
3344
3345 if (U_SUCCESS(*status)) {
b331163b 3346 if (strsrch->pattern.cesLength == 0) {
729e4ab9 3347 search->matchedIndex =
b75a7d8f
A
3348 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3349 if (search->matchedIndex == 0) {
3350 setMatchNotFound(strsrch);
3351 // status checked below
3352 }
3353 else { // move by codepoints
4388f060 3354 U16_BACK_1(search->text, 0, search->matchedIndex);
b75a7d8f
A
3355 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3356 // status checked below
3357 search->matchedLength = 0;
3358 }
3359 }
3360 else {
3361 if (strsrch->search->isCanonicalMatch) {
3362 // can't use exact here since extra accents are allowed.
3363 usearch_handlePreviousCanonical(strsrch, status);
3364 // status checked below
3365 }
3366 else {
3367 usearch_handlePreviousExact(strsrch, status);
3368 // status checked below
3369 }
3370 }
3371
3372 if (U_FAILURE(*status)) {
3373 return USEARCH_DONE;
3374 }
729e4ab9 3375
b75a7d8f
A
3376 return search->matchedIndex;
3377 }
3378 }
3379 return USEARCH_DONE;
3380}
3381
3382
729e4ab9 3383
b75a7d8f
A
3384U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3385{
729e4ab9
A
3386 /*
3387 reset is setting the attributes that are already in
b75a7d8f
A
3388 string search, hence all attributes in the collator should
3389 be retrieved without any problems
3390 */
3391 if (strsrch) {
3392 UErrorCode status = U_ZERO_ERROR;
3393 UBool sameCollAttribute = TRUE;
3394 uint32_t ceMask;
3395 UBool shift;
3396 uint32_t varTop;
3397
729e4ab9
A
3398 // **** hack to deal w/ how processed CEs encode quaternary ****
3399 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3400 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3401 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3402 sameCollAttribute = FALSE;
3403 }
3404
b75a7d8f
A
3405 strsrch->strength = ucol_getStrength(strsrch->collator);
3406 ceMask = getMask(strsrch->strength);
3407 if (strsrch->ceMask != ceMask) {
3408 strsrch->ceMask = ceMask;
3409 sameCollAttribute = FALSE;
3410 }
729e4ab9 3411
b75a7d8f 3412 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
729e4ab9 3413 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
b75a7d8f
A
3414 &status) == UCOL_SHIFTED;
3415 if (strsrch->toShift != shift) {
3416 strsrch->toShift = shift;
3417 sameCollAttribute = FALSE;
3418 }
3419
3420 // if status is a failure, ucol_getVariableTop returns 0
3421 varTop = ucol_getVariableTop(strsrch->collator, &status);
3422 if (strsrch->variableTop != varTop) {
3423 strsrch->variableTop = varTop;
3424 sameCollAttribute = FALSE;
3425 }
3426 if (!sameCollAttribute) {
3427 initialize(strsrch, &status);
3428 }
57a6839d 3429 ucol_setText(strsrch->textIter, strsrch->search->text,
729e4ab9 3430 strsrch->search->textLength,
729e4ab9 3431 &status);
b75a7d8f
A
3432 strsrch->search->matchedLength = 0;
3433 strsrch->search->matchedIndex = USEARCH_DONE;
3434 strsrch->search->isOverlap = FALSE;
3435 strsrch->search->isCanonicalMatch = FALSE;
729e4ab9 3436 strsrch->search->elementComparisonType = 0;
b75a7d8f
A
3437 strsrch->search->isForwardSearching = TRUE;
3438 strsrch->search->reset = TRUE;
3439 }
3440}
3441
46f4442e
A
3442//
3443// CEI Collation Element + source text index.
3444// These structs are kept in the circular buffer.
3445//
3446struct CEI {
3447 int64_t ce;
3448 int32_t lowIndex;
3449 int32_t highIndex;
3450};
3451
3452U_NAMESPACE_BEGIN
3453
57a6839d 3454namespace {
46f4442e 3455//
b331163b 3456// CEIBuffer A circular buffer of CEs-with-index from the text being searched.
46f4442e 3457//
4388f060
A
3458#define DEFAULT_CEBUFFER_SIZE 96
3459#define CEBUFFER_EXTRA 32
3460// Some typical max values to make buffer size more reasonable for asymmetric search.
3461// #8694 is for a better long-term solution to allocation of this buffer.
3462#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3463#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3464#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
b331163b 3465struct CEIBuffer {
46f4442e
A
3466 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3467 CEI *buf;
3468 int32_t bufSize;
3469 int32_t firstIx;
3470 int32_t limitIx;
3471 UCollationElements *ceIter;
3472 UStringSearch *strSearch;
3473
3474
3475
b331163b
A
3476 CEIBuffer(UStringSearch *ss, UErrorCode *status);
3477 ~CEIBuffer();
46f4442e
A
3478 const CEI *get(int32_t index);
3479 const CEI *getPrevious(int32_t index);
3480};
3481
3482
b331163b 3483CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
46f4442e
A
3484 buf = defBuf;
3485 strSearch = ss;
b331163b 3486 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
4388f060
A
3487 if (ss->search->elementComparisonType != 0) {
3488 const UChar * patText = ss->pattern.text;
3489 if (patText) {
3490 const UChar * patTextLimit = patText + ss->pattern.textLength;
3491 while ( patText < patTextLimit ) {
3492 UChar c = *patText++;
3493 if (MIGHT_BE_JAMO_L(c)) {
3494 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3495 } else {
3496 // No check for surrogates, we might allocate slightly more buffer than necessary.
3497 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3498 }
3499 }
3500 }
3501 }
46f4442e
A
3502 ceIter = ss->textIter;
3503 firstIx = 0;
3504 limitIx = 0;
3505
57a6839d 3506 if (!initTextProcessedIter(ss, status)) { return; }
46f4442e
A
3507
3508 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3509 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3510 if (buf == NULL) {
3511 *status = U_MEMORY_ALLOCATION_ERROR;
3512 }
3513 }
3514}
3515
3516// TODO: add a reset or init function so that allocated
3517// buffers can be retained & reused.
3518
b331163b 3519CEIBuffer::~CEIBuffer() {
46f4442e
A
3520 if (buf != defBuf) {
3521 uprv_free(buf);
3522 }
3523}
3524
3525
3526// Get the CE with the specified index.
3527// Index must be in the range
3528// n-history_size < index < n+1
3529// where n is the largest index to have been fetched by some previous call to this function.
3530// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3531//
b331163b 3532const CEI *CEIBuffer::get(int32_t index) {
46f4442e
A
3533 int i = index % bufSize;
3534
3535 if (index>=firstIx && index<limitIx) {
3536 // The request was for an entry already in our buffer.
3537 // Just return it.
3538 return &buf[i];
3539 }
3540
3541 // Caller is requesting a new, never accessed before, CE.
3542 // Verify that it is the next one in sequence, which is all
3543 // that is allowed.
3544 if (index != limitIx) {
3545 U_ASSERT(FALSE);
3546
3547 return NULL;
3548 }
3549
3550 // Manage the circular CE buffer indexing
3551 limitIx++;
3552
3553 if (limitIx - firstIx >= bufSize) {
3554 // The buffer is full, knock out the lowest-indexed entry.
3555 firstIx++;
3556 }
3557
3558 UErrorCode status = U_ZERO_ERROR;
3559
57a6839d 3560 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
46f4442e
A
3561
3562 return &buf[i];
3563}
3564
3565// Get the CE with the specified index.
3566// Index must be in the range
3567// n-history_size < index < n+1
3568// where n is the largest index to have been fetched by some previous call to this function.
3569// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3570//
b331163b 3571const CEI *CEIBuffer::getPrevious(int32_t index) {
46f4442e
A
3572 int i = index % bufSize;
3573
3574 if (index>=firstIx && index<limitIx) {
3575 // The request was for an entry already in our buffer.
3576 // Just return it.
3577 return &buf[i];
3578 }
3579
3580 // Caller is requesting a new, never accessed before, CE.
3581 // Verify that it is the next one in sequence, which is all
3582 // that is allowed.
3583 if (index != limitIx) {
3584 U_ASSERT(FALSE);
3585
3586 return NULL;
3587 }
3588
3589 // Manage the circular CE buffer indexing
3590 limitIx++;
3591
3592 if (limitIx - firstIx >= bufSize) {
3593 // The buffer is full, knock out the lowest-indexed entry.
3594 firstIx++;
3595 }
3596
3597 UErrorCode status = U_ZERO_ERROR;
3598
57a6839d 3599 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
46f4442e
A
3600
3601 return &buf[i];
3602}
3603
57a6839d
A
3604}
3605
46f4442e
A
3606U_NAMESPACE_END
3607
3608
3609// #define USEARCH_DEBUG
3610
3611#ifdef USEARCH_DEBUG
3612#include <stdio.h>
3613#include <stdlib.h>
3614#endif
3615
3616/*
3617 * Find the next break boundary after startIndex. If the UStringSearch object
3618 * has an external break iterator, use that. Otherwise use the internal character
3619 * break iterator.
3620 */
3621static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3622#if 0
3623 const UChar *text = strsrch->search->text;
3624 int32_t textLen = strsrch->search->textLength;
729e4ab9 3625
46f4442e
A
3626 U_ASSERT(startIndex>=0);
3627 U_ASSERT(startIndex<=textLen);
729e4ab9 3628
46f4442e
A
3629 if (startIndex >= textLen) {
3630 return startIndex;
3631 }
3632
3633 UChar32 c;
3634 int32_t i = startIndex;
3635 U16_NEXT(text, i, textLen, c);
729e4ab9 3636
46f4442e
A
3637 // If we are on a control character, stop without looking for combining marks.
3638 // Control characters do not combine.
3639 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3640 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3641 return i;
3642 }
729e4ab9 3643
46f4442e
A
3644 // The initial character was not a control, and can thus accept trailing
3645 // combining characters. Advance over however many of them there are.
3646 int32_t indexOfLastCharChecked;
3647 for (;;) {
3648 indexOfLastCharChecked = i;
3649 if (i>=textLen) {
3650 break;
3651 }
3652 U16_NEXT(text, i, textLen, c);
3653 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3654 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3655 break;
3656 }
3657 }
3658 return indexOfLastCharChecked;
3659#elif !UCONFIG_NO_BREAK_ITERATION
3660 UBreakIterator *breakiterator = strsrch->search->breakIter;
3661
3662 if (breakiterator == NULL) {
3663 breakiterator = strsrch->search->internalBreakIter;
3664 }
3665
3666 if (breakiterator != NULL) {
4388f060 3667 return ubrk_following(breakiterator, startIndex);
46f4442e
A
3668 }
3669
3670 return startIndex;
3671#else
3672 // **** or should we use the original code? ****
3673 return startIndex;
3674#endif
3675
3676}
3677
3678/*
3679 * Returns TRUE if index is on a break boundary. If the UStringSearch
3680 * has an external break iterator, test using that, otherwise test
3681 * using the internal character break iterator.
3682 */
3683static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3684#if 0
3685 const UChar *text = strsrch->search->text;
3686 int32_t textLen = strsrch->search->textLength;
729e4ab9 3687
46f4442e
A
3688 U_ASSERT(index>=0);
3689 U_ASSERT(index<=textLen);
729e4ab9 3690
46f4442e 3691 if (index>=textLen || index<=0) {
4388f060 3692 return TRUE;
46f4442e 3693 }
729e4ab9 3694
46f4442e
A
3695 // If the character at the current index is not a GRAPHEME_EXTEND
3696 // then we can not be within a combining sequence.
3697 UChar32 c;
3698 U16_GET(text, 0, index, textLen, c);
3699 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3700 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
4388f060 3701 return TRUE;
46f4442e 3702 }
729e4ab9 3703
46f4442e
A
3704 // We are at a combining mark. If the preceding character is anything
3705 // except a CONTROL, CR or LF, we are in a combining sequence.
729e4ab9 3706 U16_PREV(text, 0, index, c);
46f4442e 3707 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
729e4ab9 3708 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
4388f060 3709 return !combining;
46f4442e
A
3710#elif !UCONFIG_NO_BREAK_ITERATION
3711 UBreakIterator *breakiterator = strsrch->search->breakIter;
3712
3713 if (breakiterator == NULL) {
3714 breakiterator = strsrch->search->internalBreakIter;
3715 }
3716
4388f060 3717 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
46f4442e
A
3718#else
3719 // **** or use the original code? ****
4388f060 3720 return TRUE;
46f4442e 3721#endif
729e4ab9 3722}
46f4442e
A
3723
3724#if 0
3725static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3726{
3727#if !UCONFIG_NO_BREAK_ITERATION
3728 UBreakIterator *breakiterator = strsrch->search->breakIter;
3729
3730 if (breakiterator != NULL) {
3731 int32_t startindex = ubrk_first(breakiterator);
3732 int32_t endindex = ubrk_last(breakiterator);
729e4ab9 3733
46f4442e
A
3734 // out-of-range indexes are never boundary positions
3735 if (start < startindex || start > endindex ||
3736 end < startindex || end > endindex) {
3737 return FALSE;
3738 }
3739
729e4ab9 3740 return ubrk_isBoundary(breakiterator, start) &&
46f4442e
A
3741 ubrk_isBoundary(breakiterator, end);
3742 }
3743#endif
3744
3745 return TRUE;
3746}
3747#endif
3748
729e4ab9
A
3749typedef enum {
3750 U_CE_MATCH = -1,
3751 U_CE_NO_MATCH = 0,
3752 U_CE_SKIP_TARG,
3753 U_CE_SKIP_PATN
3754} UCompareCEsResult;
3755#define U_CE_LEVEL2_BASE 0x00000005
3756#define U_CE_LEVEL3_BASE 0x00050000
3757
3758static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3759 if (targCE == patCE) {
3760 return U_CE_MATCH;
3761 }
3762 if (compareType == 0) {
3763 return U_CE_NO_MATCH;
3764 }
46f4442e 3765
729e4ab9
A
3766 int64_t targCEshifted = targCE >> 32;
3767 int64_t patCEshifted = patCE >> 32;
3768 int64_t mask;
3769
3770 mask = 0xFFFF0000;
3771 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3772 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3773 if ( targLev1 != patLev1 ) {
3774 if ( targLev1 == 0 ) {
3775 return U_CE_SKIP_TARG;
3776 }
3777 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3778 return U_CE_SKIP_PATN;
3779 }
3780 return U_CE_NO_MATCH;
3781 }
3782
3783 mask = 0x0000FFFF;
3784 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3785 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3786 if ( targLev2 != patLev2 ) {
3787 if ( targLev2 == 0 ) {
3788 return U_CE_SKIP_TARG;
3789 }
3790 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3791 return U_CE_SKIP_PATN;
3792 }
3793 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3794 U_CE_MATCH: U_CE_NO_MATCH;
3795 }
3796
3797 mask = 0xFFFF0000;
3798 int32_t targLev3 = (int32_t)(targCE & mask);
3799 int32_t patLev3 = (int32_t)(patCE & mask);
3800 if ( targLev3 != patLev3 ) {
3801 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3802 U_CE_MATCH: U_CE_NO_MATCH;
3803 }
3804
3805 return U_CE_MATCH;
3806}
3807
3808#if BOYER_MOORE
3809// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3810#endif
3811
2ca993e8
A
3812namespace {
3813
3814UChar32 codePointAt(const USearch &search, int32_t index) {
3815 if (index < search.textLength) {
3816 UChar32 c;
3817 U16_NEXT(search.text, index, search.textLength, c);
3818 return c;
3819 }
3820 return U_SENTINEL;
3821}
3822
3823UChar32 codePointBefore(const USearch &search, int32_t index) {
3824 if (0 < index) {
3825 UChar32 c;
3826 U16_PREV(search.text, 0, index, c);
3827 return c;
3828 }
3829 return U_SENTINEL;
3830}
3831
3832} // namespace
3833
46f4442e
A
3834U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3835 int32_t startIdx,
3836 int32_t *matchStart,
3837 int32_t *matchLimit,
729e4ab9 3838 UErrorCode *status)
46f4442e
A
3839{
3840 if (U_FAILURE(*status)) {
3841 return FALSE;
3842 }
3843
3844 // TODO: reject search patterns beginning with a combining char.
3845
3846#ifdef USEARCH_DEBUG
3847 if (getenv("USEARCH_DEBUG") != NULL) {
3848 printf("Pattern CEs\n");
b331163b
A
3849 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3850 printf(" %8x", strsrch->pattern.ces[ii]);
46f4442e
A
3851 }
3852 printf("\n");
3853 }
729e4ab9 3854
46f4442e
A
3855#endif
3856 // Input parameter sanity check.
3857 // TODO: should input indicies clip to the text length
3858 // in the same way that UText does.
b331163b 3859 if(strsrch->pattern.cesLength == 0 ||
46f4442e
A
3860 startIdx < 0 ||
3861 startIdx > strsrch->search->textLength ||
b331163b 3862 strsrch->pattern.ces == NULL) {
46f4442e
A
3863 *status = U_ILLEGAL_ARGUMENT_ERROR;
3864 return FALSE;
3865 }
3866
b331163b 3867 if (strsrch->pattern.pces == NULL) {
46f4442e
A
3868 initializePatternPCETable(strsrch, status);
3869 }
3870
3871 ucol_setOffset(strsrch->textIter, startIdx, status);
b331163b 3872 CEIBuffer ceb(strsrch, status);
46f4442e 3873
729e4ab9
A
3874
3875 int32_t targetIx = 0;
3876 const CEI *targetCEI = NULL;
46f4442e
A
3877 int32_t patIx;
3878 UBool found;
3879
3880 int32_t mStart = -1;
3881 int32_t mLimit = -1;
3882 int32_t minLimit;
3883 int32_t maxLimit;
729e4ab9
A
3884
3885
3886
46f4442e
A
3887 // Outer loop moves over match starting positions in the
3888 // target CE space.
729e4ab9
A
3889 // Here we see the target as a sequence of collation elements, resulting from the following:
3890 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3891 // (for example, digraphs such as IJ may be broken into two characters).
3892 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3893 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3894 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3895 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3896 // then the CE is deleted, so the following code sees only CEs that are relevant.
3897 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3898 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3899 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3900 //
46f4442e
A
3901 for(targetIx=0; ; targetIx++)
3902 {
3903 found = TRUE;
3904 // Inner loop checks for a match beginning at each
3905 // position from the outer loop.
729e4ab9
A
3906 int32_t targetIxOffset = 0;
3907 int64_t patCE = 0;
4388f060
A
3908 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3909 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3910 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3911 const CEI *firstCEI = ceb.get(targetIx);
3912 if (firstCEI == NULL) {
3913 *status = U_INTERNAL_PROGRAM_ERROR;
3914 found = FALSE;
3915 break;
3916 }
3917
b331163b
A
3918 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3919 patCE = strsrch->pattern.pces[patIx];
729e4ab9 3920 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
46f4442e 3921 // Compare CE from target string with CE from the pattern.
729e4ab9 3922 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
46f4442e 3923 // which will fail the compare, below.
729e4ab9
A
3924 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3925 if ( ceMatch == U_CE_NO_MATCH ) {
46f4442e
A
3926 found = FALSE;
3927 break;
729e4ab9
A
3928 } else if ( ceMatch > U_CE_NO_MATCH ) {
3929 if ( ceMatch == U_CE_SKIP_TARG ) {
3930 // redo with same patCE, next targCE
3931 patIx--;
3932 targetIxOffset++;
3933 } else { // ceMatch == U_CE_SKIP_PATN
3934 // redo with same targCE, next patCE
3935 targetIxOffset--;
3936 }
46f4442e
A
3937 }
3938 }
b331163b 3939 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
46f4442e 3940
729e4ab9 3941 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
46f4442e
A
3942 // No match at this targetIx. Try again at the next.
3943 continue;
3944 }
3945
3946 if (!found) {
3947 // No match at all, we have run off the end of the target text.
3948 break;
3949 }
3950
3951
3952 // We have found a match in CE space.
3953 // Now determine the bounds in string index space.
3954 // There still is a chance of match failure if the CE range not correspond to
3955 // an acceptable character range.
3956 //
729e4ab9 3957 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
46f4442e 3958
46f4442e
A
3959 mStart = firstCEI->lowIndex;
3960 minLimit = lastCEI->lowIndex;
46f4442e
A
3961
3962 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3963 // extended to the end of input, and the match is good.
3964
3965 // Look at the high and low indices of the CE following the match. If
3966 // they are the same it means one of two things:
3967 // 1. The match extended to the last CE from the target text, which is OK, or
3968 // 2. The last CE that was part of the match is in an expansion that extends
3969 // to the first CE after the match. In this case, we reject the match.
4388f060 3970 const CEI *nextCEI = 0;
729e4ab9 3971 if (strsrch->search->elementComparisonType == 0) {
4388f060 3972 nextCEI = ceb.get(targetIx + targetIxOffset);
729e4ab9
A
3973 maxLimit = nextCEI->lowIndex;
3974 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3975 found = FALSE;
3976 }
3977 } else {
729e4ab9
A
3978 for ( ; ; ++targetIxOffset ) {
3979 nextCEI = ceb.get(targetIx + targetIxOffset);
3980 maxLimit = nextCEI->lowIndex;
4388f060 3981 // If we are at the end of the target too, match succeeds
729e4ab9
A
3982 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3983 break;
3984 }
3985 // As long as the next CE has primary weight of 0,
3986 // it is part of the last target element matched by the pattern;
3987 // make sure it can be part of a match with the last patCE
3988 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
4388f060
A
3989 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3990 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3991 found = FALSE;
3992 break;
3993 }
729e4ab9
A
3994 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3995 // target element, but it has non-zero primary weight => match fails
3996 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
4388f060
A
3997 found = false;
3998 break;
729e4ab9
A
3999 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4000 } else {
4388f060 4001 break;
729e4ab9
A
4002 }
4003 }
46f4442e 4004 }
729e4ab9 4005
46f4442e
A
4006
4007 // Check for the start of the match being within a combining sequence.
4008 // This can happen if the pattern itself begins with a combining char, and
4009 // the match found combining marks in the target text that were attached
4010 // to something else.
4011 // This type of match should be rejected for not completely consuming a
4012 // combining sequence.
4388f060 4013 if (!isBreakBoundary(strsrch, mStart)) {
46f4442e
A
4014 found = FALSE;
4015 }
4016
4017 // Check for the start of the match being within an Collation Element Expansion,
4018 // meaning that the first char of the match is only partially matched.
729e4ab9 4019 // With exapnsions, the first CE will report the index of the source
46f4442e 4020 // character, and all subsequent (expansions) CEs will report the source index of the
729e4ab9 4021 // _following_ character.
46f4442e
A
4022 int32_t secondIx = firstCEI->highIndex;
4023 if (mStart == secondIx) {
4024 found = FALSE;
4025 }
729e4ab9 4026
2ca993e8
A
4027 // Allow matches to end in the middle of a grapheme cluster if the following
4028 // conditions are met; this is needed to make prefix search work properly in
4029 // Indic, see #11750
4030 // * the default breakIter is being used
4031 // * the next collation element after this combining sequence
4032 // - has non-zero primary weight
4033 // - corresponds to a separate character following the one at end of the current match
4034 // (the second of these conditions, and perhaps both, may be redundant given the
4035 // subsequent check for normalization boundary; however they are likely much faster
4036 // tests in any case)
4037 // * the match limit is a normalization boundary
4038 UBool allowMidclusterMatch = FALSE;
4039 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4040 allowMidclusterMatch =
4041 strsrch->search->breakIter == NULL &&
4042 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4043 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4044 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4045 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4046 }
4047 // If those conditions are met, then:
4048 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4049 // the match limit may be backed off to a previous break boundary. This handles
4050 // cases in which mLimit includes target characters that are ignorable with current
4051 // settings (such as space) and which extend beyond the pattern match.
4052 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4053 // * do NOT require that match limit be on a breakIter boundary
4054
46f4442e
A
4055 // Advance the match end position to the first acceptable match boundary.
4056 // This advances the index over any combining charcters.
4057 mLimit = maxLimit;
4058 if (minLimit < maxLimit) {
4388f060
A
4059 // When the last CE's low index is same with its high index, the CE is likely
4060 // a part of expansion. In this case, the index is located just after the
4061 // character corresponding to the CEs compared above. If the index is right
4062 // at the break boundary, move the position to the next boundary will result
4063 // incorrect match length when there are ignorable characters exist between
4064 // the position and the next character produces CE(s). See ticket#8482.
4065 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4066 mLimit = minLimit;
4067 } else {
4068 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
2ca993e8
A
4069 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4070 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4071 // (i.e. we back off mLimit to the previous breakIterator boundary).
4072 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4388f060
A
4073 mLimit = nba;
4074 }
46f4442e
A
4075 }
4076 }
729e4ab9 4077
46f4442e
A
4078 #ifdef USEARCH_DEBUG
4079 if (getenv("USEARCH_DEBUG") != NULL) {
4080 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4081 }
4082 #endif
729e4ab9 4083
2ca993e8 4084 if (!allowMidclusterMatch) {
b331163b
A
4085 // If advancing to the end of a combining sequence in character indexing space
4086 // advanced us beyond the end of the match in CE space, reject this match.
4087 if (mLimit > maxLimit) {
4088 found = FALSE;
4089 }
46f4442e 4090
b331163b
A
4091 if (!isBreakBoundary(strsrch, mLimit)) {
4092 found = FALSE;
4093 }
46f4442e
A
4094 }
4095
729e4ab9
A
4096 if (! checkIdentical(strsrch, mStart, mLimit)) {
4097 found = FALSE;
4098 }
4099
46f4442e
A
4100 if (found) {
4101 break;
4102 }
4103 }
4104
4105 #ifdef USEARCH_DEBUG
4106 if (getenv("USEARCH_DEBUG") != NULL) {
4107 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4108 int32_t lastToPrint = ceb.limitIx+2;
4109 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4110 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4111 }
4112 printf("\n%s\n", found? "match found" : "no match");
4113 }
4114 #endif
4115
4116 // All Done. Store back the match bounds to the caller.
4117 //
4118 if (found==FALSE) {
4119 mLimit = -1;
4120 mStart = -1;
4121 }
4122
4123 if (matchStart != NULL) {
4124 *matchStart= mStart;
4125 }
4126
4127 if (matchLimit != NULL) {
4128 *matchLimit = mLimit;
4129 }
4130
4131 return found;
4132}
4133
46f4442e
A
4134U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4135 int32_t startIdx,
4136 int32_t *matchStart,
4137 int32_t *matchLimit,
729e4ab9 4138 UErrorCode *status)
46f4442e
A
4139{
4140 if (U_FAILURE(*status)) {
4141 return FALSE;
4142 }
4143
4144 // TODO: reject search patterns beginning with a combining char.
4145
4146#ifdef USEARCH_DEBUG
4147 if (getenv("USEARCH_DEBUG") != NULL) {
4148 printf("Pattern CEs\n");
b331163b
A
4149 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4150 printf(" %8x", strsrch->pattern.ces[ii]);
46f4442e
A
4151 }
4152 printf("\n");
4153 }
729e4ab9 4154
46f4442e
A
4155#endif
4156 // Input parameter sanity check.
4157 // TODO: should input indicies clip to the text length
4158 // in the same way that UText does.
b331163b 4159 if(strsrch->pattern.cesLength == 0 ||
46f4442e
A
4160 startIdx < 0 ||
4161 startIdx > strsrch->search->textLength ||
b331163b 4162 strsrch->pattern.ces == NULL) {
46f4442e
A
4163 *status = U_ILLEGAL_ARGUMENT_ERROR;
4164 return FALSE;
4165 }
4166
b331163b 4167 if (strsrch->pattern.pces == NULL) {
46f4442e
A
4168 initializePatternPCETable(strsrch, status);
4169 }
4170
b331163b 4171 CEIBuffer ceb(strsrch, status);
729e4ab9 4172 int32_t targetIx = 0;
46f4442e
A
4173
4174 /*
4175 * Pre-load the buffer with the CE's for the grapheme
4176 * after our starting position so that we're sure that
4177 * we can look at the CE following the match when we
4178 * check the match boundaries.
4179 *
4180 * This will also pre-fetch the first CE that we'll
4181 * consider for the match.
4182 */
4183 if (startIdx < strsrch->search->textLength) {
4184 UBreakIterator *bi = strsrch->search->internalBreakIter;
4185 int32_t next = ubrk_following(bi, startIdx);
4186
4187 ucol_setOffset(strsrch->textIter, next, status);
4188
4189 for (targetIx = 0; ; targetIx += 1) {
4190 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4191 break;
4192 }
4193 }
4194 } else {
4195 ucol_setOffset(strsrch->textIter, startIdx, status);
4196 }
46f4442e 4197
729e4ab9
A
4198
4199 const CEI *targetCEI = NULL;
46f4442e
A
4200 int32_t patIx;
4201 UBool found;
4202
4203 int32_t limitIx = targetIx;
4204 int32_t mStart = -1;
4205 int32_t mLimit = -1;
4206 int32_t minLimit;
4207 int32_t maxLimit;
729e4ab9
A
4208
4209
4210
46f4442e
A
4211 // Outer loop moves over match starting positions in the
4212 // target CE space.
729e4ab9
A
4213 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4214 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4215 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4216 // and the beginning of the base text.
46f4442e
A
4217 for(targetIx = limitIx; ; targetIx += 1)
4218 {
4219 found = TRUE;
4388f060
A
4220 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4221 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4222 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4223 const CEI *lastCEI = ceb.getPrevious(targetIx);
4224 if (lastCEI == NULL) {
4225 *status = U_INTERNAL_PROGRAM_ERROR;
4226 found = FALSE;
4227 break;
4228 }
46f4442e
A
4229 // Inner loop checks for a match beginning at each
4230 // position from the outer loop.
729e4ab9 4231 int32_t targetIxOffset = 0;
b331163b
A
4232 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4233 int64_t patCE = strsrch->pattern.pces[patIx];
46f4442e 4234
b331163b 4235 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
46f4442e
A
4236 // Compare CE from target string with CE from the pattern.
4237 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4238 // which will fail the compare, below.
729e4ab9
A
4239 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4240 if ( ceMatch == U_CE_NO_MATCH ) {
46f4442e
A
4241 found = FALSE;
4242 break;
729e4ab9
A
4243 } else if ( ceMatch > U_CE_NO_MATCH ) {
4244 if ( ceMatch == U_CE_SKIP_TARG ) {
4245 // redo with same patCE, next targCE
4246 patIx++;
4247 targetIxOffset++;
4248 } else { // ceMatch == U_CE_SKIP_PATN
4249 // redo with same targCE, next patCE
4250 targetIxOffset--;
4251 }
46f4442e
A
4252 }
4253 }
4254
729e4ab9 4255 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
46f4442e
A
4256 // No match at this targetIx. Try again at the next.
4257 continue;
4258 }
4259
4260 if (!found) {
4261 // No match at all, we have run off the end of the target text.
4262 break;
4263 }
4264
4265
4266 // We have found a match in CE space.
4267 // Now determine the bounds in string index space.
4268 // There still is a chance of match failure if the CE range not correspond to
4269 // an acceptable character range.
4270 //
b331163b 4271 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
46f4442e 4272 mStart = firstCEI->lowIndex;
46f4442e
A
4273
4274 // Check for the start of the match being within a combining sequence.
4275 // This can happen if the pattern itself begins with a combining char, and
4276 // the match found combining marks in the target text that were attached
4277 // to something else.
4278 // This type of match should be rejected for not completely consuming a
4279 // combining sequence.
4388f060 4280 if (!isBreakBoundary(strsrch, mStart)) {
46f4442e
A
4281 found = FALSE;
4282 }
4283
4284 // Look at the high index of the first CE in the match. If it's the same as the
4285 // low index, the first CE in the match is in the middle of an expansion.
4286 if (mStart == firstCEI->highIndex) {
4287 found = FALSE;
4288 }
729e4ab9 4289
46f4442e 4290
4388f060
A
4291 minLimit = lastCEI->lowIndex;
4292
4293 if (targetIx > 0) {
4294 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4295 // extended to the end of input, and the match is good.
4296
4297 // Look at the high and low indices of the CE following the match. If
4298 // they are the same it means one of two things:
4299 // 1. The match extended to the last CE from the target text, which is OK, or
4300 // 2. The last CE that was part of the match is in an expansion that extends
4301 // to the first CE after the match. In this case, we reject the match.
4302 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4303
4304 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4305 found = FALSE;
4306 }
4307
4308 mLimit = maxLimit = nextCEI->lowIndex;
4309
2ca993e8
A
4310 // Allow matches to end in the middle of a grapheme cluster if the following
4311 // conditions are met; this is needed to make prefix search work properly in
4312 // Indic, see #11750
4313 // * the default breakIter is being used
4314 // * the next collation element after this combining sequence
4315 // - has non-zero primary weight
4316 // - corresponds to a separate character following the one at end of the current match
4317 // (the second of these conditions, and perhaps both, may be redundant given the
4318 // subsequent check for normalization boundary; however they are likely much faster
4319 // tests in any case)
4320 // * the match limit is a normalization boundary
4321 UBool allowMidclusterMatch = FALSE;
4322 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4323 allowMidclusterMatch =
4324 strsrch->search->breakIter == NULL &&
4325 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4326 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4327 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4328 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4329 }
4330 // If those conditions are met, then:
4331 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4332 // the match limit may be backed off to a previous break boundary. This handles
4333 // cases in which mLimit includes target characters that are ignorable with current
4334 // settings (such as space) and which extend beyond the pattern match.
4335 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4336 // * do NOT require that match limit be on a breakIter boundary
4337
4388f060 4338 // Advance the match end position to the first acceptable match boundary.
b331163b 4339 // This advances the index over any combining characters.
4388f060
A
4340 if (minLimit < maxLimit) {
4341 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
2ca993e8
A
4342 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4343 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4344 // (i.e. we back off mLimit to the previous breakIterator boundary).
4345 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4388f060
A
4346 mLimit = nba;
4347 }
46f4442e 4348 }
4388f060 4349
2ca993e8 4350 if (!allowMidclusterMatch) {
b331163b
A
4351 // If advancing to the end of a combining sequence in character indexing space
4352 // advanced us beyond the end of the match in CE space, reject this match.
4353 if (mLimit > maxLimit) {
4354 found = FALSE;
4355 }
4388f060 4356
b331163b
A
4357 // Make sure the end of the match is on a break boundary
4358 if (!isBreakBoundary(strsrch, mLimit)) {
4359 found = FALSE;
4360 }
4388f060
A
4361 }
4362
4363 } else {
4364 // No non-ignorable CEs after this point.
4365 // The maximum position is detected by boundary after
4366 // the last non-ignorable CE. Combining sequence
4367 // across the start index will be truncated.
4368 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4369 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
46f4442e 4370 }
729e4ab9 4371
46f4442e
A
4372 #ifdef USEARCH_DEBUG
4373 if (getenv("USEARCH_DEBUG") != NULL) {
4374 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4375 }
4376 #endif
729e4ab9 4377
46f4442e 4378
729e4ab9
A
4379 if (! checkIdentical(strsrch, mStart, mLimit)) {
4380 found = FALSE;
4381 }
4382
46f4442e
A
4383 if (found) {
4384 break;
4385 }
4386 }
4387
4388 #ifdef USEARCH_DEBUG
4389 if (getenv("USEARCH_DEBUG") != NULL) {
4390 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4391 int32_t lastToPrint = ceb.limitIx+2;
4392 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4393 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4394 }
4395 printf("\n%s\n", found? "match found" : "no match");
4396 }
4397 #endif
4398
4399 // All Done. Store back the match bounds to the caller.
4400 //
4401 if (found==FALSE) {
4402 mLimit = -1;
4403 mStart = -1;
4404 }
4405
4406 if (matchStart != NULL) {
4407 *matchStart= mStart;
4408 }
4409
4410 if (matchLimit != NULL) {
4411 *matchLimit = mLimit;
4412 }
4413
4414 return found;
4415}
4416
b75a7d8f
A
4417// internal use methods declared in usrchimp.h -----------------------------
4418
4419UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4420{
4421 if (U_FAILURE(*status)) {
4422 setMatchNotFound(strsrch);
4423 return FALSE;
4424 }
4425
46f4442e 4426#if BOYER_MOORE
374ca955 4427 UCollationElements *coleiter = strsrch->textIter;
b75a7d8f 4428 int32_t textlength = strsrch->search->textLength;
b331163b
A
4429 int32_t *patternce = strsrch->pattern.ces;
4430 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
4431 int32_t textoffset = ucol_getOffset(coleiter);
4432
374ca955 4433 // status used in setting coleiter offset, since offset is checked in
729e4ab9 4434 // shiftForward before setting the coleiter offset, status never
374ca955 4435 // a failure
729e4ab9 4436 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4437 patterncelength);
4438 while (textoffset <= textlength)
4439 {
4440 uint32_t patternceindex = patterncelength - 1;
374ca955 4441 int32_t targetce;
b75a7d8f 4442 UBool found = FALSE;
374ca955
A
4443 int32_t lastce = UCOL_NULLORDER;
4444
4445 setColEIterOffset(coleiter, textoffset);
4446
46f4442e 4447 for (;;) {
b75a7d8f
A
4448 // finding the last pattern ce match, imagine composite characters
4449 // for example: search for pattern A in text \u00C0
4450 // we'll have to skip \u0300 the grave first before we get to A
4451 targetce = ucol_previous(coleiter, status);
4452 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4453 found = FALSE;
4454 break;
4455 }
4456 targetce = getCE(strsrch, targetce);
729e4ab9
A
4457 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4458 // this is for the text \u0315\u0300 that requires
b75a7d8f
A
4459 // normalization and pattern \u0300, where \u0315 is ignorable
4460 continue;
4461 }
4462 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4463 lastce = targetce;
4464 }
729e4ab9 4465 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4466 if (targetce == patternce[patternceindex]) {
4467 // the first ce can be a contraction
4468 found = TRUE;
4469 break;
4470 }
4471 if (!hasExpansion(coleiter)) {
4472 found = FALSE;
4473 break;
4474 }
4475 }
4476
46f4442e 4477 //targetce = lastce;
729e4ab9 4478
b75a7d8f 4479 while (found && patternceindex > 0) {
4388f060 4480 lastce = targetce;
b75a7d8f
A
4481 targetce = ucol_previous(coleiter, status);
4482 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4483 found = FALSE;
4484 break;
4485 }
4486 targetce = getCE(strsrch, targetce);
4487 if (targetce == UCOL_IGNORABLE) {
4488 continue;
4489 }
4490
4491 patternceindex --;
729e4ab9
A
4492 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4493 found = found && targetce == patternce[patternceindex];
b75a7d8f 4494 }
729e4ab9 4495
46f4442e 4496 targetce = lastce;
b75a7d8f
A
4497
4498 if (!found) {
374ca955
A
4499 if (U_FAILURE(*status)) {
4500 break;
4501 }
729e4ab9 4502 textoffset = shiftForward(strsrch, textoffset, lastce,
b75a7d8f
A
4503 patternceindex);
4504 // status checked at loop.
4505 patternceindex = patterncelength;
4506 continue;
4507 }
374ca955
A
4508
4509 if (checkNextExactMatch(strsrch, &textoffset, status)) {
b75a7d8f 4510 // status checked in ucol_setOffset
374ca955
A
4511 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4512 return TRUE;
b75a7d8f
A
4513 }
4514 }
4515 setMatchNotFound(strsrch);
374ca955 4516 return FALSE;
46f4442e
A
4517#else
4518 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4519 int32_t start = -1;
4520 int32_t end = -1;
4521
4522 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4523 strsrch->search->matchedIndex = start;
4524 strsrch->search->matchedLength = end - start;
4525 return TRUE;
4526 } else {
4527 setMatchNotFound(strsrch);
4528 return FALSE;
4529 }
4530#endif
b75a7d8f
A
4531}
4532
4533UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4534{
4535 if (U_FAILURE(*status)) {
4536 setMatchNotFound(strsrch);
4537 return FALSE;
4538 }
4539
46f4442e 4540#if BOYER_MOORE
b75a7d8f
A
4541 UCollationElements *coleiter = strsrch->textIter;
4542 int32_t textlength = strsrch->search->textLength;
b331163b
A
4543 int32_t *patternce = strsrch->pattern.ces;
4544 int32_t patterncelength = strsrch->pattern.cesLength;
374ca955 4545 int32_t textoffset = ucol_getOffset(coleiter);
729e4ab9 4546 UBool hasPatternAccents =
b75a7d8f 4547 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
729e4ab9
A
4548
4549 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4550 patterncelength);
4551 strsrch->canonicalPrefixAccents[0] = 0;
4552 strsrch->canonicalSuffixAccents[0] = 0;
729e4ab9 4553
b75a7d8f
A
4554 while (textoffset <= textlength)
4555 {
4556 int32_t patternceindex = patterncelength - 1;
374ca955 4557 int32_t targetce;
b75a7d8f 4558 UBool found = FALSE;
374ca955
A
4559 int32_t lastce = UCOL_NULLORDER;
4560
4561 setColEIterOffset(coleiter, textoffset);
4562
4563 for (;;) {
b75a7d8f
A
4564 // finding the last pattern ce match, imagine composite characters
4565 // for example: search for pattern A in text \u00C0
4566 // we'll have to skip \u0300 the grave first before we get to A
4567 targetce = ucol_previous(coleiter, status);
4568 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4569 found = FALSE;
4570 break;
4571 }
4572 targetce = getCE(strsrch, targetce);
4573 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4574 lastce = targetce;
4575 }
729e4ab9 4576 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4577 if (targetce == patternce[patternceindex]) {
4578 // the first ce can be a contraction
4579 found = TRUE;
4580 break;
4581 }
4582 if (!hasExpansion(coleiter)) {
4583 found = FALSE;
4584 break;
4585 }
4586 }
729e4ab9 4587
b75a7d8f
A
4588 while (found && patternceindex > 0) {
4589 targetce = ucol_previous(coleiter, status);
4590 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4591 found = FALSE;
4592 break;
4593 }
4594 targetce = getCE(strsrch, targetce);
4595 if (targetce == UCOL_IGNORABLE) {
4596 continue;
4597 }
4598
4599 patternceindex --;
729e4ab9
A
4600 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4601 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4602 }
4603
4604 // initializing the rearranged accent array
4605 if (hasPatternAccents && !found) {
4606 strsrch->canonicalPrefixAccents[0] = 0;
4607 strsrch->canonicalSuffixAccents[0] = 0;
374ca955
A
4608 if (U_FAILURE(*status)) {
4609 break;
4610 }
b75a7d8f
A
4611 found = doNextCanonicalMatch(strsrch, textoffset, status);
4612 }
4613
4614 if (!found) {
374ca955
A
4615 if (U_FAILURE(*status)) {
4616 break;
4617 }
729e4ab9 4618 textoffset = shiftForward(strsrch, textoffset, lastce,
b75a7d8f
A
4619 patternceindex);
4620 // status checked at loop
4621 patternceindex = patterncelength;
4622 continue;
4623 }
729e4ab9 4624
b75a7d8f
A
4625 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4626 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4627 return TRUE;
4628 }
4629 }
4630 setMatchNotFound(strsrch);
4631 return FALSE;
46f4442e
A
4632#else
4633 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4634 int32_t start = -1;
4635 int32_t end = -1;
4636
4637 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4638 strsrch->search->matchedIndex = start;
4639 strsrch->search->matchedLength = end - start;
4640 return TRUE;
4641 } else {
4642 setMatchNotFound(strsrch);
4643 return FALSE;
4644 }
4645#endif
b75a7d8f
A
4646}
4647
4648UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4649{
4650 if (U_FAILURE(*status)) {
4651 setMatchNotFound(strsrch);
4652 return FALSE;
4653 }
4654
46f4442e 4655#if BOYER_MOORE
b75a7d8f 4656 UCollationElements *coleiter = strsrch->textIter;
b331163b
A
4657 int32_t *patternce = strsrch->pattern.ces;
4658 int32_t patterncelength = strsrch->pattern.cesLength;
374ca955 4659 int32_t textoffset = ucol_getOffset(coleiter);
b75a7d8f
A
4660
4661 // shifting it check for setting offset
4662 // if setOffset is called previously or there was no previous match, we
4663 // leave the offset as it is.
4664 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4665 textoffset = strsrch->search->matchedIndex;
4666 }
729e4ab9
A
4667
4668 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f 4669 patterncelength);
729e4ab9 4670
b75a7d8f
A
4671 while (textoffset >= 0)
4672 {
4673 int32_t patternceindex = 1;
374ca955 4674 int32_t targetce;
b75a7d8f 4675 UBool found = FALSE;
374ca955 4676 int32_t firstce = UCOL_NULLORDER;
b75a7d8f 4677
374ca955 4678 // if status is a failure, ucol_setOffset does nothing
b75a7d8f 4679 setColEIterOffset(coleiter, textoffset);
374ca955
A
4680
4681 for (;;) {
729e4ab9
A
4682 // finding the first pattern ce match, imagine composite
4683 // characters. for example: search for pattern \u0300 in text
4684 // \u00C0, we'll have to skip A first before we get to
b75a7d8f
A
4685 // \u0300 the grave accent
4686 targetce = ucol_next(coleiter, status);
4687 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4688 found = FALSE;
4689 break;
4690 }
4691 targetce = getCE(strsrch, targetce);
4692 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4693 firstce = targetce;
4694 }
46f4442e 4695 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
b75a7d8f 4696 continue;
729e4ab9
A
4697 }
4698 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4699 if (targetce == patternce[0]) {
4700 found = TRUE;
4701 break;
4702 }
4703 if (!hasExpansion(coleiter)) {
4704 // checking for accents in composite character
4705 found = FALSE;
4706 break;
4707 }
4708 }
4709
46f4442e 4710 //targetce = firstce;
729e4ab9 4711
b75a7d8f 4712 while (found && (patternceindex < patterncelength)) {
4388f060 4713 firstce = targetce;
b75a7d8f
A
4714 targetce = ucol_next(coleiter, status);
4715 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4716 found = FALSE;
4717 break;
4718 }
4719 targetce = getCE(strsrch, targetce);
4720 if (targetce == UCOL_IGNORABLE) {
4721 continue;
4722 }
4723
729e4ab9
A
4724 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4725 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4726 patternceindex ++;
4727 }
729e4ab9 4728
46f4442e 4729 targetce = firstce;
b75a7d8f
A
4730
4731 if (!found) {
374ca955
A
4732 if (U_FAILURE(*status)) {
4733 break;
4734 }
729e4ab9
A
4735
4736 textoffset = reverseShift(strsrch, textoffset, targetce,
b75a7d8f
A
4737 patternceindex);
4738 patternceindex = 0;
4739 continue;
4740 }
729e4ab9 4741
b75a7d8f
A
4742 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4743 setColEIterOffset(coleiter, textoffset);
4744 return TRUE;
4745 }
4746 }
4747 setMatchNotFound(strsrch);
4748 return FALSE;
46f4442e 4749#else
4388f060
A
4750 int32_t textOffset;
4751
4752 if (strsrch->search->isOverlap) {
4753 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4754 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4755 } else {
4756 // move the start position at the end of possible match
4757 initializePatternPCETable(strsrch, status);
57a6839d
A
4758 if (!initTextProcessedIter(strsrch, status)) {
4759 setMatchNotFound(strsrch);
4760 return FALSE;
4761 }
b331163b 4762 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
57a6839d 4763 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4388f060
A
4764 if (pce == UCOL_PROCESSED_NULLORDER) {
4765 // at the end of the text
4766 break;
4767 }
4768 }
4769 if (U_FAILURE(*status)) {
4770 setMatchNotFound(strsrch);
4771 return FALSE;
4772 }
4773 textOffset = ucol_getOffset(strsrch->textIter);
4774 }
4775 } else {
4776 textOffset = ucol_getOffset(strsrch->textIter);
4777 }
4778
46f4442e
A
4779 int32_t start = -1;
4780 int32_t end = -1;
4781
4782 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4783 strsrch->search->matchedIndex = start;
4784 strsrch->search->matchedLength = end - start;
4785 return TRUE;
4786 } else {
4787 setMatchNotFound(strsrch);
4788 return FALSE;
4789 }
4790#endif
b75a7d8f
A
4791}
4792
729e4ab9 4793UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
b75a7d8f
A
4794 UErrorCode *status)
4795{
4796 if (U_FAILURE(*status)) {
4797 setMatchNotFound(strsrch);
4798 return FALSE;
4799 }
4800
46f4442e 4801#if BOYER_MOORE
b75a7d8f 4802 UCollationElements *coleiter = strsrch->textIter;
b331163b
A
4803 int32_t *patternce = strsrch->pattern.ces;
4804 int32_t patterncelength = strsrch->pattern.cesLength;
374ca955 4805 int32_t textoffset = ucol_getOffset(coleiter);
729e4ab9 4806 UBool hasPatternAccents =
b75a7d8f 4807 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
729e4ab9 4808
b75a7d8f
A
4809 // shifting it check for setting offset
4810 // if setOffset is called previously or there was no previous match, we
4811 // leave the offset as it is.
4812 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4813 textoffset = strsrch->search->matchedIndex;
4814 }
729e4ab9
A
4815
4816 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4817 patterncelength);
4818 strsrch->canonicalPrefixAccents[0] = 0;
4819 strsrch->canonicalSuffixAccents[0] = 0;
729e4ab9 4820
b75a7d8f
A
4821 while (textoffset >= 0)
4822 {
4823 int32_t patternceindex = 1;
374ca955 4824 int32_t targetce;
b75a7d8f 4825 UBool found = FALSE;
374ca955 4826 int32_t firstce = UCOL_NULLORDER;
b75a7d8f
A
4827
4828 setColEIterOffset(coleiter, textoffset);
46f4442e 4829 for (;;) {
729e4ab9
A
4830 // finding the first pattern ce match, imagine composite
4831 // characters. for example: search for pattern \u0300 in text
4832 // \u00C0, we'll have to skip A first before we get to
b75a7d8f
A
4833 // \u0300 the grave accent
4834 targetce = ucol_next(coleiter, status);
4835 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4836 found = FALSE;
4837 break;
4838 }
4839 targetce = getCE(strsrch, targetce);
4840 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4841 firstce = targetce;
4842 }
729e4ab9
A
4843
4844 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4845 if (targetce == patternce[0]) {
4846 // the first ce can be a contraction
4847 found = TRUE;
4848 break;
4849 }
4850 if (!hasExpansion(coleiter)) {
4851 // checking for accents in composite character
4852 found = FALSE;
4853 break;
4854 }
4855 }
4856
4857 targetce = firstce;
729e4ab9 4858
b75a7d8f
A
4859 while (found && patternceindex < patterncelength) {
4860 targetce = ucol_next(coleiter, status);
4861 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4862 found = FALSE;
4863 break;
4864 }
4865 targetce = getCE(strsrch, targetce);
4866 if (targetce == UCOL_IGNORABLE) {
4867 continue;
4868 }
4869
729e4ab9
A
4870 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4871 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4872 patternceindex ++;
4873 }
4874
4875 // initializing the rearranged accent array
4876 if (hasPatternAccents && !found) {
4877 strsrch->canonicalPrefixAccents[0] = 0;
4878 strsrch->canonicalSuffixAccents[0] = 0;
374ca955 4879 if (U_FAILURE(*status)) {
b75a7d8f
A
4880 break;
4881 }
4882 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4883 }
4884
4885 if (!found) {
374ca955 4886 if (U_FAILURE(*status)) {
b75a7d8f
A
4887 break;
4888 }
729e4ab9 4889 textoffset = reverseShift(strsrch, textoffset, targetce,
b75a7d8f
A
4890 patternceindex);
4891 patternceindex = 0;
4892 continue;
4893 }
4894
4895 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4896 setColEIterOffset(coleiter, textoffset);
4897 return TRUE;
4898 }
4899 }
4900 setMatchNotFound(strsrch);
4901 return FALSE;
46f4442e 4902#else
4388f060
A
4903 int32_t textOffset;
4904
4905 if (strsrch->search->isOverlap) {
4906 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4907 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4908 } else {
4909 // move the start position at the end of possible match
4910 initializePatternPCETable(strsrch, status);
57a6839d
A
4911 if (!initTextProcessedIter(strsrch, status)) {
4912 setMatchNotFound(strsrch);
4913 return FALSE;
4914 }
b331163b 4915 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
57a6839d 4916 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4388f060
A
4917 if (pce == UCOL_PROCESSED_NULLORDER) {
4918 // at the end of the text
4919 break;
4920 }
4921 }
4922 if (U_FAILURE(*status)) {
4923 setMatchNotFound(strsrch);
4924 return FALSE;
4925 }
4926 textOffset = ucol_getOffset(strsrch->textIter);
4927 }
4928 } else {
4929 textOffset = ucol_getOffset(strsrch->textIter);
4930 }
4931
46f4442e
A
4932 int32_t start = -1;
4933 int32_t end = -1;
4934
4935 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4936 strsrch->search->matchedIndex = start;
4937 strsrch->search->matchedLength = end - start;
4938 return TRUE;
4939 } else {
4940 setMatchNotFound(strsrch);
4941 return FALSE;
4942 }
4943#endif
b75a7d8f
A
4944}
4945
4946#endif /* #if !UCONFIG_NO_COLLATION */