]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/usearch.cpp
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / usearch.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**********************************************************************
b331163b 5* Copyright (C) 2001-2015 IBM and others. All rights reserved.
b75a7d8f
A
6**********************************************************************
7* Date Name Description
8* 07/02/2001 synwee Creation.
9**********************************************************************
10*/
11
12#include "unicode/utypes.h"
13
46f4442e 14#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
b75a7d8f
A
15
16#include "unicode/usearch.h"
17#include "unicode/ustring.h"
18#include "unicode/uchar.h"
4388f060 19#include "unicode/utf16.h"
729e4ab9 20#include "normalizer2impl.h"
b75a7d8f
A
21#include "usrchimp.h"
22#include "cmemory.h"
374ca955 23#include "ucln_in.h"
46f4442e 24#include "uassert.h"
729e4ab9 25#include "ustr_imp.h"
3d1f044b
A
26#if U_PLATFORM_IS_DARWIN_BASED
27// Add logging for error conditions as in <rdar://51554495>
28#include <os/log.h>
29#endif
46f4442e
A
30
31U_NAMESPACE_USE
32
33// don't use Boyer-Moore
729e4ab9 34// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
46f4442e 35#define BOYER_MOORE 0
b75a7d8f
A
36
37// internal definition ---------------------------------------------------
38
39#define LAST_BYTE_MASK_ 0xFF
40#define SECOND_LAST_BYTE_SHIFT_ 8
41#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
42
4388f060 43static const Normalizer2Impl *g_nfcImpl = NULL;
b75a7d8f
A
44
45// internal methods -------------------------------------------------
46
47/**
48* Fast collation element iterator setOffset.
49* This function does not check for bounds.
50* @param coleiter collation element iterator
51* @param offset to set
52*/
729e4ab9 53static
b75a7d8f
A
54inline void setColEIterOffset(UCollationElements *elems,
55 int32_t offset)
56{
57a6839d
A
57 // Note: Not "fast" any more after the 2013 collation rewrite.
58 // We do not want to expose more internals than necessary.
59 UErrorCode status = U_ZERO_ERROR;
60 ucol_setOffset(elems, offset, &status);
b75a7d8f
A
61}
62
63/**
64* Getting the mask for collation strength
65* @param strength collation strength
66* @return collation element mask
67*/
68static
729e4ab9 69inline uint32_t getMask(UCollationStrength strength)
b75a7d8f 70{
729e4ab9 71 switch (strength)
b75a7d8f
A
72 {
73 case UCOL_PRIMARY:
74 return UCOL_PRIMARYORDERMASK;
75 case UCOL_SECONDARY:
76 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
77 default:
729e4ab9 78 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
b75a7d8f
A
79 UCOL_PRIMARYORDERMASK;
80 }
81}
82
83/**
2ca993e8
A
84* @param ce 32-bit collation element
85* @return hash code
b75a7d8f
A
86*/
87static
2ca993e8 88inline int hashFromCE32(uint32_t ce)
b75a7d8f 89{
2ca993e8
A
90 int hc = (int)(
91 ((((((ce >> 24) * 37) +
92 (ce >> 16)) * 37) +
93 (ce >> 8)) * 37) +
94 ce);
95 hc %= MAX_TABLE_SIZE_;
96 if (hc < 0) {
97 hc += MAX_TABLE_SIZE_;
98 }
99 return hc;
b75a7d8f
A
100}
101
374ca955
A
102U_CDECL_BEGIN
103static UBool U_CALLCONV
104usearch_cleanup(void) {
4388f060 105 g_nfcImpl = NULL;
374ca955
A
106 return TRUE;
107}
108U_CDECL_END
109
b75a7d8f
A
110/**
111* Initializing the fcd tables.
112* Internal method, status assumed to be a success.
729e4ab9 113* @param status output error if any, caller to check status before calling
b75a7d8f
A
114* method, status assumed to be success when passed in.
115*/
116static
729e4ab9 117inline void initializeFCD(UErrorCode *status)
b75a7d8f 118{
4388f060
A
119 if (g_nfcImpl == NULL) {
120 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
374ca955 121 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
b75a7d8f
A
122 }
123}
124
125/**
126* Gets the fcd value for a character at the argument index.
127* This method takes into accounts of the supplementary characters.
128* @param str UTF16 string where character for fcd retrieval resides
729e4ab9
A
129* @param offset position of the character whose fcd is to be retrieved, to be
130* overwritten with the next character position, taking
b75a7d8f
A
131* surrogate characters into consideration.
132* @param strlength length of the argument string
133* @return fcd value
134*/
135static
729e4ab9 136uint16_t getFCD(const UChar *str, int32_t *offset,
b75a7d8f
A
137 int32_t strlength)
138{
729e4ab9 139 const UChar *temp = str + *offset;
4388f060 140 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
729e4ab9 141 *offset = (int32_t)(temp - str);
b75a7d8f
A
142 return result;
143}
144
145/**
729e4ab9 146* Getting the modified collation elements taking into account the collation
b75a7d8f
A
147* attributes
148* @param strsrch string search data
729e4ab9 149* @param sourcece
b75a7d8f
A
150* @return the modified collation element
151*/
152static
374ca955 153inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
b75a7d8f
A
154{
155 // note for tertiary we can't use the collator->tertiaryMask, that
156 // is a preprocessed mask that takes into account case options. since
157 // we are only concerned with exact matches, we don't need that.
158 sourcece &= strsrch->ceMask;
729e4ab9 159
b75a7d8f
A
160 if (strsrch->toShift) {
161 // alternate handling here, since only the 16 most significant digits
162 // is only used, we can safely do a compare without masking
163 // if the ce is a variable, we mask and get only the primary values
164 // no shifting to quartenary is required since all primary values
165 // less than variabletop will need to be masked off anyway.
166 if (strsrch->variableTop > sourcece) {
729e4ab9 167 if (strsrch->strength >= UCOL_QUATERNARY) {
b75a7d8f
A
168 sourcece &= UCOL_PRIMARYORDERMASK;
169 }
729e4ab9 170 else {
b75a7d8f
A
171 sourcece = UCOL_IGNORABLE;
172 }
173 }
729e4ab9
A
174 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
175 sourcece = 0xFFFF;
b75a7d8f
A
176 }
177
178 return sourcece;
179}
180
729e4ab9 181/**
b75a7d8f
A
182* Allocate a memory and returns NULL if it failed.
183* Internal method, status assumed to be a success.
184* @param size to allocate
729e4ab9 185* @param status output error if any, caller to check status before calling
b75a7d8f
A
186* method, status assumed to be success when passed in.
187* @return newly allocated array, NULL otherwise
188*/
189static
729e4ab9 190inline void * allocateMemory(uint32_t size, UErrorCode *status)
b75a7d8f
A
191{
192 uint32_t *result = (uint32_t *)uprv_malloc(size);
193 if (result == NULL) {
194 *status = U_MEMORY_ALLOCATION_ERROR;
195 }
196 return result;
197}
198
199/**
200* Adds a uint32_t value to a destination array.
729e4ab9 201* Creates a new array if we run out of space. The caller will have to
b75a7d8f 202* manually deallocate the newly allocated array.
729e4ab9
A
203* Internal method, status assumed to be success, caller has to check status
204* before calling this method. destination not to be NULL and has at least
b75a7d8f
A
205* size destinationlength.
206* @param destination target array
207* @param offset destination offset to add value
208* @param destinationlength target array size, return value for the new size
209* @param value to be added
210* @param increments incremental size expected
729e4ab9 211* @param status output error if any, caller to check status before calling
b75a7d8f
A
212* method, status assumed to be success when passed in.
213* @return new destination array, destination if there was no new allocation
214*/
215static
729e4ab9
A
216inline int32_t * addTouint32_tArray(int32_t *destination,
217 uint32_t offset,
218 uint32_t *destinationlength,
374ca955 219 uint32_t value,
729e4ab9
A
220 uint32_t increments,
221 UErrorCode *status)
b75a7d8f
A
222{
223 uint32_t newlength = *destinationlength;
224 if (offset + 1 == newlength) {
225 newlength += increments;
374ca955
A
226 int32_t *temp = (int32_t *)allocateMemory(
227 sizeof(int32_t) * newlength, status);
b75a7d8f
A
228 if (U_FAILURE(*status)) {
229 return NULL;
230 }
a62d09fc 231 uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
b75a7d8f
A
232 *destinationlength = newlength;
233 destination = temp;
234 }
235 destination[offset] = value;
236 return destination;
237}
238
46f4442e
A
239/**
240* Adds a uint64_t value to a destination array.
729e4ab9 241* Creates a new array if we run out of space. The caller will have to
46f4442e 242* manually deallocate the newly allocated array.
729e4ab9
A
243* Internal method, status assumed to be success, caller has to check status
244* before calling this method. destination not to be NULL and has at least
46f4442e
A
245* size destinationlength.
246* @param destination target array
247* @param offset destination offset to add value
248* @param destinationlength target array size, return value for the new size
249* @param value to be added
250* @param increments incremental size expected
729e4ab9 251* @param status output error if any, caller to check status before calling
46f4442e
A
252* method, status assumed to be success when passed in.
253* @return new destination array, destination if there was no new allocation
254*/
255static
729e4ab9
A
256inline int64_t * addTouint64_tArray(int64_t *destination,
257 uint32_t offset,
258 uint32_t *destinationlength,
46f4442e 259 uint64_t value,
729e4ab9
A
260 uint32_t increments,
261 UErrorCode *status)
46f4442e
A
262{
263 uint32_t newlength = *destinationlength;
264 if (offset + 1 == newlength) {
265 newlength += increments;
266 int64_t *temp = (int64_t *)allocateMemory(
267 sizeof(int64_t) * newlength, status);
729e4ab9 268
46f4442e
A
269 if (U_FAILURE(*status)) {
270 return NULL;
271 }
272
a62d09fc 273 uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
46f4442e
A
274 *destinationlength = newlength;
275 destination = temp;
276 }
277
278 destination[offset] = value;
279
280 return destination;
281}
282
b75a7d8f
A
283/**
284* Initializing the ce table for a pattern.
285* Stores non-ignorable collation keys.
729e4ab9
A
286* Table size will be estimated by the size of the pattern text. Table
287* expansion will be perform as we go along. Adding 1 to ensure that the table
b75a7d8f
A
288* size definitely increases.
289* Internal method, status assumed to be a success.
290* @param strsrch string search data
729e4ab9 291* @param status output error if any, caller to check status before calling
b75a7d8f 292* method, status assumed to be success when passed in.
729e4ab9 293* @return total number of expansions
b75a7d8f
A
294*/
295static
729e4ab9 296inline uint16_t initializePatternCETable(UStringSearch *strsrch,
b75a7d8f
A
297 UErrorCode *status)
298{
299 UPattern *pattern = &(strsrch->pattern);
300 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
b331163b 301 int32_t *cetable = pattern->cesBuffer;
b75a7d8f
A
302 uint32_t patternlength = pattern->textLength;
303 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 304
b75a7d8f 305 if (coleiter == NULL) {
729e4ab9 306 coleiter = ucol_openElements(strsrch->collator, pattern->text,
b75a7d8f 307 patternlength, status);
729e4ab9
A
308 // status will be checked in ucol_next(..) later and if it is an
309 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
b75a7d8f
A
310 // returned.
311 strsrch->utilIter = coleiter;
312 }
313 else {
57a6839d 314 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
729e4ab9
A
315 }
316 if(U_FAILURE(*status)) {
317 return 0;
b75a7d8f 318 }
729e4ab9 319
b331163b
A
320 if (pattern->ces != cetable && pattern->ces) {
321 uprv_free(pattern->ces);
b75a7d8f 322 }
729e4ab9 323
b75a7d8f
A
324 uint16_t offset = 0;
325 uint16_t result = 0;
374ca955 326 int32_t ce;
b75a7d8f
A
327
328 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
329 U_SUCCESS(*status)) {
330 uint32_t newce = getCE(strsrch, ce);
331 if (newce) {
729e4ab9 332 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
b75a7d8f 333 newce,
729e4ab9 334 patternlength - ucol_getOffset(coleiter) + 1,
b75a7d8f
A
335 status);
336 if (U_FAILURE(*status)) {
337 return 0;
338 }
339 offset ++;
b331163b 340 if (cetable != temp && cetable != pattern->cesBuffer) {
b75a7d8f
A
341 uprv_free(cetable);
342 }
343 cetable = temp;
344 }
345 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
346 }
347
348 cetable[offset] = 0;
b331163b
A
349 pattern->ces = cetable;
350 pattern->cesLength = offset;
b75a7d8f
A
351
352 return result;
353}
354
46f4442e
A
355/**
356* Initializing the pce table for a pattern.
357* Stores non-ignorable collation keys.
729e4ab9
A
358* Table size will be estimated by the size of the pattern text. Table
359* expansion will be perform as we go along. Adding 1 to ensure that the table
46f4442e
A
360* size definitely increases.
361* Internal method, status assumed to be a success.
362* @param strsrch string search data
729e4ab9 363* @param status output error if any, caller to check status before calling
46f4442e 364* method, status assumed to be success when passed in.
729e4ab9 365* @return total number of expansions
46f4442e
A
366*/
367static
729e4ab9 368inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
46f4442e
A
369 UErrorCode *status)
370{
371 UPattern *pattern = &(strsrch->pattern);
372 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
b331163b 373 int64_t *pcetable = pattern->pcesBuffer;
46f4442e
A
374 uint32_t patternlength = pattern->textLength;
375 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 376
46f4442e 377 if (coleiter == NULL) {
729e4ab9 378 coleiter = ucol_openElements(strsrch->collator, pattern->text,
46f4442e 379 patternlength, status);
729e4ab9
A
380 // status will be checked in ucol_next(..) later and if it is an
381 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
46f4442e
A
382 // returned.
383 strsrch->utilIter = coleiter;
384 } else {
57a6839d 385 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
729e4ab9
A
386 }
387 if(U_FAILURE(*status)) {
388 return 0;
46f4442e 389 }
729e4ab9 390
b331163b
A
391 if (pattern->pces != pcetable && pattern->pces != NULL) {
392 uprv_free(pattern->pces);
46f4442e 393 }
729e4ab9 394
46f4442e
A
395 uint16_t offset = 0;
396 uint16_t result = 0;
397 int64_t pce;
398
57a6839d 399 icu::UCollationPCE iter(coleiter);
46f4442e
A
400
401 // ** Should processed CEs be signed or unsigned?
729e4ab9 402 // ** (the rest of the code in this file seems to play fast-and-loose with
46f4442e 403 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
57a6839d 404 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
46f4442e 405 U_SUCCESS(*status)) {
729e4ab9 406 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
46f4442e 407 pce,
729e4ab9 408 patternlength - ucol_getOffset(coleiter) + 1,
46f4442e
A
409 status);
410
411 if (U_FAILURE(*status)) {
412 return 0;
413 }
414
415 offset += 1;
416
b331163b 417 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
46f4442e
A
418 uprv_free(pcetable);
419 }
420
421 pcetable = temp;
422 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
423 }
424
425 pcetable[offset] = 0;
b331163b
A
426 pattern->pces = pcetable;
427 pattern->pcesLength = offset;
46f4442e
A
428
429 return result;
430}
431
b75a7d8f
A
432/**
433* Initializes the pattern struct.
434* Internal method, status assumed to be success.
435* @param strsrch UStringSearch data storage
729e4ab9 436* @param status output error if any, caller to check status before calling
b75a7d8f
A
437* method, status assumed to be success when passed in.
438* @return expansionsize the total expansion size of the pattern
729e4ab9 439*/
b75a7d8f 440static
729e4ab9 441inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
b75a7d8f 442{
57a6839d 443 if (U_FAILURE(*status)) { return 0; }
b75a7d8f
A
444 UPattern *pattern = &(strsrch->pattern);
445 const UChar *patterntext = pattern->text;
446 int32_t length = pattern->textLength;
447 int32_t index = 0;
729e4ab9 448
46f4442e
A
449 // Since the strength is primary, accents are ignored in the pattern.
450 if (strsrch->strength == UCOL_PRIMARY) {
4388f060
A
451 pattern->hasPrefixAccents = 0;
452 pattern->hasSuffixAccents = 0;
46f4442e 453 } else {
4388f060
A
454 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
455 SECOND_LAST_BYTE_SHIFT_;
456 index = length;
457 U16_BACK_1(patterntext, 0, index);
458 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
459 LAST_BYTE_MASK_;
46f4442e
A
460 }
461
462 // ** HACK **
b331163b
A
463 if (strsrch->pattern.pces != NULL) {
464 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
465 uprv_free(strsrch->pattern.pces);
46f4442e
A
466 }
467
b331163b 468 strsrch->pattern.pces = NULL;
46f4442e 469 }
b75a7d8f 470
b75a7d8f 471 // since intializePattern is an internal method status is a success.
729e4ab9 472 return initializePatternCETable(strsrch, status);
b75a7d8f
A
473}
474
475/**
476* Initializing shift tables, with the default values.
477* If a corresponding default value is 0, the shift table is not set.
729e4ab9 478* @param shift table for forwards shift
b75a7d8f
A
479* @param backshift table for backwards shift
480* @param cetable table containing pattern ce
481* @param cesize size of the pattern ces
482* @param expansionsize total size of the expansions
483* @param defaultforward the default forward value
484* @param defaultbackward the default backward value
485*/
486static
729e4ab9
A
487inline void setShiftTable(int16_t shift[], int16_t backshift[],
488 int32_t *cetable, int32_t cesize,
b75a7d8f
A
489 int16_t expansionsize,
490 int16_t defaultforward,
491 int16_t defaultbackward)
492{
729e4ab9 493 // estimate the value to shift. to do that we estimate the smallest
b75a7d8f 494 // number of characters to give the relevant ces, ie approximately
729e4ab9 495 // the number of ces minus their expansion, since expansions can come
b75a7d8f
A
496 // from a character.
497 int32_t count;
498 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
499 shift[count] = defaultforward;
500 }
501 cesize --; // down to the last index
502 for (count = 0; count < cesize; count ++) {
503 // number of ces from right of array to the count
504 int temp = defaultforward - count - 1;
3d1f044b 505 shift[hashFromCE32(cetable[count])] = temp > 1 ? static_cast<int16_t>(temp) : 1;
b75a7d8f 506 }
2ca993e8 507 shift[hashFromCE32(cetable[cesize])] = 1;
b75a7d8f 508 // for ignorables we just shift by one. see test examples.
2ca993e8 509 shift[hashFromCE32(0)] = 1;
729e4ab9 510
b75a7d8f
A
511 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
512 backshift[count] = defaultbackward;
513 }
514 for (count = cesize; count > 0; count --) {
515 // the original value count does not seem to work
2ca993e8 516 backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
b75a7d8f
A
517 (int16_t)(count - expansionsize) : 1;
518 }
2ca993e8
A
519 backshift[hashFromCE32(cetable[0])] = 1;
520 backshift[hashFromCE32(0)] = 1;
b75a7d8f
A
521}
522
523/**
524* Building of the pattern collation element list and the boyer moore strsrch
525* table.
526* The canonical match will only be performed after the default match fails.
527* For both cases we need to remember the size of the composed and decomposed
528* versions of the string. Since the Boyer-Moore shift calculations shifts by
729e4ab9
A
529* a number of characters in the text and tries to match the pattern from that
530* offset, the shift value can not be too large in case we miss some
531* characters. To choose a right shift size, we estimate the NFC form of the
532* and use its size as a shift guide. The NFC form should be the small
b75a7d8f
A
533* possible representation of the pattern. Anyways, we'll err on the smaller
534* shift size. Hence the calculation for minlength.
729e4ab9
A
535* Canonical match will be performed slightly differently. We'll split the
536* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
537* the first and last base character (MS), the ending accents (EA). Matches
b75a7d8f
A
538* will be done on MS first, and only when we match MS then some processing
539* will be required for the prefix and end accents in order to determine if
729e4ab9
A
540* they match PA and EA. Hence the default shift values
541* for the canonical match will take the size of either end's accent into
b75a7d8f
A
542* consideration. Forwards search will take the end accents into consideration
543* for the default shift values and the backwards search will take the prefix
544* accents into consideration.
545* If pattern has no non-ignorable ce, we return a illegal argument error.
546* Internal method, status assumed to be success.
547* @param strsrch UStringSearch data storage
548* @param status for output errors if it occurs, status is assumed to be a
549* success when it is passed in.
729e4ab9 550*/
b75a7d8f 551static
729e4ab9 552inline void initialize(UStringSearch *strsrch, UErrorCode *status)
b75a7d8f 553{
729e4ab9 554 int16_t expandlength = initializePattern(strsrch, status);
b331163b 555 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
b75a7d8f 556 UPattern *pattern = &strsrch->pattern;
b331163b 557 int32_t cesize = pattern->cesLength;
b75a7d8f 558
729e4ab9 559 int16_t minlength = cesize > expandlength
374ca955 560 ? (int16_t)cesize - expandlength : 1;
b75a7d8f 561 pattern->defaultShiftSize = minlength;
b331163b 562 setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
b75a7d8f
A
563 cesize, expandlength, minlength, minlength);
564 return;
565 }
566 strsrch->pattern.defaultShiftSize = 0;
567}
568
46f4442e
A
569#if BOYER_MOORE
570/**
729e4ab9 571* Check to make sure that the match length is at the end of the character by
46f4442e 572* using the breakiterator.
729e4ab9 573* @param strsrch string search data
46f4442e
A
574* @param start target text start offset
575* @param end target text end offset
576*/
577static
729e4ab9 578void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
46f4442e
A
579 int32_t *end)
580{
581#if !UCONFIG_NO_BREAK_ITERATION
582 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
583 if (breakiterator) {
4388f060
A
584 int32_t matchend = *end;
585 //int32_t matchstart = *start;
729e4ab9 586
4388f060
A
587 if (!ubrk_isBoundary(breakiterator, matchend)) {
588 *end = ubrk_following(breakiterator, matchend);
46f4442e 589 }
729e4ab9 590
4388f060
A
591 /* Check the start of the matched text to make sure it doesn't have any accents
592 * before it. This code may not be necessary and so it is commented out */
593 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
594 *start = ubrk_preceding(breakiterator, matchstart);
595 }*/
46f4442e
A
596 }
597#endif
598}
599
b75a7d8f 600/**
729e4ab9
A
601* Determine whether the target text in UStringSearch bounded by the offset
602* start and end is one or more whole units of text as
b75a7d8f 603* determined by the breakiterator in UStringSearch.
729e4ab9 604* @param strsrch string search data
b75a7d8f
A
605* @param start target text start offset
606* @param end target text end offset
607*/
608static
729e4ab9 609UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
b75a7d8f
A
610 int32_t end)
611{
612#if !UCONFIG_NO_BREAK_ITERATION
613 UBreakIterator *breakiterator = strsrch->search->breakIter;
46f4442e 614 //TODO: Add here.
b75a7d8f
A
615 if (breakiterator) {
616 int32_t startindex = ubrk_first(breakiterator);
617 int32_t endindex = ubrk_last(breakiterator);
729e4ab9 618
b75a7d8f
A
619 // out-of-range indexes are never boundary positions
620 if (start < startindex || start > endindex ||
621 end < startindex || end > endindex) {
622 return FALSE;
623 }
729e4ab9
A
624 // otherwise, we can use following() on the position before the
625 // specified one and return true of the position we get back is the
b75a7d8f 626 // one the user specified
729e4ab9
A
627 UBool result = (start == startindex ||
628 ubrk_following(breakiterator, start - 1) == start) &&
629 (end == endindex ||
b75a7d8f
A
630 ubrk_following(breakiterator, end - 1) == end);
631 if (result) {
632 // iterates the individual ces
633 UCollationElements *coleiter = strsrch->utilIter;
729e4ab9 634 const UChar *text = strsrch->search->text +
b75a7d8f
A
635 start;
636 UErrorCode status = U_ZERO_ERROR;
637 ucol_setText(coleiter, text, end - start, &status);
b331163b 638 for (int32_t count = 0; count < strsrch->pattern.cesLength;
b75a7d8f 639 count ++) {
374ca955 640 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
641 if (ce == UCOL_IGNORABLE) {
642 count --;
643 continue;
644 }
b331163b 645 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
b75a7d8f
A
646 return FALSE;
647 }
648 }
374ca955 649 int32_t nextce = ucol_next(coleiter, &status);
b75a7d8f
A
650 while (ucol_getOffset(coleiter) == (end - start)
651 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
652 nextce = ucol_next(coleiter, &status);
653 }
654 if (ucol_getOffset(coleiter) == (end - start)
655 && nextce != UCOL_NULLORDER) {
656 // extra collation elements at the end of the match
657 return FALSE;
658 }
659 }
660 return result;
661 }
662#endif
663 return TRUE;
664}
665
666/**
729e4ab9
A
667* Getting the next base character offset if current offset is an accent,
668* or the current offset if the current character contains a base character.
b75a7d8f
A
669* accents the following base character will be returned
670* @param text string
671* @param textoffset current offset
672* @param textlength length of text string
673* @return the next base character or the current offset
674* if the current character is contains a base character.
675*/
676static
729e4ab9 677inline int32_t getNextBaseOffset(const UChar *text,
b75a7d8f
A
678 int32_t textoffset,
679 int32_t textlength)
680{
681 if (textoffset < textlength) {
682 int32_t temp = textoffset;
683 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
729e4ab9 684 while (temp < textlength) {
b75a7d8f 685 int32_t result = temp;
729e4ab9 686 if ((getFCD(text, &temp, textlength) >>
b75a7d8f
A
687 SECOND_LAST_BYTE_SHIFT_) == 0) {
688 return result;
689 }
690 }
691 return textlength;
692 }
693 }
694 return textoffset;
695}
696
697/**
698* Gets the next base character offset depending on the string search pattern
699* data
700* @param strsrch string search data
701* @param textoffset current offset, one offset away from the last character
702* to search for.
703* @return start index of the next base character or the current offset
704* if the current character is contains a base character.
705*/
706static
729e4ab9 707inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
b75a7d8f
A
708 int32_t textoffset)
709{
374ca955 710 int32_t textlength = strsrch->search->textLength;
729e4ab9 711 if (strsrch->pattern.hasSuffixAccents &&
b75a7d8f
A
712 textoffset < textlength) {
713 int32_t temp = textoffset;
714 const UChar *text = strsrch->search->text;
4388f060 715 U16_BACK_1(text, 0, temp);
b75a7d8f
A
716 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
717 return getNextBaseOffset(text, textoffset, textlength);
718 }
719 }
720 return textoffset;
721}
722
723/**
724* Shifting the collation element iterator position forward to prepare for
725* a following match. If the last character is a unsafe character, we'll only
726* shift by 1 to capture contractions, normalization etc.
727* Internal method, status assumed to be success.
728* @param text strsrch string search data
729* @param textoffset start text position to do search
730* @param ce the text ce which failed the match.
731* @param patternceindex index of the ce within the pattern ce buffer which
732* failed the match
733* @return final offset
734*/
735static
736inline int32_t shiftForward(UStringSearch *strsrch,
737 int32_t textoffset,
374ca955 738 int32_t ce,
b75a7d8f
A
739 int32_t patternceindex)
740{
374ca955 741 UPattern *pattern = &(strsrch->pattern);
b75a7d8f 742 if (ce != UCOL_NULLORDER) {
2ca993e8 743 int32_t shift = pattern->shift[hashFromCE32(ce)];
729e4ab9 744 // this is to adjust for characters in the middle of the
b75a7d8f 745 // substring for matching that failed.
b331163b 746 int32_t adjust = pattern->cesLength - patternceindex;
b75a7d8f
A
747 if (adjust > 1 && shift >= adjust) {
748 shift -= adjust - 1;
749 }
750 textoffset += shift;
751 }
752 else {
753 textoffset += pattern->defaultShiftSize;
754 }
729e4ab9 755
b75a7d8f
A
756 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
757 // check for unsafe characters
729e4ab9 758 // * if it is the start or middle of a contraction: to be done after
b75a7d8f
A
759 // a initial match is found
760 // * thai or lao base consonant character: similar to contraction
761 // * high surrogate character: similar to contraction
762 // * next character is a accent: shift to the next base character
763 return textoffset;
764}
46f4442e 765#endif // #if BOYER_MOORE
b75a7d8f
A
766
767/**
729e4ab9 768* sets match not found
b75a7d8f
A
769* @param strsrch string search data
770*/
771static
729e4ab9 772inline void setMatchNotFound(UStringSearch *strsrch)
b75a7d8f
A
773{
774 // this method resets the match result regardless of the error status.
775 strsrch->search->matchedIndex = USEARCH_DONE;
776 strsrch->search->matchedLength = 0;
777 if (strsrch->search->isForwardSearching) {
778 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
779 }
780 else {
781 setColEIterOffset(strsrch->textIter, 0);
782 }
783}
784
46f4442e 785#if BOYER_MOORE
b75a7d8f
A
786/**
787* Gets the offset to the next safe point in text.
788* ie. not the middle of a contraction, swappable characters or supplementary
789* characters.
790* @param collator collation sata
791* @param text string to work with
792* @param textoffset offset in string
793* @param textlength length of text string
794* @return offset to the next safe character
795*/
796static
729e4ab9 797inline int32_t getNextSafeOffset(const UCollator *collator,
b75a7d8f
A
798 const UChar *text,
799 int32_t textoffset,
800 int32_t textlength)
801{
802 int32_t result = textoffset; // first contraction character
803 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
804 result ++;
805 }
729e4ab9 806 return result;
b75a7d8f
A
807}
808
729e4ab9 809/**
b75a7d8f
A
810* This checks for accents in the potential match started with a .
811* composite character.
729e4ab9
A
812* This is really painful... we have to check that composite character do not
813* have any extra accents. We have to normalize the potential match and find
b75a7d8f 814* the immediate decomposed character before the match.
729e4ab9 815* The first composite character would have been taken care of by the fcd
b75a7d8f 816* checks in checkForwardExactMatch.
729e4ab9
A
817* This is the slow path after the fcd of the first character and
818* the last character has been checked by checkForwardExactMatch and we
b75a7d8f
A
819* determine that the potential match has extra non-ignorable preceding
820* ces.
729e4ab9 821* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
b75a7d8f
A
822* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
823* Note here that accents checking are slow and cautioned in the API docs.
824* Internal method, status assumed to be a success, caller should check status
825* before calling this method
826* @param strsrch string search data
827* @param start index of the potential unfriendly composite character
828* @param end index of the potential unfriendly composite character
829* @param status output error status if any.
830* @return TRUE if there is non-ignorable accents before at the beginning
831* of the match, FALSE otherwise.
832*/
833
834static
835UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
729e4ab9 836 int32_t end,
b75a7d8f
A
837 UErrorCode *status)
838{
839 UBool result = FALSE;
840 if (strsrch->pattern.hasPrefixAccents) {
841 int32_t length = end - start;
842 int32_t offset = 0;
843 const UChar *text = strsrch->search->text + start;
729e4ab9 844
4388f060 845 U16_FWD_1(text, offset, length);
b75a7d8f
A
846 // we are only concerned with the first composite character
847 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
729e4ab9 848 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
b75a7d8f
A
849 text, 0, length);
850 if (safeoffset != length) {
851 safeoffset ++;
852 }
853 UChar *norm = NULL;
854 UChar buffer[INITIAL_ARRAY_SIZE_];
729e4ab9
A
855 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
856 buffer, INITIAL_ARRAY_SIZE_,
857 status);
b75a7d8f
A
858 if (U_FAILURE(*status)) {
859 return FALSE;
860 }
861 if (size >= INITIAL_ARRAY_SIZE_) {
862 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
863 status);
729e4ab9 864 // if allocation failed, status will be set to
b75a7d8f
A
865 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
866 // checks for it.
729e4ab9 867 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
b75a7d8f
A
868 size, status);
869 if (U_FAILURE(*status) && norm != NULL) {
870 uprv_free(norm);
871 return FALSE;
872 }
873 }
874 else {
875 norm = buffer;
876 }
877
878 UCollationElements *coleiter = strsrch->utilIter;
879 ucol_setText(coleiter, norm, size, status);
b331163b 880 uint32_t firstce = strsrch->pattern.ces[0];
b75a7d8f
A
881 UBool ignorable = TRUE;
882 uint32_t ce = UCOL_IGNORABLE;
46f4442e 883 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
b75a7d8f
A
884 offset = ucol_getOffset(coleiter);
885 if (ce != firstce && ce != UCOL_IGNORABLE) {
886 ignorable = FALSE;
887 }
888 ce = ucol_next(coleiter, status);
889 }
890 UChar32 codepoint;
4388f060 891 U16_PREV(norm, 0, offset, codepoint);
b75a7d8f
A
892 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
893
894 if (norm != buffer) {
895 uprv_free(norm);
896 }
897 }
898 }
899
900 return result;
901}
902
903/**
729e4ab9 904* Used by exact matches, checks if there are accents before the match.
b75a7d8f 905* This is really painful... we have to check that composite characters at
729e4ab9
A
906* the start of the matches have to not have any extra accents.
907* We check the FCD of the character first, if it starts with an accent and
b75a7d8f 908* the first pattern ce does not match the first ce of the character, we bail.
729e4ab9
A
909* Otherwise we try normalizing the first composite
910* character and find the immediate decomposed character before the match to
b75a7d8f 911* see if it is an non-ignorable accent.
729e4ab9
A
912* Now normalizing the first composite character is enough because we ensure
913* that when the match is passed in here with extra beginning ces, the
b75a7d8f 914* first or last ce that match has to occur within the first character.
729e4ab9 915* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
b75a7d8f
A
916* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
917* Note here that accents checking are slow and cautioned in the API docs.
918* @param strsrch string search data
729e4ab9 919* @param start offset
b75a7d8f 920* @param end offset
729e4ab9 921* @return TRUE if there are accents on either side of the match,
b75a7d8f
A
922* FALSE otherwise
923*/
924static
925UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
729e4ab9 926 int32_t end)
b75a7d8f
A
927{
928 if (strsrch->pattern.hasPrefixAccents) {
929 UCollationElements *coleiter = strsrch->textIter;
930 UErrorCode status = U_ZERO_ERROR;
931 // we have been iterating forwards previously
932 uint32_t ignorable = TRUE;
b331163b 933 int32_t firstce = strsrch->pattern.ces[0];
b75a7d8f 934
374ca955
A
935 setColEIterOffset(coleiter, start);
936 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
937 if (U_FAILURE(status)) {
b75a7d8f
A
938 return TRUE;
939 }
940 while (ce != firstce) {
941 if (ce != UCOL_IGNORABLE) {
942 ignorable = FALSE;
943 }
944 ce = getCE(strsrch, ucol_next(coleiter, &status));
46f4442e 945 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
b75a7d8f
A
946 return TRUE;
947 }
948 }
374ca955 949 if (!ignorable && inNormBuf(coleiter)) {
b75a7d8f 950 // within normalization buffer, discontiguous handled here
374ca955 951 return TRUE;
b75a7d8f
A
952 }
953
374ca955 954 // within text
b75a7d8f 955 int32_t temp = start;
374ca955 956 // original code
729e4ab9
A
957 // accent = (getFCD(strsrch->search->text, &temp,
958 // strsrch->search->textLength)
959 // >> SECOND_LAST_BYTE_SHIFT_);
374ca955 960 // however this code does not work well with VC7 .net in release mode.
729e4ab9 961 // maybe the inlines for getFCD combined with shifting has bugs in
374ca955 962 // VC7. anyways this is a work around.
729e4ab9 963 UBool accent = getFCD(strsrch->search->text, &temp,
b75a7d8f
A
964 strsrch->search->textLength) > 0xFF;
965 if (!accent) {
374ca955 966 return checkExtraMatchAccents(strsrch, start, end, &status);
b75a7d8f 967 }
374ca955 968 if (!ignorable) {
b75a7d8f
A
969 return TRUE;
970 }
971 if (start > 0) {
972 temp = start;
4388f060 973 U16_BACK_1(strsrch->search->text, 0, temp);
729e4ab9 974 if (getFCD(strsrch->search->text, &temp,
b75a7d8f
A
975 strsrch->search->textLength) & LAST_BYTE_MASK_) {
976 setColEIterOffset(coleiter, start);
977 ce = ucol_previous(coleiter, &status);
729e4ab9 978 if (U_FAILURE(status) ||
b75a7d8f
A
979 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
980 return TRUE;
981 }
982 }
983 }
984 }
729e4ab9 985
b75a7d8f
A
986 return FALSE;
987}
988
989/**
990* Used by exact matches, checks if there are accents bounding the match.
991* Note this is the initial boundary check. If the potential match
992* starts or ends with composite characters, the accents in those
993* characters will be determined later.
729e4ab9 994* Not doing backwards iteration here, since discontiguos contraction for
b75a7d8f 995* backwards collation element iterator, use up too many characters.
729e4ab9 996* E.g. looking for \u030A ring in \u01FA A ring above and acute,
b75a7d8f
A
997* should fail since there is a acute at the end of \u01FA
998* Note here that accents checking are slow and cautioned in the API docs.
999* @param strsrch string search data
1000* @param start offset of match
1001* @param end end offset of the match
729e4ab9 1002* @return TRUE if there are accents on either side of the match,
b75a7d8f
A
1003* FALSE otherwise
1004*/
1005static
729e4ab9
A
1006UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1007 int32_t end)
b75a7d8f
A
1008{
1009 if (strsrch->pattern.hasSuffixAccents) {
1010 const UChar *text = strsrch->search->text;
1011 int32_t temp = end;
1012 int32_t textlength = strsrch->search->textLength;
4388f060 1013 U16_BACK_1(text, 0, temp);
b75a7d8f 1014 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
b331163b 1015 int32_t firstce = strsrch->pattern.ces[0];
b75a7d8f
A
1016 UCollationElements *coleiter = strsrch->textIter;
1017 UErrorCode status = U_ZERO_ERROR;
4388f060 1018 int32_t ce;
b75a7d8f 1019 setColEIterOffset(coleiter, start);
46f4442e
A
1020 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1021 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
b75a7d8f
A
1022 return TRUE;
1023 }
1024 }
1025 int32_t count = 1;
b331163b 1026 while (count < strsrch->pattern.cesLength) {
729e4ab9 1027 if (getCE(strsrch, ucol_next(coleiter, &status))
b75a7d8f
A
1028 == UCOL_IGNORABLE) {
1029 // Thai can give an ignorable here.
1030 count --;
1031 }
1032 if (U_FAILURE(status)) {
1033 return TRUE;
1034 }
1035 count ++;
1036 }
729e4ab9 1037
4388f060 1038 ce = ucol_next(coleiter, &status);
b75a7d8f
A
1039 if (U_FAILURE(status)) {
1040 return TRUE;
1041 }
46f4442e 1042 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
4388f060 1043 ce = getCE(strsrch, ce);
46f4442e 1044 }
b75a7d8f
A
1045 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1046 if (ucol_getOffset(coleiter) <= end) {
1047 return TRUE;
1048 }
1049 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1050 return TRUE;
1051 }
1052 }
1053 }
1054 }
1055 return FALSE;
1056}
46f4442e 1057#endif // #if BOYER_MOORE
b75a7d8f
A
1058
1059/**
1060* Checks if the offset runs out of the text string
729e4ab9 1061* @param offset
b75a7d8f
A
1062* @param textlength of the text string
1063* @return TRUE if offset is out of bounds, FALSE otherwise
1064*/
1065static
1066inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1067{
1068 return offset < 0 || offset > textlength;
1069}
1070
1071/**
1072* Checks for identical match
1073* @param strsrch string search data
1074* @param start offset of possible match
1075* @param end offset of possible match
1076* @return TRUE if identical match is found
1077*/
1078static
729e4ab9
A
1079inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1080 int32_t end)
b75a7d8f 1081{
b75a7d8f
A
1082 if (strsrch->strength != UCOL_IDENTICAL) {
1083 return TRUE;
1084 }
1085
729e4ab9
A
1086 // Note: We could use Normalizer::compare() or similar, but for short strings
1087 // which may not be in FCD it might be faster to just NFD them.
1088 UErrorCode status = U_ZERO_ERROR;
1089 UnicodeString t2, p2;
1090 strsrch->nfd->normalize(
1091 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1092 strsrch->nfd->normalize(
1093 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
73c04bcf 1094 // return FALSE if NFD failed
729e4ab9 1095 return U_SUCCESS(status) && t2 == p2;
b75a7d8f
A
1096}
1097
729e4ab9 1098#if BOYER_MOORE
b75a7d8f
A
1099/**
1100* Checks to see if the match is repeated
1101* @param strsrch string search data
1102* @param start new match start index
1103* @param end new match end index
1104* @return TRUE if the the match is repeated, FALSE otherwise
1105*/
1106static
1107inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1108 int32_t start,
1109 int32_t end)
1110{
1111 int32_t lastmatchindex = strsrch->search->matchedIndex;
1112 UBool result;
1113 if (lastmatchindex == USEARCH_DONE) {
1114 return FALSE;
1115 }
1116 if (strsrch->search->isForwardSearching) {
1117 result = start <= lastmatchindex;
1118 }
1119 else {
1120 result = start >= lastmatchindex;
1121 }
374ca955 1122 if (!result && !strsrch->search->isOverlap) {
b75a7d8f
A
1123 if (strsrch->search->isForwardSearching) {
1124 result = start < lastmatchindex + strsrch->search->matchedLength;
1125 }
1126 else {
1127 result = end > lastmatchindex;
1128 }
1129 }
1130 return result;
1131}
1132
1133/**
1134* Gets the collation element iterator's current offset.
1135* @param coleiter collation element iterator
1136* @param forwards flag TRUE if we are moving in th forwards direction
729e4ab9 1137* @return current offset
b75a7d8f
A
1138*/
1139static
1140inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1141 UBool forwards)
1142{
1143 int32_t result = ucol_getOffset(coleiter);
1144 // intricacies of the the backwards collation element iterator
46f4442e 1145 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
b75a7d8f
A
1146 result ++;
1147 }
1148 return result;
1149}
1150
1151/**
729e4ab9 1152* Checks match for contraction.
b75a7d8f
A
1153* If the match ends with a partial contraction we fail.
1154* If the match starts too far off (because of backwards iteration) we try to
1155* chip off the extra characters depending on whether a breakiterator has
1156* been used.
729e4ab9 1157* Internal method, error assumed to be success, caller has to check status
b75a7d8f
A
1158* before calling this method.
1159* @param strsrch string search data
1160* @param start offset of potential match, to be modified if necessary
1161* @param end offset of potential match, to be modified if necessary
1162* @param status output error status if any
1163* @return TRUE if match passes the contraction test, FALSE otherwise
1164*/
1165
1166static
729e4ab9
A
1167UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1168 int32_t *start,
1169 int32_t *end, UErrorCode *status)
b75a7d8f
A
1170{
1171 UCollationElements *coleiter = strsrch->textIter;
1172 int32_t textlength = strsrch->search->textLength;
46f4442e 1173 int32_t temp = *start;
b75a7d8f
A
1174 const UCollator *collator = strsrch->collator;
1175 const UChar *text = strsrch->search->text;
729e4ab9 1176 // This part checks if either ends of the match contains potential
b75a7d8f 1177 // contraction. If so we'll have to iterate through them
374ca955
A
1178 // The start contraction needs to be checked since ucol_previous dumps
1179 // all characters till the first safe character into the buffer.
729e4ab9
A
1180 // *start + 1 is used to test for the unsafe characters instead of *start
1181 // because ucol_prev takes all unsafe characters till the first safe
1182 // character ie *start. so by testing *start + 1, we can estimate if
1183 // excess prefix characters has been included in the potential search
374ca955 1184 // results.
729e4ab9
A
1185 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1186 (*start + 1 < textlength
b75a7d8f
A
1187 && ucol_unsafeCP(text[*start + 1], collator))) {
1188 int32_t expansion = getExpansionPrefix(coleiter);
1189 UBool expandflag = expansion > 0;
1190 setColEIterOffset(coleiter, *start);
1191 while (expansion > 0) {
1192 // getting rid of the redundant ce, caused by setOffset.
729e4ab9
A
1193 // since backward contraction/expansion may have extra ces if we
1194 // are in the normalization buffer, hasAccentsBeforeMatch would
b75a7d8f
A
1195 // have taken care of it.
1196 // E.g. the character \u01FA will have an expansion of 3, but if
1197 // we are only looking for acute and ring \u030A and \u0301, we'll
1198 // have to skip the first ce in the expansion buffer.
1199 ucol_next(coleiter, status);
374ca955
A
1200 if (U_FAILURE(*status)) {
1201 return FALSE;
1202 }
b75a7d8f
A
1203 if (ucol_getOffset(coleiter) != temp) {
1204 *start = temp;
1205 temp = ucol_getOffset(coleiter);
1206 }
1207 expansion --;
1208 }
1209
b331163b
A
1210 int32_t *patternce = strsrch->pattern.ces;
1211 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
1212 int32_t count = 0;
1213 while (count < patterncelength) {
374ca955 1214 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
b75a7d8f
A
1215 if (ce == UCOL_IGNORABLE) {
1216 continue;
1217 }
1218 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1219 *start = temp;
1220 temp = ucol_getOffset(coleiter);
1221 }
1222 if (U_FAILURE(*status) || ce != patternce[count]) {
1223 (*end) ++;
729e4ab9 1224 *end = getNextUStringSearchBaseOffset(strsrch, *end);
b75a7d8f
A
1225 return FALSE;
1226 }
1227 count ++;
1228 }
729e4ab9 1229 }
b75a7d8f
A
1230 return TRUE;
1231}
1232
1233/**
1234* Checks and sets the match information if found.
729e4ab9 1235* Checks
b75a7d8f
A
1236* <ul>
1237* <li> the potential match does not repeat the previous match
1238* <li> boundaries are correct
1239* <li> exact matches has no extra accents
1240* <li> identical matchesb
1241* <li> potential match does not end in the middle of a contraction
1242* <\ul>
1243* Otherwise the offset will be shifted to the next character.
729e4ab9 1244* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1245* before calling this method.
1246* @param strsrch string search data
1247* @param textoffset offset in the collation element text. the returned value
729e4ab9 1248* will be the truncated end offset of the match or the new start
b75a7d8f
A
1249* search offset.
1250* @param status output error status if any
1251* @return TRUE if the match is valid, FALSE otherwise
1252*/
1253static
729e4ab9 1254inline UBool checkNextExactMatch(UStringSearch *strsrch,
b75a7d8f
A
1255 int32_t *textoffset, UErrorCode *status)
1256{
1257 UCollationElements *coleiter = strsrch->textIter;
729e4ab9
A
1258 int32_t start = getColElemIterOffset(coleiter, FALSE);
1259
374ca955
A
1260 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1261 return FALSE;
b75a7d8f
A
1262 }
1263
1264 // this totally matches, however we need to check if it is repeating
1265 if (!isBreakUnit(strsrch, start, *textoffset) ||
729e4ab9
A
1266 checkRepeatedMatch(strsrch, start, *textoffset) ||
1267 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
b75a7d8f
A
1268 !checkIdentical(strsrch, start, *textoffset) ||
1269 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
374ca955
A
1270
1271 (*textoffset) ++;
729e4ab9 1272 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
374ca955 1273 return FALSE;
b75a7d8f 1274 }
46f4442e
A
1275
1276 //Add breakiterator boundary check for primary strength search.
1277 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
4388f060 1278 checkBreakBoundary(strsrch, &start, textoffset);
46f4442e 1279 }
729e4ab9 1280
b75a7d8f
A
1281 // totally match, we will get rid of the ending ignorables.
1282 strsrch->search->matchedIndex = start;
1283 strsrch->search->matchedLength = *textoffset - start;
374ca955 1284 return TRUE;
b75a7d8f
A
1285}
1286
1287/**
729e4ab9 1288* Getting the previous base character offset, or the current offset if the
b75a7d8f
A
1289* current character is a base character
1290* @param text string
1291* @param textoffset one offset after the current character
729e4ab9 1292* @return the offset of the next character after the base character or the first
b75a7d8f
A
1293* composed character with accents
1294*/
1295static
729e4ab9 1296inline int32_t getPreviousBaseOffset(const UChar *text,
b75a7d8f
A
1297 int32_t textoffset)
1298{
1299 if (textoffset > 0) {
46f4442e 1300 for (;;) {
b75a7d8f 1301 int32_t result = textoffset;
4388f060 1302 U16_BACK_1(text, 0, textoffset);
b75a7d8f
A
1303 int32_t temp = textoffset;
1304 uint16_t fcd = getFCD(text, &temp, result);
1305 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1306 if (fcd & LAST_BYTE_MASK_) {
1307 return textoffset;
1308 }
1309 return result;
1310 }
1311 if (textoffset == 0) {
1312 return 0;
1313 }
1314 }
1315 }
1316 return textoffset;
1317}
1318
1319/**
1320* Getting the indexes of the accents that are not blocked in the argument
1321* accent array
1322* @param accents array of accents in nfd terminated by a 0.
1323* @param accentsindex array of indexes of the accents that are not blocked
1324*/
1325static
1326inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1327{
1328 int32_t index = 0;
1329 int32_t length = u_strlen(accents);
1330 UChar32 codepoint = 0;
1331 int cclass = 0;
1332 int result = 0;
1333 int32_t temp;
1334 while (index < length) {
1335 temp = index;
4388f060 1336 U16_NEXT(accents, index, length, codepoint);
b75a7d8f
A
1337 if (u_getCombiningClass(codepoint) != cclass) {
1338 cclass = u_getCombiningClass(codepoint);
1339 accentsindex[result] = temp;
1340 result ++;
1341 }
1342 }
1343 accentsindex[result] = length;
1344 return result;
1345}
1346
1347/**
1348* Appends 3 UChar arrays to a destination array.
729e4ab9 1349* Creates a new array if we run out of space. The caller will have to
b75a7d8f 1350* manually deallocate the newly allocated array.
729e4ab9
A
1351* Internal method, status assumed to be success, caller has to check status
1352* before calling this method. destination not to be NULL and has at least
b75a7d8f
A
1353* size destinationlength.
1354* @param destination target array
1355* @param destinationlength target array size, returning the appended length
1356* @param source1 null-terminated first array
1357* @param source2 second array
1358* @param source2length length of seond array
1359* @param source3 null-terminated third array
1360* @param status error status if any
1361* @return new destination array, destination if there was no new allocation
1362*/
1363static
729e4ab9
A
1364inline UChar * addToUCharArray( UChar *destination,
1365 int32_t *destinationlength,
1366 const UChar *source1,
b75a7d8f 1367 const UChar *source2,
729e4ab9
A
1368 int32_t source2length,
1369 const UChar *source3,
1370 UErrorCode *status)
b75a7d8f
A
1371{
1372 int32_t source1length = source1 ? u_strlen(source1) : 0;
729e4ab9
A
1373 int32_t source3length = source3 ? u_strlen(source3) : 0;
1374 if (*destinationlength < source1length + source2length + source3length +
1375 1)
b75a7d8f
A
1376 {
1377 destination = (UChar *)allocateMemory(
1378 (source1length + source2length + source3length + 1) * sizeof(UChar),
1379 status);
729e4ab9 1380 // if error allocating memory, status will be
b75a7d8f
A
1381 // U_MEMORY_ALLOCATION_ERROR
1382 if (U_FAILURE(*status)) {
1383 *destinationlength = 0;
1384 return NULL;
1385 }
1386 }
1387 if (source1length != 0) {
a62d09fc 1388 u_memcpy(destination, source1, source1length);
b75a7d8f
A
1389 }
1390 if (source2length != 0) {
729e4ab9 1391 uprv_memcpy(destination + source1length, source2,
b75a7d8f
A
1392 sizeof(UChar) * source2length);
1393 }
1394 if (source3length != 0) {
729e4ab9 1395 uprv_memcpy(destination + source1length + source2length, source3,
b75a7d8f
A
1396 sizeof(UChar) * source3length);
1397 }
1398 *destinationlength = source1length + source2length + source3length;
1399 return destination;
1400}
1401
1402/**
1403* Running through a collation element iterator to see if the contents matches
1404* pattern in string search data
1405* @param strsrch string search data
1406* @param coleiter collation element iterator
1407* @return TRUE if a match if found, FALSE otherwise
1408*/
1409static
729e4ab9 1410inline UBool checkCollationMatch(const UStringSearch *strsrch,
b75a7d8f
A
1411 UCollationElements *coleiter)
1412{
b331163b
A
1413 int patternceindex = strsrch->pattern.cesLength;
1414 int32_t *patternce = strsrch->pattern.ces;
b75a7d8f
A
1415 UErrorCode status = U_ZERO_ERROR;
1416 while (patternceindex > 0) {
374ca955 1417 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
b75a7d8f
A
1418 if (ce == UCOL_IGNORABLE) {
1419 continue;
1420 }
1421 if (U_FAILURE(status) || ce != *patternce) {
1422 return FALSE;
1423 }
1424 patternce ++;
1425 patternceindex --;
1426 }
1427 return TRUE;
1428}
1429
1430/**
1431* Rearranges the front accents to try matching.
729e4ab9
A
1432* Prefix accents in the text will be grouped according to their combining
1433* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
1434* match with the pattern.
1435* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1436* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 1437* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
1438* "\u0301\u0325".
1439* step 2: check if any of the generated substrings matches the pattern.
1440* Internal method, status is assumed to be success, caller has to check status
1441* before calling this method.
1442* @param strsrch string search match
1443* @param start first offset of the accents to start searching
1444* @param end start of the last accent set
1445* @param status output error status if any
1446* @return USEARCH_DONE if a match is not found, otherwise return the starting
1447* offset of the match. Note this start includes all preceding accents.
1448*/
1449static
729e4ab9 1450int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
b75a7d8f 1451 int32_t start,
729e4ab9 1452 int32_t end,
b75a7d8f
A
1453 UErrorCode *status)
1454{
1455 const UChar *text = strsrch->search->text;
1456 int32_t textlength = strsrch->search->textLength;
1457 int32_t tempstart = start;
1458
1459 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1460 // die... failed at a base character
1461 return USEARCH_DONE;
1462 }
1463
1464 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1465 start = getPreviousBaseOffset(text, tempstart);
1466
1467 UChar accents[INITIAL_ARRAY_SIZE_];
1468 // normalizing the offensive string
729e4ab9
A
1469 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1470 INITIAL_ARRAY_SIZE_, status);
b75a7d8f
A
1471 if (U_FAILURE(*status)) {
1472 return USEARCH_DONE;
1473 }
729e4ab9
A
1474
1475 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1476 int32_t accentsize = getUnblockedAccentIndex(accents,
b75a7d8f 1477 accentsindex);
729e4ab9 1478 int32_t count = (2 << (accentsize - 1)) - 1;
b75a7d8f
A
1479 UChar buffer[INITIAL_ARRAY_SIZE_];
1480 UCollationElements *coleiter = strsrch->utilIter;
1481 while (U_SUCCESS(*status) && count > 0) {
1482 UChar *rearrange = strsrch->canonicalPrefixAccents;
1483 // copy the base characters
1484 for (int k = 0; k < accentsindex[0]; k ++) {
1485 *rearrange ++ = accents[k];
1486 }
1487 // forming all possible canonical rearrangement by dropping
1488 // sets of accents
1489 for (int i = 0; i <= accentsize - 1; i ++) {
1490 int32_t mask = 1 << (accentsize - i - 1);
1491 if (count & mask) {
1492 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1493 *rearrange ++ = accents[j];
1494 }
1495 }
1496 }
1497 *rearrange = 0;
1498 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1499 UChar *match = addToUCharArray(buffer, &matchsize,
1500 strsrch->canonicalPrefixAccents,
1501 strsrch->search->text + offset,
1502 end - offset,
1503 strsrch->canonicalSuffixAccents,
1504 status);
729e4ab9 1505
b75a7d8f
A
1506 // if status is a failure, ucol_setText does nothing.
1507 // run the collator iterator through this match
1508 ucol_setText(coleiter, match, matchsize, status);
1509 if (U_SUCCESS(*status)) {
1510 if (checkCollationMatch(strsrch, coleiter)) {
1511 if (match != buffer) {
1512 uprv_free(match);
1513 }
1514 return start;
1515 }
1516 }
1517 count --;
1518 }
1519 return USEARCH_DONE;
1520}
1521
1522/**
1523* Gets the offset to the safe point in text before textoffset.
1524* ie. not the middle of a contraction, swappable characters or supplementary
1525* characters.
1526* @param collator collation sata
1527* @param text string to work with
1528* @param textoffset offset in string
1529* @param textlength length of text string
1530* @return offset to the previous safe character
1531*/
1532static
729e4ab9 1533inline uint32_t getPreviousSafeOffset(const UCollator *collator,
b75a7d8f
A
1534 const UChar *text,
1535 int32_t textoffset)
1536{
1537 int32_t result = textoffset; // first contraction character
1538 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1539 result --;
1540 }
1541 if (result != 0) {
1542 // the first contraction character is consider unsafe here
1543 result --;
1544 }
729e4ab9 1545 return result;
b75a7d8f
A
1546}
1547
1548/**
1549* Cleaning up after we passed the safe zone
1550* @param strsrch string search data
1551* @param safetext safe text array
1552* @param safebuffer safe text buffer
1553* @param coleiter collation element iterator for safe text
1554*/
1555static
1556inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1557 UChar *safebuffer)
1558{
729e4ab9 1559 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
b75a7d8f
A
1560 {
1561 uprv_free(safetext);
1562 }
1563}
1564
1565/**
1566* Take the rearranged end accents and tries matching. If match failed at
1567* a seperate preceding set of accents (seperated from the rearranged on by
729e4ab9 1568* at least a base character) then we rearrange the preceding accents and
b75a7d8f 1569* tries matching again.
729e4ab9 1570* We allow skipping of the ends of the accent set if the ces do not match.
b75a7d8f
A
1571* However if the failure is found before the accent set, it fails.
1572* Internal method, status assumed to be success, caller has to check status
1573* before calling this method.
1574* @param strsrch string search data
1575* @param textoffset of the start of the rearranged accent
1576* @param status output error status if any
1577* @return USEARCH_DONE if a match is not found, otherwise return the starting
1578* offset of the match. Note this start includes all preceding accents.
1579*/
1580static
729e4ab9 1581int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
b75a7d8f
A
1582 int32_t textoffset,
1583 UErrorCode *status)
1584{
1585 const UChar *text = strsrch->search->text;
1586 const UCollator *collator = strsrch->collator;
1587 int32_t safelength = 0;
1588 UChar *safetext;
1589 int32_t safetextlength;
1590 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1591 UCollationElements *coleiter = strsrch->utilIter;
1592 int32_t safeoffset = textoffset;
1593
729e4ab9 1594 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
b75a7d8f
A
1595 collator)) {
1596 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1597 safelength = textoffset - safeoffset;
1598 safetextlength = INITIAL_ARRAY_SIZE_;
729e4ab9
A
1599 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1600 text + safeoffset, safelength,
1601 strsrch->canonicalSuffixAccents,
b75a7d8f
A
1602 status);
1603 }
1604 else {
1605 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1606 safetext = strsrch->canonicalSuffixAccents;
1607 }
1608
1609 // if status is a failure, ucol_setText does nothing
1610 ucol_setText(coleiter, safetext, safetextlength, status);
1611 // status checked in loop below
1612
b331163b
A
1613 int32_t *ce = strsrch->pattern.ces;
1614 int32_t celength = strsrch->pattern.cesLength;
b75a7d8f
A
1615 int ceindex = celength - 1;
1616 UBool isSafe = TRUE; // indication flag for position in safe zone
729e4ab9 1617
b75a7d8f 1618 while (ceindex >= 0) {
374ca955 1619 int32_t textce = ucol_previous(coleiter, status);
b75a7d8f
A
1620 if (U_FAILURE(*status)) {
1621 if (isSafe) {
1622 cleanUpSafeText(strsrch, safetext, safebuffer);
1623 }
1624 return USEARCH_DONE;
1625 }
1626 if (textce == UCOL_NULLORDER) {
1627 // check if we have passed the safe buffer
1628 if (coleiter == strsrch->textIter) {
1629 cleanUpSafeText(strsrch, safetext, safebuffer);
1630 return USEARCH_DONE;
1631 }
1632 cleanUpSafeText(strsrch, safetext, safebuffer);
1633 safetext = safebuffer;
1634 coleiter = strsrch->textIter;
1635 setColEIterOffset(coleiter, safeoffset);
1636 // status checked at the start of the loop
1637 isSafe = FALSE;
1638 continue;
1639 }
1640 textce = getCE(strsrch, textce);
1641 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1642 // do the beginning stuff
1643 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1644 if (isSafe && failedoffset >= safelength) {
1645 // alas... no hope. failed at rearranged accent set
1646 cleanUpSafeText(strsrch, safetext, safebuffer);
1647 return USEARCH_DONE;
1648 }
1649 else {
1650 if (isSafe) {
1651 failedoffset += safeoffset;
1652 cleanUpSafeText(strsrch, safetext, safebuffer);
1653 }
729e4ab9 1654
b75a7d8f 1655 // try rearranging the front accents
729e4ab9 1656 int32_t result = doNextCanonicalPrefixMatch(strsrch,
b75a7d8f
A
1657 failedoffset, textoffset, status);
1658 if (result != USEARCH_DONE) {
1659 // if status is a failure, ucol_setOffset does nothing
1660 setColEIterOffset(strsrch->textIter, result);
1661 }
1662 if (U_FAILURE(*status)) {
1663 return USEARCH_DONE;
1664 }
1665 return result;
1666 }
1667 }
1668 if (textce == ce[ceindex]) {
1669 ceindex --;
1670 }
1671 }
1672 // set offset here
1673 if (isSafe) {
1674 int32_t result = getColElemIterOffset(coleiter, FALSE);
1675 // sets the text iterator here with the correct expansion and offset
1676 int32_t leftoverces = getExpansionPrefix(coleiter);
1677 cleanUpSafeText(strsrch, safetext, safebuffer);
729e4ab9 1678 if (result >= safelength) {
b75a7d8f
A
1679 result = textoffset;
1680 }
1681 else {
1682 result += safeoffset;
1683 }
1684 setColEIterOffset(strsrch->textIter, result);
729e4ab9 1685 strsrch->textIter->iteratordata_.toReturn =
b75a7d8f
A
1686 setExpansionPrefix(strsrch->textIter, leftoverces);
1687 return result;
1688 }
729e4ab9
A
1689
1690 return ucol_getOffset(coleiter);
b75a7d8f
A
1691}
1692
1693/**
1694* Trying out the substring and sees if it can be a canonical match.
1695* This will try normalizing the end accents and arranging them into canonical
1696* equivalents and check their corresponding ces with the pattern ce.
729e4ab9
A
1697* Suffix accents in the text will be grouped according to their combining
1698* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
1699* match with the pattern.
1700* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1701* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 1702* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
1703* "\u0301\u0325".
1704* step 2: check if any of the generated substrings matches the pattern.
729e4ab9 1705* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1706* before calling this method.
1707* @param strsrch string search data
729e4ab9 1708* @param textoffset end offset in the collation element text that ends with
b75a7d8f
A
1709* the accents to be rearranged
1710* @param status error status if any
1711* @return TRUE if the match is valid, FALSE otherwise
1712*/
1713static
729e4ab9
A
1714UBool doNextCanonicalMatch(UStringSearch *strsrch,
1715 int32_t textoffset,
b75a7d8f
A
1716 UErrorCode *status)
1717{
1718 const UChar *text = strsrch->search->text;
1719 int32_t temp = textoffset;
4388f060 1720 U16_BACK_1(text, 0, temp);
b75a7d8f
A
1721 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1722 UCollationElements *coleiter = strsrch->textIter;
1723 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1724 if (strsrch->pattern.hasPrefixAccents) {
729e4ab9 1725 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
b75a7d8f
A
1726 status);
1727 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1728 setColEIterOffset(coleiter, offset);
1729 return TRUE;
1730 }
1731 }
1732 return FALSE;
1733 }
1734
1735 if (!strsrch->pattern.hasSuffixAccents) {
1736 return FALSE;
1737 }
1738
1739 UChar accents[INITIAL_ARRAY_SIZE_];
1740 // offset to the last base character in substring to search
1741 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1742 // normalizing the offensive string
729e4ab9
A
1743 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1744 0, accents, INITIAL_ARRAY_SIZE_, status);
b75a7d8f 1745 // status checked in loop below
729e4ab9 1746
b75a7d8f
A
1747 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1748 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1749
374ca955
A
1750 // 2 power n - 1 plus the full set of accents
1751 int32_t count = (2 << (size - 1)) - 1;
b75a7d8f
A
1752 while (U_SUCCESS(*status) && count > 0) {
1753 UChar *rearrange = strsrch->canonicalSuffixAccents;
1754 // copy the base characters
1755 for (int k = 0; k < accentsindex[0]; k ++) {
1756 *rearrange ++ = accents[k];
1757 }
1758 // forming all possible canonical rearrangement by dropping
1759 // sets of accents
1760 for (int i = 0; i <= size - 1; i ++) {
1761 int32_t mask = 1 << (size - i - 1);
1762 if (count & mask) {
1763 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1764 *rearrange ++ = accents[j];
1765 }
1766 }
1767 }
1768 *rearrange = 0;
729e4ab9 1769 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
b75a7d8f
A
1770 status);
1771 if (offset != USEARCH_DONE) {
1772 return TRUE; // match found
1773 }
1774 count --;
1775 }
1776 return FALSE;
1777}
1778
1779/**
729e4ab9 1780* Gets the previous base character offset depending on the string search
b75a7d8f
A
1781* pattern data
1782* @param strsrch string search data
1783* @param textoffset current offset, current character
1784* @return the offset of the next character after this base character or itself
1785* if it is a composed character with accents
1786*/
1787static
729e4ab9 1788inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
b75a7d8f
A
1789 int32_t textoffset)
1790{
1791 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1792 const UChar *text = strsrch->search->text;
1793 int32_t offset = textoffset;
729e4ab9 1794 if (getFCD(text, &offset, strsrch->search->textLength) >>
b75a7d8f
A
1795 SECOND_LAST_BYTE_SHIFT_) {
1796 return getPreviousBaseOffset(text, textoffset);
1797 }
1798 }
1799 return textoffset;
1800}
1801
1802/**
729e4ab9 1803* Checks match for contraction.
b75a7d8f
A
1804* If the match ends with a partial contraction we fail.
1805* If the match starts too far off (because of backwards iteration) we try to
1806* chip off the extra characters
729e4ab9 1807* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1808* before calling this method.
1809* @param strsrch string search data
1810* @param start offset of potential match, to be modified if necessary
1811* @param end offset of potential match, to be modified if necessary
1812* @param status output error status if any
1813* @return TRUE if match passes the contraction test, FALSE otherwise
1814*/
1815static
729e4ab9
A
1816UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1817 int32_t *start,
1818 int32_t *end,
1819 UErrorCode *status)
b75a7d8f
A
1820{
1821 UCollationElements *coleiter = strsrch->textIter;
1822 int32_t textlength = strsrch->search->textLength;
1823 int32_t temp = *start;
1824 const UCollator *collator = strsrch->collator;
1825 const UChar *text = strsrch->search->text;
729e4ab9 1826 // This part checks if either ends of the match contains potential
b75a7d8f 1827 // contraction. If so we'll have to iterate through them
729e4ab9
A
1828 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1829 (*start + 1 < textlength
b75a7d8f
A
1830 && ucol_unsafeCP(text[*start + 1], collator))) {
1831 int32_t expansion = getExpansionPrefix(coleiter);
1832 UBool expandflag = expansion > 0;
1833 setColEIterOffset(coleiter, *start);
1834 while (expansion > 0) {
1835 // getting rid of the redundant ce, caused by setOffset.
729e4ab9
A
1836 // since backward contraction/expansion may have extra ces if we
1837 // are in the normalization buffer, hasAccentsBeforeMatch would
b75a7d8f
A
1838 // have taken care of it.
1839 // E.g. the character \u01FA will have an expansion of 3, but if
1840 // we are only looking for acute and ring \u030A and \u0301, we'll
1841 // have to skip the first ce in the expansion buffer.
1842 ucol_next(coleiter, status);
374ca955
A
1843 if (U_FAILURE(*status)) {
1844 return FALSE;
1845 }
b75a7d8f
A
1846 if (ucol_getOffset(coleiter) != temp) {
1847 *start = temp;
1848 temp = ucol_getOffset(coleiter);
1849 }
1850 expansion --;
1851 }
1852
b331163b
A
1853 int32_t *patternce = strsrch->pattern.ces;
1854 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
1855 int32_t count = 0;
1856 int32_t textlength = strsrch->search->textLength;
1857 while (count < patterncelength) {
374ca955 1858 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
b75a7d8f
A
1859 // status checked below, note that if status is a failure
1860 // ucol_next returns UCOL_NULLORDER
1861 if (ce == UCOL_IGNORABLE) {
1862 continue;
1863 }
1864 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1865 *start = temp;
1866 temp = ucol_getOffset(coleiter);
1867 }
1868
1869 if (count == 0 && ce != patternce[0]) {
729e4ab9 1870 // accents may have extra starting ces, this occurs when a
b75a7d8f
A
1871 // pure accent pattern is matched without rearrangement
1872 // text \u0325\u0300 and looking for \u0300
729e4ab9 1873 int32_t expected = patternce[0];
b75a7d8f
A
1874 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1875 ce = getCE(strsrch, ucol_next(coleiter, status));
729e4ab9 1876 while (U_SUCCESS(*status) && ce != expected &&
b75a7d8f
A
1877 ce != UCOL_NULLORDER &&
1878 ucol_getOffset(coleiter) <= *end) {
1879 ce = getCE(strsrch, ucol_next(coleiter, status));
1880 }
1881 }
1882 }
1883 if (U_FAILURE(*status) || ce != patternce[count]) {
1884 (*end) ++;
729e4ab9 1885 *end = getNextUStringSearchBaseOffset(strsrch, *end);
b75a7d8f
A
1886 return FALSE;
1887 }
1888 count ++;
1889 }
729e4ab9 1890 }
b75a7d8f
A
1891 return TRUE;
1892}
1893
1894/**
1895* Checks and sets the match information if found.
729e4ab9 1896* Checks
b75a7d8f
A
1897* <ul>
1898* <li> the potential match does not repeat the previous match
1899* <li> boundaries are correct
1900* <li> potential match does not end in the middle of a contraction
1901* <li> identical matches
1902* <\ul>
1903* Otherwise the offset will be shifted to the next character.
729e4ab9 1904* Internal method, status assumed to be success, caller has to check the
b75a7d8f
A
1905* status before calling this method.
1906* @param strsrch string search data
1907* @param textoffset offset in the collation element text. the returned value
729e4ab9 1908* will be the truncated end offset of the match or the new start
b75a7d8f
A
1909* search offset.
1910* @param status output error status if any
1911* @return TRUE if the match is valid, FALSE otherwise
1912*/
1913static
729e4ab9
A
1914inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1915 int32_t *textoffset,
b75a7d8f
A
1916 UErrorCode *status)
1917{
1918 // to ensure that the start and ends are not composite characters
1919 UCollationElements *coleiter = strsrch->textIter;
1920 // if we have a canonical accent match
729e4ab9
A
1921 if ((strsrch->pattern.hasSuffixAccents &&
1922 strsrch->canonicalSuffixAccents[0]) ||
1923 (strsrch->pattern.hasPrefixAccents &&
b75a7d8f
A
1924 strsrch->canonicalPrefixAccents[0])) {
1925 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1926 strsrch,
1927 ucol_getOffset(coleiter));
729e4ab9 1928 strsrch->search->matchedLength = *textoffset -
b75a7d8f
A
1929 strsrch->search->matchedIndex;
1930 return TRUE;
1931 }
1932
1933 int32_t start = getColElemIterOffset(coleiter, FALSE);
729e4ab9 1934 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
b75a7d8f
A
1935 status) || U_FAILURE(*status)) {
1936 return FALSE;
1937 }
729e4ab9 1938
b75a7d8f
A
1939 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1940 // this totally matches, however we need to check if it is repeating
729e4ab9
A
1941 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1942 !isBreakUnit(strsrch, start, *textoffset) ||
b75a7d8f
A
1943 !checkIdentical(strsrch, start, *textoffset)) {
1944 (*textoffset) ++;
729e4ab9 1945 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
b75a7d8f
A
1946 strsrch->search->textLength);
1947 return FALSE;
1948 }
729e4ab9 1949
b75a7d8f
A
1950 strsrch->search->matchedIndex = start;
1951 strsrch->search->matchedLength = *textoffset - start;
1952 return TRUE;
1953}
1954
1955/**
1956* Shifting the collation element iterator position forward to prepare for
1957* a preceding match. If the first character is a unsafe character, we'll only
1958* shift by 1 to capture contractions, normalization etc.
729e4ab9 1959* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
1960* before calling this method.
1961* @param text strsrch string search data
1962* @param textoffset start text position to do search
1963* @param ce the text ce which failed the match.
1964* @param patternceindex index of the ce within the pattern ce buffer which
1965* failed the match
1966* @return final offset
1967*/
1968static
1969inline int32_t reverseShift(UStringSearch *strsrch,
1970 int32_t textoffset,
374ca955 1971 int32_t ce,
b75a7d8f 1972 int32_t patternceindex)
729e4ab9 1973{
b75a7d8f
A
1974 if (strsrch->search->isOverlap) {
1975 if (textoffset != strsrch->search->textLength) {
1976 textoffset --;
1977 }
1978 else {
1979 textoffset -= strsrch->pattern.defaultShiftSize;
1980 }
1981 }
1982 else {
1983 if (ce != UCOL_NULLORDER) {
2ca993e8 1984 int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
729e4ab9
A
1985
1986 // this is to adjust for characters in the middle of the substring
b75a7d8f
A
1987 // for matching that failed.
1988 int32_t adjust = patternceindex;
1989 if (adjust > 1 && shift > adjust) {
1990 shift -= adjust - 1;
1991 }
1992 textoffset -= shift;
1993 }
1994 else {
1995 textoffset -= strsrch->pattern.defaultShiftSize;
1996 }
729e4ab9 1997 }
b75a7d8f
A
1998 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1999 return textoffset;
2000}
2001
2002/**
729e4ab9 2003* Checks match for contraction.
b75a7d8f 2004* If the match starts with a partial contraction we fail.
729e4ab9 2005* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
2006* before calling this method.
2007* @param strsrch string search data
2008* @param start offset of potential match, to be modified if necessary
2009* @param end offset of potential match, to be modified if necessary
2010* @param status output error status if any
2011* @return TRUE if match passes the contraction test, FALSE otherwise
2012*/
2013static
729e4ab9
A
2014UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2015 int32_t *start,
2016 int32_t *end, UErrorCode *status)
b75a7d8f
A
2017{
2018 UCollationElements *coleiter = strsrch->textIter;
2019 int32_t textlength = strsrch->search->textLength;
2020 int32_t temp = *end;
2021 const UCollator *collator = strsrch->collator;
2022 const UChar *text = strsrch->search->text;
729e4ab9 2023 // This part checks if either if the start of the match contains potential
b75a7d8f 2024 // contraction. If so we'll have to iterate through them
729e4ab9 2025 // Since we used ucol_next while previously looking for the potential
374ca955
A
2026 // match, this guarantees that our end will not be a partial contraction,
2027 // or a partial supplementary character.
b75a7d8f
A
2028 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2029 int32_t expansion = getExpansionSuffix(coleiter);
2030 UBool expandflag = expansion > 0;
2031 setColEIterOffset(coleiter, *end);
2032 while (U_SUCCESS(*status) && expansion > 0) {
2033 // getting rid of the redundant ce
2034 // since forward contraction/expansion may have extra ces
2035 // if we are in the normalization buffer, hasAccentsBeforeMatch
2036 // would have taken care of it.
2037 // E.g. the character \u01FA will have an expansion of 3, but if
729e4ab9 2038 // we are only looking for A ring A\u030A, we'll have to skip the
b75a7d8f
A
2039 // last ce in the expansion buffer
2040 ucol_previous(coleiter, status);
374ca955
A
2041 if (U_FAILURE(*status)) {
2042 return FALSE;
2043 }
b75a7d8f
A
2044 if (ucol_getOffset(coleiter) != temp) {
2045 *end = temp;
2046 temp = ucol_getOffset(coleiter);
2047 }
2048 expansion --;
2049 }
2050
b331163b
A
2051 int32_t *patternce = strsrch->pattern.ces;
2052 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
2053 int32_t count = patterncelength;
2054 while (count > 0) {
374ca955 2055 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
b75a7d8f
A
2056 // status checked below, note that if status is a failure
2057 // ucol_previous returns UCOL_NULLORDER
2058 if (ce == UCOL_IGNORABLE) {
2059 continue;
2060 }
729e4ab9 2061 if (expandflag && count == 0 &&
b75a7d8f
A
2062 getColElemIterOffset(coleiter, FALSE) != temp) {
2063 *end = temp;
2064 temp = ucol_getOffset(coleiter);
2065 }
2066 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2067 (*start) --;
2068 *start = getPreviousBaseOffset(text, *start);
2069 return FALSE;
2070 }
2071 count --;
2072 }
729e4ab9 2073 }
b75a7d8f
A
2074 return TRUE;
2075}
2076
2077/**
2078* Checks and sets the match information if found.
729e4ab9 2079* Checks
b75a7d8f
A
2080* <ul>
2081* <li> the current match does not repeat the last match
2082* <li> boundaries are correct
2083* <li> exact matches has no extra accents
2084* <li> identical matches
2085* <\ul>
2086* Otherwise the offset will be shifted to the preceding character.
2087* Internal method, status assumed to be success, caller has to check status
2088* before calling this method.
2089* @param strsrch string search data
729e4ab9 2090* @param collator
b75a7d8f
A
2091* @param coleiter collation element iterator
2092* @param text string
2093* @param textoffset offset in the collation element text. the returned value
729e4ab9 2094* will be the truncated start offset of the match or the new start
b75a7d8f
A
2095* search offset.
2096* @param status output error status if any
2097* @return TRUE if the match is valid, FALSE otherwise
2098*/
2099static
729e4ab9
A
2100inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2101 int32_t *textoffset,
b75a7d8f
A
2102 UErrorCode *status)
2103{
2104 // to ensure that the start and ends are not composite characters
729e4ab9 2105 int32_t end = ucol_getOffset(strsrch->textIter);
b75a7d8f
A
2106 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2107 || U_FAILURE(*status)) {
2108 return FALSE;
2109 }
729e4ab9 2110
b75a7d8f
A
2111 // this totally matches, however we need to check if it is repeating
2112 // the old match
729e4ab9 2113 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
b75a7d8f
A
2114 !isBreakUnit(strsrch, *textoffset, end) ||
2115 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
729e4ab9 2116 !checkIdentical(strsrch, *textoffset, end) ||
b75a7d8f
A
2117 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2118 (*textoffset) --;
729e4ab9 2119 *textoffset = getPreviousBaseOffset(strsrch->search->text,
b75a7d8f
A
2120 *textoffset);
2121 return FALSE;
2122 }
729e4ab9 2123
46f4442e
A
2124 //Add breakiterator boundary check for primary strength search.
2125 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
4388f060 2126 checkBreakBoundary(strsrch, textoffset, &end);
46f4442e 2127 }
729e4ab9 2128
b75a7d8f
A
2129 strsrch->search->matchedIndex = *textoffset;
2130 strsrch->search->matchedLength = end - *textoffset;
2131 return TRUE;
2132}
2133
2134/**
2135* Rearranges the end accents to try matching.
729e4ab9
A
2136* Suffix accents in the text will be grouped according to their combining
2137* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
2138* match with the pattern.
2139* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2140* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 2141* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
2142* "\u0301\u0325".
2143* step 2: check if any of the generated substrings matches the pattern.
729e4ab9 2144* Internal method, status assumed to be success, user has to check status
b75a7d8f
A
2145* before calling this method.
2146* @param strsrch string search match
2147* @param start offset of the first base character
2148* @param end start of the last accent set
2149* @param status only error status if any
2150* @return USEARCH_DONE if a match is not found, otherwise return the ending
2151* offset of the match. Note this start includes all following accents.
2152*/
2153static
729e4ab9 2154int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
b75a7d8f 2155 int32_t start,
729e4ab9 2156 int32_t end,
b75a7d8f
A
2157 UErrorCode *status)
2158{
2159 const UChar *text = strsrch->search->text;
2160 int32_t tempend = end;
2161
4388f060 2162 U16_BACK_1(text, 0, tempend);
729e4ab9 2163 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
b75a7d8f
A
2164 LAST_BYTE_MASK_)) {
2165 // die... failed at a base character
2166 return USEARCH_DONE;
2167 }
2168 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2169
2170 if (U_SUCCESS(*status)) {
2171 UChar accents[INITIAL_ARRAY_SIZE_];
2172 int32_t offset = getPreviousBaseOffset(text, end);
2173 // normalizing the offensive string
729e4ab9
A
2174 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2175 INITIAL_ARRAY_SIZE_, status);
2176
2177 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2178 int32_t accentsize = getUnblockedAccentIndex(accents,
b75a7d8f 2179 accentsindex);
729e4ab9 2180 int32_t count = (2 << (accentsize - 1)) - 1;
b75a7d8f
A
2181 UChar buffer[INITIAL_ARRAY_SIZE_];
2182 UCollationElements *coleiter = strsrch->utilIter;
2183 while (U_SUCCESS(*status) && count > 0) {
2184 UChar *rearrange = strsrch->canonicalSuffixAccents;
2185 // copy the base characters
2186 for (int k = 0; k < accentsindex[0]; k ++) {
2187 *rearrange ++ = accents[k];
2188 }
2189 // forming all possible canonical rearrangement by dropping
2190 // sets of accents
2191 for (int i = 0; i <= accentsize - 1; i ++) {
2192 int32_t mask = 1 << (accentsize - i - 1);
2193 if (count & mask) {
2194 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2195 *rearrange ++ = accents[j];
2196 }
2197 }
2198 }
2199 *rearrange = 0;
2200 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2201 UChar *match = addToUCharArray(buffer, &matchsize,
2202 strsrch->canonicalPrefixAccents,
2203 strsrch->search->text + start,
2204 offset - start,
2205 strsrch->canonicalSuffixAccents,
2206 status);
729e4ab9 2207
b75a7d8f
A
2208 // run the collator iterator through this match
2209 // if status is a failure ucol_setText does nothing
2210 ucol_setText(coleiter, match, matchsize, status);
2211 if (U_SUCCESS(*status)) {
2212 if (checkCollationMatch(strsrch, coleiter)) {
2213 if (match != buffer) {
2214 uprv_free(match);
2215 }
2216 return end;
2217 }
2218 }
2219 count --;
2220 }
2221 }
2222 return USEARCH_DONE;
2223}
2224
2225/**
2226* Take the rearranged start accents and tries matching. If match failed at
2227* a seperate following set of accents (seperated from the rearranged on by
729e4ab9 2228* at least a base character) then we rearrange the preceding accents and
b75a7d8f 2229* tries matching again.
729e4ab9 2230* We allow skipping of the ends of the accent set if the ces do not match.
b75a7d8f 2231* However if the failure is found before the accent set, it fails.
729e4ab9 2232* Internal method, status assumed to be success, caller has to check status
b75a7d8f
A
2233* before calling this method.
2234* @param strsrch string search data
2235* @param textoffset of the ends of the rearranged accent
2236* @param status output error status if any
2237* @return USEARCH_DONE if a match is not found, otherwise return the ending
2238* offset of the match. Note this start includes all following accents.
2239*/
2240static
729e4ab9 2241int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
b75a7d8f
A
2242 int32_t textoffset,
2243 UErrorCode *status)
2244{
2245 const UChar *text = strsrch->search->text;
2246 const UCollator *collator = strsrch->collator;
2247 int32_t safelength = 0;
2248 UChar *safetext;
2249 int32_t safetextlength;
2250 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2251 int32_t safeoffset = textoffset;
2252
729e4ab9 2253 if (textoffset &&
b75a7d8f
A
2254 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2255 u_strlen(strsrch->canonicalPrefixAccents) - 1
2256 ], collator)) {
729e4ab9 2257 safeoffset = getNextSafeOffset(collator, text, textoffset,
b75a7d8f
A
2258 strsrch->search->textLength);
2259 safelength = safeoffset - textoffset;
2260 safetextlength = INITIAL_ARRAY_SIZE_;
729e4ab9
A
2261 safetext = addToUCharArray(safebuffer, &safetextlength,
2262 strsrch->canonicalPrefixAccents,
2263 text + textoffset, safelength,
b75a7d8f
A
2264 NULL, status);
2265 }
2266 else {
2267 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2268 safetext = strsrch->canonicalPrefixAccents;
2269 }
2270
2271 UCollationElements *coleiter = strsrch->utilIter;
2272 // if status is a failure, ucol_setText does nothing
2273 ucol_setText(coleiter, safetext, safetextlength, status);
2274 // status checked in loop below
729e4ab9 2275
b331163b
A
2276 int32_t *ce = strsrch->pattern.ces;
2277 int32_t celength = strsrch->pattern.cesLength;
b75a7d8f
A
2278 int ceindex = 0;
2279 UBool isSafe = TRUE; // safe zone indication flag for position
2280 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
729e4ab9 2281
b75a7d8f 2282 while (ceindex < celength) {
374ca955 2283 int32_t textce = ucol_next(coleiter, status);
b75a7d8f
A
2284 if (U_FAILURE(*status)) {
2285 if (isSafe) {
2286 cleanUpSafeText(strsrch, safetext, safebuffer);
2287 }
2288 return USEARCH_DONE;
2289 }
2290 if (textce == UCOL_NULLORDER) {
2291 // check if we have passed the safe buffer
2292 if (coleiter == strsrch->textIter) {
2293 cleanUpSafeText(strsrch, safetext, safebuffer);
2294 return USEARCH_DONE;
2295 }
2296 cleanUpSafeText(strsrch, safetext, safebuffer);
2297 safetext = safebuffer;
2298 coleiter = strsrch->textIter;
2299 setColEIterOffset(coleiter, safeoffset);
2300 // status checked at the start of the loop
2301 isSafe = FALSE;
2302 continue;
2303 }
2304 textce = getCE(strsrch, textce);
2305 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2306 // do the beginning stuff
2307 int32_t failedoffset = ucol_getOffset(coleiter);
2308 if (isSafe && failedoffset <= prefixlength) {
2309 // alas... no hope. failed at rearranged accent set
2310 cleanUpSafeText(strsrch, safetext, safebuffer);
2311 return USEARCH_DONE;
2312 }
2313 else {
2314 if (isSafe) {
2315 failedoffset = safeoffset - failedoffset;
2316 cleanUpSafeText(strsrch, safetext, safebuffer);
2317 }
729e4ab9 2318
b75a7d8f 2319 // try rearranging the end accents
729e4ab9 2320 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
b75a7d8f
A
2321 textoffset, failedoffset, status);
2322 if (result != USEARCH_DONE) {
2323 // if status is a failure, ucol_setOffset does nothing
2324 setColEIterOffset(strsrch->textIter, result);
2325 }
2326 if (U_FAILURE(*status)) {
2327 return USEARCH_DONE;
2328 }
2329 return result;
2330 }
2331 }
2332 if (textce == ce[ceindex]) {
2333 ceindex ++;
2334 }
2335 }
2336 // set offset here
2337 if (isSafe) {
2338 int32_t result = ucol_getOffset(coleiter);
2339 // sets the text iterator here with the correct expansion and offset
2340 int32_t leftoverces = getExpansionSuffix(coleiter);
2341 cleanUpSafeText(strsrch, safetext, safebuffer);
729e4ab9 2342 if (result <= prefixlength) {
b75a7d8f
A
2343 result = textoffset;
2344 }
2345 else {
2346 result = textoffset + (safeoffset - result);
2347 }
2348 setColEIterOffset(strsrch->textIter, result);
2349 setExpansionSuffix(strsrch->textIter, leftoverces);
2350 return result;
2351 }
729e4ab9
A
2352
2353 return ucol_getOffset(coleiter);
b75a7d8f
A
2354}
2355
2356/**
2357* Trying out the substring and sees if it can be a canonical match.
729e4ab9 2358* This will try normalizing the starting accents and arranging them into
b75a7d8f 2359* canonical equivalents and check their corresponding ces with the pattern ce.
729e4ab9
A
2360* Prefix accents in the text will be grouped according to their combining
2361* class and the groups will be mixed and matched to try find the perfect
b75a7d8f
A
2362* match with the pattern.
2363* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2364* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
729e4ab9 2365* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
b75a7d8f
A
2366* "\u0301\u0325".
2367* step 2: check if any of the generated substrings matches the pattern.
2368* Internal method, status assumed to be success, caller has to check status
2369* before calling this method.
2370* @param strsrch string search data
729e4ab9 2371* @param textoffset start offset in the collation element text that starts
b75a7d8f
A
2372* with the accents to be rearranged
2373* @param status output error status if any
2374* @return TRUE if the match is valid, FALSE otherwise
2375*/
2376static
729e4ab9
A
2377UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2378 int32_t textoffset,
b75a7d8f
A
2379 UErrorCode *status)
2380{
2381 const UChar *text = strsrch->search->text;
2382 int32_t temp = textoffset;
2383 int32_t textlength = strsrch->search->textLength;
2384 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2385 UCollationElements *coleiter = strsrch->textIter;
2386 int32_t offset = ucol_getOffset(coleiter);
2387 if (strsrch->pattern.hasSuffixAccents) {
729e4ab9 2388 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
b75a7d8f
A
2389 offset, status);
2390 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2391 setColEIterOffset(coleiter, offset);
2392 return TRUE;
2393 }
2394 }
2395 return FALSE;
2396 }
2397
2398 if (!strsrch->pattern.hasPrefixAccents) {
2399 return FALSE;
2400 }
2401
2402 UChar accents[INITIAL_ARRAY_SIZE_];
2403 // offset to the last base character in substring to search
2404 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2405 // normalizing the offensive string
729e4ab9
A
2406 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2407 0, accents, INITIAL_ARRAY_SIZE_, status);
b75a7d8f 2408 // status checked in loop
729e4ab9 2409
b75a7d8f
A
2410 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2411 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2412
374ca955 2413 // 2 power n - 1 plus the full set of accents
729e4ab9 2414 int32_t count = (2 << (size - 1)) - 1;
b75a7d8f
A
2415 while (U_SUCCESS(*status) && count > 0) {
2416 UChar *rearrange = strsrch->canonicalPrefixAccents;
2417 // copy the base characters
2418 for (int k = 0; k < accentsindex[0]; k ++) {
2419 *rearrange ++ = accents[k];
2420 }
2421 // forming all possible canonical rearrangement by dropping
2422 // sets of accents
2423 for (int i = 0; i <= size - 1; i ++) {
2424 int32_t mask = 1 << (size - i - 1);
2425 if (count & mask) {
2426 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2427 *rearrange ++ = accents[j];
2428 }
2429 }
2430 }
2431 *rearrange = 0;
729e4ab9 2432 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
b75a7d8f
A
2433 baseoffset, status);
2434 if (offset != USEARCH_DONE) {
2435 return TRUE; // match found
2436 }
2437 count --;
2438 }
2439 return FALSE;
2440}
2441
2442/**
729e4ab9 2443* Checks match for contraction.
b75a7d8f
A
2444* If the match starts with a partial contraction we fail.
2445* Internal method, status assumed to be success, caller has to check status
2446* before calling this method.
2447* @param strsrch string search data
2448* @param start offset of potential match, to be modified if necessary
2449* @param end offset of potential match, to be modified if necessary
2450* @param status only error status if any
2451* @return TRUE if match passes the contraction test, FALSE otherwise
2452*/
2453static
729e4ab9
A
2454UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2455 int32_t *start,
2456 int32_t *end, UErrorCode *status)
b75a7d8f
A
2457{
2458 UCollationElements *coleiter = strsrch->textIter;
2459 int32_t textlength = strsrch->search->textLength;
2460 int32_t temp = *end;
2461 const UCollator *collator = strsrch->collator;
2462 const UChar *text = strsrch->search->text;
729e4ab9 2463 // This part checks if either if the start of the match contains potential
b75a7d8f 2464 // contraction. If so we'll have to iterate through them
729e4ab9 2465 // Since we used ucol_next while previously looking for the potential
374ca955
A
2466 // match, this guarantees that our end will not be a partial contraction,
2467 // or a partial supplementary character.
b75a7d8f
A
2468 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2469 int32_t expansion = getExpansionSuffix(coleiter);
2470 UBool expandflag = expansion > 0;
2471 setColEIterOffset(coleiter, *end);
2472 while (expansion > 0) {
2473 // getting rid of the redundant ce
2474 // since forward contraction/expansion may have extra ces
2475 // if we are in the normalization buffer, hasAccentsBeforeMatch
2476 // would have taken care of it.
2477 // E.g. the character \u01FA will have an expansion of 3, but if
729e4ab9 2478 // we are only looking for A ring A\u030A, we'll have to skip the
b75a7d8f
A
2479 // last ce in the expansion buffer
2480 ucol_previous(coleiter, status);
374ca955
A
2481 if (U_FAILURE(*status)) {
2482 return FALSE;
2483 }
b75a7d8f
A
2484 if (ucol_getOffset(coleiter) != temp) {
2485 *end = temp;
2486 temp = ucol_getOffset(coleiter);
2487 }
2488 expansion --;
2489 }
2490
b331163b
A
2491 int32_t *patternce = strsrch->pattern.ces;
2492 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
2493 int32_t count = patterncelength;
2494 while (count > 0) {
374ca955 2495 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
b75a7d8f
A
2496 // status checked below, note that if status is a failure
2497 // ucol_previous returns UCOL_NULLORDER
2498 if (ce == UCOL_IGNORABLE) {
2499 continue;
2500 }
729e4ab9 2501 if (expandflag && count == 0 &&
b75a7d8f
A
2502 getColElemIterOffset(coleiter, FALSE) != temp) {
2503 *end = temp;
2504 temp = ucol_getOffset(coleiter);
2505 }
729e4ab9 2506 if (count == patterncelength &&
b75a7d8f 2507 ce != patternce[patterncelength - 1]) {
729e4ab9 2508 // accents may have extra starting ces, this occurs when a
b75a7d8f 2509 // pure accent pattern is matched without rearrangement
374ca955 2510 int32_t expected = patternce[patterncelength - 1];
4388f060 2511 U16_BACK_1(text, 0, *end);
b75a7d8f
A
2512 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2513 ce = getCE(strsrch, ucol_previous(coleiter, status));
729e4ab9 2514 while (U_SUCCESS(*status) && ce != expected &&
b75a7d8f
A
2515 ce != UCOL_NULLORDER &&
2516 ucol_getOffset(coleiter) <= *start) {
2517 ce = getCE(strsrch, ucol_previous(coleiter, status));
2518 }
2519 }
2520 }
2521 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2522 (*start) --;
2523 *start = getPreviousBaseOffset(text, *start);
2524 return FALSE;
2525 }
2526 count --;
2527 }
729e4ab9 2528 }
b75a7d8f
A
2529 return TRUE;
2530}
2531
2532/**
2533* Checks and sets the match information if found.
729e4ab9 2534* Checks
b75a7d8f
A
2535* <ul>
2536* <li> the potential match does not repeat the previous match
2537* <li> boundaries are correct
2538* <li> potential match does not end in the middle of a contraction
2539* <li> identical matches
2540* <\ul>
2541* Otherwise the offset will be shifted to the next character.
2542* Internal method, status assumed to be success, caller has to check status
2543* before calling this method.
2544* @param strsrch string search data
2545* @param textoffset offset in the collation element text. the returned value
729e4ab9 2546* will be the truncated start offset of the match or the new start
b75a7d8f
A
2547* search offset.
2548* @param status only error status if any
2549* @return TRUE if the match is valid, FALSE otherwise
2550*/
2551static
729e4ab9
A
2552inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2553 int32_t *textoffset,
b75a7d8f
A
2554 UErrorCode *status)
2555{
2556 // to ensure that the start and ends are not composite characters
2557 UCollationElements *coleiter = strsrch->textIter;
2558 // if we have a canonical accent match
729e4ab9
A
2559 if ((strsrch->pattern.hasSuffixAccents &&
2560 strsrch->canonicalSuffixAccents[0]) ||
2561 (strsrch->pattern.hasPrefixAccents &&
b75a7d8f
A
2562 strsrch->canonicalPrefixAccents[0])) {
2563 strsrch->search->matchedIndex = *textoffset;
729e4ab9
A
2564 strsrch->search->matchedLength =
2565 getNextUStringSearchBaseOffset(strsrch,
b75a7d8f
A
2566 getColElemIterOffset(coleiter, FALSE))
2567 - *textoffset;
2568 return TRUE;
2569 }
2570
2571 int32_t end = ucol_getOffset(coleiter);
2572 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
729e4ab9 2573 status) ||
b75a7d8f
A
2574 U_FAILURE(*status)) {
2575 return FALSE;
2576 }
2577
2578 end = getNextUStringSearchBaseOffset(strsrch, end);
2579 // this totally matches, however we need to check if it is repeating
729e4ab9
A
2580 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2581 !isBreakUnit(strsrch, *textoffset, end) ||
b75a7d8f
A
2582 !checkIdentical(strsrch, *textoffset, end)) {
2583 (*textoffset) --;
729e4ab9 2584 *textoffset = getPreviousBaseOffset(strsrch->search->text,
b75a7d8f
A
2585 *textoffset);
2586 return FALSE;
2587 }
729e4ab9 2588
b75a7d8f
A
2589 strsrch->search->matchedIndex = *textoffset;
2590 strsrch->search->matchedLength = end - *textoffset;
2591 return TRUE;
2592}
46f4442e 2593#endif // #if BOYER_MOORE
b75a7d8f
A
2594
2595// constructors and destructor -------------------------------------------
2596
729e4ab9
A
2597U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2598 int32_t patternlength,
2599 const UChar *text,
b75a7d8f
A
2600 int32_t textlength,
2601 const char *locale,
2602 UBreakIterator *breakiter,
729e4ab9 2603 UErrorCode *status)
b75a7d8f
A
2604{
2605 if (U_FAILURE(*status)) {
2606 return NULL;
2607 }
2608#if UCONFIG_NO_BREAK_ITERATION
2609 if (breakiter != NULL) {
2610 *status = U_UNSUPPORTED_ERROR;
2611 return NULL;
2612 }
2613#endif
2614 if (locale) {
2615 // ucol_open internally checks for status
2616 UCollator *collator = ucol_open(locale, status);
2617 // pattern, text checks are done in usearch_openFromCollator
729e4ab9
A
2618 UStringSearch *result = usearch_openFromCollator(pattern,
2619 patternlength, text, textlength,
b75a7d8f
A
2620 collator, breakiter, status);
2621
2622 if (result == NULL || U_FAILURE(*status)) {
2623 if (collator) {
2624 ucol_close(collator);
2625 }
2626 return NULL;
2627 }
2628 else {
2629 result->ownCollator = TRUE;
2630 }
2631 return result;
2632 }
2633 *status = U_ILLEGAL_ARGUMENT_ERROR;
2634 return NULL;
2635}
2636
2637U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
729e4ab9 2638 const UChar *pattern,
b75a7d8f 2639 int32_t patternlength,
729e4ab9 2640 const UChar *text,
b75a7d8f
A
2641 int32_t textlength,
2642 const UCollator *collator,
2643 UBreakIterator *breakiter,
729e4ab9 2644 UErrorCode *status)
b75a7d8f
A
2645{
2646 if (U_FAILURE(*status)) {
2647 return NULL;
2648 }
2649#if UCONFIG_NO_BREAK_ITERATION
2650 if (breakiter != NULL) {
2651 *status = U_UNSUPPORTED_ERROR;
2652 return NULL;
2653 }
2654#endif
2655 if (pattern == NULL || text == NULL || collator == NULL) {
2656 *status = U_ILLEGAL_ARGUMENT_ERROR;
73c04bcf 2657 return NULL;
b75a7d8f
A
2658 }
2659
374ca955
A
2660 // string search does not really work when numeric collation is turned on
2661 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2662 *status = U_UNSUPPORTED_ERROR;
73c04bcf 2663 return NULL;
374ca955
A
2664 }
2665
b75a7d8f
A
2666 if (U_SUCCESS(*status)) {
2667 initializeFCD(status);
2668 if (U_FAILURE(*status)) {
2669 return NULL;
2670 }
2671
2672 UStringSearch *result;
2673 if (textlength == -1) {
2674 textlength = u_strlen(text);
2675 }
2676 if (patternlength == -1) {
2677 patternlength = u_strlen(pattern);
2678 }
2679 if (textlength <= 0 || patternlength <= 0) {
2680 *status = U_ILLEGAL_ARGUMENT_ERROR;
2681 return NULL;
2682 }
729e4ab9 2683
b75a7d8f
A
2684 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2685 if (result == NULL) {
2686 *status = U_MEMORY_ALLOCATION_ERROR;
2687 return NULL;
2688 }
2689
2690 result->collator = collator;
2691 result->strength = ucol_getStrength(collator);
2692 result->ceMask = getMask(result->strength);
729e4ab9
A
2693 result->toShift =
2694 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
b75a7d8f
A
2695 UCOL_SHIFTED;
2696 result->variableTop = ucol_getVariableTop(collator, status);
2697
b331163b 2698 result->nfd = Normalizer2::getNFDInstance(*status);
729e4ab9 2699
b75a7d8f
A
2700 if (U_FAILURE(*status)) {
2701 uprv_free(result);
2702 return NULL;
2703 }
2704
2705 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2706 if (result->search == NULL) {
2707 *status = U_MEMORY_ALLOCATION_ERROR;
2708 uprv_free(result);
2709 return NULL;
2710 }
2711
2712 result->search->text = text;
2713 result->search->textLength = textlength;
2714
2715 result->pattern.text = pattern;
2716 result->pattern.textLength = patternlength;
b331163b
A
2717 result->pattern.ces = NULL;
2718 result->pattern.pces = NULL;
729e4ab9 2719
b75a7d8f
A
2720 result->search->breakIter = breakiter;
2721#if !UCONFIG_NO_BREAK_ITERATION
729e4ab9 2722 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
b75a7d8f 2723 if (breakiter) {
4388f060 2724 ubrk_setText(breakiter, text, textlength, status);
b75a7d8f
A
2725 }
2726#endif
2727
2728 result->ownCollator = FALSE;
2729 result->search->matchedLength = 0;
2730 result->search->matchedIndex = USEARCH_DONE;
46f4442e 2731 result->utilIter = NULL;
729e4ab9 2732 result->textIter = ucol_openElements(collator, text,
b75a7d8f 2733 textlength, status);
57a6839d 2734 result->textProcessedIter = NULL;
b75a7d8f
A
2735 if (U_FAILURE(*status)) {
2736 usearch_close(result);
2737 return NULL;
2738 }
2739
b75a7d8f
A
2740 result->search->isOverlap = FALSE;
2741 result->search->isCanonicalMatch = FALSE;
729e4ab9 2742 result->search->elementComparisonType = 0;
b75a7d8f
A
2743 result->search->isForwardSearching = TRUE;
2744 result->search->reset = TRUE;
729e4ab9 2745
b75a7d8f
A
2746 initialize(result, status);
2747
2748 if (U_FAILURE(*status)) {
2749 usearch_close(result);
2750 return NULL;
2751 }
2752
2753 return result;
2754 }
2755 return NULL;
2756}
2757
2758U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2759{
2760 if (strsrch) {
b331163b
A
2761 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2762 strsrch->pattern.ces) {
2763 uprv_free(strsrch->pattern.ces);
b75a7d8f 2764 }
46f4442e 2765
b331163b
A
2766 if (strsrch->pattern.pces != NULL &&
2767 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2768 uprv_free(strsrch->pattern.pces);
46f4442e
A
2769 }
2770
57a6839d 2771 delete strsrch->textProcessedIter;
b75a7d8f
A
2772 ucol_closeElements(strsrch->textIter);
2773 ucol_closeElements(strsrch->utilIter);
46f4442e 2774
b75a7d8f
A
2775 if (strsrch->ownCollator && strsrch->collator) {
2776 ucol_close((UCollator *)strsrch->collator);
2777 }
46f4442e
A
2778
2779#if !UCONFIG_NO_BREAK_ITERATION
2780 if (strsrch->search->internalBreakIter) {
4388f060 2781 ubrk_close(strsrch->search->internalBreakIter);
46f4442e
A
2782 }
2783#endif
2784
b75a7d8f
A
2785 uprv_free(strsrch->search);
2786 uprv_free(strsrch);
2787 }
2788}
2789
57a6839d
A
2790namespace {
2791
2792UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2793 if (U_FAILURE(*status)) { return FALSE; }
2794 if (strsrch->textProcessedIter == NULL) {
2795 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2796 if (strsrch->textProcessedIter == NULL) {
2797 *status = U_MEMORY_ALLOCATION_ERROR;
2798 return FALSE;
2799 }
2800 } else {
2801 strsrch->textProcessedIter->init(strsrch->textIter);
2802 }
2803 return TRUE;
2804}
2805
2806}
2807
b75a7d8f
A
2808// set and get methods --------------------------------------------------
2809
729e4ab9 2810U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
b75a7d8f
A
2811 int32_t position,
2812 UErrorCode *status)
2813{
2814 if (U_SUCCESS(*status) && strsrch) {
2815 if (isOutOfBounds(strsrch->search->textLength, position)) {
2816 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2817 }
2818 else {
2819 setColEIterOffset(strsrch->textIter, position);
2820 }
2821 strsrch->search->matchedIndex = USEARCH_DONE;
2822 strsrch->search->matchedLength = 0;
729e4ab9 2823 strsrch->search->reset = FALSE;
b75a7d8f
A
2824 }
2825}
2826
2827U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2828{
2829 if (strsrch) {
2830 int32_t result = ucol_getOffset(strsrch->textIter);
2831 if (isOutOfBounds(strsrch->search->textLength, result)) {
2832 return USEARCH_DONE;
2833 }
2834 return result;
2835 }
2836 return USEARCH_DONE;
2837}
729e4ab9
A
2838
2839U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
b75a7d8f
A
2840 USearchAttribute attribute,
2841 USearchAttributeValue value,
2842 UErrorCode *status)
2843{
2844 if (U_SUCCESS(*status) && strsrch) {
2845 switch (attribute)
2846 {
2847 case USEARCH_OVERLAP :
2848 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2849 break;
2850 case USEARCH_CANONICAL_MATCH :
729e4ab9 2851 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
b75a7d8f
A
2852 FALSE);
2853 break;
729e4ab9
A
2854 case USEARCH_ELEMENT_COMPARISON :
2855 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2856 strsrch->search->elementComparisonType = (int16_t)value;
2857 } else {
2858 strsrch->search->elementComparisonType = 0;
2859 }
2860 break;
b75a7d8f
A
2861 case USEARCH_ATTRIBUTE_COUNT :
2862 default:
2863 *status = U_ILLEGAL_ARGUMENT_ERROR;
2864 }
2865 }
2866 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2867 *status = U_ILLEGAL_ARGUMENT_ERROR;
2868 }
2869}
729e4ab9 2870
b75a7d8f
A
2871U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2872 const UStringSearch *strsrch,
2873 USearchAttribute attribute)
2874{
2875 if (strsrch) {
2876 switch (attribute) {
2877 case USEARCH_OVERLAP :
729e4ab9 2878 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
b75a7d8f
A
2879 USEARCH_OFF);
2880 case USEARCH_CANONICAL_MATCH :
729e4ab9 2881 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
b75a7d8f 2882 USEARCH_OFF);
729e4ab9
A
2883 case USEARCH_ELEMENT_COMPARISON :
2884 {
2885 int16_t value = strsrch->search->elementComparisonType;
2886 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2887 return (USearchAttributeValue)value;
2888 } else {
2889 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2890 }
2891 }
b75a7d8f
A
2892 case USEARCH_ATTRIBUTE_COUNT :
2893 return USEARCH_DEFAULT;
2894 }
2895 }
2896 return USEARCH_DEFAULT;
2897}
2898
2899U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2900 const UStringSearch *strsrch)
2901{
2902 if (strsrch == NULL) {
2903 return USEARCH_DONE;
2904 }
2905 return strsrch->search->matchedIndex;
2906}
2907
2908
729e4ab9
A
2909U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2910 UChar *result,
2911 int32_t resultCapacity,
b75a7d8f
A
2912 UErrorCode *status)
2913{
2914 if (U_FAILURE(*status)) {
2915 return USEARCH_DONE;
2916 }
729e4ab9 2917 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
b75a7d8f
A
2918 result == NULL)) {
2919 *status = U_ILLEGAL_ARGUMENT_ERROR;
2920 return USEARCH_DONE;
2921 }
2922
2923 int32_t copylength = strsrch->search->matchedLength;
2924 int32_t copyindex = strsrch->search->matchedIndex;
2925 if (copyindex == USEARCH_DONE) {
2926 u_terminateUChars(result, resultCapacity, 0, status);
2927 return USEARCH_DONE;
2928 }
2929
2930 if (resultCapacity < copylength) {
2931 copylength = resultCapacity;
2932 }
2933 if (copylength > 0) {
729e4ab9 2934 uprv_memcpy(result, strsrch->search->text + copyindex,
b75a7d8f
A
2935 copylength * sizeof(UChar));
2936 }
729e4ab9 2937 return u_terminateUChars(result, resultCapacity,
b75a7d8f
A
2938 strsrch->search->matchedLength, status);
2939}
729e4ab9 2940
b75a7d8f
A
2941U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2942 const UStringSearch *strsrch)
2943{
2944 if (strsrch) {
2945 return strsrch->search->matchedLength;
2946 }
2947 return USEARCH_DONE;
2948}
2949
2950#if !UCONFIG_NO_BREAK_ITERATION
2951
729e4ab9 2952U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
b75a7d8f
A
2953 UBreakIterator *breakiter,
2954 UErrorCode *status)
2955{
2956 if (U_SUCCESS(*status) && strsrch) {
4388f060 2957 strsrch->search->breakIter = breakiter;
b75a7d8f 2958 if (breakiter) {
729e4ab9 2959 ubrk_setText(breakiter, strsrch->search->text,
b75a7d8f
A
2960 strsrch->search->textLength, status);
2961 }
2962 }
2963}
2964
729e4ab9 2965U_CAPI const UBreakIterator* U_EXPORT2
b75a7d8f
A
2966usearch_getBreakIterator(const UStringSearch *strsrch)
2967{
2968 if (strsrch) {
2969 return strsrch->search->breakIter;
2970 }
2971 return NULL;
2972}
729e4ab9 2973
b75a7d8f 2974#endif
729e4ab9
A
2975
2976U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
b75a7d8f
A
2977 const UChar *text,
2978 int32_t textlength,
2979 UErrorCode *status)
2980{
2981 if (U_SUCCESS(*status)) {
729e4ab9 2982 if (strsrch == NULL || text == NULL || textlength < -1 ||
b75a7d8f
A
2983 textlength == 0) {
2984 *status = U_ILLEGAL_ARGUMENT_ERROR;
2985 }
2986 else {
2987 if (textlength == -1) {
2988 textlength = u_strlen(text);
2989 }
2990 strsrch->search->text = text;
2991 strsrch->search->textLength = textlength;
2992 ucol_setText(strsrch->textIter, text, textlength, status);
2993 strsrch->search->matchedIndex = USEARCH_DONE;
2994 strsrch->search->matchedLength = 0;
2995 strsrch->search->reset = TRUE;
2996#if !UCONFIG_NO_BREAK_ITERATION
374ca955 2997 if (strsrch->search->breakIter != NULL) {
729e4ab9 2998 ubrk_setText(strsrch->search->breakIter, text,
374ca955
A
2999 textlength, status);
3000 }
46f4442e 3001 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
b75a7d8f
A
3002#endif
3003 }
3004 }
3005}
3006
729e4ab9 3007U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
b75a7d8f
A
3008 int32_t *length)
3009{
3010 if (strsrch) {
3011 *length = strsrch->search->textLength;
3012 return strsrch->search->text;
3013 }
3014 return NULL;
3015}
3016
729e4ab9 3017U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
b75a7d8f
A
3018 const UCollator *collator,
3019 UErrorCode *status)
3020{
3021 if (U_SUCCESS(*status)) {
3022 if (collator == NULL) {
3023 *status = U_ILLEGAL_ARGUMENT_ERROR;
3024 return;
3025 }
46f4442e 3026
b75a7d8f 3027 if (strsrch) {
57a6839d
A
3028 delete strsrch->textProcessedIter;
3029 strsrch->textProcessedIter = NULL;
3030 ucol_closeElements(strsrch->textIter);
3031 ucol_closeElements(strsrch->utilIter);
3032 strsrch->textIter = strsrch->utilIter = NULL;
b75a7d8f
A
3033 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3034 ucol_close((UCollator *)strsrch->collator);
3035 strsrch->ownCollator = FALSE;
3036 }
3037 strsrch->collator = collator;
3038 strsrch->strength = ucol_getStrength(collator);
3039 strsrch->ceMask = getMask(strsrch->strength);
46f4442e 3040#if !UCONFIG_NO_BREAK_ITERATION
4388f060
A
3041 ubrk_close(strsrch->search->internalBreakIter);
3042 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3043 strsrch->search->text, strsrch->search->textLength, status);
46f4442e 3044#endif
b75a7d8f 3045 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
729e4ab9
A
3046 strsrch->toShift =
3047 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
b75a7d8f
A
3048 UCOL_SHIFTED;
3049 // if status is a failure, ucol_getVariableTop returns 0
3050 strsrch->variableTop = ucol_getVariableTop(collator, status);
57a6839d
A
3051 strsrch->textIter = ucol_openElements(collator,
3052 strsrch->search->text,
3053 strsrch->search->textLength,
3054 status);
3055 strsrch->utilIter = ucol_openElements(
3056 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3057 // initialize() _after_ setting the iterators for the new collator.
3058 initialize(strsrch, status);
b75a7d8f 3059 }
46f4442e
A
3060
3061 // **** are these calls needed?
3062 // **** we call uprv_init_pce in initializePatternPCETable
b331163b 3063 // **** and the CEIBuffer constructor...
46f4442e
A
3064#if 0
3065 uprv_init_pce(strsrch->textIter);
3066 uprv_init_pce(strsrch->utilIter);
3067#endif
b75a7d8f
A
3068 }
3069}
3070
3071U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3072{
3073 if (strsrch) {
3074 return (UCollator *)strsrch->collator;
3075 }
3076 return NULL;
3077}
3078
729e4ab9 3079U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
b75a7d8f
A
3080 const UChar *pattern,
3081 int32_t patternlength,
3082 UErrorCode *status)
3083{
3084 if (U_SUCCESS(*status)) {
3085 if (strsrch == NULL || pattern == NULL) {
3086 *status = U_ILLEGAL_ARGUMENT_ERROR;
3087 }
3088 else {
3089 if (patternlength == -1) {
3090 patternlength = u_strlen(pattern);
3091 }
3092 if (patternlength == 0) {
3093 *status = U_ILLEGAL_ARGUMENT_ERROR;
3094 return;
3095 }
3096 strsrch->pattern.text = pattern;
3097 strsrch->pattern.textLength = patternlength;
3098 initialize(strsrch, status);
3099 }
3100 }
3101}
3102
729e4ab9
A
3103U_CAPI const UChar* U_EXPORT2
3104usearch_getPattern(const UStringSearch *strsrch,
b75a7d8f
A
3105 int32_t *length)
3106{
3107 if (strsrch) {
3108 *length = strsrch->pattern.textLength;
3109 return strsrch->pattern.text;
3110 }
3111 return NULL;
3112}
3113
3114// miscellanous methods --------------------------------------------------
3115
729e4ab9
A
3116U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3117 UErrorCode *status)
b75a7d8f
A
3118{
3119 if (strsrch && U_SUCCESS(*status)) {
3120 strsrch->search->isForwardSearching = TRUE;
3121 usearch_setOffset(strsrch, 0, status);
3122 if (U_SUCCESS(*status)) {
3123 return usearch_next(strsrch, status);
3124 }
3125 }
3126 return USEARCH_DONE;
3127}
3128
729e4ab9 3129U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
b75a7d8f
A
3130 int32_t position,
3131 UErrorCode *status)
3132{
3133 if (strsrch && U_SUCCESS(*status)) {
3134 strsrch->search->isForwardSearching = TRUE;
3135 // position checked in usearch_setOffset
3136 usearch_setOffset(strsrch, position, status);
3137 if (U_SUCCESS(*status)) {
729e4ab9 3138 return usearch_next(strsrch, status);
b75a7d8f
A
3139 }
3140 }
3141 return USEARCH_DONE;
3142}
729e4ab9
A
3143
3144U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
b75a7d8f
A
3145 UErrorCode *status)
3146{
3147 if (strsrch && U_SUCCESS(*status)) {
3148 strsrch->search->isForwardSearching = FALSE;
3149 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3150 if (U_SUCCESS(*status)) {
3151 return usearch_previous(strsrch, status);
3152 }
3153 }
3154 return USEARCH_DONE;
3155}
3156
729e4ab9 3157U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
b75a7d8f
A
3158 int32_t position,
3159 UErrorCode *status)
3160{
3161 if (strsrch && U_SUCCESS(*status)) {
3162 strsrch->search->isForwardSearching = FALSE;
3163 // position checked in usearch_setOffset
3164 usearch_setOffset(strsrch, position, status);
3165 if (U_SUCCESS(*status)) {
729e4ab9 3166 return usearch_previous(strsrch, status);
b75a7d8f
A
3167 }
3168 }
3169 return USEARCH_DONE;
3170}
729e4ab9 3171
b75a7d8f 3172/**
729e4ab9
A
3173* If a direction switch is required, we'll count the number of ces till the
3174* beginning of the collation element iterator and iterate forwards that
3175* number of times. This is so that we get to the correct point within the
b75a7d8f
A
3176* string to continue the search in. Imagine when we are in the middle of the
3177* normalization buffer when the change in direction is request. arrrgghh....
3178* After searching the offset within the collation element iterator will be
3179* shifted to the start of the match. If a match is not found, the offset would
729e4ab9 3180* have been set to the end of the text string in the collation element
b75a7d8f
A
3181* iterator.
3182* Okay, here's my take on normalization buffer. The only time when there can
3183* be 2 matches within the same normalization is when the pattern is consists
3184* of all accents. But since the offset returned is from the text string, we
729e4ab9 3185* should not confuse the caller by returning the second match within the
b75a7d8f
A
3186* same normalization buffer. If we do, the 2 results will have the same match
3187* offsets, and that'll be confusing. I'll return the next match that doesn't
729e4ab9 3188* fall within the same normalization buffer. Note this does not affect the
b75a7d8f
A
3189* results of matches spanning the text and the normalization buffer.
3190* The position to start searching is taken from the collation element
3191* iterator. Callers of this API would have to set the offset in the collation
3192* element iterator before using this method.
3193*/
3194U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3195 UErrorCode *status)
729e4ab9 3196{
b75a7d8f 3197 if (U_SUCCESS(*status) && strsrch) {
374ca955
A
3198 // note offset is either equivalent to the start of the previous match
3199 // or is set by the user
3200 int32_t offset = usearch_getOffset(strsrch);
3201 USearch *search = strsrch->search;
3202 search->reset = FALSE;
3203 int32_t textlength = search->textLength;
b75a7d8f 3204 if (search->isForwardSearching) {
46f4442e 3205#if BOYER_MOORE
374ca955 3206 if (offset == textlength
729e4ab9 3207 || (!search->isOverlap &&
b75a7d8f 3208 (offset + strsrch->pattern.defaultShiftSize > textlength ||
729e4ab9 3209 (search->matchedIndex != USEARCH_DONE &&
374ca955 3210 offset + search->matchedLength >= textlength)))) {
b75a7d8f
A
3211 // not enough characters to match
3212 setMatchNotFound(strsrch);
729e4ab9 3213 return USEARCH_DONE;
b75a7d8f 3214 }
46f4442e
A
3215#else
3216 if (offset == textlength ||
3217 (! search->isOverlap &&
3218 (search->matchedIndex != USEARCH_DONE &&
3219 offset + search->matchedLength > textlength))) {
3220 // not enough characters to match
3221 setMatchNotFound(strsrch);
3222 return USEARCH_DONE;
3223 }
3224#endif
b75a7d8f
A
3225 }
3226 else {
729e4ab9
A
3227 // switching direction.
3228 // if matchedIndex == USEARCH_DONE, it means that either a
b75a7d8f 3229 // setOffset has been called or that previous ran off the text
729e4ab9 3230 // string. the iterator would have been set to offset 0 if a
b75a7d8f
A
3231 // match is not found.
3232 search->isForwardSearching = TRUE;
374ca955 3233 if (search->matchedIndex != USEARCH_DONE) {
b75a7d8f
A
3234 // there's no need to set the collation element iterator
3235 // the next call to next will set the offset.
374ca955 3236 return search->matchedIndex;
b75a7d8f
A
3237 }
3238 }
3239
3240 if (U_SUCCESS(*status)) {
b331163b 3241 if (strsrch->pattern.cesLength == 0) {
374ca955 3242 if (search->matchedIndex == USEARCH_DONE) {
b75a7d8f
A
3243 search->matchedIndex = offset;
3244 }
3245 else { // moves by codepoints
4388f060 3246 U16_FWD_1(search->text, search->matchedIndex, textlength);
b75a7d8f 3247 }
729e4ab9 3248
b75a7d8f
A
3249 search->matchedLength = 0;
3250 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3251 // status checked below
3252 if (search->matchedIndex == textlength) {
3253 search->matchedIndex = USEARCH_DONE;
3254 }
3255 }
3256 else {
374ca955
A
3257 if (search->matchedLength > 0) {
3258 // if matchlength is 0 we are at the start of the iteration
3259 if (search->isOverlap) {
3260 ucol_setOffset(strsrch->textIter, offset + 1, status);
3261 }
3262 else {
729e4ab9 3263 ucol_setOffset(strsrch->textIter,
374ca955
A
3264 offset + search->matchedLength, status);
3265 }
3266 }
3267 else {
3268 // for boundary check purposes. this will ensure that the
3269 // next match will not preceed the current offset
3270 // note search->matchedIndex will always be set to something
3271 // in the code
3272 search->matchedIndex = offset - 1;
3273 }
3274
3275 if (search->isCanonicalMatch) {
3276 // can't use exact here since extra accents are allowed.
3277 usearch_handleNextCanonical(strsrch, status);
3278 }
3279 else {
3280 usearch_handleNextExact(strsrch, status);
3281 }
3282 }
3283
b75a7d8f
A
3284 if (U_FAILURE(*status)) {
3285 return USEARCH_DONE;
3286 }
374ca955 3287
46f4442e
A
3288#if !BOYER_MOORE
3289 if (search->matchedIndex == USEARCH_DONE) {
3290 ucol_setOffset(strsrch->textIter, search->textLength, status);
3291 } else {
3292 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3293 }
3294#endif
3295
b75a7d8f
A
3296 return search->matchedIndex;
3297 }
3298 }
3299 return USEARCH_DONE;
3300}
3301
3302U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3303 UErrorCode *status)
3304{
3305 if (U_SUCCESS(*status) && strsrch) {
3306 int32_t offset;
3307 USearch *search = strsrch->search;
3308 if (search->reset) {
3309 offset = search->textLength;
3310 search->isForwardSearching = FALSE;
3311 search->reset = FALSE;
3312 setColEIterOffset(strsrch->textIter, offset);
3313 }
3314 else {
3315 offset = usearch_getOffset(strsrch);
3316 }
729e4ab9 3317
b75a7d8f
A
3318 int32_t matchedindex = search->matchedIndex;
3319 if (search->isForwardSearching == TRUE) {
729e4ab9
A
3320 // switching direction.
3321 // if matchedIndex == USEARCH_DONE, it means that either a
b75a7d8f 3322 // setOffset has been called or that next ran off the text
729e4ab9 3323 // string. the iterator would have been set to offset textLength if
b75a7d8f
A
3324 // a match is not found.
3325 search->isForwardSearching = FALSE;
3326 if (matchedindex != USEARCH_DONE) {
3327 return matchedindex;
3328 }
3329 }
3330 else {
46f4442e 3331#if BOYER_MOORE
b75a7d8f 3332 if (offset == 0 || matchedindex == 0 ||
729e4ab9 3333 (!search->isOverlap &&
b75a7d8f 3334 (offset < strsrch->pattern.defaultShiftSize ||
729e4ab9 3335 (matchedindex != USEARCH_DONE &&
b75a7d8f
A
3336 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3337 // not enough characters to match
3338 setMatchNotFound(strsrch);
729e4ab9 3339 return USEARCH_DONE;
b75a7d8f 3340 }
46f4442e
A
3341#else
3342 // Could check pattern length, but the
3343 // linear search will do the right thing
3344 if (offset == 0 || matchedindex == 0) {
3345 setMatchNotFound(strsrch);
3346 return USEARCH_DONE;
3347 }
3348#endif
b75a7d8f
A
3349 }
3350
3351 if (U_SUCCESS(*status)) {
b331163b 3352 if (strsrch->pattern.cesLength == 0) {
729e4ab9 3353 search->matchedIndex =
b75a7d8f
A
3354 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3355 if (search->matchedIndex == 0) {
3356 setMatchNotFound(strsrch);
3357 // status checked below
3358 }
3359 else { // move by codepoints
4388f060 3360 U16_BACK_1(search->text, 0, search->matchedIndex);
b75a7d8f
A
3361 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3362 // status checked below
3363 search->matchedLength = 0;
3364 }
3365 }
3366 else {
3367 if (strsrch->search->isCanonicalMatch) {
3368 // can't use exact here since extra accents are allowed.
3369 usearch_handlePreviousCanonical(strsrch, status);
3370 // status checked below
3371 }
3372 else {
3373 usearch_handlePreviousExact(strsrch, status);
3374 // status checked below
3375 }
3376 }
3377
3378 if (U_FAILURE(*status)) {
3379 return USEARCH_DONE;
3380 }
729e4ab9 3381
b75a7d8f
A
3382 return search->matchedIndex;
3383 }
3384 }
3385 return USEARCH_DONE;
3386}
3387
3388
729e4ab9 3389
b75a7d8f
A
3390U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3391{
729e4ab9
A
3392 /*
3393 reset is setting the attributes that are already in
b75a7d8f
A
3394 string search, hence all attributes in the collator should
3395 be retrieved without any problems
3396 */
3397 if (strsrch) {
3398 UErrorCode status = U_ZERO_ERROR;
3399 UBool sameCollAttribute = TRUE;
3400 uint32_t ceMask;
3401 UBool shift;
3402 uint32_t varTop;
3403
729e4ab9
A
3404 // **** hack to deal w/ how processed CEs encode quaternary ****
3405 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3406 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3407 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3408 sameCollAttribute = FALSE;
3409 }
3410
b75a7d8f
A
3411 strsrch->strength = ucol_getStrength(strsrch->collator);
3412 ceMask = getMask(strsrch->strength);
3413 if (strsrch->ceMask != ceMask) {
3414 strsrch->ceMask = ceMask;
3415 sameCollAttribute = FALSE;
3416 }
729e4ab9 3417
b75a7d8f 3418 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
729e4ab9 3419 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
b75a7d8f
A
3420 &status) == UCOL_SHIFTED;
3421 if (strsrch->toShift != shift) {
3422 strsrch->toShift = shift;
3423 sameCollAttribute = FALSE;
3424 }
3425
3426 // if status is a failure, ucol_getVariableTop returns 0
3427 varTop = ucol_getVariableTop(strsrch->collator, &status);
3428 if (strsrch->variableTop != varTop) {
3429 strsrch->variableTop = varTop;
3430 sameCollAttribute = FALSE;
3431 }
3432 if (!sameCollAttribute) {
3433 initialize(strsrch, &status);
3434 }
57a6839d 3435 ucol_setText(strsrch->textIter, strsrch->search->text,
729e4ab9 3436 strsrch->search->textLength,
729e4ab9 3437 &status);
b75a7d8f
A
3438 strsrch->search->matchedLength = 0;
3439 strsrch->search->matchedIndex = USEARCH_DONE;
3440 strsrch->search->isOverlap = FALSE;
3441 strsrch->search->isCanonicalMatch = FALSE;
729e4ab9 3442 strsrch->search->elementComparisonType = 0;
b75a7d8f
A
3443 strsrch->search->isForwardSearching = TRUE;
3444 strsrch->search->reset = TRUE;
3445 }
3446}
3447
46f4442e
A
3448//
3449// CEI Collation Element + source text index.
3450// These structs are kept in the circular buffer.
3451//
3452struct CEI {
3453 int64_t ce;
3454 int32_t lowIndex;
3455 int32_t highIndex;
3456};
3457
3458U_NAMESPACE_BEGIN
3459
57a6839d 3460namespace {
46f4442e 3461//
b331163b 3462// CEIBuffer A circular buffer of CEs-with-index from the text being searched.
46f4442e 3463//
4388f060
A
3464#define DEFAULT_CEBUFFER_SIZE 96
3465#define CEBUFFER_EXTRA 32
3466// Some typical max values to make buffer size more reasonable for asymmetric search.
3467// #8694 is for a better long-term solution to allocation of this buffer.
3468#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3469#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3470#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
b331163b 3471struct CEIBuffer {
46f4442e
A
3472 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3473 CEI *buf;
3474 int32_t bufSize;
3475 int32_t firstIx;
3476 int32_t limitIx;
3477 UCollationElements *ceIter;
3478 UStringSearch *strSearch;
3479
3480
3481
b331163b
A
3482 CEIBuffer(UStringSearch *ss, UErrorCode *status);
3483 ~CEIBuffer();
46f4442e
A
3484 const CEI *get(int32_t index);
3485 const CEI *getPrevious(int32_t index);
3486};
3487
3488
b331163b 3489CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
46f4442e
A
3490 buf = defBuf;
3491 strSearch = ss;
b331163b 3492 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
4388f060
A
3493 if (ss->search->elementComparisonType != 0) {
3494 const UChar * patText = ss->pattern.text;
3495 if (patText) {
3496 const UChar * patTextLimit = patText + ss->pattern.textLength;
3497 while ( patText < patTextLimit ) {
3498 UChar c = *patText++;
3499 if (MIGHT_BE_JAMO_L(c)) {
3500 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3501 } else {
3502 // No check for surrogates, we might allocate slightly more buffer than necessary.
3503 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3504 }
3505 }
3506 }
3507 }
46f4442e
A
3508 ceIter = ss->textIter;
3509 firstIx = 0;
3510 limitIx = 0;
3511
57a6839d 3512 if (!initTextProcessedIter(ss, status)) { return; }
46f4442e
A
3513
3514 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3515 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3516 if (buf == NULL) {
3517 *status = U_MEMORY_ALLOCATION_ERROR;
3518 }
3519 }
3520}
3521
3522// TODO: add a reset or init function so that allocated
3523// buffers can be retained & reused.
3524
b331163b 3525CEIBuffer::~CEIBuffer() {
46f4442e
A
3526 if (buf != defBuf) {
3527 uprv_free(buf);
3528 }
3529}
3530
3531
3532// Get the CE with the specified index.
3533// Index must be in the range
3534// n-history_size < index < n+1
3535// where n is the largest index to have been fetched by some previous call to this function.
3536// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3537//
b331163b 3538const CEI *CEIBuffer::get(int32_t index) {
46f4442e
A
3539 int i = index % bufSize;
3540
3541 if (index>=firstIx && index<limitIx) {
3542 // The request was for an entry already in our buffer.
3543 // Just return it.
3544 return &buf[i];
3545 }
3546
3547 // Caller is requesting a new, never accessed before, CE.
3548 // Verify that it is the next one in sequence, which is all
3549 // that is allowed.
3550 if (index != limitIx) {
3d1f044b
A
3551#if U_PLATFORM_IS_DARWIN_BASED
3552 os_log(OS_LOG_DEFAULT, "# CEIBuffer::get param err, index %d limitIx %d", index, limitIx); // <rdar://51554495>
3553#endif
46f4442e
A
3554 U_ASSERT(FALSE);
3555
3556 return NULL;
3557 }
3558
3559 // Manage the circular CE buffer indexing
3560 limitIx++;
3561
3562 if (limitIx - firstIx >= bufSize) {
3563 // The buffer is full, knock out the lowest-indexed entry.
3564 firstIx++;
3565 }
3566
3567 UErrorCode status = U_ZERO_ERROR;
3568
57a6839d 3569 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
46f4442e
A
3570
3571 return &buf[i];
3572}
3573
3574// Get the CE with the specified index.
3575// Index must be in the range
3576// n-history_size < index < n+1
3577// where n is the largest index to have been fetched by some previous call to this function.
3578// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3579//
b331163b 3580const CEI *CEIBuffer::getPrevious(int32_t index) {
46f4442e
A
3581 int i = index % bufSize;
3582
3583 if (index>=firstIx && index<limitIx) {
3584 // The request was for an entry already in our buffer.
3585 // Just return it.
3586 return &buf[i];
3587 }
3588
3589 // Caller is requesting a new, never accessed before, CE.
3590 // Verify that it is the next one in sequence, which is all
3591 // that is allowed.
3592 if (index != limitIx) {
3d1f044b
A
3593#if U_PLATFORM_IS_DARWIN_BASED
3594 os_log(OS_LOG_DEFAULT, "# CEIBuffer::getPrevious param err, index %d limitIx %d", index, limitIx); // <rdar://51554495>
3595#endif
46f4442e
A
3596 U_ASSERT(FALSE);
3597
3598 return NULL;
3599 }
3600
3601 // Manage the circular CE buffer indexing
3602 limitIx++;
3603
3604 if (limitIx - firstIx >= bufSize) {
3605 // The buffer is full, knock out the lowest-indexed entry.
3606 firstIx++;
3607 }
3608
3609 UErrorCode status = U_ZERO_ERROR;
3610
57a6839d 3611 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
46f4442e
A
3612
3613 return &buf[i];
3614}
3615
57a6839d
A
3616}
3617
46f4442e
A
3618U_NAMESPACE_END
3619
3620
3621// #define USEARCH_DEBUG
3622
3623#ifdef USEARCH_DEBUG
3624#include <stdio.h>
3625#include <stdlib.h>
3626#endif
3627
3628/*
3629 * Find the next break boundary after startIndex. If the UStringSearch object
3630 * has an external break iterator, use that. Otherwise use the internal character
3631 * break iterator.
3632 */
3633static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3634#if 0
3635 const UChar *text = strsrch->search->text;
3636 int32_t textLen = strsrch->search->textLength;
729e4ab9 3637
46f4442e
A
3638 U_ASSERT(startIndex>=0);
3639 U_ASSERT(startIndex<=textLen);
729e4ab9 3640
46f4442e
A
3641 if (startIndex >= textLen) {
3642 return startIndex;
3643 }
3644
3645 UChar32 c;
3646 int32_t i = startIndex;
3647 U16_NEXT(text, i, textLen, c);
729e4ab9 3648
46f4442e
A
3649 // If we are on a control character, stop without looking for combining marks.
3650 // Control characters do not combine.
3651 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3652 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3653 return i;
3654 }
729e4ab9 3655
46f4442e
A
3656 // The initial character was not a control, and can thus accept trailing
3657 // combining characters. Advance over however many of them there are.
3658 int32_t indexOfLastCharChecked;
3659 for (;;) {
3660 indexOfLastCharChecked = i;
3661 if (i>=textLen) {
3662 break;
3663 }
3664 U16_NEXT(text, i, textLen, c);
3665 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3666 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3667 break;
3668 }
3669 }
3670 return indexOfLastCharChecked;
3671#elif !UCONFIG_NO_BREAK_ITERATION
3672 UBreakIterator *breakiterator = strsrch->search->breakIter;
3673
3674 if (breakiterator == NULL) {
3675 breakiterator = strsrch->search->internalBreakIter;
3676 }
3677
3678 if (breakiterator != NULL) {
4388f060 3679 return ubrk_following(breakiterator, startIndex);
46f4442e
A
3680 }
3681
3682 return startIndex;
3683#else
3684 // **** or should we use the original code? ****
3685 return startIndex;
3686#endif
3687
3688}
3689
3690/*
3691 * Returns TRUE if index is on a break boundary. If the UStringSearch
3692 * has an external break iterator, test using that, otherwise test
3693 * using the internal character break iterator.
3694 */
3695static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3696#if 0
3697 const UChar *text = strsrch->search->text;
3698 int32_t textLen = strsrch->search->textLength;
729e4ab9 3699
46f4442e
A
3700 U_ASSERT(index>=0);
3701 U_ASSERT(index<=textLen);
729e4ab9 3702
46f4442e 3703 if (index>=textLen || index<=0) {
4388f060 3704 return TRUE;
46f4442e 3705 }
729e4ab9 3706
46f4442e
A
3707 // If the character at the current index is not a GRAPHEME_EXTEND
3708 // then we can not be within a combining sequence.
3709 UChar32 c;
3710 U16_GET(text, 0, index, textLen, c);
3711 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3712 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
4388f060 3713 return TRUE;
46f4442e 3714 }
729e4ab9 3715
46f4442e
A
3716 // We are at a combining mark. If the preceding character is anything
3717 // except a CONTROL, CR or LF, we are in a combining sequence.
729e4ab9 3718 U16_PREV(text, 0, index, c);
46f4442e 3719 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
729e4ab9 3720 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
4388f060 3721 return !combining;
46f4442e
A
3722#elif !UCONFIG_NO_BREAK_ITERATION
3723 UBreakIterator *breakiterator = strsrch->search->breakIter;
3724
3725 if (breakiterator == NULL) {
3726 breakiterator = strsrch->search->internalBreakIter;
3727 }
3728
4388f060 3729 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
46f4442e
A
3730#else
3731 // **** or use the original code? ****
4388f060 3732 return TRUE;
46f4442e 3733#endif
729e4ab9 3734}
46f4442e
A
3735
3736#if 0
3737static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3738{
3739#if !UCONFIG_NO_BREAK_ITERATION
3740 UBreakIterator *breakiterator = strsrch->search->breakIter;
3741
3742 if (breakiterator != NULL) {
3743 int32_t startindex = ubrk_first(breakiterator);
3744 int32_t endindex = ubrk_last(breakiterator);
729e4ab9 3745
46f4442e
A
3746 // out-of-range indexes are never boundary positions
3747 if (start < startindex || start > endindex ||
3748 end < startindex || end > endindex) {
3749 return FALSE;
3750 }
3751
729e4ab9 3752 return ubrk_isBoundary(breakiterator, start) &&
46f4442e
A
3753 ubrk_isBoundary(breakiterator, end);
3754 }
3755#endif
3756
3757 return TRUE;
3758}
3759#endif
3760
729e4ab9
A
3761typedef enum {
3762 U_CE_MATCH = -1,
3763 U_CE_NO_MATCH = 0,
3764 U_CE_SKIP_TARG,
3765 U_CE_SKIP_PATN
3766} UCompareCEsResult;
3767#define U_CE_LEVEL2_BASE 0x00000005
3768#define U_CE_LEVEL3_BASE 0x00050000
3769
3770static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3771 if (targCE == patCE) {
3772 return U_CE_MATCH;
3773 }
3774 if (compareType == 0) {
3775 return U_CE_NO_MATCH;
3776 }
46f4442e 3777
729e4ab9
A
3778 int64_t targCEshifted = targCE >> 32;
3779 int64_t patCEshifted = patCE >> 32;
3780 int64_t mask;
3781
3782 mask = 0xFFFF0000;
3783 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3784 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3785 if ( targLev1 != patLev1 ) {
3786 if ( targLev1 == 0 ) {
3787 return U_CE_SKIP_TARG;
3788 }
3789 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3790 return U_CE_SKIP_PATN;
3791 }
3792 return U_CE_NO_MATCH;
3793 }
3794
3795 mask = 0x0000FFFF;
3796 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3797 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3798 if ( targLev2 != patLev2 ) {
3799 if ( targLev2 == 0 ) {
3800 return U_CE_SKIP_TARG;
3801 }
3802 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3803 return U_CE_SKIP_PATN;
3804 }
3805 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3806 U_CE_MATCH: U_CE_NO_MATCH;
3807 }
3808
3809 mask = 0xFFFF0000;
3810 int32_t targLev3 = (int32_t)(targCE & mask);
3811 int32_t patLev3 = (int32_t)(patCE & mask);
3812 if ( targLev3 != patLev3 ) {
3813 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3814 U_CE_MATCH: U_CE_NO_MATCH;
3815 }
3816
3817 return U_CE_MATCH;
3818}
3819
3820#if BOYER_MOORE
3821// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3822#endif
3823
2ca993e8
A
3824namespace {
3825
3826UChar32 codePointAt(const USearch &search, int32_t index) {
3827 if (index < search.textLength) {
3828 UChar32 c;
3829 U16_NEXT(search.text, index, search.textLength, c);
3830 return c;
3831 }
3832 return U_SENTINEL;
3833}
3834
3835UChar32 codePointBefore(const USearch &search, int32_t index) {
3836 if (0 < index) {
3837 UChar32 c;
3838 U16_PREV(search.text, 0, index, c);
3839 return c;
3840 }
3841 return U_SENTINEL;
3842}
3843
3844} // namespace
3845
46f4442e
A
3846U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3847 int32_t startIdx,
3848 int32_t *matchStart,
3849 int32_t *matchLimit,
729e4ab9 3850 UErrorCode *status)
46f4442e
A
3851{
3852 if (U_FAILURE(*status)) {
3853 return FALSE;
3854 }
3855
3856 // TODO: reject search patterns beginning with a combining char.
3857
3858#ifdef USEARCH_DEBUG
3859 if (getenv("USEARCH_DEBUG") != NULL) {
3860 printf("Pattern CEs\n");
b331163b
A
3861 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3862 printf(" %8x", strsrch->pattern.ces[ii]);
46f4442e
A
3863 }
3864 printf("\n");
3865 }
729e4ab9 3866
46f4442e
A
3867#endif
3868 // Input parameter sanity check.
3869 // TODO: should input indicies clip to the text length
3870 // in the same way that UText does.
b331163b 3871 if(strsrch->pattern.cesLength == 0 ||
46f4442e
A
3872 startIdx < 0 ||
3873 startIdx > strsrch->search->textLength ||
b331163b 3874 strsrch->pattern.ces == NULL) {
46f4442e
A
3875 *status = U_ILLEGAL_ARGUMENT_ERROR;
3876 return FALSE;
3877 }
3878
b331163b 3879 if (strsrch->pattern.pces == NULL) {
46f4442e
A
3880 initializePatternPCETable(strsrch, status);
3881 }
3882
3883 ucol_setOffset(strsrch->textIter, startIdx, status);
b331163b 3884 CEIBuffer ceb(strsrch, status);
46f4442e 3885
729e4ab9
A
3886
3887 int32_t targetIx = 0;
3888 const CEI *targetCEI = NULL;
46f4442e
A
3889 int32_t patIx;
3890 UBool found;
3891
3892 int32_t mStart = -1;
3893 int32_t mLimit = -1;
3894 int32_t minLimit;
3895 int32_t maxLimit;
729e4ab9
A
3896
3897
3898
46f4442e
A
3899 // Outer loop moves over match starting positions in the
3900 // target CE space.
729e4ab9
A
3901 // Here we see the target as a sequence of collation elements, resulting from the following:
3902 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3903 // (for example, digraphs such as IJ may be broken into two characters).
3904 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3905 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3906 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3907 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3908 // then the CE is deleted, so the following code sees only CEs that are relevant.
3909 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3910 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3911 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3912 //
46f4442e
A
3913 for(targetIx=0; ; targetIx++)
3914 {
3915 found = TRUE;
3916 // Inner loop checks for a match beginning at each
3917 // position from the outer loop.
729e4ab9
A
3918 int32_t targetIxOffset = 0;
3919 int64_t patCE = 0;
4388f060
A
3920 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3921 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3922 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3923 const CEI *firstCEI = ceb.get(targetIx);
3924 if (firstCEI == NULL) {
3925 *status = U_INTERNAL_PROGRAM_ERROR;
3926 found = FALSE;
3927 break;
3928 }
3929
b331163b
A
3930 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3931 patCE = strsrch->pattern.pces[patIx];
729e4ab9 3932 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
46f4442e 3933 // Compare CE from target string with CE from the pattern.
729e4ab9 3934 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
46f4442e 3935 // which will fail the compare, below.
729e4ab9
A
3936 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3937 if ( ceMatch == U_CE_NO_MATCH ) {
46f4442e
A
3938 found = FALSE;
3939 break;
729e4ab9
A
3940 } else if ( ceMatch > U_CE_NO_MATCH ) {
3941 if ( ceMatch == U_CE_SKIP_TARG ) {
3942 // redo with same patCE, next targCE
3943 patIx--;
3944 targetIxOffset++;
3945 } else { // ceMatch == U_CE_SKIP_PATN
3946 // redo with same targCE, next patCE
3947 targetIxOffset--;
3948 }
46f4442e
A
3949 }
3950 }
b331163b 3951 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
46f4442e 3952
729e4ab9 3953 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
46f4442e
A
3954 // No match at this targetIx. Try again at the next.
3955 continue;
3956 }
3957
3958 if (!found) {
3959 // No match at all, we have run off the end of the target text.
3960 break;
3961 }
3962
3963
3964 // We have found a match in CE space.
3965 // Now determine the bounds in string index space.
3966 // There still is a chance of match failure if the CE range not correspond to
3967 // an acceptable character range.
3968 //
729e4ab9 3969 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
46f4442e 3970
46f4442e
A
3971 mStart = firstCEI->lowIndex;
3972 minLimit = lastCEI->lowIndex;
46f4442e
A
3973
3974 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3975 // extended to the end of input, and the match is good.
3976
3977 // Look at the high and low indices of the CE following the match. If
3978 // they are the same it means one of two things:
3979 // 1. The match extended to the last CE from the target text, which is OK, or
3980 // 2. The last CE that was part of the match is in an expansion that extends
3981 // to the first CE after the match. In this case, we reject the match.
4388f060 3982 const CEI *nextCEI = 0;
729e4ab9 3983 if (strsrch->search->elementComparisonType == 0) {
4388f060 3984 nextCEI = ceb.get(targetIx + targetIxOffset);
729e4ab9
A
3985 maxLimit = nextCEI->lowIndex;
3986 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3987 found = FALSE;
3988 }
3989 } else {
729e4ab9
A
3990 for ( ; ; ++targetIxOffset ) {
3991 nextCEI = ceb.get(targetIx + targetIxOffset);
3992 maxLimit = nextCEI->lowIndex;
4388f060 3993 // If we are at the end of the target too, match succeeds
729e4ab9
A
3994 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3995 break;
3996 }
3997 // As long as the next CE has primary weight of 0,
3998 // it is part of the last target element matched by the pattern;
3999 // make sure it can be part of a match with the last patCE
4000 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
4388f060
A
4001 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
4002 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
4003 found = FALSE;
4004 break;
4005 }
729e4ab9
A
4006 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
4007 // target element, but it has non-zero primary weight => match fails
4008 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
4388f060
A
4009 found = false;
4010 break;
729e4ab9
A
4011 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4012 } else {
4388f060 4013 break;
729e4ab9
A
4014 }
4015 }
46f4442e 4016 }
729e4ab9 4017
46f4442e
A
4018
4019 // Check for the start of the match being within a combining sequence.
4020 // This can happen if the pattern itself begins with a combining char, and
4021 // the match found combining marks in the target text that were attached
4022 // to something else.
4023 // This type of match should be rejected for not completely consuming a
4024 // combining sequence.
4388f060 4025 if (!isBreakBoundary(strsrch, mStart)) {
46f4442e
A
4026 found = FALSE;
4027 }
4028
4029 // Check for the start of the match being within an Collation Element Expansion,
4030 // meaning that the first char of the match is only partially matched.
729e4ab9 4031 // With exapnsions, the first CE will report the index of the source
46f4442e 4032 // character, and all subsequent (expansions) CEs will report the source index of the
729e4ab9 4033 // _following_ character.
46f4442e
A
4034 int32_t secondIx = firstCEI->highIndex;
4035 if (mStart == secondIx) {
4036 found = FALSE;
4037 }
729e4ab9 4038
2ca993e8
A
4039 // Allow matches to end in the middle of a grapheme cluster if the following
4040 // conditions are met; this is needed to make prefix search work properly in
4041 // Indic, see #11750
4042 // * the default breakIter is being used
4043 // * the next collation element after this combining sequence
4044 // - has non-zero primary weight
4045 // - corresponds to a separate character following the one at end of the current match
4046 // (the second of these conditions, and perhaps both, may be redundant given the
4047 // subsequent check for normalization boundary; however they are likely much faster
4048 // tests in any case)
4049 // * the match limit is a normalization boundary
4050 UBool allowMidclusterMatch = FALSE;
4051 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4052 allowMidclusterMatch =
4053 strsrch->search->breakIter == NULL &&
4054 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4055 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4056 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4057 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4058 }
4059 // If those conditions are met, then:
4060 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4061 // the match limit may be backed off to a previous break boundary. This handles
4062 // cases in which mLimit includes target characters that are ignorable with current
4063 // settings (such as space) and which extend beyond the pattern match.
4064 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4065 // * do NOT require that match limit be on a breakIter boundary
4066
46f4442e
A
4067 // Advance the match end position to the first acceptable match boundary.
4068 // This advances the index over any combining charcters.
4069 mLimit = maxLimit;
4070 if (minLimit < maxLimit) {
4388f060
A
4071 // When the last CE's low index is same with its high index, the CE is likely
4072 // a part of expansion. In this case, the index is located just after the
4073 // character corresponding to the CEs compared above. If the index is right
4074 // at the break boundary, move the position to the next boundary will result
4075 // incorrect match length when there are ignorable characters exist between
4076 // the position and the next character produces CE(s). See ticket#8482.
4077 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4078 mLimit = minLimit;
4079 } else {
4080 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
2ca993e8
A
4081 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4082 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4083 // (i.e. we back off mLimit to the previous breakIterator boundary).
4084 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4388f060
A
4085 mLimit = nba;
4086 }
46f4442e
A
4087 }
4088 }
729e4ab9 4089
46f4442e
A
4090 #ifdef USEARCH_DEBUG
4091 if (getenv("USEARCH_DEBUG") != NULL) {
4092 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4093 }
4094 #endif
729e4ab9 4095
2ca993e8 4096 if (!allowMidclusterMatch) {
b331163b
A
4097 // If advancing to the end of a combining sequence in character indexing space
4098 // advanced us beyond the end of the match in CE space, reject this match.
4099 if (mLimit > maxLimit) {
4100 found = FALSE;
4101 }
46f4442e 4102
b331163b
A
4103 if (!isBreakBoundary(strsrch, mLimit)) {
4104 found = FALSE;
4105 }
46f4442e
A
4106 }
4107
729e4ab9
A
4108 if (! checkIdentical(strsrch, mStart, mLimit)) {
4109 found = FALSE;
4110 }
4111
46f4442e
A
4112 if (found) {
4113 break;
4114 }
4115 }
4116
4117 #ifdef USEARCH_DEBUG
4118 if (getenv("USEARCH_DEBUG") != NULL) {
4119 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4120 int32_t lastToPrint = ceb.limitIx+2;
4121 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4122 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4123 }
4124 printf("\n%s\n", found? "match found" : "no match");
4125 }
4126 #endif
4127
4128 // All Done. Store back the match bounds to the caller.
4129 //
4130 if (found==FALSE) {
4131 mLimit = -1;
4132 mStart = -1;
4133 }
4134
4135 if (matchStart != NULL) {
4136 *matchStart= mStart;
4137 }
4138
4139 if (matchLimit != NULL) {
4140 *matchLimit = mLimit;
4141 }
4142
4143 return found;
4144}
4145
46f4442e
A
4146U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4147 int32_t startIdx,
4148 int32_t *matchStart,
4149 int32_t *matchLimit,
729e4ab9 4150 UErrorCode *status)
46f4442e
A
4151{
4152 if (U_FAILURE(*status)) {
4153 return FALSE;
4154 }
4155
4156 // TODO: reject search patterns beginning with a combining char.
4157
4158#ifdef USEARCH_DEBUG
4159 if (getenv("USEARCH_DEBUG") != NULL) {
4160 printf("Pattern CEs\n");
b331163b
A
4161 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4162 printf(" %8x", strsrch->pattern.ces[ii]);
46f4442e
A
4163 }
4164 printf("\n");
4165 }
729e4ab9 4166
46f4442e
A
4167#endif
4168 // Input parameter sanity check.
4169 // TODO: should input indicies clip to the text length
4170 // in the same way that UText does.
b331163b 4171 if(strsrch->pattern.cesLength == 0 ||
46f4442e
A
4172 startIdx < 0 ||
4173 startIdx > strsrch->search->textLength ||
b331163b 4174 strsrch->pattern.ces == NULL) {
46f4442e
A
4175 *status = U_ILLEGAL_ARGUMENT_ERROR;
4176 return FALSE;
4177 }
4178
b331163b 4179 if (strsrch->pattern.pces == NULL) {
46f4442e
A
4180 initializePatternPCETable(strsrch, status);
4181 }
4182
b331163b 4183 CEIBuffer ceb(strsrch, status);
729e4ab9 4184 int32_t targetIx = 0;
46f4442e
A
4185
4186 /*
4187 * Pre-load the buffer with the CE's for the grapheme
4188 * after our starting position so that we're sure that
4189 * we can look at the CE following the match when we
4190 * check the match boundaries.
4191 *
4192 * This will also pre-fetch the first CE that we'll
4193 * consider for the match.
4194 */
4195 if (startIdx < strsrch->search->textLength) {
4196 UBreakIterator *bi = strsrch->search->internalBreakIter;
4197 int32_t next = ubrk_following(bi, startIdx);
4198
4199 ucol_setOffset(strsrch->textIter, next, status);
4200
4201 for (targetIx = 0; ; targetIx += 1) {
4202 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4203 break;
4204 }
4205 }
4206 } else {
4207 ucol_setOffset(strsrch->textIter, startIdx, status);
4208 }
46f4442e 4209
729e4ab9
A
4210
4211 const CEI *targetCEI = NULL;
46f4442e
A
4212 int32_t patIx;
4213 UBool found;
4214
4215 int32_t limitIx = targetIx;
4216 int32_t mStart = -1;
4217 int32_t mLimit = -1;
4218 int32_t minLimit;
4219 int32_t maxLimit;
729e4ab9
A
4220
4221
4222
46f4442e
A
4223 // Outer loop moves over match starting positions in the
4224 // target CE space.
729e4ab9
A
4225 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4226 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4227 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4228 // and the beginning of the base text.
46f4442e
A
4229 for(targetIx = limitIx; ; targetIx += 1)
4230 {
4231 found = TRUE;
4388f060
A
4232 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4233 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4234 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4235 const CEI *lastCEI = ceb.getPrevious(targetIx);
4236 if (lastCEI == NULL) {
4237 *status = U_INTERNAL_PROGRAM_ERROR;
4238 found = FALSE;
4239 break;
4240 }
46f4442e
A
4241 // Inner loop checks for a match beginning at each
4242 // position from the outer loop.
729e4ab9 4243 int32_t targetIxOffset = 0;
b331163b
A
4244 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4245 int64_t patCE = strsrch->pattern.pces[patIx];
46f4442e 4246
b331163b 4247 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
46f4442e
A
4248 // Compare CE from target string with CE from the pattern.
4249 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4250 // which will fail the compare, below.
729e4ab9
A
4251 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4252 if ( ceMatch == U_CE_NO_MATCH ) {
46f4442e
A
4253 found = FALSE;
4254 break;
729e4ab9
A
4255 } else if ( ceMatch > U_CE_NO_MATCH ) {
4256 if ( ceMatch == U_CE_SKIP_TARG ) {
4257 // redo with same patCE, next targCE
4258 patIx++;
4259 targetIxOffset++;
4260 } else { // ceMatch == U_CE_SKIP_PATN
4261 // redo with same targCE, next patCE
4262 targetIxOffset--;
4263 }
46f4442e
A
4264 }
4265 }
4266
729e4ab9 4267 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
46f4442e
A
4268 // No match at this targetIx. Try again at the next.
4269 continue;
4270 }
4271
4272 if (!found) {
4273 // No match at all, we have run off the end of the target text.
4274 break;
4275 }
4276
4277
4278 // We have found a match in CE space.
4279 // Now determine the bounds in string index space.
4280 // There still is a chance of match failure if the CE range not correspond to
4281 // an acceptable character range.
4282 //
b331163b 4283 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
46f4442e 4284 mStart = firstCEI->lowIndex;
46f4442e
A
4285
4286 // Check for the start of the match being within a combining sequence.
4287 // This can happen if the pattern itself begins with a combining char, and
4288 // the match found combining marks in the target text that were attached
4289 // to something else.
4290 // This type of match should be rejected for not completely consuming a
4291 // combining sequence.
4388f060 4292 if (!isBreakBoundary(strsrch, mStart)) {
46f4442e
A
4293 found = FALSE;
4294 }
4295
4296 // Look at the high index of the first CE in the match. If it's the same as the
4297 // low index, the first CE in the match is in the middle of an expansion.
4298 if (mStart == firstCEI->highIndex) {
4299 found = FALSE;
4300 }
729e4ab9 4301
46f4442e 4302
4388f060
A
4303 minLimit = lastCEI->lowIndex;
4304
4305 if (targetIx > 0) {
4306 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4307 // extended to the end of input, and the match is good.
4308
4309 // Look at the high and low indices of the CE following the match. If
4310 // they are the same it means one of two things:
4311 // 1. The match extended to the last CE from the target text, which is OK, or
4312 // 2. The last CE that was part of the match is in an expansion that extends
4313 // to the first CE after the match. In this case, we reject the match.
4314 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4315
4316 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4317 found = FALSE;
4318 }
4319
4320 mLimit = maxLimit = nextCEI->lowIndex;
4321
2ca993e8
A
4322 // Allow matches to end in the middle of a grapheme cluster if the following
4323 // conditions are met; this is needed to make prefix search work properly in
4324 // Indic, see #11750
4325 // * the default breakIter is being used
4326 // * the next collation element after this combining sequence
4327 // - has non-zero primary weight
4328 // - corresponds to a separate character following the one at end of the current match
4329 // (the second of these conditions, and perhaps both, may be redundant given the
4330 // subsequent check for normalization boundary; however they are likely much faster
4331 // tests in any case)
4332 // * the match limit is a normalization boundary
4333 UBool allowMidclusterMatch = FALSE;
4334 if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4335 allowMidclusterMatch =
4336 strsrch->search->breakIter == NULL &&
4337 nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4338 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4339 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4340 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4341 }
4342 // If those conditions are met, then:
4343 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4344 // the match limit may be backed off to a previous break boundary. This handles
4345 // cases in which mLimit includes target characters that are ignorable with current
4346 // settings (such as space) and which extend beyond the pattern match.
4347 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4348 // * do NOT require that match limit be on a breakIter boundary
4349
4388f060 4350 // Advance the match end position to the first acceptable match boundary.
b331163b 4351 // This advances the index over any combining characters.
4388f060
A
4352 if (minLimit < maxLimit) {
4353 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
2ca993e8
A
4354 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4355 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4356 // (i.e. we back off mLimit to the previous breakIterator boundary).
4357 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4388f060
A
4358 mLimit = nba;
4359 }
46f4442e 4360 }
4388f060 4361
2ca993e8 4362 if (!allowMidclusterMatch) {
b331163b
A
4363 // If advancing to the end of a combining sequence in character indexing space
4364 // advanced us beyond the end of the match in CE space, reject this match.
4365 if (mLimit > maxLimit) {
4366 found = FALSE;
4367 }
4388f060 4368
b331163b
A
4369 // Make sure the end of the match is on a break boundary
4370 if (!isBreakBoundary(strsrch, mLimit)) {
4371 found = FALSE;
4372 }
4388f060
A
4373 }
4374
4375 } else {
4376 // No non-ignorable CEs after this point.
4377 // The maximum position is detected by boundary after
4378 // the last non-ignorable CE. Combining sequence
4379 // across the start index will be truncated.
4380 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4381 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
46f4442e 4382 }
729e4ab9 4383
46f4442e
A
4384 #ifdef USEARCH_DEBUG
4385 if (getenv("USEARCH_DEBUG") != NULL) {
4386 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4387 }
4388 #endif
729e4ab9 4389
46f4442e 4390
729e4ab9
A
4391 if (! checkIdentical(strsrch, mStart, mLimit)) {
4392 found = FALSE;
4393 }
4394
46f4442e
A
4395 if (found) {
4396 break;
4397 }
4398 }
4399
4400 #ifdef USEARCH_DEBUG
4401 if (getenv("USEARCH_DEBUG") != NULL) {
4402 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4403 int32_t lastToPrint = ceb.limitIx+2;
4404 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4405 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4406 }
4407 printf("\n%s\n", found? "match found" : "no match");
4408 }
4409 #endif
4410
4411 // All Done. Store back the match bounds to the caller.
4412 //
4413 if (found==FALSE) {
4414 mLimit = -1;
4415 mStart = -1;
4416 }
4417
4418 if (matchStart != NULL) {
4419 *matchStart= mStart;
4420 }
4421
4422 if (matchLimit != NULL) {
4423 *matchLimit = mLimit;
4424 }
4425
4426 return found;
4427}
4428
b75a7d8f
A
4429// internal use methods declared in usrchimp.h -----------------------------
4430
4431UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4432{
4433 if (U_FAILURE(*status)) {
4434 setMatchNotFound(strsrch);
4435 return FALSE;
4436 }
4437
46f4442e 4438#if BOYER_MOORE
374ca955 4439 UCollationElements *coleiter = strsrch->textIter;
b75a7d8f 4440 int32_t textlength = strsrch->search->textLength;
b331163b
A
4441 int32_t *patternce = strsrch->pattern.ces;
4442 int32_t patterncelength = strsrch->pattern.cesLength;
b75a7d8f
A
4443 int32_t textoffset = ucol_getOffset(coleiter);
4444
374ca955 4445 // status used in setting coleiter offset, since offset is checked in
729e4ab9 4446 // shiftForward before setting the coleiter offset, status never
374ca955 4447 // a failure
729e4ab9 4448 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4449 patterncelength);
4450 while (textoffset <= textlength)
4451 {
4452 uint32_t patternceindex = patterncelength - 1;
374ca955 4453 int32_t targetce;
b75a7d8f 4454 UBool found = FALSE;
374ca955
A
4455 int32_t lastce = UCOL_NULLORDER;
4456
4457 setColEIterOffset(coleiter, textoffset);
4458
46f4442e 4459 for (;;) {
b75a7d8f
A
4460 // finding the last pattern ce match, imagine composite characters
4461 // for example: search for pattern A in text \u00C0
4462 // we'll have to skip \u0300 the grave first before we get to A
4463 targetce = ucol_previous(coleiter, status);
4464 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4465 found = FALSE;
4466 break;
4467 }
4468 targetce = getCE(strsrch, targetce);
729e4ab9
A
4469 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4470 // this is for the text \u0315\u0300 that requires
b75a7d8f
A
4471 // normalization and pattern \u0300, where \u0315 is ignorable
4472 continue;
4473 }
4474 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4475 lastce = targetce;
4476 }
729e4ab9 4477 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4478 if (targetce == patternce[patternceindex]) {
4479 // the first ce can be a contraction
4480 found = TRUE;
4481 break;
4482 }
4483 if (!hasExpansion(coleiter)) {
4484 found = FALSE;
4485 break;
4486 }
4487 }
4488
46f4442e 4489 //targetce = lastce;
729e4ab9 4490
b75a7d8f 4491 while (found && patternceindex > 0) {
4388f060 4492 lastce = targetce;
b75a7d8f
A
4493 targetce = ucol_previous(coleiter, status);
4494 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4495 found = FALSE;
4496 break;
4497 }
4498 targetce = getCE(strsrch, targetce);
4499 if (targetce == UCOL_IGNORABLE) {
4500 continue;
4501 }
4502
4503 patternceindex --;
729e4ab9
A
4504 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4505 found = found && targetce == patternce[patternceindex];
b75a7d8f 4506 }
729e4ab9 4507
46f4442e 4508 targetce = lastce;
b75a7d8f
A
4509
4510 if (!found) {
374ca955
A
4511 if (U_FAILURE(*status)) {
4512 break;
4513 }
729e4ab9 4514 textoffset = shiftForward(strsrch, textoffset, lastce,
b75a7d8f
A
4515 patternceindex);
4516 // status checked at loop.
4517 patternceindex = patterncelength;
4518 continue;
4519 }
374ca955
A
4520
4521 if (checkNextExactMatch(strsrch, &textoffset, status)) {
b75a7d8f 4522 // status checked in ucol_setOffset
374ca955
A
4523 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4524 return TRUE;
b75a7d8f
A
4525 }
4526 }
4527 setMatchNotFound(strsrch);
374ca955 4528 return FALSE;
46f4442e
A
4529#else
4530 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4531 int32_t start = -1;
4532 int32_t end = -1;
4533
4534 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4535 strsrch->search->matchedIndex = start;
4536 strsrch->search->matchedLength = end - start;
4537 return TRUE;
4538 } else {
4539 setMatchNotFound(strsrch);
4540 return FALSE;
4541 }
4542#endif
b75a7d8f
A
4543}
4544
4545UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4546{
4547 if (U_FAILURE(*status)) {
4548 setMatchNotFound(strsrch);
4549 return FALSE;
4550 }
4551
46f4442e 4552#if BOYER_MOORE
b75a7d8f
A
4553 UCollationElements *coleiter = strsrch->textIter;
4554 int32_t textlength = strsrch->search->textLength;
b331163b
A
4555 int32_t *patternce = strsrch->pattern.ces;
4556 int32_t patterncelength = strsrch->pattern.cesLength;
374ca955 4557 int32_t textoffset = ucol_getOffset(coleiter);
729e4ab9 4558 UBool hasPatternAccents =
b75a7d8f 4559 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
729e4ab9
A
4560
4561 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4562 patterncelength);
4563 strsrch->canonicalPrefixAccents[0] = 0;
4564 strsrch->canonicalSuffixAccents[0] = 0;
729e4ab9 4565
b75a7d8f
A
4566 while (textoffset <= textlength)
4567 {
4568 int32_t patternceindex = patterncelength - 1;
374ca955 4569 int32_t targetce;
b75a7d8f 4570 UBool found = FALSE;
374ca955
A
4571 int32_t lastce = UCOL_NULLORDER;
4572
4573 setColEIterOffset(coleiter, textoffset);
4574
4575 for (;;) {
b75a7d8f
A
4576 // finding the last pattern ce match, imagine composite characters
4577 // for example: search for pattern A in text \u00C0
4578 // we'll have to skip \u0300 the grave first before we get to A
4579 targetce = ucol_previous(coleiter, status);
4580 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4581 found = FALSE;
4582 break;
4583 }
4584 targetce = getCE(strsrch, targetce);
4585 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4586 lastce = targetce;
4587 }
729e4ab9 4588 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4589 if (targetce == patternce[patternceindex]) {
4590 // the first ce can be a contraction
4591 found = TRUE;
4592 break;
4593 }
4594 if (!hasExpansion(coleiter)) {
4595 found = FALSE;
4596 break;
4597 }
4598 }
729e4ab9 4599
b75a7d8f
A
4600 while (found && patternceindex > 0) {
4601 targetce = ucol_previous(coleiter, status);
4602 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4603 found = FALSE;
4604 break;
4605 }
4606 targetce = getCE(strsrch, targetce);
4607 if (targetce == UCOL_IGNORABLE) {
4608 continue;
4609 }
4610
4611 patternceindex --;
729e4ab9
A
4612 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4613 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4614 }
4615
4616 // initializing the rearranged accent array
4617 if (hasPatternAccents && !found) {
4618 strsrch->canonicalPrefixAccents[0] = 0;
4619 strsrch->canonicalSuffixAccents[0] = 0;
374ca955
A
4620 if (U_FAILURE(*status)) {
4621 break;
4622 }
b75a7d8f
A
4623 found = doNextCanonicalMatch(strsrch, textoffset, status);
4624 }
4625
4626 if (!found) {
374ca955
A
4627 if (U_FAILURE(*status)) {
4628 break;
4629 }
729e4ab9 4630 textoffset = shiftForward(strsrch, textoffset, lastce,
b75a7d8f
A
4631 patternceindex);
4632 // status checked at loop
4633 patternceindex = patterncelength;
4634 continue;
4635 }
729e4ab9 4636
b75a7d8f
A
4637 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4638 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4639 return TRUE;
4640 }
4641 }
4642 setMatchNotFound(strsrch);
4643 return FALSE;
46f4442e
A
4644#else
4645 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4646 int32_t start = -1;
4647 int32_t end = -1;
4648
4649 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4650 strsrch->search->matchedIndex = start;
4651 strsrch->search->matchedLength = end - start;
4652 return TRUE;
4653 } else {
4654 setMatchNotFound(strsrch);
4655 return FALSE;
4656 }
4657#endif
b75a7d8f
A
4658}
4659
4660UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4661{
4662 if (U_FAILURE(*status)) {
4663 setMatchNotFound(strsrch);
4664 return FALSE;
4665 }
4666
46f4442e 4667#if BOYER_MOORE
b75a7d8f 4668 UCollationElements *coleiter = strsrch->textIter;
b331163b
A
4669 int32_t *patternce = strsrch->pattern.ces;
4670 int32_t patterncelength = strsrch->pattern.cesLength;
374ca955 4671 int32_t textoffset = ucol_getOffset(coleiter);
b75a7d8f
A
4672
4673 // shifting it check for setting offset
4674 // if setOffset is called previously or there was no previous match, we
4675 // leave the offset as it is.
4676 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4677 textoffset = strsrch->search->matchedIndex;
4678 }
729e4ab9
A
4679
4680 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f 4681 patterncelength);
729e4ab9 4682
b75a7d8f
A
4683 while (textoffset >= 0)
4684 {
4685 int32_t patternceindex = 1;
374ca955 4686 int32_t targetce;
b75a7d8f 4687 UBool found = FALSE;
374ca955 4688 int32_t firstce = UCOL_NULLORDER;
b75a7d8f 4689
374ca955 4690 // if status is a failure, ucol_setOffset does nothing
b75a7d8f 4691 setColEIterOffset(coleiter, textoffset);
374ca955
A
4692
4693 for (;;) {
729e4ab9
A
4694 // finding the first pattern ce match, imagine composite
4695 // characters. for example: search for pattern \u0300 in text
4696 // \u00C0, we'll have to skip A first before we get to
b75a7d8f
A
4697 // \u0300 the grave accent
4698 targetce = ucol_next(coleiter, status);
4699 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4700 found = FALSE;
4701 break;
4702 }
4703 targetce = getCE(strsrch, targetce);
4704 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4705 firstce = targetce;
4706 }
46f4442e 4707 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
b75a7d8f 4708 continue;
729e4ab9
A
4709 }
4710 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4711 if (targetce == patternce[0]) {
4712 found = TRUE;
4713 break;
4714 }
4715 if (!hasExpansion(coleiter)) {
4716 // checking for accents in composite character
4717 found = FALSE;
4718 break;
4719 }
4720 }
4721
46f4442e 4722 //targetce = firstce;
729e4ab9 4723
b75a7d8f 4724 while (found && (patternceindex < patterncelength)) {
4388f060 4725 firstce = targetce;
b75a7d8f
A
4726 targetce = ucol_next(coleiter, status);
4727 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4728 found = FALSE;
4729 break;
4730 }
4731 targetce = getCE(strsrch, targetce);
4732 if (targetce == UCOL_IGNORABLE) {
4733 continue;
4734 }
4735
729e4ab9
A
4736 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4737 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4738 patternceindex ++;
4739 }
729e4ab9 4740
46f4442e 4741 targetce = firstce;
b75a7d8f
A
4742
4743 if (!found) {
374ca955
A
4744 if (U_FAILURE(*status)) {
4745 break;
4746 }
729e4ab9
A
4747
4748 textoffset = reverseShift(strsrch, textoffset, targetce,
b75a7d8f
A
4749 patternceindex);
4750 patternceindex = 0;
4751 continue;
4752 }
729e4ab9 4753
b75a7d8f
A
4754 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4755 setColEIterOffset(coleiter, textoffset);
4756 return TRUE;
4757 }
4758 }
4759 setMatchNotFound(strsrch);
4760 return FALSE;
46f4442e 4761#else
4388f060
A
4762 int32_t textOffset;
4763
4764 if (strsrch->search->isOverlap) {
4765 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4766 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4767 } else {
4768 // move the start position at the end of possible match
4769 initializePatternPCETable(strsrch, status);
57a6839d
A
4770 if (!initTextProcessedIter(strsrch, status)) {
4771 setMatchNotFound(strsrch);
4772 return FALSE;
4773 }
b331163b 4774 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
57a6839d 4775 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4388f060
A
4776 if (pce == UCOL_PROCESSED_NULLORDER) {
4777 // at the end of the text
4778 break;
4779 }
4780 }
4781 if (U_FAILURE(*status)) {
4782 setMatchNotFound(strsrch);
4783 return FALSE;
4784 }
4785 textOffset = ucol_getOffset(strsrch->textIter);
4786 }
4787 } else {
4788 textOffset = ucol_getOffset(strsrch->textIter);
4789 }
4790
46f4442e
A
4791 int32_t start = -1;
4792 int32_t end = -1;
4793
4794 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4795 strsrch->search->matchedIndex = start;
4796 strsrch->search->matchedLength = end - start;
4797 return TRUE;
4798 } else {
4799 setMatchNotFound(strsrch);
4800 return FALSE;
4801 }
4802#endif
b75a7d8f
A
4803}
4804
729e4ab9 4805UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
b75a7d8f
A
4806 UErrorCode *status)
4807{
4808 if (U_FAILURE(*status)) {
4809 setMatchNotFound(strsrch);
4810 return FALSE;
4811 }
4812
46f4442e 4813#if BOYER_MOORE
b75a7d8f 4814 UCollationElements *coleiter = strsrch->textIter;
b331163b
A
4815 int32_t *patternce = strsrch->pattern.ces;
4816 int32_t patterncelength = strsrch->pattern.cesLength;
374ca955 4817 int32_t textoffset = ucol_getOffset(coleiter);
729e4ab9 4818 UBool hasPatternAccents =
b75a7d8f 4819 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
729e4ab9 4820
b75a7d8f
A
4821 // shifting it check for setting offset
4822 // if setOffset is called previously or there was no previous match, we
4823 // leave the offset as it is.
4824 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4825 textoffset = strsrch->search->matchedIndex;
4826 }
729e4ab9
A
4827
4828 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
b75a7d8f
A
4829 patterncelength);
4830 strsrch->canonicalPrefixAccents[0] = 0;
4831 strsrch->canonicalSuffixAccents[0] = 0;
729e4ab9 4832
b75a7d8f
A
4833 while (textoffset >= 0)
4834 {
4835 int32_t patternceindex = 1;
374ca955 4836 int32_t targetce;
b75a7d8f 4837 UBool found = FALSE;
374ca955 4838 int32_t firstce = UCOL_NULLORDER;
b75a7d8f
A
4839
4840 setColEIterOffset(coleiter, textoffset);
46f4442e 4841 for (;;) {
729e4ab9
A
4842 // finding the first pattern ce match, imagine composite
4843 // characters. for example: search for pattern \u0300 in text
4844 // \u00C0, we'll have to skip A first before we get to
b75a7d8f
A
4845 // \u0300 the grave accent
4846 targetce = ucol_next(coleiter, status);
4847 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4848 found = FALSE;
4849 break;
4850 }
4851 targetce = getCE(strsrch, targetce);
4852 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4853 firstce = targetce;
4854 }
729e4ab9
A
4855
4856 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
b75a7d8f
A
4857 if (targetce == patternce[0]) {
4858 // the first ce can be a contraction
4859 found = TRUE;
4860 break;
4861 }
4862 if (!hasExpansion(coleiter)) {
4863 // checking for accents in composite character
4864 found = FALSE;
4865 break;
4866 }
4867 }
4868
4869 targetce = firstce;
729e4ab9 4870
b75a7d8f
A
4871 while (found && patternceindex < patterncelength) {
4872 targetce = ucol_next(coleiter, status);
4873 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4874 found = FALSE;
4875 break;
4876 }
4877 targetce = getCE(strsrch, targetce);
4878 if (targetce == UCOL_IGNORABLE) {
4879 continue;
4880 }
4881
729e4ab9
A
4882 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4883 found = found && targetce == patternce[patternceindex];
b75a7d8f
A
4884 patternceindex ++;
4885 }
4886
4887 // initializing the rearranged accent array
4888 if (hasPatternAccents && !found) {
4889 strsrch->canonicalPrefixAccents[0] = 0;
4890 strsrch->canonicalSuffixAccents[0] = 0;
374ca955 4891 if (U_FAILURE(*status)) {
b75a7d8f
A
4892 break;
4893 }
4894 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4895 }
4896
4897 if (!found) {
374ca955 4898 if (U_FAILURE(*status)) {
b75a7d8f
A
4899 break;
4900 }
729e4ab9 4901 textoffset = reverseShift(strsrch, textoffset, targetce,
b75a7d8f
A
4902 patternceindex);
4903 patternceindex = 0;
4904 continue;
4905 }
4906
4907 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4908 setColEIterOffset(coleiter, textoffset);
4909 return TRUE;
4910 }
4911 }
4912 setMatchNotFound(strsrch);
4913 return FALSE;
46f4442e 4914#else
4388f060
A
4915 int32_t textOffset;
4916
4917 if (strsrch->search->isOverlap) {
4918 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4919 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4920 } else {
4921 // move the start position at the end of possible match
4922 initializePatternPCETable(strsrch, status);
57a6839d
A
4923 if (!initTextProcessedIter(strsrch, status)) {
4924 setMatchNotFound(strsrch);
4925 return FALSE;
4926 }
b331163b 4927 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
57a6839d 4928 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4388f060
A
4929 if (pce == UCOL_PROCESSED_NULLORDER) {
4930 // at the end of the text
4931 break;
4932 }
4933 }
4934 if (U_FAILURE(*status)) {
4935 setMatchNotFound(strsrch);
4936 return FALSE;
4937 }
4938 textOffset = ucol_getOffset(strsrch->textIter);
4939 }
4940 } else {
4941 textOffset = ucol_getOffset(strsrch->textIter);
4942 }
4943
46f4442e
A
4944 int32_t start = -1;
4945 int32_t end = -1;
4946
4947 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4948 strsrch->search->matchedIndex = start;
4949 strsrch->search->matchedLength = end - start;
4950 return TRUE;
4951 } else {
4952 setMatchNotFound(strsrch);
4953 return FALSE;
4954 }
4955#endif
b75a7d8f
A
4956}
4957
4958#endif /* #if !UCONFIG_NO_COLLATION */