git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	**********************************************************************
	3	* Copyright (C) 2001-2010 IBM and others. All rights reserved.
	4	**********************************************************************
	5	* Date Name Description
	6	* 07/02/2001 synwee Creation.
	7	**********************************************************************
	8	*/
	9
	10	#include "unicode/utypes.h"
	11
	12	#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
	13
	14	#include "unicode/usearch.h"
	15	#include "unicode/ustring.h"
	16	#include "unicode/uchar.h"
	17	#include "normalizer2impl.h"
	18	#include "ucol_imp.h"
	19	#include "usrchimp.h"
	20	#include "cmemory.h"
	21	#include "ucln_in.h"
	22	#include "uassert.h"
	23	#include "ustr_imp.h"
	24
	25	U_NAMESPACE_USE
	26
	27	// don't use Boyer-Moore
	28	// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
	29	#define BOYER_MOORE 0
	30
	31	#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
	32
	33	// internal definition ---------------------------------------------------
	34
	35	#define LAST_BYTE_MASK_ 0xFF
	36	#define SECOND_LAST_BYTE_SHIFT_ 8
	37	#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
	38
	39	static const uint16_t *fcdTrieIndex = NULL;
	40	static UChar32 fcdHighStart = 0;
	41
	42	// internal methods -------------------------------------------------
	43
	44	/**
	45	* Fast collation element iterator setOffset.
	46	* This function does not check for bounds.
	47	* @param coleiter collation element iterator
	48	* @param offset to set
	49	*/
	50	static
	51	inline void setColEIterOffset(UCollationElements *elems,
	52	int32_t offset)
	53	{
	54	collIterate *ci = &(elems->iteratordata_);
	55	ci->pos = ci->string + offset;
	56	ci->CEpos = ci->toReturn = ci->extendCEs ? ci->extendCEs : ci->CEs;
	57	if (ci->flags & UCOL_ITER_INNORMBUF) {
	58	ci->flags = ci->origFlags;
	59	}
	60	ci->fcdPosition = NULL;
	61
	62	ci->offsetReturn = NULL;
	63	ci->offsetStore = ci->offsetBuffer;
	64	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
	65	}
	66
	67	/**
	68	* Getting the mask for collation strength
	69	* @param strength collation strength
	70	* @return collation element mask
	71	*/
	72	static
	73	inline uint32_t getMask(UCollationStrength strength)
	74	{
	75	switch (strength)
	76	{
	77	case UCOL_PRIMARY:
	78	return UCOL_PRIMARYORDERMASK;
	79	case UCOL_SECONDARY:
	80	return UCOL_SECONDARYORDERMASK \| UCOL_PRIMARYORDERMASK;
	81	default:
	82	return UCOL_TERTIARYORDERMASK \| UCOL_SECONDARYORDERMASK \|
	83	UCOL_PRIMARYORDERMASK;
	84	}
	85	}
	86
	87	/**
	88	* This is to squeeze the 21bit ces into a 256 table
	89	* @param ce collation element
	90	* @return collapsed version of the collation element
	91	*/
	92	static
	93	inline int hash(uint32_t ce)
	94	{
	95	// the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
	96	// well with the new collation where most of the latin 1 characters
	97	// are of the value xx000xxx. their hashes will most of the time be 0
	98	// to be discussed on the hash algo.
	99	return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
	100	}
	101
	102	U_CDECL_BEGIN
	103	static UBool U_CALLCONV
	104	usearch_cleanup(void) {
	105	fcdTrieIndex = NULL;
	106	return TRUE;
	107	}
	108	U_CDECL_END
	109
	110	/**
	111	* Initializing the fcd tables.
	112	* Internal method, status assumed to be a success.
	113	* @param status output error if any, caller to check status before calling
	114	* method, status assumed to be success when passed in.
	115	*/
	116	static
	117	inline void initializeFCD(UErrorCode *status)
	118	{
	119	if (fcdTrieIndex == NULL) {
	120	fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
	121	ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
	122	}
	123	}
	124
	125	/**
	126	* Gets the fcd value for a character at the argument index.
	127	* This method takes into accounts of the supplementary characters.
	128	* @param str UTF16 string where character for fcd retrieval resides
	129	* @param offset position of the character whose fcd is to be retrieved, to be
	130	* overwritten with the next character position, taking
	131	* surrogate characters into consideration.
	132	* @param strlength length of the argument string
	133	* @return fcd value
	134	*/
	135	static
	136	uint16_t getFCD(const UChar str, int32_t offset,
	137	int32_t strlength)
	138	{
	139	const UChar temp = str + offset;
	140	uint16_t result = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, temp, str + strlength);
	141	*offset = (int32_t)(temp - str);
	142	return result;
	143	}
	144
	145	/**
	146	* Getting the modified collation elements taking into account the collation
	147	* attributes
	148	* @param strsrch string search data
	149	* @param sourcece
	150	* @return the modified collation element
	151	*/
	152	static
	153	inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
	154	{
	155	// note for tertiary we can't use the collator->tertiaryMask, that
	156	// is a preprocessed mask that takes into account case options. since
	157	// we are only concerned with exact matches, we don't need that.
	158	sourcece &= strsrch->ceMask;
	159
	160	if (strsrch->toShift) {
	161	// alternate handling here, since only the 16 most significant digits
	162	// is only used, we can safely do a compare without masking
	163	// if the ce is a variable, we mask and get only the primary values
	164	// no shifting to quartenary is required since all primary values
	165	// less than variabletop will need to be masked off anyway.
	166	if (strsrch->variableTop > sourcece) {
	167	if (strsrch->strength >= UCOL_QUATERNARY) {
	168	sourcece &= UCOL_PRIMARYORDERMASK;
	169	}
	170	else {
	171	sourcece = UCOL_IGNORABLE;
	172	}
	173	}
	174	} else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
	175	sourcece = 0xFFFF;
	176	}
	177
	178	return sourcece;
	179	}
	180
	181	/**
	182	* Allocate a memory and returns NULL if it failed.
	183	* Internal method, status assumed to be a success.
	184	* @param size to allocate
	185	* @param status output error if any, caller to check status before calling
	186	* method, status assumed to be success when passed in.
	187	* @return newly allocated array, NULL otherwise
	188	*/
	189	static
	190	inline void * allocateMemory(uint32_t size, UErrorCode *status)
	191	{
	192	uint32_t result = (uint32_t )uprv_malloc(size);
	193	if (result == NULL) {
	194	*status = U_MEMORY_ALLOCATION_ERROR;
	195	}
	196	return result;
	197	}
	198
	199	/**
	200	* Adds a uint32_t value to a destination array.
	201	* Creates a new array if we run out of space. The caller will have to
	202	* manually deallocate the newly allocated array.
	203	* Internal method, status assumed to be success, caller has to check status
	204	* before calling this method. destination not to be NULL and has at least
	205	* size destinationlength.
	206	* @param destination target array
	207	* @param offset destination offset to add value
	208	* @param destinationlength target array size, return value for the new size
	209	* @param value to be added
	210	* @param increments incremental size expected
	211	* @param status output error if any, caller to check status before calling
	212	* method, status assumed to be success when passed in.
	213	* @return new destination array, destination if there was no new allocation
	214	*/
	215	static
	216	inline int32_t * addTouint32_tArray(int32_t *destination,
	217	uint32_t offset,
	218	uint32_t *destinationlength,
	219	uint32_t value,
	220	uint32_t increments,
	221	UErrorCode *status)
	222	{
	223	uint32_t newlength = *destinationlength;
	224	if (offset + 1 == newlength) {
	225	newlength += increments;
	226	int32_t temp = (int32_t )allocateMemory(
	227	sizeof(int32_t) * newlength, status);
	228	if (U_FAILURE(*status)) {
	229	return NULL;
	230	}
	231	uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
	232	*destinationlength = newlength;
	233	destination = temp;
	234	}
	235	destination[offset] = value;
	236	return destination;
	237	}
	238
	239	/**
	240	* Adds a uint64_t value to a destination array.
	241	* Creates a new array if we run out of space. The caller will have to
	242	* manually deallocate the newly allocated array.
	243	* Internal method, status assumed to be success, caller has to check status
	244	* before calling this method. destination not to be NULL and has at least
	245	* size destinationlength.
	246	* @param destination target array
	247	* @param offset destination offset to add value
	248	* @param destinationlength target array size, return value for the new size
	249	* @param value to be added
	250	* @param increments incremental size expected
	251	* @param status output error if any, caller to check status before calling
	252	* method, status assumed to be success when passed in.
	253	* @return new destination array, destination if there was no new allocation
	254	*/
	255	static
	256	inline int64_t * addTouint64_tArray(int64_t *destination,
	257	uint32_t offset,
	258	uint32_t *destinationlength,
	259	uint64_t value,
	260	uint32_t increments,
	261	UErrorCode *status)
	262	{
	263	uint32_t newlength = *destinationlength;
	264	if (offset + 1 == newlength) {
	265	newlength += increments;
	266	int64_t temp = (int64_t )allocateMemory(
	267	sizeof(int64_t) * newlength, status);
	268
	269	if (U_FAILURE(*status)) {
	270	return NULL;
	271	}
	272
	273	uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
	274	*destinationlength = newlength;
	275	destination = temp;
	276	}
	277
	278	destination[offset] = value;
	279
	280	return destination;
	281	}
	282
	283	/**
	284	* Initializing the ce table for a pattern.
	285	* Stores non-ignorable collation keys.
	286	* Table size will be estimated by the size of the pattern text. Table
	287	* expansion will be perform as we go along. Adding 1 to ensure that the table
	288	* size definitely increases.
	289	* Internal method, status assumed to be a success.
	290	* @param strsrch string search data
	291	* @param status output error if any, caller to check status before calling
	292	* method, status assumed to be success when passed in.
	293	* @return total number of expansions
	294	*/
	295	static
	296	inline uint16_t initializePatternCETable(UStringSearch *strsrch,
	297	UErrorCode *status)
	298	{
	299	UPattern *pattern = &(strsrch->pattern);
	300	uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
	301	int32_t *cetable = pattern->CEBuffer;
	302	uint32_t patternlength = pattern->textLength;
	303	UCollationElements *coleiter = strsrch->utilIter;
	304
	305	if (coleiter == NULL) {
	306	coleiter = ucol_openElements(strsrch->collator, pattern->text,
	307	patternlength, status);
	308	// status will be checked in ucol_next(..) later and if it is an
	309	// error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
	310	// returned.
	311	strsrch->utilIter = coleiter;
	312	}
	313	else {
	314	uprv_init_collIterate(strsrch->collator, pattern->text,
	315	pattern->textLength,
	316	&coleiter->iteratordata_,
	317	status);
	318	}
	319	if(U_FAILURE(*status)) {
	320	return 0;
	321	}
	322
	323	if (pattern->CE != cetable && pattern->CE) {
	324	uprv_free(pattern->CE);
	325	}
	326
	327	uint16_t offset = 0;
	328	uint16_t result = 0;
	329	int32_t ce;
	330
	331	while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
	332	U_SUCCESS(*status)) {
	333	uint32_t newce = getCE(strsrch, ce);
	334	if (newce) {
	335	int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
	336	newce,
	337	patternlength - ucol_getOffset(coleiter) + 1,
	338	status);
	339	if (U_FAILURE(*status)) {
	340	return 0;
	341	}
	342	offset ++;
	343	if (cetable != temp && cetable != pattern->CEBuffer) {
	344	uprv_free(cetable);
	345	}
	346	cetable = temp;
	347	}
	348	result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
	349	}
	350
	351	cetable[offset] = 0;
	352	pattern->CE = cetable;
	353	pattern->CELength = offset;
	354
	355	return result;
	356	}
	357
	358	/**
	359	* Initializing the pce table for a pattern.
	360	* Stores non-ignorable collation keys.
	361	* Table size will be estimated by the size of the pattern text. Table
	362	* expansion will be perform as we go along. Adding 1 to ensure that the table
	363	* size definitely increases.
	364	* Internal method, status assumed to be a success.
	365	* @param strsrch string search data
	366	* @param status output error if any, caller to check status before calling
	367	* method, status assumed to be success when passed in.
	368	* @return total number of expansions
	369	*/
	370	static
	371	inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
	372	UErrorCode *status)
	373	{
	374	UPattern *pattern = &(strsrch->pattern);
	375	uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
	376	int64_t *pcetable = pattern->PCEBuffer;
	377	uint32_t patternlength = pattern->textLength;
	378	UCollationElements *coleiter = strsrch->utilIter;
	379
	380	if (coleiter == NULL) {
	381	coleiter = ucol_openElements(strsrch->collator, pattern->text,
	382	patternlength, status);
	383	// status will be checked in ucol_next(..) later and if it is an
	384	// error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
	385	// returned.
	386	strsrch->utilIter = coleiter;
	387	} else {
	388	uprv_init_collIterate(strsrch->collator, pattern->text,
	389	pattern->textLength,
	390	&coleiter->iteratordata_,
	391	status);
	392	}
	393	if(U_FAILURE(*status)) {
	394	return 0;
	395	}
	396
	397	if (pattern->PCE != pcetable && pattern->PCE != NULL) {
	398	uprv_free(pattern->PCE);
	399	}
	400
	401	uint16_t offset = 0;
	402	uint16_t result = 0;
	403	int64_t pce;
	404
	405	uprv_init_pce(coleiter);
	406
	407	// ** Should processed CEs be signed or unsigned?
	408	// ** (the rest of the code in this file seems to play fast-and-loose with
	409	// ** whether a CE is signed or unsigned. For example, look at routine above this one.)
	410	while ((pce = ucol_nextProcessed(coleiter, NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
	411	U_SUCCESS(*status)) {
	412	int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
	413	pce,
	414	patternlength - ucol_getOffset(coleiter) + 1,
	415	status);
	416
	417	if (U_FAILURE(*status)) {
	418	return 0;
	419	}
	420
	421	offset += 1;
	422
	423	if (pcetable != temp && pcetable != pattern->PCEBuffer) {
	424	uprv_free(pcetable);
	425	}
	426
	427	pcetable = temp;
	428	//result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
	429	}
	430
	431	pcetable[offset] = 0;
	432	pattern->PCE = pcetable;
	433	pattern->PCELength = offset;
	434
	435	return result;
	436	}
	437
	438	/**
	439	* Initializes the pattern struct.
	440	* Internal method, status assumed to be success.
	441	* @param strsrch UStringSearch data storage
	442	* @param status output error if any, caller to check status before calling
	443	* method, status assumed to be success when passed in.
	444	* @return expansionsize the total expansion size of the pattern
	445	*/
	446	static
	447	inline int16_t initializePattern(UStringSearch strsrch, UErrorCode status)
	448	{
	449	UPattern *pattern = &(strsrch->pattern);
	450	const UChar *patterntext = pattern->text;
	451	int32_t length = pattern->textLength;
	452	int32_t index = 0;
	453
	454	// Since the strength is primary, accents are ignored in the pattern.
	455	if (strsrch->strength == UCOL_PRIMARY) {
	456	pattern->hasPrefixAccents = 0;
	457	pattern->hasSuffixAccents = 0;
	458	} else {
	459	pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
	460	SECOND_LAST_BYTE_SHIFT_;
	461	index = length;
	462	UTF_BACK_1(patterntext, 0, index);
	463	pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
	464	LAST_BYTE_MASK_;
	465	}
	466
	467	// HACK
	468	if (strsrch->pattern.PCE != NULL) {
	469	if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
	470	uprv_free(strsrch->pattern.PCE);
	471	}
	472
	473	strsrch->pattern.PCE = NULL;
	474	}
	475
	476	// since intializePattern is an internal method status is a success.
	477	return initializePatternCETable(strsrch, status);
	478	}
	479
	480	/**
	481	* Initializing shift tables, with the default values.
	482	* If a corresponding default value is 0, the shift table is not set.
	483	* @param shift table for forwards shift
	484	* @param backshift table for backwards shift
	485	* @param cetable table containing pattern ce
	486	* @param cesize size of the pattern ces
	487	* @param expansionsize total size of the expansions
	488	* @param defaultforward the default forward value
	489	* @param defaultbackward the default backward value
	490	*/
	491	static
	492	inline void setShiftTable(int16_t shift[], int16_t backshift[],
	493	int32_t *cetable, int32_t cesize,
	494	int16_t expansionsize,
	495	int16_t defaultforward,
	496	int16_t defaultbackward)
	497	{
	498	// estimate the value to shift. to do that we estimate the smallest
	499	// number of characters to give the relevant ces, ie approximately
	500	// the number of ces minus their expansion, since expansions can come
	501	// from a character.
	502	int32_t count;
	503	for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
	504	shift[count] = defaultforward;
	505	}
	506	cesize --; // down to the last index
	507	for (count = 0; count < cesize; count ++) {
	508	// number of ces from right of array to the count
	509	int temp = defaultforward - count - 1;
	510	shift[hash(cetable[count])] = temp > 1 ? temp : 1;
	511	}
	512	shift[hash(cetable[cesize])] = 1;
	513	// for ignorables we just shift by one. see test examples.
	514	shift[hash(0)] = 1;
	515
	516	for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
	517	backshift[count] = defaultbackward;
	518	}
	519	for (count = cesize; count > 0; count --) {
	520	// the original value count does not seem to work
	521	backshift[hash(cetable[count])] = count > expansionsize ?
	522	(int16_t)(count - expansionsize) : 1;
	523	}
	524	backshift[hash(cetable[0])] = 1;
	525	backshift[hash(0)] = 1;
	526	}
	527
	528	/**
	529	* Building of the pattern collation element list and the boyer moore strsrch
	530	* table.
	531	* The canonical match will only be performed after the default match fails.
	532	* For both cases we need to remember the size of the composed and decomposed
	533	* versions of the string. Since the Boyer-Moore shift calculations shifts by
	534	* a number of characters in the text and tries to match the pattern from that
	535	* offset, the shift value can not be too large in case we miss some
	536	* characters. To choose a right shift size, we estimate the NFC form of the
	537	* and use its size as a shift guide. The NFC form should be the small
	538	* possible representation of the pattern. Anyways, we'll err on the smaller
	539	* shift size. Hence the calculation for minlength.
	540	* Canonical match will be performed slightly differently. We'll split the
	541	* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
	542	* the first and last base character (MS), the ending accents (EA). Matches
	543	* will be done on MS first, and only when we match MS then some processing
	544	* will be required for the prefix and end accents in order to determine if
	545	* they match PA and EA. Hence the default shift values
	546	* for the canonical match will take the size of either end's accent into
	547	* consideration. Forwards search will take the end accents into consideration
	548	* for the default shift values and the backwards search will take the prefix
	549	* accents into consideration.
	550	* If pattern has no non-ignorable ce, we return a illegal argument error.
	551	* Internal method, status assumed to be success.
	552	* @param strsrch UStringSearch data storage
	553	* @param status for output errors if it occurs, status is assumed to be a
	554	* success when it is passed in.
	555	*/
	556	static
	557	inline void initialize(UStringSearch strsrch, UErrorCode status)
	558	{
	559	int16_t expandlength = initializePattern(strsrch, status);
	560	if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
	561	UPattern *pattern = &strsrch->pattern;
	562	int32_t cesize = pattern->CELength;
	563
	564	int16_t minlength = cesize > expandlength
	565	? (int16_t)cesize - expandlength : 1;
	566	pattern->defaultShiftSize = minlength;
	567	setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
	568	cesize, expandlength, minlength, minlength);
	569	return;
	570	}
	571	strsrch->pattern.defaultShiftSize = 0;
	572	}
	573
	574	#if BOYER_MOORE
	575	/**
	576	* Check to make sure that the match length is at the end of the character by
	577	* using the breakiterator.
	578	* @param strsrch string search data
	579	* @param start target text start offset
	580	* @param end target text end offset
	581	*/
	582	static
	583	void checkBreakBoundary(const UStringSearch strsrch, int32_t /start/,
	584	int32_t *end)
	585	{
	586	#if !UCONFIG_NO_BREAK_ITERATION
	587	UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
	588	if (breakiterator) {
	589	int32_t matchend = *end;
	590	//int32_t matchstart = *start;
	591
	592	if (!ubrk_isBoundary(breakiterator, matchend)) {
	593	*end = ubrk_following(breakiterator, matchend);
	594	}
	595
	596	/* Check the start of the matched text to make sure it doesn't have any accents
	597	* before it. This code may not be necessary and so it is commented out */
	598	/*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
	599	*start = ubrk_preceding(breakiterator, matchstart);
	600	}*/
	601	}
	602	#endif
	603	}
	604
	605	/**
	606	* Determine whether the target text in UStringSearch bounded by the offset
	607	* start and end is one or more whole units of text as
	608	* determined by the breakiterator in UStringSearch.
	609	* @param strsrch string search data
	610	* @param start target text start offset
	611	* @param end target text end offset
	612	*/
	613	static
	614	UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
	615	int32_t end)
	616	{
	617	#if !UCONFIG_NO_BREAK_ITERATION
	618	UBreakIterator *breakiterator = strsrch->search->breakIter;
	619	//TODO: Add here.
	620	if (breakiterator) {
	621	int32_t startindex = ubrk_first(breakiterator);
	622	int32_t endindex = ubrk_last(breakiterator);
	623
	624	// out-of-range indexes are never boundary positions
	625	if (start < startindex \|\| start > endindex \|\|
	626	end < startindex \|\| end > endindex) {
	627	return FALSE;
	628	}
	629	// otherwise, we can use following() on the position before the
	630	// specified one and return true of the position we get back is the
	631	// one the user specified
	632	UBool result = (start == startindex \|\|
	633	ubrk_following(breakiterator, start - 1) == start) &&
	634	(end == endindex \|\|
	635	ubrk_following(breakiterator, end - 1) == end);
	636	if (result) {
	637	// iterates the individual ces
	638	UCollationElements *coleiter = strsrch->utilIter;
	639	const UChar *text = strsrch->search->text +
	640	start;
	641	UErrorCode status = U_ZERO_ERROR;
	642	ucol_setText(coleiter, text, end - start, &status);
	643	for (int32_t count = 0; count < strsrch->pattern.CELength;
	644	count ++) {
	645	int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
	646	if (ce == UCOL_IGNORABLE) {
	647	count --;
	648	continue;
	649	}
	650	if (U_FAILURE(status) \|\| ce != strsrch->pattern.CE[count]) {
	651	return FALSE;
	652	}
	653	}
	654	int32_t nextce = ucol_next(coleiter, &status);
	655	while (ucol_getOffset(coleiter) == (end - start)
	656	&& getCE(strsrch, nextce) == UCOL_IGNORABLE) {
	657	nextce = ucol_next(coleiter, &status);
	658	}
	659	if (ucol_getOffset(coleiter) == (end - start)
	660	&& nextce != UCOL_NULLORDER) {
	661	// extra collation elements at the end of the match
	662	return FALSE;
	663	}
	664	}
	665	return result;
	666	}
	667	#endif
	668	return TRUE;
	669	}
	670
	671	/**
	672	* Getting the next base character offset if current offset is an accent,
	673	* or the current offset if the current character contains a base character.
	674	* accents the following base character will be returned
	675	* @param text string
	676	* @param textoffset current offset
	677	* @param textlength length of text string
	678	* @return the next base character or the current offset
	679	* if the current character is contains a base character.
	680	*/
	681	static
	682	inline int32_t getNextBaseOffset(const UChar *text,
	683	int32_t textoffset,
	684	int32_t textlength)
	685	{
	686	if (textoffset < textlength) {
	687	int32_t temp = textoffset;
	688	if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
	689	while (temp < textlength) {
	690	int32_t result = temp;
	691	if ((getFCD(text, &temp, textlength) >>
	692	SECOND_LAST_BYTE_SHIFT_) == 0) {
	693	return result;
	694	}
	695	}
	696	return textlength;
	697	}
	698	}
	699	return textoffset;
	700	}
	701
	702	/**
	703	* Gets the next base character offset depending on the string search pattern
	704	* data
	705	* @param strsrch string search data
	706	* @param textoffset current offset, one offset away from the last character
	707	* to search for.
	708	* @return start index of the next base character or the current offset
	709	* if the current character is contains a base character.
	710	*/
	711	static
	712	inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
	713	int32_t textoffset)
	714	{
	715	int32_t textlength = strsrch->search->textLength;
	716	if (strsrch->pattern.hasSuffixAccents &&
	717	textoffset < textlength) {
	718	int32_t temp = textoffset;
	719	const UChar *text = strsrch->search->text;
	720	UTF_BACK_1(text, 0, temp);
	721	if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
	722	return getNextBaseOffset(text, textoffset, textlength);
	723	}
	724	}
	725	return textoffset;
	726	}
	727
	728	/**
	729	* Shifting the collation element iterator position forward to prepare for
	730	* a following match. If the last character is a unsafe character, we'll only
	731	* shift by 1 to capture contractions, normalization etc.
	732	* Internal method, status assumed to be success.
	733	* @param text strsrch string search data
	734	* @param textoffset start text position to do search
	735	* @param ce the text ce which failed the match.
	736	* @param patternceindex index of the ce within the pattern ce buffer which
	737	* failed the match
	738	* @return final offset
	739	*/
	740	static
	741	inline int32_t shiftForward(UStringSearch *strsrch,
	742	int32_t textoffset,
	743	int32_t ce,
	744	int32_t patternceindex)
	745	{
	746	UPattern *pattern = &(strsrch->pattern);
	747	if (ce != UCOL_NULLORDER) {
	748	int32_t shift = pattern->shift[hash(ce)];
	749	// this is to adjust for characters in the middle of the
	750	// substring for matching that failed.
	751	int32_t adjust = pattern->CELength - patternceindex;
	752	if (adjust > 1 && shift >= adjust) {
	753	shift -= adjust - 1;
	754	}
	755	textoffset += shift;
	756	}
	757	else {
	758	textoffset += pattern->defaultShiftSize;
	759	}
	760
	761	textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
	762	// check for unsafe characters
	763	// * if it is the start or middle of a contraction: to be done after
	764	// a initial match is found
	765	// * thai or lao base consonant character: similar to contraction
	766	// * high surrogate character: similar to contraction
	767	// * next character is a accent: shift to the next base character
	768	return textoffset;
	769	}
	770	#endif // #if BOYER_MOORE
	771
	772	/**
	773	* sets match not found
	774	* @param strsrch string search data
	775	*/
	776	static
	777	inline void setMatchNotFound(UStringSearch *strsrch)
	778	{
	779	// this method resets the match result regardless of the error status.
	780	strsrch->search->matchedIndex = USEARCH_DONE;
	781	strsrch->search->matchedLength = 0;
	782	if (strsrch->search->isForwardSearching) {
	783	setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
	784	}
	785	else {
	786	setColEIterOffset(strsrch->textIter, 0);
	787	}
	788	}
	789
	790	#if BOYER_MOORE
	791	/**
	792	* Gets the offset to the next safe point in text.
	793	* ie. not the middle of a contraction, swappable characters or supplementary
	794	* characters.
	795	* @param collator collation sata
	796	* @param text string to work with
	797	* @param textoffset offset in string
	798	* @param textlength length of text string
	799	* @return offset to the next safe character
	800	*/
	801	static
	802	inline int32_t getNextSafeOffset(const UCollator *collator,
	803	const UChar *text,
	804	int32_t textoffset,
	805	int32_t textlength)
	806	{
	807	int32_t result = textoffset; // first contraction character
	808	while (result != textlength && ucol_unsafeCP(text[result], collator)) {
	809	result ++;
	810	}
	811	return result;
	812	}
	813
	814	/**
	815	* This checks for accents in the potential match started with a .
	816	* composite character.
	817	* This is really painful... we have to check that composite character do not
	818	* have any extra accents. We have to normalize the potential match and find
	819	* the immediate decomposed character before the match.
	820	* The first composite character would have been taken care of by the fcd
	821	* checks in checkForwardExactMatch.
	822	* This is the slow path after the fcd of the first character and
	823	* the last character has been checked by checkForwardExactMatch and we
	824	* determine that the potential match has extra non-ignorable preceding
	825	* ces.
	826	* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
	827	* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
	828	* Note here that accents checking are slow and cautioned in the API docs.
	829	* Internal method, status assumed to be a success, caller should check status
	830	* before calling this method
	831	* @param strsrch string search data
	832	* @param start index of the potential unfriendly composite character
	833	* @param end index of the potential unfriendly composite character
	834	* @param status output error status if any.
	835	* @return TRUE if there is non-ignorable accents before at the beginning
	836	* of the match, FALSE otherwise.
	837	*/
	838
	839	static
	840	UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
	841	int32_t end,
	842	UErrorCode *status)
	843	{
	844	UBool result = FALSE;
	845	if (strsrch->pattern.hasPrefixAccents) {
	846	int32_t length = end - start;
	847	int32_t offset = 0;
	848	const UChar *text = strsrch->search->text + start;
	849
	850	UTF_FWD_1(text, offset, length);
	851	// we are only concerned with the first composite character
	852	if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
	853	int32_t safeoffset = getNextSafeOffset(strsrch->collator,
	854	text, 0, length);
	855	if (safeoffset != length) {
	856	safeoffset ++;
	857	}
	858	UChar *norm = NULL;
	859	UChar buffer[INITIAL_ARRAY_SIZE_];
	860	int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
	861	buffer, INITIAL_ARRAY_SIZE_,
	862	status);
	863	if (U_FAILURE(*status)) {
	864	return FALSE;
	865	}
	866	if (size >= INITIAL_ARRAY_SIZE_) {
	867	norm = (UChar )allocateMemory((size + 1) sizeof(UChar),
	868	status);
	869	// if allocation failed, status will be set to
	870	// U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
	871	// checks for it.
	872	size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
	873	size, status);
	874	if (U_FAILURE(*status) && norm != NULL) {
	875	uprv_free(norm);
	876	return FALSE;
	877	}
	878	}
	879	else {
	880	norm = buffer;
	881	}
	882
	883	UCollationElements *coleiter = strsrch->utilIter;
	884	ucol_setText(coleiter, norm, size, status);
	885	uint32_t firstce = strsrch->pattern.CE[0];
	886	UBool ignorable = TRUE;
	887	uint32_t ce = UCOL_IGNORABLE;
	888	while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
	889	offset = ucol_getOffset(coleiter);
	890	if (ce != firstce && ce != UCOL_IGNORABLE) {
	891	ignorable = FALSE;
	892	}
	893	ce = ucol_next(coleiter, status);
	894	}
	895	UChar32 codepoint;
	896	UTF_PREV_CHAR(norm, 0, offset, codepoint);
	897	result = !ignorable && (u_getCombiningClass(codepoint) != 0);
	898
	899	if (norm != buffer) {
	900	uprv_free(norm);
	901	}
	902	}
	903	}
	904
	905	return result;
	906	}
	907
	908	/**
	909	* Used by exact matches, checks if there are accents before the match.
	910	* This is really painful... we have to check that composite characters at
	911	* the start of the matches have to not have any extra accents.
	912	* We check the FCD of the character first, if it starts with an accent and
	913	* the first pattern ce does not match the first ce of the character, we bail.
	914	* Otherwise we try normalizing the first composite
	915	* character and find the immediate decomposed character before the match to
	916	* see if it is an non-ignorable accent.
	917	* Now normalizing the first composite character is enough because we ensure
	918	* that when the match is passed in here with extra beginning ces, the
	919	* first or last ce that match has to occur within the first character.
	920	* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
	921	* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
	922	* Note here that accents checking are slow and cautioned in the API docs.
	923	* @param strsrch string search data
	924	* @param start offset
	925	* @param end offset
	926	* @return TRUE if there are accents on either side of the match,
	927	* FALSE otherwise
	928	*/
	929	static
	930	UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
	931	int32_t end)
	932	{
	933	if (strsrch->pattern.hasPrefixAccents) {
	934	UCollationElements *coleiter = strsrch->textIter;
	935	UErrorCode status = U_ZERO_ERROR;
	936	// we have been iterating forwards previously
	937	uint32_t ignorable = TRUE;
	938	int32_t firstce = strsrch->pattern.CE[0];
	939
	940	setColEIterOffset(coleiter, start);
	941	int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
	942	if (U_FAILURE(status)) {
	943	return TRUE;
	944	}
	945	while (ce != firstce) {
	946	if (ce != UCOL_IGNORABLE) {
	947	ignorable = FALSE;
	948	}
	949	ce = getCE(strsrch, ucol_next(coleiter, &status));
	950	if (U_FAILURE(status) \|\| ce == UCOL_NULLORDER) {
	951	return TRUE;
	952	}
	953	}
	954	if (!ignorable && inNormBuf(coleiter)) {
	955	// within normalization buffer, discontiguous handled here
	956	return TRUE;
	957	}
	958
	959	// within text
	960	int32_t temp = start;
	961	// original code
	962	// accent = (getFCD(strsrch->search->text, &temp,
	963	// strsrch->search->textLength)
	964	// >> SECOND_LAST_BYTE_SHIFT_);
	965	// however this code does not work well with VC7 .net in release mode.
	966	// maybe the inlines for getFCD combined with shifting has bugs in
	967	// VC7. anyways this is a work around.
	968	UBool accent = getFCD(strsrch->search->text, &temp,
	969	strsrch->search->textLength) > 0xFF;
	970	if (!accent) {
	971	return checkExtraMatchAccents(strsrch, start, end, &status);
	972	}
	973	if (!ignorable) {
	974	return TRUE;
	975	}
	976	if (start > 0) {
	977	temp = start;
	978	UTF_BACK_1(strsrch->search->text, 0, temp);
	979	if (getFCD(strsrch->search->text, &temp,
	980	strsrch->search->textLength) & LAST_BYTE_MASK_) {
	981	setColEIterOffset(coleiter, start);
	982	ce = ucol_previous(coleiter, &status);
	983	if (U_FAILURE(status) \|\|
	984	(ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
	985	return TRUE;
	986	}
	987	}
	988	}
	989	}
	990
	991	return FALSE;
	992	}
	993
	994	/**
	995	* Used by exact matches, checks if there are accents bounding the match.
	996	* Note this is the initial boundary check. If the potential match
	997	* starts or ends with composite characters, the accents in those
	998	* characters will be determined later.
	999	* Not doing backwards iteration here, since discontiguos contraction for
	1000	* backwards collation element iterator, use up too many characters.
	1001	* E.g. looking for \u030A ring in \u01FA A ring above and acute,
	1002	* should fail since there is a acute at the end of \u01FA
	1003	* Note here that accents checking are slow and cautioned in the API docs.
	1004	* @param strsrch string search data
	1005	* @param start offset of match
	1006	* @param end end offset of the match
	1007	* @return TRUE if there are accents on either side of the match,
	1008	* FALSE otherwise
	1009	*/
	1010	static
	1011	UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
	1012	int32_t end)
	1013	{
	1014	if (strsrch->pattern.hasSuffixAccents) {
	1015	const UChar *text = strsrch->search->text;
	1016	int32_t temp = end;
	1017	int32_t textlength = strsrch->search->textLength;
	1018	UTF_BACK_1(text, 0, temp);
	1019	if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
	1020	int32_t firstce = strsrch->pattern.CE[0];
	1021	UCollationElements *coleiter = strsrch->textIter;
	1022	UErrorCode status = U_ZERO_ERROR;
	1023	int32_t ce;
	1024	setColEIterOffset(coleiter, start);
	1025	while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
	1026	if (U_FAILURE(status) \|\| ce == UCOL_NULLORDER) {
	1027	return TRUE;
	1028	}
	1029	}
	1030	int32_t count = 1;
	1031	while (count < strsrch->pattern.CELength) {
	1032	if (getCE(strsrch, ucol_next(coleiter, &status))
	1033	== UCOL_IGNORABLE) {
	1034	// Thai can give an ignorable here.
	1035	count --;
	1036	}
	1037	if (U_FAILURE(status)) {
	1038	return TRUE;
	1039	}
	1040	count ++;
	1041	}
	1042
	1043	ce = ucol_next(coleiter, &status);
	1044	if (U_FAILURE(status)) {
	1045	return TRUE;
	1046	}
	1047	if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
	1048	ce = getCE(strsrch, ce);
	1049	}
	1050	if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
	1051	if (ucol_getOffset(coleiter) <= end) {
	1052	return TRUE;
	1053	}
	1054	if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
	1055	return TRUE;
	1056	}
	1057	}
	1058	}
	1059	}
	1060	return FALSE;
	1061	}
	1062	#endif // #if BOYER_MOORE
	1063
	1064	/**
	1065	* Checks if the offset runs out of the text string
	1066	* @param offset
	1067	* @param textlength of the text string
	1068	* @return TRUE if offset is out of bounds, FALSE otherwise
	1069	*/
	1070	static
	1071	inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
	1072	{
	1073	return offset < 0 \|\| offset > textlength;
	1074	}
	1075
	1076	/**
	1077	* Checks for identical match
	1078	* @param strsrch string search data
	1079	* @param start offset of possible match
	1080	* @param end offset of possible match
	1081	* @return TRUE if identical match is found
	1082	*/
	1083	static
	1084	inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
	1085	int32_t end)
	1086	{
	1087	if (strsrch->strength != UCOL_IDENTICAL) {
	1088	return TRUE;
	1089	}
	1090
	1091	// Note: We could use Normalizer::compare() or similar, but for short strings
	1092	// which may not be in FCD it might be faster to just NFD them.
	1093	UErrorCode status = U_ZERO_ERROR;
	1094	UnicodeString t2, p2;
	1095	strsrch->nfd->normalize(
	1096	UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
	1097	strsrch->nfd->normalize(
	1098	UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
	1099	// return FALSE if NFD failed
	1100	return U_SUCCESS(status) && t2 == p2;
	1101	}
	1102
	1103	#if BOYER_MOORE
	1104	/**
	1105	* Checks to see if the match is repeated
	1106	* @param strsrch string search data
	1107	* @param start new match start index
	1108	* @param end new match end index
	1109	* @return TRUE if the the match is repeated, FALSE otherwise
	1110	*/
	1111	static
	1112	inline UBool checkRepeatedMatch(UStringSearch *strsrch,
	1113	int32_t start,
	1114	int32_t end)
	1115	{
	1116	int32_t lastmatchindex = strsrch->search->matchedIndex;
	1117	UBool result;
	1118	if (lastmatchindex == USEARCH_DONE) {
	1119	return FALSE;
	1120	}
	1121	if (strsrch->search->isForwardSearching) {
	1122	result = start <= lastmatchindex;
	1123	}
	1124	else {
	1125	result = start >= lastmatchindex;
	1126	}
	1127	if (!result && !strsrch->search->isOverlap) {
	1128	if (strsrch->search->isForwardSearching) {
	1129	result = start < lastmatchindex + strsrch->search->matchedLength;
	1130	}
	1131	else {
	1132	result = end > lastmatchindex;
	1133	}
	1134	}
	1135	return result;
	1136	}
	1137
	1138	/**
	1139	* Gets the collation element iterator's current offset.
	1140	* @param coleiter collation element iterator
	1141	* @param forwards flag TRUE if we are moving in th forwards direction
	1142	* @return current offset
	1143	*/
	1144	static
	1145	inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
	1146	UBool forwards)
	1147	{
	1148	int32_t result = ucol_getOffset(coleiter);
	1149	// intricacies of the the backwards collation element iterator
	1150	if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
	1151	result ++;
	1152	}
	1153	return result;
	1154	}
	1155
	1156	/**
	1157	* Checks match for contraction.
	1158	* If the match ends with a partial contraction we fail.
	1159	* If the match starts too far off (because of backwards iteration) we try to
	1160	* chip off the extra characters depending on whether a breakiterator has
	1161	* been used.
	1162	* Internal method, error assumed to be success, caller has to check status
	1163	* before calling this method.
	1164	* @param strsrch string search data
	1165	* @param start offset of potential match, to be modified if necessary
	1166	* @param end offset of potential match, to be modified if necessary
	1167	* @param status output error status if any
	1168	* @return TRUE if match passes the contraction test, FALSE otherwise
	1169	*/
	1170
	1171	static
	1172	UBool checkNextExactContractionMatch(UStringSearch *strsrch,
	1173	int32_t *start,
	1174	int32_t end, UErrorCode status)
	1175	{
	1176	UCollationElements *coleiter = strsrch->textIter;
	1177	int32_t textlength = strsrch->search->textLength;
	1178	int32_t temp = *start;
	1179	const UCollator *collator = strsrch->collator;
	1180	const UChar *text = strsrch->search->text;
	1181	// This part checks if either ends of the match contains potential
	1182	// contraction. If so we'll have to iterate through them
	1183	// The start contraction needs to be checked since ucol_previous dumps
	1184	// all characters till the first safe character into the buffer.
	1185	// start + 1 is used to test for the unsafe characters instead of start
	1186	// because ucol_prev takes all unsafe characters till the first safe
	1187	// character ie start. so by testing start + 1, we can estimate if
	1188	// excess prefix characters has been included in the potential search
	1189	// results.
	1190	if ((end < textlength && ucol_unsafeCP(text[end], collator)) \|\|
	1191	(*start + 1 < textlength
	1192	&& ucol_unsafeCP(text[*start + 1], collator))) {
	1193	int32_t expansion = getExpansionPrefix(coleiter);
	1194	UBool expandflag = expansion > 0;
	1195	setColEIterOffset(coleiter, *start);
	1196	while (expansion > 0) {
	1197	// getting rid of the redundant ce, caused by setOffset.
	1198	// since backward contraction/expansion may have extra ces if we
	1199	// are in the normalization buffer, hasAccentsBeforeMatch would
	1200	// have taken care of it.
	1201	// E.g. the character \u01FA will have an expansion of 3, but if
	1202	// we are only looking for acute and ring \u030A and \u0301, we'll
	1203	// have to skip the first ce in the expansion buffer.
	1204	ucol_next(coleiter, status);
	1205	if (U_FAILURE(*status)) {
	1206	return FALSE;
	1207	}
	1208	if (ucol_getOffset(coleiter) != temp) {
	1209	*start = temp;
	1210	temp = ucol_getOffset(coleiter);
	1211	}
	1212	expansion --;
	1213	}
	1214
	1215	int32_t *patternce = strsrch->pattern.CE;
	1216	int32_t patterncelength = strsrch->pattern.CELength;
	1217	int32_t count = 0;
	1218	while (count < patterncelength) {
	1219	int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
	1220	if (ce == UCOL_IGNORABLE) {
	1221	continue;
	1222	}
	1223	if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
	1224	*start = temp;
	1225	temp = ucol_getOffset(coleiter);
	1226	}
	1227	if (U_FAILURE(*status) \|\| ce != patternce[count]) {
	1228	(*end) ++;
	1229	end = getNextUStringSearchBaseOffset(strsrch, end);
	1230	return FALSE;
	1231	}
	1232	count ++;
	1233	}
	1234	}
	1235	return TRUE;
	1236	}
	1237
	1238	/**
	1239	* Checks and sets the match information if found.
	1240	* Checks
	1241	* <ul>
	1242	* <li> the potential match does not repeat the previous match
	1243	* <li> boundaries are correct
	1244	* <li> exact matches has no extra accents
	1245	* <li> identical matchesb
	1246	* <li> potential match does not end in the middle of a contraction
	1247	* <\ul>
	1248	* Otherwise the offset will be shifted to the next character.
	1249	* Internal method, status assumed to be success, caller has to check status
	1250	* before calling this method.
	1251	* @param strsrch string search data
	1252	* @param textoffset offset in the collation element text. the returned value
	1253	* will be the truncated end offset of the match or the new start
	1254	* search offset.
	1255	* @param status output error status if any
	1256	* @return TRUE if the match is valid, FALSE otherwise
	1257	*/
	1258	static
	1259	inline UBool checkNextExactMatch(UStringSearch *strsrch,
	1260	int32_t textoffset, UErrorCode status)
	1261	{
	1262	UCollationElements *coleiter = strsrch->textIter;
	1263	int32_t start = getColElemIterOffset(coleiter, FALSE);
	1264
	1265	if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
	1266	return FALSE;
	1267	}
	1268
	1269	// this totally matches, however we need to check if it is repeating
	1270	if (!isBreakUnit(strsrch, start, *textoffset) \|\|
	1271	checkRepeatedMatch(strsrch, start, *textoffset) \|\|
	1272	hasAccentsBeforeMatch(strsrch, start, *textoffset) \|\|
	1273	!checkIdentical(strsrch, start, *textoffset) \|\|
	1274	hasAccentsAfterMatch(strsrch, start, *textoffset)) {
	1275
	1276	(*textoffset) ++;
	1277	textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
	1278	return FALSE;
	1279	}
	1280
	1281	//Add breakiterator boundary check for primary strength search.
	1282	if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
	1283	checkBreakBoundary(strsrch, &start, textoffset);
	1284	}
	1285
	1286	// totally match, we will get rid of the ending ignorables.
	1287	strsrch->search->matchedIndex = start;
	1288	strsrch->search->matchedLength = *textoffset - start;
	1289	return TRUE;
	1290	}
	1291
	1292	/**
	1293	* Getting the previous base character offset, or the current offset if the
	1294	* current character is a base character
	1295	* @param text string
	1296	* @param textoffset one offset after the current character
	1297	* @return the offset of the next character after the base character or the first
	1298	* composed character with accents
	1299	*/
	1300	static
	1301	inline int32_t getPreviousBaseOffset(const UChar *text,
	1302	int32_t textoffset)
	1303	{
	1304	if (textoffset > 0) {
	1305	for (;;) {
	1306	int32_t result = textoffset;
	1307	UTF_BACK_1(text, 0, textoffset);
	1308	int32_t temp = textoffset;
	1309	uint16_t fcd = getFCD(text, &temp, result);
	1310	if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
	1311	if (fcd & LAST_BYTE_MASK_) {
	1312	return textoffset;
	1313	}
	1314	return result;
	1315	}
	1316	if (textoffset == 0) {
	1317	return 0;
	1318	}
	1319	}
	1320	}
	1321	return textoffset;
	1322	}
	1323
	1324	/**
	1325	* Getting the indexes of the accents that are not blocked in the argument
	1326	* accent array
	1327	* @param accents array of accents in nfd terminated by a 0.
	1328	* @param accentsindex array of indexes of the accents that are not blocked
	1329	*/
	1330	static
	1331	inline int getUnblockedAccentIndex(UChar accents, int32_t accentsindex)
	1332	{
	1333	int32_t index = 0;
	1334	int32_t length = u_strlen(accents);
	1335	UChar32 codepoint = 0;
	1336	int cclass = 0;
	1337	int result = 0;
	1338	int32_t temp;
	1339	while (index < length) {
	1340	temp = index;
	1341	UTF_NEXT_CHAR(accents, index, length, codepoint);
	1342	if (u_getCombiningClass(codepoint) != cclass) {
	1343	cclass = u_getCombiningClass(codepoint);
	1344	accentsindex[result] = temp;
	1345	result ++;
	1346	}
	1347	}
	1348	accentsindex[result] = length;
	1349	return result;
	1350	}
	1351
	1352	/**
	1353	* Appends 3 UChar arrays to a destination array.
	1354	* Creates a new array if we run out of space. The caller will have to
	1355	* manually deallocate the newly allocated array.
	1356	* Internal method, status assumed to be success, caller has to check status
	1357	* before calling this method. destination not to be NULL and has at least
	1358	* size destinationlength.
	1359	* @param destination target array
	1360	* @param destinationlength target array size, returning the appended length
	1361	* @param source1 null-terminated first array
	1362	* @param source2 second array
	1363	* @param source2length length of seond array
	1364	* @param source3 null-terminated third array
	1365	* @param status error status if any
	1366	* @return new destination array, destination if there was no new allocation
	1367	*/
	1368	static
	1369	inline UChar * addToUCharArray( UChar *destination,
	1370	int32_t *destinationlength,
	1371	const UChar *source1,
	1372	const UChar *source2,
	1373	int32_t source2length,
	1374	const UChar *source3,
	1375	UErrorCode *status)
	1376	{
	1377	int32_t source1length = source1 ? u_strlen(source1) : 0;
	1378	int32_t source3length = source3 ? u_strlen(source3) : 0;
	1379	if (*destinationlength < source1length + source2length + source3length +
	1380	1)
	1381	{
	1382	destination = (UChar *)allocateMemory(
	1383	(source1length + source2length + source3length + 1) * sizeof(UChar),
	1384	status);
	1385	// if error allocating memory, status will be
	1386	// U_MEMORY_ALLOCATION_ERROR
	1387	if (U_FAILURE(*status)) {
	1388	*destinationlength = 0;
	1389	return NULL;
	1390	}
	1391	}
	1392	if (source1length != 0) {
	1393	uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
	1394	}
	1395	if (source2length != 0) {
	1396	uprv_memcpy(destination + source1length, source2,
	1397	sizeof(UChar) * source2length);
	1398	}
	1399	if (source3length != 0) {
	1400	uprv_memcpy(destination + source1length + source2length, source3,
	1401	sizeof(UChar) * source3length);
	1402	}
	1403	*destinationlength = source1length + source2length + source3length;
	1404	return destination;
	1405	}
	1406
	1407	/**
	1408	* Running through a collation element iterator to see if the contents matches
	1409	* pattern in string search data
	1410	* @param strsrch string search data
	1411	* @param coleiter collation element iterator
	1412	* @return TRUE if a match if found, FALSE otherwise
	1413	*/
	1414	static
	1415	inline UBool checkCollationMatch(const UStringSearch *strsrch,
	1416	UCollationElements *coleiter)
	1417	{
	1418	int patternceindex = strsrch->pattern.CELength;
	1419	int32_t *patternce = strsrch->pattern.CE;
	1420	UErrorCode status = U_ZERO_ERROR;
	1421	while (patternceindex > 0) {
	1422	int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
	1423	if (ce == UCOL_IGNORABLE) {
	1424	continue;
	1425	}
	1426	if (U_FAILURE(status) \|\| ce != *patternce) {
	1427	return FALSE;
	1428	}
	1429	patternce ++;
	1430	patternceindex --;
	1431	}
	1432	return TRUE;
	1433	}
	1434
	1435	/**
	1436	* Rearranges the front accents to try matching.
	1437	* Prefix accents in the text will be grouped according to their combining
	1438	* class and the groups will be mixed and matched to try find the perfect
	1439	* match with the pattern.
	1440	* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
	1441	* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
	1442	* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
	1443	* "\u0301\u0325".
	1444	* step 2: check if any of the generated substrings matches the pattern.
	1445	* Internal method, status is assumed to be success, caller has to check status
	1446	* before calling this method.
	1447	* @param strsrch string search match
	1448	* @param start first offset of the accents to start searching
	1449	* @param end start of the last accent set
	1450	* @param status output error status if any
	1451	* @return USEARCH_DONE if a match is not found, otherwise return the starting
	1452	* offset of the match. Note this start includes all preceding accents.
	1453	*/
	1454	static
	1455	int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
	1456	int32_t start,
	1457	int32_t end,
	1458	UErrorCode *status)
	1459	{
	1460	const UChar *text = strsrch->search->text;
	1461	int32_t textlength = strsrch->search->textLength;
	1462	int32_t tempstart = start;
	1463
	1464	if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
	1465	// die... failed at a base character
	1466	return USEARCH_DONE;
	1467	}
	1468
	1469	int32_t offset = getNextBaseOffset(text, tempstart, textlength);
	1470	start = getPreviousBaseOffset(text, tempstart);
	1471
	1472	UChar accents[INITIAL_ARRAY_SIZE_];
	1473	// normalizing the offensive string
	1474	unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
	1475	INITIAL_ARRAY_SIZE_, status);
	1476	if (U_FAILURE(*status)) {
	1477	return USEARCH_DONE;
	1478	}
	1479
	1480	int32_t accentsindex[INITIAL_ARRAY_SIZE_];
	1481	int32_t accentsize = getUnblockedAccentIndex(accents,
	1482	accentsindex);
	1483	int32_t count = (2 << (accentsize - 1)) - 1;
	1484	UChar buffer[INITIAL_ARRAY_SIZE_];
	1485	UCollationElements *coleiter = strsrch->utilIter;
	1486	while (U_SUCCESS(*status) && count > 0) {
	1487	UChar *rearrange = strsrch->canonicalPrefixAccents;
	1488	// copy the base characters
	1489	for (int k = 0; k < accentsindex[0]; k ++) {
	1490	*rearrange ++ = accents[k];
	1491	}
	1492	// forming all possible canonical rearrangement by dropping
	1493	// sets of accents
	1494	for (int i = 0; i <= accentsize - 1; i ++) {
	1495	int32_t mask = 1 << (accentsize - i - 1);
	1496	if (count & mask) {
	1497	for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
	1498	*rearrange ++ = accents[j];
	1499	}
	1500	}
	1501	}
	1502	*rearrange = 0;
	1503	int32_t matchsize = INITIAL_ARRAY_SIZE_;
	1504	UChar *match = addToUCharArray(buffer, &matchsize,
	1505	strsrch->canonicalPrefixAccents,
	1506	strsrch->search->text + offset,
	1507	end - offset,
	1508	strsrch->canonicalSuffixAccents,
	1509	status);
	1510
	1511	// if status is a failure, ucol_setText does nothing.
	1512	// run the collator iterator through this match
	1513	ucol_setText(coleiter, match, matchsize, status);
	1514	if (U_SUCCESS(*status)) {
	1515	if (checkCollationMatch(strsrch, coleiter)) {
	1516	if (match != buffer) {
	1517	uprv_free(match);
	1518	}
	1519	return start;
	1520	}
	1521	}
	1522	count --;
	1523	}
	1524	return USEARCH_DONE;
	1525	}
	1526
	1527	/**
	1528	* Gets the offset to the safe point in text before textoffset.
	1529	* ie. not the middle of a contraction, swappable characters or supplementary
	1530	* characters.
	1531	* @param collator collation sata
	1532	* @param text string to work with
	1533	* @param textoffset offset in string
	1534	* @param textlength length of text string
	1535	* @return offset to the previous safe character
	1536	*/
	1537	static
	1538	inline uint32_t getPreviousSafeOffset(const UCollator *collator,
	1539	const UChar *text,
	1540	int32_t textoffset)
	1541	{
	1542	int32_t result = textoffset; // first contraction character
	1543	while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
	1544	result --;
	1545	}
	1546	if (result != 0) {
	1547	// the first contraction character is consider unsafe here
	1548	result --;
	1549	}
	1550	return result;
	1551	}
	1552
	1553	/**
	1554	* Cleaning up after we passed the safe zone
	1555	* @param strsrch string search data
	1556	* @param safetext safe text array
	1557	* @param safebuffer safe text buffer
	1558	* @param coleiter collation element iterator for safe text
	1559	*/
	1560	static
	1561	inline void cleanUpSafeText(const UStringSearch strsrch, UChar safetext,
	1562	UChar *safebuffer)
	1563	{
	1564	if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
	1565	{
	1566	uprv_free(safetext);
	1567	}
	1568	}
	1569
	1570	/**
	1571	* Take the rearranged end accents and tries matching. If match failed at
	1572	* a seperate preceding set of accents (seperated from the rearranged on by
	1573	* at least a base character) then we rearrange the preceding accents and
	1574	* tries matching again.
	1575	* We allow skipping of the ends of the accent set if the ces do not match.
	1576	* However if the failure is found before the accent set, it fails.
	1577	* Internal method, status assumed to be success, caller has to check status
	1578	* before calling this method.
	1579	* @param strsrch string search data
	1580	* @param textoffset of the start of the rearranged accent
	1581	* @param status output error status if any
	1582	* @return USEARCH_DONE if a match is not found, otherwise return the starting
	1583	* offset of the match. Note this start includes all preceding accents.
	1584	*/
	1585	static
	1586	int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
	1587	int32_t textoffset,
	1588	UErrorCode *status)
	1589	{
	1590	const UChar *text = strsrch->search->text;
	1591	const UCollator *collator = strsrch->collator;
	1592	int32_t safelength = 0;
	1593	UChar *safetext;
	1594	int32_t safetextlength;
	1595	UChar safebuffer[INITIAL_ARRAY_SIZE_];
	1596	UCollationElements *coleiter = strsrch->utilIter;
	1597	int32_t safeoffset = textoffset;
	1598
	1599	if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
	1600	collator)) {
	1601	safeoffset = getPreviousSafeOffset(collator, text, textoffset);
	1602	safelength = textoffset - safeoffset;
	1603	safetextlength = INITIAL_ARRAY_SIZE_;
	1604	safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
	1605	text + safeoffset, safelength,
	1606	strsrch->canonicalSuffixAccents,
	1607	status);
	1608	}
	1609	else {
	1610	safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
	1611	safetext = strsrch->canonicalSuffixAccents;
	1612	}
	1613
	1614	// if status is a failure, ucol_setText does nothing
	1615	ucol_setText(coleiter, safetext, safetextlength, status);
	1616	// status checked in loop below
	1617
	1618	int32_t *ce = strsrch->pattern.CE;
	1619	int32_t celength = strsrch->pattern.CELength;
	1620	int ceindex = celength - 1;
	1621	UBool isSafe = TRUE; // indication flag for position in safe zone
	1622
	1623	while (ceindex >= 0) {
	1624	int32_t textce = ucol_previous(coleiter, status);
	1625	if (U_FAILURE(*status)) {
	1626	if (isSafe) {
	1627	cleanUpSafeText(strsrch, safetext, safebuffer);
	1628	}
	1629	return USEARCH_DONE;
	1630	}
	1631	if (textce == UCOL_NULLORDER) {
	1632	// check if we have passed the safe buffer
	1633	if (coleiter == strsrch->textIter) {
	1634	cleanUpSafeText(strsrch, safetext, safebuffer);
	1635	return USEARCH_DONE;
	1636	}
	1637	cleanUpSafeText(strsrch, safetext, safebuffer);
	1638	safetext = safebuffer;
	1639	coleiter = strsrch->textIter;
	1640	setColEIterOffset(coleiter, safeoffset);
	1641	// status checked at the start of the loop
	1642	isSafe = FALSE;
	1643	continue;
	1644	}
	1645	textce = getCE(strsrch, textce);
	1646	if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
	1647	// do the beginning stuff
	1648	int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
	1649	if (isSafe && failedoffset >= safelength) {
	1650	// alas... no hope. failed at rearranged accent set
	1651	cleanUpSafeText(strsrch, safetext, safebuffer);
	1652	return USEARCH_DONE;
	1653	}
	1654	else {
	1655	if (isSafe) {
	1656	failedoffset += safeoffset;
	1657	cleanUpSafeText(strsrch, safetext, safebuffer);
	1658	}
	1659
	1660	// try rearranging the front accents
	1661	int32_t result = doNextCanonicalPrefixMatch(strsrch,
	1662	failedoffset, textoffset, status);
	1663	if (result != USEARCH_DONE) {
	1664	// if status is a failure, ucol_setOffset does nothing
	1665	setColEIterOffset(strsrch->textIter, result);
	1666	}
	1667	if (U_FAILURE(*status)) {
	1668	return USEARCH_DONE;
	1669	}
	1670	return result;
	1671	}
	1672	}
	1673	if (textce == ce[ceindex]) {
	1674	ceindex --;
	1675	}
	1676	}
	1677	// set offset here
	1678	if (isSafe) {
	1679	int32_t result = getColElemIterOffset(coleiter, FALSE);
	1680	// sets the text iterator here with the correct expansion and offset
	1681	int32_t leftoverces = getExpansionPrefix(coleiter);
	1682	cleanUpSafeText(strsrch, safetext, safebuffer);
	1683	if (result >= safelength) {
	1684	result = textoffset;
	1685	}
	1686	else {
	1687	result += safeoffset;
	1688	}
	1689	setColEIterOffset(strsrch->textIter, result);
	1690	strsrch->textIter->iteratordata_.toReturn =
	1691	setExpansionPrefix(strsrch->textIter, leftoverces);
	1692	return result;
	1693	}
	1694
	1695	return ucol_getOffset(coleiter);
	1696	}
	1697
	1698	/**
	1699	* Trying out the substring and sees if it can be a canonical match.
	1700	* This will try normalizing the end accents and arranging them into canonical
	1701	* equivalents and check their corresponding ces with the pattern ce.
	1702	* Suffix accents in the text will be grouped according to their combining
	1703	* class and the groups will be mixed and matched to try find the perfect
	1704	* match with the pattern.
	1705	* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
	1706	* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
	1707	* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
	1708	* "\u0301\u0325".
	1709	* step 2: check if any of the generated substrings matches the pattern.
	1710	* Internal method, status assumed to be success, caller has to check status
	1711	* before calling this method.
	1712	* @param strsrch string search data
	1713	* @param textoffset end offset in the collation element text that ends with
	1714	* the accents to be rearranged
	1715	* @param status error status if any
	1716	* @return TRUE if the match is valid, FALSE otherwise
	1717	*/
	1718	static
	1719	UBool doNextCanonicalMatch(UStringSearch *strsrch,
	1720	int32_t textoffset,
	1721	UErrorCode *status)
	1722	{
	1723	const UChar *text = strsrch->search->text;
	1724	int32_t temp = textoffset;
	1725	UTF_BACK_1(text, 0, temp);
	1726	if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
	1727	UCollationElements *coleiter = strsrch->textIter;
	1728	int32_t offset = getColElemIterOffset(coleiter, FALSE);
	1729	if (strsrch->pattern.hasPrefixAccents) {
	1730	offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
	1731	status);
	1732	if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
	1733	setColEIterOffset(coleiter, offset);
	1734	return TRUE;
	1735	}
	1736	}
	1737	return FALSE;
	1738	}
	1739
	1740	if (!strsrch->pattern.hasSuffixAccents) {
	1741	return FALSE;
	1742	}
	1743
	1744	UChar accents[INITIAL_ARRAY_SIZE_];
	1745	// offset to the last base character in substring to search
	1746	int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
	1747	// normalizing the offensive string
	1748	unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
	1749	0, accents, INITIAL_ARRAY_SIZE_, status);
	1750	// status checked in loop below
	1751
	1752	int32_t accentsindex[INITIAL_ARRAY_SIZE_];
	1753	int32_t size = getUnblockedAccentIndex(accents, accentsindex);
	1754
	1755	// 2 power n - 1 plus the full set of accents
	1756	int32_t count = (2 << (size - 1)) - 1;
	1757	while (U_SUCCESS(*status) && count > 0) {
	1758	UChar *rearrange = strsrch->canonicalSuffixAccents;
	1759	// copy the base characters
	1760	for (int k = 0; k < accentsindex[0]; k ++) {
	1761	*rearrange ++ = accents[k];
	1762	}
	1763	// forming all possible canonical rearrangement by dropping
	1764	// sets of accents
	1765	for (int i = 0; i <= size - 1; i ++) {
	1766	int32_t mask = 1 << (size - i - 1);
	1767	if (count & mask) {
	1768	for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
	1769	*rearrange ++ = accents[j];
	1770	}
	1771	}
	1772	}
	1773	*rearrange = 0;
	1774	int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
	1775	status);
	1776	if (offset != USEARCH_DONE) {
	1777	return TRUE; // match found
	1778	}
	1779	count --;
	1780	}
	1781	return FALSE;
	1782	}
	1783
	1784	/**
	1785	* Gets the previous base character offset depending on the string search
	1786	* pattern data
	1787	* @param strsrch string search data
	1788	* @param textoffset current offset, current character
	1789	* @return the offset of the next character after this base character or itself
	1790	* if it is a composed character with accents
	1791	*/
	1792	static
	1793	inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
	1794	int32_t textoffset)
	1795	{
	1796	if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
	1797	const UChar *text = strsrch->search->text;
	1798	int32_t offset = textoffset;
	1799	if (getFCD(text, &offset, strsrch->search->textLength) >>
	1800	SECOND_LAST_BYTE_SHIFT_) {
	1801	return getPreviousBaseOffset(text, textoffset);
	1802	}
	1803	}
	1804	return textoffset;
	1805	}
	1806
	1807	/**
	1808	* Checks match for contraction.
	1809	* If the match ends with a partial contraction we fail.
	1810	* If the match starts too far off (because of backwards iteration) we try to
	1811	* chip off the extra characters
	1812	* Internal method, status assumed to be success, caller has to check status
	1813	* before calling this method.
	1814	* @param strsrch string search data
	1815	* @param start offset of potential match, to be modified if necessary
	1816	* @param end offset of potential match, to be modified if necessary
	1817	* @param status output error status if any
	1818	* @return TRUE if match passes the contraction test, FALSE otherwise
	1819	*/
	1820	static
	1821	UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
	1822	int32_t *start,
	1823	int32_t *end,
	1824	UErrorCode *status)
	1825	{
	1826	UCollationElements *coleiter = strsrch->textIter;
	1827	int32_t textlength = strsrch->search->textLength;
	1828	int32_t temp = *start;
	1829	const UCollator *collator = strsrch->collator;
	1830	const UChar *text = strsrch->search->text;
	1831	// This part checks if either ends of the match contains potential
	1832	// contraction. If so we'll have to iterate through them
	1833	if ((end < textlength && ucol_unsafeCP(text[end], collator)) \|\|
	1834	(*start + 1 < textlength
	1835	&& ucol_unsafeCP(text[*start + 1], collator))) {
	1836	int32_t expansion = getExpansionPrefix(coleiter);
	1837	UBool expandflag = expansion > 0;
	1838	setColEIterOffset(coleiter, *start);
	1839	while (expansion > 0) {
	1840	// getting rid of the redundant ce, caused by setOffset.
	1841	// since backward contraction/expansion may have extra ces if we
	1842	// are in the normalization buffer, hasAccentsBeforeMatch would
	1843	// have taken care of it.
	1844	// E.g. the character \u01FA will have an expansion of 3, but if
	1845	// we are only looking for acute and ring \u030A and \u0301, we'll
	1846	// have to skip the first ce in the expansion buffer.
	1847	ucol_next(coleiter, status);
	1848	if (U_FAILURE(*status)) {
	1849	return FALSE;
	1850	}
	1851	if (ucol_getOffset(coleiter) != temp) {
	1852	*start = temp;
	1853	temp = ucol_getOffset(coleiter);
	1854	}
	1855	expansion --;
	1856	}
	1857
	1858	int32_t *patternce = strsrch->pattern.CE;
	1859	int32_t patterncelength = strsrch->pattern.CELength;
	1860	int32_t count = 0;
	1861	int32_t textlength = strsrch->search->textLength;
	1862	while (count < patterncelength) {
	1863	int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
	1864	// status checked below, note that if status is a failure
	1865	// ucol_next returns UCOL_NULLORDER
	1866	if (ce == UCOL_IGNORABLE) {
	1867	continue;
	1868	}
	1869	if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
	1870	*start = temp;
	1871	temp = ucol_getOffset(coleiter);
	1872	}
	1873
	1874	if (count == 0 && ce != patternce[0]) {
	1875	// accents may have extra starting ces, this occurs when a
	1876	// pure accent pattern is matched without rearrangement
	1877	// text \u0325\u0300 and looking for \u0300
	1878	int32_t expected = patternce[0];
	1879	if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
	1880	ce = getCE(strsrch, ucol_next(coleiter, status));
	1881	while (U_SUCCESS(*status) && ce != expected &&
	1882	ce != UCOL_NULLORDER &&
	1883	ucol_getOffset(coleiter) <= *end) {
	1884	ce = getCE(strsrch, ucol_next(coleiter, status));
	1885	}
	1886	}
	1887	}
	1888	if (U_FAILURE(*status) \|\| ce != patternce[count]) {
	1889	(*end) ++;
	1890	end = getNextUStringSearchBaseOffset(strsrch, end);
	1891	return FALSE;
	1892	}
	1893	count ++;
	1894	}
	1895	}
	1896	return TRUE;
	1897	}
	1898
	1899	/**
	1900	* Checks and sets the match information if found.
	1901	* Checks
	1902	* <ul>
	1903	* <li> the potential match does not repeat the previous match
	1904	* <li> boundaries are correct
	1905	* <li> potential match does not end in the middle of a contraction
	1906	* <li> identical matches
	1907	* <\ul>
	1908	* Otherwise the offset will be shifted to the next character.
	1909	* Internal method, status assumed to be success, caller has to check the
	1910	* status before calling this method.
	1911	* @param strsrch string search data
	1912	* @param textoffset offset in the collation element text. the returned value
	1913	* will be the truncated end offset of the match or the new start
	1914	* search offset.
	1915	* @param status output error status if any
	1916	* @return TRUE if the match is valid, FALSE otherwise
	1917	*/
	1918	static
	1919	inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
	1920	int32_t *textoffset,
	1921	UErrorCode *status)
	1922	{
	1923	// to ensure that the start and ends are not composite characters
	1924	UCollationElements *coleiter = strsrch->textIter;
	1925	// if we have a canonical accent match
	1926	if ((strsrch->pattern.hasSuffixAccents &&
	1927	strsrch->canonicalSuffixAccents[0]) \|\|
	1928	(strsrch->pattern.hasPrefixAccents &&
	1929	strsrch->canonicalPrefixAccents[0])) {
	1930	strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
	1931	strsrch,
	1932	ucol_getOffset(coleiter));
	1933	strsrch->search->matchedLength = *textoffset -
	1934	strsrch->search->matchedIndex;
	1935	return TRUE;
	1936	}
	1937
	1938	int32_t start = getColElemIterOffset(coleiter, FALSE);
	1939	if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
	1940	status) \|\| U_FAILURE(*status)) {
	1941	return FALSE;
	1942	}
	1943
	1944	start = getPreviousUStringSearchBaseOffset(strsrch, start);
	1945	// this totally matches, however we need to check if it is repeating
	1946	if (checkRepeatedMatch(strsrch, start, *textoffset) \|\|
	1947	!isBreakUnit(strsrch, start, *textoffset) \|\|
	1948	!checkIdentical(strsrch, start, *textoffset)) {
	1949	(*textoffset) ++;
	1950	textoffset = getNextBaseOffset(strsrch->search->text, textoffset,
	1951	strsrch->search->textLength);
	1952	return FALSE;
	1953	}
	1954
	1955	strsrch->search->matchedIndex = start;
	1956	strsrch->search->matchedLength = *textoffset - start;
	1957	return TRUE;
	1958	}
	1959
	1960	/**
	1961	* Shifting the collation element iterator position forward to prepare for
	1962	* a preceding match. If the first character is a unsafe character, we'll only
	1963	* shift by 1 to capture contractions, normalization etc.
	1964	* Internal method, status assumed to be success, caller has to check status
	1965	* before calling this method.
	1966	* @param text strsrch string search data
	1967	* @param textoffset start text position to do search
	1968	* @param ce the text ce which failed the match.
	1969	* @param patternceindex index of the ce within the pattern ce buffer which
	1970	* failed the match
	1971	* @return final offset
	1972	*/
	1973	static
	1974	inline int32_t reverseShift(UStringSearch *strsrch,
	1975	int32_t textoffset,
	1976	int32_t ce,
	1977	int32_t patternceindex)
	1978	{
	1979	if (strsrch->search->isOverlap) {
	1980	if (textoffset != strsrch->search->textLength) {
	1981	textoffset --;
	1982	}
	1983	else {
	1984	textoffset -= strsrch->pattern.defaultShiftSize;
	1985	}
	1986	}
	1987	else {
	1988	if (ce != UCOL_NULLORDER) {
	1989	int32_t shift = strsrch->pattern.backShift[hash(ce)];
	1990
	1991	// this is to adjust for characters in the middle of the substring
	1992	// for matching that failed.
	1993	int32_t adjust = patternceindex;
	1994	if (adjust > 1 && shift > adjust) {
	1995	shift -= adjust - 1;
	1996	}
	1997	textoffset -= shift;
	1998	}
	1999	else {
	2000	textoffset -= strsrch->pattern.defaultShiftSize;
	2001	}
	2002	}
	2003	textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
	2004	return textoffset;
	2005	}
	2006
	2007	/**
	2008	* Checks match for contraction.
	2009	* If the match starts with a partial contraction we fail.
	2010	* Internal method, status assumed to be success, caller has to check status
	2011	* before calling this method.
	2012	* @param strsrch string search data
	2013	* @param start offset of potential match, to be modified if necessary
	2014	* @param end offset of potential match, to be modified if necessary
	2015	* @param status output error status if any
	2016	* @return TRUE if match passes the contraction test, FALSE otherwise
	2017	*/
	2018	static
	2019	UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
	2020	int32_t *start,
	2021	int32_t end, UErrorCode status)
	2022	{
	2023	UCollationElements *coleiter = strsrch->textIter;
	2024	int32_t textlength = strsrch->search->textLength;
	2025	int32_t temp = *end;
	2026	const UCollator *collator = strsrch->collator;
	2027	const UChar *text = strsrch->search->text;
	2028	// This part checks if either if the start of the match contains potential
	2029	// contraction. If so we'll have to iterate through them
	2030	// Since we used ucol_next while previously looking for the potential
	2031	// match, this guarantees that our end will not be a partial contraction,
	2032	// or a partial supplementary character.
	2033	if (start < textlength && ucol_unsafeCP(text[start], collator)) {
	2034	int32_t expansion = getExpansionSuffix(coleiter);
	2035	UBool expandflag = expansion > 0;
	2036	setColEIterOffset(coleiter, *end);
	2037	while (U_SUCCESS(*status) && expansion > 0) {
	2038	// getting rid of the redundant ce
	2039	// since forward contraction/expansion may have extra ces
	2040	// if we are in the normalization buffer, hasAccentsBeforeMatch
	2041	// would have taken care of it.
	2042	// E.g. the character \u01FA will have an expansion of 3, but if
	2043	// we are only looking for A ring A\u030A, we'll have to skip the
	2044	// last ce in the expansion buffer
	2045	ucol_previous(coleiter, status);
	2046	if (U_FAILURE(*status)) {
	2047	return FALSE;
	2048	}
	2049	if (ucol_getOffset(coleiter) != temp) {
	2050	*end = temp;
	2051	temp = ucol_getOffset(coleiter);
	2052	}
	2053	expansion --;
	2054	}
	2055
	2056	int32_t *patternce = strsrch->pattern.CE;
	2057	int32_t patterncelength = strsrch->pattern.CELength;
	2058	int32_t count = patterncelength;
	2059	while (count > 0) {
	2060	int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
	2061	// status checked below, note that if status is a failure
	2062	// ucol_previous returns UCOL_NULLORDER
	2063	if (ce == UCOL_IGNORABLE) {
	2064	continue;
	2065	}
	2066	if (expandflag && count == 0 &&
	2067	getColElemIterOffset(coleiter, FALSE) != temp) {
	2068	*end = temp;
	2069	temp = ucol_getOffset(coleiter);
	2070	}
	2071	if (U_FAILURE(*status) \|\| ce != patternce[count - 1]) {
	2072	(*start) --;
	2073	start = getPreviousBaseOffset(text, start);
	2074	return FALSE;
	2075	}
	2076	count --;
	2077	}
	2078	}
	2079	return TRUE;
	2080	}
	2081
	2082	/**
	2083	* Checks and sets the match information if found.
	2084	* Checks
	2085	* <ul>
	2086	* <li> the current match does not repeat the last match
	2087	* <li> boundaries are correct
	2088	* <li> exact matches has no extra accents
	2089	* <li> identical matches
	2090	* <\ul>
	2091	* Otherwise the offset will be shifted to the preceding character.
	2092	* Internal method, status assumed to be success, caller has to check status
	2093	* before calling this method.
	2094	* @param strsrch string search data
	2095	* @param collator
	2096	* @param coleiter collation element iterator
	2097	* @param text string
	2098	* @param textoffset offset in the collation element text. the returned value
	2099	* will be the truncated start offset of the match or the new start
	2100	* search offset.
	2101	* @param status output error status if any
	2102	* @return TRUE if the match is valid, FALSE otherwise
	2103	*/
	2104	static
	2105	inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
	2106	int32_t *textoffset,
	2107	UErrorCode *status)
	2108	{
	2109	// to ensure that the start and ends are not composite characters
	2110	int32_t end = ucol_getOffset(strsrch->textIter);
	2111	if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
	2112	\|\| U_FAILURE(*status)) {
	2113	return FALSE;
	2114	}
	2115
	2116	// this totally matches, however we need to check if it is repeating
	2117	// the old match
	2118	if (checkRepeatedMatch(strsrch, *textoffset, end) \|\|
	2119	!isBreakUnit(strsrch, *textoffset, end) \|\|
	2120	hasAccentsBeforeMatch(strsrch, *textoffset, end) \|\|
	2121	!checkIdentical(strsrch, *textoffset, end) \|\|
	2122	hasAccentsAfterMatch(strsrch, *textoffset, end)) {
	2123	(*textoffset) --;
	2124	*textoffset = getPreviousBaseOffset(strsrch->search->text,
	2125	*textoffset);
	2126	return FALSE;
	2127	}
	2128
	2129	//Add breakiterator boundary check for primary strength search.
	2130	if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
	2131	checkBreakBoundary(strsrch, textoffset, &end);
	2132	}
	2133
	2134	strsrch->search->matchedIndex = *textoffset;
	2135	strsrch->search->matchedLength = end - *textoffset;
	2136	return TRUE;
	2137	}
	2138
	2139	/**
	2140	* Rearranges the end accents to try matching.
	2141	* Suffix accents in the text will be grouped according to their combining
	2142	* class and the groups will be mixed and matched to try find the perfect
	2143	* match with the pattern.
	2144	* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
	2145	* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
	2146	* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
	2147	* "\u0301\u0325".
	2148	* step 2: check if any of the generated substrings matches the pattern.
	2149	* Internal method, status assumed to be success, user has to check status
	2150	* before calling this method.
	2151	* @param strsrch string search match
	2152	* @param start offset of the first base character
	2153	* @param end start of the last accent set
	2154	* @param status only error status if any
	2155	* @return USEARCH_DONE if a match is not found, otherwise return the ending
	2156	* offset of the match. Note this start includes all following accents.
	2157	*/
	2158	static
	2159	int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
	2160	int32_t start,
	2161	int32_t end,
	2162	UErrorCode *status)
	2163	{
	2164	const UChar *text = strsrch->search->text;
	2165	int32_t tempend = end;
	2166
	2167	UTF_BACK_1(text, 0, tempend);
	2168	if (!(getFCD(text, &tempend, strsrch->search->textLength) &
	2169	LAST_BYTE_MASK_)) {
	2170	// die... failed at a base character
	2171	return USEARCH_DONE;
	2172	}
	2173	end = getNextBaseOffset(text, end, strsrch->search->textLength);
	2174
	2175	if (U_SUCCESS(*status)) {
	2176	UChar accents[INITIAL_ARRAY_SIZE_];
	2177	int32_t offset = getPreviousBaseOffset(text, end);
	2178	// normalizing the offensive string
	2179	unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
	2180	INITIAL_ARRAY_SIZE_, status);
	2181
	2182	int32_t accentsindex[INITIAL_ARRAY_SIZE_];
	2183	int32_t accentsize = getUnblockedAccentIndex(accents,
	2184	accentsindex);
	2185	int32_t count = (2 << (accentsize - 1)) - 1;
	2186	UChar buffer[INITIAL_ARRAY_SIZE_];
	2187	UCollationElements *coleiter = strsrch->utilIter;
	2188	while (U_SUCCESS(*status) && count > 0) {
	2189	UChar *rearrange = strsrch->canonicalSuffixAccents;
	2190	// copy the base characters
	2191	for (int k = 0; k < accentsindex[0]; k ++) {
	2192	*rearrange ++ = accents[k];
	2193	}
	2194	// forming all possible canonical rearrangement by dropping
	2195	// sets of accents
	2196	for (int i = 0; i <= accentsize - 1; i ++) {
	2197	int32_t mask = 1 << (accentsize - i - 1);
	2198	if (count & mask) {
	2199	for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
	2200	*rearrange ++ = accents[j];
	2201	}
	2202	}
	2203	}
	2204	*rearrange = 0;
	2205	int32_t matchsize = INITIAL_ARRAY_SIZE_;
	2206	UChar *match = addToUCharArray(buffer, &matchsize,
	2207	strsrch->canonicalPrefixAccents,
	2208	strsrch->search->text + start,
	2209	offset - start,
	2210	strsrch->canonicalSuffixAccents,
	2211	status);
	2212
	2213	// run the collator iterator through this match
	2214	// if status is a failure ucol_setText does nothing
	2215	ucol_setText(coleiter, match, matchsize, status);
	2216	if (U_SUCCESS(*status)) {
	2217	if (checkCollationMatch(strsrch, coleiter)) {
	2218	if (match != buffer) {
	2219	uprv_free(match);
	2220	}
	2221	return end;
	2222	}
	2223	}
	2224	count --;
	2225	}
	2226	}
	2227	return USEARCH_DONE;
	2228	}
	2229
	2230	/**
	2231	* Take the rearranged start accents and tries matching. If match failed at
	2232	* a seperate following set of accents (seperated from the rearranged on by
	2233	* at least a base character) then we rearrange the preceding accents and
	2234	* tries matching again.
	2235	* We allow skipping of the ends of the accent set if the ces do not match.
	2236	* However if the failure is found before the accent set, it fails.
	2237	* Internal method, status assumed to be success, caller has to check status
	2238	* before calling this method.
	2239	* @param strsrch string search data
	2240	* @param textoffset of the ends of the rearranged accent
	2241	* @param status output error status if any
	2242	* @return USEARCH_DONE if a match is not found, otherwise return the ending
	2243	* offset of the match. Note this start includes all following accents.
	2244	*/
	2245	static
	2246	int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
	2247	int32_t textoffset,
	2248	UErrorCode *status)
	2249	{
	2250	const UChar *text = strsrch->search->text;
	2251	const UCollator *collator = strsrch->collator;
	2252	int32_t safelength = 0;
	2253	UChar *safetext;
	2254	int32_t safetextlength;
	2255	UChar safebuffer[INITIAL_ARRAY_SIZE_];
	2256	int32_t safeoffset = textoffset;
	2257
	2258	if (textoffset &&
	2259	ucol_unsafeCP(strsrch->canonicalPrefixAccents[
	2260	u_strlen(strsrch->canonicalPrefixAccents) - 1
	2261	], collator)) {
	2262	safeoffset = getNextSafeOffset(collator, text, textoffset,
	2263	strsrch->search->textLength);
	2264	safelength = safeoffset - textoffset;
	2265	safetextlength = INITIAL_ARRAY_SIZE_;
	2266	safetext = addToUCharArray(safebuffer, &safetextlength,
	2267	strsrch->canonicalPrefixAccents,
	2268	text + textoffset, safelength,
	2269	NULL, status);
	2270	}
	2271	else {
	2272	safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
	2273	safetext = strsrch->canonicalPrefixAccents;
	2274	}
	2275
	2276	UCollationElements *coleiter = strsrch->utilIter;
	2277	// if status is a failure, ucol_setText does nothing
	2278	ucol_setText(coleiter, safetext, safetextlength, status);
	2279	// status checked in loop below
	2280
	2281	int32_t *ce = strsrch->pattern.CE;
	2282	int32_t celength = strsrch->pattern.CELength;
	2283	int ceindex = 0;
	2284	UBool isSafe = TRUE; // safe zone indication flag for position
	2285	int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
	2286
	2287	while (ceindex < celength) {
	2288	int32_t textce = ucol_next(coleiter, status);
	2289	if (U_FAILURE(*status)) {
	2290	if (isSafe) {
	2291	cleanUpSafeText(strsrch, safetext, safebuffer);
	2292	}
	2293	return USEARCH_DONE;
	2294	}
	2295	if (textce == UCOL_NULLORDER) {
	2296	// check if we have passed the safe buffer
	2297	if (coleiter == strsrch->textIter) {
	2298	cleanUpSafeText(strsrch, safetext, safebuffer);
	2299	return USEARCH_DONE;
	2300	}
	2301	cleanUpSafeText(strsrch, safetext, safebuffer);
	2302	safetext = safebuffer;
	2303	coleiter = strsrch->textIter;
	2304	setColEIterOffset(coleiter, safeoffset);
	2305	// status checked at the start of the loop
	2306	isSafe = FALSE;
	2307	continue;
	2308	}
	2309	textce = getCE(strsrch, textce);
	2310	if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
	2311	// do the beginning stuff
	2312	int32_t failedoffset = ucol_getOffset(coleiter);
	2313	if (isSafe && failedoffset <= prefixlength) {
	2314	// alas... no hope. failed at rearranged accent set
	2315	cleanUpSafeText(strsrch, safetext, safebuffer);
	2316	return USEARCH_DONE;
	2317	}
	2318	else {
	2319	if (isSafe) {
	2320	failedoffset = safeoffset - failedoffset;
	2321	cleanUpSafeText(strsrch, safetext, safebuffer);
	2322	}
	2323
	2324	// try rearranging the end accents
	2325	int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
	2326	textoffset, failedoffset, status);
	2327	if (result != USEARCH_DONE) {
	2328	// if status is a failure, ucol_setOffset does nothing
	2329	setColEIterOffset(strsrch->textIter, result);
	2330	}
	2331	if (U_FAILURE(*status)) {
	2332	return USEARCH_DONE;
	2333	}
	2334	return result;
	2335	}
	2336	}
	2337	if (textce == ce[ceindex]) {
	2338	ceindex ++;
	2339	}
	2340	}
	2341	// set offset here
	2342	if (isSafe) {
	2343	int32_t result = ucol_getOffset(coleiter);
	2344	// sets the text iterator here with the correct expansion and offset
	2345	int32_t leftoverces = getExpansionSuffix(coleiter);
	2346	cleanUpSafeText(strsrch, safetext, safebuffer);
	2347	if (result <= prefixlength) {
	2348	result = textoffset;
	2349	}
	2350	else {
	2351	result = textoffset + (safeoffset - result);
	2352	}
	2353	setColEIterOffset(strsrch->textIter, result);
	2354	setExpansionSuffix(strsrch->textIter, leftoverces);
	2355	return result;
	2356	}
	2357
	2358	return ucol_getOffset(coleiter);
	2359	}
	2360
	2361	/**
	2362	* Trying out the substring and sees if it can be a canonical match.
	2363	* This will try normalizing the starting accents and arranging them into
	2364	* canonical equivalents and check their corresponding ces with the pattern ce.
	2365	* Prefix accents in the text will be grouped according to their combining
	2366	* class and the groups will be mixed and matched to try find the perfect
	2367	* match with the pattern.
	2368	* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
	2369	* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
	2370	* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
	2371	* "\u0301\u0325".
	2372	* step 2: check if any of the generated substrings matches the pattern.
	2373	* Internal method, status assumed to be success, caller has to check status
	2374	* before calling this method.
	2375	* @param strsrch string search data
	2376	* @param textoffset start offset in the collation element text that starts
	2377	* with the accents to be rearranged
	2378	* @param status output error status if any
	2379	* @return TRUE if the match is valid, FALSE otherwise
	2380	*/
	2381	static
	2382	UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
	2383	int32_t textoffset,
	2384	UErrorCode *status)
	2385	{
	2386	const UChar *text = strsrch->search->text;
	2387	int32_t temp = textoffset;
	2388	int32_t textlength = strsrch->search->textLength;
	2389	if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
	2390	UCollationElements *coleiter = strsrch->textIter;
	2391	int32_t offset = ucol_getOffset(coleiter);
	2392	if (strsrch->pattern.hasSuffixAccents) {
	2393	offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
	2394	offset, status);
	2395	if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
	2396	setColEIterOffset(coleiter, offset);
	2397	return TRUE;
	2398	}
	2399	}
	2400	return FALSE;
	2401	}
	2402
	2403	if (!strsrch->pattern.hasPrefixAccents) {
	2404	return FALSE;
	2405	}
	2406
	2407	UChar accents[INITIAL_ARRAY_SIZE_];
	2408	// offset to the last base character in substring to search
	2409	int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
	2410	// normalizing the offensive string
	2411	unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
	2412	0, accents, INITIAL_ARRAY_SIZE_, status);
	2413	// status checked in loop
	2414
	2415	int32_t accentsindex[INITIAL_ARRAY_SIZE_];
	2416	int32_t size = getUnblockedAccentIndex(accents, accentsindex);
	2417
	2418	// 2 power n - 1 plus the full set of accents
	2419	int32_t count = (2 << (size - 1)) - 1;
	2420	while (U_SUCCESS(*status) && count > 0) {
	2421	UChar *rearrange = strsrch->canonicalPrefixAccents;
	2422	// copy the base characters
	2423	for (int k = 0; k < accentsindex[0]; k ++) {
	2424	*rearrange ++ = accents[k];
	2425	}
	2426	// forming all possible canonical rearrangement by dropping
	2427	// sets of accents
	2428	for (int i = 0; i <= size - 1; i ++) {
	2429	int32_t mask = 1 << (size - i - 1);
	2430	if (count & mask) {
	2431	for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
	2432	*rearrange ++ = accents[j];
	2433	}
	2434	}
	2435	}
	2436	*rearrange = 0;
	2437	int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
	2438	baseoffset, status);
	2439	if (offset != USEARCH_DONE) {
	2440	return TRUE; // match found
	2441	}
	2442	count --;
	2443	}
	2444	return FALSE;
	2445	}
	2446
	2447	/**
	2448	* Checks match for contraction.
	2449	* If the match starts with a partial contraction we fail.
	2450	* Internal method, status assumed to be success, caller has to check status
	2451	* before calling this method.
	2452	* @param strsrch string search data
	2453	* @param start offset of potential match, to be modified if necessary
	2454	* @param end offset of potential match, to be modified if necessary
	2455	* @param status only error status if any
	2456	* @return TRUE if match passes the contraction test, FALSE otherwise
	2457	*/
	2458	static
	2459	UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
	2460	int32_t *start,
	2461	int32_t end, UErrorCode status)
	2462	{
	2463	UCollationElements *coleiter = strsrch->textIter;
	2464	int32_t textlength = strsrch->search->textLength;
	2465	int32_t temp = *end;
	2466	const UCollator *collator = strsrch->collator;
	2467	const UChar *text = strsrch->search->text;
	2468	// This part checks if either if the start of the match contains potential
	2469	// contraction. If so we'll have to iterate through them
	2470	// Since we used ucol_next while previously looking for the potential
	2471	// match, this guarantees that our end will not be a partial contraction,
	2472	// or a partial supplementary character.
	2473	if (start < textlength && ucol_unsafeCP(text[start], collator)) {
	2474	int32_t expansion = getExpansionSuffix(coleiter);
	2475	UBool expandflag = expansion > 0;
	2476	setColEIterOffset(coleiter, *end);
	2477	while (expansion > 0) {
	2478	// getting rid of the redundant ce
	2479	// since forward contraction/expansion may have extra ces
	2480	// if we are in the normalization buffer, hasAccentsBeforeMatch
	2481	// would have taken care of it.
	2482	// E.g. the character \u01FA will have an expansion of 3, but if
	2483	// we are only looking for A ring A\u030A, we'll have to skip the
	2484	// last ce in the expansion buffer
	2485	ucol_previous(coleiter, status);
	2486	if (U_FAILURE(*status)) {
	2487	return FALSE;
	2488	}
	2489	if (ucol_getOffset(coleiter) != temp) {
	2490	*end = temp;
	2491	temp = ucol_getOffset(coleiter);
	2492	}
	2493	expansion --;
	2494	}
	2495
	2496	int32_t *patternce = strsrch->pattern.CE;
	2497	int32_t patterncelength = strsrch->pattern.CELength;
	2498	int32_t count = patterncelength;
	2499	while (count > 0) {
	2500	int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
	2501	// status checked below, note that if status is a failure
	2502	// ucol_previous returns UCOL_NULLORDER
	2503	if (ce == UCOL_IGNORABLE) {
	2504	continue;
	2505	}
	2506	if (expandflag && count == 0 &&
	2507	getColElemIterOffset(coleiter, FALSE) != temp) {
	2508	*end = temp;
	2509	temp = ucol_getOffset(coleiter);
	2510	}
	2511	if (count == patterncelength &&
	2512	ce != patternce[patterncelength - 1]) {
	2513	// accents may have extra starting ces, this occurs when a
	2514	// pure accent pattern is matched without rearrangement
	2515	int32_t expected = patternce[patterncelength - 1];
	2516	UTF_BACK_1(text, 0, *end);
	2517	if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
	2518	ce = getCE(strsrch, ucol_previous(coleiter, status));
	2519	while (U_SUCCESS(*status) && ce != expected &&
	2520	ce != UCOL_NULLORDER &&
	2521	ucol_getOffset(coleiter) <= *start) {
	2522	ce = getCE(strsrch, ucol_previous(coleiter, status));
	2523	}
	2524	}
	2525	}
	2526	if (U_FAILURE(*status) \|\| ce != patternce[count - 1]) {
	2527	(*start) --;
	2528	start = getPreviousBaseOffset(text, start);
	2529	return FALSE;
	2530	}
	2531	count --;
	2532	}
	2533	}
	2534	return TRUE;
	2535	}
	2536
	2537	/**
	2538	* Checks and sets the match information if found.
	2539	* Checks
	2540	* <ul>
	2541	* <li> the potential match does not repeat the previous match
	2542	* <li> boundaries are correct
	2543	* <li> potential match does not end in the middle of a contraction
	2544	* <li> identical matches
	2545	* <\ul>
	2546	* Otherwise the offset will be shifted to the next character.
	2547	* Internal method, status assumed to be success, caller has to check status
	2548	* before calling this method.
	2549	* @param strsrch string search data
	2550	* @param textoffset offset in the collation element text. the returned value
	2551	* will be the truncated start offset of the match or the new start
	2552	* search offset.
	2553	* @param status only error status if any
	2554	* @return TRUE if the match is valid, FALSE otherwise
	2555	*/
	2556	static
	2557	inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
	2558	int32_t *textoffset,
	2559	UErrorCode *status)
	2560	{
	2561	// to ensure that the start and ends are not composite characters
	2562	UCollationElements *coleiter = strsrch->textIter;
	2563	// if we have a canonical accent match
	2564	if ((strsrch->pattern.hasSuffixAccents &&
	2565	strsrch->canonicalSuffixAccents[0]) \|\|
	2566	(strsrch->pattern.hasPrefixAccents &&
	2567	strsrch->canonicalPrefixAccents[0])) {
	2568	strsrch->search->matchedIndex = *textoffset;
	2569	strsrch->search->matchedLength =
	2570	getNextUStringSearchBaseOffset(strsrch,
	2571	getColElemIterOffset(coleiter, FALSE))
	2572	- *textoffset;
	2573	return TRUE;
	2574	}
	2575
	2576	int32_t end = ucol_getOffset(coleiter);
	2577	if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
	2578	status) \|\|
	2579	U_FAILURE(*status)) {
	2580	return FALSE;
	2581	}
	2582
	2583	end = getNextUStringSearchBaseOffset(strsrch, end);
	2584	// this totally matches, however we need to check if it is repeating
	2585	if (checkRepeatedMatch(strsrch, *textoffset, end) \|\|
	2586	!isBreakUnit(strsrch, *textoffset, end) \|\|
	2587	!checkIdentical(strsrch, *textoffset, end)) {
	2588	(*textoffset) --;
	2589	*textoffset = getPreviousBaseOffset(strsrch->search->text,
	2590	*textoffset);
	2591	return FALSE;
	2592	}
	2593
	2594	strsrch->search->matchedIndex = *textoffset;
	2595	strsrch->search->matchedLength = end - *textoffset;
	2596	return TRUE;
	2597	}
	2598	#endif // #if BOYER_MOORE
	2599
	2600	// constructors and destructor -------------------------------------------
	2601
	2602	U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
	2603	int32_t patternlength,
	2604	const UChar *text,
	2605	int32_t textlength,
	2606	const char *locale,
	2607	UBreakIterator *breakiter,
	2608	UErrorCode *status)
	2609	{
	2610	if (U_FAILURE(*status)) {
	2611	return NULL;
	2612	}
	2613	#if UCONFIG_NO_BREAK_ITERATION
	2614	if (breakiter != NULL) {
	2615	*status = U_UNSUPPORTED_ERROR;
	2616	return NULL;
	2617	}
	2618	#endif
	2619	if (locale) {
	2620	// ucol_open internally checks for status
	2621	UCollator *collator = ucol_open(locale, status);
	2622	// pattern, text checks are done in usearch_openFromCollator
	2623	UStringSearch *result = usearch_openFromCollator(pattern,
	2624	patternlength, text, textlength,
	2625	collator, breakiter, status);
	2626
	2627	if (result == NULL \|\| U_FAILURE(*status)) {
	2628	if (collator) {
	2629	ucol_close(collator);
	2630	}
	2631	return NULL;
	2632	}
	2633	else {
	2634	result->ownCollator = TRUE;
	2635	}
	2636	return result;
	2637	}
	2638	*status = U_ILLEGAL_ARGUMENT_ERROR;
	2639	return NULL;
	2640	}
	2641
	2642	U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
	2643	const UChar *pattern,
	2644	int32_t patternlength,
	2645	const UChar *text,
	2646	int32_t textlength,
	2647	const UCollator *collator,
	2648	UBreakIterator *breakiter,
	2649	UErrorCode *status)
	2650	{
	2651	if (U_FAILURE(*status)) {
	2652	return NULL;
	2653	}
	2654	#if UCONFIG_NO_BREAK_ITERATION
	2655	if (breakiter != NULL) {
	2656	*status = U_UNSUPPORTED_ERROR;
	2657	return NULL;
	2658	}
	2659	#endif
	2660	if (pattern == NULL \|\| text == NULL \|\| collator == NULL) {
	2661	*status = U_ILLEGAL_ARGUMENT_ERROR;
	2662	return NULL;
	2663	}
	2664
	2665	// string search does not really work when numeric collation is turned on
	2666	if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
	2667	*status = U_UNSUPPORTED_ERROR;
	2668	return NULL;
	2669	}
	2670
	2671	if (U_SUCCESS(*status)) {
	2672	initializeFCD(status);
	2673	if (U_FAILURE(*status)) {
	2674	return NULL;
	2675	}
	2676
	2677	UStringSearch *result;
	2678	if (textlength == -1) {
	2679	textlength = u_strlen(text);
	2680	}
	2681	if (patternlength == -1) {
	2682	patternlength = u_strlen(pattern);
	2683	}
	2684	if (textlength <= 0 \|\| patternlength <= 0) {
	2685	*status = U_ILLEGAL_ARGUMENT_ERROR;
	2686	return NULL;
	2687	}
	2688
	2689	result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
	2690	if (result == NULL) {
	2691	*status = U_MEMORY_ALLOCATION_ERROR;
	2692	return NULL;
	2693	}
	2694
	2695	result->collator = collator;
	2696	result->strength = ucol_getStrength(collator);
	2697	result->ceMask = getMask(result->strength);
	2698	result->toShift =
	2699	ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
	2700	UCOL_SHIFTED;
	2701	result->variableTop = ucol_getVariableTop(collator, status);
	2702
	2703	result->nfd = Normalizer2Factory::getNFDInstance(*status);
	2704
	2705	if (U_FAILURE(*status)) {
	2706	uprv_free(result);
	2707	return NULL;
	2708	}
	2709
	2710	result->search = (USearch *)uprv_malloc(sizeof(USearch));
	2711	if (result->search == NULL) {
	2712	*status = U_MEMORY_ALLOCATION_ERROR;
	2713	uprv_free(result);
	2714	return NULL;
	2715	}
	2716
	2717	result->search->text = text;
	2718	result->search->textLength = textlength;
	2719
	2720	result->pattern.text = pattern;
	2721	result->pattern.textLength = patternlength;
	2722	result->pattern.CE = NULL;
	2723	result->pattern.PCE = NULL;
	2724
	2725	result->search->breakIter = breakiter;
	2726	#if !UCONFIG_NO_BREAK_ITERATION
	2727	result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
	2728	if (breakiter) {
	2729	ubrk_setText(breakiter, text, textlength, status);
	2730	}
	2731	#endif
	2732
	2733	result->ownCollator = FALSE;
	2734	result->search->matchedLength = 0;
	2735	result->search->matchedIndex = USEARCH_DONE;
	2736	result->utilIter = NULL;
	2737	result->textIter = ucol_openElements(collator, text,
	2738	textlength, status);
	2739	if (U_FAILURE(*status)) {
	2740	usearch_close(result);
	2741	return NULL;
	2742	}
	2743
	2744	result->search->isOverlap = FALSE;
	2745	result->search->isCanonicalMatch = FALSE;
	2746	result->search->elementComparisonType = 0;
	2747	result->search->isForwardSearching = TRUE;
	2748	result->search->reset = TRUE;
	2749
	2750	initialize(result, status);
	2751
	2752	if (U_FAILURE(*status)) {
	2753	usearch_close(result);
	2754	return NULL;
	2755	}
	2756
	2757	return result;
	2758	}
	2759	return NULL;
	2760	}
	2761
	2762	U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
	2763	{
	2764	if (strsrch) {
	2765	if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
	2766	strsrch->pattern.CE) {
	2767	uprv_free(strsrch->pattern.CE);
	2768	}
	2769
	2770	if (strsrch->pattern.PCE != NULL &&
	2771	strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
	2772	uprv_free(strsrch->pattern.PCE);
	2773	}
	2774
	2775	ucol_closeElements(strsrch->textIter);
	2776	ucol_closeElements(strsrch->utilIter);
	2777
	2778	if (strsrch->ownCollator && strsrch->collator) {
	2779	ucol_close((UCollator *)strsrch->collator);
	2780	}
	2781
	2782	#if !UCONFIG_NO_BREAK_ITERATION
	2783	if (strsrch->search->internalBreakIter) {
	2784	ubrk_close(strsrch->search->internalBreakIter);
	2785	}
	2786	#endif
	2787
	2788	uprv_free(strsrch->search);
	2789	uprv_free(strsrch);
	2790	}
	2791	}
	2792
	2793	// set and get methods --------------------------------------------------
	2794
	2795	U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
	2796	int32_t position,
	2797	UErrorCode *status)
	2798	{
	2799	if (U_SUCCESS(*status) && strsrch) {
	2800	if (isOutOfBounds(strsrch->search->textLength, position)) {
	2801	*status = U_INDEX_OUTOFBOUNDS_ERROR;
	2802	}
	2803	else {
	2804	setColEIterOffset(strsrch->textIter, position);
	2805	}
	2806	strsrch->search->matchedIndex = USEARCH_DONE;
	2807	strsrch->search->matchedLength = 0;
	2808	strsrch->search->reset = FALSE;
	2809	}
	2810	}
	2811
	2812	U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
	2813	{
	2814	if (strsrch) {
	2815	int32_t result = ucol_getOffset(strsrch->textIter);
	2816	if (isOutOfBounds(strsrch->search->textLength, result)) {
	2817	return USEARCH_DONE;
	2818	}
	2819	return result;
	2820	}
	2821	return USEARCH_DONE;
	2822	}
	2823
	2824	U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
	2825	USearchAttribute attribute,
	2826	USearchAttributeValue value,
	2827	UErrorCode *status)
	2828	{
	2829	if (U_SUCCESS(*status) && strsrch) {
	2830	switch (attribute)
	2831	{
	2832	case USEARCH_OVERLAP :
	2833	strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
	2834	break;
	2835	case USEARCH_CANONICAL_MATCH :
	2836	strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
	2837	FALSE);
	2838	break;
	2839	case USEARCH_ELEMENT_COMPARISON :
	2840	if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD \|\| value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
	2841	strsrch->search->elementComparisonType = (int16_t)value;
	2842	} else {
	2843	strsrch->search->elementComparisonType = 0;
	2844	}
	2845	break;
	2846	case USEARCH_ATTRIBUTE_COUNT :
	2847	default:
	2848	*status = U_ILLEGAL_ARGUMENT_ERROR;
	2849	}
	2850	}
	2851	if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
	2852	*status = U_ILLEGAL_ARGUMENT_ERROR;
	2853	}
	2854	}
	2855
	2856	U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
	2857	const UStringSearch *strsrch,
	2858	USearchAttribute attribute)
	2859	{
	2860	if (strsrch) {
	2861	switch (attribute) {
	2862	case USEARCH_OVERLAP :
	2863	return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
	2864	USEARCH_OFF);
	2865	case USEARCH_CANONICAL_MATCH :
	2866	return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
	2867	USEARCH_OFF);
	2868	case USEARCH_ELEMENT_COMPARISON :
	2869	{
	2870	int16_t value = strsrch->search->elementComparisonType;
	2871	if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD \|\| value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
	2872	return (USearchAttributeValue)value;
	2873	} else {
	2874	return USEARCH_STANDARD_ELEMENT_COMPARISON;
	2875	}
	2876	}
	2877	case USEARCH_ATTRIBUTE_COUNT :
	2878	return USEARCH_DEFAULT;
	2879	}
	2880	}
	2881	return USEARCH_DEFAULT;
	2882	}
	2883
	2884	U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
	2885	const UStringSearch *strsrch)
	2886	{
	2887	if (strsrch == NULL) {
	2888	return USEARCH_DONE;
	2889	}
	2890	return strsrch->search->matchedIndex;
	2891	}
	2892
	2893
	2894	U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
	2895	UChar *result,
	2896	int32_t resultCapacity,
	2897	UErrorCode *status)
	2898	{
	2899	if (U_FAILURE(*status)) {
	2900	return USEARCH_DONE;
	2901	}
	2902	if (strsrch == NULL \|\| resultCapacity < 0 \|\| (resultCapacity > 0 &&
	2903	result == NULL)) {
	2904	*status = U_ILLEGAL_ARGUMENT_ERROR;
	2905	return USEARCH_DONE;
	2906	}
	2907
	2908	int32_t copylength = strsrch->search->matchedLength;
	2909	int32_t copyindex = strsrch->search->matchedIndex;
	2910	if (copyindex == USEARCH_DONE) {
	2911	u_terminateUChars(result, resultCapacity, 0, status);
	2912	return USEARCH_DONE;
	2913	}
	2914
	2915	if (resultCapacity < copylength) {
	2916	copylength = resultCapacity;
	2917	}
	2918	if (copylength > 0) {
	2919	uprv_memcpy(result, strsrch->search->text + copyindex,
	2920	copylength * sizeof(UChar));
	2921	}
	2922	return u_terminateUChars(result, resultCapacity,
	2923	strsrch->search->matchedLength, status);
	2924	}
	2925
	2926	U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
	2927	const UStringSearch *strsrch)
	2928	{
	2929	if (strsrch) {
	2930	return strsrch->search->matchedLength;
	2931	}
	2932	return USEARCH_DONE;
	2933	}
	2934
	2935	#if !UCONFIG_NO_BREAK_ITERATION
	2936
	2937	U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
	2938	UBreakIterator *breakiter,
	2939	UErrorCode *status)
	2940	{
	2941	if (U_SUCCESS(*status) && strsrch) {
	2942	strsrch->search->breakIter = breakiter;
	2943	if (breakiter) {
	2944	ubrk_setText(breakiter, strsrch->search->text,
	2945	strsrch->search->textLength, status);
	2946	}
	2947	}
	2948	}
	2949
	2950	U_CAPI const UBreakIterator* U_EXPORT2
	2951	usearch_getBreakIterator(const UStringSearch *strsrch)
	2952	{
	2953	if (strsrch) {
	2954	return strsrch->search->breakIter;
	2955	}
	2956	return NULL;
	2957	}
	2958
	2959	#endif
	2960
	2961	U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
	2962	const UChar *text,
	2963	int32_t textlength,
	2964	UErrorCode *status)
	2965	{
	2966	if (U_SUCCESS(*status)) {
	2967	if (strsrch == NULL \|\| text == NULL \|\| textlength < -1 \|\|
	2968	textlength == 0) {
	2969	*status = U_ILLEGAL_ARGUMENT_ERROR;
	2970	}
	2971	else {
	2972	if (textlength == -1) {
	2973	textlength = u_strlen(text);
	2974	}
	2975	strsrch->search->text = text;
	2976	strsrch->search->textLength = textlength;
	2977	ucol_setText(strsrch->textIter, text, textlength, status);
	2978	strsrch->search->matchedIndex = USEARCH_DONE;
	2979	strsrch->search->matchedLength = 0;
	2980	strsrch->search->reset = TRUE;
	2981	#if !UCONFIG_NO_BREAK_ITERATION
	2982	if (strsrch->search->breakIter != NULL) {
	2983	ubrk_setText(strsrch->search->breakIter, text,
	2984	textlength, status);
	2985	}
	2986	ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
	2987	#endif
	2988	}
	2989	}
	2990	}
	2991
	2992	U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
	2993	int32_t *length)
	2994	{
	2995	if (strsrch) {
	2996	*length = strsrch->search->textLength;
	2997	return strsrch->search->text;
	2998	}
	2999	return NULL;
	3000	}
	3001
	3002	U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
	3003	const UCollator *collator,
	3004	UErrorCode *status)
	3005	{
	3006	if (U_SUCCESS(*status)) {
	3007	if (collator == NULL) {
	3008	*status = U_ILLEGAL_ARGUMENT_ERROR;
	3009	return;
	3010	}
	3011
	3012	if (strsrch) {
	3013	if (strsrch->ownCollator && (strsrch->collator != collator)) {
	3014	ucol_close((UCollator *)strsrch->collator);
	3015	strsrch->ownCollator = FALSE;
	3016	}
	3017	strsrch->collator = collator;
	3018	strsrch->strength = ucol_getStrength(collator);
	3019	strsrch->ceMask = getMask(strsrch->strength);
	3020	#if !UCONFIG_NO_BREAK_ITERATION
	3021	ubrk_close(strsrch->search->internalBreakIter);
	3022	strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
	3023	strsrch->search->text, strsrch->search->textLength, status);
	3024	#endif
	3025	// if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
	3026	strsrch->toShift =
	3027	ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
	3028	UCOL_SHIFTED;
	3029	// if status is a failure, ucol_getVariableTop returns 0
	3030	strsrch->variableTop = ucol_getVariableTop(collator, status);
	3031	if (U_SUCCESS(*status)) {
	3032	initialize(strsrch, status);
	3033	if (U_SUCCESS(*status)) {
	3034	/* free offset buffer to avoid memory leak before initializing. */
	3035	ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
	3036	uprv_init_collIterate(collator, strsrch->search->text,
	3037	strsrch->search->textLength,
	3038	&(strsrch->textIter->iteratordata_),
	3039	status);
	3040	strsrch->utilIter->iteratordata_.coll = collator;
	3041	}
	3042	}
	3043	}
	3044
	3045	// **** are these calls needed?
	3046	// **** we call uprv_init_pce in initializePatternPCETable
	3047	// **** and the CEBuffer constructor...
	3048	#if 0
	3049	uprv_init_pce(strsrch->textIter);
	3050	uprv_init_pce(strsrch->utilIter);
	3051	#endif
	3052	}
	3053	}
	3054
	3055	U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
	3056	{
	3057	if (strsrch) {
	3058	return (UCollator *)strsrch->collator;
	3059	}
	3060	return NULL;
	3061	}
	3062
	3063	U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
	3064	const UChar *pattern,
	3065	int32_t patternlength,
	3066	UErrorCode *status)
	3067	{
	3068	if (U_SUCCESS(*status)) {
	3069	if (strsrch == NULL \|\| pattern == NULL) {
	3070	*status = U_ILLEGAL_ARGUMENT_ERROR;
	3071	}
	3072	else {
	3073	if (patternlength == -1) {
	3074	patternlength = u_strlen(pattern);
	3075	}
	3076	if (patternlength == 0) {
	3077	*status = U_ILLEGAL_ARGUMENT_ERROR;
	3078	return;
	3079	}
	3080	strsrch->pattern.text = pattern;
	3081	strsrch->pattern.textLength = patternlength;
	3082	initialize(strsrch, status);
	3083	}
	3084	}
	3085	}
	3086
	3087	U_CAPI const UChar* U_EXPORT2
	3088	usearch_getPattern(const UStringSearch *strsrch,
	3089	int32_t *length)
	3090	{
	3091	if (strsrch) {
	3092	*length = strsrch->pattern.textLength;
	3093	return strsrch->pattern.text;
	3094	}
	3095	return NULL;
	3096	}
	3097
	3098	// miscellanous methods --------------------------------------------------
	3099
	3100	U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
	3101	UErrorCode *status)
	3102	{
	3103	if (strsrch && U_SUCCESS(*status)) {
	3104	strsrch->search->isForwardSearching = TRUE;
	3105	usearch_setOffset(strsrch, 0, status);
	3106	if (U_SUCCESS(*status)) {
	3107	return usearch_next(strsrch, status);
	3108	}
	3109	}
	3110	return USEARCH_DONE;
	3111	}
	3112
	3113	U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
	3114	int32_t position,
	3115	UErrorCode *status)
	3116	{
	3117	if (strsrch && U_SUCCESS(*status)) {
	3118	strsrch->search->isForwardSearching = TRUE;
	3119	// position checked in usearch_setOffset
	3120	usearch_setOffset(strsrch, position, status);
	3121	if (U_SUCCESS(*status)) {
	3122	return usearch_next(strsrch, status);
	3123	}
	3124	}
	3125	return USEARCH_DONE;
	3126	}
	3127
	3128	U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
	3129	UErrorCode *status)
	3130	{
	3131	if (strsrch && U_SUCCESS(*status)) {
	3132	strsrch->search->isForwardSearching = FALSE;
	3133	usearch_setOffset(strsrch, strsrch->search->textLength, status);
	3134	if (U_SUCCESS(*status)) {
	3135	return usearch_previous(strsrch, status);
	3136	}
	3137	}
	3138	return USEARCH_DONE;
	3139	}
	3140
	3141	U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
	3142	int32_t position,
	3143	UErrorCode *status)
	3144	{
	3145	if (strsrch && U_SUCCESS(*status)) {
	3146	strsrch->search->isForwardSearching = FALSE;
	3147	// position checked in usearch_setOffset
	3148	usearch_setOffset(strsrch, position, status);
	3149	if (U_SUCCESS(*status)) {
	3150	return usearch_previous(strsrch, status);
	3151	}
	3152	}
	3153	return USEARCH_DONE;
	3154	}
	3155
	3156	/**
	3157	* If a direction switch is required, we'll count the number of ces till the
	3158	* beginning of the collation element iterator and iterate forwards that
	3159	* number of times. This is so that we get to the correct point within the
	3160	* string to continue the search in. Imagine when we are in the middle of the
	3161	* normalization buffer when the change in direction is request. arrrgghh....
	3162	* After searching the offset within the collation element iterator will be
	3163	* shifted to the start of the match. If a match is not found, the offset would
	3164	* have been set to the end of the text string in the collation element
	3165	* iterator.
	3166	* Okay, here's my take on normalization buffer. The only time when there can
	3167	* be 2 matches within the same normalization is when the pattern is consists
	3168	* of all accents. But since the offset returned is from the text string, we
	3169	* should not confuse the caller by returning the second match within the
	3170	* same normalization buffer. If we do, the 2 results will have the same match
	3171	* offsets, and that'll be confusing. I'll return the next match that doesn't
	3172	* fall within the same normalization buffer. Note this does not affect the
	3173	* results of matches spanning the text and the normalization buffer.
	3174	* The position to start searching is taken from the collation element
	3175	* iterator. Callers of this API would have to set the offset in the collation
	3176	* element iterator before using this method.
	3177	*/
	3178	U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
	3179	UErrorCode *status)
	3180	{
	3181	if (U_SUCCESS(*status) && strsrch) {
	3182	// note offset is either equivalent to the start of the previous match
	3183	// or is set by the user
	3184	int32_t offset = usearch_getOffset(strsrch);
	3185	USearch *search = strsrch->search;
	3186	search->reset = FALSE;
	3187	int32_t textlength = search->textLength;
	3188	if (search->isForwardSearching) {
	3189	#if BOYER_MOORE
	3190	if (offset == textlength
	3191	\|\| (!search->isOverlap &&
	3192	(offset + strsrch->pattern.defaultShiftSize > textlength \|\|
	3193	(search->matchedIndex != USEARCH_DONE &&
	3194	offset + search->matchedLength >= textlength)))) {
	3195	// not enough characters to match
	3196	setMatchNotFound(strsrch);
	3197	return USEARCH_DONE;
	3198	}
	3199	#else
	3200	if (offset == textlength \|\|
	3201	(! search->isOverlap &&
	3202	(search->matchedIndex != USEARCH_DONE &&
	3203	offset + search->matchedLength > textlength))) {
	3204	// not enough characters to match
	3205	setMatchNotFound(strsrch);
	3206	return USEARCH_DONE;
	3207	}
	3208	#endif
	3209	}
	3210	else {
	3211	// switching direction.
	3212	// if matchedIndex == USEARCH_DONE, it means that either a
	3213	// setOffset has been called or that previous ran off the text
	3214	// string. the iterator would have been set to offset 0 if a
	3215	// match is not found.
	3216	search->isForwardSearching = TRUE;
	3217	if (search->matchedIndex != USEARCH_DONE) {
	3218	// there's no need to set the collation element iterator
	3219	// the next call to next will set the offset.
	3220	return search->matchedIndex;
	3221	}
	3222	}
	3223
	3224	if (U_SUCCESS(*status)) {
	3225	if (strsrch->pattern.CELength == 0) {
	3226	if (search->matchedIndex == USEARCH_DONE) {
	3227	search->matchedIndex = offset;
	3228	}
	3229	else { // moves by codepoints
	3230	UTF_FWD_1(search->text, search->matchedIndex, textlength);
	3231	}
	3232
	3233	search->matchedLength = 0;
	3234	setColEIterOffset(strsrch->textIter, search->matchedIndex);
	3235	// status checked below
	3236	if (search->matchedIndex == textlength) {
	3237	search->matchedIndex = USEARCH_DONE;
	3238	}
	3239	}
	3240	else {
	3241	if (search->matchedLength > 0) {
	3242	// if matchlength is 0 we are at the start of the iteration
	3243	if (search->isOverlap) {
	3244	ucol_setOffset(strsrch->textIter, offset + 1, status);
	3245	}
	3246	else {
	3247	ucol_setOffset(strsrch->textIter,
	3248	offset + search->matchedLength, status);
	3249	}
	3250	}
	3251	else {
	3252	// for boundary check purposes. this will ensure that the
	3253	// next match will not preceed the current offset
	3254	// note search->matchedIndex will always be set to something
	3255	// in the code
	3256	search->matchedIndex = offset - 1;
	3257	}
	3258
	3259	if (search->isCanonicalMatch) {
	3260	// can't use exact here since extra accents are allowed.
	3261	usearch_handleNextCanonical(strsrch, status);
	3262	}
	3263	else {
	3264	usearch_handleNextExact(strsrch, status);
	3265	}
	3266	}
	3267
	3268	if (U_FAILURE(*status)) {
	3269	return USEARCH_DONE;
	3270	}
	3271
	3272	#if !BOYER_MOORE
	3273	if (search->matchedIndex == USEARCH_DONE) {
	3274	ucol_setOffset(strsrch->textIter, search->textLength, status);
	3275	} else {
	3276	ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
	3277	}
	3278	#endif
	3279
	3280	return search->matchedIndex;
	3281	}
	3282	}
	3283	return USEARCH_DONE;
	3284	}
	3285
	3286	U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
	3287	UErrorCode *status)
	3288	{
	3289	if (U_SUCCESS(*status) && strsrch) {
	3290	int32_t offset;
	3291	USearch *search = strsrch->search;
	3292	if (search->reset) {
	3293	offset = search->textLength;
	3294	search->isForwardSearching = FALSE;
	3295	search->reset = FALSE;
	3296	setColEIterOffset(strsrch->textIter, offset);
	3297	}
	3298	else {
	3299	offset = usearch_getOffset(strsrch);
	3300	}
	3301
	3302	int32_t matchedindex = search->matchedIndex;
	3303	if (search->isForwardSearching == TRUE) {
	3304	// switching direction.
	3305	// if matchedIndex == USEARCH_DONE, it means that either a
	3306	// setOffset has been called or that next ran off the text
	3307	// string. the iterator would have been set to offset textLength if
	3308	// a match is not found.
	3309	search->isForwardSearching = FALSE;
	3310	if (matchedindex != USEARCH_DONE) {
	3311	return matchedindex;
	3312	}
	3313	}
	3314	else {
	3315	#if BOYER_MOORE
	3316	if (offset == 0 \|\| matchedindex == 0 \|\|
	3317	(!search->isOverlap &&
	3318	(offset < strsrch->pattern.defaultShiftSize \|\|
	3319	(matchedindex != USEARCH_DONE &&
	3320	matchedindex < strsrch->pattern.defaultShiftSize)))) {
	3321	// not enough characters to match
	3322	setMatchNotFound(strsrch);
	3323	return USEARCH_DONE;
	3324	}
	3325	#else
	3326	// Could check pattern length, but the
	3327	// linear search will do the right thing
	3328	if (offset == 0 \|\| matchedindex == 0) {
	3329	setMatchNotFound(strsrch);
	3330	return USEARCH_DONE;
	3331	}
	3332	#endif
	3333	}
	3334
	3335	if (U_SUCCESS(*status)) {
	3336	if (strsrch->pattern.CELength == 0) {
	3337	search->matchedIndex =
	3338	(matchedindex == USEARCH_DONE ? offset : matchedindex);
	3339	if (search->matchedIndex == 0) {
	3340	setMatchNotFound(strsrch);
	3341	// status checked below
	3342	}
	3343	else { // move by codepoints
	3344	UTF_BACK_1(search->text, 0, search->matchedIndex);
	3345	setColEIterOffset(strsrch->textIter, search->matchedIndex);
	3346	// status checked below
	3347	search->matchedLength = 0;
	3348	}
	3349	}
	3350	else {
	3351	#if !BOYER_MOORE
	3352	if (search->matchedIndex != USEARCH_DONE) {
	3353	if (search->isOverlap) {
	3354	ucol_setOffset(strsrch->textIter, search->matchedIndex + search->matchedLength - 2, status);
	3355	}
	3356	}
	3357	#endif
	3358
	3359	if (strsrch->search->isCanonicalMatch) {
	3360	// can't use exact here since extra accents are allowed.
	3361	usearch_handlePreviousCanonical(strsrch, status);
	3362	// status checked below
	3363	}
	3364	else {
	3365	usearch_handlePreviousExact(strsrch, status);
	3366	// status checked below
	3367	}
	3368	}
	3369
	3370	if (U_FAILURE(*status)) {
	3371	return USEARCH_DONE;
	3372	}
	3373
	3374	return search->matchedIndex;
	3375	}
	3376	}
	3377	return USEARCH_DONE;
	3378	}
	3379
	3380
	3381
	3382	U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
	3383	{
	3384	/*
	3385	reset is setting the attributes that are already in
	3386	string search, hence all attributes in the collator should
	3387	be retrieved without any problems
	3388	*/
	3389	if (strsrch) {
	3390	UErrorCode status = U_ZERO_ERROR;
	3391	UBool sameCollAttribute = TRUE;
	3392	uint32_t ceMask;
	3393	UBool shift;
	3394	uint32_t varTop;
	3395
	3396	// ** hack to deal w/ how processed CEs encode quaternary **
	3397	UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
	3398	if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) \|\|
	3399	(strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
	3400	sameCollAttribute = FALSE;
	3401	}
	3402
	3403	strsrch->strength = ucol_getStrength(strsrch->collator);
	3404	ceMask = getMask(strsrch->strength);
	3405	if (strsrch->ceMask != ceMask) {
	3406	strsrch->ceMask = ceMask;
	3407	sameCollAttribute = FALSE;
	3408	}
	3409
	3410	// if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
	3411	shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
	3412	&status) == UCOL_SHIFTED;
	3413	if (strsrch->toShift != shift) {
	3414	strsrch->toShift = shift;
	3415	sameCollAttribute = FALSE;
	3416	}
	3417
	3418	// if status is a failure, ucol_getVariableTop returns 0
	3419	varTop = ucol_getVariableTop(strsrch->collator, &status);
	3420	if (strsrch->variableTop != varTop) {
	3421	strsrch->variableTop = varTop;
	3422	sameCollAttribute = FALSE;
	3423	}
	3424	if (!sameCollAttribute) {
	3425	initialize(strsrch, &status);
	3426	}
	3427	/* free offset buffer to avoid memory leak before initializing. */
	3428	ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
	3429	uprv_init_collIterate(strsrch->collator, strsrch->search->text,
	3430	strsrch->search->textLength,
	3431	&(strsrch->textIter->iteratordata_),
	3432	&status);
	3433	strsrch->search->matchedLength = 0;
	3434	strsrch->search->matchedIndex = USEARCH_DONE;
	3435	strsrch->search->isOverlap = FALSE;
	3436	strsrch->search->isCanonicalMatch = FALSE;
	3437	strsrch->search->elementComparisonType = 0;
	3438	strsrch->search->isForwardSearching = TRUE;
	3439	strsrch->search->reset = TRUE;
	3440	}
	3441	}
	3442
	3443	//
	3444	// CEI Collation Element + source text index.
	3445	// These structs are kept in the circular buffer.
	3446	//
	3447	struct CEI {
	3448	int64_t ce;
	3449	int32_t lowIndex;
	3450	int32_t highIndex;
	3451	};
	3452
	3453	U_NAMESPACE_BEGIN
	3454
	3455
	3456	//
	3457	// CEBuffer A circular buffer of CEs from the text being searched.
	3458	//
	3459	#define DEFAULT_CEBUFFER_SIZE 50
	3460	struct CEBuffer {
	3461	CEI defBuf[DEFAULT_CEBUFFER_SIZE];
	3462	CEI *buf;
	3463	int32_t bufSize;
	3464	int32_t firstIx;
	3465	int32_t limitIx;
	3466	UCollationElements *ceIter;
	3467	UStringSearch *strSearch;
	3468
	3469
	3470
	3471	CEBuffer(UStringSearch ss, UErrorCode status);
	3472	~CEBuffer();
	3473	const CEI *get(int32_t index);
	3474	const CEI *getPrevious(int32_t index);
	3475	};
	3476
	3477
	3478	CEBuffer::CEBuffer(UStringSearch ss, UErrorCode status) {
	3479	buf = defBuf;
	3480	strSearch = ss;
	3481	bufSize = ss->pattern.CELength+10;
	3482	ceIter = ss->textIter;
	3483	firstIx = 0;
	3484	limitIx = 0;
	3485
	3486	uprv_init_pce(ceIter);
	3487
	3488	if (bufSize>DEFAULT_CEBUFFER_SIZE) {
	3489	buf = (CEI )uprv_malloc(bufSize sizeof(CEI));
	3490	if (buf == NULL) {
	3491	*status = U_MEMORY_ALLOCATION_ERROR;
	3492	}
	3493	}
	3494	}
	3495
	3496	// TODO: add a reset or init function so that allocated
	3497	// buffers can be retained & reused.
	3498
	3499	CEBuffer::~CEBuffer() {
	3500	if (buf != defBuf) {
	3501	uprv_free(buf);
	3502	}
	3503	}
	3504
	3505
	3506	// Get the CE with the specified index.
	3507	// Index must be in the range
	3508	// n-history_size < index < n+1
	3509	// where n is the largest index to have been fetched by some previous call to this function.
	3510	// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
	3511	//
	3512	const CEI *CEBuffer::get(int32_t index) {
	3513	int i = index % bufSize;
	3514
	3515	if (index>=firstIx && index<limitIx) {
	3516	// The request was for an entry already in our buffer.
	3517	// Just return it.
	3518	return &buf[i];
	3519	}
	3520
	3521	// Caller is requesting a new, never accessed before, CE.
	3522	// Verify that it is the next one in sequence, which is all
	3523	// that is allowed.
	3524	if (index != limitIx) {
	3525	U_ASSERT(FALSE);
	3526
	3527	return NULL;
	3528	}
	3529
	3530	// Manage the circular CE buffer indexing
	3531	limitIx++;
	3532
	3533	if (limitIx - firstIx >= bufSize) {
	3534	// The buffer is full, knock out the lowest-indexed entry.
	3535	firstIx++;
	3536	}
	3537
	3538	UErrorCode status = U_ZERO_ERROR;
	3539
	3540	buf[i].ce = ucol_nextProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
	3541
	3542	return &buf[i];
	3543	}
	3544
	3545	// Get the CE with the specified index.
	3546	// Index must be in the range
	3547	// n-history_size < index < n+1
	3548	// where n is the largest index to have been fetched by some previous call to this function.
	3549	// The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
	3550	//
	3551	const CEI *CEBuffer::getPrevious(int32_t index) {
	3552	int i = index % bufSize;
	3553
	3554	if (index>=firstIx && index<limitIx) {
	3555	// The request was for an entry already in our buffer.
	3556	// Just return it.
	3557	return &buf[i];
	3558	}
	3559
	3560	// Caller is requesting a new, never accessed before, CE.
	3561	// Verify that it is the next one in sequence, which is all
	3562	// that is allowed.
	3563	if (index != limitIx) {
	3564	U_ASSERT(FALSE);
	3565
	3566	return NULL;
	3567	}
	3568
	3569	// Manage the circular CE buffer indexing
	3570	limitIx++;
	3571
	3572	if (limitIx - firstIx >= bufSize) {
	3573	// The buffer is full, knock out the lowest-indexed entry.
	3574	firstIx++;
	3575	}
	3576
	3577	UErrorCode status = U_ZERO_ERROR;
	3578
	3579	buf[i].ce = ucol_previousProcessed(ceIter, &buf[i].lowIndex, &buf[i].highIndex, &status);
	3580
	3581	return &buf[i];
	3582	}
	3583
	3584	U_NAMESPACE_END
	3585
	3586
	3587	// #define USEARCH_DEBUG
	3588
	3589	#ifdef USEARCH_DEBUG
	3590	#include <stdio.h>
	3591	#include <stdlib.h>
	3592	#endif
	3593
	3594	/*
	3595	* Find the next break boundary after startIndex. If the UStringSearch object
	3596	* has an external break iterator, use that. Otherwise use the internal character
	3597	* break iterator.
	3598	*/
	3599	static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
	3600	#if 0
	3601	const UChar *text = strsrch->search->text;
	3602	int32_t textLen = strsrch->search->textLength;
	3603
	3604	U_ASSERT(startIndex>=0);
	3605	U_ASSERT(startIndex<=textLen);
	3606
	3607	if (startIndex >= textLen) {
	3608	return startIndex;
	3609	}
	3610
	3611	UChar32 c;
	3612	int32_t i = startIndex;
	3613	U16_NEXT(text, i, textLen, c);
	3614
	3615	// If we are on a control character, stop without looking for combining marks.
	3616	// Control characters do not combine.
	3617	int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
	3618	if (gcProperty==U_GCB_CONTROL \|\| gcProperty==U_GCB_LF \|\| gcProperty==U_GCB_CR) {
	3619	return i;
	3620	}
	3621
	3622	// The initial character was not a control, and can thus accept trailing
	3623	// combining characters. Advance over however many of them there are.
	3624	int32_t indexOfLastCharChecked;
	3625	for (;;) {
	3626	indexOfLastCharChecked = i;
	3627	if (i>=textLen) {
	3628	break;
	3629	}
	3630	U16_NEXT(text, i, textLen, c);
	3631	gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
	3632	if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
	3633	break;
	3634	}
	3635	}
	3636	return indexOfLastCharChecked;
	3637	#elif !UCONFIG_NO_BREAK_ITERATION
	3638	UBreakIterator *breakiterator = strsrch->search->breakIter;
	3639
	3640	if (breakiterator == NULL) {
	3641	breakiterator = strsrch->search->internalBreakIter;
	3642	}
	3643
	3644	if (breakiterator != NULL) {
	3645	return ubrk_following(breakiterator, startIndex);
	3646	}
	3647
	3648	return startIndex;
	3649	#else
	3650	// ** or should we use the original code? **
	3651	return startIndex;
	3652	#endif
	3653
	3654	}
	3655
	3656	/*
	3657	* Returns TRUE if index is on a break boundary. If the UStringSearch
	3658	* has an external break iterator, test using that, otherwise test
	3659	* using the internal character break iterator.
	3660	*/
	3661	static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
	3662	#if 0
	3663	const UChar *text = strsrch->search->text;
	3664	int32_t textLen = strsrch->search->textLength;
	3665
	3666	U_ASSERT(index>=0);
	3667	U_ASSERT(index<=textLen);
	3668
	3669	if (index>=textLen \|\| index<=0) {
	3670	return FALSE;
	3671	}
	3672
	3673	// If the character at the current index is not a GRAPHEME_EXTEND
	3674	// then we can not be within a combining sequence.
	3675	UChar32 c;
	3676	U16_GET(text, 0, index, textLen, c);
	3677	int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
	3678	if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
	3679	return FALSE;
	3680	}
	3681
	3682	// We are at a combining mark. If the preceding character is anything
	3683	// except a CONTROL, CR or LF, we are in a combining sequence.
	3684	U16_PREV(text, 0, index, c);
	3685	gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
	3686	UBool combining = !(gcProperty==U_GCB_CONTROL \|\| gcProperty==U_GCB_LF \|\| gcProperty==U_GCB_CR);
	3687	return combining;
	3688	#elif !UCONFIG_NO_BREAK_ITERATION
	3689	UBreakIterator *breakiterator = strsrch->search->breakIter;
	3690
	3691	if (breakiterator == NULL) {
	3692	breakiterator = strsrch->search->internalBreakIter;
	3693	}
	3694
	3695	return (breakiterator != NULL && ! ubrk_isBoundary(breakiterator, index));
	3696	#else
	3697	// ** or use the original code? **
	3698	return FALSE;
	3699	#endif
	3700	}
	3701
	3702	#if 0
	3703	static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
	3704	{
	3705	#if !UCONFIG_NO_BREAK_ITERATION
	3706	UBreakIterator *breakiterator = strsrch->search->breakIter;
	3707
	3708	if (breakiterator != NULL) {
	3709	int32_t startindex = ubrk_first(breakiterator);
	3710	int32_t endindex = ubrk_last(breakiterator);
	3711
	3712	// out-of-range indexes are never boundary positions
	3713	if (start < startindex \|\| start > endindex \|\|
	3714	end < startindex \|\| end > endindex) {
	3715	return FALSE;
	3716	}
	3717
	3718	return ubrk_isBoundary(breakiterator, start) &&
	3719	ubrk_isBoundary(breakiterator, end);
	3720	}
	3721	#endif
	3722
	3723	return TRUE;
	3724	}
	3725	#endif
	3726
	3727	typedef enum {
	3728	U_CE_MATCH = -1,
	3729	U_CE_NO_MATCH = 0,
	3730	U_CE_SKIP_TARG,
	3731	U_CE_SKIP_PATN
	3732	} UCompareCEsResult;
	3733	#define U_CE_LEVEL2_BASE 0x00000005
	3734	#define U_CE_LEVEL3_BASE 0x00050000
	3735
	3736	static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
	3737	if (targCE == patCE) {
	3738	return U_CE_MATCH;
	3739	}
	3740	if (compareType == 0) {
	3741	return U_CE_NO_MATCH;
	3742	}
	3743
	3744	int64_t targCEshifted = targCE >> 32;
	3745	int64_t patCEshifted = patCE >> 32;
	3746	int64_t mask;
	3747
	3748	mask = 0xFFFF0000;
	3749	int32_t targLev1 = (int32_t)(targCEshifted & mask);
	3750	int32_t patLev1 = (int32_t)(patCEshifted & mask);
	3751	if ( targLev1 != patLev1 ) {
	3752	if ( targLev1 == 0 ) {
	3753	return U_CE_SKIP_TARG;
	3754	}
	3755	if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
	3756	return U_CE_SKIP_PATN;
	3757	}
	3758	return U_CE_NO_MATCH;
	3759	}
	3760
	3761	mask = 0x0000FFFF;
	3762	int32_t targLev2 = (int32_t)(targCEshifted & mask);
	3763	int32_t patLev2 = (int32_t)(patCEshifted & mask);
	3764	if ( targLev2 != patLev2 ) {
	3765	if ( targLev2 == 0 ) {
	3766	return U_CE_SKIP_TARG;
	3767	}
	3768	if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
	3769	return U_CE_SKIP_PATN;
	3770	}
	3771	return (patLev2 == U_CE_LEVEL2_BASE \|\| (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
	3772	U_CE_MATCH: U_CE_NO_MATCH;
	3773	}
	3774
	3775	mask = 0xFFFF0000;
	3776	int32_t targLev3 = (int32_t)(targCE & mask);
	3777	int32_t patLev3 = (int32_t)(patCE & mask);
	3778	if ( targLev3 != patLev3 ) {
	3779	return (patLev3 == U_CE_LEVEL3_BASE \|\| (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
	3780	U_CE_MATCH: U_CE_NO_MATCH;
	3781	}
	3782
	3783	return U_CE_MATCH;
	3784	}
	3785
	3786	#if BOYER_MOORE
	3787	// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
	3788	#endif
	3789
	3790	U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
	3791	int32_t startIdx,
	3792	int32_t *matchStart,
	3793	int32_t *matchLimit,
	3794	UErrorCode *status)
	3795	{
	3796	if (U_FAILURE(*status)) {
	3797	return FALSE;
	3798	}
	3799
	3800	// TODO: reject search patterns beginning with a combining char.
	3801
	3802	#ifdef USEARCH_DEBUG
	3803	if (getenv("USEARCH_DEBUG") != NULL) {
	3804	printf("Pattern CEs\n");
	3805	for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
	3806	printf(" %8x", strsrch->pattern.CE[ii]);
	3807	}
	3808	printf("\n");
	3809	}
	3810
	3811	#endif
	3812	// Input parameter sanity check.
	3813	// TODO: should input indicies clip to the text length
	3814	// in the same way that UText does.
	3815	if(strsrch->pattern.CELength == 0 \|\|
	3816	startIdx < 0 \|\|
	3817	startIdx > strsrch->search->textLength \|\|
	3818	strsrch->pattern.CE == NULL) {
	3819	*status = U_ILLEGAL_ARGUMENT_ERROR;
	3820	return FALSE;
	3821	}
	3822
	3823	if (strsrch->pattern.PCE == NULL) {
	3824	initializePatternPCETable(strsrch, status);
	3825	}
	3826
	3827	ucol_setOffset(strsrch->textIter, startIdx, status);
	3828	CEBuffer ceb(strsrch, status);
	3829
	3830
	3831	int32_t targetIx = 0;
	3832	const CEI *targetCEI = NULL;
	3833	int32_t patIx;
	3834	UBool found;
	3835
	3836	int32_t mStart = -1;
	3837	int32_t mLimit = -1;
	3838	int32_t minLimit;
	3839	int32_t maxLimit;
	3840
	3841
	3842
	3843	// Outer loop moves over match starting positions in the
	3844	// target CE space.
	3845	// Here we see the target as a sequence of collation elements, resulting from the following:
	3846	// 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
	3847	// (for example, digraphs such as IJ may be broken into two characters).
	3848	// 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
	3849	// 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
	3850	// fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
	3851	// CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
	3852	// then the CE is deleted, so the following code sees only CEs that are relevant.
	3853	// For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
	3854	// If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
	3855	// characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
	3856	//
	3857	for(targetIx=0; ; targetIx++)
	3858	{
	3859	found = TRUE;
	3860	// Inner loop checks for a match beginning at each
	3861	// position from the outer loop.
	3862	int32_t targetIxOffset = 0;
	3863	int64_t patCE = 0;
	3864	for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
	3865	patCE = strsrch->pattern.PCE[patIx];
	3866	targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
	3867	// Compare CE from target string with CE from the pattern.
	3868	// Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
	3869	// which will fail the compare, below.
	3870	UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
	3871	if ( ceMatch == U_CE_NO_MATCH ) {
	3872	found = FALSE;
	3873	break;
	3874	} else if ( ceMatch > U_CE_NO_MATCH ) {
	3875	if ( ceMatch == U_CE_SKIP_TARG ) {
	3876	// redo with same patCE, next targCE
	3877	patIx--;
	3878	targetIxOffset++;
	3879	} else { // ceMatch == U_CE_SKIP_PATN
	3880	// redo with same targCE, next patCE
	3881	targetIxOffset--;
	3882	}
	3883	}
	3884	}
	3885	targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far
	3886
	3887	if (!found && ((targetCEI == NULL) \|\| (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
	3888	// No match at this targetIx. Try again at the next.
	3889	continue;
	3890	}
	3891
	3892	if (!found) {
	3893	// No match at all, we have run off the end of the target text.
	3894	break;
	3895	}
	3896
	3897
	3898	// We have found a match in CE space.
	3899	// Now determine the bounds in string index space.
	3900	// There still is a chance of match failure if the CE range not correspond to
	3901	// an acceptable character range.
	3902	//
	3903	const CEI *firstCEI = ceb.get(targetIx);
	3904	const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
	3905
	3906	mStart = firstCEI->lowIndex;
	3907	minLimit = lastCEI->lowIndex;
	3908
	3909	// Look at the CE following the match. If it is UCOL_NULLORDER the match
	3910	// extended to the end of input, and the match is good.
	3911
	3912	// Look at the high and low indices of the CE following the match. If
	3913	// they are the same it means one of two things:
	3914	// 1. The match extended to the last CE from the target text, which is OK, or
	3915	// 2. The last CE that was part of the match is in an expansion that extends
	3916	// to the first CE after the match. In this case, we reject the match.
	3917	if (strsrch->search->elementComparisonType == 0) {
	3918	const CEI *nextCEI = ceb.get(targetIx + targetIxOffset);
	3919	maxLimit = nextCEI->lowIndex;
	3920	if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
	3921	found = FALSE;
	3922	}
	3923	} else {
	3924	const CEI *nextCEI;
	3925	for ( ; ; ++targetIxOffset ) {
	3926	nextCEI = ceb.get(targetIx + targetIxOffset);
	3927	maxLimit = nextCEI->lowIndex;
	3928	// If we are at the end of the target too, match succeeds
	3929	if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
	3930	break;
	3931	}
	3932	// As long as the next CE has primary weight of 0,
	3933	// it is part of the last target element matched by the pattern;
	3934	// make sure it can be part of a match with the last patCE
	3935	if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
	3936	UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
	3937	if ( ceMatch == U_CE_NO_MATCH \|\| ceMatch == U_CE_SKIP_PATN ) {
	3938	found = FALSE;
	3939	break;
	3940	}
	3941	// If lowIndex == highIndex, this target CE is part of an expansion of the last matched
	3942	// target element, but it has non-zero primary weight => match fails
	3943	} else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
	3944	found = false;
	3945	break;
	3946	// Else the target CE is not part of an expansion of the last matched element, match succeeds
	3947	} else {
	3948	break;
	3949	}
	3950	}
	3951	}
	3952
	3953
	3954	// Check for the start of the match being within a combining sequence.
	3955	// This can happen if the pattern itself begins with a combining char, and
	3956	// the match found combining marks in the target text that were attached
	3957	// to something else.
	3958	// This type of match should be rejected for not completely consuming a
	3959	// combining sequence.
	3960	if (isBreakBoundary(strsrch, mStart)) {
	3961	found = FALSE;
	3962	}
	3963
	3964	// Check for the start of the match being within an Collation Element Expansion,
	3965	// meaning that the first char of the match is only partially matched.
	3966	// With exapnsions, the first CE will report the index of the source
	3967	// character, and all subsequent (expansions) CEs will report the source index of the
	3968	// _following_ character.
	3969	int32_t secondIx = firstCEI->highIndex;
	3970	if (mStart == secondIx) {
	3971	found = FALSE;
	3972	}
	3973
	3974	// Advance the match end position to the first acceptable match boundary.
	3975	// This advances the index over any combining charcters.
	3976	mLimit = maxLimit;
	3977	if (minLimit < maxLimit) {
	3978	int32_t nba = nextBoundaryAfter(strsrch, minLimit);
	3979
	3980	if (nba >= lastCEI->highIndex) {
	3981	mLimit = nba;
	3982	}
	3983	}
	3984
	3985	#ifdef USEARCH_DEBUG
	3986	if (getenv("USEARCH_DEBUG") != NULL) {
	3987	printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
	3988	}
	3989	#endif
	3990
	3991	// If advancing to the end of a combining sequence in character indexing space
	3992	// advanced us beyond the end of the match in CE space, reject this match.
	3993	if (mLimit > maxLimit) {
	3994	found = FALSE;
	3995	}
	3996
	3997	if (isBreakBoundary(strsrch, mLimit)) {
	3998	found = FALSE;
	3999	}
	4000
	4001	if (! checkIdentical(strsrch, mStart, mLimit)) {
	4002	found = FALSE;
	4003	}
	4004
	4005	if (found) {
	4006	break;
	4007	}
	4008	}
	4009
	4010	#ifdef USEARCH_DEBUG
	4011	if (getenv("USEARCH_DEBUG") != NULL) {
	4012	printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
	4013	int32_t lastToPrint = ceb.limitIx+2;
	4014	for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
	4015	printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
	4016	}
	4017	printf("\n%s\n", found? "match found" : "no match");
	4018	}
	4019	#endif
	4020
	4021	// All Done. Store back the match bounds to the caller.
	4022	//
	4023	if (found==FALSE) {
	4024	mLimit = -1;
	4025	mStart = -1;
	4026	}
	4027
	4028	if (matchStart != NULL) {
	4029	*matchStart= mStart;
	4030	}
	4031
	4032	if (matchLimit != NULL) {
	4033	*matchLimit = mLimit;
	4034	}
	4035
	4036	return found;
	4037	}
	4038
	4039
	4040	U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
	4041	int32_t startIdx,
	4042	int32_t *matchStart,
	4043	int32_t *matchLimit,
	4044	UErrorCode *status)
	4045	{
	4046	if (U_FAILURE(*status)) {
	4047	return FALSE;
	4048	}
	4049
	4050	// TODO: reject search patterns beginning with a combining char.
	4051
	4052	#ifdef USEARCH_DEBUG
	4053	if (getenv("USEARCH_DEBUG") != NULL) {
	4054	printf("Pattern CEs\n");
	4055	for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
	4056	printf(" %8x", strsrch->pattern.CE[ii]);
	4057	}
	4058	printf("\n");
	4059	}
	4060
	4061	#endif
	4062	// Input parameter sanity check.
	4063	// TODO: should input indicies clip to the text length
	4064	// in the same way that UText does.
	4065	if(strsrch->pattern.CELength == 0 \|\|
	4066	startIdx < 0 \|\|
	4067	startIdx > strsrch->search->textLength \|\|
	4068	strsrch->pattern.CE == NULL) {
	4069	*status = U_ILLEGAL_ARGUMENT_ERROR;
	4070	return FALSE;
	4071	}
	4072
	4073	if (strsrch->pattern.PCE == NULL) {
	4074	initializePatternPCETable(strsrch, status);
	4075	}
	4076
	4077	CEBuffer ceb(strsrch, status);
	4078	int32_t targetIx = 0;
	4079
	4080	/*
	4081	* Pre-load the buffer with the CE's for the grapheme
	4082	* after our starting position so that we're sure that
	4083	* we can look at the CE following the match when we
	4084	* check the match boundaries.
	4085	*
	4086	* This will also pre-fetch the first CE that we'll
	4087	* consider for the match.
	4088	*/
	4089	if (startIdx < strsrch->search->textLength) {
	4090	UBreakIterator *bi = strsrch->search->internalBreakIter;
	4091	int32_t next = ubrk_following(bi, startIdx);
	4092
	4093	ucol_setOffset(strsrch->textIter, next, status);
	4094
	4095	for (targetIx = 0; ; targetIx += 1) {
	4096	if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
	4097	break;
	4098	}
	4099	}
	4100	} else {
	4101	ucol_setOffset(strsrch->textIter, startIdx, status);
	4102	}
	4103
	4104
	4105	const CEI *targetCEI = NULL;
	4106	int32_t patIx;
	4107	UBool found;
	4108
	4109	int32_t limitIx = targetIx;
	4110	int32_t mStart = -1;
	4111	int32_t mLimit = -1;
	4112	int32_t minLimit;
	4113	int32_t maxLimit;
	4114
	4115
	4116
	4117	// Outer loop moves over match starting positions in the
	4118	// target CE space.
	4119	// Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
	4120	// But patIx is 0 at the beginning of the pattern and increases toward the end.
	4121	// So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
	4122	// and the beginning of the base text.
	4123	for(targetIx = limitIx; ; targetIx += 1)
	4124	{
	4125	found = TRUE;
	4126	// Inner loop checks for a match beginning at each
	4127	// position from the outer loop.
	4128	int32_t targetIxOffset = 0;
	4129	for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
	4130	int64_t patCE = strsrch->pattern.PCE[patIx];
	4131
	4132	targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset);
	4133	// Compare CE from target string with CE from the pattern.
	4134	// Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
	4135	// which will fail the compare, below.
	4136	UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
	4137	if ( ceMatch == U_CE_NO_MATCH ) {
	4138	found = FALSE;
	4139	break;
	4140	} else if ( ceMatch > U_CE_NO_MATCH ) {
	4141	if ( ceMatch == U_CE_SKIP_TARG ) {
	4142	// redo with same patCE, next targCE
	4143	patIx++;
	4144	targetIxOffset++;
	4145	} else { // ceMatch == U_CE_SKIP_PATN
	4146	// redo with same targCE, next patCE
	4147	targetIxOffset--;
	4148	}
	4149	}
	4150	}
	4151
	4152	if (!found && ((targetCEI == NULL) \|\| (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
	4153	// No match at this targetIx. Try again at the next.
	4154	continue;
	4155	}
	4156
	4157	if (!found) {
	4158	// No match at all, we have run off the end of the target text.
	4159	break;
	4160	}
	4161
	4162
	4163	// We have found a match in CE space.
	4164	// Now determine the bounds in string index space.
	4165	// There still is a chance of match failure if the CE range not correspond to
	4166	// an acceptable character range.
	4167	//
	4168	const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset);
	4169	const CEI *lastCEI = ceb.getPrevious(targetIx);
	4170	const CEI *nextCEI = targetIx > 0? ceb.getPrevious(targetIx - 1) : NULL;
	4171
	4172	mStart = firstCEI->lowIndex;
	4173	minLimit = lastCEI->lowIndex;
	4174	maxLimit = targetIx > 0? nextCEI->lowIndex : lastCEI->highIndex;
	4175
	4176	// Look at the CE following the match. If it is UCOL_NULLORDER the match
	4177	// extended to the end of input, and the match is good.
	4178
	4179	// Look at the high and low indices of the CE following the match. If
	4180	// they are the same it means one of two things:
	4181	// 1. The match extended to the last CE from the target text, which is OK, or
	4182	// 2. The last CE that was part of the match is in an expansion that extends
	4183	// to the first CE after the match. In this case, we reject the match.
	4184	if (targetIx >= 1) {
	4185	if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
	4186	found = FALSE;
	4187	}
	4188	}
	4189
	4190
	4191	// Check for the start of the match being within a combining sequence.
	4192	// This can happen if the pattern itself begins with a combining char, and
	4193	// the match found combining marks in the target text that were attached
	4194	// to something else.
	4195	// This type of match should be rejected for not completely consuming a
	4196	// combining sequence.
	4197	if (isBreakBoundary(strsrch, mStart)) {
	4198	found = FALSE;
	4199	}
	4200
	4201	// Look at the high index of the first CE in the match. If it's the same as the
	4202	// low index, the first CE in the match is in the middle of an expansion.
	4203	if (mStart == firstCEI->highIndex) {
	4204	found = FALSE;
	4205	}
	4206
	4207	// Advance the match end position to the first acceptable match boundary.
	4208	// This advances the index over any combining charcters.
	4209	mLimit = maxLimit;
	4210	if (/targetIx > 0 &&/ minLimit < maxLimit) {
	4211	int32_t nba = nextBoundaryAfter(strsrch, minLimit);
	4212
	4213	if (nba >= lastCEI->highIndex) {
	4214	mLimit = nba;
	4215	}
	4216	}
	4217
	4218	#ifdef USEARCH_DEBUG
	4219	if (getenv("USEARCH_DEBUG") != NULL) {
	4220	printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
	4221	}
	4222	#endif
	4223
	4224	// If advancing to the end of a combining sequence in character indexing space
	4225	// advanced us beyond the end of the match in CE space, reject this match.
	4226	if (mLimit > maxLimit) {
	4227	found = FALSE;
	4228	}
	4229
	4230	// Make sure the end of the match is on a break boundary
	4231	if (isBreakBoundary(strsrch, mLimit)) {
	4232	found = FALSE;
	4233	}
	4234
	4235	if (! checkIdentical(strsrch, mStart, mLimit)) {
	4236	found = FALSE;
	4237	}
	4238
	4239	if (found) {
	4240	break;
	4241	}
	4242	}
	4243
	4244	#ifdef USEARCH_DEBUG
	4245	if (getenv("USEARCH_DEBUG") != NULL) {
	4246	printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
	4247	int32_t lastToPrint = ceb.limitIx+2;
	4248	for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
	4249	printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
	4250	}
	4251	printf("\n%s\n", found? "match found" : "no match");
	4252	}
	4253	#endif
	4254
	4255	// All Done. Store back the match bounds to the caller.
	4256	//
	4257	if (found==FALSE) {
	4258	mLimit = -1;
	4259	mStart = -1;
	4260	}
	4261
	4262	if (matchStart != NULL) {
	4263	*matchStart= mStart;
	4264	}
	4265
	4266	if (matchLimit != NULL) {
	4267	*matchLimit = mLimit;
	4268	}
	4269
	4270	return found;
	4271	}
	4272
	4273
	4274
	4275
	4276	// internal use methods declared in usrchimp.h -----------------------------
	4277
	4278	UBool usearch_handleNextExact(UStringSearch strsrch, UErrorCode status)
	4279	{
	4280	if (U_FAILURE(*status)) {
	4281	setMatchNotFound(strsrch);
	4282	return FALSE;
	4283	}
	4284
	4285	#if BOYER_MOORE
	4286	UCollationElements *coleiter = strsrch->textIter;
	4287	int32_t textlength = strsrch->search->textLength;
	4288	int32_t *patternce = strsrch->pattern.CE;
	4289	int32_t patterncelength = strsrch->pattern.CELength;
	4290	int32_t textoffset = ucol_getOffset(coleiter);
	4291
	4292	// status used in setting coleiter offset, since offset is checked in
	4293	// shiftForward before setting the coleiter offset, status never
	4294	// a failure
	4295	textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
	4296	patterncelength);
	4297	while (textoffset <= textlength)
	4298	{
	4299	uint32_t patternceindex = patterncelength - 1;
	4300	int32_t targetce;
	4301	UBool found = FALSE;
	4302	int32_t lastce = UCOL_NULLORDER;
	4303
	4304	setColEIterOffset(coleiter, textoffset);
	4305
	4306	for (;;) {
	4307	// finding the last pattern ce match, imagine composite characters
	4308	// for example: search for pattern A in text \u00C0
	4309	// we'll have to skip \u0300 the grave first before we get to A
	4310	targetce = ucol_previous(coleiter, status);
	4311	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
	4312	found = FALSE;
	4313	break;
	4314	}
	4315	targetce = getCE(strsrch, targetce);
	4316	if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
	4317	// this is for the text \u0315\u0300 that requires
	4318	// normalization and pattern \u0300, where \u0315 is ignorable
	4319	continue;
	4320	}
	4321	if (lastce == UCOL_NULLORDER \|\| lastce == UCOL_IGNORABLE) {
	4322	lastce = targetce;
	4323	}
	4324	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
	4325	if (targetce == patternce[patternceindex]) {
	4326	// the first ce can be a contraction
	4327	found = TRUE;
	4328	break;
	4329	}
	4330	if (!hasExpansion(coleiter)) {
	4331	found = FALSE;
	4332	break;
	4333	}
	4334	}
	4335
	4336	//targetce = lastce;
	4337
	4338	while (found && patternceindex > 0) {
	4339	lastce = targetce;
	4340	targetce = ucol_previous(coleiter, status);
	4341	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
	4342	found = FALSE;
	4343	break;
	4344	}
	4345	targetce = getCE(strsrch, targetce);
	4346	if (targetce == UCOL_IGNORABLE) {
	4347	continue;
	4348	}
	4349
	4350	patternceindex --;
	4351	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
	4352	found = found && targetce == patternce[patternceindex];
	4353	}
	4354
	4355	targetce = lastce;
	4356
	4357	if (!found) {
	4358	if (U_FAILURE(*status)) {
	4359	break;
	4360	}
	4361	textoffset = shiftForward(strsrch, textoffset, lastce,
	4362	patternceindex);
	4363	// status checked at loop.
	4364	patternceindex = patterncelength;
	4365	continue;
	4366	}
	4367
	4368	if (checkNextExactMatch(strsrch, &textoffset, status)) {
	4369	// status checked in ucol_setOffset
	4370	setColEIterOffset(coleiter, strsrch->search->matchedIndex);
	4371	return TRUE;
	4372	}
	4373	}
	4374	setMatchNotFound(strsrch);
	4375	return FALSE;
	4376	#else
	4377	int32_t textOffset = ucol_getOffset(strsrch->textIter);
	4378	int32_t start = -1;
	4379	int32_t end = -1;
	4380
	4381	if (usearch_search(strsrch, textOffset, &start, &end, status)) {
	4382	strsrch->search->matchedIndex = start;
	4383	strsrch->search->matchedLength = end - start;
	4384	return TRUE;
	4385	} else {
	4386	setMatchNotFound(strsrch);
	4387	return FALSE;
	4388	}
	4389	#endif
	4390	}
	4391
	4392	UBool usearch_handleNextCanonical(UStringSearch strsrch, UErrorCode status)
	4393	{
	4394	if (U_FAILURE(*status)) {
	4395	setMatchNotFound(strsrch);
	4396	return FALSE;
	4397	}
	4398
	4399	#if BOYER_MOORE
	4400	UCollationElements *coleiter = strsrch->textIter;
	4401	int32_t textlength = strsrch->search->textLength;
	4402	int32_t *patternce = strsrch->pattern.CE;
	4403	int32_t patterncelength = strsrch->pattern.CELength;
	4404	int32_t textoffset = ucol_getOffset(coleiter);
	4405	UBool hasPatternAccents =
	4406	strsrch->pattern.hasSuffixAccents \|\| strsrch->pattern.hasPrefixAccents;
	4407
	4408	textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
	4409	patterncelength);
	4410	strsrch->canonicalPrefixAccents[0] = 0;
	4411	strsrch->canonicalSuffixAccents[0] = 0;
	4412
	4413	while (textoffset <= textlength)
	4414	{
	4415	int32_t patternceindex = patterncelength - 1;
	4416	int32_t targetce;
	4417	UBool found = FALSE;
	4418	int32_t lastce = UCOL_NULLORDER;
	4419
	4420	setColEIterOffset(coleiter, textoffset);
	4421
	4422	for (;;) {
	4423	// finding the last pattern ce match, imagine composite characters
	4424	// for example: search for pattern A in text \u00C0
	4425	// we'll have to skip \u0300 the grave first before we get to A
	4426	targetce = ucol_previous(coleiter, status);
	4427	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
	4428	found = FALSE;
	4429	break;
	4430	}
	4431	targetce = getCE(strsrch, targetce);
	4432	if (lastce == UCOL_NULLORDER \|\| lastce == UCOL_IGNORABLE) {
	4433	lastce = targetce;
	4434	}
	4435	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
	4436	if (targetce == patternce[patternceindex]) {
	4437	// the first ce can be a contraction
	4438	found = TRUE;
	4439	break;
	4440	}
	4441	if (!hasExpansion(coleiter)) {
	4442	found = FALSE;
	4443	break;
	4444	}
	4445	}
	4446
	4447	while (found && patternceindex > 0) {
	4448	targetce = ucol_previous(coleiter, status);
	4449	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
	4450	found = FALSE;
	4451	break;
	4452	}
	4453	targetce = getCE(strsrch, targetce);
	4454	if (targetce == UCOL_IGNORABLE) {
	4455	continue;
	4456	}
	4457
	4458	patternceindex --;
	4459	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
	4460	found = found && targetce == patternce[patternceindex];
	4461	}
	4462
	4463	// initializing the rearranged accent array
	4464	if (hasPatternAccents && !found) {
	4465	strsrch->canonicalPrefixAccents[0] = 0;
	4466	strsrch->canonicalSuffixAccents[0] = 0;
	4467	if (U_FAILURE(*status)) {
	4468	break;
	4469	}
	4470	found = doNextCanonicalMatch(strsrch, textoffset, status);
	4471	}
	4472
	4473	if (!found) {
	4474	if (U_FAILURE(*status)) {
	4475	break;
	4476	}
	4477	textoffset = shiftForward(strsrch, textoffset, lastce,
	4478	patternceindex);
	4479	// status checked at loop
	4480	patternceindex = patterncelength;
	4481	continue;
	4482	}
	4483
	4484	if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
	4485	setColEIterOffset(coleiter, strsrch->search->matchedIndex);
	4486	return TRUE;
	4487	}
	4488	}
	4489	setMatchNotFound(strsrch);
	4490	return FALSE;
	4491	#else
	4492	int32_t textOffset = ucol_getOffset(strsrch->textIter);
	4493	int32_t start = -1;
	4494	int32_t end = -1;
	4495
	4496	if (usearch_search(strsrch, textOffset, &start, &end, status)) {
	4497	strsrch->search->matchedIndex = start;
	4498	strsrch->search->matchedLength = end - start;
	4499	return TRUE;
	4500	} else {
	4501	setMatchNotFound(strsrch);
	4502	return FALSE;
	4503	}
	4504	#endif
	4505	}
	4506
	4507	UBool usearch_handlePreviousExact(UStringSearch strsrch, UErrorCode status)
	4508	{
	4509	if (U_FAILURE(*status)) {
	4510	setMatchNotFound(strsrch);
	4511	return FALSE;
	4512	}
	4513
	4514	#if BOYER_MOORE
	4515	UCollationElements *coleiter = strsrch->textIter;
	4516	int32_t *patternce = strsrch->pattern.CE;
	4517	int32_t patterncelength = strsrch->pattern.CELength;
	4518	int32_t textoffset = ucol_getOffset(coleiter);
	4519
	4520	// shifting it check for setting offset
	4521	// if setOffset is called previously or there was no previous match, we
	4522	// leave the offset as it is.
	4523	if (strsrch->search->matchedIndex != USEARCH_DONE) {
	4524	textoffset = strsrch->search->matchedIndex;
	4525	}
	4526
	4527	textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
	4528	patterncelength);
	4529
	4530	while (textoffset >= 0)
	4531	{
	4532	int32_t patternceindex = 1;
	4533	int32_t targetce;
	4534	UBool found = FALSE;
	4535	int32_t firstce = UCOL_NULLORDER;
	4536
	4537	// if status is a failure, ucol_setOffset does nothing
	4538	setColEIterOffset(coleiter, textoffset);
	4539
	4540	for (;;) {
	4541	// finding the first pattern ce match, imagine composite
	4542	// characters. for example: search for pattern \u0300 in text
	4543	// \u00C0, we'll have to skip A first before we get to
	4544	// \u0300 the grave accent
	4545	targetce = ucol_next(coleiter, status);
	4546	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
	4547	found = FALSE;
	4548	break;
	4549	}
	4550	targetce = getCE(strsrch, targetce);
	4551	if (firstce == UCOL_NULLORDER \|\| firstce == UCOL_IGNORABLE) {
	4552	firstce = targetce;
	4553	}
	4554	if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
	4555	continue;
	4556	}
	4557	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
	4558	if (targetce == patternce[0]) {
	4559	found = TRUE;
	4560	break;
	4561	}
	4562	if (!hasExpansion(coleiter)) {
	4563	// checking for accents in composite character
	4564	found = FALSE;
	4565	break;
	4566	}
	4567	}
	4568
	4569	//targetce = firstce;
	4570
	4571	while (found && (patternceindex < patterncelength)) {
	4572	firstce = targetce;
	4573	targetce = ucol_next(coleiter, status);
	4574	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
	4575	found = FALSE;
	4576	break;
	4577	}
	4578	targetce = getCE(strsrch, targetce);
	4579	if (targetce == UCOL_IGNORABLE) {
	4580	continue;
	4581	}
	4582
	4583	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
	4584	found = found && targetce == patternce[patternceindex];
	4585	patternceindex ++;
	4586	}
	4587
	4588	targetce = firstce;
	4589
	4590	if (!found) {
	4591	if (U_FAILURE(*status)) {
	4592	break;
	4593	}
	4594
	4595	textoffset = reverseShift(strsrch, textoffset, targetce,
	4596	patternceindex);
	4597	patternceindex = 0;
	4598	continue;
	4599	}
	4600
	4601	if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
	4602	setColEIterOffset(coleiter, textoffset);
	4603	return TRUE;
	4604	}
	4605	}
	4606	setMatchNotFound(strsrch);
	4607	return FALSE;
	4608	#else
	4609	int32_t textOffset = ucol_getOffset(strsrch->textIter);
	4610	int32_t start = -1;
	4611	int32_t end = -1;
	4612
	4613	if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
	4614	strsrch->search->matchedIndex = start;
	4615	strsrch->search->matchedLength = end - start;
	4616	return TRUE;
	4617	} else {
	4618	setMatchNotFound(strsrch);
	4619	return FALSE;
	4620	}
	4621	#endif
	4622	}
	4623
	4624	UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
	4625	UErrorCode *status)
	4626	{
	4627	if (U_FAILURE(*status)) {
	4628	setMatchNotFound(strsrch);
	4629	return FALSE;
	4630	}
	4631
	4632	#if BOYER_MOORE
	4633	UCollationElements *coleiter = strsrch->textIter;
	4634	int32_t *patternce = strsrch->pattern.CE;
	4635	int32_t patterncelength = strsrch->pattern.CELength;
	4636	int32_t textoffset = ucol_getOffset(coleiter);
	4637	UBool hasPatternAccents =
	4638	strsrch->pattern.hasSuffixAccents \|\| strsrch->pattern.hasPrefixAccents;
	4639
	4640	// shifting it check for setting offset
	4641	// if setOffset is called previously or there was no previous match, we
	4642	// leave the offset as it is.
	4643	if (strsrch->search->matchedIndex != USEARCH_DONE) {
	4644	textoffset = strsrch->search->matchedIndex;
	4645	}
	4646
	4647	textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
	4648	patterncelength);
	4649	strsrch->canonicalPrefixAccents[0] = 0;
	4650	strsrch->canonicalSuffixAccents[0] = 0;
	4651
	4652	while (textoffset >= 0)
	4653	{
	4654	int32_t patternceindex = 1;
	4655	int32_t targetce;
	4656	UBool found = FALSE;
	4657	int32_t firstce = UCOL_NULLORDER;
	4658
	4659	setColEIterOffset(coleiter, textoffset);
	4660	for (;;) {
	4661	// finding the first pattern ce match, imagine composite
	4662	// characters. for example: search for pattern \u0300 in text
	4663	// \u00C0, we'll have to skip A first before we get to
	4664	// \u0300 the grave accent
	4665	targetce = ucol_next(coleiter, status);
	4666	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
	4667	found = FALSE;
	4668	break;
	4669	}
	4670	targetce = getCE(strsrch, targetce);
	4671	if (firstce == UCOL_NULLORDER \|\| firstce == UCOL_IGNORABLE) {
	4672	firstce = targetce;
	4673	}
	4674
	4675	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
	4676	if (targetce == patternce[0]) {
	4677	// the first ce can be a contraction
	4678	found = TRUE;
	4679	break;
	4680	}
	4681	if (!hasExpansion(coleiter)) {
	4682	// checking for accents in composite character
	4683	found = FALSE;
	4684	break;
	4685	}
	4686	}
	4687
	4688	targetce = firstce;
	4689
	4690	while (found && patternceindex < patterncelength) {
	4691	targetce = ucol_next(coleiter, status);
	4692	if (U_FAILURE(*status) \|\| targetce == UCOL_NULLORDER) {
	4693	found = FALSE;
	4694	break;
	4695	}
	4696	targetce = getCE(strsrch, targetce);
	4697	if (targetce == UCOL_IGNORABLE) {
	4698	continue;
	4699	}
	4700
	4701	// TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
	4702	found = found && targetce == patternce[patternceindex];
	4703	patternceindex ++;
	4704	}
	4705
	4706	// initializing the rearranged accent array
	4707	if (hasPatternAccents && !found) {
	4708	strsrch->canonicalPrefixAccents[0] = 0;
	4709	strsrch->canonicalSuffixAccents[0] = 0;
	4710	if (U_FAILURE(*status)) {
	4711	break;
	4712	}
	4713	found = doPreviousCanonicalMatch(strsrch, textoffset, status);
	4714	}
	4715
	4716	if (!found) {
	4717	if (U_FAILURE(*status)) {
	4718	break;
	4719	}
	4720	textoffset = reverseShift(strsrch, textoffset, targetce,
	4721	patternceindex);
	4722	patternceindex = 0;
	4723	continue;
	4724	}
	4725
	4726	if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
	4727	setColEIterOffset(coleiter, textoffset);
	4728	return TRUE;
	4729	}
	4730	}
	4731	setMatchNotFound(strsrch);
	4732	return FALSE;
	4733	#else
	4734	int32_t textOffset = ucol_getOffset(strsrch->textIter);
	4735	int32_t start = -1;
	4736	int32_t end = -1;
	4737
	4738	if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
	4739	strsrch->search->matchedIndex = start;
	4740	strsrch->search->matchedLength = end - start;
	4741	return TRUE;
	4742	} else {
	4743	setMatchNotFound(strsrch);
	4744	return FALSE;
	4745	}
	4746	#endif
	4747	}
	4748
	4749	#endif /* #if !UCONFIG_NO_COLLATION */