[apple/icu.git] / icuSources / i18n / strmatch.cpp

/*
**********************************************************************
*   Copyright (c) 2001-2004, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   07/23/01    aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "strmatch.h"
#include "rbt_data.h"
#include "util.h"
#include "unicode/uniset.h"

U_NAMESPACE_BEGIN

// Zero-length, NUL-terminated UChar string.  replace() passes it to
// handleReplaceBetween() as the replacement text when it deletes a range.
static const UChar EMPTY[] = { 0 }; // empty string: ""

// NOTE(review): presumably expands to the ICU RTTI boilerplate for
// StringMatcher -- confirm against the macro's definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)

/**
 * Construct a matcher whose pattern is extracted from theString using
 * the given start and limit indices.  No match is recorded initially.
 * @param theString the text from which the pattern is extracted
 * @param start first index handed to extractBetween()
 * @param limit second index handed to extractBetween()
 * @param segmentNum the segment number stored for this object; values
 * greater than zero make toPattern() wrap its output in parentheses
 * @param theData rule data used to look up the matchers that pattern
 * characters may stand for; only its address is stored
 */
StringMatcher::StringMatcher(const UnicodeString& theString,
                             int32_t start,
                             int32_t limit,
                             int32_t segmentNum,
                             const TransliterationRuleData& theData) :
    data(&theData),
    segmentNumber(segmentNum),
    matchStart(-1),  // -1 means "no match recorded"; see resetMatch()
    matchLimit(-1)
{
    theString.extractBetween(start, limit, pattern);
}

/**
 * Copy constructor.  Copies the pattern, the segment number and the
 * recorded match range of o.  The rule data pointer is copied as-is, so
 * both objects refer to the same TransliterationRuleData.
 */
StringMatcher::StringMatcher(const StringMatcher& o) :
    UnicodeFunctor(o),
    UnicodeMatcher(o),
    UnicodeReplacer(o),
    pattern(o.pattern),
    data(o.data),
    segmentNumber(o.segmentNumber),
    matchStart(o.matchStart),
    matchLimit(o.matchLimit)
{
}

/**
 * Destructor.  The body is empty: nothing is released explicitly here,
 * and the rule data pointer is not deleted.
 */
StringMatcher::~StringMatcher() {
}

/**
 * Implement UnicodeFunctor.  Creates a new object with the copy
 * constructor and hands it back to the caller.
 * @return a newly allocated copy of this object
 */
UnicodeFunctor* StringMatcher::clone() const {
    StringMatcher* const duplicate = new StringMatcher(*this);
    return duplicate;
}

/**
 * UnicodeFunctor API.  Returns this object viewed through its
 * UnicodeMatcher base.  The method is const but the returned pointer is
 * not, so constness is removed explicitly before the upcast.
 * @return this object as a non-const UnicodeMatcher pointer
 */
UnicodeMatcher* StringMatcher::toMatcher() const {
    StringMatcher* const self = const_cast<StringMatcher*>(this);
    return static_cast<UnicodeMatcher*>(self);
}

/**
 * UnicodeFunctor API.  Returns this object viewed through its
 * UnicodeReplacer base.  The method is const but the returned pointer is
 * not, so constness is removed explicitly before the upcast.
 * @return this object as a non-const UnicodeReplacer pointer
 */
UnicodeReplacer* StringMatcher::toReplacer() const {
    StringMatcher* const self = const_cast<StringMatcher*>(this);
    return static_cast<UnicodeReplacer*>(self);
}

/**
 * Implement UnicodeMatcher.  Matches the pattern against text, one
 * pattern code unit at a time.  A code unit for which the rule data
 * returns a matcher is delegated to that matcher; any other code unit
 * must equal the text character at the current position.
 *
 * @param text the text to match against
 * @param offset on input, the position at which matching begins.  It is
 * updated to the position reached only when U_MATCH is returned; on any
 * other result it is left untouched.
 * @param limit the boundary of the match.  If limit is less than offset
 * the pattern is matched backwards (last pattern code unit first,
 * position decreasing); otherwise it is matched forwards.
 * @param incremental passed through to nested matchers.  In the forward
 * direction, if TRUE and the position reaches limit before the pattern
 * is exhausted, U_PARTIAL_MATCH is returned.
 * @return U_MATCH if the whole pattern matched, otherwise U_MISMATCH,
 * U_PARTIAL_MATCH, or whatever non-U_MATCH value a nested matcher
 * returned
 */
UMatchDegree StringMatcher::matches(const Replaceable& text,
                                    int32_t& offset,
                                    int32_t limit,
                                    UBool incremental) {
    int32_t pos = offset;

    if (limit >= pos) {
        // Forward direction
        for (int32_t idx = 0; idx < pattern.length(); ++idx) {
            if (incremental && pos == limit) {
                // Ran into the context limit with pattern left over and
                // no mismatch so far.
                return U_PARTIAL_MATCH;
            }
            const UChar key = pattern.charAt(idx);
            UnicodeMatcher* const nested = data->lookupMatcher(key);
            if (nested != 0) {
                // Stand-in for another matcher: let it advance pos.
                const UMatchDegree degree =
                    nested->matches(text, pos, limit, incremental);
                if (degree != U_MATCH) {
                    return degree;
                }
                continue;
            }
            // Literal code unit.  The bounds test is redundant when
            // incremental is TRUE (checked above) but required otherwise.
            if (pos >= limit || key != text.charAt(pos)) {
                return U_MISMATCH;
            }
            ++pos;
        }
        // Remember where this match lies.
        matchStart = offset;
        matchLimit = pos;
    } else {
        // Reverse direction: walk the pattern from its last code unit.
        for (int32_t idx = pattern.length() - 1; idx >= 0; --idx) {
            const UChar key = pattern.charAt(idx);
            UnicodeMatcher* const nested = data->lookupMatcher(key);
            if (nested != 0) {
                const UMatchDegree degree =
                    nested->matches(text, pos, limit, incremental);
                if (degree != U_MATCH) {
                    return degree;
                }
                continue;
            }
            if (pos <= limit || key != text.charAt(pos)) {
                return U_MISMATCH;
            }
            --pos;
        }
        // Store the range in forward (start, limit) form.  An existing
        // recorded match is kept, because the rightmost match is the
        // one that is wanted.
        if (matchStart < 0) {
            matchStart = pos + 1;
            matchLimit = offset + 1;
        }
    }

    offset = pos;
    return U_MATCH;
}

/**
 * Implement UnicodeMatcher.  Builds the rule-syntax representation of
 * this matcher in result, replacing any previous content.
 *
 * Each pattern code unit is either appended as a literal (through
 * ICU_Utility::appendToRule, which may hold it back in a quote buffer)
 * or, if the rule data maps it to a matcher, replaced by that matcher's
 * own pattern.  If this object is a segment (segmentNumber > 0) the
 * output is enclosed in parentheses.
 *
 * @param result receives the pattern; truncated first
 * @param escapeUnprintable forwarded to appendToRule() and to nested
 * matchers' toPattern()
 * @return result
 */
UnicodeString& StringMatcher::toPattern(UnicodeString& result,
                                        UBool escapeUnprintable) const
{
    result.truncate(0);
    UnicodeString str, quoteBuf;
    if (segmentNumber > 0) {
        result.append((UChar)40); /*(*/
    }
    for (int32_t i=0; i<pattern.length(); ++i) {
        UChar keyChar = pattern.charAt(i);
        const UnicodeMatcher* m = data->lookupMatcher(keyChar);
        if (m == 0) {
            ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
        } else {
            ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
                         TRUE, escapeUnprintable, quoteBuf);
        }
    }
    // Flush quoteBuf out to result BEFORE closing the segment.  The
    // closing parenthesis is appended directly to result, bypassing
    // quoteBuf, so any text still pending in quoteBuf would otherwise be
    // emitted after the ')' and fall outside the segment.
    ICU_Utility::appendToRule(result, -1,
                              TRUE, escapeUnprintable, quoteBuf);
    if (segmentNumber > 0) {
        result.append((UChar)41); /*)*/
    }
    return result;
}

/**
 * Implement UnicodeMatcher.  Decides from the first code point of the
 * pattern whether this matcher is compatible with the index value v.
 * @param v the index byte to test
 * @return TRUE if the pattern is empty; otherwise the answer of the
 * matcher the first code point stands for, or, when it stands for none,
 * whether the low eight bits of that code point equal v
 */
UBool StringMatcher::matchesIndexValue(uint8_t v) const {
    if (pattern.length() == 0) {
        return TRUE;
    }
    const UChar32 first = pattern.char32At(0);
    const UnicodeMatcher* const nested = data->lookupMatcher(first);
    if (nested != 0) {
        return nested->matchesIndexValue(v);
    }
    return (first & 0xFF) == v;
}

/**
 * Implement UnicodeMatcher.  Walks the pattern one code point at a
 * time.  A code point that the rule data maps to a matcher contributes
 * that matcher's match set; any other code point is added itself.
 * @param toUnionTo the set that receives the additions
 */
void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    int32_t pos = 0;
    while (pos < pattern.length()) {
        const UChar32 cp = pattern.char32At(pos);
        // Step by the number of code units cp occupies.
        pos += UTF_CHAR_LENGTH(cp);

        const UnicodeMatcher* const nested = data->lookupMatcher(cp);
        if (nested != NULL) {
            nested->addMatchSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
    }
}

/**
 * UnicodeReplacer API.  Replaces the text between start and limit with
 * the text most recently recorded by matches().
 *
 * The recorded range is first copied to position limit using
 * Replaceable::copy(), which carries out-of-band data along with the
 * characters, and then the original range is deleted.
 *
 * @param text the text being modified
 * @param start start of the range to replace
 * @param limit end of the range to replace; also the copy destination
 * @param cursor unused
 * @return the length of the copied text, or 0 if no match is recorded
 * or the recorded match is empty
 */
int32_t StringMatcher::replace(Replaceable& text,
                               int32_t start,
                               int32_t limit,
                               int32_t& /*cursor*/) {
    int32_t copiedLength = 0;

    // A negative matchStart means nothing was matched, which happens
    // when a quantifier matched zero times: x (a)* y against "xy".
    // An empty recorded range likewise needs no copy.
    if (matchStart >= 0 && matchStart != matchLimit) {
        text.copy(matchStart, matchLimit, limit);
        copiedLength = matchLimit - matchStart;
    }

    // Remove the text that was to be replaced.
    text.handleReplaceBetween(start, limit, EMPTY);

    return copiedLength;
}

/**
 * UnicodeReplacer API.  Writes the segment reference for this object
 * into rule: a dollar sign followed by segmentNumber as formatted by
 * ICU_Utility::appendNumber().  Previous content of rule is discarded.
 * @param rule receives the reference
 * @param escapeUnprintable unused
 * @return rule
 */
UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
                                                UBool /*escapeUnprintable*/) const {
    // assert(segmentNumber > 0);
    const UChar dollarSign = 0x0024; /*$*/

    rule.truncate(0);
    rule.append(dollarSign);
    ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
    return rule;
}

/**
 * Forget any recorded match by setting both ends of the range to -1.
 * Call this before starting a set of matches with this segment.
 */
void StringMatcher::resetMatch() {
    matchLimit = -1;
    matchStart = -1;
}

/**
 * Union the set of all characters that may be output by this object
 * into the given set.  This implementation intentionally adds nothing.
 * @param toUnionTo the set into which to union the output characters
 * (unused)
 */
void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
    // The output of this replacer varies; it is the source text between
    // matchStart and matchLimit.  Since this varies depending on the
    // input text, we can't compute it here.  We can either do nothing
    // or we can add ALL characters to the set.  It's probably more useful
    // to do nothing.
}

/**
 * Implement UnicodeFunctor
 */
void StringMatcher::setData(const TransliterationRuleData* d) {
    data = d;
    int32_t i = 0;
    while (i<pattern.length()) {
        UChar32 c = pattern.char32At(i);
        UnicodeFunctor* f = data->lookup(c);
        if (f != NULL) {
            f->setData(data);
        }
        i += UTF_CHAR_LENGTH(c);
    }    
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof
Commit	Line	Data
b75a7d8f	1	/*
374ca955 A	2	**********************************************************************
	3	* Copyright (c) 2001-2004, International Business Machines Corporation
	4	* and others. All Rights Reserved.
b75a7d8f A	5	**********************************************************************
	6	* Date Name Description
	7	* 07/23/01 aliu Creation.
	8	**********************************************************************
	9	*/
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "strmatch.h"
	16	#include "rbt_data.h"
	17	#include "util.h"
	18	#include "unicode/uniset.h"
	19
	20	U_NAMESPACE_BEGIN
	21
374ca955	22	static const UChar EMPTY[] = { 0 }; // empty string: ""
b75a7d8f	23
374ca955	24	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
b75a7d8f A	25
	26	StringMatcher::StringMatcher(const UnicodeString& theString,
	27	int32_t start,
	28	int32_t limit,
	29	int32_t segmentNum,
	30	const TransliterationRuleData& theData) :
	31	data(&theData),
	32	segmentNumber(segmentNum),
	33	matchStart(-1),
	34	matchLimit(-1)
	35	{
	36	theString.extractBetween(start, limit, pattern);
	37	}
	38
	39	StringMatcher::StringMatcher(const StringMatcher& o) :
374ca955	40	UnicodeFunctor(o),
b75a7d8f	41	UnicodeMatcher(o),
374ca955	42	UnicodeReplacer(o),
b75a7d8f A	43	pattern(o.pattern),
	44	data(o.data),
	45	segmentNumber(o.segmentNumber),
	46	matchStart(o.matchStart),
	47	matchLimit(o.matchLimit)
	48	{
	49	}
	50
	51	/**
	52	* Destructor
	53	*/
	54	StringMatcher::~StringMatcher() {
	55	}
	56
	57	/**
	58	* Implement UnicodeFunctor
	59	*/
	60	UnicodeFunctor* StringMatcher::clone() const {
	61	return new StringMatcher(*this);
	62	}
	63
	64	/**
	65	* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
	66	* and return the pointer.
	67	*/
	68	UnicodeMatcher* StringMatcher::toMatcher() const {
	69	return (UnicodeMatcher*) this;
	70	}
	71
	72	/**
	73	* UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
	74	* and return the pointer.
	75	*/
	76	UnicodeReplacer* StringMatcher::toReplacer() const {
	77	return (UnicodeReplacer*) this;
	78	}
	79
	80	/**
	81	* Implement UnicodeMatcher
	82	*/
	83	UMatchDegree StringMatcher::matches(const Replaceable& text,
	84	int32_t& offset,
	85	int32_t limit,
	86	UBool incremental) {
	87	int32_t i;
	88	int32_t cursor = offset;
	89	if (limit < cursor) {
	90	// Match in the reverse direction
	91	for (i=pattern.length()-1; i>=0; --i) {
	92	UChar keyChar = pattern.charAt(i);
	93	UnicodeMatcher* subm = data->lookupMatcher(keyChar);
	94	if (subm == 0) {
	95	if (cursor > limit &&
	96	keyChar == text.charAt(cursor)) {
	97	--cursor;
	98	} else {
	99	return U_MISMATCH;
	100	}
	101	} else {
	102	UMatchDegree m =
	103	subm->matches(text, cursor, limit, incremental);
	104	if (m != U_MATCH) {
	105	return m;
	106	}
107	}
108	}
109	// Record the match position, but adjust for a normal
110	// forward start, limit, and only if a prior match does not
111	// exist -- we want the rightmost match.
112	if (matchStart < 0) {
113	matchStart = cursor+1;
114	matchLimit = offset+1;
115	}
116	} else {
117	for (i=0; i<pattern.length(); ++i) {
118	if (incremental && cursor == limit) {
119	// We've reached the context limit without a mismatch and
120	// without completing our match.
121	return U_PARTIAL_MATCH;
122	}
123	UChar keyChar = pattern.charAt(i);
124	UnicodeMatcher* subm = data->lookupMatcher(keyChar);
125	if (subm == 0) {
126	// Don't need the cursor < limit check if
127	// incremental is TRUE (because it's done above); do need
128	// it otherwise.
129	if (cursor < limit &&
130	keyChar == text.charAt(cursor)) {
131	++cursor;
132	} else {
133	return U_MISMATCH;
134	}
135	} else {
136	UMatchDegree m =
137	subm->matches(text, cursor, limit, incremental);
138	if (m != U_MATCH) {
139	return m;
140	}
141	}
142	}
143	// Record the match position
144	matchStart = offset;
145	matchLimit = cursor;
146	}
147
148	offset = cursor;
149	return U_MATCH;
150	}
151
152	/**
153	* Implement UnicodeMatcher
154	*/
155	UnicodeString& StringMatcher::toPattern(UnicodeString& result,
156	UBool escapeUnprintable) const
157	{
158	result.truncate(0);
159	UnicodeString str, quoteBuf;
160	if (segmentNumber > 0) {
161	result.append((UChar)40); /(/
162	}
163	for (int32_t i=0; i<pattern.length(); ++i) {
164	UChar keyChar = pattern.charAt(i);
165	const UnicodeMatcher* m = data->lookupMatcher(keyChar);
166	if (m == 0) {
167	ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
168	} else {
169	ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
170	TRUE, escapeUnprintable, quoteBuf);
171	}
172	}
173	if (segmentNumber > 0) {
174	result.append((UChar)41); /)/
175	}
176	// Flush quoteBuf out to result
177	ICU_Utility::appendToRule(result, -1,
178	TRUE, escapeUnprintable, quoteBuf);
179	return result;
180	}
181
182	/**
183	* Implement UnicodeMatcher
184	*/
185	UBool StringMatcher::matchesIndexValue(uint8_t v) const {
186	if (pattern.length() == 0) {
187	return TRUE;
188	}
189	UChar32 c = pattern.char32At(0);
190	const UnicodeMatcher *m = data->lookupMatcher(c);
191	return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
192	}
193
194	/**
195	* Implement UnicodeMatcher
196	*/
197	void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
198	UChar32 ch;
199	for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) {
374ca955 A	200	ch = pattern.char32At(i);
	201	const UnicodeMatcher* matcher = data->lookupMatcher(ch);
	202	if (matcher == NULL) {
	203	toUnionTo.add(ch);
	204	} else {
	205	matcher->addMatchSetTo(toUnionTo);
	206	}
b75a7d8f A	207	}
	208	}
	209
	210	/**
	211	* UnicodeReplacer API
	212	*/
	213	int32_t StringMatcher::replace(Replaceable& text,
	214	int32_t start,
	215	int32_t limit,
374ca955	216	int32_t& /cursor/) {
b75a7d8f A	217
	218	int32_t outLen = 0;
	219
	220	// Copy segment with out-of-band data
	221	int32_t dest = limit;
	222	// If there was no match, that means that a quantifier
	223	// matched zero-length. E.g., x (a)* y matched "xy".
	224	if (matchStart >= 0) {
	225	if (matchStart != matchLimit) {
	226	text.copy(matchStart, matchLimit, dest);
	227	outLen = matchLimit - matchStart;
	228	}
	229	}
	230
	231	text.handleReplaceBetween(start, limit, EMPTY); // delete original text
	232
	233	return outLen;
	234	}
	235
	236	/**
	237	* UnicodeReplacer API
	238	*/
	239	UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
374ca955	240	UBool /escapeUnprintable/) const {
b75a7d8f A	241	// assert(segmentNumber > 0);
	242	rule.truncate(0);
	243	rule.append((UChar)0x0024 /$/);
	244	ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
	245	return rule;
	246	}
	247
	248	/**
	249	* Remove any match info. This must be called before performing a
	250	* set of matches with this segment.
	251	*/
	252	void StringMatcher::resetMatch() {
	253	matchStart = matchLimit = -1;
	254	}
	255
	256	/**
	257	* Union the set of all characters that may output by this object
	258	* into the given set.
	259	* @param toUnionTo the set into which to union the output characters
	260	*/
374ca955	261	void StringMatcher::addReplacementSetTo(UnicodeSet& /toUnionTo/) const {
b75a7d8f A	262	// The output of this replacer varies; it is the source text between
	263	// matchStart and matchLimit. Since this varies depending on the
	264	// input text, we can't compute it here. We can either do nothing
	265	// or we can add ALL characters to the set. It's probably more useful
	266	// to do nothing.
	267	}
	268
	269	/**
	270	* Implement UnicodeFunctor
	271	*/
	272	void StringMatcher::setData(const TransliterationRuleData* d) {
	273	data = d;
	274	int32_t i = 0;
	275	while (i<pattern.length()) {
	276	UChar32 c = pattern.char32At(i);
	277	UnicodeFunctor* f = data->lookup(c);
	278	if (f != NULL) {
	279	f->setData(data);
	280	}
	281	i += UTF_CHAR_LENGTH(c);
	282	}
	283	}
	284
	285	U_NAMESPACE_END
	286
	287	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
	288
	289	//eof