[apple/icu.git] / icuSources / i18n / strmatch.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (c) 2001-2012, International Business Machines Corporation
*   and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   07/23/01    aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "strmatch.h"
#include "rbt_data.h"
#include "util.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"

U_NAMESPACE_BEGIN

// Expands ICU's RTTI implementation macro for StringMatcher.
// NOTE(review): presumably this defines the class-ID accessors declared in
// strmatch.h -- confirm against the macro definition in unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)

/**
 * Build a matcher from a slice of a rule string.
 *
 * @param theString  string the pattern text is extracted from
 * @param start      first index handed to extractBetween()
 * @param limit      limit index handed to extractBetween()
 * @param segmentNum value stored as this matcher's segment number
 * @param theData    rule data; only its address is kept, so it must
 *                   outlive this object
 */
StringMatcher::StringMatcher(const UnicodeString& theString,
                             int32_t start,
                             int32_t limit,
                             int32_t segmentNum,
                             const TransliterationRuleData& theData)
    : pattern(),
      data(&theData),
      segmentNumber(segmentNum),
      matchStart(-1),   // -1/-1 means "no match recorded yet"
      matchLimit(-1)
{
    // Fill the (so far empty) pattern with theString's [start, limit) text.
    theString.extractBetween(start, limit, pattern);
}

/**
 * Copy constructor.  Duplicates the pattern text, the segment number and
 * the currently recorded match range; the rule-data pointer is copied as
 * is, so both objects refer to the same TransliterationRuleData.
 */
StringMatcher::StringMatcher(const StringMatcher& o)
    : UnicodeFunctor(o),
      UnicodeMatcher(o),
      UnicodeReplacer(o),
      pattern(o.pattern),
      data(o.data),
      segmentNumber(o.segmentNumber),
      matchStart(o.matchStart),
      matchLimit(o.matchLimit)
{}

/**
 * Destructor.  Nothing to release by hand: 'data' is a borrowed pointer
 * and is deliberately not deleted here.
 */
StringMatcher::~StringMatcher() {}

/**
 * Implement UnicodeFunctor.
 * @return a heap-allocated copy of this object made with the copy
 *         constructor
 */
UnicodeFunctor* StringMatcher::clone() const {
    StringMatcher* duplicate = new StringMatcher(*this);
    return duplicate;
}

/**
 * UnicodeFunctor API.  View this object through its UnicodeMatcher base.
 * @return 'this', with constness removed, as a UnicodeMatcher pointer
 */
UnicodeMatcher* StringMatcher::toMatcher() const {
    // Drop const first, then adjust to the UnicodeMatcher base subobject.
    return static_cast<UnicodeMatcher *>(const_cast<StringMatcher *>(this));
}

/**
 * UnicodeFunctor API.  View this object through its UnicodeReplacer base.
 * @return 'this', with constness removed, as a UnicodeReplacer pointer
 */
UnicodeReplacer* StringMatcher::toReplacer() const {
    // Drop const first, then adjust to the UnicodeReplacer base subobject.
    return static_cast<UnicodeReplacer *>(const_cast<StringMatcher *>(this));
}

/**
 * Implement UnicodeMatcher.
 *
 * Walks the stored pattern against 'text'.  When limit < offset the
 * pattern is walked from its last code unit to its first and the text
 * position moves downward; otherwise both move upward.  A pattern code
 * unit for which data->lookupMatcher() returns a matcher is delegated to
 * that matcher; any other code unit must equal the text code unit at the
 * current position.
 *
 * @param text        text being matched
 * @param offset      in: starting position; out: position after the match.
 *                    Written only when U_MATCH is returned.
 * @param limit       boundary the match may not cross
 * @param incremental forwarded to nested matchers; in the forward
 *                    direction, when true, reaching 'limit' before the
 *                    pattern is exhausted yields U_PARTIAL_MATCH
 * @return U_MATCH on success, otherwise U_MISMATCH, U_PARTIAL_MATCH, or
 *         whatever non-U_MATCH value a nested matcher returned
 */
UMatchDegree StringMatcher::matches(const Replaceable& text,
                                    int32_t& offset,
                                    int32_t limit,
                                    UBool incremental) {
    const int32_t patLen = pattern.length();
    int32_t pos = offset;

    if (limit >= pos) {
        // Forward direction.
        for (int32_t idx = 0; idx < patLen; ++idx) {
            if (incremental && pos == limit) {
                // Ran into the context limit with pattern left over and
                // no mismatch so far.
                return U_PARTIAL_MATCH;
            }
            const UChar unit = pattern.charAt(idx);
            UnicodeMatcher* nested = data->lookupMatcher(unit);
            if (nested != NULL) {
                const UMatchDegree degree =
                    nested->matches(text, pos, limit, incremental);
                if (degree != U_MATCH) {
                    return degree;
                }
                continue;
            }
            // Literal code unit.  The pos < limit test is redundant when
            // incremental is set (checked above) but required otherwise.
            if (pos >= limit || unit != text.charAt(pos)) {
                return U_MISMATCH;
            }
            ++pos;
        }
        // Remember where this match landed.
        matchStart = offset;
        matchLimit = pos;
    } else {
        // Reverse direction: last pattern unit first, text position
        // moving toward 'limit' from above.
        for (int32_t idx = patLen - 1; idx >= 0; --idx) {
            const UChar unit = pattern.charAt(idx);
            UnicodeMatcher* nested = data->lookupMatcher(unit);
            if (nested != NULL) {
                const UMatchDegree degree =
                    nested->matches(text, pos, limit, incremental);
                if (degree != U_MATCH) {
                    return degree;
                }
                continue;
            }
            if (pos <= limit || unit != text.charAt(pos)) {
                return U_MISMATCH;
            }
            --pos;
        }
        // Store the range in forward (start, limit) form, and only when
        // nothing has been recorded yet, so the rightmost match is kept.
        if (matchStart < 0) {
            matchStart = pos + 1;
            matchLimit = offset + 1;
        }
    }

    offset = pos;
    return U_MATCH;
}

/**
 * Implement UnicodeMatcher.  Rebuild the rule-source form of this matcher
 * into 'result', replacing whatever 'result' held before.
 *
 * Each code unit of the stored pattern is either passed to
 * ICU_Utility::appendToRule() as itself or, when data->lookupMatcher()
 * returns a matcher for it, expanded to that matcher's own pattern.  When
 * segmentNumber > 0 the output is wrapped in parentheses.
 *
 * @param result            receives the pattern; previous contents are
 *                          discarded
 * @param escapeUnprintable forwarded unchanged to appendToRule() and to
 *                          nested matchers' toPattern()
 * @return a reference to 'result'
 */
UnicodeString& StringMatcher::toPattern(UnicodeString& result,
                                        UBool escapeUnprintable) const
{
    result.truncate(0);
    UnicodeString str, quoteBuf;
    if (segmentNumber > 0) {
        result.append((UChar)40); /*(*/
    }
    for (int32_t i=0; i<pattern.length(); ++i) {
        UChar keyChar = pattern.charAt(i);
        const UnicodeMatcher* m = data->lookupMatcher(keyChar);
        if (m == 0) {
            ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
        } else {
            ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
                         TRUE, escapeUnprintable, quoteBuf);
        }
    }
    // Flush quoteBuf out to result.  This must happen BEFORE the closing
    // parenthesis is appended: quoteBuf may still hold pattern text that
    // appendToRule() has not written yet, and that text belongs inside the
    // segment rather than after its ')'.
    ICU_Utility::appendToRule(result, -1,
                              TRUE, escapeUnprintable, quoteBuf);
    if (segmentNumber > 0) {
        result.append((UChar)41); /*)*/
    }
    return result;
}

/**
 * Implement UnicodeMatcher.
 *
 * An empty pattern reports TRUE for every index value.  Otherwise only
 * the first code point of the pattern is consulted: if the rule data maps
 * it to a matcher the question is delegated, else its low byte is
 * compared with 'v'.
 *
 * @param v index value to test
 * @return TRUE if this matcher may match text with that index value
 */
UBool StringMatcher::matchesIndexValue(uint8_t v) const {
    if (pattern.length() == 0) {
        return TRUE;
    }
    const UChar32 first = pattern.char32At(0);
    const UnicodeMatcher* nested = data->lookupMatcher(first);
    if (nested != NULL) {
        return nested->matchesIndexValue(v);
    }
    return (first & 0xFF) == v;
}

/**
 * Implement UnicodeMatcher.
 *
 * Iterates the pattern by code point.  A code point the rule data maps
 * to a matcher contributes that matcher's match set; any other code
 * point is added to the set directly.
 *
 * @param toUnionTo set that receives the characters
 */
void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    int32_t idx = 0;
    while (idx < pattern.length()) {
        const UChar32 cp = pattern.char32At(idx);
        idx += U16_LENGTH(cp);   // step over one or two code units

        const UnicodeMatcher* nested = data->lookupMatcher(cp);
        if (nested != NULL) {
            nested->addMatchSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
    }
}

/**
 * UnicodeReplacer API.
 *
 * Copies the most recently recorded match range to position 'limit' of
 * 'text' (via Replaceable::copy), then replaces [start, limit) with an
 * empty string.  The cursor argument is not used.
 *
 * @param text  text being edited
 * @param start start of the range to delete
 * @param limit end of the range to delete; also the copy destination
 * @return length of the copied range, or 0 when no match is recorded or
 *         the recorded match is empty
 */
int32_t StringMatcher::replace(Replaceable& text,
                               int32_t start,
                               int32_t limit,
                               int32_t& /*cursor*/) {
    int32_t copiedLen = 0;

    // A negative matchStart means nothing was matched, e.g. a quantifier
    // that matched zero times: x (a)* y against "xy".
    const UBool haveText = (matchStart >= 0) && (matchStart != matchLimit);
    if (haveText) {
        // copy() rather than a string insert, so out-of-band data that
        // the Replaceable carries travels along with the segment.
        text.copy(matchStart, matchLimit, limit);
        copiedLen = matchLimit - matchStart;
    }

    // Remove the original text.
    text.handleReplaceBetween(start, limit, UnicodeString());

    return copiedLen;
}

/**
 * UnicodeReplacer API.  Writes "$" followed by the segment number
 * (radix 10, at least one digit) into 'rule', discarding its previous
 * contents.  The escapeUnprintable argument is not used.
 *
 * @param rule receives the replacer pattern
 * @return a reference to 'rule'
 */
UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
                                                UBool /*escapeUnprintable*/) const {
    // assert(segmentNumber > 0);
    const UChar dollarSign = 0x0024; /*$*/

    rule.truncate(0);
    rule.append(dollarSign);
    ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
    return rule;
}

/**
 * Forget any recorded match by setting both ends of the range back to -1.
 * Call this before starting a new set of matches with this segment.
 */
void StringMatcher::resetMatch() {
    matchLimit = -1;
    matchStart = -1;
}

/**
 * Union the set of all characters that may be output by this object
 * into the given set.  This implementation intentionally adds nothing.
 * @param toUnionTo the set into which to union the output characters
 */
void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
    // What this replacer emits is the source text between matchStart and
    // matchLimit, which depends on the input and so cannot be known here.
    // The choices are to add nothing or to add every character; adding
    // nothing is judged the more useful of the two.
}

/**
 * Implement UnicodeFunctor
 */
void StringMatcher::setData(const TransliterationRuleData* d) {
    data = d;
    int32_t i = 0;
    while (i<pattern.length()) {
        UChar32 c = pattern.char32At(i);
        UnicodeFunctor* f = data->lookup(c);
        if (f != NULL) {
            f->setData(data);
        }
        i += U16_LENGTH(c);
    }
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

//eof
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f	3	/*
374ca955	4	**********************************************************************
51004dcb	5	* Copyright (c) 2001-2012, International Business Machines Corporation
374ca955	6	* and others. All Rights Reserved.
b75a7d8f A	7	**********************************************************************
	8	* Date Name Description
	9	* 07/23/01 aliu Creation.
	10	**********************************************************************
	11	*/
	12
	13	#include "unicode/utypes.h"
	14
	15	#if !UCONFIG_NO_TRANSLITERATION
	16
	17	#include "strmatch.h"
	18	#include "rbt_data.h"
	19	#include "util.h"
	20	#include "unicode/uniset.h"
4388f060	21	#include "unicode/utf16.h"
b75a7d8f A	22
	23	U_NAMESPACE_BEGIN
	24
374ca955	25	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
b75a7d8f A	26
	27	StringMatcher::StringMatcher(const UnicodeString& theString,
	28	int32_t start,
	29	int32_t limit,
	30	int32_t segmentNum,
	31	const TransliterationRuleData& theData) :
	32	data(&theData),
	33	segmentNumber(segmentNum),
	34	matchStart(-1),
	35	matchLimit(-1)
	36	{
	37	theString.extractBetween(start, limit, pattern);
	38	}
	39
	40	StringMatcher::StringMatcher(const StringMatcher& o) :
374ca955	41	UnicodeFunctor(o),
b75a7d8f	42	UnicodeMatcher(o),
374ca955	43	UnicodeReplacer(o),
b75a7d8f A	44	pattern(o.pattern),
	45	data(o.data),
	46	segmentNumber(o.segmentNumber),
	47	matchStart(o.matchStart),
	48	matchLimit(o.matchLimit)
	49	{
	50	}
	51
	52	/**
	53	* Destructor
	54	*/
	55	StringMatcher::~StringMatcher() {
	56	}
	57
	58	/**
	59	* Implement UnicodeFunctor
	60	*/
	61	UnicodeFunctor* StringMatcher::clone() const {
	62	return new StringMatcher(*this);
	63	}
	64
	65	/**
	66	* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
	67	* and return the pointer.
	68	*/
	69	UnicodeMatcher* StringMatcher::toMatcher() const {
51004dcb A	70	StringMatcher nonconst_this = const_cast<StringMatcher >(this);
	71	UnicodeMatcher nonconst_base = static_cast<UnicodeMatcher >(nonconst_this);
	72
	73	return nonconst_base;
b75a7d8f A	74	}
	75
	76	/**
	77	* UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
	78	* and return the pointer.
	79	*/
	80	UnicodeReplacer* StringMatcher::toReplacer() const {
51004dcb A	81	StringMatcher nonconst_this = const_cast<StringMatcher >(this);
	82	UnicodeReplacer nonconst_base = static_cast<UnicodeReplacer >(nonconst_this);
	83
	84	return nonconst_base;
b75a7d8f A	85	}
	86
	87	/**
	88	* Implement UnicodeMatcher
	89	*/
	90	UMatchDegree StringMatcher::matches(const Replaceable& text,
	91	int32_t& offset,
	92	int32_t limit,
	93	UBool incremental) {
	94	int32_t i;
	95	int32_t cursor = offset;
	96	if (limit < cursor) {
	97	// Match in the reverse direction
	98	for (i=pattern.length()-1; i>=0; --i) {
	99	UChar keyChar = pattern.charAt(i);
	100	UnicodeMatcher* subm = data->lookupMatcher(keyChar);
	101	if (subm == 0) {
	102	if (cursor > limit &&
	103	keyChar == text.charAt(cursor)) {
	104	--cursor;
	105	} else {
	106	return U_MISMATCH;
	107	}
	108	} else {
	109	UMatchDegree m =
	110	subm->matches(text, cursor, limit, incremental);
	111	if (m != U_MATCH) {
	112	return m;
	113	}
	114	}
	115	}
	116	// Record the match position, but adjust for a normal
	117	// forward start, limit, and only if a prior match does not
	118	// exist -- we want the rightmost match.
	119	if (matchStart < 0) {
	120	matchStart = cursor+1;
	121	matchLimit = offset+1;
	122	}
	123	} else {
	124	for (i=0; i<pattern.length(); ++i) {
	125	if (incremental && cursor == limit) {
	126	// We've reached the context limit without a mismatch and
	127	// without completing our match.
	128	return U_PARTIAL_MATCH;
	129	}
	130	UChar keyChar = pattern.charAt(i);
	131	UnicodeMatcher* subm = data->lookupMatcher(keyChar);
	132	if (subm == 0) {
	133	// Don't need the cursor < limit check if
	134	// incremental is TRUE (because it's done above); do need
	135	// it otherwise.
	136	if (cursor < limit &&
	137	keyChar == text.charAt(cursor)) {
	138	++cursor;
	139	} else {
	140	return U_MISMATCH;
	141	}
	142	} else {
	143	UMatchDegree m =
	144	subm->matches(text, cursor, limit, incremental);
	145	if (m != U_MATCH) {
	146	return m;
	147	}
	148	}
149	}
150	// Record the match position
151	matchStart = offset;
152	matchLimit = cursor;
153	}
154
155	offset = cursor;
156	return U_MATCH;
157	}
158
159	/**
160	* Implement UnicodeMatcher
161	*/
162	UnicodeString& StringMatcher::toPattern(UnicodeString& result,
163	UBool escapeUnprintable) const
164	{
165	result.truncate(0);
166	UnicodeString str, quoteBuf;
167	if (segmentNumber > 0) {
168	result.append((UChar)40); /(/
169	}
170	for (int32_t i=0; i<pattern.length(); ++i) {
171	UChar keyChar = pattern.charAt(i);
172	const UnicodeMatcher* m = data->lookupMatcher(keyChar);
173	if (m == 0) {
174	ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
175	} else {
176	ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
177	TRUE, escapeUnprintable, quoteBuf);
178	}
179	}
180	if (segmentNumber > 0) {
181	result.append((UChar)41); /)/
182	}
183	// Flush quoteBuf out to result
184	ICU_Utility::appendToRule(result, -1,
185	TRUE, escapeUnprintable, quoteBuf);
186	return result;
187	}
188
189	/**
190	* Implement UnicodeMatcher
191	*/
192	UBool StringMatcher::matchesIndexValue(uint8_t v) const {
193	if (pattern.length() == 0) {
194	return TRUE;
195	}
196	UChar32 c = pattern.char32At(0);
197	const UnicodeMatcher *m = data->lookupMatcher(c);
198	return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
199	}
200
201	/**
202	* Implement UnicodeMatcher
203	*/
204	void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
205	UChar32 ch;
4388f060	206	for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
374ca955 A	207	ch = pattern.char32At(i);
	208	const UnicodeMatcher* matcher = data->lookupMatcher(ch);
	209	if (matcher == NULL) {
	210	toUnionTo.add(ch);
	211	} else {
	212	matcher->addMatchSetTo(toUnionTo);
	213	}
b75a7d8f A	214	}
	215	}
	216
	217	/**
	218	* UnicodeReplacer API
	219	*/
	220	int32_t StringMatcher::replace(Replaceable& text,
	221	int32_t start,
	222	int32_t limit,
374ca955	223	int32_t& /cursor/) {
b75a7d8f A	224
	225	int32_t outLen = 0;
	226
	227	// Copy segment with out-of-band data
	228	int32_t dest = limit;
	229	// If there was no match, that means that a quantifier
	230	// matched zero-length. E.g., x (a)* y matched "xy".
	231	if (matchStart >= 0) {
	232	if (matchStart != matchLimit) {
	233	text.copy(matchStart, matchLimit, dest);
	234	outLen = matchLimit - matchStart;
	235	}
	236	}
	237
4388f060	238	text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
b75a7d8f A	239
	240	return outLen;
	241	}
	242
	243	/**
	244	* UnicodeReplacer API
	245	*/
	246	UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
374ca955	247	UBool /escapeUnprintable/) const {
b75a7d8f A	248	// assert(segmentNumber > 0);
	249	rule.truncate(0);
	250	rule.append((UChar)0x0024 /$/);
	251	ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
	252	return rule;
	253	}
	254
	255	/**
	256	* Remove any match info. This must be called before performing a
	257	* set of matches with this segment.
	258	*/
	259	void StringMatcher::resetMatch() {
	260	matchStart = matchLimit = -1;
	261	}
	262
	263	/**
	264	* Union the set of all characters that may output by this object
	265	* into the given set.
	266	* @param toUnionTo the set into which to union the output characters
	267	*/
374ca955	268	void StringMatcher::addReplacementSetTo(UnicodeSet& /toUnionTo/) const {
b75a7d8f A	269	// The output of this replacer varies; it is the source text between
	270	// matchStart and matchLimit. Since this varies depending on the
	271	// input text, we can't compute it here. We can either do nothing
	272	// or we can add ALL characters to the set. It's probably more useful
	273	// to do nothing.
	274	}
	275
	276	/**
	277	* Implement UnicodeFunctor
	278	*/
	279	void StringMatcher::setData(const TransliterationRuleData* d) {
	280	data = d;
	281	int32_t i = 0;
	282	while (i<pattern.length()) {
	283	UChar32 c = pattern.char32At(i);
	284	UnicodeFunctor* f = data->lookup(c);
	285	if (f != NULL) {
	286	f->setData(data);
	287	}
4388f060 A	288	i += U16_LENGTH(c);
4388f060 A	289	}
b75a7d8f A	290	}
	291
	292	U_NAMESPACE_END
	293
	294	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
	295
	296	//eof