[apple/icu.git] / icuSources / i18n / strmatch.h

/*
 * Copyright (C) 2001-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/23/01    aliu        Creation.
 **********************************************************************
 */
#ifndef STRMATCH_H
#define STRMATCH_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/unistr.h"
#include "unicode/unifunct.h"
#include "unicode/unimatch.h"
#include "unicode/unirepl.h"

U_NAMESPACE_BEGIN

class TransliterationRuleData;

/**
 * An object that matches a fixed input string, implementing the
 * UnicodeMatcher API.  This object also implements the
 * UnicodeReplacer API, allowing it to emit the matched text as
 * output.  Since the match text may contain flexible match elements,
 * such as UnicodeSets, the emitted text is not the match pattern, but
 * instead a substring of the actual matched text.  Following
 * convention, the output text is the leftmost match seen up to this
 * point.
 *
 * NOTE(review): the paragraph above says "leftmost" match, whereas
 * the comments on the private members matchStart and matchLimit say
 * "rightmost" match.  One of the two is presumably stale; confirm
 * against the implementation of matches().
 *
 * A StringMatcher may represent a segment, in which case it has a
 * positive segment number.  This affects how the matcher converts
 * itself to a pattern but does not otherwise affect its function.
 *
 * A StringMatcher that is not a segment should not be used as a
 * UnicodeReplacer.
 *
 * The class declares a copy constructor and a destructor but no
 * copy assignment operator.
 */
class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {

 public:

    /**
     * Construct a matcher that matches the given pattern string.
     * @param string the pattern to be matched, possibly containing
     * stand-ins that represent nested UnicodeMatcher objects.
     * @param start inclusive start index of text to be replaced
     * @param limit exclusive end index of text to be replaced;
     * must be greater than or equal to start
     * @param segmentNum the segment number from 1..n, or 0 if this is
     * not a segment.
     * @param data context object mapping stand-ins to
     * UnicodeMatcher objects.
     *
     * NOTE(review): the descriptions of 'start' and 'limit' are
     * worded identically to those on replace().  Here they presumably
     * delimit the part of 'string' that is used as the pattern rather
     * than "text to be replaced" -- confirm against the constructor's
     * implementation.
     */
    StringMatcher(const UnicodeString& string,
                  int32_t start,
                  int32_t limit,
                  int32_t segmentNum,
                  const TransliterationRuleData& data);

    /**
     * Copy constructor.
     * @param o  the object to be copied.
     */
    StringMatcher(const StringMatcher& o);
        
    /**
     * Destructor.  Declared virtual.
     */
    virtual ~StringMatcher();

    /**
     * Implement UnicodeFunctor.
     * @return a copy of the object.
     *
     * NOTE(review): ownership of the returned pointer is not stated
     * here; presumably the caller owns it -- confirm.
     */
    virtual UnicodeFunctor* clone() const;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     * and return the pointer.
     * @return the UnicodeMatcher pointer.
     *
     * NOTE(review): this is a const method that returns a pointer to
     * non-const; the implementation presumably casts away constness
     * of 'this' -- confirm.
     */
    virtual UnicodeMatcher* toMatcher() const;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     * and return the pointer.
     * @return the UnicodeReplacer pointer.
     *
     * NOTE(review): as with toMatcher(), a const method returning a
     * pointer to non-const -- confirm how constness is handled.
     */
    virtual UnicodeReplacer* toReplacer() const;

    /**
     * Implement UnicodeMatcher.  Attempts to match this object's
     * pattern against 'text'.
     * @param text the text to be matched
     * @param offset on input, the index into text at which to begin
     * matching.  On output, the limit of the matched text.  The
     * number of matched characters is the output value of offset
     * minus the input value.  Offset should always point to the
     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
     * both on entry and upon return.
     * @param limit the limit index of text to be matched.  Greater
     * than offset for a forward direction match, less than offset for
     * a backward direction match.  The last character to be
     * considered for matching will be text.charAt(limit-1) in the
     * forward direction or text.charAt(limit+1) in the backward
     * direction.
     * @param incremental  if TRUE, then assume further characters may
     * be inserted at limit and check for partial matching.  Otherwise
     * assume the text as given is complete.
     * @return a match degree value indicating a full match, a partial
     * match, or a mismatch.  If incremental is FALSE then
     * U_PARTIAL_MATCH should never be returned.
     *
     * NOTE(review): this method is not const; presumably it records
     * the match position in matchStart/matchLimit -- confirm.
     */
    virtual UMatchDegree matches(const Replaceable& text,
                                 int32_t& offset,
                                 int32_t limit,
                                 UBool incremental);

    /**
     * Implement UnicodeMatcher.  Produces the pattern string form of
     * this matcher.
     * @param result            Output param to receive the pattern.
     * @param escapeUnprintable if TRUE then escape the unprintable
     *                          characters.  Defaults to FALSE.
     * @return                  A reference to 'result'.
     */
    virtual UnicodeString& toPattern(UnicodeString& result,
                                     UBool escapeUnprintable = FALSE) const;

    /**
     * Implement UnicodeMatcher.
     * Returns TRUE if this matcher will match a character c, where c
     * & 0xFF == v, at offset, in the forward direction (with limit >
     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
     * indexing.
     * @param v    the given value
     * @return     TRUE if this matcher will match a character c,
     *             where c & 0xFF == v
     */
    virtual UBool matchesIndexValue(uint8_t v) const;

    /**
     * Implement UnicodeMatcher.
     * @param toUnionTo the set to be modified.
     *
     * NOTE(review): by analogy with addReplacementSetTo(), this
     * presumably unions into 'toUnionTo' the set of all characters
     * that this object may match -- confirm against the
     * UnicodeMatcher documentation.
     */
    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;

    /**
     * Implement UnicodeFunctor.
     * The parameter is unnamed in this declaration; it is a pointer
     * to a TransliterationRuleData.
     *
     * NOTE(review): presumably this replaces the 'data' context
     * pointer held by this object -- confirm.
     */
    virtual void setData(const TransliterationRuleData*);

    /**
     * Implement UnicodeReplacer.
     * Replace characters in 'text' from 'start' to 'limit' with the
     * output text of this object.  Update the 'cursor' parameter to
     * give the cursor position and return the length of the
     * replacement text.
     *
     * @param text the text in which characters are replaced
     * @param start inclusive start index of text to be replaced
     * @param limit exclusive end index of text to be replaced;
     * must be greater than or equal to start
     * @param cursor output parameter for the cursor position.
     * Not all replacer objects will update this, but in a complete
     * tree of replacer objects, representing the entire output side
     * of a transliteration rule, at least one must update it.
     * @return the number of 16-bit code units in the text replacing
     * the characters at offsets start..(limit-1) in text
     */
    virtual int32_t replace(Replaceable& text,
                            int32_t start,
                            int32_t limit,
                            int32_t& cursor);

    /**
     * Implement UnicodeReplacer.
     * Returns a string representation of this replacer.  If the
     * result of calling this function is passed to the appropriate
     * parser, typically TransliteratorParser, it will produce another
     * replacer that is equal to this one.
     * @param result the string to receive the pattern.  Previous
     * contents will be deleted.
     * @param escapeUnprintable if TRUE then convert unprintable
     * characters to their hex escape representations, \\uxxxx or
     * \\Uxxxxxxxx.  Unprintable characters are defined by
     * Utility.isUnprintable().
     * @return a reference to 'result'.
     */
    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
                                             UBool escapeUnprintable) const;

    /**
     * Remove any match data.  This must be called before performing a
     * set of matches with this segment.
     *
     * NOTE(review): "match data" presumably refers to matchStart and
     * matchLimit -- confirm.
     */
    void resetMatch();

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     */
    virtual UClassID getDynamicClassID() const;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     */
    static UClassID U_EXPORT2 getStaticClassID();

    /**
     * Implement UnicodeReplacer.
     * Union the set of all characters that may be output by this
     * object into the given set.
     * @param toUnionTo the set into which to union the output characters
     */
    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;

 private:

    /**
     * The text to be matched.
     */
    UnicodeString pattern;

    /**
     * Context object that maps stand-ins to matcher and replacer
     * objects.  Held as a pointer to const.
     *
     * NOTE(review): presumably not owned by this object -- confirm
     * against the destructor.
     */
    const TransliterationRuleData* data;

    /**
     * The segment number, 1-based, or 0 if not a segment.
     */
    int32_t segmentNumber;

    /**
     * Start offset, in the match text, of the <em>rightmost</em>
     * match.
     */
    int32_t matchStart;

    /**
     * Limit offset, in the match text, of the <em>rightmost</em>
     * match.
     */
    int32_t matchLimit;

};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif
Commit	Line	Data
b75a7d8f	1	/*
4388f060	2	* Copyright (C) 2001-2011, International Business Machines Corporation
374ca955 A	3	* and others. All Rights Reserved.
	4	**********************************************************************
	5	* Date Name Description
	6	* 07/23/01 aliu Creation.
	7	**********************************************************************
	8	*/
b75a7d8f A	9	#ifndef STRMATCH_H
	10	#define STRMATCH_H
	11
	12	#include "unicode/utypes.h"
	13
	14	#if !UCONFIG_NO_TRANSLITERATION
	15
	16	#include "unicode/unistr.h"
	17	#include "unicode/unifunct.h"
	18	#include "unicode/unimatch.h"
	19	#include "unicode/unirepl.h"
	20
	21	U_NAMESPACE_BEGIN
	22
	23	class TransliterationRuleData;
	24
	25	/**
	26	* An object that matches a fixed input string, implementing the
	27	* UnicodeMatcher API. This object also implements the
	28	* UnicodeReplacer API, allowing it to emit the matched text as
	29	* output. Since the match text may contain flexible match elements,
	30	* such as UnicodeSets, the emitted text is not the match pattern, but
	31	* instead a substring of the actual matched text. Following
	32	* convention, the output text is the leftmost match seen up to this
	33	* point.
	34	*
	35	* A StringMatcher may represent a segment, in which case it has a
	36	* positive segment number. This affects how the matcher converts
	37	* itself to a pattern but does not otherwise affect its function.
	38	*
	39	* A StringMatcher that is not a segment should not be used as a
	40	* UnicodeReplacer.
	41	*/
	42	class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
	43
	44	public:
	45
	46	/**
	47	* Construct a matcher that matches the given pattern string.
	48	* @param string the pattern to be matched, possibly containing
	49	* stand-ins that represent nested UnicodeMatcher objects.
	50	* @param start inclusive start index of text to be replaced
	51	* @param limit exclusive end index of text to be replaced;
	52	* must be greater than or equal to start
	53	* @param segmentNum the segment number from 1..n, or 0 if this is
	54	* not a segment.
	55	* @param data context object mapping stand-ins to
	56	* UnicodeMatcher objects.
	57	*/
	58	StringMatcher(const UnicodeString& string,
	59	int32_t start,
	60	int32_t limit,
	61	int32_t segmentNum,
	62	const TransliterationRuleData& data);
	63
	64	/**
	65	* Copy constructor
	66	* @param o the object to be copied.
	67	*/
	68	StringMatcher(const StringMatcher& o);
	69
	70	/**
	71	* Destructor
	72	*/
73	virtual ~StringMatcher();
74
75	/**
76	* Implement UnicodeFunctor
77	* @return a copy of the object.
78	*/
79	virtual UnicodeFunctor* clone() const;
80
81	/**
82	* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
83	* and return the pointer.
84	* @return the UnicodeMatcher point.
85	*/
86	virtual UnicodeMatcher* toMatcher() const;
87
88	/**
89	* UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
90	* and return the pointer.
91	* @return the UnicodeReplacer pointer.
92	*/
93	virtual UnicodeReplacer* toReplacer() const;
94
95	/**
96	* Implement UnicodeMatcher
97	* @param text the text to be matched
98	* @param offset on input, the index into text at which to begin
99	* matching. On output, the limit of the matched text. The
100	* number of matched characters is the output value of offset
101	* minus the input value. Offset should always point to the
102	* HIGH SURROGATE (leading code unit) of a pair of surrogates,
103	* both on entry and upon return.
104	* @param limit the limit index of text to be matched. Greater
105	* than offset for a forward direction match, less than offset for
106	* a backward direction match. The last character to be
107	* considered for matching will be text.charAt(limit-1) in the
108	* forward direction or text.charAt(limit+1) in the backward
109	* direction.
110	* @param incremental if TRUE, then assume further characters may
111	* be inserted at limit and check for partial matching. Otherwise
112	* assume the text as given is complete.
113	* @return a match degree value indicating a full match, a partial
114	* match, or a mismatch. If incremental is FALSE then
115	* U_PARTIAL_MATCH should never be returned.
116	*/
117	virtual UMatchDegree matches(const Replaceable& text,
118	int32_t& offset,
119	int32_t limit,
120	UBool incremental);
121
122	/**
123	* Implement UnicodeMatcher
124	* @param result Output param to receive the pattern.
125	* @param escapeUnprintable if True then escape the unprintable characters.
126	* @return A reference to 'result'.
127	*/
128	virtual UnicodeString& toPattern(UnicodeString& result,
129	UBool escapeUnprintable = FALSE) const;
130
131	/**
132	* Implement UnicodeMatcher
133	* Returns TRUE if this matcher will match a character c, where c
134	* & 0xFF == v, at offset, in the forward direction (with limit >
135	* offset). This is used by <tt>RuleBasedTransliterator</tt> for
136	* indexing.
137	* @param v the given value
138	* @return TRUE if this matcher will match a character c,
139	* where c & 0xFF == v
140	*/
141	virtual UBool matchesIndexValue(uint8_t v) const;
142
143	/**
144	* Implement UnicodeMatcher
145	*/
146	virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
147
148	/**
149	* Implement UnicodeFunctor
150	*/
151	virtual void setData(const TransliterationRuleData*);
152
153	/**
154	* Replace characters in 'text' from 'start' to 'limit' with the
155	* output text of this object. Update the 'cursor' parameter to
156	* give the cursor position and return the length of the
157	* replacement text.
158	*
159	* @param text the text to be matched
160	* @param start inclusive start index of text to be replaced
161	* @param limit exclusive end index of text to be replaced;
162	* must be greater than or equal to start
163	* @param cursor output parameter for the cursor position.
164	* Not all replacer objects will update this, but in a complete
165	* tree of replacer objects, representing the entire output side
166	* of a transliteration rule, at least one must update it.
167	* @return the number of 16-bit code units in the text replacing
168	* the characters at offsets start..(limit-1) in text
169	*/
170	virtual int32_t replace(Replaceable& text,
171	int32_t start,
172	int32_t limit,
173	int32_t& cursor);
174
175	/**
176	* Returns a string representation of this replacer. If the
177	* result of calling this function is passed to the appropriate
178	* parser, typically TransliteratorParser, it will produce another
179	* replacer that is equal to this one.
180	* @param result the string to receive the pattern. Previous
181	* contents will be deleted.
182	* @param escapeUnprintable if TRUE then convert unprintable
183	* character to their hex escape representations, \\uxxxx or
184	* \\Uxxxxxxxx. Unprintable characters are defined by
185	* Utility.isUnprintable().
186	* @return a reference to 'result'.
187	*/
188	virtual UnicodeString& toReplacerPattern(UnicodeString& result,
189	UBool escapeUnprintable) const;
190
191	/**
192	* Remove any match data. This must be called before performing a
193	* set of matches with this segment.
194	*/
195	void resetMatch();
196
197	/**
198	* ICU "poor man's RTTI", returns a UClassID for the actual class.
b75a7d8f	199	*/
374ca955	200	virtual UClassID getDynamicClassID() const;
b75a7d8f A	201
	202	/**
	203	* ICU "poor man's RTTI", returns a UClassID for this class.
b75a7d8f	204	*/
374ca955	205	static UClassID U_EXPORT2 getStaticClassID();
b75a7d8f A	206
	207	/**
	208	* Union the set of all characters that may output by this object
	209	* into the given set.
	210	* @param toUnionTo the set into which to union the output characters
	211	*/
	212	virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
	213
	214	private:
	215
	216	/**
	217	* The text to be matched.
	218	*/
	219	UnicodeString pattern;
	220
	221	/**
	222	* Context object that maps stand-ins to matcher and replacer
	223	* objects.
	224	*/
	225	const TransliterationRuleData* data;
	226
	227	/**
	228	* The segment number, 1-based, or 0 if not a segment.
	229	*/
	230	int32_t segmentNumber;
	231
	232	/**
	233	* Start offset, in the match text, of the <em>rightmost</em>
	234	* match.
	235	*/
	236	int32_t matchStart;
	237
	238	/**
	239	* Limit offset, in the match text, of the <em>rightmost</em>
	240	* match.
	241	*/
	242	int32_t matchLimit;
	243
b75a7d8f A	244	};
	245
	246	U_NAMESPACE_END
	247
	248	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
	249
	250	#endif