[apple/icu.git] / icuSources / i18n / strmatch.h

/*
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   07/23/01    aliu        Creation.
**********************************************************************
*/
#ifndef STRMATCH_H
#define STRMATCH_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/unistr.h"
#include "unicode/unifunct.h"
#include "unicode/unimatch.h"
#include "unicode/unirepl.h"

U_NAMESPACE_BEGIN

class TransliterationRuleData;

/**
 * An object that matches a fixed input string, implementing the
 * UnicodeMatcher API.  This object also implements the
 * UnicodeReplacer API, allowing it to emit the matched text as
 * output.  Since the match text may contain flexible match elements,
 * such as UnicodeSets, the emitted text is not the match pattern, but
 * instead a substring of the actual matched text.  Following
 * convention, the output text is the leftmost match seen up to this
 * point.
 *
 * NOTE(review): the sentence above says "leftmost", whereas the
 * comments on the private members matchStart and matchLimit say
 * "rightmost".  The header alone cannot settle which is correct;
 * confirm against the implementation of matches().
 *
 * A StringMatcher may represent a segment, in which case it has a
 * positive segment number.  This affects how the matcher converts
 * itself to a pattern but does not otherwise affect its function.
 *
 * A StringMatcher that is not a segment should not be used as a
 * UnicodeReplacer.
 *
 * This header only declares the class; apart from the two inline
 * class-ID accessors, all member functions are defined elsewhere.
 */
class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {

 public:

    /**
     * Construct a matcher that matches the given pattern string.
     * @param string the pattern to be matched, possibly containing
     * stand-ins that represent nested UnicodeMatcher objects.
     * @param start inclusive start index of text to be replaced
     * @param limit exclusive end index of text to be replaced;
     * must be greater than or equal to start
     * @param segmentNum the segment number from 1..n, or 0 if this is
     * not a segment.
     * @param data context object mapping stand-ins to
     * UnicodeMatcher objects.
     *
     * NOTE(review): the wording for start and limit is identical to
     * that of replace() below and looks copied from it.  Presumably
     * these are offsets into 'string' that delimit the pattern text;
     * confirm against the constructor definition.
     */
    StringMatcher(const UnicodeString& string,
                  int32_t start,
                  int32_t limit,
                  int32_t segmentNum,
                  const TransliterationRuleData& data);

    /**
     * Copy constructor
     * @param o  the object to be copied.
     */
    StringMatcher(const StringMatcher& o);
        
    /**
     * Destructor
     */
    virtual ~StringMatcher();

    /**
     * Implement UnicodeFunctor
     * @return a copy of the object, typed as a UnicodeFunctor pointer.
     */
    virtual UnicodeFunctor* clone() const;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     * and return the pointer.  Note that the function is const but
     * the returned pointer is not const-qualified.
     * @return the UnicodeMatcher pointer.
     */
    virtual UnicodeMatcher* toMatcher() const;

    /**
     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     * and return the pointer.  Note that the function is const but
     * the returned pointer is not const-qualified.
     * @return the UnicodeReplacer pointer.
     */
    virtual UnicodeReplacer* toReplacer() const;

    /**
     * Implement UnicodeMatcher
     * @param text the text to be matched
     * @param offset on input, the index into text at which to begin
     * matching.  On output, the limit of the matched text.  The
     * number of matched characters is the output value of offset
     * minus the input value.  Offset should always point to the
     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
     * both on entry and upon return.
     * @param limit the limit index of text to be matched.  Greater
     * than offset for a forward direction match, less than offset for
     * a backward direction match.  The last character to be
     * considered for matching will be text.charAt(limit-1) in the
     * forward direction or text.charAt(limit+1) in the backward
     * direction.
     * @param incremental  if TRUE, then assume further characters may
     * be inserted at limit and check for partial matching.  Otherwise
     * assume the text as given is complete.
     * @return a match degree value indicating a full match, a partial
     * match, or a mismatch.  If incremental is FALSE then
     * U_PARTIAL_MATCH should never be returned.
     */
    virtual UMatchDegree matches(const Replaceable& text,
                                 int32_t& offset,
                                 int32_t limit,
                                 UBool incremental);

    /**
     * Implement UnicodeMatcher
     * @param result            Output param to receive the pattern.
     * @param escapeUnprintable if TRUE then escape the unprintable
     *                          characters.  Defaults to FALSE.
     * @return                  A reference to 'result'.
     */
    virtual UnicodeString& toPattern(UnicodeString& result,
                                     UBool escapeUnprintable = FALSE) const;

    /**
     * Implement UnicodeMatcher
     * Returns TRUE if this matcher will match a character c, where c
     * & 0xFF == v, at offset, in the forward direction (with limit >
     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
     * indexing.
     * @param v    the given value
     * @return     TRUE if this matcher will match a character c, 
     *             where c & 0xFF == v
     */
    virtual UBool matchesIndexValue(uint8_t v) const;

    /**
     * Implement UnicodeMatcher
     * @param toUnionTo the set to be added to.  By analogy with
     * addReplacementSetTo() below, presumably this receives the
     * characters that this object may match -- confirm against the
     * UnicodeMatcher declaration.
     */
    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;

    /**
     * Implement UnicodeFunctor
     * The (unnamed) parameter is a pointer to a context object of the
     * same type as the private member 'data'; presumably it replaces
     * that member -- confirm against the definition.
     */
    virtual void setData(const TransliterationRuleData*);

    /**
     * Replace characters in 'text' from 'start' to 'limit' with the
     * output text of this object.  Update the 'cursor' parameter to
     * give the cursor position and return the length of the
     * replacement text.
     *
     * @param text the text in which characters are replaced
     * @param start inclusive start index of text to be replaced
     * @param limit exclusive end index of text to be replaced;
     * must be greater than or equal to start
     * @param cursor output parameter for the cursor position.
     * Not all replacer objects will update this, but in a complete
     * tree of replacer objects, representing the entire output side
     * of a transliteration rule, at least one must update it.
     * @return the number of 16-bit code units in the text replacing
     * the characters at offsets start..(limit-1) in text
     */
    virtual int32_t replace(Replaceable& text,
                            int32_t start,
                            int32_t limit,
                            int32_t& cursor);

    /**
     * Returns a string representation of this replacer.  If the
     * result of calling this function is passed to the appropriate
     * parser, typically TransliteratorParser, it will produce another
     * replacer that is equal to this one.
     * @param result the string to receive the pattern.  Previous
     * contents will be deleted.
     * @param escapeUnprintable if TRUE then convert unprintable
     * characters to their hex escape representations, \\uxxxx or
     * \\Uxxxxxxxx.  Unprintable characters are defined by
     * Utility.isUnprintable().  Unlike toPattern(), this parameter
     * has no default value.
     * @return a reference to 'result'.
     */
    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
                                             UBool escapeUnprintable) const;

    /**
     * Remove any match data.  This must be called before performing a
     * set of matches with this segment.
     */
    void resetMatch();

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     * This implementation forwards to getStaticClassID().
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     * The ID is the address of the static member fgClassID, cast to
     * UClassID.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }

    /**
     * Union the set of all characters that may be output by this
     * object into the given set.
     * @param toUnionTo the set into which to union the output characters
     */
    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;

 private:

    /**
     * The text to be matched.
     */
    UnicodeString pattern;

    /**
     * Context object that maps stand-ins to matcher and replacer
     * objects.  Held as a pointer to const.
     */
    const TransliterationRuleData* data;

    /**
     * The segment number, 1-based, or 0 if not a segment.
     */
    int32_t segmentNumber;

    /**
     * Start offset, in the match text, of the <em>rightmost</em>
     * match.
     */
    int32_t matchStart;

    /**
     * Limit offset, in the match text, of the <em>rightmost</em>
     * match.
     */
    int32_t matchLimit;

    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".  Only the address is used by
     * getStaticClassID().
     */
    static const char fgClassID;
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif
Commit	Line	Data
b75a7d8f A	1	/*
	2	* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
	3	**********************************************************************
	4	* Date Name Description
	5	* 07/23/01 aliu Creation.
	6	**********************************************************************
	7	*/
	8	#ifndef STRMATCH_H
	9	#define STRMATCH_H
	10
	11	#include "unicode/utypes.h"
	12
	13	#if !UCONFIG_NO_TRANSLITERATION
	14
	15	#include "unicode/unistr.h"
	16	#include "unicode/unifunct.h"
	17	#include "unicode/unimatch.h"
	18	#include "unicode/unirepl.h"
	19
	20	U_NAMESPACE_BEGIN
	21
	22	class TransliterationRuleData;
	23
	24	/**
	25	* An object that matches a fixed input string, implementing the
	26	* UnicodeMatcher API. This object also implements the
	27	* UnicodeReplacer API, allowing it to emit the matched text as
	28	* output. Since the match text may contain flexible match elements,
	29	* such as UnicodeSets, the emitted text is not the match pattern, but
	30	* instead a substring of the actual matched text. Following
	31	* convention, the output text is the leftmost match seen up to this
	32	* point.
	33	*
	34	* A StringMatcher may represent a segment, in which case it has a
	35	* positive segment number. This affects how the matcher converts
	36	* itself to a pattern but does not otherwise affect its function.
	37	*
	38	* A StringMatcher that is not a segment should not be used as a
	39	* UnicodeReplacer.
	40	*/
	41	class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
	42
	43	public:
	44
	45	/**
	46	* Construct a matcher that matches the given pattern string.
	47	* @param string the pattern to be matched, possibly containing
	48	* stand-ins that represent nested UnicodeMatcher objects.
	49	* @param start inclusive start index of text to be replaced
	50	* @param limit exclusive end index of text to be replaced;
	51	* must be greater than or equal to start
	52	* @param segmentNum the segment number from 1..n, or 0 if this is
	53	* not a segment.
	54	* @param data context object mapping stand-ins to
	55	* UnicodeMatcher objects.
	56	*/
	57	StringMatcher(const UnicodeString& string,
	58	int32_t start,
	59	int32_t limit,
	60	int32_t segmentNum,
	61	const TransliterationRuleData& data);
	62
	63	/**
	64	* Copy constructor
65	* @param o the object to be copied.
66	*/
67	StringMatcher(const StringMatcher& o);
68
69	/**
70	* Destructor
71	*/
72	virtual ~StringMatcher();
73
74	/**
75	* Implement UnicodeFunctor
76	* @return a copy of the object.
77	*/
78	virtual UnicodeFunctor* clone() const;
79
80	/**
81	* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
82	* and return the pointer.
	83	* @return the UnicodeMatcher pointer.
84	*/
85	virtual UnicodeMatcher* toMatcher() const;
86
87	/**
88	* UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
89	* and return the pointer.
90	* @return the UnicodeReplacer pointer.
91	*/
92	virtual UnicodeReplacer* toReplacer() const;
93
94	/**
95	* Implement UnicodeMatcher
96	* @param text the text to be matched
97	* @param offset on input, the index into text at which to begin
98	* matching. On output, the limit of the matched text. The
99	* number of matched characters is the output value of offset
100	* minus the input value. Offset should always point to the
101	* HIGH SURROGATE (leading code unit) of a pair of surrogates,
102	* both on entry and upon return.
103	* @param limit the limit index of text to be matched. Greater
104	* than offset for a forward direction match, less than offset for
105	* a backward direction match. The last character to be
106	* considered for matching will be text.charAt(limit-1) in the
107	* forward direction or text.charAt(limit+1) in the backward
108	* direction.
109	* @param incremental if TRUE, then assume further characters may
110	* be inserted at limit and check for partial matching. Otherwise
111	* assume the text as given is complete.
112	* @return a match degree value indicating a full match, a partial
113	* match, or a mismatch. If incremental is FALSE then
114	* U_PARTIAL_MATCH should never be returned.
115	*/
116	virtual UMatchDegree matches(const Replaceable& text,
117	int32_t& offset,
118	int32_t limit,
119	UBool incremental);
120
121	/**
122	* Implement UnicodeMatcher
123	* @param result Output param to receive the pattern.
124	* @param escapeUnprintable if True then escape the unprintable characters.
125	* @return A reference to 'result'.
126	*/
127	virtual UnicodeString& toPattern(UnicodeString& result,
128	UBool escapeUnprintable = FALSE) const;
129
130	/**
131	* Implement UnicodeMatcher
132	* Returns TRUE if this matcher will match a character c, where c
133	* & 0xFF == v, at offset, in the forward direction (with limit >
134	* offset). This is used by <tt>RuleBasedTransliterator</tt> for
135	* indexing.
136	* @param v the given value
137	* @return TRUE if this matcher will match a character c,
138	* where c & 0xFF == v
139	*/
140	virtual UBool matchesIndexValue(uint8_t v) const;
141
142	/**
143	* Implement UnicodeMatcher
144	*/
145	virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
146
147	/**
148	* Implement UnicodeFunctor
149	*/
150	virtual void setData(const TransliterationRuleData*);
151
152	/**
153	* Replace characters in 'text' from 'start' to 'limit' with the
154	* output text of this object. Update the 'cursor' parameter to
155	* give the cursor position and return the length of the
156	* replacement text.
157	*
158	* @param text the text to be matched
159	* @param start inclusive start index of text to be replaced
160	* @param limit exclusive end index of text to be replaced;
161	* must be greater than or equal to start
162	* @param cursor output parameter for the cursor position.
163	* Not all replacer objects will update this, but in a complete
164	* tree of replacer objects, representing the entire output side
165	* of a transliteration rule, at least one must update it.
166	* @return the number of 16-bit code units in the text replacing
167	* the characters at offsets start..(limit-1) in text
168	*/
169	virtual int32_t replace(Replaceable& text,
170	int32_t start,
171	int32_t limit,
172	int32_t& cursor);
173
174	/**
175	* Returns a string representation of this replacer. If the
176	* result of calling this function is passed to the appropriate
177	* parser, typically TransliteratorParser, it will produce another
178	* replacer that is equal to this one.
179	* @param result the string to receive the pattern. Previous
180	* contents will be deleted.
181	* @param escapeUnprintable if TRUE then convert unprintable
182	* character to their hex escape representations, \\uxxxx or
183	* \\Uxxxxxxxx. Unprintable characters are defined by
184	* Utility.isUnprintable().
185	* @return a reference to 'result'.
186	*/
187	virtual UnicodeString& toReplacerPattern(UnicodeString& result,
188	UBool escapeUnprintable) const;
189
190	/**
191	* Remove any match data. This must be called before performing a
192	* set of matches with this segment.
193	*/
194	void resetMatch();
195
196	/**
197	* ICU "poor man's RTTI", returns a UClassID for the actual class.
198	*
199	* @draft ICU 2.2
200	*/
201	virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
202
203	/**
204	* ICU "poor man's RTTI", returns a UClassID for this class.
205	*
206	* @draft ICU 2.2
207	*/
208	static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
209
210	/**
	211	* Union the set of all characters that may be output by this object
212	* into the given set.
213	* @param toUnionTo the set into which to union the output characters
214	*/
215	virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
216
217	private:
218
219	/**
220	* The text to be matched.
221	*/
222	UnicodeString pattern;
223
224	/**
225	* Context object that maps stand-ins to matcher and replacer
226	* objects.
227	*/
228	const TransliterationRuleData* data;
229
230	/**
231	* The segment number, 1-based, or 0 if not a segment.
232	*/
233	int32_t segmentNumber;
234
235	/**
236	* Start offset, in the match text, of the <em>rightmost</em>
237	* match.
238	*/
239	int32_t matchStart;
240
241	/**
242	* Limit offset, in the match text, of the <em>rightmost</em>
243	* match.
244	*/
245	int32_t matchLimit;
246
247	/**
248	* The address of this static class variable serves as this class's ID
249	* for ICU "poor man's RTTI".
250	*/
251	static const char fgClassID;
252	};
253
254	U_NAMESPACE_END
255
256	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
257
258	#endif