]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | * Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved. | |
3 | ********************************************************************** | |
4 | * Date Name Description | |
5 | * 07/23/01 aliu Creation. | |
6 | ********************************************************************** | |
7 | */ | |
8 | #ifndef STRMATCH_H | |
9 | #define STRMATCH_H | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_TRANSLITERATION | |
14 | ||
15 | #include "unicode/unistr.h" | |
16 | #include "unicode/unifunct.h" | |
17 | #include "unicode/unimatch.h" | |
18 | #include "unicode/unirepl.h" | |
19 | ||
20 | U_NAMESPACE_BEGIN | |
21 | ||
22 | class TransliterationRuleData; | |
23 | ||
24 | /** | |
25 | * An object that matches a fixed input string, implementing the | |
26 | * UnicodeMatcher API. This object also implements the | |
27 | * UnicodeReplacer API, allowing it to emit the matched text as | |
28 | * output. Since the match text may contain flexible match elements, | |
29 | * such as UnicodeSets, the emitted text is not the match pattern, but | |
30 | * instead a substring of the actual matched text. Following | |
31 | * convention, the output text is the leftmost match seen up to this | |
32 | * point. | |
33 | * | |
34 | * A StringMatcher may represent a segment, in which case it has a | |
35 | * positive segment number. This affects how the matcher converts | |
36 | * itself to a pattern but does not otherwise affect its function. | |
37 | * | |
38 | * A StringMatcher that is not a segment should not be used as a | |
39 | * UnicodeReplacer. | |
40 | */ | |
41 | class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { | |
42 | ||
43 | public: | |
44 | ||
45 | /** | |
46 | * Construct a matcher that matches the given pattern string. | |
47 | * @param string the pattern to be matched, possibly containing | |
48 | * stand-ins that represent nested UnicodeMatcher objects. | |
49 | * @param start inclusive start index of text to be replaced | |
50 | * @param limit exclusive end index of text to be replaced; | |
51 | * must be greater than or equal to start. NOTE(review): the start/limit wording matches replace(); presumably these index into string -- confirm. | |
52 | * @param segmentNum the segment number from 1..n, or 0 if this is | |
53 | * not a segment. | |
54 | * @param data context object mapping stand-ins to | |
55 | * UnicodeMatcher objects. | |
56 | */ | |
57 | StringMatcher(const UnicodeString& string, | |
58 | int32_t start, | |
59 | int32_t limit, | |
60 | int32_t segmentNum, | |
61 | const TransliterationRuleData& data); | |
62 | ||
63 | /** | |
64 | * Copy constructor | |
65 | * @param o the object to be copied. | |
66 | */ | |
67 | StringMatcher(const StringMatcher& o); | |
68 | ||
69 | /** | |
70 | * Destructor | |
71 | */ | |
72 | virtual ~StringMatcher(); | |
73 | ||
74 | /** | |
75 | * Implement UnicodeFunctor | |
76 | * @return a copy of the object. | |
77 | */ | |
78 | virtual UnicodeFunctor* clone() const; | |
79 | ||
80 | /** | |
81 | * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer | |
82 | * and return the pointer. | |
83 | * @return the UnicodeMatcher pointer. | |
84 | */ | |
85 | virtual UnicodeMatcher* toMatcher() const; | |
86 | ||
87 | /** | |
88 | * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer | |
89 | * and return the pointer. | |
90 | * @return the UnicodeReplacer pointer. | |
91 | */ | |
92 | virtual UnicodeReplacer* toReplacer() const; | |
93 | ||
94 | /** | |
95 | * Implement UnicodeMatcher | |
96 | * @param text the text to be matched | |
97 | * @param offset on input, the index into text at which to begin | |
98 | * matching. On output, the limit of the matched text. The | |
99 | * number of matched characters is the output value of offset | |
100 | * minus the input value. Offset should always point to the | |
101 | * HIGH SURROGATE (leading code unit) of a pair of surrogates, | |
102 | * both on entry and upon return. | |
103 | * @param limit the limit index of text to be matched. Greater | |
104 | * than offset for a forward direction match, less than offset for | |
105 | * a backward direction match. The last character to be | |
106 | * considered for matching will be text.charAt(limit-1) in the | |
107 | * forward direction or text.charAt(limit+1) in the backward | |
108 | * direction. | |
109 | * @param incremental if TRUE, then assume further characters may | |
110 | * be inserted at limit and check for partial matching. Otherwise | |
111 | * assume the text as given is complete. | |
112 | * @return a match degree value indicating a full match, a partial | |
113 | * match, or a mismatch. If incremental is FALSE then | |
114 | * U_PARTIAL_MATCH should never be returned. | |
115 | */ | |
116 | virtual UMatchDegree matches(const Replaceable& text, | |
117 | int32_t& offset, | |
118 | int32_t limit, | |
119 | UBool incremental); | |
120 | ||
121 | /** | |
122 | * Implement UnicodeMatcher | |
123 | * @param result Output param to receive the pattern. | |
124 | * @param escapeUnprintable if TRUE then escape the unprintable characters. | |
125 | * @return A reference to 'result'. | |
126 | */ | |
127 | virtual UnicodeString& toPattern(UnicodeString& result, | |
128 | UBool escapeUnprintable = FALSE) const; | |
129 | ||
130 | /** | |
131 | * Implement UnicodeMatcher | |
132 | * Returns TRUE if this matcher will match a character c, where c | |
133 | * & 0xFF == v, at offset, in the forward direction (with limit > | |
134 | * offset). This is used by <tt>RuleBasedTransliterator</tt> for | |
135 | * indexing. | |
136 | * @param v the given value | |
137 | * @return TRUE if this matcher will match a character c, | |
138 | * where c & 0xFF == v | |
139 | */ | |
140 | virtual UBool matchesIndexValue(uint8_t v) const; | |
141 | ||
142 | /** | |
143 | * Implement UnicodeMatcher. NOTE(review): presumably unions the characters this matcher can match into toUnionTo -- confirm against the implementation. | |
144 | */ | |
145 | virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; | |
146 | ||
147 | /** | |
148 | * Implement UnicodeFunctor | |
149 | */ | |
150 | virtual void setData(const TransliterationRuleData*); | |
151 | ||
152 | /** | |
153 | * Replace characters in 'text' from 'start' to 'limit' with the | |
154 | * output text of this object. Update the 'cursor' parameter to | |
155 | * give the cursor position and return the length of the | |
156 | * replacement text. | |
157 | * | |
158 | * @param text the text to be matched | |
159 | * @param start inclusive start index of text to be replaced | |
160 | * @param limit exclusive end index of text to be replaced; | |
161 | * must be greater than or equal to start | |
162 | * @param cursor output parameter for the cursor position. | |
163 | * Not all replacer objects will update this, but in a complete | |
164 | * tree of replacer objects, representing the entire output side | |
165 | * of a transliteration rule, at least one must update it. | |
166 | * @return the number of 16-bit code units in the text replacing | |
167 | * the characters at offsets start..(limit-1) in text | |
168 | */ | |
169 | virtual int32_t replace(Replaceable& text, | |
170 | int32_t start, | |
171 | int32_t limit, | |
172 | int32_t& cursor); | |
173 | ||
174 | /** | |
175 | * Returns a string representation of this replacer. If the | |
176 | * result of calling this function is passed to the appropriate | |
177 | * parser, typically TransliteratorParser, it will produce another | |
178 | * replacer that is equal to this one. | |
179 | * @param result the string to receive the pattern. Previous | |
180 | * contents will be deleted. | |
181 | * @param escapeUnprintable if TRUE then convert unprintable | |
182 | * characters to their hex escape representations, \\uxxxx or | |
183 | * \\Uxxxxxxxx. Unprintable characters are defined by | |
184 | * Utility.isUnprintable(). | |
185 | * @return a reference to 'result'. | |
186 | */ | |
187 | virtual UnicodeString& toReplacerPattern(UnicodeString& result, | |
188 | UBool escapeUnprintable) const; | |
189 | ||
190 | /** | |
191 | * Remove any match data. This must be called before performing a | |
192 | * set of matches with this segment. | |
193 | */ | |
194 | void resetMatch(); | |
195 | ||
196 | /** | |
197 | * ICU "poor man's RTTI", returns a UClassID for the actual class. | |
198 | * | |
199 | * @draft ICU 2.2 | |
200 | */ | |
201 | virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); } | |
202 | ||
203 | /** | |
204 | * ICU "poor man's RTTI", returns a UClassID for this class. | |
205 | * | |
206 | * @draft ICU 2.2 | |
207 | */ | |
208 | static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; } | |
209 | ||
210 | /** | |
211 | * Union the set of all characters that may be output by this object | |
212 | * into the given set. | |
213 | * @param toUnionTo the set into which to union the output characters | |
214 | */ | |
215 | virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const; | |
216 | ||
217 | private: | |
218 | ||
219 | /** | |
220 | * The text to be matched. | |
221 | */ | |
222 | UnicodeString pattern; | |
223 | ||
224 | /** | |
225 | * Context object that maps stand-ins to matcher and replacer | |
226 | * objects. | |
227 | */ | |
228 | const TransliterationRuleData* data; | |
229 | ||
230 | /** | |
231 | * The segment number, 1-based, or 0 if not a segment. | |
232 | */ | |
233 | int32_t segmentNumber; | |
234 | ||
235 | /** | |
236 | * Start offset, in the match text, of the <em>rightmost</em> | |
237 | * match. NOTE(review): the class comment describes the output as the leftmost match -- confirm which is intended. | |
238 | */ | |
239 | int32_t matchStart; | |
240 | ||
241 | /** | |
242 | * Limit offset, in the match text, of the <em>rightmost</em> | |
243 | * match. | |
244 | */ | |
245 | int32_t matchLimit; | |
246 | ||
247 | /** | |
248 | * The address of this static class variable serves as this class's ID | |
249 | * for ICU "poor man's RTTI". | |
250 | */ | |
251 | static const char fgClassID; | |
252 | }; | |
253 | ||
254 | U_NAMESPACE_END | |
255 | ||
256 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | |
257 | ||
258 | #endif |