]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved. | |
3 | ********************************************************************** | |
4 | * Date Name Description | |
5 | * 11/17/99 aliu Creation. | |
6 | ********************************************************************** | |
7 | */ | |
8 | #ifndef RBT_RULE_H | |
9 | #define RBT_RULE_H | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_TRANSLITERATION | |
14 | ||
15 | #include "unicode/uobject.h" | |
16 | #include "unicode/unistr.h" | |
17 | #include "unicode/utrans.h" | |
18 | #include "unicode/unimatch.h" | |
19 | ||
20 | U_NAMESPACE_BEGIN | |
21 | ||
22 | class Replaceable; | |
23 | class TransliterationRuleData; | |
24 | class StringMatcher; | |
25 | class UnicodeFunctor; | |
26 | ||
27 | /** | |
28 | * A transliteration rule used by | |
29 | * <code>RuleBasedTransliterator</code>. | |
30 | * <code>TransliterationRule</code> is an immutable object. | |
31 | * | |
32 | * <p>A rule consists of an input pattern and an output string. When | |
33 | * the input pattern is matched, the output string is emitted. The | |
34 | * input pattern consists of zero or more characters which are matched | |
35 | * exactly (the key) and optional context. Context must match if it | |
36 | * is specified. Context may be specified before the key, after the | |
37 | * key, or both. The key, preceding context, and following context | |
38 | * may contain variables. Variables represent a set of Unicode | |
39 | * characters, such as the letters <i>a</i> through <i>z</i>. | |
40 | * Variables are detected by looking up each character in a supplied | |
41 | * variable list to see if it has been so defined. | |
42 | * | |
43 | * <p>A rule may contain segments in its input string and segment | |
44 | * references in its output string. A segment is a substring of the | |
45 | * input pattern, indicated by an offset and limit. The segment may | |
46 | * be in the preceding or following context. It may not span a | |
47 | * context boundary. A segment reference is a special character in | |
48 | * the output string that causes a segment of the input string (not | |
49 | * the input pattern) to be copied to the output string. The range of | |
50 | * special characters that represent segment references is defined by | |
51 | * RuleBasedTransliterator.Data. | |
52 | * | |
53 | * @author Alan Liu | |
54 | */ | |
55 | class TransliterationRule : public UMemory { | |
56 | ||
57 | private: | |
58 | ||
59 | // TODO Eliminate the pattern and keyLength data members. They | |
60 | // are used only by masks() and getIndexValue() which are called | |
61 | // only during build time, not during run-time. Perhaps these | |
62 | // methods and pattern/keyLength can be isolated into a separate | |
63 | // object. | |
64 | ||
65 | /** | |
66 | * The match that must occur before the key, or null if there is no | |
67 | * preceding context. | |
68 | */ | |
69 | StringMatcher *anteContext; | |
70 | ||
71 | /** | |
72 | * The matcher object for the key. If null, then the key is empty. | |
73 | */ | |
74 | StringMatcher *key; | |
75 | ||
76 | /** | |
77 | * The match that must occur after the key, or null if there is no | |
78 | * following context. | |
79 | */ | |
80 | StringMatcher *postContext; | |
81 | ||
82 | /** | |
83 | * The object that performs the replacement if the key, | |
84 | * anteContext, and postContext are matched. Never null. | |
85 | */ | |
86 | UnicodeFunctor* output; | |
87 | ||
88 | /** | |
89 | * The string that must be matched, consisting of the anteContext, key, | |
90 | * and postContext, concatenated together, in that order. Some components | |
91 | * may be empty (zero length). | |
92 | * @see anteContextLength | |
93 | * @see keyLength | |
94 | */ | |
95 | UnicodeString pattern; | |
96 | ||
97 | /** | |
98 | * An array of matcher objects corresponding to the input pattern | |
99 | * segments. If there are no segments this is null. N.B. The | |
100 | * elements are declared as UnicodeFunctor for generality, but in | |
101 | * practice each is always a StringMatcher. In the future we may | |
102 | * generalize this, but for now we sometimes cast down to StringMatcher. | |
103 | * | |
104 | * The array is owned, but the pointers within it are not. | |
105 | */ | |
106 | UnicodeFunctor** segments; | |
107 | ||
108 | /** | |
109 | * The number of elements in segments[] or zero if segments is NULL. | |
110 | */ | |
111 | int32_t segmentsCount; | |
112 | ||
113 | /** | |
114 | * The length of the string that must match before the key. If | |
115 | * zero, then there is no matching requirement before the key. | |
116 | * Substring [0,anteContextLength) of pattern is the anteContext. | |
117 | */ | |
118 | int32_t anteContextLength; | |
119 | ||
120 | /** | |
121 | * The length of the key. Substring [anteContextLength, | |
122 | * anteContextLength + keyLength) of pattern is the key. | |
123 | ||
124 | */ | |
125 | int32_t keyLength; | |
126 | ||
127 | /** | |
128 | * Miscellaneous attributes. Presumably a bit set of the ANCHOR_* flag values declared below -- TODO confirm. | |
129 | */ | |
130 | int8_t flags; | |
131 | ||
132 | /** | |
133 | * Flag attributes: distinct bit values (1 and 2) for the start and end anchors. | |
134 | */ | |
135 | enum { | |
136 | ANCHOR_START = 1, | |
137 | ANCHOR_END = 2 | |
138 | }; | |
139 | ||
140 | /** | |
141 | * An alias pointer to the data for this rule. The data provides | |
142 | * lookup services for matchers and segments. | |
143 | */ | |
144 | const TransliterationRuleData* data; | |
145 | ||
146 | public: | |
147 | ||
148 | /** | |
149 | * Construct a new rule with the given input, output text, and other | |
150 | * attributes. A cursor position may be specified for the output text. | |
151 | * @param input input string, including key and optional ante and | |
152 | * post context. | |
153 | * @param anteContextPos offset into input to end of ante context, or -1 if | |
154 | * none. Must be <= input.length() if not -1. | |
155 | * @param postContextPos offset into input to start of post context, or -1 | |
156 | * if none. Must be <= input.length() if not -1, and must be >= | |
157 | * anteContextPos. | |
158 | * @param outputStr output string. | |
159 | * @param cursorPosition offset into output at which cursor is located, or -1 if | |
160 | * none. If less than zero, then the cursor is placed after the | |
161 | * <code>outputStr</code>; that is, -1 is equivalent to | |
162 | * <code>outputStr.length()</code>. If greater than | |
163 | * <code>outputStr.length()</code> then construction fails (presumably reported through status -- TODO confirm). | |
164 | * @param cursorOffset an offset to be added to cursorPosition to position the | |
165 | * cursor either in the ante context, if < 0, or in the post context, if > | |
166 | * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to | |
167 | * "xyz" and moves the cursor to before "a". It would have a cursorOffset | |
168 | * of -3. | |
169 | * @param segs array of UnicodeFunctor corresponding to input pattern | |
170 | * segments, or null if there are none. The array itself is adopted, | |
171 | * but the pointers within it are not. | |
172 | * @param segsCount number of elements in segs[]. | |
173 | * @param anchorStart TRUE if the rule is anchored on the left to | |
174 | * the context start. | |
175 | * @param anchorEnd TRUE if the rule is anchored on the right to the | |
176 | * context limit. | |
177 | * @param data the rule data; the data member is documented as an alias pointer to it. | |
178 | * @param status Output parameter filled in with success or failure status. | |
179 | */ | |
180 | TransliterationRule(const UnicodeString& input, | |
181 | int32_t anteContextPos, int32_t postContextPos, | |
182 | const UnicodeString& outputStr, | |
183 | int32_t cursorPosition, int32_t cursorOffset, | |
184 | UnicodeFunctor** segs, | |
185 | int32_t segsCount, | |
186 | UBool anchorStart, UBool anchorEnd, | |
187 | const TransliterationRuleData* data, | |
188 | UErrorCode& status); | |
189 | ||
190 | /** | |
191 | * Copy constructor. Note that the source is taken by non-const reference. | |
192 | * @param other the object to be copied. | |
193 | */ | |
194 | TransliterationRule(TransliterationRule& other); | |
195 | ||
196 | /** | |
197 | * Destructor. | |
198 | */ | |
199 | virtual ~TransliterationRule(); | |
200 | ||
201 | /** | |
202 | * Change the data object that this rule belongs to. Used | |
203 | * internally by the TransliterationRuleData copy constructor. | |
204 | * @param data the new data value to be set. | |
205 | */ | |
206 | void setData(const TransliterationRuleData* data); | |
207 | ||
208 | /** | |
209 | * Return the preceding context length. This method is needed to | |
210 | * support the <code>Transliterator</code> method | |
211 | * <code>getMaximumContextLength()</code>. Internally, this is | |
212 | * implemented as the anteContextLength, optionally plus one if | |
213 | * there is a start anchor. The one character anchor gap is | |
214 | * needed to make repeated incremental transliteration with | |
215 | * anchors work. | |
216 | * @return the preceding context length. | |
217 | */ | |
218 | virtual int32_t getContextLength(void) const; | |
219 | ||
220 | /** | |
221 | * Internal method. Returns 8-bit index value for this rule. | |
222 | * This is the low byte of the first character of the key, | |
223 | * unless the first character of the key is a set. If it's a | |
224 | * set, or otherwise can match multiple keys, the index value is -1. | |
225 | * @return 8-bit index value for this rule, or -1 as described above. | |
226 | */ | |
227 | int16_t getIndexValue() const; | |
228 | ||
229 | /** | |
230 | * Internal method. Returns true if this rule matches the given | |
231 | * index value. The index value is an 8-bit integer, 0..255, | |
232 | * representing the low byte of the first character of the key. | |
233 | * It matches this rule if it matches the first character of the | |
234 | * key, or if the first character of the key is a set, and the set | |
235 | * contains any character with a low byte equal to the index | |
236 | * value. If the rule contains only ante context, as in foo)>bar, | |
237 | * then it will match any key. | |
238 | * @param v the given index value. | |
239 | * @return true if this rule matches the given index value. | |
240 | */ | |
241 | UBool matchesIndexValue(uint8_t v) const; | |
242 | ||
243 | /** | |
244 | * Return true if this rule masks another rule. If r1 masks r2 then | |
245 | * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks | |
246 | * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". | |
247 | * "[c]a>x" masks "[dc]a>y". | |
248 | * @param r2 the given rule to be compared with. | |
249 | * @return true if this rule masks 'r2' | |
250 | */ | |
251 | virtual UBool masks(const TransliterationRule& r2) const; | |
252 | ||
253 | /** | |
254 | * Attempt a match and replacement at the given position. Return | |
255 | * the degree of match between this rule and the given text. The | |
256 | * degree of match may be mismatch, a partial match, or a full | |
257 | * match. A mismatch means at least one character of the text | |
258 | * does not match the context or key. A partial match means some | |
259 | * context and key characters match, but the text is not long | |
260 | * enough to match all of them. A full match means all context | |
261 | * and key characters match. | |
262 | * | |
263 | * If a full match is obtained, perform a replacement, update pos, | |
264 | * and return U_MATCH. Otherwise both text and pos are unchanged. | |
265 | * | |
266 | * @param text the text | |
267 | * @param pos the position indices | |
268 | * @param incremental if TRUE, test for partial matches that may | |
269 | * be completed by additional text inserted at pos.limit. | |
270 | * @return one of <code>U_MISMATCH</code>, | |
271 | * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If | |
272 | * incremental is FALSE then U_PARTIAL_MATCH will not be returned. | |
273 | */ | |
274 | UMatchDegree matchAndReplace(Replaceable& text, | |
275 | UTransPosition& pos, | |
276 | UBool incremental) const; | |
277 | ||
278 | /** | |
279 | * Append a rule string that represents this rule object to pat. | |
280 | * NOTE(review): presumably returns pat and escapes unprintable characters when escapeUnprintable is TRUE -- confirm. | |
281 | */ | |
282 | virtual UnicodeString& toRule(UnicodeString& pat, | |
283 | UBool escapeUnprintable) const; | |
284 | ||
285 | /** | |
286 | * Union the set of all characters that may be modified by this rule | |
287 | * into toUnionTo, the caller-supplied set, which is updated in place. | |
288 | */ | |
289 | void addSourceSetTo(UnicodeSet& toUnionTo) const; | |
290 | ||
291 | /** | |
292 | * Union the set of all characters that may be emitted by this rule | |
293 | * into toUnionTo, the caller-supplied set, which is updated in place. | |
294 | */ | |
295 | void addTargetSetTo(UnicodeSet& toUnionTo) const; | |
296 | ||
297 | private: | |
298 | ||
299 | friend class StringMatcher; | |
300 | ||
301 | TransliterationRule &operator=(const TransliterationRule &other); // forbid assignment: operator= is declared private | |
302 | }; | |
303 | ||
304 | U_NAMESPACE_END | |
305 | ||
306 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | |
307 | ||
308 | #endif |