]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
374ca955 | 3 | * Copyright (C) 1999-2004, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * Date Name Description | |
7 | * 11/17/99 aliu Creation. | |
8 | ********************************************************************** | |
9 | */ | |
10 | #ifndef RBT_H | |
11 | #define RBT_H | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | ||
15 | #if !UCONFIG_NO_TRANSLITERATION | |
16 | ||
17 | #include "unicode/translit.h" | |
18 | #include "unicode/utypes.h" | |
19 | #include "unicode/parseerr.h" | |
374ca955 A |
20 | #include "unicode/udata.h" |
21 | ||
22 | #define U_ICUDATA_TRANSLIT U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "translit" | |
b75a7d8f A |
23 | |
24 | U_NAMESPACE_BEGIN | |
25 | ||
26 | class TransliterationRuleData; | |
27 | ||
28 | /** | |
29 | * <code>RuleBasedTransliterator</code> is a transliterator | |
30 | * that reads a set of rules in order to determine how to perform | |
31 | * translations. Rule sets are stored in resource bundles indexed by | |
32 | * name. Rules within a rule set are separated by semicolons (';'). | |
33 | * To include a literal semicolon, prefix it with a backslash ('\'). | |
34 | * Whitespace, as defined by <code>Character.isWhitespace()</code>, | |
35 | * is ignored. If the first non-blank character on a line is '#', | |
36 | * the entire line is ignored as a comment. </p> | |
37 | * | |
38 | * <p>Each set of rules consists of two groups, one forward, and one | |
39 | * reverse. This is a convention that is not enforced; rules for one | |
40 | * direction may be omitted, with the result that translations in | |
41 | * that direction will not modify the source text. In addition, | |
42 | * bidirectional forward-reverse rules may be specified for | |
43 | * symmetrical transformations.</p> | |
44 | * | |
45 | * <p><b>Rule syntax</b> </p> | |
46 | * | |
47 | * <p>Rule statements take one of the following forms: </p> | |
48 | * | |
49 | * <dl> | |
50 | * <dt><code>$alefmadda=\u0622;</code></dt> | |
51 | * <dd><strong>Variable definition.</strong> The name on the | |
52 | * left is assigned the text on the right. In this example, | |
53 | * after this statement, instances of the left hand name, | |
54 | * "<code>$alefmadda</code>", will be replaced by | |
55 | * the Unicode character U+0622. Variable names must begin | |
56 | * with a letter and consist only of letters, digits, and | |
57 | * underscores. Case is significant. Duplicate names cause | |
58 | * an exception to be thrown, that is, variables cannot be | |
59 | * redefined. The right hand side may contain well-formed | |
60 | * text of any length, including no text at all ("<code>$empty=;</code>"). | |
61 | * The right hand side may contain embedded <code>UnicodeSet</code> | |
62 | * patterns, for example, "<code>$softvowel=[eiyEIY]</code>".</dd> | |
63 | * <dd> </dd> | |
64 | * <dt><code>ai>$alefmadda;</code></dt> | |
65 | * <dd><strong>Forward translation rule.</strong> This rule | |
66 | * states that the string on the left will be changed to the | |
67 | * string on the right when performing forward | |
68 | * transliteration.</dd> | |
69 | * <dt> </dt> | |
70 | * <dt><code>ai<$alefmadda;</code></dt> | |
71 | * <dd><strong>Reverse translation rule.</strong> This rule | |
72 | * states that the string on the right will be changed to | |
73 | * the string on the left when performing reverse | |
74 | * transliteration.</dd> | |
75 | * </dl> | |
76 | * | |
77 | * <dl> | |
78 | * <dt><code>ai<>$alefmadda;</code></dt> | |
79 | * <dd><strong>Bidirectional translation rule.</strong> This | |
80 | * rule states that the string on the right will be changed | |
81 | * to the string on the left when performing forward | |
82 | * transliteration, and vice versa when performing reverse | |
83 | * transliteration.</dd> | |
84 | * </dl> | |
85 | * | |
86 | * <p>Translation rules consist of a <em>match pattern</em> and an <em>output | |
87 | * string</em>. The match pattern consists of literal characters, | |
88 | * optionally preceded by context, and optionally followed by | |
89 | * context. Context characters, like literal pattern characters, | |
90 | * must be matched in the text being transliterated. However, unlike | |
91 | * literal pattern characters, they are not replaced by the output | |
92 | * text. For example, the pattern "<code>abc{def}</code>" | |
93 | * indicates the characters "<code>def</code>" must be | |
94 | * preceded by "<code>abc</code>" for a successful match. | |
95 | * If there is a successful match, "<code>def</code>" will | |
96 | * be replaced, but not "<code>abc</code>". The final '<code>}</code>' | |
97 | * is optional, so "<code>abc{def</code>" is equivalent to | |
98 | * "<code>abc{def}</code>". Another example is "<code>{123}456</code>" | |
99 | * (or "<code>123}456</code>") in which the literal | |
100 | * pattern "<code>123</code>" must be followed by "<code>456</code>". | |
101 | * </p> | |
102 | * | |
103 | * <p>The output string of a forward or reverse rule consists of | |
104 | * characters to replace the literal pattern characters. If the | |
105 | * output string contains the character '<code>|</code>', this is | |
106 | * taken to indicate the location of the <em>cursor</em> after | |
107 | * replacement. The cursor is the point in the text at which the | |
108 | * next replacement, if any, will be applied. The cursor is usually | |
109 | * placed within the replacement text; however, it can actually be | |
110 | * placed into the preceding or following context by using the | |
111 | * special character '<code>@</code>'. Examples:</p> | |
112 | * | |
113 | * <blockquote> | |
114 | * <p><code>a {foo} z > | @ bar; # foo -> bar, move cursor | |
115 | * before a<br> | |
116 | * {foo} xyz > bar @@|; # foo -> bar, cursor between | |
117 | * y and z</code></p> | |
118 | * </blockquote> | |
119 | * | |
120 | * <p><b>UnicodeSet</b></p> | |
121 | * | |
122 | * <p><code>UnicodeSet</code> patterns may appear anywhere that | |
123 | * makes sense. They may appear in variable definitions. | |
124 | * Contrariwise, <code>UnicodeSet</code> patterns may themselves | |
125 | * contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>", | |
126 | * or "<code>$range=a-z;$ll=[$range]</code>".</p> | |
127 | * | |
128 | * <p><code>UnicodeSet</code> patterns may also be embedded directly | |
129 | * into rule strings. Thus, the following two rules are equivalent:</p> | |
130 | * | |
131 | * <blockquote> | |
132 | * <p><code>$vowel=[aeiou]; $vowel>'*'; # One way to do this<br> | |
133 | * [aeiou]>'*'; | |
134 | * # | |
135 | * Another way</code></p> | |
136 | * </blockquote> | |
137 | * | |
138 | * <p>See {@link UnicodeSet} for more documentation and examples.</p> | |
139 | * | |
140 | * <p><b>Segments</b></p> | |
141 | * | |
142 | * <p>Segments of the input string can be matched and copied to the | |
143 | * output string. This makes certain sets of rules simpler and more | |
144 | * general, and makes reordering possible. For example:</p> | |
145 | * | |
146 | * <blockquote> | |
147 | * <p><code>([a-z]) > $1 $1; | |
148 | * # | |
149 | * double lowercase letters<br> | |
150 | * ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs</code></p> | |
151 | * </blockquote> | |
152 | * | |
153 | * <p>The segment of the input string to be copied is delimited by | |
154 | * "<code>(</code>" and "<code>)</code>". Up to | |
155 | * nine segments may be defined. Segments may not overlap. In the | |
156 | * output string, "<code>$1</code>" through "<code>$9</code>" | |
157 | * represent the input string segments, in left-to-right order of | |
158 | * definition.</p> | |
159 | * | |
160 | * <p><b>Anchors</b></p> | |
161 | * | |
162 | * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the | |
163 | * special characters '<code>^</code>' and '<code>$</code>'. For example:</p> | |
164 | * | |
165 | * <blockquote> | |
166 | * <p><code>^ a > 'BEG_A'; # match 'a' at start of text<br> | |
167 | * a > 'A'; # match other instances | |
168 | * of 'a'<br> | |
169 | * z $ > 'END_Z'; # match 'z' at end of text<br> | |
170 | * z > 'Z'; # match other instances | |
171 | * of 'z'</code></p> | |
172 | * </blockquote> | |
173 | * | |
174 | * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>. | |
175 | * This is done by including a virtual anchor character '<code>$</code>' at the end of the | |
176 | * set pattern. Although this is usually the match character for the end anchor, the set will | |
177 | * match either the beginning or the end of the text, depending on its placement. For | |
178 | * example:</p> | |
179 | * | |
180 | * <blockquote> | |
181 | * <p><code>$x = [a-z$]; # match 'a' through 'z' OR anchor<br> | |
182 | * $x 1 > 2; # match '1' after a-z or at the start<br> | |
183 | * 3 $x > 4; # match '3' before a-z or at the end</code></p> | |
184 | * </blockquote> | |
185 | * | |
186 | * <p><b>Example</b> </p> | |
187 | * | |
188 | * <p>The following example rules illustrate many of the features of | |
189 | * the rule language. </p> | |
190 | * | |
191 | * <table border="0" cellpadding="4"> | |
192 | * <tr> | |
193 | * <td valign="top">Rule 1.</td> | |
194 | * <td valign="top" nowrap><code>abc{def}>x|y</code></td> | |
195 | * </tr> | |
196 | * <tr> | |
197 | * <td valign="top">Rule 2.</td> | |
198 | * <td valign="top" nowrap><code>xyz>r</code></td> | |
199 | * </tr> | |
200 | * <tr> | |
201 | * <td valign="top">Rule 3.</td> | |
202 | * <td valign="top" nowrap><code>yz>q</code></td> | |
203 | * </tr> | |
204 | * </table> | |
205 | * | |
206 | * <p>Applying these rules to the string "<code>adefabcdefz</code>" | |
207 | * yields the following results: </p> | |
208 | * | |
209 | * <table border="0" cellpadding="4"> | |
210 | * <tr> | |
211 | * <td valign="top" nowrap><code>|adefabcdefz</code></td> | |
212 | * <td valign="top">Initial state, no rules match. Advance | |
213 | * cursor.</td> | |
214 | * </tr> | |
215 | * <tr> | |
216 | * <td valign="top" nowrap><code>a|defabcdefz</code></td> | |
217 | * <td valign="top">Still no match. Rule 1 does not match | |
218 | * because the preceding context is not present.</td> | |
219 | * </tr> | |
220 | * <tr> | |
221 | * <td valign="top" nowrap><code>ad|efabcdefz</code></td> | |
222 | * <td valign="top">Still no match. Keep advancing until | |
223 | * there is a match...</td> | |
224 | * </tr> | |
225 | * <tr> | |
226 | * <td valign="top" nowrap><code>ade|fabcdefz</code></td> | |
227 | * <td valign="top">...</td> | |
228 | * </tr> | |
229 | * <tr> | |
230 | * <td valign="top" nowrap><code>adef|abcdefz</code></td> | |
231 | * <td valign="top">...</td> | |
232 | * </tr> | |
233 | * <tr> | |
234 | * <td valign="top" nowrap><code>adefa|bcdefz</code></td> | |
235 | * <td valign="top">...</td> | |
236 | * </tr> | |
237 | * <tr> | |
238 | * <td valign="top" nowrap><code>adefab|cdefz</code></td> | |
239 | * <td valign="top">...</td> | |
240 | * </tr> | |
241 | * <tr> | |
242 | * <td valign="top" nowrap><code>adefabc|defz</code></td> | |
243 | * <td valign="top">Rule 1 matches; replace "<code>def</code>" | |
244 | * with "<code>xy</code>" and back up the cursor | |
245 | * to before the '<code>y</code>'.</td> | |
246 | * </tr> | |
247 | * <tr> | |
248 | * <td valign="top" nowrap><code>adefabcx|yz</code></td> | |
249 | * <td valign="top">Although "<code>xyz</code>" is | |
250 | * present, rule 2 does not match because the cursor is | |
251 | * before the '<code>y</code>', not before the '<code>x</code>'. | |
252 | * Rule 3 does match. Replace "<code>yz</code>" | |
253 | * with "<code>q</code>".</td> | |
254 | * </tr> | |
255 | * <tr> | |
256 | * <td valign="top" nowrap><code>adefabcxq|</code></td> | |
257 | * <td valign="top">The cursor is at the end; | |
258 | * transliteration is complete.</td> | |
259 | * </tr> | |
260 | * </table> | |
261 | * | |
262 | * <p>The order of rules is significant. If multiple rules may match | |
263 | * at some point, the first matching rule is applied. </p> | |
264 | * | |
265 | * <p>Forward and reverse rules may have an empty output string. | |
266 | * Otherwise, an empty left or right hand side of any statement is a | |
267 | * syntax error. </p> | |
268 | * | |
269 | * <p>Single quotes are used to quote any character other than a | |
270 | * digit or letter. To specify a single quote itself, inside or | |
271 | * outside of quotes, use two single quotes in a row. For example, | |
272 | * the rule "<code>'>'>o''clock</code>" changes the | |
273 | * string "<code>></code>" to the string "<code>o'clock</code>". | |
274 | * </p> | |
275 | * | |
276 | * <p><b>Notes</b> </p> | |
277 | * | |
278 | * <p>While a RuleBasedTransliterator is being built, it checks that | |
279 | * the rules are added in proper order. For example, if the rule | |
280 | * "a>x" is followed by the rule "ab>y", | |
281 | * then the second rule will throw an exception. The reason is that | |
282 | * the second rule can never be triggered, since the first rule | |
283 | * always matches anything it matches. In other words, the first | |
284 | * rule <em>masks</em> the second rule. </p> | |
285 | * | |
286 | * @author Alan Liu | |
287 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
288 | */ | |
289 | class U_I18N_API RuleBasedTransliterator : public Transliterator { | |
374ca955 | 290 | private: |
b75a7d8f A |
291 | /** |
292 | * The data object is immutable, so we can freely share it with | |
293 | * other instances of RBT, as long as we do NOT own this object. | |
374ca955 | 294 | * TODO: data is no longer immutable. See bugs #1866, 2155 |
b75a7d8f | 295 | */ |
374ca955 | 296 | TransliterationRuleData* fData; |
b75a7d8f A |
297 | |
298 | /** | |
299 | * If true, we own the data object and must delete it. | |
300 | */ | |
301 | UBool isDataOwned; | |
302 | ||
303 | public: | |
304 | ||
305 | /** | |
306 | * Constructs a new transliterator from the given rules. | |
307 | * @param rules rules, separated by ';' | |
308 | * @param direction either FORWARD or REVERSE. | |
309 | * @exception IllegalArgumentException if rules are malformed. | |
310 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
311 | */ | |
312 | RuleBasedTransliterator(const UnicodeString& id, | |
313 | const UnicodeString& rules, | |
314 | UTransDirection direction, | |
315 | UnicodeFilter* adoptedFilter, | |
316 | UParseError& parseError, | |
317 | UErrorCode& status); | |
318 | ||
319 | /** | |
320 | * Constructs a new transliterator from the given rules. | |
321 | * @param rules rules, separated by ';' | |
322 | * @param direction either FORWARD or REVERSE. | |
323 | * @exception IllegalArgumentException if rules are malformed. | |
324 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
325 | */ | |
326 | RuleBasedTransliterator(const UnicodeString& id, | |
327 | const UnicodeString& rules, | |
328 | UTransDirection direction, | |
329 | UnicodeFilter* adoptedFilter, | |
330 | UErrorCode& status); | |
331 | ||
332 | /** | |
333 | * Convenience constructor with no filter. | |
334 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
335 | */ | |
336 | RuleBasedTransliterator(const UnicodeString& id, | |
337 | const UnicodeString& rules, | |
338 | UTransDirection direction, | |
339 | UErrorCode& status); | |
340 | ||
341 | /** | |
342 | * Convenience constructor with no filter and FORWARD direction. | |
343 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
344 | */ | |
345 | RuleBasedTransliterator(const UnicodeString& id, | |
346 | const UnicodeString& rules, | |
347 | UErrorCode& status); | |
348 | ||
349 | /** | |
350 | * Convenience constructor with FORWARD direction. | |
351 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
352 | */ | |
353 | RuleBasedTransliterator(const UnicodeString& id, | |
354 | const UnicodeString& rules, | |
355 | UnicodeFilter* adoptedFilter, | |
356 | UErrorCode& status); | |
374ca955 | 357 | private: |
b75a7d8f A |
358 | |
359 | friend class TransliteratorRegistry; // to access TransliterationRuleData convenience ctor | |
360 | /** | |
361 | * Convenience constructor. | |
362 | * @param id the id for the transliterator. | |
363 | * @param theData the rule data for the transliterator. | |
364 | * @param adoptedFilter the filter for the transliterator | |
365 | */ | |
366 | RuleBasedTransliterator(const UnicodeString& id, | |
367 | const TransliterationRuleData* theData, | |
368 | UnicodeFilter* adoptedFilter = 0); | |
369 | ||
370 | ||
371 | friend class Transliterator; // to access following ctor | |
372 | ||
373 | /** | |
374 | * Internal constructor. | |
375 | * @param id the id for the transliterator. | |
376 | * @param data the rule data for the transliterator. | |
377 | * @param isDataAdopted determines who will own the 'data' object. If true, the caller should not delete 'data'. | |
378 | */ | |
379 | RuleBasedTransliterator(const UnicodeString& id, | |
380 | TransliterationRuleData* data, | |
381 | UBool isDataAdopted); | |
382 | ||
374ca955 | 383 | public: |
b75a7d8f A |
384 | |
385 | /** | |
386 | * Copy constructor. | |
387 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
388 | */ | |
389 | RuleBasedTransliterator(const RuleBasedTransliterator&); | |
390 | ||
391 | virtual ~RuleBasedTransliterator(); | |
392 | ||
393 | /** | |
394 | * Implement Transliterator API. | |
395 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
396 | */ | |
374ca955 | 397 | virtual Transliterator* clone(void) const; |
b75a7d8f | 398 | |
374ca955 | 399 | protected: |
b75a7d8f A |
400 | /** |
401 | * Implements {@link Transliterator#handleTransliterate}. | |
402 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
403 | */ | |
404 | virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets, | |
405 | UBool isIncremental) const; | |
406 | ||
374ca955 | 407 | public: |
b75a7d8f A |
408 | /** |
409 | * Return a representation of this transliterator as source rules. | |
410 | * These rules will produce an equivalent transliterator if used | |
411 | * to construct a new transliterator. | |
412 | * @param result the string to receive the rules. Previous | |
413 | * contents will be deleted. | |
414 | * @param escapeUnprintable if TRUE then convert unprintable | |
415 | * characters to their hex escape representations, \uxxxx or | |
416 | * \Uxxxxxxxx. Unprintable characters are those other than | |
417 | * U+000A, U+0020..U+007E. | |
418 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
419 | */ | |
420 | virtual UnicodeString& toRules(UnicodeString& result, | |
421 | UBool escapeUnprintable) const; | |
422 | ||
374ca955 | 423 | protected: |
b75a7d8f A |
424 | /** |
425 | * Implement Transliterator framework | |
426 | */ | |
427 | virtual void handleGetSourceSet(UnicodeSet& result) const; | |
428 | ||
374ca955 | 429 | public: |
b75a7d8f A |
430 | /** |
431 | * Override Transliterator framework | |
432 | */ | |
433 | virtual UnicodeSet& getTargetSet(UnicodeSet& result) const; | |
434 | ||
435 | /** | |
436 | * Return the class ID for this class. This is useful only for | |
437 | * comparing to a return value from getDynamicClassID(). For example: | |
438 | * <pre> | |
439 | * . Base* polymorphic_pointer = createPolymorphicObject(); | |
440 | * . if (polymorphic_pointer->getDynamicClassID() == | |
441 | * . Derived::getStaticClassID()) ... | |
442 | * </pre> | |
443 | * @return The class ID for all objects of this class. | |
444 | * @internal Use transliterator factory methods instead, since this class will be removed in a future release. | |
445 | */ | |
374ca955 | 446 | static UClassID U_EXPORT2 getStaticClassID(void); |
b75a7d8f A |
447 | |
448 | /** | |
449 | * Returns a unique class ID <b>polymorphically</b>. This method | |
450 | * is to implement a simple version of RTTI, since not all C++ | |
451 | * compilers support genuine RTTI. Polymorphic operator==() and | |
452 | * clone() methods call this method. | |
453 | * | |
b75a7d8f A |
454 | * @return The class ID for this object. All objects of a given |
455 | * class have the same class ID. Objects of other classes have | |
456 | * different class IDs. | |
457 | */ | |
374ca955 | 458 | virtual UClassID getDynamicClassID(void) const; |
b75a7d8f A |
459 | |
460 | private: | |
461 | ||
b75a7d8f A |
462 | void _construct(const UnicodeString& rules, |
463 | UTransDirection direction, | |
464 | UParseError& parseError, | |
465 | UErrorCode& status); | |
466 | }; | |
467 | ||
b75a7d8f A |
468 | |
469 | U_NAMESPACE_END | |
470 | ||
471 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | |
472 | ||
473 | #endif |