]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ********************************************************************** | |
46f4442e | 5 | * Copyright (C) 1999-2007, International Business Machines |
b75a7d8f A |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** | |
8 | * Date Name Description | |
9 | * 11/17/99 aliu Creation. | |
10 | ********************************************************************** | |
11 | */ | |
12 | #ifndef RBT_H | |
13 | #define RBT_H | |
14 | ||
15 | #include "unicode/utypes.h" | |
16 | ||
17 | #if !UCONFIG_NO_TRANSLITERATION | |
18 | ||
19 | #include "unicode/translit.h" | |
20 | #include "unicode/utypes.h" | |
21 | #include "unicode/parseerr.h" | |
374ca955 A |
22 | #include "unicode/udata.h" |
23 | ||
24 | #define U_ICUDATA_TRANSLIT U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "translit" | |
b75a7d8f A |
25 | |
26 | U_NAMESPACE_BEGIN | |
27 | ||
28 | class TransliterationRuleData; | |
29 | ||
30 | /** | |
31 | * <p><code>RuleBasedTransliterator</code> is a transliterator | |
32 | * that reads a set of rules in order to determine how to perform | |
33 | * translations. Rule sets are stored in resource bundles indexed by | |
34 | * name. Rules within a rule set are separated by semicolons (';'). | |
35 | * To include a literal semicolon, prefix it with a backslash ('\'). | |
36 | * Whitespace, as defined by <code>Character.isWhitespace()</code>, | |
37 | * is ignored. If the first non-blank character on a line is '#', | |
38 | * the entire line is ignored as a comment. </p> | |
39 | * | |
40 | * <p>Each set of rules consists of two groups, one forward, and one | |
41 | * reverse. This is a convention that is not enforced; rules for one | |
42 | * direction may be omitted, with the result that translations in | |
43 | * that direction will not modify the source text. In addition, | |
44 | * bidirectional forward-reverse rules may be specified for | |
45 | * symmetrical transformations.</p> | |
46 | * | |
47 | * <p><b>Rule syntax</b> </p> | |
48 | * | |
49 | * <p>Rule statements take one of the following forms: </p> | |
50 | * | |
51 | * <dl> | |
52 | * <dt><code>$alefmadda=\u0622;</code></dt> | |
53 | * <dd><strong>Variable definition.</strong> The name on the | |
54 | * left is assigned the text on the right. In this example, | |
55 | * after this statement, instances of the left hand name, | |
56 | * "<code>$alefmadda</code>", will be replaced by | |
57 | * the Unicode character U+0622. Variable names must begin | |
58 | * with a letter and consist only of letters, digits, and | |
59 | * underscores. Case is significant. Duplicate names cause | |
60 | * an exception to be thrown, that is, variables cannot be | |
61 | * redefined. The right hand side may contain well-formed | |
62 | * text of any length, including no text at all ("<code>$empty=;</code>"). | |
63 | * The right hand side may contain embedded <code>UnicodeSet</code> | |
64 | * patterns, for example, "<code>$softvowel=[eiyEIY]</code>".</dd> | |
65 | * <dd> </dd> | |
66 | * <dt><code>ai>$alefmadda;</code></dt> | |
67 | * <dd><strong>Forward translation rule.</strong> This rule | |
68 | * states that the string on the left will be changed to the | |
69 | * string on the right when performing forward | |
70 | * transliteration.</dd> | |
71 | * <dt> </dt> | |
72 | * <dt><code>ai<$alefmadda;</code></dt> | |
73 | * <dd><strong>Reverse translation rule.</strong> This rule | |
74 | * states that the string on the right will be changed to | |
75 | * the string on the left when performing reverse | |
76 | * transliteration.</dd> | |
77 | * </dl> | |
78 | * | |
79 | * <dl> | |
80 | * <dt><code>ai<>$alefmadda;</code></dt> | |
81 | * <dd><strong>Bidirectional translation rule.</strong> This | |
82 | * rule states that the string on the left will be changed | |
83 | * to the string on the right when performing forward | |
84 | * transliteration, and vice versa when performing reverse | |
85 | * transliteration.</dd> | |
86 | * </dl> | |
87 | * | |
88 | * <p>Translation rules consist of a <em>match pattern</em> and an <em>output | |
89 | * string</em>. The match pattern consists of literal characters, | |
90 | * optionally preceded by context, and optionally followed by | |
91 | * context. Context characters, like literal pattern characters, | |
92 | * must be matched in the text being transliterated. However, unlike | |
93 | * literal pattern characters, they are not replaced by the output | |
94 | * text. For example, the pattern "<code>abc{def}</code>" | |
95 | * indicates the characters "<code>def</code>" must be | |
96 | * preceded by "<code>abc</code>" for a successful match. | |
97 | * If there is a successful match, "<code>def</code>" will | |
98 | * be replaced, but not "<code>abc</code>". The final '<code>}</code>' | |
99 | * is optional, so "<code>abc{def</code>" is equivalent to | |
100 | * "<code>abc{def}</code>". Another example is "<code>{123}456</code>" | |
101 | * (or "<code>123}456</code>") in which the literal | |
102 | * pattern "<code>123</code>" must be followed by "<code>456</code>". | |
103 | * </p> | |
104 | * | |
105 | * <p>The output string of a forward or reverse rule consists of | |
106 | * characters to replace the literal pattern characters. If the | |
107 | * output string contains the character '<code>|</code>', this is | |
108 | * taken to indicate the location of the <em>cursor</em> after | |
109 | * replacement. The cursor is the point in the text at which the | |
110 | * next replacement, if any, will be applied. The cursor is usually | |
111 | * placed within the replacement text; however, it can actually be | |
112 | * placed into the preceding or following context by using the | |
113 | * special character '<code>@</code>'. Examples:</p> | |
114 | * | |
115 | * <blockquote> | |
116 | * <p><code>a {foo} z > | @ bar; # foo -> bar, move cursor | |
117 | * before a<br> | |
118 | * {foo} xyz > bar @@|; # foo -> bar, cursor between | |
119 | * y and z</code></p> | |
120 | * </blockquote> | |
121 | * | |
122 | * <p><b>UnicodeSet</b></p> | |
123 | * | |
124 | * <p><code>UnicodeSet</code> patterns may appear anywhere that | |
125 | * makes sense. They may appear in variable definitions. | |
126 | * Contrariwise, <code>UnicodeSet</code> patterns may themselves | |
127 | * contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>", | |
128 | * or "<code>$range=a-z;$ll=[$range]</code>".</p> | |
129 | * | |
130 | * <p><code>UnicodeSet</code> patterns may also be embedded directly | |
131 | * into rule strings. Thus, the following two rules are equivalent:</p> | |
132 | * | |
133 | * <blockquote> | |
134 | * <p><code>$vowel=[aeiou]; $vowel>'*'; # One way to do this<br> | |
135 | * [aeiou]>'*'; | |
136 | * # | |
137 | * Another way</code></p> | |
138 | * </blockquote> | |
139 | * | |
140 | * <p>See {@link UnicodeSet} for more documentation and examples.</p> | |
141 | * | |
142 | * <p><b>Segments</b></p> | |
143 | * | |
144 | * <p>Segments of the input string can be matched and copied to the | |
145 | * output string. This makes certain sets of rules simpler and more | |
146 | * general, and makes reordering possible. For example:</p> | |
147 | * | |
148 | * <blockquote> | |
149 | * <p><code>([a-z]) > $1 $1; | |
150 | * # | |
151 | * double lowercase letters<br> | |
152 | * ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs</code></p> | |
153 | * </blockquote> | |
154 | * | |
155 | * <p>The segment of the input string to be copied is delimited by | |
156 | * "<code>(</code>" and "<code>)</code>". Up to | |
157 | * nine segments may be defined. Segments may not overlap. In the | |
158 | * output string, "<code>$1</code>" through "<code>$9</code>" | |
159 | * represent the input string segments, in left-to-right order of | |
160 | * definition.</p> | |
161 | * | |
162 | * <p><b>Anchors</b></p> | |
163 | * | |
164 | * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the | |
165 | * special characters '<code>^</code>' and '<code>$</code>'. For example:</p> | |
166 | * | |
167 | * <blockquote> | |
168 | * <p><code>^ a > 'BEG_A'; # match 'a' at start of text<br> | |
169 | * a > 'A'; # match other instances | |
170 | * of 'a'<br> | |
171 | * z $ > 'END_Z'; # match 'z' at end of text<br> | |
172 | * z > 'Z'; # match other instances | |
173 | * of 'z'</code></p> | |
174 | * </blockquote> | |
175 | * | |
176 | * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>. | |
177 | * This is done by including a virtual anchor character '<code>$</code>' at the end of the | |
178 | * set pattern. Although this is usually the match character for the end anchor, the set will | |
179 | * match either the beginning or the end of the text, depending on its placement. For | |
180 | * example:</p> | |
181 | * | |
182 | * <blockquote> | |
183 | * <p><code>$x = [a-z$]; # match 'a' through 'z' OR anchor<br> | |
184 | * $x 1 > 2; # match '1' after a-z or at the start<br> | |
185 | * 3 $x > 4; # match '3' before a-z or at the end</code></p> | |
186 | * </blockquote> | |
187 | * | |
188 | * <p><b>Example</b> </p> | |
189 | * | |
190 | * <p>The following example rules illustrate many of the features of | |
191 | * the rule language. </p> | |
192 | * | |
193 | * <table border="0" cellpadding="4"> | |
194 | * <tr> | |
195 | * <td valign="top">Rule 1.</td> | |
196 | * <td valign="top" nowrap><code>abc{def}>x|y</code></td> | |
197 | * </tr> | |
198 | * <tr> | |
199 | * <td valign="top">Rule 2.</td> | |
200 | * <td valign="top" nowrap><code>xyz>r</code></td> | |
201 | * </tr> | |
202 | * <tr> | |
203 | * <td valign="top">Rule 3.</td> | |
204 | * <td valign="top" nowrap><code>yz>q</code></td> | |
205 | * </tr> | |
206 | * </table> | |
207 | * | |
208 | * <p>Applying these rules to the string "<code>adefabcdefz</code>" | |
209 | * yields the following results: </p> | |
210 | * | |
211 | * <table border="0" cellpadding="4"> | |
212 | * <tr> | |
213 | * <td valign="top" nowrap><code>|adefabcdefz</code></td> | |
214 | * <td valign="top">Initial state, no rules match. Advance | |
215 | * cursor.</td> | |
216 | * </tr> | |
217 | * <tr> | |
218 | * <td valign="top" nowrap><code>a|defabcdefz</code></td> | |
219 | * <td valign="top">Still no match. Rule 1 does not match | |
220 | * because the preceding context is not present.</td> | |
221 | * </tr> | |
222 | * <tr> | |
223 | * <td valign="top" nowrap><code>ad|efabcdefz</code></td> | |
224 | * <td valign="top">Still no match. Keep advancing until | |
225 | * there is a match...</td> | |
226 | * </tr> | |
227 | * <tr> | |
228 | * <td valign="top" nowrap><code>ade|fabcdefz</code></td> | |
229 | * <td valign="top">...</td> | |
230 | * </tr> | |
231 | * <tr> | |
232 | * <td valign="top" nowrap><code>adef|abcdefz</code></td> | |
233 | * <td valign="top">...</td> | |
234 | * </tr> | |
235 | * <tr> | |
236 | * <td valign="top" nowrap><code>adefa|bcdefz</code></td> | |
237 | * <td valign="top">...</td> | |
238 | * </tr> | |
239 | * <tr> | |
240 | * <td valign="top" nowrap><code>adefab|cdefz</code></td> | |
241 | * <td valign="top">...</td> | |
242 | * </tr> | |
243 | * <tr> | |
244 | * <td valign="top" nowrap><code>adefabc|defz</code></td> | |
245 | * <td valign="top">Rule 1 matches; replace "<code>def</code>" | |
246 | * with "<code>xy</code>" and back up the cursor | |
247 | * to before the '<code>y</code>'.</td> | |
248 | * </tr> | |
249 | * <tr> | |
250 | * <td valign="top" nowrap><code>adefabcx|yz</code></td> | |
251 | * <td valign="top">Although "<code>xyz</code>" is | |
252 | * present, rule 2 does not match because the cursor is | |
253 | * before the '<code>y</code>', not before the '<code>x</code>'. | |
254 | * Rule 3 does match. Replace "<code>yz</code>" | |
255 | * with "<code>q</code>".</td> | |
256 | * </tr> | |
257 | * <tr> | |
258 | * <td valign="top" nowrap><code>adefabcxq|</code></td> | |
259 | * <td valign="top">The cursor is at the end; | |
260 | * transliteration is complete.</td> | |
261 | * </tr> | |
262 | * </table> | |
263 | * | |
264 | * <p>The order of rules is significant. If multiple rules may match | |
265 | * at some point, the first matching rule is applied. </p> | |
266 | * | |
267 | * <p>Forward and reverse rules may have an empty output string. | |
268 | * Otherwise, an empty left or right hand side of any statement is a | |
269 | * syntax error. </p> | |
270 | * | |
271 | * <p>Single quotes are used to quote any character other than a | |
272 | * digit or letter. To specify a single quote itself, inside or | |
273 | * outside of quotes, use two single quotes in a row. For example, | |
274 | * the rule "<code>'>'>o''clock</code>" changes the | |
275 | * string "<code>></code>" to the string "<code>o'clock</code>". | |
276 | * </p> | |
277 | * | |
278 | * <p><b>Notes</b> </p> | |
279 | * | |
280 | * <p>While a RuleBasedTransliterator is being built, it checks that | |
281 | * the rules are added in proper order. For example, if the rule | |
282 | * "a>x" is followed by the rule "ab>y", | |
283 | * then the second rule will throw an exception. The reason is that | |
284 | * the second rule can never be triggered, since the first rule | |
285 | * always matches anything it matches. In other words, the first | |
286 | * rule <em>masks</em> the second rule. </p> | |
287 | * | |
288 | * @author Alan Liu | |
289 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
290 | */ | |
46f4442e | 291 | class RuleBasedTransliterator : public Transliterator { |
374ca955 | 292 | private: |
b75a7d8f A |
293 | /** |
294 | * The data object is immutable, so we can freely share it with | |
295 | * other instances of RBT, as long as we do NOT own this object. | |
374ca955 | 296 | * TODO: data is no longer immutable. See bugs #1866, 2155 |
b75a7d8f | 297 | */ |
374ca955 | 298 | TransliterationRuleData* fData; |
b75a7d8f A |
299 | |
300 | /** | |
301 | * If true, we own the data object and must delete it. | |
302 | */ | |
303 | UBool isDataOwned; | |
304 | ||
305 | public: | |
306 | ||
307 | /** | |
308 | * Constructs a new transliterator from the given rules. | |
309 | * @param rules rules, separated by ';' | |
310 | * @param direction either FORWARD or REVERSE. | |
311 | * @exception IllegalArgumentException if rules are malformed. | |
312 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
313 | */ | |
314 | RuleBasedTransliterator(const UnicodeString& id, | |
315 | const UnicodeString& rules, | |
316 | UTransDirection direction, | |
317 | UnicodeFilter* adoptedFilter, | |
318 | UParseError& parseError, | |
319 | UErrorCode& status); | |
320 | ||
321 | /** | |
322 | * Constructs a new transliterator from the given rules. | |
323 | * @param rules rules, separated by ';' | |
324 | * @param direction either FORWARD or REVERSE. | |
325 | * @exception IllegalArgumentException if rules are malformed. | |
326 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
327 | */ | |
46f4442e | 328 | /*RuleBasedTransliterator(const UnicodeString& id, |
b75a7d8f A |
329 | const UnicodeString& rules, |
330 | UTransDirection direction, | |
331 | UnicodeFilter* adoptedFilter, | |
46f4442e | 332 | UErrorCode& status);*/ |
b75a7d8f A |
333 | |
334 | /** | |
335 | * Convenience constructor with no filter. | |
336 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
337 | */ | |
46f4442e | 338 | /*RuleBasedTransliterator(const UnicodeString& id, |
b75a7d8f A |
339 | const UnicodeString& rules, |
340 | UTransDirection direction, | |
46f4442e | 341 | UErrorCode& status);*/ |
b75a7d8f A |
342 | |
343 | /** | |
344 | * Convenience constructor with no filter and FORWARD direction. | |
345 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
346 | */ | |
46f4442e | 347 | /*RuleBasedTransliterator(const UnicodeString& id, |
b75a7d8f | 348 | const UnicodeString& rules, |
46f4442e | 349 | UErrorCode& status);*/ |
b75a7d8f A |
350 | |
351 | /** | |
352 | * Convenience constructor with FORWARD direction. | |
353 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
354 | */ | |
46f4442e | 355 | /*RuleBasedTransliterator(const UnicodeString& id, |
b75a7d8f A |
356 | const UnicodeString& rules, |
357 | UnicodeFilter* adoptedFilter, | |
46f4442e | 358 | UErrorCode& status);*/ |
374ca955 | 359 | private: |
b75a7d8f A |
360 | |
361 | friend class TransliteratorRegistry; // to access TransliterationRuleData convenience ctor | |
362 | /** | |
363 | * Convenience constructor. | |
364 | * @param id the id for the transliterator. | |
365 | * @param theData the rule data for the transliterator. | |
366 | * @param adoptedFilter the filter for the transliterator | |
367 | */ | |
368 | RuleBasedTransliterator(const UnicodeString& id, | |
369 | const TransliterationRuleData* theData, | |
370 | UnicodeFilter* adoptedFilter = 0); | |
371 | ||
372 | ||
373 | friend class Transliterator; // to access the following ctor | |
374 | ||
375 | /** | |
376 | * Internal constructor. | |
377 | * @param id the id for the transliterator. | |
378 | * @param data the rule data for the transliterator. | |
379 | * @param isDataAdopted determines who will own the 'data' object. If true, the caller should not delete 'data'. | |
380 | */ | |
381 | RuleBasedTransliterator(const UnicodeString& id, | |
382 | TransliterationRuleData* data, | |
383 | UBool isDataAdopted); | |
384 | ||
374ca955 | 385 | public: |
b75a7d8f A |
386 | |
387 | /** | |
388 | * Copy constructor. | |
389 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
390 | */ | |
391 | RuleBasedTransliterator(const RuleBasedTransliterator&); | |
392 | ||
393 | virtual ~RuleBasedTransliterator(); | |
394 | ||
395 | /** | |
396 | * Implement Transliterator API. | |
397 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
398 | */ | |
374ca955 | 399 | virtual Transliterator* clone(void) const; |
b75a7d8f | 400 | |
374ca955 | 401 | protected: |
b75a7d8f A |
402 | /** |
403 | * Implements {@link Transliterator#handleTransliterate}. | |
404 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
405 | */ | |
406 | virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets, | |
407 | UBool isIncremental) const; | |
408 | ||
374ca955 | 409 | public: |
b75a7d8f A |
410 | /** |
411 | * Return a representation of this transliterator as source rules. | |
412 | * These rules will produce an equivalent transliterator if used | |
413 | * to construct a new transliterator. | |
414 | * @param result the string to receive the rules. Previous | |
415 | * contents will be deleted. | |
416 | * @param escapeUnprintable if TRUE then convert unprintable | |
417 | * characters to their hex escape representations, \uxxxx or | |
418 | * \Uxxxxxxxx. Unprintable characters are those other than | |
419 | * U+000A, U+0020..U+007E. | |
420 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
421 | */ | |
422 | virtual UnicodeString& toRules(UnicodeString& result, | |
423 | UBool escapeUnprintable) const; | |
424 | ||
374ca955 | 425 | protected: |
b75a7d8f A |
426 | /** |
427 | * Implement Transliterator framework | |
428 | */ | |
429 | virtual void handleGetSourceSet(UnicodeSet& result) const; | |
430 | ||
374ca955 | 431 | public: |
b75a7d8f A |
432 | /** |
433 | * Override Transliterator framework | |
434 | */ | |
435 | virtual UnicodeSet& getTargetSet(UnicodeSet& result) const; | |
436 | ||
437 | /** | |
438 | * Return the class ID for this class. This is useful only for | |
439 | * comparing to a return value from getDynamicClassID(). For example: | |
440 | * <pre> | |
441 | * . Base* polymorphic_pointer = createPolymorphicObject(); | |
442 | * . if (polymorphic_pointer->getDynamicClassID() == | |
443 | * . Derived::getStaticClassID()) ... | |
444 | * </pre> | |
445 | * @return The class ID for all objects of this class. | |
446 | * @internal Use transliterator factory methods instead since this class will be removed in a future release. | |
447 | */ | |
46f4442e | 448 | U_I18N_API static UClassID U_EXPORT2 getStaticClassID(void); |
b75a7d8f A |
449 | |
450 | /** | |
451 | * Returns a unique class ID <b>polymorphically</b>. This method | |
452 | * is to implement a simple version of RTTI, since not all C++ | |
453 | * compilers support genuine RTTI. Polymorphic operator==() and | |
454 | * clone() methods call this method. | |
455 | * | |
b75a7d8f A |
456 | * @return The class ID for this object. All objects of a given |
457 | * class have the same class ID. Objects of other classes have | |
458 | * different class IDs. | |
459 | */ | |
374ca955 | 460 | virtual UClassID getDynamicClassID(void) const; |
b75a7d8f A |
461 | |
462 | private: | |
463 | ||
b75a7d8f A |
464 | void _construct(const UnicodeString& rules, |
465 | UTransDirection direction, | |
466 | UParseError& parseError, | |
467 | UErrorCode& status); | |
468 | }; | |
469 | ||
b75a7d8f A |
470 | |
471 | U_NAMESPACE_END | |
472 | ||
473 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | |
474 | ||
475 | #endif |