icuSources/i18n/unicode/translit.h

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 * Copyright (C) 1999-2014, International Business Machines
   6 * Corporation and others. All Rights Reserved.
   7 **********************************************************************
   8 *   Date        Name        Description
   9 *   11/17/99    aliu        Creation.
  10 **********************************************************************
  11 */
  12 #ifndef TRANSLIT_H
  13 #define TRANSLIT_H
  14
  15 #include "unicode/utypes.h"
  16
  17 /**
  18  * \file
  19  * \brief C++ API: Transforms text from one format to another.
  20  */
  21
  22 #if !UCONFIG_NO_TRANSLITERATION
  23
  24 #include "unicode/uobject.h"
  25 #include "unicode/unistr.h"
  26 #include "unicode/parseerr.h"
  27 #include "unicode/utrans.h" // UTransPosition, UTransDirection
  28 #include "unicode/strenum.h"
  29
  30 #if U_SHOW_CPLUSPLUS_API
  31 U_NAMESPACE_BEGIN
  32
  33 class UnicodeFilter;
  34 class UnicodeSet;
  35 class TransliteratorParser;
  36 class NormalizationTransliterator;
  37 class TransliteratorIDParser;
  38
  39 /**
  40  *
  41  * <code>Transliterator</code> is an abstract class that
  42  * transliterates text from one format to another.  The most common
  43  * kind of transliterator is a script, or alphabet, transliterator.
  44  * For example, a Russian to Latin transliterator changes Russian text
  45  * written in Cyrillic characters to phonetically equivalent Latin
  46  * characters.  It does not <em>translate</em> Russian to English!
  47  * Transliteration, unlike translation, operates on characters, without
  48  * reference to the meanings of words and sentences.
  49  *
  50  * <p>Although script conversion is its most common use, a
  51  * transliterator can actually perform a more general class of tasks.
  52  * In fact, <code>Transliterator</code> defines a very general API
  53  * which specifies only that a segment of the input text is replaced
  54  * by new text.  The particulars of this conversion are determined
  55  * entirely by subclasses of <code>Transliterator</code>.
  56  *
  57  * <p><b>Transliterators are stateless</b>
  58  *
  59  * <p><code>Transliterator</code> objects are <em>stateless</em>; they
  60  * retain no information between calls to
  61  * <code>transliterate()</code>.  (However, this does <em>not</em>
  62  * mean that threads may share transliterators without synchronizing
  63  * them.  Transliterators are not immutable, so they must be
  64  * synchronized when shared between threads.)  This might seem to
  65  * limit the complexity of the transliteration operation.  In
  66  * practice, subclasses perform complex transliterations by delaying
  67  * the replacement of text until it is known that no other
  68  * replacements are possible.  In other words, although the
  69  * <code>Transliterator</code> objects are stateless, the source text
  70  * itself embodies all the needed information, and delayed operation
  71  * allows arbitrary complexity.
  72  *
  73  * <p><b>Batch transliteration</b>
  74  *
  75  * <p>The simplest way to perform transliteration is all at once, on a
  76  * string of existing text.  This is referred to as <em>batch</em>
  77  * transliteration.  For example, given a string <code>input</code>
  78  * and a transliterator <code>t</code>, the call
  79  *
  80  *     String result = t.transliterate(input);
  81  *
  82  * will transliterate it and return the result.  Other methods allow
  83  * the client to specify a substring to be transliterated and to use
  84  * {@link Replaceable } objects instead of strings, in order to
  85  * preserve out-of-band information (such as text styles).
  86  *
  87  * <p><b>Keyboard transliteration</b>
  88  *
  89  * <p>Somewhat more involved is <em>keyboard</em>, or incremental
  90  * transliteration.  This is the transliteration of text that is
  91  * arriving from some source (typically the user's keyboard) one
  92  * character at a time, or in some other piecemeal fashion.
  93  *
  94  * <p>In keyboard transliteration, a <code>Replaceable</code> buffer
  95  * stores the text.  As text is inserted, as much as possible is
  96  * transliterated on the fly.  This means a GUI that displays the
  97  * contents of the buffer may show text being modified as each new
  98  * character arrives.
  99  *
 100  * <p>Consider the simple rule-based Transliterator:
 101  * <pre>
 102  *     th>{theta}
 103  *     t>{tau}
 104  * </pre>
 105  *
 106  * When the user types 't', nothing will happen, since the
 107  * transliterator is waiting to see if the next character is 'h'.  To
 108  * remedy this, we introduce the notion of a cursor, marked by a '|'
 109  * in the output string:
 110  * <pre>
 111  *     t>|{tau}
 112  *     {tau}h>{theta}
 113  * </pre>
 114  *
 115  * Now when the user types 't', tau appears, and if the next character
 116  * is 'h', the tau changes to a theta.  This is accomplished by
 117  * maintaining a cursor position (independent of the insertion point,
 118  * and invisible in the GUI) across calls to
 119  * <code>transliterate()</code>.  Typically, the cursor will
 120  * be coincident with the insertion point, but in a case like the one
 121  * above, it will precede the insertion point.
 122  *
 123  * <p>Keyboard transliteration methods maintain a set of three indices
 124  * that are updated with each call to
 125  * <code>transliterate()</code>, including the cursor, start,
 126  * and limit.  Since these indices are changed by the method, they are
 127  * passed in an <code>int[]</code> array. The <code>START</code> index
 128  * marks the beginning of the substring that the transliterator will
 129  * look at.  It is advanced as text becomes committed (but it is not
 130  * the committed index; that's the <code>CURSOR</code>).  The
 131  * <code>CURSOR</code> index, described above, marks the point at
 132  * which the transliterator last stopped, either because it reached
 133  * the end, or because it required more characters to disambiguate
 134  * between possible inputs.  The <code>CURSOR</code> can also be
 135  * explicitly set by rules in a rule-based Transliterator.
 136  * Any characters before the <code>CURSOR</code> index are frozen;
 137  * future keyboard transliteration calls within this input sequence
 138  * will not change them.  New text is inserted at the
 139  * <code>LIMIT</code> index, which marks the end of the substring that
 140  * the transliterator looks at.
 141  *
 142  * <p>Because keyboard transliteration assumes that more characters
 143  * are to arrive, it is conservative in its operation.  It only
 144  * transliterates when it can do so unambiguously.  Otherwise it waits
 145  * for more characters to arrive.  When the client code knows that no
 146  * more characters are forthcoming, perhaps because the user has
 147  * performed some input termination operation, then it should call
 148  * <code>finishTransliteration()</code> to complete any
 149  * pending transliterations.
 150  *
 151  * <p><b>Inverses</b>
 152  *
 153  * <p>Pairs of transliterators may be inverses of one another.  For
 154  * example, if transliterator <b>A</b> transliterates characters by
 155  * incrementing their Unicode value (so "abc" -> "def"), and
 156  * transliterator <b>B</b> decrements character values, then <b>A</b>
 157  * is an inverse of <b>B</b> and vice versa.  If we compose <b>A</b>
 158  * with <b>B</b> in a compound transliterator, the result is the
 159  * identity transliterator, that is, a transliterator that does not
 160  * change its input text.
 161  *
 162  * The <code>Transliterator</code> method <code>getInverse()</code>
 163  * returns a transliterator's inverse, if one exists, or
 164  * <code>null</code> otherwise.  However, the result of
 165  * <code>getInverse()</code> usually will <em>not</em> be a true
 166  * mathematical inverse.  This is because true inverse transliterators
 167  * are difficult to formulate.  For example, consider two
 168  * transliterators: <b>AB</b>, which transliterates the character 'A'
 169  * to 'B', and <b>BA</b>, which transliterates 'B' to 'A'.  It might
 170  * seem that these are exact inverses, since
 171  *
 172  * \htmlonly<blockquote>\endhtmlonly"A" x <b>AB</b> -> "B"<br>
 173  * "B" x <b>BA</b> -> "A"\htmlonly</blockquote>\endhtmlonly
 174  *
 175  * where 'x' represents transliteration.  However,
 176  *
 177  * \htmlonly<blockquote>\endhtmlonly"ABCD" x <b>AB</b> -> "BBCD"<br>
 178  * "BBCD" x <b>BA</b> -> "AACD"\htmlonly</blockquote>\endhtmlonly
 179  *
 180  * so <b>AB</b> composed with <b>BA</b> is not the
 181  * identity. Nonetheless, <b>BA</b> may be usefully considered to be
 182  * <b>AB</b>'s inverse, and it is on this basis that
 183  * <b>AB</b><code>.getInverse()</code> could legitimately return
 184  * <b>BA</b>.
 185  *
 186  * <p><b>IDs and display names</b>
 187  *
 188  * <p>A transliterator is designated by a short identifier string or
 189  * <em>ID</em>.  IDs follow the format <em>source-destination</em>,
 190  * where <em>source</em> describes the entity being replaced, and
 191  * <em>destination</em> describes the entity replacing
 192  * <em>source</em>.  The entities may be the names of scripts,
 193  * particular sequences of characters, or whatever else it is that the
 194  * transliterator converts to or from.  For example, a transliterator
 195  * from Russian to Latin might be named "Russian-Latin".  A
 196  * transliterator from keyboard escape sequences to Latin-1 characters
 197  * might be named "KeyboardEscape-Latin1".  By convention, system
 198  * entity names are in English, with the initial letters of words
 199  * capitalized; user entity names may follow any format so long as
 200  * they do not contain dashes.
 201  *
 202  * <p>In addition to programmatic IDs, transliterator objects have
 203  * display names for presentation in user interfaces, returned by
 204  * {@link #getDisplayName }.
 205  *
 206  * <p><b>Factory methods and registration</b>
 207  *
 208  * <p>In general, client code should use the factory method
 209  * {@link #createInstance } to obtain an instance of a
 210  * transliterator given its ID.  Valid IDs may be enumerated using
 211  * <code>getAvailableIDs()</code>.  Since transliterators are mutable,
 212  * multiple calls to {@link #createInstance } with the same ID will
 213  * return distinct objects.
 214  *
 215  * <p>In addition to the system transliterators registered at startup,
 216  * user transliterators may be registered by calling
 217  * <code>registerInstance()</code> at run time.  A registered instance
 218  * acts as a template; future calls to {@link #createInstance } with the ID
 219  * of the registered object return clones of that object.  Thus any
 220  * object passed to <tt>registerInstance()</tt> must implement
 221  * <tt>clone()</tt> properly.  To register a transliterator subclass
 222  * without instantiating it (until it is needed), users may call
 223  * {@link #registerFactory }.  In this case, the objects are
 224  * instantiated by invoking the zero-argument public constructor of
 225  * the class.
 226  *
 227  * <p><b>Subclassing</b>
 228  *
 229  * Subclasses must implement the abstract method
 230  * <code>handleTransliterate()</code>.  <p>Subclasses should override
 231  * the <code>transliterate()</code> method taking a
 232  * <code>Replaceable</code> and the <code>transliterate()</code>
 233  * method taking a <code>String</code> and <code>StringBuffer</code>
 234  * if the performance of these methods can be improved over the
 235  * performance obtained by the default implementations in this class.
 236  *
 237  * <p><b>Rule syntax</b>
 238  *
 239  * <p>A set of rules determines how to perform translations.
 240  * Rules within a rule set are separated by semicolons (';').
 241  * To include a literal semicolon, prefix it with a backslash ('\').
 242  * Unicode Pattern_White_Space is ignored.
 243  * If the first non-blank character on a line is '#',
 244  * the entire line is ignored as a comment.
 245  *
 246  * <p>Each set of rules consists of two groups, one forward, and one
 247  * reverse. This is a convention that is not enforced; rules for one
 248  * direction may be omitted, with the result that translations in
 249  * that direction will not modify the source text. In addition,
 250  * bidirectional forward-reverse rules may be specified for
 251  * symmetrical transformations.
 252  *
 253  * <p>Note: Another description of the Transliterator rule syntax is available in
 254  * <a href="https://www.unicode.org/reports/tr35/tr35-general.html#Transform_Rules_Syntax">section
 255  * Transform Rules Syntax of UTS #35: Unicode LDML</a>.
 256  * The rules are shown there using arrow symbols ← and → and ↔.
 257  * ICU supports both those and the equivalent ASCII symbols &lt; and &gt; and &lt;&gt;.
 258  *
 259  * <p>Rule statements take one of the following forms:
 260  *
 261  * <dl>
 262  *     <dt><code>$alefmadda=\\u0622;</code></dt>
 263  *     <dd><strong>Variable definition.</strong> The name on the
 264  *         left is assigned the text on the right. In this example,
 265  *         after this statement, instances of the left hand name,
 266  *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
 267  *         the Unicode character U+0622. Variable names must begin
 268  *         with a letter and consist only of letters, digits, and
 269  *         underscores. Case is significant. Duplicate names cause
 270  *         an exception to be thrown, that is, variables cannot be
 271  *         redefined. The right hand side may contain well-formed
 272  *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
 273  *         The right hand side may contain embedded <code>UnicodeSet</code>
 274  *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
 275  *     <dt><code>ai&gt;$alefmadda;</code></dt>
 276  *     <dd><strong>Forward translation rule.</strong> This rule
 277  *         states that the string on the left will be changed to the
 278  *         string on the right when performing forward
 279  *         transliteration.</dd>
 280  *     <dt><code>ai&lt;$alefmadda;</code></dt>
 281  *     <dd><strong>Reverse translation rule.</strong> This rule
 282  *         states that the string on the right will be changed to
 283  *         the string on the left when performing reverse
 284  *         transliteration.</dd>
 285  * </dl>
 286  *
 287  * <dl>
 288  *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
 289  *     <dd><strong>Bidirectional translation rule.</strong> This
 290  *         rule states that the string on the left will be changed
 291  *         to the string on the right when performing forward
 292  *         transliteration, and vice versa when performing reverse
 293  *         transliteration.</dd>
 294  * </dl>
 295  *
 296  * <p>Translation rules consist of a <em>match pattern</em> and an <em>output
 297  * string</em>. The match pattern consists of literal characters,
 298  * optionally preceded by context, and optionally followed by
 299  * context. Context characters, like literal pattern characters,
 300  * must be matched in the text being transliterated. However, unlike
 301  * literal pattern characters, they are not replaced by the output
 302  * text. For example, the pattern &quot;<code>abc{def}</code>&quot;
 303  * indicates the characters &quot;<code>def</code>&quot; must be
 304  * preceded by &quot;<code>abc</code>&quot; for a successful match.
 305  * If there is a successful match, &quot;<code>def</code>&quot; will
 306  * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'
 307  * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to
 308  * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;
 309  * (or &quot;<code>123}456</code>&quot;) in which the literal
 310  * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
 311  *
 312  * <p>The output string of a forward or reverse rule consists of
 313  * characters to replace the literal pattern characters. If the
 314  * output string contains the character '<code>|</code>', this is
 315  * taken to indicate the location of the <em>cursor</em> after
 316  * replacement. The cursor is the point in the text at which the
 317  * next replacement, if any, will be applied. The cursor is usually
 318  * placed within the replacement text; however, it can actually be
 319  * placed into the preceding or following context by using the
 320  * special character '@'. Examples:
 321  *
 322  * <pre>
 323  *     a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor before a
 324  *     {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between y and z
 325  * </pre>
 326  *
 327  * <p><b>UnicodeSet</b>
 328  *
 329  * <p><code>UnicodeSet</code> patterns may appear anywhere that
 330  * makes sense. They may appear in variable definitions.
 331  * Contrariwise, <code>UnicodeSet</code> patterns may themselves
 332  * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
 333  * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.
 334  *
 335  * <p><code>UnicodeSet</code> patterns may also be embedded directly
 336  * into rule strings. Thus, the following two rules are equivalent:
 337  *
 338  * <pre>
 339  *     $vowel=[aeiou]; $vowel&gt;'*'; # One way to do this
 340  *     [aeiou]&gt;'*'; # Another way
 341  * </pre>
 342  *
 343  * <p>See {@link UnicodeSet} for more documentation and examples.
 344  *
 345  * <p><b>Segments</b>
 346  *
 347  * <p>Segments of the input string can be matched and copied to the
 348  * output string. This makes certain sets of rules simpler and more
 349  * general, and makes reordering possible. For example:
 350  *
 351  * <pre>
 352  *     ([a-z]) &gt; $1 $1; # double lowercase letters
 353  *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs
 354  * </pre>
 355  *
 356  * <p>The segment of the input string to be copied is delimited by
 357  * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
 358  * nine segments may be defined. Segments may not overlap. In the
 359  * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
 360  * represent the input string segments, in left-to-right order of
 361  * definition.
 362  *
 363  * <p><b>Anchors</b>
 364  *
 365  * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
 366  * special characters '<code>^</code>' and '<code>$</code>'. For example:
 367  *
 368  * <pre>
 369  *   ^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text
 370  *   &nbsp; a&nbsp;&nbsp; &gt; 'A'; # match other instances of 'a'
 371  *   &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text
 372  *   &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances of 'z'
 373  * </pre>
 374  *
 375  * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
 376  * This is done by including a virtual anchor character '<code>$</code>' at the end of the
 377  * set pattern. Although this is usually the match character for the end anchor, the set will
 378  * match either the beginning or the end of the text, depending on its placement. For
 379  * example:
 380  *
 381  * <pre>
 382  *   $x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor
 383  *   $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start
 384  *   &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end
 385  * </pre>
 386  *
 387  * <p><b>Example</b>
 388  *
 389  * <p>The following example rules illustrate many of the features of
 390  * the rule language.
 391  *
 392  * <table border="0" cellpadding="4">
 393  *     <tr>
 394  *         <td style="vertical-align: top;">Rule 1.</td>
 395  *         <td style="vertical-align: top; white-space: nowrap;"><code>abc{def}&gt;x|y</code></td>
 396  *     </tr>
 397  *     <tr>
 398  *         <td style="vertical-align: top;">Rule 2.</td>
 399  *         <td style="vertical-align: top; white-space: nowrap;"><code>xyz&gt;r</code></td>
 400  *     </tr>
 401  *     <tr>
 402  *         <td style="vertical-align: top;">Rule 3.</td>
 403  *         <td style="vertical-align: top; white-space: nowrap;"><code>yz&gt;q</code></td>
 404  *     </tr>
 405  * </table>
 406  *
 407  * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
 408  * yields the following results:
 409  *
 410  * <table border="0" cellpadding="4">
 411  *     <tr>
 412  *         <td style="vertical-align: top; white-space: nowrap;"><code>|adefabcdefz</code></td>
 413  *         <td style="vertical-align: top;">Initial state, no rules match. Advance
 414  *         cursor.</td>
 415  *     </tr>
 416  *     <tr>
 417  *         <td style="vertical-align: top; white-space: nowrap;"><code>a|defabcdefz</code></td>
 418  *         <td style="vertical-align: top;">Still no match. Rule 1 does not match
 419  *         because the preceding context is not present.</td>
 420  *     </tr>
 421  *     <tr>
 422  *         <td style="vertical-align: top; white-space: nowrap;"><code>ad|efabcdefz</code></td>
 423  *         <td style="vertical-align: top;">Still no match. Keep advancing until
 424  *         there is a match...</td>
 425  *     </tr>
 426  *     <tr>
 427  *         <td style="vertical-align: top; white-space: nowrap;"><code>ade|fabcdefz</code></td>
 428  *         <td style="vertical-align: top;">...</td>
 429  *     </tr>
 430  *     <tr>
 431  *         <td style="vertical-align: top; white-space: nowrap;"><code>adef|abcdefz</code></td>
 432  *         <td style="vertical-align: top;">...</td>
 433  *     </tr>
 434  *     <tr>
 435  *         <td style="vertical-align: top; white-space: nowrap;"><code>adefa|bcdefz</code></td>
 436  *         <td style="vertical-align: top;">...</td>
 437  *     </tr>
 438  *     <tr>
 439  *         <td style="vertical-align: top; white-space: nowrap;"><code>adefab|cdefz</code></td>
 440  *         <td style="vertical-align: top;">...</td>
 441  *     </tr>
 442  *     <tr>
 443  *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabc|defz</code></td>
 444  *         <td style="vertical-align: top;">Rule 1 matches; replace &quot;<code>def</code>&quot;
 445  *         with &quot;<code>xy</code>&quot; and back up the cursor
 446  *         to before the '<code>y</code>'.</td>
 447  *     </tr>
 448  *     <tr>
 449  *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabcx|yz</code></td>
 450  *         <td style="vertical-align: top;">Although &quot;<code>xyz</code>&quot; is
 451  *         present, rule 2 does not match because the cursor is
 452  *         before the '<code>y</code>', not before the '<code>x</code>'.
 453  *         Rule 3 does match. Replace &quot;<code>yz</code>&quot;
 454  *         with &quot;<code>q</code>&quot;.</td>
 455  *     </tr>
 456  *     <tr>
 457  *         <td style="vertical-align: top; white-space: nowrap;"><code>adefabcxq|</code></td>
 458  *         <td style="vertical-align: top;">The cursor is at the end;
 459  *         transliteration is complete.</td>
 460  *     </tr>
 461  * </table>
 462  *
 463  * <p>The order of rules is significant. If multiple rules may match
 464  * at some point, the first matching rule is applied.
 465  *
 466  * <p>Forward and reverse rules may have an empty output string.
 467  * Otherwise, an empty left or right hand side of any statement is a
 468  * syntax error.
 469  *
 470  * <p>Single quotes are used to quote any character other than a
 471  * digit or letter. To specify a single quote itself, inside or
 472  * outside of quotes, use two single quotes in a row. For example,
 473  * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
 474  * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
 475  *
 476  * <p><b>Notes</b>
 477  *
 478  * <p>While a Transliterator is being built from rules, it checks that
 479  * the rules are added in proper order. For example, if the rule
 480  * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
 481  * then the second rule will throw an exception. The reason is that
 482  * the second rule can never be triggered, since the first rule
 483  * always matches anything it matches. In other words, the first
 484  * rule <em>masks</em> the second rule.
 485  *
 486  * @author Alan Liu
 487  * @stable ICU 2.0
 488  */
 489 class U_I18N_API Transliterator : public UObject {
 490
 491 private:
 492
 493     /**
 494      * Programmatic name, e.g., "Latin-Arabic".
 495      */
 496     UnicodeString ID;
 497
 498     /**
 499      * This transliterator's filter.  Any character for which
 500      * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
 501      * altered by this transliterator.  If <tt>filter</tt> is
 502      * <tt>null</tt> then no filtering is applied.
 503      */
 504     UnicodeFilter* filter;
 505
 506     int32_t maximumContextLength;
 507
 508  public:
 509
 510     /**
 511      * Context value handed to a Factory function: an integer or a pointer
 512      * sharing one storage slot (read only the member that was set), by value.
 513      * @stable ICU 2.4
 514      */
 515     union Token {
 516         /**
 517          * This token, interpreted as a 32-bit integer.
 518          * @stable ICU 2.4
 519          */
 520         int32_t integer;
 521         /**
 522          * This token, interpreted as a native pointer.
 523          * @stable ICU 2.4
 524          */
 525         void*   pointer;
 526     };
 527
 528 #ifndef U_HIDE_INTERNAL_API
 529     /**
 530      * Return a token containing an integer.
 531      * @return a token containing an integer.
 532      * @internal
 533      */
 534     inline static Token integerToken(int32_t);
 535
 536     /**
 537      * Return a token containing a pointer.
 538      * @return a token containing a pointer.
 539      * @internal
 540      */
 541     inline static Token pointerToken(void*);
 542 #endif  /* U_HIDE_INTERNAL_API */
 543
 544     /**
 545      * A function that creates and returns a Transliterator.  When
 546      * invoked, it will be passed the ID string that is being
 547      * instantiated, together with the context pointer that was passed
 548      * in when the factory function was first registered.  Many
 549      * factory functions will ignore both parameters, however,
 550      * functions that are registered to more than one ID may use the
 551      * ID or the context parameter to parameterize the transliterator
 552      * they create.
 553      * @param ID      the string identifier for this transliterator
 554      * @param context a context pointer that will be stored and
 555      *                later passed to the factory function when an ID matching
 556      *                the registration ID is being instantiated with this factory.
 557      * @stable ICU 2.4
 558      */
 559     typedef Transliterator* (U_EXPORT2 *Factory)(const UnicodeString& ID, Token context);
 560
 561 protected:
 562
 563     /**
 564      * Default constructor.
 565      * @param ID the string identifier for this transliterator
 566      * @param adoptedFilter the filter.  Any character for which
 567      * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
 568      * altered by this transliterator.  If <tt>filter</tt> is
 569      * <tt>null</tt> then no filtering is applied.
 570      * @stable ICU 2.4
 571      */
 572     Transliterator(const UnicodeString& ID, UnicodeFilter* adoptedFilter);
 573
 574     /**
 575      * Copy constructor.
 576      * @stable ICU 2.4
 577      */
 578     Transliterator(const Transliterator&);
 579
 580     /**
 581      * Assignment operator.
 582      * @stable ICU 2.4
 583      */
 584     Transliterator& operator=(const Transliterator&);
 585
 586     /**
 587      * Create a transliterator from a basic ID.  This is an ID
 588      * containing only the forward direction source, target, and
 589      * variant.
 590      * @param id a basic ID of the form S-T or S-T/V.
 591      * @param canon canonical ID to assign to the object, or
 592      * NULL to leave the ID unchanged
 593      * @return a newly created Transliterator or null if the ID is
 594      * invalid.
 595      * @stable ICU 2.4
 596      */
 597     static Transliterator* createBasicInstance(const UnicodeString& id,
 598                                                const UnicodeString* canon);
 599
 600     friend class TransliteratorParser; // for parseID()
 601     friend class TransliteratorIDParser; // for createBasicInstance()
 602     friend class TransliteratorAlias; // for setID()
 603
 604 public:
 605
 606     /**
 607      * Destructor.
 608      * @stable ICU 2.0
 609      */
 610     virtual ~Transliterator();
 611
 612     /**
 613      * Implements Cloneable.
 614      * All subclasses are encouraged to implement this method if it is
 615      * possible and reasonable to do so.  Subclasses that are to be
 616      * registered with the system using <tt>registerInstance()</tt>
 617      * are required to implement this method.  If a subclass does not
 618      * implement clone() properly and is registered with the system
 619      * using registerInstance(), then the default clone() implementation
 620      * will return null, and calls to createInstance() will fail.
 621      *
 622      * @return a copy of the object.
 623      * @see #registerInstance
 624      * @stable ICU 2.0
 625      */
 626     virtual Transliterator* clone() const;
 627
 628     /**
 629      * Transliterates a segment of a string, with optional filtering.
 630      *
 631      * @param text the string to be transliterated
 632      * @param start the beginning index, inclusive; <code>0 <= start
 633      * <= limit</code>.
 634      * @param limit the ending index, exclusive; <code>start <= limit
 635      * <= text.length()</code>.
 636      * @return The new limit index.  The text previously occupying <code>[start,
 637      * limit)</code> has been transliterated, possibly to a string of a different
 638      * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
 639      * <em>new-limit</em> is the return value. If the input offsets are out of bounds,
 640      * the returned value is -1 and the input string remains unchanged.
 641      * @stable ICU 2.0
 642      */
 643     virtual int32_t transliterate(Replaceable& text,
 644                                   int32_t start, int32_t limit) const;
 645
 646     /**
 647      * Transliterates an entire string in place. Convenience method.
 648      * @param text the string to be transliterated
 649      * @stable ICU 2.0
 650      */
 651     virtual void transliterate(Replaceable& text) const;
 652
 653     /**
 654      * Transliterates the portion of the text buffer that can be
 655      * transliterated unambiguously after new text has been inserted,
 656      * typically as a result of a keyboard event.  The new text in
 657      * <code>insertion</code> will be inserted into <code>text</code>
 658      * at <code>index.limit</code>, advancing
 659      * <code>index.limit</code> by <code>insertion.length()</code>.
 660      * Then the transliterator will try to transliterate characters of
 661      * <code>text</code> between <code>index.cursor</code> and
 662      * <code>index.limit</code>.  Characters before
 663      * <code>index.cursor</code> will not be changed.
 664      *
 665      * <p>Upon return, values in <code>index</code> will be updated.
 666      * <code>index.start</code> will be advanced to the first
 667      * character that future calls to this method will read.
 668      * <code>index.cursor</code> and <code>index.limit</code> will
 669      * be adjusted to delimit the range of text that future calls to
 670      * this method may change.
 671      *
 672      * <p>Typical usage of this method begins with an initial call
 673      * with <code>index.start</code> and <code>index.limit</code>
 674      * set to indicate the portion of <code>text</code> to be
 675      * transliterated, and <code>index.cursor == index.start</code>.
 676      * Thereafter, <code>index</code> can be used without
 677      * modification in future calls, provided that all changes to
 678      * <code>text</code> are made via this method.
 679      *
 680      * <p>This method assumes that future calls may be made that will
 681      * insert new text into the buffer.  As a result, it only performs
 682      * unambiguous transliterations.  After the last call to this
 683      * method, there may be untransliterated text that is waiting for
 684      * more input to resolve an ambiguity.  In order to perform these
 685      * pending transliterations, clients should call {@link
 686      * #finishTransliteration } after the last call to this
 687      * method has been made.
 688      *
 689      * @param text the buffer holding transliterated and untransliterated text
 690      * @param index an array of three integers.
 691      *
 692      * <ul><li><code>index.start</code>: the beginning index,
 693      * inclusive; <code>0 <= index.start <= index.limit</code>.
 694      *
 695      * <li><code>index.limit</code>: the ending index, exclusive;
 696      * <code>index.start <= index.limit <= text.length()</code>.
 697      * <code>insertion</code> is inserted at
 698      * <code>index.limit</code>.
 699      *
 700      * <li><code>index.cursor</code>: the next character to be
 701      * considered for transliteration; <code>index.start <=
 702      * index.cursor <= index.limit</code>.  Characters before
 703      * <code>index.cursor</code> will not be changed by future calls
 704      * to this method.</ul>
 705      *
 706      * @param insertion text to be inserted and possibly
 707      * transliterated into the translation buffer at
 708      * <code>index.limit</code>.  If <code>null</code> then no text
 709      * is inserted.
 710      * @param status    Output param to filled in with a success or an error.
 711      * @see #handleTransliterate
 712      * @exception IllegalArgumentException if <code>index</code>
 713      * is invalid
 714      * @see UTransPosition
 715      * @stable ICU 2.0
 716      */
 717     virtual void transliterate(Replaceable& text, UTransPosition& index,
 718                                const UnicodeString& insertion,
 719                                UErrorCode& status) const;
 720
 721     /**
 722      * Transliterates the portion of the text buffer that can be
 723      * transliterated unambiguously after a new character has been
 724      * inserted, typically as a result of a keyboard event.  This is a
 725      * convenience method.
 726      * @param text the buffer holding transliterated and
 727      * untransliterated text
 728      * @param index the position indices, passed as a UTransPosition.
 729      * @param insertion code point to be inserted and possibly
 730      * transliterated into the translation buffer at
 731      * <code>index.limit</code>.
 732      * @param status    Output param to be filled in with a success or an error.
 733      * @see #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const
 734      * @stable ICU 2.0
 735      */
 736     virtual void transliterate(Replaceable& text, UTransPosition& index,
 737                                UChar32 insertion,
 738                                UErrorCode& status) const;
 739
 740     /**
 741      * Transliterates the portion of the text buffer that can be
 742      * transliterated unambiguously.  This is a convenience method; see
 743      * {@link
 744      * #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const }
 745      * for details.
 746      * @param text the buffer holding transliterated and
 747      * untransliterated text
 748      * @param index the position indices, passed as a UTransPosition.
 749      * @param status    Output param to be filled in with a success or an error.
 750      * @see #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode &) const
 751      * @stable ICU 2.0
 752      */
 753     virtual void transliterate(Replaceable& text, UTransPosition& index,
 754                                UErrorCode& status) const;
 755
 756     /**
 757      * Finishes any pending transliterations that were waiting for
 758      * more characters.  Clients should call this method as the last
 759      * call after a sequence of one or more calls to
 760      * <code>transliterate()</code>.
 761      * @param text the buffer holding transliterated and
 762      * untransliterated text.
 763      * @param index the position indices (a UTransPosition) previously passed to {@link
 764      * #transliterate }
 765      * @stable ICU 2.0
 766      */
 767     virtual void finishTransliteration(Replaceable& text,
 768                                        UTransPosition& index) const;
 769
 770 private:
 771
 772     /**
 773      * This internal method does incremental transliteration.  If the
 774      * 'insertion' is non-null then we append it to 'text' before
 775      * proceeding.  This method calls through to the pure virtual
 776      * framework method handleTransliterate() to do the actual
 777      * work.
 778      * @param text the buffer holding transliterated and
 779      * untransliterated text
 780      * @param index the position indices, passed as a UTransPosition.  See {@link
 781      * #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const}.
 782      * @param insertion text to be inserted and possibly
 783      * transliterated into the translation buffer at
 784      * <code>index.limit</code>.
 785      * @param status    Output param to be filled in with a success or an error.
 786      */
 787     void _transliterate(Replaceable& text,
 788                         UTransPosition& index,
 789                         const UnicodeString* insertion,
 790                         UErrorCode &status) const;
 791
 792 protected:
 793
 794     /**
 795      * Abstract method that concrete subclasses define to implement
 796      * their transliteration algorithm.  This method handles both
 797      * incremental and non-incremental transliteration.  Let
 798      * <code>originalStart</code> refer to the value of
 799      * <code>pos.start</code> upon entry.
 800      *
 801      * <ul>
 802      *  <li>If <code>incremental</code> is false, then this method
 803      *  should transliterate all characters between
 804      *  <code>pos.start</code> and <code>pos.limit</code>. Upon return
 805      *  <code>pos.start</code> must == <code> pos.limit</code>.</li>
 806      *
 807      *  <li>If <code>incremental</code> is true, then this method
 808      *  should transliterate all characters between
 809      *  <code>pos.start</code> and <code>pos.limit</code> that can be
 810      *  unambiguously transliterated, regardless of future insertions
 811      *  of text at <code>pos.limit</code>.  Upon return,
 812      *  <code>pos.start</code> should be in the range
 813      *  [<code>originalStart</code>, <code>pos.limit</code>).
 814      *  <code>pos.start</code> should be positioned such that
 815      *  characters [<code>originalStart</code>, <code>
 816      *  pos.start</code>) will not be changed in the future by this
 817      *  transliterator and characters [<code>pos.start</code>,
 818      *  <code>pos.limit</code>) are unchanged.</li>
 819      * </ul>
 820      *
 821      * <p>Implementations of this method should also obey the
 822      * following invariants:</p>
 823      *
 824      * <ul>
 825      *  <li> <code>pos.limit</code> and <code>pos.contextLimit</code>
 826      *  should be updated to reflect changes in length of the text
 827      *  between <code>pos.start</code> and <code>pos.limit</code>. The
 828      *  difference <code> pos.contextLimit - pos.limit</code> should
 829      *  not change.</li>
 830      *
 831      *  <li><code>pos.contextStart</code> should not change.</li>
 832      *
 833      *  <li>Upon return, neither <code>pos.start</code> nor
 834      *  <code>pos.limit</code> should be less than
 835      *  <code>originalStart</code>.</li>
 836      *
 837      *  <li>Text before <code>originalStart</code> and text after
 838      *  <code>pos.limit</code> should not change.</li>
 839      *
 840      *  <li>Text before <code>pos.contextStart</code> and text after
 841      *  <code> pos.contextLimit</code> should be ignored.</li>
 842      * </ul>
 843      *
 844      * <p>Subclasses may safely assume that all characters in
 845      * [<code>pos.start</code>, <code>pos.limit</code>) are filtered.
 846      * In other words, the filter has already been applied by the time
 847      * this method is called.  See
 848      * <code>filteredTransliterate()</code>.
 849      *
 850      * <p>This method is <b>not</b> for public consumption.  Calling
 851      * this method directly will transliterate
 852      * [<code>pos.start</code>, <code>pos.limit</code>) without
 853      * applying the filter. End user code should call <code>
 854      * transliterate()</code> instead of this method. Subclass code
 855      * and wrapping transliterators should call
 856      * <code>filteredTransliterate()</code> instead of this method.</p>
 857      *
 858      * @param text the buffer holding transliterated and
 859      * untransliterated text
 860      *
 861      * @param pos the indices indicating the start, limit, context
 862      * start, and context limit of the text.
 863      *
 864      * @param incremental if true, assume more text may be inserted at
 865      * <code>pos.limit</code> and act accordingly.  Otherwise,
 866      * transliterate all text between <code>pos.start</code> and
 867      * <code>pos.limit</code> and move <code>pos.start</code> up to
 868      * <code>pos.limit</code>.
 869      *
 870      * @see #transliterate
 871      * @stable ICU 2.4
 872      */
 873     virtual void handleTransliterate(Replaceable& text,
 874                                      UTransPosition& pos,
 875                                      UBool incremental) const = 0;
 876
 877 public:
 878     /**
 879      * Transliterate a substring of text, as specified by index, taking filters
 880      * into account.  This method is for subclasses that need to delegate to
 881      * another transliterator.
 882      * @param text the text to be transliterated
 883      * @param index the position indices
 884      * @param incremental if TRUE, then assume more characters may be inserted
 885      * at index.limit, and postpone processing to accommodate future incoming
 886      * characters
 887      * @stable ICU 2.4
 888      */
 889     virtual void filteredTransliterate(Replaceable& text,
 890                                        UTransPosition& index,
 891                                        UBool incremental) const;
 892
 893 private:
 894
 895     /**
 896      * Top-level transliteration method, handling filtering, incremental and
 897      * non-incremental transliteration, and rollback.  All transliteration
 898      * public API methods eventually call this method with a rollback argument
 899      * of TRUE.  Other entities may call this method but rollback should be
 900      * FALSE.
 901      *
 902      * <p>If this transliterator has a filter, break up the input text into runs
 903      * of unfiltered characters.  Pass each run to
 904      * subclass.handleTransliterate().
 905      *
 906      * <p>In incremental mode, if rollback is TRUE, perform a special
 907      * incremental procedure in which several passes are made over the input
 908      * text, adding one character at a time, and committing successful
 909      * transliterations as they occur.  Unsuccessful transliterations are rolled
 910      * back and retried with additional characters to give correct results.
 911      *
 912      * @param text the text to be transliterated
 913      * @param index the position indices
 914      * @param incremental if TRUE, then assume more characters may be inserted
 915      * at index.limit, and postpone processing to accommodate future incoming
 916      * characters
 917      * @param rollback if TRUE and if incremental is TRUE, then perform special
 918      * incremental processing, as described above, and undo partial
 919      * transliterations where necessary.  If incremental is FALSE then this
 920      * parameter is ignored.
 921      */
 922     virtual void filteredTransliterate(Replaceable& text,
 923                                        UTransPosition& index,
 924                                        UBool incremental,
 925                                        UBool rollback) const;
 926
 927 public:
 928
 929     /**
 930      * Returns the length of the longest context required by this transliterator.
 931      * This is <em>preceding</em> context.  The default implementation supplied
 932      * by <code>Transliterator</code> returns zero; subclasses
 933      * that use preceding context should override this method to return the
 934      * correct value.  For example, if a transliterator translates "ddd" (where
 935      * d is any digit) to "555" when preceded by "(ddd)", then the preceding
 936      * context length is 5, the length of "(ddd)".
 937      *
 938      * @return The maximum number of preceding context characters this
 939      * transliterator needs to examine; see setMaximumContextLength()
 940      * @stable ICU 2.0
 941      */
 942     int32_t getMaximumContextLength(void) const;
 943
 944 protected:
 945
 946     /**
 947      * Method for subclasses to use to set the maximum context length.
 948      * @param maxContextLength the new maximum context length to be set.
 949      * @see #getMaximumContextLength
 950      * @stable ICU 2.4
 951      */
 952     void setMaximumContextLength(int32_t maxContextLength);
 953
 954 public:
 955
 956     /**
 957      * Returns a programmatic identifier for this transliterator.
 958      * If this identifier is passed to <code>createInstance()</code>, it
 959      * will return a transliterator for this ID, if one has been registered.
 960      * @return a programmatic identifier for this transliterator.
 961      * @see #registerInstance
 962      * @see #registerFactory
 963      * @see #getAvailableIDs
 964      * @stable ICU 2.0
 965      */
 966     virtual const UnicodeString& getID(void) const;
 967
 968     /**
 969      * Returns a name for the transliterator identified by <code>ID</code>
 970      * that is appropriate for display to the user in the default locale.  See {@link
 971      * #getDisplayName(const UnicodeString&, const Locale&, UnicodeString&) } for details.
 972      * @param ID     the string identifier for this transliterator
 973      * @param result Output param to receive the display name
 974      * @return       A reference to 'result'.
 975      * @stable ICU 2.0
 976      */
 977     static UnicodeString& U_EXPORT2 getDisplayName(const UnicodeString& ID,
 978                                          UnicodeString& result);
 979
 980     /**
 981      * Returns a name for the transliterator identified by <code>ID</code>
 982      * that is appropriate for display to the user in the given locale.
 983      * This name is taken from the locale resource data in the standard
 984      * manner of the <code>java.text</code> package.
 985      *
 986      * <p>If no localized names exist in the system resource bundles,
 987      * a name is synthesized using a localized
 988      * <code>MessageFormat</code> pattern from the resource data.  The
 989      * arguments to this pattern are an integer followed by one or two
 990      * strings.  The integer is the number of strings, either 1 or 2.
 991      * The strings are formed by splitting the ID for this
 992      * transliterator at the first '-'.  If there is no '-', then the
 993      * entire ID forms the only string.
 994      * @param ID       the string identifier for this transliterator
 995      * @param inLocale the Locale in which the display name should be
 996      *                 localized.
 997      * @param result   Output param to receive the display name
 998      * @return         A reference to 'result'.
 999      * @stable ICU 2.0
1000      */
1001     static UnicodeString& U_EXPORT2 getDisplayName(const UnicodeString& ID,
1002                                          const Locale& inLocale,
1003                                          UnicodeString& result);
1004
1005     /**
1006      * Returns the filter used by this transliterator, or <tt>NULL</tt>
1007      * if this transliterator uses no filter.  See orphanFilter() to take ownership.
1008      * @return the filter used by this transliterator, or <tt>NULL</tt>
1009      *         if this transliterator uses no filter.
1010      * @stable ICU 2.0
1011      */
1012     const UnicodeFilter* getFilter(void) const;
1013
1014     /**
1015      * Returns the filter used by this transliterator, or <tt>NULL</tt> if this
1016      * transliterator uses no filter.  The caller must eventually delete the
1017      * result.  After this call, this transliterator's filter is set to
1018      * <tt>NULL</tt>.
1019      * @return the filter used by this transliterator, or <tt>NULL</tt> if this
1020      *         transliterator uses no filter.  Ownership passes to the caller.
1021      * @stable ICU 2.4
1022      */
1023     UnicodeFilter* orphanFilter(void);
1024
1025     /**
1026      * Changes the filter used by this transliterator.  If the filter
1027      * is set to <tt>NULL</tt> then no filtering will occur.
1028      *
1029      * <p>Callers must take care if a transliterator is in use by
1030      * multiple threads.  The filter should not be changed by one
1031      * thread while another thread may be transliterating.
1032      * @param adoptedFilter the new filter to be adopted, or <tt>NULL</tt> for no filtering.
1033      * @stable ICU 2.0
1034      */
1035     void adoptFilter(UnicodeFilter* adoptedFilter);
1036
1037     /**
1038      * Returns this transliterator's inverse.  See the class
1039      * documentation for details.  This implementation simply inverts
1040      * the two entities in the ID and attempts to retrieve the
1041      * resulting transliterator.  That is, if <code>getID()</code>
1042      * returns "A-B", then this method will return the result of
1043      * <code>createInstance("B-A")</code>, or <code>null</code> if that
1044      * call fails.
1045      *
1046      * <p>Subclasses with knowledge of their inverse may wish to
1047      * override this method.
1048      *
1049      * @param status Output param to be filled in with a success or an error.
1050      * @return a transliterator that is an inverse, not necessarily
1051      * exact, of this transliterator, or <code>null</code> if no such
1052      * transliterator is registered.
1053      * @see #registerInstance
1054      * @stable ICU 2.0
1055      */
1056     Transliterator* createInverse(UErrorCode& status) const;
1057
1058     /**
1059      * Returns a <code>Transliterator</code> object given its ID.
1060      * The ID must be either a system transliterator ID or an ID registered
1061      * using <code>registerInstance()</code>.
1062      *
1063      * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
1064      * @param dir        either FORWARD or REVERSE.
1065      * @param parseError Struct to receive information on position
1066      *                   of error if an error is encountered
1067      * @param status     Output param to be filled in with a success or an error.
1068      * @return A <code>Transliterator</code> object with the given ID
1069      * @see #registerInstance
1070      * @see #getAvailableIDs
1071      * @see #getID
1072      * @stable ICU 2.0
1073      */
1074     static Transliterator* U_EXPORT2 createInstance(const UnicodeString& ID,
1075                                           UTransDirection dir,
1076                                           UParseError& parseError,
1077                                           UErrorCode& status);
1078
1079     /**
1080      * Returns a <code>Transliterator</code> object given its ID.
1081      * The ID must be either a system transliterator ID or an ID registered
1082      * using <code>registerInstance()</code>.
1083      * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
1084      * @param dir        either FORWARD or REVERSE.
1085      * @param status     Output param to be filled in with a success or an error.
1086      * @return A <code>Transliterator</code> object with the given ID
1087      * @stable ICU 2.0
1088      */
1089     static Transliterator* U_EXPORT2 createInstance(const UnicodeString& ID,
1090                                           UTransDirection dir,
1091                                           UErrorCode& status);
1092
1093     /**
1094      * Returns a <code>Transliterator</code> object constructed from
1095      * the given rule string.  This will be a rule-based Transliterator,
1096      * if the rule string contains only rules, or a
1097      * compound Transliterator, if it contains ID blocks, or a
1098      * null Transliterator, if it contains ID blocks which parse as
1099      * empty for the given direction.
1100      *
1101      * @param ID            the ID for the transliterator.
1102      * @param rules         rules, separated by ';'
1103      * @param dir           either FORWARD or REVERSE.
1104      * @param parseError    Struct to receive information on position
1105      *                      of error if an error is encountered
1106      * @param status        Output param set to success/failure code.
1107      * @return a newly created Transliterator
1108      * @stable ICU 2.0
1109      */
1110     static Transliterator* U_EXPORT2 createFromRules(const UnicodeString& ID,
1111                                            const UnicodeString& rules,
1112                                            UTransDirection dir,
1113                                            UParseError& parseError,
1114                                            UErrorCode& status);
1115
1116     /**
1117      * Create a rule string that can be passed to createFromRules()
1118      * to recreate this transliterator.
1119      * @param result the string to receive the rules.  Previous
1120      * contents will be deleted.
1121      * @param escapeUnprintable if TRUE then convert unprintable
1122      * characters to their hex escape representations, \\uxxxx or
1123      * \\Uxxxxxxxx.  Unprintable characters are those other than
1124      * U+000A, U+0020..U+007E.
1125      * @stable ICU 2.0
1126      */
1127     virtual UnicodeString& toRules(UnicodeString& result,
1128                                    UBool escapeUnprintable) const;
1129
1130     /**
1131      * Return the number of elements that make up this transliterator.
1132      * For example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
1133      * were created, the return value of this method would be 3.
1134      *
1135      * <p>If this transliterator is not composed of other
1136      * transliterators, then this method returns 1.  See also getElement().
1137      * @return the number of transliterators that compose this
1138      * transliterator, or 1 if this transliterator is not composed of
1139      * multiple transliterators
1140      * @stable ICU 3.0
1141      */
1142     int32_t countElements() const;
1143
1144     /**
1145      * Return an element that makes up this transliterator.  For
1146      * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
1147      * were created, the return value of this method would be one
1148      * of the three transliterator objects that make up that
1149      * transliterator: [NFD, Jamo-Latin, Latin-Greek].
1150      *
1151      * <p>If this transliterator is not composed of other
1152      * transliterators, then this method will return a reference to
1153      * this transliterator when given the index 0.
1154      * @param index a value from 0..countElements()-1 indicating the
1155      * transliterator to return
1156      * @param ec input-output error code; presumably set if index is out of range (confirm)
1157      * @return one of the transliterators that makes up this
1158      * transliterator, if this transliterator is made up of multiple
1159      * transliterators, otherwise a reference to this object if given
1160      * an index of 0
1161      * @stable ICU 3.0
1162      */
1163     const Transliterator& getElement(int32_t index, UErrorCode& ec) const;
1164
1165     /**
1166      * Returns the set of all characters that may be modified in the
1167      * input text by this Transliterator.  This incorporates this
1168      * object's current filter; if the filter is changed, the return
1169      * value of this function will change.  The default implementation
1170      * returns an empty set.  Some subclasses may override {@link
1171      * #handleGetSourceSet } to return a more precise result.  The
1172      * return result is approximate in any case and is intended for
1173      * use by tests, tools, or utilities.
1174      * @param result receives result set; previous contents lost
1175      * @return a reference to <code>result</code>
1176      * @see #getTargetSet
1177      * @see #handleGetSourceSet
1178      * @stable ICU 2.4
1179      */
1180     UnicodeSet& getSourceSet(UnicodeSet& result) const;
1181
1182     /**
1183      * Framework method that returns the set of all characters that
1184      * may be modified in the input text by this Transliterator,
1185      * ignoring the effect of this object's filter.  The base class
1186      * implementation returns the empty set.  Subclasses that wish to
1187      * implement this should override this method.
1188      * The characters that this transliterator may modify are written
1189      * into <code>result</code>; the method itself returns nothing, so
1190      * subclasses fill in the caller-supplied set instead of creating one.
1191      * @param result receives result set; previous contents lost
1192      * @see #getSourceSet
1193      * @see #getTargetSet
1194      * @stable ICU 2.4
1195      */
1196     virtual void handleGetSourceSet(UnicodeSet& result) const;
1197
1198     /**
1199      * Returns the set of all characters that may be generated as
1200      * replacement text by this transliterator.  The default
1201      * implementation returns the empty set.  Some subclasses may
1202      * override this method to return a more precise result.  The
1203      * return result is approximate in any case and is intended for
1204      * use by tests, tools, or utilities requiring such
1205      * meta-information.
1206      * @param result receives result set; previous contents lost
1207      * @return a reference to result
1208      * @see #getSourceSet
1209      * @stable ICU 2.4
1210      */
1211     virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
1212
1213 public:
1214
1215     /**
1216      * Registers a factory function that creates transliterators of
1217      * a given ID.
1218      *
1219      * <p>Because ICU may choose to cache Transliterators internally, this must
1220      * be called at application startup, prior to any calls to
1221      * Transliterator::createXXX to avoid undefined behavior.
1222      *
1223      * @param id the ID being registered
1224      * @param factory a function pointer that will be copied and
1225      * called later when the given ID is passed to createInstance()
1226      * @param context a context pointer that will be stored and
1227      * later passed to the factory function when an ID matching
1228      * the registration ID is being instantiated with this factory.
1229      * @stable ICU 2.0
1230      */
1231     static void U_EXPORT2 registerFactory(const UnicodeString& id,
1232                                 Factory factory,
1233                                 Token context);
1234
1235     /**
1236      * Registers an instance <tt>adoptedObj</tt> of a subclass of
1237      * <code>Transliterator</code> with the system.  When
1238      * <tt>createInstance()</tt> is called with an ID string that is
1239      * equal to <tt>adoptedObj->getID()</tt>, then <tt>adoptedObj->clone()</tt> is
1240      * returned.
1241      *
1242      * <p>After this call the Transliterator class owns the adoptedObj
1243      * and will delete it.
1244      *
1245      * <p>Because ICU may choose to cache Transliterators internally, this must
1246      * be called at application startup, prior to any calls to
1247      * Transliterator::createXXX to avoid undefined behavior.
1248      *
1249      * @param adoptedObj an instance of subclass of
1250      * <code>Transliterator</code> that defines <tt>clone()</tt>
1251      * @see #createInstance
1252      * @see #registerFactory
1253      * @see #unregister
1254      * @stable ICU 2.0
1255      */
1256     static void U_EXPORT2 registerInstance(Transliterator* adoptedObj);
1257
1258     /**
1259      * Registers an ID string as an alias of another ID string.
1260      * That is, after calling this function, <tt>createInstance(aliasID)</tt>
1261      * will return the same result as <tt>createInstance(realID)</tt>.
1262      * This is generally used to create shorter, more mnemonic aliases
1263      * for long compound IDs.
1264      *
1265      * @param aliasID The new ID being registered.
1266      * @param realID The ID that the new ID is to be an alias for.
1267      * This can be a compound ID and can include filters and should
1268      * refer to transliterators that have already been registered with
1269      * the framework, although this isn't checked.
1270      * @stable ICU 3.6
1271      */
1272      static void U_EXPORT2 registerAlias(const UnicodeString& aliasID,
1273                                          const UnicodeString& realID);
1274
1275 protected:
1276
1277 #ifndef U_HIDE_INTERNAL_API
1278     /** Internal counterpart of registerFactory(); takes the same arguments.
1279      * @param id the ID being registered
1280      * @param factory a function pointer that will be copied and
1281      * called later when the given ID is passed to createInstance()
1282      * @param context a context pointer that will be stored and
1283      * later passed to the factory function when an ID matching
1284      * the registration ID is being instantiated with this factory.
1285      * @internal
1286      */
1287     static void _registerFactory(const UnicodeString& id,
1288                                  Factory factory,
1289                                  Token context);
1290
1291     /** Internal counterpart of registerInstance(); takes the same argument.
1292      * @internal
1293      */
1294     static void _registerInstance(Transliterator* adoptedObj);
1295
1296     /** Internal counterpart of registerAlias(); takes the same arguments.
1297      * @internal
1298      */
1299     static void _registerAlias(const UnicodeString& aliasID, const UnicodeString& realID);
1300
1301     /**
1302      * Register two targets as being inverses of one another.  For
1303      * example, calling _registerSpecialInverse("NFC", "NFD", true) causes
1304      * Transliterator to form the following inverse relationships:
1305      *
1306      * <pre>NFC => NFD
1307      * Any-NFC => Any-NFD
1308      * NFD => NFC
1309      * Any-NFD => Any-NFC</pre>
1310      *
1311      * (Without the special inverse registration, the inverse of NFC
1312      * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
1313      * that the presence or absence of "Any-" is preserved.
1314      *
1315      * <p>The relationship is symmetrical; registering (a, b) is
1316      * equivalent to registering (b, a).
1317      *
1318      * <p>The relevant IDs must still be registered separately as
1319      * factories or classes.
1320      *
1321      * <p>Only the targets are specified.  Special inverses always
1322      * have the form Any-Target1 <=> Any-Target2.  The target should
1323      * have canonical casing (the casing desired to be produced when
1324      * an inverse is formed) and should contain no whitespace or other
1325      * extraneous characters.
1326      *
1327      * @param target the target against which to register the inverse
1328      * @param inverseTarget the inverse of target, that is
1329      * Any-target.getInverse() => Any-inverseTarget
1330      * @param bidirectional if true, register the reverse relation
1331      * as well, that is, Any-inverseTarget.getInverse() => Any-target
1332      * @internal
1333      */
1334     static void _registerSpecialInverse(const UnicodeString& target,
1335                                         const UnicodeString& inverseTarget,
1336                                         UBool bidirectional);
1337 #endif  /* U_HIDE_INTERNAL_API */
1338
1339 public:
1340
1341     /**
1342      * Unregisters a transliterator or class.  This may be either
1343      * a system transliterator or a user transliterator or class.
1344      * Any attempt to construct an unregistered transliterator based
1345      * on its ID will fail.
1346      *
1347      * Because ICU may choose to cache Transliterators internally, this should
1348      * be called during application shutdown, after all calls to
1349      * Transliterator::createXXX to avoid undefined behavior.
1350      *
1351      * @param ID the ID of the transliterator or class.  Nothing is returned:
1352      * the function is declared void, so whatever was registered under
1353      * <code>ID</code> (if anything) is not handed back to the caller.
1354      * @see #registerInstance
1355      * @see #registerFactory
1356      * @stable ICU 2.0
1357      */
1358     static void U_EXPORT2 unregister(const UnicodeString& ID);
1359
1360 public:
1361
1362     /**
1363      * Return a StringEnumeration over the IDs available at the time of the
1364      * call, including user-registered IDs.
1365      * @param ec input-output error code
1366      * @return a newly-created StringEnumeration over the transliterator IDs
1367      * available at the time of the call. The caller owns this object and
1368      * should delete it when done using it.
1369      * @stable ICU 3.0
1370      */
1371     static StringEnumeration* U_EXPORT2 getAvailableIDs(UErrorCode& ec);
1372
1373     /**
1374      * Return the number of registered source specifiers (the n used by getAvailableSource()).
1375      * @return the number of registered source specifiers.
1376      * @stable ICU 2.0
1377      */
1378     static int32_t U_EXPORT2 countAvailableSources(void);
1379
1380     /**
1381      * Return a registered source specifier.
1382      * @param index which specifier to return, from 0 to n-1, where
1383      * n = countAvailableSources()
1384      * @param result fill-in parameter to receive the source specifier.
1385      * If index is out of range, result will be empty.
1386      * @return reference to result
1387      * @stable ICU 2.0
1388      */
1389     static UnicodeString& U_EXPORT2 getAvailableSource(int32_t index,
1390                                              UnicodeString& result);
1391
1392     /**
1393      * Return the number of registered target specifiers for a given
1394      * source specifier (the n used by getAvailableTarget()).
1395      * @param source the given source specifier.
1396      * @return the number of registered target specifiers for the given
1397      *         source specifier.
1398      * @stable ICU 2.0
1399      */
1400     static int32_t U_EXPORT2 countAvailableTargets(const UnicodeString& source);
1401
1402     /**
1403      * Return a registered target specifier for a given source.
1404      * @param index which specifier to return, from 0 to n-1, where
1405      * n = countAvailableTargets(source)
1406      * @param source the source specifier
1407      * @param result fill-in parameter to receive the target specifier.
1408      * If source is invalid or if index is out of range, result will
1409      * be empty.
1410      * @return reference to result
1411      * @stable ICU 2.0
1412      */
1413     static UnicodeString& U_EXPORT2 getAvailableTarget(int32_t index,
1414                                              const UnicodeString& source,
1415                                              UnicodeString& result);
1416
1417     /**
1418      * Return the number of registered variant specifiers for a given source-target pair.
1419      * @param source    the source specifier.
1420      * @param target    the target specifier.
1421      * @return the number of registered variant specifiers for the pair.
1422      * @stable ICU 2.0
1423      */
1424     static int32_t U_EXPORT2 countAvailableVariants(const UnicodeString& source,
1425                                           const UnicodeString& target);
1426
1427     /**
1428      * Return a registered variant specifier for a given source-target
1429      * pair.
1430      * @param index which specifier to return, from 0 to n-1, where
1431      * n = countAvailableVariants(source, target)
1432      * @param source the source specifier
1433      * @param target the target specifier
1434      * @param result fill-in parameter to receive the variant
1435      * specifier.  If source is invalid or if target is invalid or if
1436      * index is out of range, result will be empty.
1437      * @return reference to result
1438      * @stable ICU 2.0
1439      */
1440     static UnicodeString& U_EXPORT2 getAvailableVariant(int32_t index,
1441                                               const UnicodeString& source,
1442                                               const UnicodeString& target,
1443                                               UnicodeString& result);
1444
1445 protected:
1446
1447 #ifndef U_HIDE_INTERNAL_API
1448     /**
1449      * Non-mutexed internal variant of countAvailableSources() (pairing inferred from the name).
1450      * @internal
1451      */
1452     static int32_t _countAvailableSources(void);
1453
1454     /**
1455      * Non-mutexed internal variant of getAvailableSource() (pairing inferred from the name).
1456      * @internal
1457      */
1458     static UnicodeString& _getAvailableSource(int32_t index,
1459                                               UnicodeString& result);
1460
1461     /**
1462      * Non-mutexed internal variant of countAvailableTargets() (pairing inferred from the name).
1463      * @internal
1464      */
1465     static int32_t _countAvailableTargets(const UnicodeString& source);
1466
1467     /**
1468      * Non-mutexed internal variant of getAvailableTarget() (pairing inferred from the name).
1469      * @internal
1470      */
1471     static UnicodeString& _getAvailableTarget(int32_t index,
1472                                               const UnicodeString& source,
1473                                               UnicodeString& result);
1474
1475     /**
1476      * Non-mutexed internal variant of countAvailableVariants() (pairing inferred from the name).
1477      * @internal
1478      */
1479     static int32_t _countAvailableVariants(const UnicodeString& source,
1480                                            const UnicodeString& target);
1481
1482     /**
1483      * Non-mutexed internal variant of getAvailableVariant() (pairing inferred from the name).
1484      * @internal
1485      */
1486     static UnicodeString& _getAvailableVariant(int32_t index,
1487                                                const UnicodeString& source,
1488                                                const UnicodeString& target,
1489                                                UnicodeString& result);
1490 #endif  /* U_HIDE_INTERNAL_API */
1491
1492 protected:
1493
1494     /**
1495      * Set the ID of this transliterator.  Subclasses shouldn't do
1496      * this, unless the underlying script behavior has changed.
1497      * @param id the new id to be set.
1498      * @stable ICU 2.4
1499      */
1500     void setID(const UnicodeString& id);
1501
1502 public:
1503
1504     /**
1505      * Return the class ID for this class.  This is useful only for
1506      * comparing to a return value from getDynamicClassID().
1507      * Note that Transliterator is an abstract base class, and therefore
1508      * no fully constructed object will have a dynamic
1509      * UClassID that equals the UClassID returned from
1510      * Transliterator::getStaticClassID().
1511      * @return       The class ID for class Transliterator.
1512      * @stable ICU 2.0
1513      */
1514     static UClassID U_EXPORT2 getStaticClassID(void);
1515
1516     /**
1517      * Returns a unique class ID <b>polymorphically</b>.  This method
1518      * exists to implement a simple version of RTTI, since not all C++
1519      * compilers support genuine RTTI.  Polymorphic operator==() and
1520      * clone() methods call this method.
1521      *
1522      * <p>The function is declared pure virtual here.  Concrete subclasses
1523      *    of Transliterator must use the UOBJECT_DEFINE_RTTI_IMPLEMENTATION
1524      *    macro from uobject.h to provide the RTTI functions.
1525      *
1526      * @return The class ID for this object. All objects of a given
1527      * class have the same class ID.  Objects of other classes have
1528      * different class IDs.
1529      * @stable ICU 2.0
1530      */
1531     virtual UClassID getDynamicClassID(void) const = 0;
1532
1533 private:
1534     static UBool initializeRegistry(UErrorCode &status);  // NOTE(review): presumably sets up the registry behind the (un)register/getAvailable* functions -- confirm in translit.cpp
1535
1536 public:
1537 #ifndef U_HIDE_OBSOLETE_API
1538     /**
1539      * Return the number of IDs currently registered with the system.
1540      * To retrieve the actual IDs, call getAvailableID(i) with
1541      * i from 0 to countAvailableIDs() - 1, inclusive.
1542      * @return the number of IDs currently registered with the system.
1543      * @obsolete ICU 3.4 use getAvailableIDs() instead
1544      */
1545     static int32_t U_EXPORT2 countAvailableIDs(void);
1546
1547     /**
1548      * Return the index-th available ID.  index must be between 0
1549      * and countAvailableIDs() - 1, inclusive.  If index is out of
1550      * range, the result of getAvailableID(0) is returned.
1551      * @param index the given ID index, from 0 to countAvailableIDs() - 1.
1552      * @return      a reference to the index-th available ID, or to the
1553      *              result of getAvailableID(0) if index is out of range.
1554      *              The referenced storage can be invalidated; see below.
1555      * @obsolete ICU 3.4 use getAvailableIDs() instead; this function
1556      * is not thread safe, since it returns a reference to storage that
1557      * may become invalid if another thread calls unregister
1558      */
1559     static const UnicodeString& U_EXPORT2 getAvailableID(int32_t index);
1560 #endif  /* U_HIDE_OBSOLETE_API */
1561 };
1562
1563 inline int32_t Transliterator::getMaximumContextLength(void) const {
1564     return this->maximumContextLength;  // plain read of the stored value; no computation
1565 }
1566
1567 inline void Transliterator::setID(const UnicodeString& id) {
1568     ID = id;
1569     // NUL-terminate the ID string (a non-aliased copy): append a NUL, then drop it from the length.
1570     ID.append(static_cast<char16_t>(0));
1571     ID.truncate(ID.length() - 1);
1572 }
1573
1574 #ifndef U_HIDE_INTERNAL_API
1575 inline Transliterator::Token Transliterator::integerToken(int32_t i) {
1576     Token wrapped;          // fresh Token; only .integer is assigned here
1577     wrapped.integer = i;
1578     return wrapped;
1579 }
1580
1581 inline Transliterator::Token Transliterator::pointerToken(void* p) {
1582     Token wrapped;          // fresh Token; only .pointer is assigned here
1583     wrapped.pointer = p;
1584     return wrapped;
1585 }
1586 #endif  /* U_HIDE_INTERNAL_API */
1587
1588 U_NAMESPACE_END
1589 #endif // U_SHOW_CPLUSPLUS_API
1590
1591 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
1592
1593 #endif