]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/translit.cpp
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / translit.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f 3/*
46f4442e 4 **********************************************************************
2ca993e8 5 * Copyright (C) 1999-2016, International Business Machines
46f4442e
A
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 11/17/99 aliu Creation.
10 **********************************************************************
11 */
b75a7d8f 12
51004dcb 13#include "utypeinfo.h" // for 'typeid' to work
729e4ab9 14
b75a7d8f
A
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_TRANSLITERATION
18
19#include "unicode/putil.h"
20#include "unicode/translit.h"
21#include "unicode/locid.h"
22#include "unicode/msgfmt.h"
23#include "unicode/rep.h"
24#include "unicode/resbund.h"
25#include "unicode/unifilt.h"
b75a7d8f
A
26#include "unicode/uniset.h"
27#include "unicode/uscript.h"
374ca955 28#include "unicode/strenum.h"
4388f060 29#include "unicode/utf16.h"
b75a7d8f
A
30#include "cpdtrans.h"
31#include "nultrans.h"
32#include "rbt_data.h"
33#include "rbt_pars.h"
34#include "rbt.h"
35#include "transreg.h"
36#include "name2uni.h"
37#include "nortrans.h"
38#include "remtrans.h"
39#include "titletrn.h"
40#include "tolowtrn.h"
41#include "toupptrn.h"
42#include "uni2name.h"
46f4442e 43#include "brktrans.h"
b75a7d8f
A
44#include "esctrn.h"
45#include "unesctrn.h"
46#include "tridpars.h"
47#include "anytrans.h"
48#include "util.h"
49#include "hash.h"
50#include "mutex.h"
51#include "ucln_in.h"
52#include "uassert.h"
53#include "cmemory.h"
54#include "cstring.h"
73c04bcf 55#include "uinvchar.h"
b75a7d8f
A
56
// Separator characters used when composing transliterator IDs of the form
// "Source-Target/Variant". getDisplayName() in this file joins source and
// target with TARGET_SEP and prefixes a non-empty variant with VARIANT_SEP.
// NOTE(review): ID_DELIM (';') is not referenced in the part of the file
// reviewed here; presumably it separates elements of a compound ID -- confirm.
57static const UChar TARGET_SEP = 0x002D; /*-*/
58static const UChar ID_DELIM = 0x003B; /*;*/
59static const UChar VARIANT_SEP = 0x002F; // '/'
60
61/**
62 * Prefix for resource bundle key for the display name for a
63 * transliterator. The normalized ID ("Source-Target/Variant") is
64 * appended to this to form the key.
65 * The resource bundle value should be a String.
65 */
66static const char RB_DISPLAY_NAME_PREFIX[] = "%Translit%%";
67
68/**
69 * Prefix for resource bundle key for the display name for a
70 * transliterator SCRIPT. The script name (the source or the target
71 * part of the ID) is appended to this to form the key.
71 * The resource bundle value should be a String.
72 */
73static const char RB_SCRIPT_DISPLAY_NAME_PREFIX[] = "%Translit%";
74
75/**
76 * Resource bundle key for display name pattern.
77 * The resource bundle value should be a String forming a
78 * MessageFormat pattern, e.g.:
79 * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
79 * getDisplayName() formats it with the argument count followed by the
79 * source and target names.
80 */
81static const char RB_DISPLAY_NAME_PATTERN[] = "TransliteratorNamePattern";
82
83/**
84 * Resource bundle key for the list of RuleBasedTransliterator IDs.
85 * The resource bundle value should be a String[] with each element
86 * being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
87 * to obtain the class name in which the RB_RULE key will be sought.
88 */
89static const char RB_RULE_BASED_IDS[] = "RuleBasedTransliteratorIDs";
90
91/**
92 * The mutex controlling access to registry object.
93 */
3d1f044b
A
94static icu::UMutex *registryMutex() {
95 static icu::UMutex *m = STATIC_NEW(icu::UMutex);
96 return m;
97}
b75a7d8f
A
98
99/**
100 * System transliterator registry; non-null when initialized.
101 */
// Starts out null; HAVE_REGISTRY() below triggers initializeRegistry()
// for as long as this pointer is still null.
4388f060 102static icu::TransliteratorRegistry* registry = 0;
b75a7d8f
A
103
104// Macro to check/initialize the registry. ONLY USE WITHIN
105// MUTEX. Avoids function call when registry is initialized.
// Because of the short-circuit ||, initializeRegistry(status) is evaluated
// only when `registry` is null; the macro's value is then whatever that
// call returns.
46f4442e 106#define HAVE_REGISTRY(status) (registry!=0 || initializeRegistry(status))
b75a7d8f 107
b75a7d8f
A
108U_NAMESPACE_BEGIN
109
374ca955 110UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(Transliterator)
b75a7d8f
A
111
112/**
113 * Return TRUE if the given UTransPosition is valid for text of
114 * the given length.
115 */
73c04bcf 116static inline UBool positionIsValid(UTransPosition& index, int32_t len) {
b75a7d8f
A
117 return !(index.contextStart < 0 ||
118 index.start < index.contextStart ||
119 index.limit < index.start ||
120 index.contextLimit < index.limit ||
121 len < index.contextLimit);
122}
123
124/**
125 * Default constructor.
126 * @param theID the string identifier for this transliterator
127 * @param theFilter the filter. Any character for which
128 * <tt>filter.contains()</tt> returns <tt>FALSE</tt> will not be
129 * altered by this transliterator. If <tt>filter</tt> is
130 * <tt>null</tt> then no filtering is applied.
131 */
132Transliterator::Transliterator(const UnicodeString& theID,
133 UnicodeFilter* adoptedFilter) :
134 UObject(), ID(theID), filter(adoptedFilter),
73c04bcf
A
135 maximumContextLength(0)
136{
137 // NUL-terminate the ID string, which is a non-aliased copy.
138 ID.append((UChar)0);
139 ID.truncate(ID.length()-1);
374ca955 140}
b75a7d8f
A
141
142/**
143 * Destructor.
144 */
145Transliterator::~Transliterator() {
73c04bcf
A
146 if (filter) {
147 delete filter;
148 }
b75a7d8f
A
149}
150
151/**
152 * Copy constructor.
153 */
154Transliterator::Transliterator(const Transliterator& other) :
155 UObject(other), ID(other.ID), filter(0),
73c04bcf
A
156 maximumContextLength(other.maximumContextLength)
157{
158 // NUL-terminate the ID string, which is a non-aliased copy.
159 ID.append((UChar)0);
160 ID.truncate(ID.length()-1);
374ca955 161
b75a7d8f
A
162 if (other.filter != 0) {
163 // We own the filter, so we must have our own copy
164 filter = (UnicodeFilter*) other.filter->clone();
165 }
166}
167
73c04bcf
A
168Transliterator* Transliterator::clone() const {
169 return NULL;
170}
171
b75a7d8f
A
172/**
173 * Assignment operator.
174 */
175Transliterator& Transliterator::operator=(const Transliterator& other) {
176 ID = other.ID;
374ca955
A
177 // NUL-terminate the ID string
178 ID.getTerminatedBuffer();
179
b75a7d8f
A
180 maximumContextLength = other.maximumContextLength;
181 adoptFilter((other.filter == 0) ? 0 : (UnicodeFilter*) other.filter->clone());
182 return *this;
183}
184
185/**
186 * Transliterates a segment of a string. <code>Transliterator</code> API.
187 * @param text the string to be transliterated
188 * @param start the beginning index, inclusive; <code>0 <= start
189 * <= limit</code>.
190 * @param limit the ending index, exclusive; <code>start <= limit
191 * <= text.length()</code>.
192 * @return the new limit index, or -1
193 */
194int32_t Transliterator::transliterate(Replaceable& text,
195 int32_t start, int32_t limit) const {
196 if (start < 0 ||
197 limit < start ||
198 text.length() < limit) {
199 return -1;
200 }
201
202 UTransPosition offsets;
203 offsets.contextStart= start;
204 offsets.contextLimit = limit;
205 offsets.start = start;
206 offsets.limit = limit;
207 filteredTransliterate(text, offsets, FALSE, TRUE);
208 return offsets.limit;
209}
210
211/**
212 * Transliterates an entire string in place. Convenience method.
213 * @param text the string to be transliterated
214 */
215void Transliterator::transliterate(Replaceable& text) const {
216 transliterate(text, 0, text.length());
217}
218
219/**
220 * Transliterates the portion of the text buffer that can be
221 * transliterated unambiguosly after new text has been inserted,
222 * typically as a result of a keyboard event. The new text in
223 * <code>insertion</code> will be inserted into <code>text</code>
224 * at <code>index.contextLimit</code>, advancing
225 * <code>index.contextLimit</code> by <code>insertion.length()</code>.
226 * Then the transliterator will try to transliterate characters of
227 * <code>text</code> between <code>index.start</code> and
228 * <code>index.contextLimit</code>. Characters before
229 * <code>index.start</code> will not be changed.
230 *
231 * <p>Upon return, values in <code>index</code> will be updated.
232 * <code>index.contextStart</code> will be advanced to the first
233 * character that future calls to this method will read.
234 * <code>index.start</code> and <code>index.contextLimit</code> will
235 * be adjusted to delimit the range of text that future calls to
236 * this method may change.
237 *
238 * <p>Typical usage of this method begins with an initial call
239 * with <code>index.contextStart</code> and <code>index.contextLimit</code>
240 * set to indicate the portion of <code>text</code> to be
241 * transliterated, and <code>index.start == index.contextStart</code>.
242 * Thereafter, <code>index</code> can be used without
243 * modification in future calls, provided that all changes to
244 * <code>text</code> are made via this method.
245 *
246 * <p>This method assumes that future calls may be made that will
247 * insert new text into the buffer. As a result, it only performs
248 * unambiguous transliterations. After the last call to this
249 * method, there may be untransliterated text that is waiting for
250 * more input to resolve an ambiguity. In order to perform these
251 * pending transliterations, clients should call {@link
252 * #finishKeyboardTransliteration} after the last call to this
253 * method has been made.
254 *
255 * @param text the buffer holding transliterated and untransliterated text
256 * @param index an array of three integers.
257 *
258 * <ul><li><code>index.contextStart</code>: the beginning index,
259 * inclusive; <code>0 <= index.contextStart <= index.contextLimit</code>.
260 *
261 * <li><code>index.contextLimit</code>: the ending index, exclusive;
262 * <code>index.contextStart <= index.contextLimit <= text.length()</code>.
263 * <code>insertion</code> is inserted at
264 * <code>index.contextLimit</code>.
265 *
266 * <li><code>index.start</code>: the next character to be
267 * considered for transliteration; <code>index.contextStart <=
268 * index.start <= index.contextLimit</code>. Characters before
269 * <code>index.start</code> will not be changed by future calls
270 * to this method.</ul>
271 *
272 * @param insertion text to be inserted and possibly
273 * transliterated into the translation buffer at
274 * <code>index.contextLimit</code>. If <code>null</code> then no text
275 * is inserted.
276 * @see #START
277 * @see #LIMIT
278 * @see #CURSOR
279 * @see #handleTransliterate
280 * @exception IllegalArgumentException if <code>index</code>
281 * is invalid
282 */
283void Transliterator::transliterate(Replaceable& text,
284 UTransPosition& index,
285 const UnicodeString& insertion,
286 UErrorCode &status) const {
287 _transliterate(text, index, &insertion, status);
288}
289
290/**
291 * Transliterates the portion of the text buffer that can be
292 * transliterated unambiguosly after a new character has been
293 * inserted, typically as a result of a keyboard event. This is a
294 * convenience method; see {@link
295 * #transliterate(Replaceable, int[], String)} for details.
296 * @param text the buffer holding transliterated and
297 * untransliterated text
298 * @param index an array of three integers. See {@link
299 * #transliterate(Replaceable, int[], String)}.
300 * @param insertion text to be inserted and possibly
301 * transliterated into the translation buffer at
302 * <code>index.contextLimit</code>.
303 * @see #transliterate(Replaceable, int[], String)
304 */
305void Transliterator::transliterate(Replaceable& text,
306 UTransPosition& index,
307 UChar32 insertion,
308 UErrorCode& status) const {
309 UnicodeString str(insertion);
310 _transliterate(text, index, &str, status);
311}
312
313/**
314 * Transliterates the portion of the text buffer that can be
315 * transliterated unambiguosly. This is a convenience method; see
316 * {@link #transliterate(Replaceable, int[], String)} for
317 * details.
318 * @param text the buffer holding transliterated and
319 * untransliterated text
320 * @param index an array of three integers. See {@link
321 * #transliterate(Replaceable, int[], String)}.
322 * @see #transliterate(Replaceable, int[], String)
323 */
324void Transliterator::transliterate(Replaceable& text,
325 UTransPosition& index,
326 UErrorCode& status) const {
327 _transliterate(text, index, 0, status);
328}
329
330/**
331 * Finishes any pending transliterations that were waiting for
332 * more characters. Clients should call this method as the last
333 * call after a sequence of one or more calls to
334 * <code>transliterate()</code>.
335 * @param text the buffer holding transliterated and
336 * untransliterated text.
337 * @param index the array of indices previously passed to {@link
338 * #transliterate}
339 */
340void Transliterator::finishTransliteration(Replaceable& text,
341 UTransPosition& index) const {
342 if (!positionIsValid(index, text.length())) {
343 return;
344 }
345
346 filteredTransliterate(text, index, FALSE, TRUE);
347}
348
349/**
350 * This internal method does keyboard transliteration. If the
351 * 'insertion' is non-null then we append it to 'text' before
352 * proceeding. This method calls through to the pure virtual
353 * framework method handleTransliterate() to do the actual
354 * work.
355 */
356void Transliterator::_transliterate(Replaceable& text,
357 UTransPosition& index,
358 const UnicodeString* insertion,
359 UErrorCode &status) const {
360 if (U_FAILURE(status)) {
361 return;
362 }
363
364 if (!positionIsValid(index, text.length())) {
365 status = U_ILLEGAL_ARGUMENT_ERROR;
366 return;
367 }
368
369// int32_t originalStart = index.contextStart;
370 if (insertion != 0) {
371 text.handleReplaceBetween(index.limit, index.limit, *insertion);
372 index.limit += insertion->length();
373 index.contextLimit += insertion->length();
374 }
375
376 if (index.limit > 0 &&
4388f060 377 U16_IS_LEAD(text.charAt(index.limit - 1))) {
b75a7d8f
A
378 // Oops, there is a dangling lead surrogate in the buffer.
379 // This will break most transliterators, since they will
380 // assume it is part of a pair. Don't transliterate until
381 // more text comes in.
382 return;
383 }
384
385 filteredTransliterate(text, index, TRUE, TRUE);
386
387#if 0
388 // TODO
389 // I CAN'T DO what I'm attempting below now that the Kleene star
390 // operator is supported. For example, in the rule
391
392 // ([:Lu:]+) { x } > $1;
393
394 // what is the maximum context length? getMaximumContextLength()
395 // will return 1, but this is just the length of the ante context
396 // part of the pattern string -- 1 character, which is a standin
397 // for a Quantifier, which contains a StringMatcher, which
398 // contains a UnicodeSet.
399
400 // There is a complicated way to make this work again, and that's
401 // to add a "maximum left context" protocol into the
402 // UnicodeMatcher hierarchy. At present I'm not convinced this is
403 // worth it.
404
405 // ---
406
407 // The purpose of the code below is to keep the context small
408 // while doing incremental transliteration. When part of the left
409 // context (between contextStart and start) is no longer needed,
410 // we try to advance contextStart past that portion. We use the
411 // maximum context length to do so.
412 int32_t newCS = index.start;
413 int32_t n = getMaximumContextLength();
414 while (newCS > originalStart && n-- > 0) {
415 --newCS;
4388f060 416 newCS -= U16_LENGTH(text.char32At(newCS)) - 1;
b75a7d8f
A
417 }
418 index.contextStart = uprv_max(newCS, originalStart);
419#endif
420}
421
422/**
423 * This method breaks up the input text into runs of unfiltered
424 * characters. It passes each such run to
425 * <subclass>.handleTransliterate(). Subclasses that can handle the
426 * filter logic more efficiently themselves may override this method.
427 *
428 * All transliteration calls in this class go through this method.
429 */
430void Transliterator::filteredTransliterate(Replaceable& text,
431 UTransPosition& index,
432 UBool incremental,
433 UBool rollback) const {
434 // Short circuit path for transliterators with no filter in
435 // non-incremental mode.
436 if (filter == 0 && !rollback) {
437 handleTransliterate(text, index, incremental);
438 return;
439 }
440
441 //----------------------------------------------------------------------
442 // This method processes text in two groupings:
443 //
444 // RUNS -- A run is a contiguous group of characters which are contained
445 // in the filter for this transliterator (filter.contains(ch) == TRUE).
446 // Text outside of runs may appear as context but it is not modified.
447 // The start and limit Position values are narrowed to each run.
448 //
449 // PASSES (incremental only) -- To make incremental mode work correctly,
450 // each run is broken up into n passes, where n is the length (in code
451 // points) of the run. Each pass contains the first n characters. If a
452 // pass is completely transliterated, it is committed, and further passes
453 // include characters after the committed text. If a pass is blocked,
454 // and does not transliterate completely, then this method rolls back
455 // the changes made during the pass, extends the pass by one code point,
456 // and tries again.
457 //----------------------------------------------------------------------
458
459 // globalLimit is the limit value for the entire operation. We
460 // set index.limit to the end of each unfiltered run before
461 // calling handleTransliterate(), so we need to maintain the real
462 // value of index.limit here. After each transliteration, we
463 // update globalLimit for insertions or deletions that have
464 // happened.
465 int32_t globalLimit = index.limit;
466
467 // If there is a non-null filter, then break the input text up. Say the
468 // input text has the form:
469 // xxxabcxxdefxx
470 // where 'x' represents a filtered character (filter.contains('x') ==
471 // false). Then we break this up into:
472 // xxxabc xxdef xx
473 // Each pass through the loop consumes a run of filtered
474 // characters (which are ignored) and a subsequent run of
475 // unfiltered characters (which are transliterated).
476
477 for (;;) {
478
479 if (filter != NULL) {
480 // Narrow the range to be transliterated to the first segment
481 // of unfiltered characters at or after index.start.
482
483 // Advance past filtered chars
484 UChar32 c;
485 while (index.start < globalLimit &&
486 !filter->contains(c=text.char32At(index.start))) {
4388f060 487 index.start += U16_LENGTH(c);
b75a7d8f
A
488 }
489
490 // Find the end of this run of unfiltered chars
491 index.limit = index.start;
492 while (index.limit < globalLimit &&
493 filter->contains(c=text.char32At(index.limit))) {
4388f060 494 index.limit += U16_LENGTH(c);
b75a7d8f
A
495 }
496 }
497
498 // Check to see if the unfiltered run is empty. This only
499 // happens at the end of the string when all the remaining
500 // characters are filtered.
501 if (index.limit == index.start) {
502 // assert(index.start == globalLimit);
503 break;
504 }
505
506 // Is this run incremental? If there is additional
507 // filtered text (if limit < globalLimit) then we pass in
508 // an incremental value of FALSE to force the subclass to
509 // complete the transliteration for this run.
510 UBool isIncrementalRun =
511 (index.limit < globalLimit ? FALSE : incremental);
512
513 int32_t delta;
514
515 // Implement rollback. To understand the need for rollback,
516 // consider the following transliterator:
517 //
518 // "t" is "a > A;"
519 // "u" is "A > b;"
520 // "v" is a compound of "t; NFD; u" with a filter [:Ll:]
521 //
522 // Now apply "c" to the input text "a". The result is "b". But if
523 // the transliteration is done incrementally, then the NFD holds
524 // things up after "t" has already transformed "a" to "A". When
525 // finishTransliterate() is called, "A" is _not_ processed because
526 // it gets excluded by the [:Ll:] filter, and the end result is "A"
527 // -- incorrect. The problem is that the filter is applied to a
528 // partially-transliterated result, when we only want it to apply to
529 // input text. Although this example hinges on a compound
530 // transliterator containing NFD and a specific filter, it can
531 // actually happen with any transliterator which may do a partial
532 // transformation in incremental mode into characters outside its
533 // filter.
534 //
535 // To handle this, when in incremental mode we supply characters to
536 // handleTransliterate() in several passes. Each pass adds one more
537 // input character to the input text. That is, for input "ABCD", we
538 // first try "A", then "AB", then "ABC", and finally "ABCD". If at
539 // any point we block (upon return, start < limit) then we roll
540 // back. If at any point we complete the run (upon return start ==
541 // limit) then we commit that run.
542
543 if (rollback && isIncrementalRun) {
544
545 int32_t runStart = index.start;
546 int32_t runLimit = index.limit;
547 int32_t runLength = runLimit - runStart;
548
549 // Make a rollback copy at the end of the string
550 int32_t rollbackOrigin = text.length();
551 text.copy(runStart, runLimit, rollbackOrigin);
552
553 // Variables reflecting the commitment of completely
554 // transliterated text. passStart is the runStart, advanced
555 // past committed text. rollbackStart is the rollbackOrigin,
556 // advanced past rollback text that corresponds to committed
557 // text.
558 int32_t passStart = runStart;
559 int32_t rollbackStart = rollbackOrigin;
560
561 // The limit for each pass; we advance by one code point with
562 // each iteration.
563 int32_t passLimit = index.start;
564
565 // Total length, in 16-bit code units, of uncommitted text.
566 // This is the length to be rolled back.
567 int32_t uncommittedLength = 0;
568
569 // Total delta (change in length) for all passes
570 int32_t totalDelta = 0;
571
572 // PASS MAIN LOOP -- Start with a single character, and extend
573 // the text by one character at a time. Roll back partial
574 // transliterations and commit complete transliterations.
575 for (;;) {
576 // Length of additional code point, either one or two
4388f060 577 int32_t charLength = U16_LENGTH(text.char32At(passLimit));
b75a7d8f
A
578 passLimit += charLength;
579 if (passLimit > runLimit) {
580 break;
581 }
582 uncommittedLength += charLength;
583
584 index.limit = passLimit;
585
586 // Delegate to subclass for actual transliteration. Upon
587 // return, start will be updated to point after the
588 // transliterated text, and limit and contextLimit will be
589 // adjusted for length changes.
590 handleTransliterate(text, index, TRUE);
591
592 delta = index.limit - passLimit; // change in length
593
594 // We failed to completely transliterate this pass.
595 // Roll back the text. Indices remain unchanged; reset
596 // them where necessary.
597 if (index.start != index.limit) {
598 // Find the rollbackStart, adjusted for length changes
599 // and the deletion of partially transliterated text.
600 int32_t rs = rollbackStart + delta - (index.limit - passStart);
601
602 // Delete the partially transliterated text
4388f060 603 text.handleReplaceBetween(passStart, index.limit, UnicodeString());
b75a7d8f
A
604
605 // Copy the rollback text back
606 text.copy(rs, rs + uncommittedLength, passStart);
607
608 // Restore indices to their original values
609 index.start = passStart;
610 index.limit = passLimit;
611 index.contextLimit -= delta;
612 }
613
614 // We did completely transliterate this pass. Update the
615 // commit indices to record how far we got. Adjust indices
616 // for length change.
617 else {
618 // Move the pass indices past the committed text.
619 passStart = passLimit = index.start;
620
621 // Adjust the rollbackStart for length changes and move
622 // it past the committed text. All characters we've
623 // processed to this point are committed now, so zero
624 // out the uncommittedLength.
625 rollbackStart += delta + uncommittedLength;
626 uncommittedLength = 0;
627
628 // Adjust indices for length changes.
629 runLimit += delta;
630 totalDelta += delta;
631 }
632 }
633
634 // Adjust overall limit and rollbackOrigin for insertions and
635 // deletions. Don't need to worry about contextLimit because
636 // handleTransliterate() maintains that.
637 rollbackOrigin += totalDelta;
638 globalLimit += totalDelta;
639
640 // Delete the rollback copy
4388f060 641 text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, UnicodeString());
b75a7d8f
A
642
643 // Move start past committed text
644 index.start = passStart;
645 }
646
647 else {
648 // Delegate to subclass for actual transliteration.
649 int32_t limit = index.limit;
650 handleTransliterate(text, index, isIncrementalRun);
651 delta = index.limit - limit; // change in length
652
653 // In a properly written transliterator, start == limit after
654 // handleTransliterate() returns when incremental is false.
655 // Catch cases where the subclass doesn't do this, and throw
656 // an exception. (Just pinning start to limit is a bad idea,
657 // because what's probably happening is that the subclass
658 // isn't transliterating all the way to the end, and it should
659 // in non-incremental mode.)
660 if (!incremental && index.start != index.limit) {
661 // We can't throw an exception, so just fudge things
662 index.start = index.limit;
663 }
664
665 // Adjust overall limit for insertions/deletions. Don't need
666 // to worry about contextLimit because handleTransliterate()
667 // maintains that.
668 globalLimit += delta;
669 }
670
671 if (filter == NULL || isIncrementalRun) {
672 break;
673 }
674
675 // If we did completely transliterate this
676 // run, then repeat with the next unfiltered run.
677 }
678
679 // Start is valid where it is. Limit needs to be put back where
680 // it was, modulo adjustments for deletions/insertions.
681 index.limit = globalLimit;
682}
683
684void Transliterator::filteredTransliterate(Replaceable& text,
685 UTransPosition& index,
686 UBool incremental) const {
687 filteredTransliterate(text, index, incremental, FALSE);
688}
689
690/**
691 * Method for subclasses to use to set the maximum context length.
692 * @see #getMaximumContextLength
693 */
694void Transliterator::setMaximumContextLength(int32_t maxContextLength) {
695 maximumContextLength = maxContextLength;
696}
697
698/**
699 * Returns a programmatic identifier for this transliterator.
700 * If this identifier is passed to <code>getInstance()</code>, it
701 * will return this object, if it has been registered.
702 * @see #registerInstance
703 * @see #getAvailableIDs
704 */
705const UnicodeString& Transliterator::getID(void) const {
706 return ID;
707}
708
709/**
710 * Returns a name for this transliterator that is appropriate for
711 * display to the user in the default locale. See {@link
712 * #getDisplayName(Locale)} for details.
713 */
374ca955 714UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& ID,
b75a7d8f
A
715 UnicodeString& result) {
716 return getDisplayName(ID, Locale::getDefault(), result);
717}
718
719/**
720 * Returns a name for this transliterator that is appropriate for
721 * display to the user in the given locale. This name is taken
722 * from the locale resource data in the standard manner of the
723 * <code>java.text</code> package.
724 *
725 * <p>If no localized names exist in the system resource bundles,
726 * a name is synthesized using a localized
727 * <code>MessageFormat</code> pattern from the resource data. The
728 * arguments to this pattern are an integer followed by one or two
729 * strings. The integer is the number of strings, either 1 or 2.
730 * The strings are formed by splitting the ID for this
731 * transliterator at the first TARGET_SEP. If there is no TARGET_SEP, then the
732 * entire ID forms the only string.
733 * @param inLocale the Locale in which the display name should be
734 * localized.
735 * @see java.text.MessageFormat
736 */
374ca955 737UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& id,
b75a7d8f
A
738 const Locale& inLocale,
739 UnicodeString& result) {
740 UErrorCode status = U_ZERO_ERROR;
741
374ca955 742 ResourceBundle bundle(U_ICUDATA_TRANSLIT, inLocale, status);
b75a7d8f
A
743
744 // Suspend checking status until later...
745
746 result.truncate(0);
747
748 // Normalize the ID
749 UnicodeString source, target, variant;
750 UBool sawSource;
751 TransliteratorIDParser::IDtoSTV(id, source, target, variant, sawSource);
752 if (target.length() < 1) {
753 // No target; malformed id
754 return result;
755 }
756 if (variant.length() > 0) { // Change "Foo" to "/Foo"
757 variant.insert(0, VARIANT_SEP);
758 }
759 UnicodeString ID(source);
760 ID.append(TARGET_SEP).append(target).append(variant);
761
762 // build the char* key
73c04bcf
A
763 if (uprv_isInvariantUString(ID.getBuffer(), ID.length())) {
764 char key[200];
765 uprv_strcpy(key, RB_DISPLAY_NAME_PREFIX);
766 int32_t length=(int32_t)uprv_strlen(RB_DISPLAY_NAME_PREFIX);
767 ID.extract(0, (int32_t)(sizeof(key)-length), key+length, (int32_t)(sizeof(key)-length), US_INV);
b75a7d8f 768
73c04bcf
A
769 // Try to retrieve a UnicodeString from the bundle.
770 UnicodeString resString = bundle.getStringEx(key, status);
b75a7d8f 771
73c04bcf
A
772 if (U_SUCCESS(status) && resString.length() != 0) {
773 return result = resString; // [sic] assign & return
774 }
b75a7d8f
A
775
776#if !UCONFIG_NO_FORMATTING
73c04bcf
A
777 // We have failed to get a name from the locale data. This is
778 // typical, since most transliterators will not have localized
779 // name data. The next step is to retrieve the MessageFormat
780 // pattern from the locale data and to use it to synthesize the
781 // name from the ID.
b75a7d8f 782
73c04bcf
A
783 status = U_ZERO_ERROR;
784 resString = bundle.getStringEx(RB_DISPLAY_NAME_PATTERN, status);
785
786 if (U_SUCCESS(status) && resString.length() != 0) {
787 MessageFormat msg(resString, inLocale, status);
788 // Suspend checking status until later...
789
790 // We pass either 2 or 3 Formattable objects to msg.
791 Formattable args[3];
792 int32_t nargs;
793 args[0].setLong(2); // # of args to follow
794 args[1].setString(source);
795 args[2].setString(target);
796 nargs = 3;
797
798 // Use display names for the scripts, if they exist
799 UnicodeString s;
800 length=(int32_t)uprv_strlen(RB_SCRIPT_DISPLAY_NAME_PREFIX);
801 for (int j=1; j<=2; ++j) {
802 status = U_ZERO_ERROR;
803 uprv_strcpy(key, RB_SCRIPT_DISPLAY_NAME_PREFIX);
804 args[j].getString(s);
805 if (uprv_isInvariantUString(s.getBuffer(), s.length())) {
806 s.extract(0, sizeof(key)-length-1, key+length, (int32_t)sizeof(key)-length-1, US_INV);
807
808 resString = bundle.getStringEx(key, status);
809
810 if (U_SUCCESS(status)) {
811 args[j] = resString;
812 }
813 }
814 }
b75a7d8f 815
73c04bcf
A
816 status = U_ZERO_ERROR;
817 FieldPosition pos; // ignored by msg
818 msg.format(args, nargs, result, pos, status);
b75a7d8f 819 if (U_SUCCESS(status)) {
73c04bcf
A
820 result.append(variant);
821 return result;
b75a7d8f
A
822 }
823 }
b75a7d8f 824#endif
73c04bcf 825 }
b75a7d8f
A
826
827 // We should not reach this point unless there is something
828 // wrong with the build or the RB_DISPLAY_NAME_PATTERN has
829 // been deleted from the root RB_LOCALE_ELEMENTS resource.
830 result = ID;
831 return result;
832}
833
834/**
835 * Returns the filter used by this transliterator, or <tt>null</tt>
836 * if this transliterator uses no filter. Caller musn't delete
837 * the result!
838 */
839const UnicodeFilter* Transliterator::getFilter(void) const {
840 return filter;
841}
842
843/**
844 * Returns the filter used by this transliterator, or
845 * <tt>NULL</tt> if this transliterator uses no filter. The
846 * caller must eventually delete the result. After this call,
847 * this transliterator's filter is set to <tt>NULL</tt>.
848 */
849UnicodeFilter* Transliterator::orphanFilter(void) {
850 UnicodeFilter *result = filter;
851 filter = NULL;
852 return result;
853}
854
855/**
856 * Changes the filter used by this transliterator. If the filter
857 * is set to <tt>null</tt> then no filtering will occur.
858 *
859 * <p>Callers must take care if a transliterator is in use by
860 * multiple threads. The filter should not be changed by one
861 * thread while another thread may be transliterating.
862 */
863void Transliterator::adoptFilter(UnicodeFilter* filterToAdopt) {
864 delete filter;
865 filter = filterToAdopt;
866}
867
868/**
869 * Returns this transliterator's inverse. See the class
870 * documentation for details. This implementation simply inverts
871 * the two entities in the ID and attempts to retrieve the
872 * resulting transliterator. That is, if <code>getID()</code>
873 * returns "A-B", then this method will return the result of
874 * <code>getInstance("B-A")</code>, or <code>null</code> if that
875 * call fails.
876 *
877 * <p>This method does not take filtering into account. The
878 * returned transliterator will have no filter.
879 *
880 * <p>Subclasses with knowledge of their inverse may wish to
881 * override this method.
882 *
883 * @return a transliterator that is an inverse, not necessarily
884 * exact, of this transliterator, or <code>null</code> if no such
885 * transliterator is registered.
886 * @see #registerInstance
887 */
888Transliterator* Transliterator::createInverse(UErrorCode& status) const {
889 UParseError parseError;
890 return Transliterator::createInstance(ID, UTRANS_REVERSE,parseError,status);
891}
892
374ca955
A
893Transliterator* U_EXPORT2
894Transliterator::createInstance(const UnicodeString& ID,
895 UTransDirection dir,
896 UErrorCode& status)
897{
b75a7d8f
A
898 UParseError parseError;
899 return createInstance(ID, dir, parseError, status);
900}
901
902/**
903 * Returns a <code>Transliterator</code> object given its ID.
904 * The ID must be either a system transliterator ID or a ID registered
905 * using <code>registerInstance()</code>.
906 *
907 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
908 * @return A <code>Transliterator</code> object with the given ID
909 * @see #registerInstance
910 * @see #getAvailableIDs
911 * @see #getID
912 */
374ca955
A
913Transliterator* U_EXPORT2
914Transliterator::createInstance(const UnicodeString& ID,
915 UTransDirection dir,
916 UParseError& parseError,
917 UErrorCode& status)
918{
b75a7d8f
A
919 if (U_FAILURE(status)) {
920 return 0;
921 }
922
923 UnicodeString canonID;
924 UVector list(status);
925 if (U_FAILURE(status)) {
926 return NULL;
927 }
928
929 UnicodeSet* globalFilter;
930 // TODO add code for parseError...currently unused, but
931 // later may be used by parsing code...
932 if (!TransliteratorIDParser::parseCompoundID(ID, dir, canonID, list, globalFilter)) {
933 status = U_INVALID_ID;
934 return NULL;
935 }
936
73c04bcf 937 TransliteratorIDParser::instantiateList(list, status);
b75a7d8f
A
938 if (U_FAILURE(status)) {
939 return NULL;
940 }
941
942 U_ASSERT(list.size() > 0);
943 Transliterator* t = NULL;
73c04bcf
A
944
945 if (list.size() > 1 || canonID.indexOf(ID_DELIM) >= 0) {
946 // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only
947 // has one child transliterator. This is so that toRules() will return the right thing
948 // (without any inactive ID), but our main ID still comes out correct. That is, if we
949 // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;"
950 // even though the ID is "(Lower);Latin-Greek;".
b75a7d8f 951 t = new CompoundTransliterator(list, parseError, status);
b75a7d8f 952 }
73c04bcf
A
953 else {
954 t = (Transliterator*)list.elementAt(0);
955 }
46f4442e
A
956 // Check null pointer
957 if (t != NULL) {
958 t->setID(canonID);
959 if (globalFilter != NULL) {
960 t->adoptFilter(globalFilter);
961 }
962 }
963 else if (U_SUCCESS(status)) {
964 status = U_MEMORY_ALLOCATION_ERROR;
b75a7d8f
A
965 }
966 return t;
967}
968
969/**
970 * Create a transliterator from a basic ID. This is an ID
971 * containing only the forward direction source, target, and
972 * variant.
973 * @param id a basic ID of the form S-T or S-T/V.
974 * @return a newly created Transliterator or null if the ID is
975 * invalid.
976 */
977Transliterator* Transliterator::createBasicInstance(const UnicodeString& id,
978 const UnicodeString* canon) {
979 UParseError pe;
980 UErrorCode ec = U_ZERO_ERROR;
981 TransliteratorAlias* alias = 0;
982 Transliterator* t = 0;
374ca955 983
3d1f044b 984 umtx_lock(registryMutex());
46f4442e 985 if (HAVE_REGISTRY(ec)) {
374ca955 986 t = registry->get(id, alias, ec);
b75a7d8f 987 }
3d1f044b 988 umtx_unlock(registryMutex());
b75a7d8f
A
989
990 if (U_FAILURE(ec)) {
991 delete t;
992 delete alias;
374ca955 993 return 0;
b75a7d8f
A
994 }
995
374ca955
A
996 // We may have not gotten a transliterator: Because we can't
997 // instantiate a transliterator from inside TransliteratorRegistry::
998 // get() (that would deadlock), we sometimes pass back an alias. This
999 // contains the data we need to finish the instantiation outside the
1000 // registry mutex. The alias may, in turn, generate another alias, so
1001 // we handle aliases in a loop. The max times through the loop is two.
1002 // [alan]
1003 while (alias != 0) {
b75a7d8f 1004 U_ASSERT(t==0);
374ca955
A
1005 // Rule-based aliases are handled with TransliteratorAlias::
1006 // parse(), followed by TransliteratorRegistry::reget().
1007 // Other aliases are handled with TransliteratorAlias::create().
1008 if (alias->isRuleBased()) {
1009 // Step 1. parse
73c04bcf 1010 TransliteratorParser parser(ec);
374ca955
A
1011 alias->parse(parser, pe, ec);
1012 delete alias;
1013 alias = 0;
1014
1015 // Step 2. reget
3d1f044b 1016 umtx_lock(registryMutex());
46f4442e 1017 if (HAVE_REGISTRY(ec)) {
374ca955
A
1018 t = registry->reget(id, parser, alias, ec);
1019 }
3d1f044b 1020 umtx_unlock(registryMutex());
374ca955
A
1021
1022 // Step 3. Loop back around!
1023 } else {
1024 t = alias->create(pe, ec);
1025 delete alias;
1026 alias = 0;
1027 break;
1028 }
b75a7d8f
A
1029 if (U_FAILURE(ec)) {
1030 delete t;
374ca955 1031 delete alias;
b75a7d8f 1032 t = NULL;
374ca955 1033 break;
b75a7d8f
A
1034 }
1035 }
1036
1037 if (t != NULL && canon != NULL) {
1038 t->setID(*canon);
1039 }
1040
1041 return t;
1042}
1043
1044/**
1045 * Returns a <code>Transliterator</code> object constructed from
1046 * the given rule string. This will be a RuleBasedTransliterator,
1047 * if the rule string contains only rules, or a
1048 * CompoundTransliterator, if it contains ID blocks, or a
1049 * NullTransliterator, if it contains ID blocks which parse as
1050 * empty for the given direction.
1051 */
374ca955
A
1052Transliterator* U_EXPORT2
1053Transliterator::createFromRules(const UnicodeString& ID,
1054 const UnicodeString& rules,
1055 UTransDirection dir,
1056 UParseError& parseError,
1057 UErrorCode& status)
1058{
b75a7d8f
A
1059 Transliterator* t = NULL;
1060
73c04bcf 1061 TransliteratorParser parser(status);
b75a7d8f
A
1062 parser.parse(rules, dir, parseError, status);
1063
1064 if (U_FAILURE(status)) {
1065 return 0;
1066 }
1067
1068 // NOTE: The logic here matches that in TransliteratorRegistry.
73c04bcf
A
1069 if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) {
1070 t = new NullTransliterator();
1071 }
1072 else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) {
1073 t = new RuleBasedTransliterator(ID, (TransliterationRuleData*)parser.dataVector.orphanElementAt(0), TRUE);
1074 }
1075 else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) {
1076 // idBlock, no data -- this is an alias. The ID has
1077 // been munged from reverse into forward mode, if
1078 // necessary, so instantiate the ID in the forward
1079 // direction.
1080 if (parser.compoundFilter != NULL) {
1081 UnicodeString filterPattern;
1082 parser.compoundFilter->toPattern(filterPattern, FALSE);
1083 t = createInstance(filterPattern + UnicodeString(ID_DELIM)
1084 + *((UnicodeString*)parser.idBlockVector.elementAt(0)), UTRANS_FORWARD, parseError, status);
b75a7d8f 1085 }
73c04bcf
A
1086 else
1087 t = createInstance(*((UnicodeString*)parser.idBlockVector.elementAt(0)), UTRANS_FORWARD, parseError, status);
1088
1089
1090 if (t != NULL) {
1091 t->setID(ID);
b75a7d8f 1092 }
73c04bcf
A
1093 }
1094 else {
1095 UVector transliterators(status);
1096 int32_t passNumber = 1;
1097
1098 int32_t limit = parser.idBlockVector.size();
1099 if (parser.dataVector.size() > limit)
1100 limit = parser.dataVector.size();
1101
1102 for (int32_t i = 0; i < limit; i++) {
1103 if (i < parser.idBlockVector.size()) {
1104 UnicodeString* idBlock = (UnicodeString*)parser.idBlockVector.elementAt(i);
1105 if (!idBlock->isEmpty()) {
1106 Transliterator* temp = createInstance(*idBlock, UTRANS_FORWARD, parseError, status);
729e4ab9 1107 if (temp != NULL && typeid(*temp) != typeid(NullTransliterator))
73c04bcf
A
1108 transliterators.addElement(temp, status);
1109 else
1110 delete temp;
1111 }
b75a7d8f 1112 }
73c04bcf
A
1113 if (!parser.dataVector.isEmpty()) {
1114 TransliterationRuleData* data = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
4388f060
A
1115 // TODO: Should passNumber be turned into a decimal-string representation (1 -> "1")?
1116 RuleBasedTransliterator* temprbt = new RuleBasedTransliterator(UnicodeString(CompoundTransliterator::PASS_STRING) + UnicodeString(passNumber++),
46f4442e
A
1117 data, TRUE);
1118 // Check if NULL before adding it to transliterators to avoid future usage of NULL pointer.
1119 if (temprbt == NULL) {
2ca993e8
A
1120 status = U_MEMORY_ALLOCATION_ERROR;
1121 return t;
46f4442e
A
1122 }
1123 transliterators.addElement(temprbt, status);
b75a7d8f 1124 }
b75a7d8f 1125 }
b75a7d8f 1126
73c04bcf 1127 t = new CompoundTransliterator(transliterators, passNumber - 1, parseError, status);
46f4442e
A
1128 // Null pointer check
1129 if (t != NULL) {
1130 t->setID(ID);
1131 t->adoptFilter(parser.orphanCompoundFilter());
1132 }
1133 }
1134 if (U_SUCCESS(status) && t == NULL) {
1135 status = U_MEMORY_ALLOCATION_ERROR;
73c04bcf 1136 }
b75a7d8f
A
1137 return t;
1138}
1139
1140UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
1141 UBool escapeUnprintable) const {
1142 // The base class implementation of toRules munges the ID into
1143 // the correct format. That is: foo => ::foo
1144 if (escapeUnprintable) {
1145 rulesSource.truncate(0);
1146 UnicodeString id = getID();
1147 for (int32_t i=0; i<id.length();) {
1148 UChar32 c = id.char32At(i);
1149 if (!ICU_Utility::escapeUnprintable(rulesSource, c)) {
1150 rulesSource.append(c);
1151 }
4388f060 1152 i += U16_LENGTH(c);
b75a7d8f
A
1153 }
1154 } else {
1155 rulesSource = getID();
1156 }
1157 // KEEP in sync with rbt_pars
1158 rulesSource.insert(0, UNICODE_STRING_SIMPLE("::"));
1159 rulesSource.append(ID_DELIM);
1160 return rulesSource;
1161}
1162
374ca955 1163int32_t Transliterator::countElements() const {
729e4ab9
A
1164 const CompoundTransliterator* ct = dynamic_cast<const CompoundTransliterator*>(this);
1165 return ct != NULL ? ct->getCount() : 0;
374ca955
A
1166}
1167
1168const Transliterator& Transliterator::getElement(int32_t index, UErrorCode& ec) const {
1169 if (U_FAILURE(ec)) {
1170 return *this;
1171 }
729e4ab9 1172 const CompoundTransliterator* cpd = dynamic_cast<const CompoundTransliterator*>(this);
374ca955
A
1173 int32_t n = (cpd == NULL) ? 1 : cpd->getCount();
1174 if (index < 0 || index >= n) {
1175 ec = U_INDEX_OUTOFBOUNDS_ERROR;
1176 return *this;
1177 } else {
1178 return (n == 1) ? *this : cpd->getTransliterator(index);
1179 }
1180}
1181
b75a7d8f
A
1182UnicodeSet& Transliterator::getSourceSet(UnicodeSet& result) const {
1183 handleGetSourceSet(result);
1184 if (filter != NULL) {
729e4ab9 1185 UnicodeSet* filterSet = dynamic_cast<UnicodeSet*>(filter);
46f4442e
A
1186 UBool deleteFilterSet = FALSE;
1187 // Most, but not all filters will be UnicodeSets. Optimize for
1188 // the high-runner case.
729e4ab9 1189 if (filterSet == NULL) {
46f4442e
A
1190 filterSet = new UnicodeSet();
1191 // Check null pointer
1192 if (filterSet == NULL) {
1193 return result;
1194 }
1195 deleteFilterSet = TRUE;
1196 filter->addMatchSetTo(*filterSet);
1197 }
1198 result.retainAll(*filterSet);
1199 if (deleteFilterSet) {
1200 delete filterSet;
1201 }
b75a7d8f
A
1202 }
1203 return result;
1204}
1205
1206void Transliterator::handleGetSourceSet(UnicodeSet& result) const {
1207 result.clear();
1208}
1209
1210UnicodeSet& Transliterator::getTargetSet(UnicodeSet& result) const {
1211 return result.clear();
1212}
1213
1214// For public consumption
374ca955 1215void U_EXPORT2 Transliterator::registerFactory(const UnicodeString& id,
b75a7d8f
A
1216 Transliterator::Factory factory,
1217 Transliterator::Token context) {
3d1f044b 1218 Mutex lock(registryMutex());
46f4442e
A
1219 UErrorCode ec = U_ZERO_ERROR;
1220 if (HAVE_REGISTRY(ec)) {
b75a7d8f
A
1221 _registerFactory(id, factory, context);
1222 }
1223}
1224
1225// To be called only by Transliterator subclasses that are called
1226// to register themselves by initializeRegistry().
1227void Transliterator::_registerFactory(const UnicodeString& id,
1228 Transliterator::Factory factory,
1229 Transliterator::Token context) {
46f4442e
A
1230 UErrorCode ec = U_ZERO_ERROR;
1231 registry->put(id, factory, context, TRUE, ec);
b75a7d8f
A
1232}
1233
1234// To be called only by Transliterator subclasses that are called
1235// to register themselves by initializeRegistry().
1236void Transliterator::_registerSpecialInverse(const UnicodeString& target,
1237 const UnicodeString& inverseTarget,
1238 UBool bidirectional) {
374ca955
A
1239 UErrorCode status = U_ZERO_ERROR;
1240 TransliteratorIDParser::registerSpecialInverse(target, inverseTarget, bidirectional, status);
b75a7d8f
A
1241}
1242
1243/**
1244 * Registers a instance <tt>obj</tt> of a subclass of
1245 * <code>Transliterator</code> with the system. This object must
1246 * implement the <tt>clone()</tt> method. When
1247 * <tt>getInstance()</tt> is called with an ID string that is
1248 * equal to <tt>obj.getID()</tt>, then <tt>obj.clone()</tt> is
1249 * returned.
1250 *
1251 * @param obj an instance of subclass of
1252 * <code>Transliterator</code> that defines <tt>clone()</tt>
1253 * @see #getInstance
1254 * @see #unregister
1255 */
374ca955 1256void U_EXPORT2 Transliterator::registerInstance(Transliterator* adoptedPrototype) {
3d1f044b 1257 Mutex lock(registryMutex());
46f4442e
A
1258 UErrorCode ec = U_ZERO_ERROR;
1259 if (HAVE_REGISTRY(ec)) {
b75a7d8f
A
1260 _registerInstance(adoptedPrototype);
1261 }
1262}
1263
1264void Transliterator::_registerInstance(Transliterator* adoptedPrototype) {
46f4442e
A
1265 UErrorCode ec = U_ZERO_ERROR;
1266 registry->put(adoptedPrototype, TRUE, ec);
b75a7d8f
A
1267}
1268
73c04bcf
A
1269void U_EXPORT2 Transliterator::registerAlias(const UnicodeString& aliasID,
1270 const UnicodeString& realID) {
3d1f044b 1271 Mutex lock(registryMutex());
46f4442e
A
1272 UErrorCode ec = U_ZERO_ERROR;
1273 if (HAVE_REGISTRY(ec)) {
73c04bcf
A
1274 _registerAlias(aliasID, realID);
1275 }
1276}
1277
1278void Transliterator::_registerAlias(const UnicodeString& aliasID,
1279 const UnicodeString& realID) {
46f4442e
A
1280 UErrorCode ec = U_ZERO_ERROR;
1281 registry->put(aliasID, realID, FALSE, TRUE, ec);
73c04bcf
A
1282}
1283
b75a7d8f
A
1284/**
1285 * Unregisters a transliterator or class. This may be either
1286 * a system transliterator or a user transliterator or class.
2ca993e8 1287 *
b75a7d8f
A
1288 * @param ID the ID of the transliterator or class
1289 * @see #registerInstance
1290
1291 */
374ca955 1292void U_EXPORT2 Transliterator::unregister(const UnicodeString& ID) {
3d1f044b 1293 Mutex lock(registryMutex());
46f4442e
A
1294 UErrorCode ec = U_ZERO_ERROR;
1295 if (HAVE_REGISTRY(ec)) {
b75a7d8f
A
1296 registry->remove(ID);
1297 }
1298}
1299
1300/**
374ca955 1301 * == OBSOLETE - remove in ICU 3.4 ==
b75a7d8f
A
1302 * Return the number of IDs currently registered with the system.
1303 * To retrieve the actual IDs, call getAvailableID(i) with
1304 * i from 0 to countAvailableIDs() - 1.
1305 */
374ca955 1306int32_t U_EXPORT2 Transliterator::countAvailableIDs(void) {
46f4442e 1307 int32_t retVal = 0;
3d1f044b 1308 Mutex lock(registryMutex());
46f4442e
A
1309 UErrorCode ec = U_ZERO_ERROR;
1310 if (HAVE_REGISTRY(ec)) {
1311 retVal = registry->countAvailableIDs();
1312 }
1313 return retVal;
b75a7d8f
A
1314}
1315
1316/**
374ca955 1317 * == OBSOLETE - remove in ICU 3.4 ==
b75a7d8f
A
1318 * Return the index-th available ID. index must be between 0
1319 * and countAvailableIDs() - 1, inclusive. If index is out of
1320 * range, the result of getAvailableID(0) is returned.
1321 */
374ca955 1322const UnicodeString& U_EXPORT2 Transliterator::getAvailableID(int32_t index) {
b75a7d8f 1323 const UnicodeString* result = NULL;
3d1f044b 1324 umtx_lock(registryMutex());
46f4442e
A
1325 UErrorCode ec = U_ZERO_ERROR;
1326 if (HAVE_REGISTRY(ec)) {
b75a7d8f
A
1327 result = &registry->getAvailableID(index);
1328 }
3d1f044b 1329 umtx_unlock(registryMutex());
b75a7d8f
A
1330 U_ASSERT(result != NULL); // fail if no registry
1331 return *result;
1332}
1333
374ca955
A
1334StringEnumeration* U_EXPORT2 Transliterator::getAvailableIDs(UErrorCode& ec) {
1335 if (U_FAILURE(ec)) return NULL;
1336 StringEnumeration* result = NULL;
3d1f044b 1337 umtx_lock(registryMutex());
46f4442e 1338 if (HAVE_REGISTRY(ec)) {
374ca955
A
1339 result = registry->getAvailableIDs();
1340 }
3d1f044b 1341 umtx_unlock(registryMutex());
374ca955
A
1342 if (result == NULL) {
1343 ec = U_INTERNAL_TRANSLITERATOR_ERROR;
1344 }
1345 return result;
1346}
1347
1348int32_t U_EXPORT2 Transliterator::countAvailableSources(void) {
3d1f044b 1349 Mutex lock(registryMutex());
46f4442e
A
1350 UErrorCode ec = U_ZERO_ERROR;
1351 return HAVE_REGISTRY(ec) ? _countAvailableSources() : 0;
b75a7d8f
A
1352}
1353
374ca955 1354UnicodeString& U_EXPORT2 Transliterator::getAvailableSource(int32_t index,
b75a7d8f 1355 UnicodeString& result) {
3d1f044b 1356 Mutex lock(registryMutex());
46f4442e
A
1357 UErrorCode ec = U_ZERO_ERROR;
1358 if (HAVE_REGISTRY(ec)) {
b75a7d8f
A
1359 _getAvailableSource(index, result);
1360 }
1361 return result;
1362}
1363
374ca955 1364int32_t U_EXPORT2 Transliterator::countAvailableTargets(const UnicodeString& source) {
3d1f044b 1365 Mutex lock(registryMutex());
46f4442e
A
1366 UErrorCode ec = U_ZERO_ERROR;
1367 return HAVE_REGISTRY(ec) ? _countAvailableTargets(source) : 0;
b75a7d8f
A
1368}
1369
374ca955 1370UnicodeString& U_EXPORT2 Transliterator::getAvailableTarget(int32_t index,
b75a7d8f
A
1371 const UnicodeString& source,
1372 UnicodeString& result) {
3d1f044b 1373 Mutex lock(registryMutex());
46f4442e
A
1374 UErrorCode ec = U_ZERO_ERROR;
1375 if (HAVE_REGISTRY(ec)) {
b75a7d8f
A
1376 _getAvailableTarget(index, source, result);
1377 }
1378 return result;
1379}
1380
374ca955 1381int32_t U_EXPORT2 Transliterator::countAvailableVariants(const UnicodeString& source,
b75a7d8f 1382 const UnicodeString& target) {
3d1f044b 1383 Mutex lock(registryMutex());
46f4442e
A
1384 UErrorCode ec = U_ZERO_ERROR;
1385 return HAVE_REGISTRY(ec) ? _countAvailableVariants(source, target) : 0;
b75a7d8f
A
1386}
1387
374ca955 1388UnicodeString& U_EXPORT2 Transliterator::getAvailableVariant(int32_t index,
b75a7d8f
A
1389 const UnicodeString& source,
1390 const UnicodeString& target,
1391 UnicodeString& result) {
3d1f044b 1392 Mutex lock(registryMutex());
46f4442e
A
1393 UErrorCode ec = U_ZERO_ERROR;
1394 if (HAVE_REGISTRY(ec)) {
b75a7d8f
A
1395 _getAvailableVariant(index, source, target, result);
1396 }
1397 return result;
1398}
1399
1400int32_t Transliterator::_countAvailableSources(void) {
1401 return registry->countAvailableSources();
1402}
1403
1404UnicodeString& Transliterator::_getAvailableSource(int32_t index,
1405 UnicodeString& result) {
1406 return registry->getAvailableSource(index, result);
1407}
1408
1409int32_t Transliterator::_countAvailableTargets(const UnicodeString& source) {
1410 return registry->countAvailableTargets(source);
1411}
1412
1413UnicodeString& Transliterator::_getAvailableTarget(int32_t index,
1414 const UnicodeString& source,
1415 UnicodeString& result) {
1416 return registry->getAvailableTarget(index, source, result);
1417}
1418
1419int32_t Transliterator::_countAvailableVariants(const UnicodeString& source,
1420 const UnicodeString& target) {
1421 return registry->countAvailableVariants(source, target);
1422}
1423
1424UnicodeString& Transliterator::_getAvailableVariant(int32_t index,
1425 const UnicodeString& source,
1426 const UnicodeString& target,
1427 UnicodeString& result) {
1428 return registry->getAvailableVariant(index, source, target, result);
1429}
1430
1431#ifdef U_USE_DEPRECATED_TRANSLITERATOR_API
1432
1433/**
1434 * Method for subclasses to use to obtain a character in the given
1435 * string, with filtering.
1436 * @deprecated the new architecture provides filtering at the top
1437 * level. This method will be removed Dec 31 2001.
1438 */
1439UChar Transliterator::filteredCharAt(const Replaceable& text, int32_t i) const {
1440 UChar c;
1441 const UnicodeFilter* localFilter = getFilter();
1442 return (localFilter == 0) ? text.charAt(i) :
1443 (localFilter->contains(c = text.charAt(i)) ? c : (UChar)0xFFFE);
1444}
1445
1446#endif
1447
1448/**
1449 * If the registry is initialized, return TRUE. If not, initialize it
1450 * and return TRUE. If the registry cannot be initialized, return
1451 * FALSE (rare).
1452 *
729e4ab9 1453 * IMPORTANT: Upon entry, registryMutex must be LOCKED. The entire
b75a7d8f
A
1454 * initialization is done with the lock held. There is NO REASON to
1455 * unlock, since no other thread that is waiting on the registryMutex
1456 * cannot itself proceed until the registry is initialized.
1457 */
46f4442e 1458UBool Transliterator::initializeRegistry(UErrorCode &status) {
b75a7d8f
A
1459 if (registry != 0) {
1460 return TRUE;
1461 }
1462
b75a7d8f
A
1463 registry = new TransliteratorRegistry(status);
1464 if (registry == 0 || U_FAILURE(status)) {
1465 delete registry;
1466 registry = 0;
1467 return FALSE; // can't create registry, no recovery
1468 }
1469
1470 /* The following code parses the index table located in
374ca955 1471 * icu/data/translit/root.txt. The index is an n x 4 table
b75a7d8f 1472 * that follows this format:
374ca955
A
1473 * <id>{
1474 * file{
1475 * resource{"<resource>"}
1476 * direction{"<direction>"}
1477 * }
1478 * }
1479 * <id>{
1480 * internal{
1481 * resource{"<resource>"}
1482 * direction{"<direction"}
1483 * }
1484 * }
1485 * <id>{
1486 * alias{"<getInstanceArg"}
1487 * }
b75a7d8f
A
1488 * <id> is the ID of the system transliterator being defined. These
1489 * are public IDs enumerated by Transliterator.getAvailableIDs(),
1490 * unless the second field is "internal".
2ca993e8 1491 *
b75a7d8f
A
1492 * <resource> is a ResourceReader resource name. Currently these refer
1493 * to file names under com/ibm/text/resources. This string is passed
1494 * directly to ResourceReader, together with <encoding>.
2ca993e8 1495 *
b75a7d8f 1496 * <direction> is either "FORWARD" or "REVERSE".
2ca993e8 1497 *
b75a7d8f
A
1498 * <getInstanceArg> is a string to be passed directly to
1499 * Transliterator.getInstance(). The returned Transliterator object
1500 * then has its ID changed to <id> and is returned.
1501 *
1502 * The extra blank field on "alias" lines is to make the array square.
1503 */
374ca955 1504 //static const char translit_index[] = "translit_index";
b75a7d8f 1505
2ca993e8
A
1506 UResourceBundle *bundle = ures_open(U_ICUDATA_TRANSLIT, NULL/*open default locale*/, &status);
1507 UResourceBundle *transIDs = ures_getByKey(bundle, RB_RULE_BASED_IDS, 0, &status);
b75a7d8f 1508 if (U_SUCCESS(status)) {
2ca993e8
A
1509 UResourceBundle *colBund = NULL;
1510 UResourceBundle* res = NULL;
1511 int32_t row, maxRows = ures_getSize(transIDs);
b75a7d8f 1512 for (row = 0; row < maxRows; row++) {
2ca993e8
A
1513 colBund = ures_getByIndex(transIDs, row, colBund, &status);
1514 if (U_FAILURE(status)) {
1515 break;
1516 }
1517 const char *tridKey = ures_getKey(colBund);
1518 if (tridKey == NULL || uprv_strstr(tridKey, "-t-") != NULL) {
1519 continue; // Apple version should not get any of these, eliminated the root.txt entries
1520 }
1521 res = ures_getNextResource(colBund, res, &status);
1522 if (U_FAILURE(status)) {
1523 break;
1524 }
1525 UnicodeString trID(tridKey, -1, US_INV);
1526 const char* typeStr = ures_getKey(res);
1527 int32_t len = 0, dlen = 0;
1528 UBool visible = FALSE;
1529 const UChar *resString;
1530 switch (typeStr[0]) {
1531 case 'f': // "file"
1532 visible = TRUE;
1533 // FALLTHROUGH
1534 case 'i': // "internal" => visible = FALSE
1535 // child resources are resource and direction
1536 {
1537 resString = ures_getStringByKey(res, "resource", &len, &status);
1538 const UChar* dirString = ures_getStringByKey(res, "direction", &dlen, &status);
1539 UTransDirection dir = (dlen <= 0 || dirString[0] == 0x0046 /*F*/)? UTRANS_FORWARD : UTRANS_REVERSE;
1540 registry->put(trID, UnicodeString(TRUE, resString, len), dir, TRUE, visible, status);
b75a7d8f 1541 }
2ca993e8
A
1542 break;
1543 case 'a': // "alias", string argument is alias
1544 resString = ures_getString(res, &len, &status);
1545 registry->put(trID, UnicodeString(TRUE, resString, len), TRUE, TRUE, status);
1546 break;
1547 default: // do nothing
1548 break;
b75a7d8f 1549 }
b75a7d8f 1550 }
2ca993e8
A
1551 ures_close(res);
1552 ures_close(colBund);
b75a7d8f
A
1553 }
1554
1555 ures_close(transIDs);
1556 ures_close(bundle);
1557
1558 // Manually add prototypes that the system knows about to the
1559 // cache. This is how new non-rule-based transliterators are
1560 // added to the system.
2ca993e8 1561
46f4442e
A
1562 // This is to allow for null pointer check
1563 NullTransliterator* tempNullTranslit = new NullTransliterator();
1564 LowercaseTransliterator* tempLowercaseTranslit = new LowercaseTransliterator();
1565 UppercaseTransliterator* tempUppercaseTranslit = new UppercaseTransliterator();
1566 TitlecaseTransliterator* tempTitlecaseTranslit = new TitlecaseTransliterator();
1567 UnicodeNameTransliterator* tempUnicodeTranslit = new UnicodeNameTransliterator();
1568 NameUnicodeTransliterator* tempNameUnicodeTranslit = new NameUnicodeTransliterator();
1569#if !UCONFIG_NO_BREAK_ITERATION
1570 // TODO: could or should these transliterators be referenced polymorphically once constructed?
1571 BreakTransliterator* tempBreakTranslit = new BreakTransliterator();
1572#endif
1573 // Check for null pointers
1574 if (tempNullTranslit == NULL || tempLowercaseTranslit == NULL || tempUppercaseTranslit == NULL ||
2ca993e8 1575 tempTitlecaseTranslit == NULL || tempUnicodeTranslit == NULL ||
46f4442e
A
1576#if !UCONFIG_NO_BREAK_ITERATION
1577 tempBreakTranslit == NULL ||
1578#endif
1579 tempNameUnicodeTranslit == NULL )
1580 {
1581 delete tempNullTranslit;
1582 delete tempLowercaseTranslit;
1583 delete tempUppercaseTranslit;
1584 delete tempTitlecaseTranslit;
1585 delete tempUnicodeTranslit;
1586 delete tempNameUnicodeTranslit;
1587#if !UCONFIG_NO_BREAK_ITERATION
1588 delete tempBreakTranslit;
1589#endif
1590 // Since there was an error, remove registry
1591 delete registry;
1592 registry = NULL;
b75a7d8f 1593
46f4442e
A
1594 status = U_MEMORY_ALLOCATION_ERROR;
1595 return 0;
1596 }
1597
1598 registry->put(tempNullTranslit, TRUE, status);
1599 registry->put(tempLowercaseTranslit, TRUE, status);
1600 registry->put(tempUppercaseTranslit, TRUE, status);
1601 registry->put(tempTitlecaseTranslit, TRUE, status);
1602 registry->put(tempUnicodeTranslit, TRUE, status);
1603 registry->put(tempNameUnicodeTranslit, TRUE, status);
1604#if !UCONFIG_NO_BREAK_ITERATION
1605 registry->put(tempBreakTranslit, FALSE, status); // FALSE means invisible.
1606#endif
b75a7d8f
A
1607
1608 RemoveTransliterator::registerIDs(); // Must be within mutex
1609 EscapeTransliterator::registerIDs();
1610 UnescapeTransliterator::registerIDs();
1611 NormalizationTransliterator::registerIDs();
1612 AnyTransliterator::registerIDs();
1613
73c04bcf
A
1614 _registerSpecialInverse(UNICODE_STRING_SIMPLE("Null"),
1615 UNICODE_STRING_SIMPLE("Null"), FALSE);
374ca955
A
1616 _registerSpecialInverse(UNICODE_STRING_SIMPLE("Upper"),
1617 UNICODE_STRING_SIMPLE("Lower"), TRUE);
1618 _registerSpecialInverse(UNICODE_STRING_SIMPLE("Title"),
1619 UNICODE_STRING_SIMPLE("Lower"), FALSE);
b75a7d8f 1620
729e4ab9 1621 ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup);
b75a7d8f
A
1622
1623 return TRUE;
1624}
1625
1626U_NAMESPACE_END
1627
b331163b 1628// Defined in transreg.h:
b75a7d8f
A
1629
1630/**
1631 * Release all static memory held by transliterator. This will
1632 * necessarily invalidate any rule-based transliterators held by the
1633 * user, because RBTs hold pointers to common data objects.
1634 */
729e4ab9 1635U_CFUNC UBool utrans_transliterator_cleanup(void) {
46f4442e 1636 U_NAMESPACE_USE
b75a7d8f
A
1637 TransliteratorIDParser::cleanup();
1638 if (registry) {
1639 delete registry;
1640 registry = NULL;
1641 }
b75a7d8f
A
1642 return TRUE;
1643}
1644
1645#endif /* #if !UCONFIG_NO_TRANSLITERATION */
1646
1647//eof