]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/translit.cpp
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / i18n / translit.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/17/99 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/putil.h"
16 #include "unicode/translit.h"
17 #include "unicode/locid.h"
18 #include "unicode/msgfmt.h"
19 #include "unicode/rep.h"
20 #include "unicode/resbund.h"
21 #include "unicode/unifilt.h"
22 #include "unicode/uniset.h"
23 #include "unicode/uscript.h"
24 #include "unicode/strenum.h"
25 #include "cpdtrans.h"
26 #include "nultrans.h"
27 #include "rbt_data.h"
28 #include "rbt_pars.h"
29 #include "rbt.h"
30 #include "transreg.h"
31 #include "name2uni.h"
32 #include "nortrans.h"
33 #include "remtrans.h"
34 #include "titletrn.h"
35 #include "tolowtrn.h"
36 #include "toupptrn.h"
37 #include "uni2name.h"
38 #include "esctrn.h"
39 #include "unesctrn.h"
40 #include "tridpars.h"
41 #include "anytrans.h"
42 #include "util.h"
43 #include "hash.h"
44 #include "mutex.h"
45 #include "ucln_in.h"
46 #include "uassert.h"
47 #include "cmemory.h"
48 #include "cstring.h"
49
50 static const UChar TARGET_SEP = 0x002D; /* '-' : joins source and target when an ID is rebuilt */
51 static const UChar ID_DELIM = 0x003B; /* ';' -- NOTE(review): presumably separates IDs in a compound ID; confirm */
52 static const UChar VARIANT_SEP = 0x002F; // '/' : prepended to a non-empty variant when an ID is rebuilt
53
54 /**
55 * Prefix for resource bundle key for the display name for a
56 * transliterator. The normalized ID is appended to this to form the key.
57 * The resource bundle value should be a String.
58 */
59 static const char RB_DISPLAY_NAME_PREFIX[] = "%Translit%%";
60
61 /**
62 * Prefix for resource bundle key for the display name for a
63 * transliterator SCRIPT. The source or target name is appended to
64 * this to form the key. The resource bundle value should be a String.
65 */
66 static const char RB_SCRIPT_DISPLAY_NAME_PREFIX[] = "%Translit%";
67
68 /**
69 * Resource bundle key for display name pattern.
70 * The resource bundle value should be a String forming a
71 * MessageFormat pattern, e.g.:
72 * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
73 */
74 static const char RB_DISPLAY_NAME_PATTERN[] = "TransliteratorNamePattern";
75
76 /**
77 * Resource bundle key for the list of RuleBasedTransliterator IDs.
78 * The resource bundle value should be a String[] with each element
79 * being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
80 * to obtain the class name in which the RB_RULE key will be sought.
81 */
82 static const char RB_RULE_BASED_IDS[] = "RuleBasedTransliteratorIDs";
83
84 /**
85 * The mutex controlling access to the registry object.
86 */
87 static UMTX registryMutex = 0;
88
89 /**
90 * System transliterator registry; non-null when initialized.
91 */
92 static TransliteratorRegistry* registry = 0;
93
94 // Macro to check/initialize the registry. ONLY USE WITHIN
95 // MUTEX. Avoids a function call when the registry is already
// initialized: initializeRegistry() is only evaluated when
// 'registry' is still null.
96 #define HAVE_REGISTRY (registry!=0 || initializeRegistry())
97
98 // Empty string
99 static const UChar EMPTY[] = {0}; // "" -- passed as the replacement text when a range is deleted
100
101 U_NAMESPACE_BEGIN
102
103 UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(Transliterator)
104
105 /**
106 * Return TRUE if the given UTransPosition is valid for text of
107 * the given length.
108 */
109 inline UBool positionIsValid(UTransPosition& index, int32_t len) {
110 return !(index.contextStart < 0 ||
111 index.start < index.contextStart ||
112 index.limit < index.start ||
113 index.contextLimit < index.limit ||
114 len < index.contextLimit);
115 }
116
117 /**
118 * Default constructor.
119 * @param theID the string identifier for this transliterator
120 * @param theFilter the filter. Any character for which
121 * <tt>filter.contains()</tt> returns <tt>FALSE</tt> will not be
122 * altered by this transliterator. If <tt>filter</tt> is
123 * <tt>null</tt> then no filtering is applied.
124 */
125 Transliterator::Transliterator(const UnicodeString& theID,
126 UnicodeFilter* adoptedFilter) :
127 UObject(), ID(theID), filter(adoptedFilter),
128 maximumContextLength(0) {
129
130 // NUL-terminate the ID string
131 ID.getTerminatedBuffer();
132 }
133
134 /**
135 * Destructor.
136 */
137 Transliterator::~Transliterator() {
138 delete filter;
139 }
140
141 /**
142 * Copy constructor.
143 */
144 Transliterator::Transliterator(const Transliterator& other) :
145 UObject(other), ID(other.ID), filter(0),
146 maximumContextLength(other.maximumContextLength) {
147
148 // NUL-terminate the ID string
149 ID.getTerminatedBuffer();
150
151 if (other.filter != 0) {
152 // We own the filter, so we must have our own copy
153 filter = (UnicodeFilter*) other.filter->clone();
154 }
155 }
156
157 /**
158 * Assignment operator.
159 */
160 Transliterator& Transliterator::operator=(const Transliterator& other) {
161 ID = other.ID;
162 // NUL-terminate the ID string
163 ID.getTerminatedBuffer();
164
165 maximumContextLength = other.maximumContextLength;
166 adoptFilter((other.filter == 0) ? 0 : (UnicodeFilter*) other.filter->clone());
167 return *this;
168 }
169
170 /**
171 * Transliterates a segment of a string. <code>Transliterator</code> API.
172 * @param text the string to be transliterated
173 * @param start the beginning index, inclusive; <code>0 <= start
174 * <= limit</code>.
175 * @param limit the ending index, exclusive; <code>start <= limit
176 * <= text.length()</code>.
177 * @return the new limit index, or -1
178 */
179 int32_t Transliterator::transliterate(Replaceable& text,
180 int32_t start, int32_t limit) const {
181 if (start < 0 ||
182 limit < start ||
183 text.length() < limit) {
184 return -1;
185 }
186
187 UTransPosition offsets;
188 offsets.contextStart= start;
189 offsets.contextLimit = limit;
190 offsets.start = start;
191 offsets.limit = limit;
192 filteredTransliterate(text, offsets, FALSE, TRUE);
193 return offsets.limit;
194 }
195
196 /**
197 * Transliterates an entire string in place. Convenience method.
198 * @param text the string to be transliterated
199 */
200 void Transliterator::transliterate(Replaceable& text) const {
201 transliterate(text, 0, text.length());
202 }
203
204 /**
205 * Transliterates the portion of the text buffer that can be
206 * transliterated unambiguosly after new text has been inserted,
207 * typically as a result of a keyboard event. The new text in
208 * <code>insertion</code> will be inserted into <code>text</code>
209 * at <code>index.contextLimit</code>, advancing
210 * <code>index.contextLimit</code> by <code>insertion.length()</code>.
211 * Then the transliterator will try to transliterate characters of
212 * <code>text</code> between <code>index.start</code> and
213 * <code>index.contextLimit</code>. Characters before
214 * <code>index.start</code> will not be changed.
215 *
216 * <p>Upon return, values in <code>index</code> will be updated.
217 * <code>index.contextStart</code> will be advanced to the first
218 * character that future calls to this method will read.
219 * <code>index.start</code> and <code>index.contextLimit</code> will
220 * be adjusted to delimit the range of text that future calls to
221 * this method may change.
222 *
223 * <p>Typical usage of this method begins with an initial call
224 * with <code>index.contextStart</code> and <code>index.contextLimit</code>
225 * set to indicate the portion of <code>text</code> to be
226 * transliterated, and <code>index.start == index.contextStart</code>.
227 * Thereafter, <code>index</code> can be used without
228 * modification in future calls, provided that all changes to
229 * <code>text</code> are made via this method.
230 *
231 * <p>This method assumes that future calls may be made that will
232 * insert new text into the buffer. As a result, it only performs
233 * unambiguous transliterations. After the last call to this
234 * method, there may be untransliterated text that is waiting for
235 * more input to resolve an ambiguity. In order to perform these
236 * pending transliterations, clients should call {@link
237 * #finishKeyboardTransliteration} after the last call to this
238 * method has been made.
239 *
240 * @param text the buffer holding transliterated and untransliterated text
241 * @param index an array of three integers.
242 *
243 * <ul><li><code>index.contextStart</code>: the beginning index,
244 * inclusive; <code>0 <= index.contextStart <= index.contextLimit</code>.
245 *
246 * <li><code>index.contextLimit</code>: the ending index, exclusive;
247 * <code>index.contextStart <= index.contextLimit <= text.length()</code>.
248 * <code>insertion</code> is inserted at
249 * <code>index.contextLimit</code>.
250 *
251 * <li><code>index.start</code>: the next character to be
252 * considered for transliteration; <code>index.contextStart <=
253 * index.start <= index.contextLimit</code>. Characters before
254 * <code>index.start</code> will not be changed by future calls
255 * to this method.</ul>
256 *
257 * @param insertion text to be inserted and possibly
258 * transliterated into the translation buffer at
259 * <code>index.contextLimit</code>. If <code>null</code> then no text
260 * is inserted.
261 * @see #START
262 * @see #LIMIT
263 * @see #CURSOR
264 * @see #handleTransliterate
265 * @exception IllegalArgumentException if <code>index</code>
266 * is invalid
267 */
268 void Transliterator::transliterate(Replaceable& text,
269 UTransPosition& index,
270 const UnicodeString& insertion,
271 UErrorCode &status) const {
272 _transliterate(text, index, &insertion, status);
273 }
274
275 /**
276 * Transliterates the portion of the text buffer that can be
277 * transliterated unambiguosly after a new character has been
278 * inserted, typically as a result of a keyboard event. This is a
279 * convenience method; see {@link
280 * #transliterate(Replaceable, int[], String)} for details.
281 * @param text the buffer holding transliterated and
282 * untransliterated text
283 * @param index an array of three integers. See {@link
284 * #transliterate(Replaceable, int[], String)}.
285 * @param insertion text to be inserted and possibly
286 * transliterated into the translation buffer at
287 * <code>index.contextLimit</code>.
288 * @see #transliterate(Replaceable, int[], String)
289 */
290 void Transliterator::transliterate(Replaceable& text,
291 UTransPosition& index,
292 UChar32 insertion,
293 UErrorCode& status) const {
294 UnicodeString str(insertion);
295 _transliterate(text, index, &str, status);
296 }
297
298 /**
299 * Transliterates the portion of the text buffer that can be
300 * transliterated unambiguosly. This is a convenience method; see
301 * {@link #transliterate(Replaceable, int[], String)} for
302 * details.
303 * @param text the buffer holding transliterated and
304 * untransliterated text
305 * @param index an array of three integers. See {@link
306 * #transliterate(Replaceable, int[], String)}.
307 * @see #transliterate(Replaceable, int[], String)
308 */
309 void Transliterator::transliterate(Replaceable& text,
310 UTransPosition& index,
311 UErrorCode& status) const {
312 _transliterate(text, index, 0, status);
313 }
314
315 /**
316 * Finishes any pending transliterations that were waiting for
317 * more characters. Clients should call this method as the last
318 * call after a sequence of one or more calls to
319 * <code>transliterate()</code>.
320 * @param text the buffer holding transliterated and
321 * untransliterated text.
322 * @param index the array of indices previously passed to {@link
323 * #transliterate}
324 */
325 void Transliterator::finishTransliteration(Replaceable& text,
326 UTransPosition& index) const {
327 if (!positionIsValid(index, text.length())) {
328 return;
329 }
330
331 filteredTransliterate(text, index, FALSE, TRUE);
332 }
333
334 /**
335 * This internal method does keyboard transliteration. If the
336 * 'insertion' is non-null then we append it to 'text' before
337 * proceeding. This method calls through to the pure virtual
338 * framework method handleTransliterate() to do the actual
339 * work.
340 */
341 void Transliterator::_transliterate(Replaceable& text,
342 UTransPosition& index,
343 const UnicodeString* insertion,
344 UErrorCode &status) const {
345 if (U_FAILURE(status)) {
346 return;
347 }
348
349 if (!positionIsValid(index, text.length())) {
350 status = U_ILLEGAL_ARGUMENT_ERROR;
351 return;
352 }
353
354 // int32_t originalStart = index.contextStart;
355 if (insertion != 0) {
356 text.handleReplaceBetween(index.limit, index.limit, *insertion);
357 index.limit += insertion->length();
358 index.contextLimit += insertion->length();
359 }
360
361 if (index.limit > 0 &&
362 UTF_IS_LEAD(text.charAt(index.limit - 1))) {
363 // Oops, there is a dangling lead surrogate in the buffer.
364 // This will break most transliterators, since they will
365 // assume it is part of a pair. Don't transliterate until
366 // more text comes in.
367 return;
368 }
369
370 filteredTransliterate(text, index, TRUE, TRUE);
371
372 #if 0
373 // TODO
374 // I CAN'T DO what I'm attempting below now that the Kleene star
375 // operator is supported. For example, in the rule
376
377 // ([:Lu:]+) { x } > $1;
378
379 // what is the maximum context length? getMaximumContextLength()
380 // will return 1, but this is just the length of the ante context
381 // part of the pattern string -- 1 character, which is a standin
382 // for a Quantifier, which contains a StringMatcher, which
383 // contains a UnicodeSet.
384
385 // There is a complicated way to make this work again, and that's
386 // to add a "maximum left context" protocol into the
387 // UnicodeMatcher hierarchy. At present I'm not convinced this is
388 // worth it.
389
390 // ---
391
392 // The purpose of the code below is to keep the context small
393 // while doing incremental transliteration. When part of the left
394 // context (between contextStart and start) is no longer needed,
395 // we try to advance contextStart past that portion. We use the
396 // maximum context length to do so.
397 int32_t newCS = index.start;
398 int32_t n = getMaximumContextLength();
399 while (newCS > originalStart && n-- > 0) {
400 --newCS;
401 newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1;
402 }
403 index.contextStart = uprv_max(newCS, originalStart);
404 #endif
405 }
406
407 /**
408 * This method breaks up the input text into runs of unfiltered
409 * characters. It passes each such run to
410 * <subclass>.handleTransliterate(). Subclasses that can handle the
411 * filter logic more efficiently themselves may override this method.
412 *
413 * All transliteration calls in this class go through this method.
 *
 * @param text the text being transliterated; it is modified in place.
 * When rollback is used, a temporary copy of the current run is
 * appended at the end of the text and removed again before returning.
 * @param index the position to transliterate. start and limit are
 * narrowed to each run while it is processed; on return, limit is
 * the original limit adjusted for insertions and deletions.
 * @param incremental TRUE if more text may follow. It is only passed
 * on for a run that reaches the overall limit; runs followed by
 * further text are always completed non-incrementally.
 * @param rollback TRUE to process an incremental run in passes,
 * undoing any pass that handleTransliterate() did not complete.
414 */
415 void Transliterator::filteredTransliterate(Replaceable& text,
416 UTransPosition& index,
417 UBool incremental,
418 UBool rollback) const {
419 // Short circuit path for transliterators with no filter when
420 // rollback has not been requested.
421 if (filter == 0 && !rollback) {
422 handleTransliterate(text, index, incremental);
423 return;
424 }
425
426 //----------------------------------------------------------------------
427 // This method processes text in two groupings:
428 //
429 // RUNS -- A run is a contiguous group of characters which are contained
430 // in the filter for this transliterator (filter.contains(ch) == TRUE).
431 // Text outside of runs may appear as context but it is not modified.
432 // The start and limit Position values are narrowed to each run.
433 //
434 // PASSES (incremental only) -- To make incremental mode work correctly,
435 // each run is broken up into n passes, where n is the length (in code
436 // points) of the run. Each pass contains the first n characters. If a
437 // pass is completely transliterated, it is committed, and further passes
438 // include characters after the committed text. If a pass is blocked,
439 // and does not transliterate completely, then this method rolls back
440 // the changes made during the pass, extends the pass by one code point,
441 // and tries again.
442 //----------------------------------------------------------------------
443
444 // globalLimit is the limit value for the entire operation. We
445 // set index.limit to the end of each unfiltered run before
446 // calling handleTransliterate(), so we need to maintain the real
447 // value of index.limit here. After each transliteration, we
448 // update globalLimit for insertions or deletions that have
449 // happened.
450 int32_t globalLimit = index.limit;
451
452 // If there is a non-null filter, then break the input text up. Say the
453 // input text has the form:
454 // xxxabcxxdefxx
455 // where 'x' represents a filtered character (filter.contains('x') ==
456 // false). Then we break this up into:
457 // xxxabc xxdef xx
458 // Each pass through the loop consumes a run of filtered
459 // characters (which are ignored) and a subsequent run of
460 // unfiltered characters (which are transliterated).
461
462 for (;;) {
463
464 if (filter != NULL) {
465 // Narrow the range to be transliterated to the first segment
466 // of unfiltered characters at or after index.start.
467
468 // Advance past filtered chars
469 UChar32 c;
470 while (index.start < globalLimit &&
471 !filter->contains(c=text.char32At(index.start))) {
472 index.start += UTF_CHAR_LENGTH(c);
473 }
474
475 // Find the end of this run of unfiltered chars
476 index.limit = index.start;
477 while (index.limit < globalLimit &&
478 filter->contains(c=text.char32At(index.limit))) {
479 index.limit += UTF_CHAR_LENGTH(c);
480 }
481 }
482
483 // Check to see if the unfiltered run is empty. This only
484 // happens at the end of the string when all the remaining
485 // characters are filtered.
486 if (index.limit == index.start) {
487 // assert(index.start == globalLimit);
488 break;
489 }
490
491 // Is this run incremental? If there is additional
492 // filtered text (if limit < globalLimit) then we pass in
493 // an incremental value of FALSE to force the subclass to
494 // complete the transliteration for this run.
495 UBool isIncrementalRun =
496 (index.limit < globalLimit ? FALSE : incremental);
497
498 int32_t delta;
499
500 // Implement rollback. To understand the need for rollback,
501 // consider the following transliterator:
502 //
503 // "t" is "a > A;"
504 // "u" is "A > b;"
505 // "v" is a compound of "t; NFD; u" with a filter [:Ll:]
506 //
507 // Now apply "v" to the input text "a". The result is "b". But if
508 // the transliteration is done incrementally, then the NFD holds
509 // things up after "t" has already transformed "a" to "A". When
510 // finishTransliteration() is called, "A" is _not_ processed because
511 // it gets excluded by the [:Ll:] filter, and the end result is "A"
512 // -- incorrect. The problem is that the filter is applied to a
513 // partially-transliterated result, when we only want it to apply to
514 // input text. Although this example hinges on a compound
515 // transliterator containing NFD and a specific filter, it can
516 // actually happen with any transliterator which may do a partial
517 // transformation in incremental mode into characters outside its
518 // filter.
519 //
520 // To handle this, when in incremental mode we supply characters to
521 // handleTransliterate() in several passes. Each pass adds one more
522 // input character to the input text. That is, for input "ABCD", we
523 // first try "A", then "AB", then "ABC", and finally "ABCD". If at
524 // any point we block (upon return, start < limit) then we roll
525 // back. If at any point we complete the run (upon return start ==
526 // limit) then we commit that run.
527
528 if (rollback && isIncrementalRun) {
529
530 int32_t runStart = index.start;
531 int32_t runLimit = index.limit;
532 int32_t runLength = runLimit - runStart;
533
534 // Make a rollback copy at the end of the string
535 int32_t rollbackOrigin = text.length();
536 text.copy(runStart, runLimit, rollbackOrigin);
537
538 // Variables reflecting the commitment of completely
539 // transliterated text. passStart is the runStart, advanced
540 // past committed text. rollbackStart is the rollbackOrigin,
541 // advanced past rollback text that corresponds to committed
542 // text.
543 int32_t passStart = runStart;
544 int32_t rollbackStart = rollbackOrigin;
545
546 // The limit for each pass; we advance by one code point with
547 // each iteration.
548 int32_t passLimit = index.start;
549
550 // Total length, in 16-bit code units, of uncommitted text.
551 // This is the length to be rolled back.
552 int32_t uncommittedLength = 0;
553
554 // Total delta (change in length) for all passes
555 int32_t totalDelta = 0;
556
557 // PASS MAIN LOOP -- Start with a single character, and extend
558 // the text by one character at a time. Roll back partial
559 // transliterations and commit complete transliterations.
560 for (;;) {
561 // Length of additional code point, either one or two
562 int32_t charLength =
563 UTF_CHAR_LENGTH(text.char32At(passLimit));
564 passLimit += charLength;
565 if (passLimit > runLimit) {
566 break;
567 }
568 uncommittedLength += charLength;
569
570 index.limit = passLimit;
571
572 // Delegate to subclass for actual transliteration. Upon
573 // return, start will be updated to point after the
574 // transliterated text, and limit and contextLimit will be
575 // adjusted for length changes.
576 handleTransliterate(text, index, TRUE);
577
578 delta = index.limit - passLimit; // change in length
579
580 // We failed to completely transliterate this pass.
581 // Roll back the text. Indices remain unchanged; reset
582 // them where necessary.
583 if (index.start != index.limit) {
584 // Find the rollbackStart, adjusted for length changes
585 // and the deletion of partially transliterated text.
586 int32_t rs = rollbackStart + delta - (index.limit - passStart);
587
588 // Delete the partially transliterated text
589 text.handleReplaceBetween(passStart, index.limit, EMPTY);
590
591 // Copy the rollback text back
592 text.copy(rs, rs + uncommittedLength, passStart);
593
594 // Restore indices to their original values
595 index.start = passStart;
596 index.limit = passLimit;
597 index.contextLimit -= delta;
598 }
599
600 // We did completely transliterate this pass. Update the
601 // commit indices to record how far we got. Adjust indices
602 // for length change.
603 else {
604 // Move the pass indices past the committed text.
605 passStart = passLimit = index.start;
606
607 // Adjust the rollbackStart for length changes and move
608 // it past the committed text. All characters we've
609 // processed to this point are committed now, so zero
610 // out the uncommittedLength.
611 rollbackStart += delta + uncommittedLength;
612 uncommittedLength = 0;
613
614 // Adjust indices for length changes.
615 runLimit += delta;
616 totalDelta += delta;
617 }
618 }
619
620 // Adjust overall limit and rollbackOrigin for insertions and
621 // deletions. Don't need to worry about contextLimit because
622 // handleTransliterate() maintains that.
623 rollbackOrigin += totalDelta;
624 globalLimit += totalDelta;
625
626 // Delete the rollback copy
627 text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, EMPTY);
628
629 // Move start past committed text
630 index.start = passStart;
631 }
632
633 else {
634 // Delegate to subclass for actual transliteration.
635 int32_t limit = index.limit;
636 handleTransliterate(text, index, isIncrementalRun);
637 delta = index.limit - limit; // change in length
638
639 // In a properly written transliterator, start == limit after
640 // handleTransliterate() returns when incremental is false.
641 // Catch cases where the subclass doesn't do this. (Just
642 // pinning start to limit is a bad idea,
643 // because what's probably happening is that the subclass
644 // isn't transliterating all the way to the end, and it should
645 // in non-incremental mode.)
646 if (!incremental && index.start != index.limit) {
647 // We can't throw an exception, so just fudge things
648 index.start = index.limit;
649 }
650
651 // Adjust overall limit for insertions/deletions. Don't need
652 // to worry about contextLimit because handleTransliterate()
653 // maintains that.
654 globalLimit += delta;
655 }
656
657 if (filter == NULL || isIncrementalRun) {
658 break;
659 }
660
661 // If we did completely transliterate this
662 // run, then repeat with the next unfiltered run.
663 }
664
665 // Start is valid where it is. Limit needs to be put back where
666 // it was, modulo adjustments for deletions/insertions.
667 index.limit = globalLimit;
668 }
669
670 void Transliterator::filteredTransliterate(Replaceable& text,
671 UTransPosition& index,
672 UBool incremental) const {
673 filteredTransliterate(text, index, incremental, FALSE);
674 }
675
676 /**
677 * Method for subclasses to use to set the maximum context length.
678 * @see #getMaximumContextLength
679 */
680 void Transliterator::setMaximumContextLength(int32_t maxContextLength) {
681 maximumContextLength = maxContextLength;
682 }
683
684 /**
685 * Returns a programmatic identifier for this transliterator.
686 * If this identifier is passed to <code>getInstance()</code>, it
687 * will return this object, if it has been registered.
688 * @see #registerInstance
689 * @see #getAvailableIDs
690 */
691 const UnicodeString& Transliterator::getID(void) const {
692 return ID;
693 }
694
695 /**
696 * Returns a name for this transliterator that is appropriate for
697 * display to the user in the default locale. See {@link
698 * #getDisplayName(Locale)} for details.
699 */
700 UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& ID,
701 UnicodeString& result) {
702 return getDisplayName(ID, Locale::getDefault(), result);
703 }
704
705 /**
706 * Returns a name for this transliterator that is appropriate for
707 * display to the user in the given locale. This name is taken
708 * from the locale resource data in the standard manner of the
709 * <code>java.text</code> package.
710 *
711 * <p>If no localized names exist in the system resource bundles,
712 * a name is synthesized using a localized
713 * <code>MessageFormat</code> pattern from the resource data. The
714 * arguments to this pattern are an integer followed by one or two
715 * strings. The integer is the number of strings, either 1 or 2.
716 * The strings are formed by splitting the ID for this
717 * transliterator at the first TARGET_SEP. If there is no TARGET_SEP, then the
718 * entire ID forms the only string.
719 * @param inLocale the Locale in which the display name should be
720 * localized.
721 * @see java.text.MessageFormat
722 */
723 UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& id,
724 const Locale& inLocale,
725 UnicodeString& result) {
726 UErrorCode status = U_ZERO_ERROR;
727
728 ResourceBundle bundle(U_ICUDATA_TRANSLIT, inLocale, status);
729
730 // Suspend checking status until later...
731
732 result.truncate(0);
733
734 // Normalize the ID
735 UnicodeString source, target, variant;
736 UBool sawSource;
737 TransliteratorIDParser::IDtoSTV(id, source, target, variant, sawSource);
738 if (target.length() < 1) {
739 // No target; malformed id
740 return result;
741 }
742 if (variant.length() > 0) { // Change "Foo" to "/Foo"
743 variant.insert(0, VARIANT_SEP);
744 }
745 UnicodeString ID(source);
746 ID.append(TARGET_SEP).append(target).append(variant);
747
748 // build the char* key
749 char key[200];
750 uprv_strcpy(key, RB_DISPLAY_NAME_PREFIX);
751 int32_t length=(int32_t)uprv_strlen(RB_DISPLAY_NAME_PREFIX);
752 ID.extract(0, (int32_t)(sizeof(key)-length), key+length, "");
753
754 // Try to retrieve a UnicodeString from the bundle.
755 UnicodeString resString = bundle.getStringEx(key, status);
756
757 if (U_SUCCESS(status) && resString.length() != 0) {
758 return result = resString; // [sic] assign & return
759 }
760
761 #if !UCONFIG_NO_FORMATTING
762 // We have failed to get a name from the locale data. This is
763 // typical, since most transliterators will not have localized
764 // name data. The next step is to retrieve the MessageFormat
765 // pattern from the locale data and to use it to synthesize the
766 // name from the ID.
767
768 status = U_ZERO_ERROR;
769 resString = bundle.getStringEx(RB_DISPLAY_NAME_PATTERN, status);
770
771 if (U_SUCCESS(status) && resString.length() != 0) {
772 MessageFormat msg(resString, inLocale, status);
773 // Suspend checking status until later...
774
775 // We pass either 2 or 3 Formattable objects to msg.
776 Formattable args[3];
777 int32_t nargs;
778 args[0].setLong(2); // # of args to follow
779 args[1].setString(source);
780 args[2].setString(target);
781 nargs = 3;
782
783 // Use display names for the scripts, if they exist
784 UnicodeString s;
785 length=(int32_t)uprv_strlen(RB_SCRIPT_DISPLAY_NAME_PREFIX);
786 for (int j=1; j<=2; ++j) {
787 status = U_ZERO_ERROR;
788 uprv_strcpy(key, RB_SCRIPT_DISPLAY_NAME_PREFIX);
789 args[j].getString(s);
790 s.extract(0, sizeof(key)-length-1, key+length, "");
791
792 resString = bundle.getStringEx(key, status);
793
794 if (U_SUCCESS(status)) {
795 args[j] = resString;
796 }
797 }
798
799 status = U_ZERO_ERROR;
800 FieldPosition pos; // ignored by msg
801 msg.format(args, nargs, result, pos, status);
802 if (U_SUCCESS(status)) {
803 result.append(variant);
804 return result;
805 }
806 }
807 #endif
808
809 // We should not reach this point unless there is something
810 // wrong with the build or the RB_DISPLAY_NAME_PATTERN has
811 // been deleted from the root RB_LOCALE_ELEMENTS resource.
812 result = ID;
813 return result;
814 }
815
816 /**
817 * Returns the filter used by this transliterator, or <tt>null</tt>
818 * if this transliterator uses no filter. Caller musn't delete
819 * the result!
820 */
821 const UnicodeFilter* Transliterator::getFilter(void) const {
822 return filter;
823 }
824
825 /**
826 * Returns the filter used by this transliterator, or
827 * <tt>NULL</tt> if this transliterator uses no filter. The
828 * caller must eventually delete the result. After this call,
829 * this transliterator's filter is set to <tt>NULL</tt>.
830 */
831 UnicodeFilter* Transliterator::orphanFilter(void) {
832 UnicodeFilter *result = filter;
833 filter = NULL;
834 return result;
835 }
836
837 /**
838 * Changes the filter used by this transliterator. If the filter
839 * is set to <tt>null</tt> then no filtering will occur.
840 *
841 * <p>Callers must take care if a transliterator is in use by
842 * multiple threads. The filter should not be changed by one
843 * thread while another thread may be transliterating.
844 */
845 void Transliterator::adoptFilter(UnicodeFilter* filterToAdopt) {
846 delete filter;
847 filter = filterToAdopt;
848 }
849
850 /**
851 * Returns this transliterator's inverse. See the class
852 * documentation for details. This implementation simply inverts
853 * the two entities in the ID and attempts to retrieve the
854 * resulting transliterator. That is, if <code>getID()</code>
855 * returns "A-B", then this method will return the result of
856 * <code>getInstance("B-A")</code>, or <code>null</code> if that
857 * call fails.
858 *
859 * <p>This method does not take filtering into account. The
860 * returned transliterator will have no filter.
861 *
862 * <p>Subclasses with knowledge of their inverse may wish to
863 * override this method.
864 *
865 * @return a transliterator that is an inverse, not necessarily
866 * exact, of this transliterator, or <code>null</code> if no such
867 * transliterator is registered.
868 * @see #registerInstance
869 */
870 Transliterator* Transliterator::createInverse(UErrorCode& status) const {
871 UParseError parseError;
872 return Transliterator::createInstance(ID, UTRANS_REVERSE,parseError,status);
873 }
874
875 Transliterator* U_EXPORT2
876 Transliterator::createInstance(const UnicodeString& ID,
877 UTransDirection dir,
878 UErrorCode& status)
879 {
880 UParseError parseError;
881 return createInstance(ID, dir, parseError, status);
882 }
883
884 /**
885 * Returns a <code>Transliterator</code> object given its ID.
886 * The ID must be either a system transliterator ID or a ID registered
887 * using <code>registerInstance()</code>.
888 *
889 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
890 * @return A <code>Transliterator</code> object with the given ID
891 * @see #registerInstance
892 * @see #getAvailableIDs
893 * @see #getID
894 */
895 Transliterator* U_EXPORT2
896 Transliterator::createInstance(const UnicodeString& ID,
897 UTransDirection dir,
898 UParseError& parseError,
899 UErrorCode& status)
900 {
901 if (U_FAILURE(status)) {
902 return 0;
903 }
904
905 UnicodeString canonID;
906 UVector list(status);
907 if (U_FAILURE(status)) {
908 return NULL;
909 }
910
911 UnicodeSet* globalFilter;
912 // TODO add code for parseError...currently unused, but
913 // later may be used by parsing code...
914 if (!TransliteratorIDParser::parseCompoundID(ID, dir, canonID, list, globalFilter)) {
915 status = U_INVALID_ID;
916 return NULL;
917 }
918
919 TransliteratorIDParser::instantiateList(list, NULL, -1, status);
920 if (U_FAILURE(status)) {
921 return NULL;
922 }
923
924 U_ASSERT(list.size() > 0);
925 Transliterator* t = NULL;
926 switch (list.size()) {
927 case 1:
928 t = (Transliterator*) list.elementAt(0);
929 break;
930 default:
931 t = new CompoundTransliterator(list, parseError, status);
932 /* test for NULL */
933 if (t == 0) {
934 status = U_MEMORY_ALLOCATION_ERROR;
935 return 0;
936 }
937 if (U_FAILURE(status)) {
938 delete t;
939 return NULL;
940 }
941 break;
942 }
943 t->setID(canonID);
944 if (globalFilter != NULL) {
945 t->adoptFilter(globalFilter);
946 }
947 return t;
948 }
949
950 /**
951 * Create a transliterator from a basic ID. This is an ID
952 * containing only the forward direction source, target, and
953 * variant.
954 * @param id a basic ID of the form S-T or S-T/V.
955 * @return a newly created Transliterator or null if the ID is
956 * invalid.
957 */
958 Transliterator* Transliterator::createBasicInstance(const UnicodeString& id,
959 const UnicodeString* canon) {
960 UParseError pe;
961 UErrorCode ec = U_ZERO_ERROR;
962 TransliteratorAlias* alias = 0;
963 Transliterator* t = 0;
964
965 umtx_init(&registryMutex);
966 umtx_lock(&registryMutex);
967 if (HAVE_REGISTRY) {
968 t = registry->get(id, alias, ec);
969 }
970 umtx_unlock(&registryMutex);
971
972 if (U_FAILURE(ec)) {
973 delete t;
974 delete alias;
975 return 0;
976 }
977
978 // We may have not gotten a transliterator: Because we can't
979 // instantiate a transliterator from inside TransliteratorRegistry::
980 // get() (that would deadlock), we sometimes pass back an alias. This
981 // contains the data we need to finish the instantiation outside the
982 // registry mutex. The alias may, in turn, generate another alias, so
983 // we handle aliases in a loop. The max times through the loop is two.
984 // [alan]
985 while (alias != 0) {
986 U_ASSERT(t==0);
987 // Rule-based aliases are handled with TransliteratorAlias::
988 // parse(), followed by TransliteratorRegistry::reget().
989 // Other aliases are handled with TransliteratorAlias::create().
990 if (alias->isRuleBased()) {
991 // Step 1. parse
992 TransliteratorParser parser;
993 alias->parse(parser, pe, ec);
994 delete alias;
995 alias = 0;
996
997 // Step 2. reget
998 umtx_lock(&registryMutex);
999 if (HAVE_REGISTRY) {
1000 t = registry->reget(id, parser, alias, ec);
1001 }
1002 umtx_unlock(&registryMutex);
1003
1004 // Step 3. Loop back around!
1005 } else {
1006 t = alias->create(pe, ec);
1007 delete alias;
1008 alias = 0;
1009 break;
1010 }
1011 if (U_FAILURE(ec)) {
1012 delete t;
1013 delete alias;
1014 t = NULL;
1015 break;
1016 }
1017 }
1018
1019 if (t != NULL && canon != NULL) {
1020 t->setID(*canon);
1021 }
1022
1023 return t;
1024 }
1025
1026 /**
1027 * Returns a <code>Transliterator</code> object constructed from
1028 * the given rule string. This will be a RuleBasedTransliterator,
1029 * if the rule string contains only rules, or a
1030 * CompoundTransliterator, if it contains ID blocks, or a
1031 * NullTransliterator, if it contains ID blocks which parse as
1032 * empty for the given direction.
1033 */
1034 Transliterator* U_EXPORT2
1035 Transliterator::createFromRules(const UnicodeString& ID,
1036 const UnicodeString& rules,
1037 UTransDirection dir,
1038 UParseError& parseError,
1039 UErrorCode& status)
1040 {
1041 Transliterator* t = NULL;
1042
1043 TransliteratorParser parser;
1044 parser.parse(rules, dir, parseError, status);
1045
1046 if (U_FAILURE(status)) {
1047 return 0;
1048 }
1049
1050 // NOTE: The logic here matches that in TransliteratorRegistry.
1051 if (parser.idBlock.length() == 0) {
1052 if (parser.data == NULL) {
1053 // No idBlock, no data -- this is just an
1054 // alias for Null
1055 t = new NullTransliterator();
1056 } else {
1057 // No idBlock, data != 0 -- this is an
1058 // ordinary RBT_DATA.
1059 t = new RuleBasedTransliterator(ID, parser.orphanData(), TRUE); // TRUE == adopt data object
1060 }
1061 /* test for NULL */
1062 if (t == 0) {
1063 status = U_MEMORY_ALLOCATION_ERROR;
1064 return 0;
1065 }
1066 } else {
1067 if (parser.data == NULL) {
1068 // idBlock, no data -- this is an alias. The ID has
1069 // been munged from reverse into forward mode, if
1070 // necessary, so instantiate the ID in the forward
1071 // direction.
1072 t = createInstance(parser.idBlock, UTRANS_FORWARD, parseError, status);
1073 if (t != NULL) {
1074 t->setID(ID);
1075 }
1076 } else {
1077 // idBlock and data -- this is a compound
1078 // RBT
1079 UnicodeString id((UChar)0x005F); // '_'
1080 t = new RuleBasedTransliterator(id, parser.orphanData(), TRUE); // TRUE == adopt data object
1081 /* test for NULL */
1082 if (t == 0) {
1083 status = U_MEMORY_ALLOCATION_ERROR;
1084 return 0;
1085 }
1086 t = new CompoundTransliterator(ID, parser.idBlock, parser.idSplitPoint,
1087 t, status);
1088 /* test for NULL */
1089 if (t == 0) {
1090 status = U_MEMORY_ALLOCATION_ERROR;
1091 return 0;
1092 }
1093 if (U_FAILURE(status)) {
1094 delete t;
1095 t = 0;
1096 }
1097 if (parser.compoundFilter != NULL) {
1098 t->adoptFilter(parser.orphanCompoundFilter());
1099 }
1100 return t;
1101 }
1102 }
1103
1104 return t;
1105 }
1106
1107 UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
1108 UBool escapeUnprintable) const {
1109 // The base class implementation of toRules munges the ID into
1110 // the correct format. That is: foo => ::foo
1111 if (escapeUnprintable) {
1112 rulesSource.truncate(0);
1113 UnicodeString id = getID();
1114 for (int32_t i=0; i<id.length();) {
1115 UChar32 c = id.char32At(i);
1116 if (!ICU_Utility::escapeUnprintable(rulesSource, c)) {
1117 rulesSource.append(c);
1118 }
1119 i += UTF_CHAR_LENGTH(c);
1120 }
1121 } else {
1122 rulesSource = getID();
1123 }
1124 // KEEP in sync with rbt_pars
1125 rulesSource.insert(0, UNICODE_STRING_SIMPLE("::"));
1126 rulesSource.append(ID_DELIM);
1127 return rulesSource;
1128 }
1129
1130 int32_t Transliterator::countElements() const {
1131 return (this->getDynamicClassID() ==
1132 CompoundTransliterator::getStaticClassID()) ?
1133 ((const CompoundTransliterator*) this)->getCount() : 0;
1134 }
1135
1136 const Transliterator& Transliterator::getElement(int32_t index, UErrorCode& ec) const {
1137 if (U_FAILURE(ec)) {
1138 return *this;
1139 }
1140 const CompoundTransliterator* cpd =
1141 (this->getDynamicClassID() == CompoundTransliterator::getStaticClassID()) ?
1142 (const CompoundTransliterator*) this : 0;
1143 int32_t n = (cpd == NULL) ? 1 : cpd->getCount();
1144 if (index < 0 || index >= n) {
1145 ec = U_INDEX_OUTOFBOUNDS_ERROR;
1146 return *this;
1147 } else {
1148 return (n == 1) ? *this : cpd->getTransliterator(index);
1149 }
1150 }
1151
1152 UnicodeSet& Transliterator::getSourceSet(UnicodeSet& result) const {
1153 handleGetSourceSet(result);
1154 if (filter != NULL) {
1155 UnicodeSet* filterSet;
1156 UBool deleteFilterSet = FALSE;
1157 // Most, but not all filters will be UnicodeSets. Optimize for
1158 // the high-runner case.
1159 if (filter->getDynamicClassID() == UnicodeSet::getStaticClassID()) {
1160 filterSet = (UnicodeSet*) filter;
1161 } else {
1162 filterSet = new UnicodeSet();
1163 deleteFilterSet = TRUE;
1164 filter->addMatchSetTo(*filterSet);
1165 }
1166 result.retainAll(*filterSet);
1167 if (deleteFilterSet) {
1168 delete filterSet;
1169 }
1170 }
1171 return result;
1172 }
1173
1174 void Transliterator::handleGetSourceSet(UnicodeSet& result) const {
1175 result.clear();
1176 }
1177
1178 UnicodeSet& Transliterator::getTargetSet(UnicodeSet& result) const {
1179 return result.clear();
1180 }
1181
1182 // For public consumption
1183 void U_EXPORT2 Transliterator::registerFactory(const UnicodeString& id,
1184 Transliterator::Factory factory,
1185 Transliterator::Token context) {
1186 umtx_init(&registryMutex);
1187 Mutex lock(&registryMutex);
1188 if (HAVE_REGISTRY) {
1189 _registerFactory(id, factory, context);
1190 }
1191 }
1192
1193 // To be called only by Transliterator subclasses that are called
1194 // to register themselves by initializeRegistry().
1195 void Transliterator::_registerFactory(const UnicodeString& id,
1196 Transliterator::Factory factory,
1197 Transliterator::Token context) {
1198 registry->put(id, factory, context, TRUE);
1199 }
1200
1201 // To be called only by Transliterator subclasses that are called
1202 // to register themselves by initializeRegistry().
1203 void Transliterator::_registerSpecialInverse(const UnicodeString& target,
1204 const UnicodeString& inverseTarget,
1205 UBool bidirectional) {
1206 UErrorCode status = U_ZERO_ERROR;
1207 TransliteratorIDParser::registerSpecialInverse(target, inverseTarget, bidirectional, status);
1208 }
1209
1210 /**
1211 * Registers a instance <tt>obj</tt> of a subclass of
1212 * <code>Transliterator</code> with the system. This object must
1213 * implement the <tt>clone()</tt> method. When
1214 * <tt>getInstance()</tt> is called with an ID string that is
1215 * equal to <tt>obj.getID()</tt>, then <tt>obj.clone()</tt> is
1216 * returned.
1217 *
1218 * @param obj an instance of subclass of
1219 * <code>Transliterator</code> that defines <tt>clone()</tt>
1220 * @see #getInstance
1221 * @see #unregister
1222 */
1223 void U_EXPORT2 Transliterator::registerInstance(Transliterator* adoptedPrototype) {
1224 umtx_init(&registryMutex);
1225 Mutex lock(&registryMutex);
1226 if (HAVE_REGISTRY) {
1227 _registerInstance(adoptedPrototype);
1228 }
1229 }
1230
1231 void Transliterator::_registerInstance(Transliterator* adoptedPrototype) {
1232 registry->put(adoptedPrototype, TRUE);
1233 }
1234
1235 /**
1236 * Unregisters a transliterator or class. This may be either
1237 * a system transliterator or a user transliterator or class.
1238 *
1239 * @param ID the ID of the transliterator or class
1240 * @see #registerInstance
1241
1242 */
1243 void U_EXPORT2 Transliterator::unregister(const UnicodeString& ID) {
1244 umtx_init(&registryMutex);
1245 Mutex lock(&registryMutex);
1246 if (HAVE_REGISTRY) {
1247 registry->remove(ID);
1248 }
1249 }
1250
1251 /**
1252 * == OBSOLETE - remove in ICU 3.4 ==
1253 * Return the number of IDs currently registered with the system.
1254 * To retrieve the actual IDs, call getAvailableID(i) with
1255 * i from 0 to countAvailableIDs() - 1.
1256 */
1257 int32_t U_EXPORT2 Transliterator::countAvailableIDs(void) {
1258 umtx_init(&registryMutex);
1259 Mutex lock(&registryMutex);
1260 return HAVE_REGISTRY ? registry->countAvailableIDs() : 0;
1261 }
1262
1263 /**
1264 * == OBSOLETE - remove in ICU 3.4 ==
1265 * Return the index-th available ID. index must be between 0
1266 * and countAvailableIDs() - 1, inclusive. If index is out of
1267 * range, the result of getAvailableID(0) is returned.
1268 */
1269 const UnicodeString& U_EXPORT2 Transliterator::getAvailableID(int32_t index) {
1270 const UnicodeString* result = NULL;
1271 umtx_init(&registryMutex);
1272 umtx_lock(&registryMutex);
1273 if (HAVE_REGISTRY) {
1274 result = &registry->getAvailableID(index);
1275 }
1276 umtx_unlock(&registryMutex);
1277 U_ASSERT(result != NULL); // fail if no registry
1278 return *result;
1279 }
1280
1281 StringEnumeration* U_EXPORT2 Transliterator::getAvailableIDs(UErrorCode& ec) {
1282 if (U_FAILURE(ec)) return NULL;
1283 StringEnumeration* result = NULL;
1284 umtx_init(&registryMutex);
1285 umtx_lock(&registryMutex);
1286 if (HAVE_REGISTRY) {
1287 result = registry->getAvailableIDs();
1288 }
1289 umtx_unlock(&registryMutex);
1290 if (result == NULL) {
1291 ec = U_INTERNAL_TRANSLITERATOR_ERROR;
1292 }
1293 return result;
1294 }
1295
1296 int32_t U_EXPORT2 Transliterator::countAvailableSources(void) {
1297 umtx_init(&registryMutex);
1298 Mutex lock(&registryMutex);
1299 return HAVE_REGISTRY ? _countAvailableSources() : 0;
1300 }
1301
1302 UnicodeString& U_EXPORT2 Transliterator::getAvailableSource(int32_t index,
1303 UnicodeString& result) {
1304 umtx_init(&registryMutex);
1305 Mutex lock(&registryMutex);
1306 if (HAVE_REGISTRY) {
1307 _getAvailableSource(index, result);
1308 }
1309 return result;
1310 }
1311
1312 int32_t U_EXPORT2 Transliterator::countAvailableTargets(const UnicodeString& source) {
1313 umtx_init(&registryMutex);
1314 Mutex lock(&registryMutex);
1315 return HAVE_REGISTRY ? _countAvailableTargets(source) : 0;
1316 }
1317
1318 UnicodeString& U_EXPORT2 Transliterator::getAvailableTarget(int32_t index,
1319 const UnicodeString& source,
1320 UnicodeString& result) {
1321 umtx_init(&registryMutex);
1322 Mutex lock(&registryMutex);
1323 if (HAVE_REGISTRY) {
1324 _getAvailableTarget(index, source, result);
1325 }
1326 return result;
1327 }
1328
1329 int32_t U_EXPORT2 Transliterator::countAvailableVariants(const UnicodeString& source,
1330 const UnicodeString& target) {
1331 umtx_init(&registryMutex);
1332 Mutex lock(&registryMutex);
1333 return HAVE_REGISTRY ? _countAvailableVariants(source, target) : 0;
1334 }
1335
1336 UnicodeString& U_EXPORT2 Transliterator::getAvailableVariant(int32_t index,
1337 const UnicodeString& source,
1338 const UnicodeString& target,
1339 UnicodeString& result) {
1340 umtx_init(&registryMutex);
1341 Mutex lock(&registryMutex);
1342 if (HAVE_REGISTRY) {
1343 _getAvailableVariant(index, source, target, result);
1344 }
1345 return result;
1346 }
1347
1348 int32_t Transliterator::_countAvailableSources(void) {
1349 return registry->countAvailableSources();
1350 }
1351
1352 UnicodeString& Transliterator::_getAvailableSource(int32_t index,
1353 UnicodeString& result) {
1354 return registry->getAvailableSource(index, result);
1355 }
1356
1357 int32_t Transliterator::_countAvailableTargets(const UnicodeString& source) {
1358 return registry->countAvailableTargets(source);
1359 }
1360
1361 UnicodeString& Transliterator::_getAvailableTarget(int32_t index,
1362 const UnicodeString& source,
1363 UnicodeString& result) {
1364 return registry->getAvailableTarget(index, source, result);
1365 }
1366
1367 int32_t Transliterator::_countAvailableVariants(const UnicodeString& source,
1368 const UnicodeString& target) {
1369 return registry->countAvailableVariants(source, target);
1370 }
1371
1372 UnicodeString& Transliterator::_getAvailableVariant(int32_t index,
1373 const UnicodeString& source,
1374 const UnicodeString& target,
1375 UnicodeString& result) {
1376 return registry->getAvailableVariant(index, source, target, result);
1377 }
1378
1379 #ifdef U_USE_DEPRECATED_TRANSLITERATOR_API
1380
1381 /**
1382 * Method for subclasses to use to obtain a character in the given
1383 * string, with filtering.
1384 * @deprecated the new architecture provides filtering at the top
1385 * level. This method will be removed Dec 31 2001.
1386 */
1387 UChar Transliterator::filteredCharAt(const Replaceable& text, int32_t i) const {
1388 UChar c;
1389 const UnicodeFilter* localFilter = getFilter();
1390 return (localFilter == 0) ? text.charAt(i) :
1391 (localFilter->contains(c = text.charAt(i)) ? c : (UChar)0xFFFE);
1392 }
1393
1394 #endif
1395
1396 /**
1397 * If the registry is initialized, return TRUE. If not, initialize it
1398 * and return TRUE. If the registry cannot be initialized, return
1399 * FALSE (rare).
1400 *
1401 * IMPORTANT: Upon entry, registryMutex must be LOCKED. The entirely
1402 * initialization is done with the lock held. There is NO REASON to
1403 * unlock, since no other thread that is waiting on the registryMutex
1404 * cannot itself proceed until the registry is initialized.
1405 */
1406 UBool Transliterator::initializeRegistry() {
1407 if (registry != 0) {
1408 return TRUE;
1409 }
1410
1411 UErrorCode status = U_ZERO_ERROR;
1412
1413 registry = new TransliteratorRegistry(status);
1414 if (registry == 0 || U_FAILURE(status)) {
1415 delete registry;
1416 registry = 0;
1417 return FALSE; // can't create registry, no recovery
1418 }
1419
1420 /* The following code parses the index table located in
1421 * icu/data/translit/root.txt. The index is an n x 4 table
1422 * that follows this format:
1423 * <id>{
1424 * file{
1425 * resource{"<resource>"}
1426 * direction{"<direction>"}
1427 * }
1428 * }
1429 * <id>{
1430 * internal{
1431 * resource{"<resource>"}
1432 * direction{"<direction"}
1433 * }
1434 * }
1435 * <id>{
1436 * alias{"<getInstanceArg"}
1437 * }
1438 * <id> is the ID of the system transliterator being defined. These
1439 * are public IDs enumerated by Transliterator.getAvailableIDs(),
1440 * unless the second field is "internal".
1441 *
1442 * <resource> is a ResourceReader resource name. Currently these refer
1443 * to file names under com/ibm/text/resources. This string is passed
1444 * directly to ResourceReader, together with <encoding>.
1445 *
1446 * <direction> is either "FORWARD" or "REVERSE".
1447 *
1448 * <getInstanceArg> is a string to be passed directly to
1449 * Transliterator.getInstance(). The returned Transliterator object
1450 * then has its ID changed to <id> and is returned.
1451 *
1452 * The extra blank field on "alias" lines is to make the array square.
1453 */
1454 //static const char translit_index[] = "translit_index";
1455
1456 UResourceBundle *bundle, *transIDs, *colBund;
1457 bundle = ures_open(U_ICUDATA_TRANSLIT, NULL/*open root bundle*/, &status);
1458 transIDs = ures_getByKey(bundle, RB_RULE_BASED_IDS, 0, &status);
1459
1460 int32_t row, maxRows;
1461 if (U_SUCCESS(status)) {
1462 maxRows = ures_getSize(transIDs);
1463 for (row = 0; row < maxRows; row++) {
1464 colBund = ures_getByIndex(transIDs, row, 0, &status);
1465 if (U_SUCCESS(status)) {
1466 UnicodeString id(ures_getKey(colBund));
1467 UResourceBundle* res = ures_getNextResource(colBund, NULL, &status);
1468 const char* typeStr = ures_getKey(res);
1469 UChar type;
1470 u_charsToUChars(typeStr, &type, 1);
1471
1472 if (U_SUCCESS(status)) {
1473 switch (type) {
1474 case 0x66: // 'f'
1475 case 0x69: // 'i'
1476 // 'file' or 'internal';
1477 // row[2]=resource, row[3]=direction
1478 {
1479
1480 UnicodeString resString = ures_getUnicodeStringByKey(res, "resource", &status);
1481 UBool visible = (type == 0x0066 /*f*/);
1482 UTransDirection dir =
1483 (ures_getUnicodeStringByKey(res, "direction", &status).charAt(0) ==
1484 0x0046 /*F*/) ?
1485 UTRANS_FORWARD : UTRANS_REVERSE;
1486 registry->put(id, resString, dir, visible);
1487 }
1488 break;
1489 case 0x61: // 'a'
1490 // 'alias'; row[2]=createInstance argument
1491 UnicodeString resString = ures_getUnicodeString(res, &status);
1492 registry->put(id, resString, TRUE);
1493 break;
1494 }
1495 }
1496 ures_close(res);
1497 }
1498 ures_close(colBund);
1499 }
1500 }
1501
1502 ures_close(transIDs);
1503 ures_close(bundle);
1504
1505 // Manually add prototypes that the system knows about to the
1506 // cache. This is how new non-rule-based transliterators are
1507 // added to the system.
1508
1509 registry->put(new NullTransliterator(), TRUE);
1510 registry->put(new LowercaseTransliterator(), TRUE);
1511 registry->put(new UppercaseTransliterator(), TRUE);
1512 registry->put(new TitlecaseTransliterator(), TRUE);
1513 registry->put(new UnicodeNameTransliterator(), TRUE);
1514 registry->put(new NameUnicodeTransliterator(), TRUE);
1515
1516 RemoveTransliterator::registerIDs(); // Must be within mutex
1517 EscapeTransliterator::registerIDs();
1518 UnescapeTransliterator::registerIDs();
1519 NormalizationTransliterator::registerIDs();
1520 AnyTransliterator::registerIDs();
1521
1522 _registerSpecialInverse(NullTransliterator::SHORT_ID,
1523 NullTransliterator::SHORT_ID, FALSE);
1524 _registerSpecialInverse(UNICODE_STRING_SIMPLE("Upper"),
1525 UNICODE_STRING_SIMPLE("Lower"), TRUE);
1526 _registerSpecialInverse(UNICODE_STRING_SIMPLE("Title"),
1527 UNICODE_STRING_SIMPLE("Lower"), FALSE);
1528
1529 ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, transliterator_cleanup);
1530
1531 return TRUE;
1532 }
1533
1534 U_NAMESPACE_END
1535
1536 // Defined in ucln_in.h:
1537
1538 /**
1539 * Release all static memory held by transliterator. This will
1540 * necessarily invalidate any rule-based transliterators held by the
1541 * user, because RBTs hold pointers to common data objects.
1542 */
1543 U_CFUNC UBool transliterator_cleanup(void) {
1544 TransliteratorIDParser::cleanup();
1545 if (registry) {
1546 delete registry;
1547 registry = NULL;
1548 }
1549 umtx_destroy(&registryMutex);
1550 return TRUE;
1551 }
1552
1553 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
1554
1555 //eof