2 **********************************************************************
3 * Copyright (c) 2001-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 08/10/2001 aliu Creation.
8 **********************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_TRANSLITERATION
15 #include "unicode/translit.h"
16 #include "unicode/resbund.h"
17 #include "unicode/uniset.h"
18 #include "unicode/uscript.h"
30 // Enable the following symbol to add debugging code that tracks the
31 // allocation, deletion, and use of Entry objects. BoundsChecker has
32 // reported dangling pointer errors with these objects, but I have
33 // been unable to confirm them. I suspect BoundsChecker is getting
34 // confused with pointers going into and coming out of a UHashtable,
35 // despite the hinting code that is designed to help it.
42 static const UChar LOCALE_SEP
= 95; // '_'
43 //static const UChar ID_SEP = 0x002D; /*-*/
44 //static const UChar VARIANT_SEP = 0x002F; // '/'
47 static const UChar ANY
[] = { 65, 110, 121, 0 }; // Any
50 #define NO_VARIANT UnicodeString()
53 * Resource bundle key for the RuleBasedTransliterator rule.
55 //static const char RB_RULE[] = "Rule";
59 //------------------------------------------------------------------
61 //------------------------------------------------------------------
63 TransliteratorAlias::TransliteratorAlias(const UnicodeString
& theAliasID
,
64 const UnicodeSet
* cpdFilter
) :
66 aliasesOrRules(theAliasID
),
68 compoundFilter(cpdFilter
),
69 direction(UTRANS_FORWARD
),
70 type(TransliteratorAlias::SIMPLE
) {
73 TransliteratorAlias::TransliteratorAlias(const UnicodeString
& theID
,
74 const UnicodeString
& idBlocks
,
75 UVector
* adoptedTransliterators
,
76 const UnicodeSet
* cpdFilter
) :
78 aliasesOrRules(idBlocks
),
79 transes(adoptedTransliterators
),
80 compoundFilter(cpdFilter
),
81 direction(UTRANS_FORWARD
),
82 type(TransliteratorAlias::COMPOUND
) {
85 TransliteratorAlias::TransliteratorAlias(const UnicodeString
& theID
,
86 const UnicodeString
& rules
,
87 UTransDirection dir
) :
89 aliasesOrRules(rules
),
93 type(TransliteratorAlias::RULES
) {
96 TransliteratorAlias::~TransliteratorAlias() {
101 Transliterator
* TransliteratorAlias::create(UParseError
& pe
,
106 Transliterator
*t
= NULL
;
109 t
= Transliterator::createInstance(aliasesOrRules
, UTRANS_FORWARD
, pe
, ec
);
113 if (compoundFilter
!= 0)
114 t
->adoptFilter((UnicodeSet
*)compoundFilter
->clone());
118 // the total number of transliterators in the compound is the total number of anonymous transliterators
119 // plus the total number of ID blocks-- we start by assuming the list begins and ends with an ID
120 // block and that each pair of anonymous transliterators has an ID block between them. Then we go back
121 // to see whether there really are ID blocks at the beginning and end (by looking for U+FFFF, which
122 // marks the position where an anonymous transliterator goes) and adjust accordingly
123 int32_t anonymousRBTs
= transes
->size();
124 int32_t transCount
= anonymousRBTs
* 2 + 1;
125 if (!aliasesOrRules
.isEmpty() && aliasesOrRules
[0] == (UChar
)(0xffff))
127 if (aliasesOrRules
.length() >= 2 && aliasesOrRules
[aliasesOrRules
.length() - 1] == (UChar
)(0xffff))
129 UnicodeString
noIDBlock((UChar
)(0xffff));
130 noIDBlock
+= ((UChar
)(0xffff));
131 int32_t pos
= aliasesOrRules
.indexOf(noIDBlock
);
134 pos
= aliasesOrRules
.indexOf(noIDBlock
, pos
+ 1);
137 UVector
transliterators(ec
);
138 UnicodeString idBlock
;
139 int32_t blockSeparatorPos
= aliasesOrRules
.indexOf((UChar
)(0xffff));
140 while (blockSeparatorPos
>= 0) {
141 aliasesOrRules
.extract(0, blockSeparatorPos
, idBlock
);
142 aliasesOrRules
.remove(0, blockSeparatorPos
+ 1);
143 if (!idBlock
.isEmpty())
144 transliterators
.addElement(Transliterator::createInstance(idBlock
, UTRANS_FORWARD
, pe
, ec
), ec
);
145 if (!transes
->isEmpty())
146 transliterators
.addElement(transes
->orphanElementAt(0), ec
);
147 blockSeparatorPos
= aliasesOrRules
.indexOf((UChar
)(0xffff));
149 if (!aliasesOrRules
.isEmpty())
150 transliterators
.addElement(Transliterator::createInstance(aliasesOrRules
, UTRANS_FORWARD
, pe
, ec
), ec
);
151 while (!transes
->isEmpty())
152 transliterators
.addElement(transes
->orphanElementAt(0), ec
);
155 t
= new CompoundTransliterator(ID
, transliterators
,
156 (compoundFilter
? (UnicodeSet
*)(compoundFilter
->clone()) : 0),
157 anonymousRBTs
, pe
, ec
);
159 ec
= U_MEMORY_ALLOCATION_ERROR
;
163 for (int32_t i
= 0; i
< transliterators
.size(); i
++)
164 delete (Transliterator
*)(transliterators
.elementAt(i
));
169 U_ASSERT(FALSE
); // don't call create() if isRuleBased() returns TRUE!
175 UBool
TransliteratorAlias::isRuleBased() const {
176 return type
== RULES
;
179 void TransliteratorAlias::parse(TransliteratorParser
& parser
,
180 UParseError
& pe
, UErrorCode
& ec
) const {
181 U_ASSERT(type
== RULES
);
186 parser
.parse(aliasesOrRules
, direction
, pe
, ec
);
189 //----------------------------------------------------------------------
190 // class TransliteratorSpec
191 //----------------------------------------------------------------------
194 * A TransliteratorSpec is a string specifying either a source or a target. In more
195 * general terms, it may also specify a variant, but we only use the
196 * Spec class for sources and targets.
198 * A Spec may be a locale or a script. If it is a locale, it has a
199 * fallback chain that goes xx_YY_ZZZ -> xx_YY -> xx -> ssss, where
200 * ssss is the script mapping of xx_YY_ZZZ. The Spec API methods
201 * hasFallback(), next(), and reset() iterate over this fallback
204 * The Spec class canonicalizes itself, so the locale is put into
205 * canonical form, or the script is transformed from an abbreviation
208 class TransliteratorSpec
: public UMemory
{
210 TransliteratorSpec(const UnicodeString
& spec
);
211 ~TransliteratorSpec();
213 const UnicodeString
& get() const;
214 UBool
hasFallback() const;
215 const UnicodeString
& next();
218 UBool
isLocale() const;
219 ResourceBundle
& getBundle() const;
221 operator const UnicodeString
&() const { return get(); }
222 const UnicodeString
& getTop() const { return top
; }
229 UnicodeString nextSpec
;
230 UnicodeString scriptName
;
231 UBool isSpecLocale
; // TRUE if spec is a locale
232 UBool isNextLocale
; // TRUE if nextSpec is a locale
235 TransliteratorSpec(const TransliteratorSpec
&other
); // forbid copying of this class
236 TransliteratorSpec
&operator=(const TransliteratorSpec
&other
); // forbid copying of this class
239 TransliteratorSpec::TransliteratorSpec(const UnicodeString
& theSpec
)
243 UErrorCode status
= U_ZERO_ERROR
;
245 LocaleUtility::initLocaleFromName(theSpec
, topLoc
);
246 if (!topLoc
.isBogus()) {
247 res
= new ResourceBundle(U_ICUDATA_TRANSLIT
, topLoc
, status
);
252 if (U_FAILURE(status
) || status
== U_USING_DEFAULT_WARNING
) {
258 // Canonicalize script name -or- do locale->script mapping
259 status
= U_ZERO_ERROR
;
260 static const int32_t capacity
= 10;
261 UScriptCode script
[capacity
]={USCRIPT_INVALID_CODE
};
262 int32_t num
= uscript_getCode(CharString().appendInvariantChars(theSpec
, status
).data(),
263 script
, capacity
, &status
);
264 if (num
> 0 && script
[0] != USCRIPT_INVALID_CODE
) {
265 scriptName
= UnicodeString(uscript_getName(script
[0]), -1, US_INV
);
270 // Canonicalize locale name
271 UnicodeString locStr
;
272 LocaleUtility::initNameFromLocale(topLoc
, locStr
);
273 if (!locStr
.isBogus()) {
276 } else if (scriptName
.length() != 0) {
277 // We are a script; use canonical name
281 // assert(spec != top);
285 TransliteratorSpec::~TransliteratorSpec() {
289 UBool
TransliteratorSpec::hasFallback() const {
290 return nextSpec
.length() != 0;
293 void TransliteratorSpec::reset() {
296 isSpecLocale
= (res
!= 0);
301 void TransliteratorSpec::setupNext() {
302 isNextLocale
= FALSE
;
305 int32_t i
= nextSpec
.lastIndexOf(LOCALE_SEP
);
306 // If i == 0 then we have _FOO, so we fall through
307 // to the scriptName.
309 nextSpec
.truncate(i
);
312 nextSpec
= scriptName
; // scriptName may be empty
315 // spec is a script, so we are at the end
316 nextSpec
.truncate(0);
321 // for(const UnicodeString& s(spec.get());
322 // spec.hasFallback(); s(spec.next())) { ...
324 const UnicodeString
& TransliteratorSpec::next() {
326 isSpecLocale
= isNextLocale
;
331 const UnicodeString
& TransliteratorSpec::get() const {
335 UBool
TransliteratorSpec::isLocale() const {
339 ResourceBundle
& TransliteratorSpec::getBundle() const {
343 //----------------------------------------------------------------------
347 // Vector of Entry pointers currently in use
348 static UVector
* DEBUG_entries
= NULL
;
350 static void DEBUG_setup() {
351 if (DEBUG_entries
== NULL
) {
352 UErrorCode ec
= U_ZERO_ERROR
;
353 DEBUG_entries
= new UVector(ec
);
357 // Caller must call DEBUG_setup first. Return index of given Entry,
358 // if it is in use (not deleted yet), or -1 if not found.
359 static int DEBUG_findEntry(TransliteratorEntry
* e
) {
360 for (int i
=0; i
<DEBUG_entries
->size(); ++i
) {
361 if (e
== (TransliteratorEntry
*) DEBUG_entries
->elementAt(i
)) {
368 // Track object creation
369 static void DEBUG_newEntry(TransliteratorEntry
* e
) {
371 if (DEBUG_findEntry(e
) >= 0) {
372 // This should really never happen unless the heap is broken
373 printf("ERROR DEBUG_newEntry duplicate new pointer %08X\n", e
);
376 UErrorCode ec
= U_ZERO_ERROR
;
377 DEBUG_entries
->addElement(e
, ec
);
380 // Track object deletion
381 static void DEBUG_delEntry(TransliteratorEntry
* e
) {
383 int i
= DEBUG_findEntry(e
);
385 printf("ERROR DEBUG_delEntry possible double deletion %08X\n", e
);
388 DEBUG_entries
->removeElementAt(i
);
391 // Track object usage
392 static void DEBUG_useEntry(TransliteratorEntry
* e
) {
393 if (e
== NULL
) return;
395 int i
= DEBUG_findEntry(e
);
397 printf("ERROR DEBUG_useEntry possible dangling pointer %08X\n", e
);
402 // If we're not debugging then make these macros into NOPs
403 #define DEBUG_newEntry(x)
404 #define DEBUG_delEntry(x)
405 #define DEBUG_useEntry(x)
408 //----------------------------------------------------------------------
410 //----------------------------------------------------------------------
413 * The Entry object stores objects of different types and
414 * singleton objects as placeholders for rule-based transliterators to
415 * be built as needed. Instances of this struct can be placeholders,
416 * can represent prototype transliterators to be cloned, or can
417 * represent TransliteratorData objects. We don't support storing
418 * classes in the registry because we don't have the rtti infrastructure
419 * for it. We could easily add this if there is a need for it in the
422 class TransliteratorEntry
: public UMemory
{
433 NONE
// Only used for uninitialized entries
435 // NOTE: stringArg cannot go inside the union because
436 // it has a copy constructor
437 UnicodeString stringArg
; // For RULES_*, ALIAS, COMPOUND_RBT
438 int32_t intArg
; // For COMPOUND_RBT, LOCALE_RULES
439 UnicodeSet
* compoundFilter
; // For COMPOUND_RBT
441 Transliterator
* prototype
; // For PROTOTYPE
442 TransliterationRuleData
* data
; // For RBT_DATA
443 UVector
* dataVector
; // For COMPOUND_RBT
445 Transliterator::Factory function
;
446 Transliterator::Token context
;
447 } factory
; // For FACTORY
449 TransliteratorEntry();
450 ~TransliteratorEntry();
451 void adoptPrototype(Transliterator
* adopted
);
452 void setFactory(Transliterator::Factory factory
,
453 Transliterator::Token context
);
457 TransliteratorEntry(const TransliteratorEntry
&other
); // forbid copying of this class
458 TransliteratorEntry
&operator=(const TransliteratorEntry
&other
); // forbid copying of this class
461 TransliteratorEntry::TransliteratorEntry() {
463 compoundFilter
= NULL
;
465 DEBUG_newEntry(this);
468 TransliteratorEntry::~TransliteratorEntry() {
469 DEBUG_delEntry(this);
470 if (entryType
== PROTOTYPE
) {
472 } else if (entryType
== RBT_DATA
) {
473 // The data object is shared between instances of RBT. The
474 // entry object owns it. It should only be deleted when the
475 // transliterator component is being cleaned up. Doing so
476 // invalidates any RBTs that the user has instantiated.
478 } else if (entryType
== COMPOUND_RBT
) {
479 while (u
.dataVector
!= NULL
&& !u
.dataVector
->isEmpty())
480 delete (TransliterationRuleData
*)u
.dataVector
->orphanElementAt(0);
483 delete compoundFilter
;
486 void TransliteratorEntry::adoptPrototype(Transliterator
* adopted
) {
487 if (entryType
== PROTOTYPE
) {
490 entryType
= PROTOTYPE
;
491 u
.prototype
= adopted
;
494 void TransliteratorEntry::setFactory(Transliterator::Factory factory
,
495 Transliterator::Token context
) {
496 if (entryType
== PROTOTYPE
) {
500 u
.factory
.function
= factory
;
501 u
.factory
.context
= context
;
504 // UObjectDeleter for Hashtable::setValueDeleter
506 static void U_CALLCONV
507 deleteEntry(void* obj
) {
508 delete (TransliteratorEntry
*) obj
;
512 //----------------------------------------------------------------------
513 // class TransliteratorRegistry: Basic public API
514 //----------------------------------------------------------------------
516 TransliteratorRegistry::TransliteratorRegistry(UErrorCode
& status
) :
517 registry(TRUE
, status
),
518 specDAG(TRUE
, status
),
521 registry
.setValueDeleter(deleteEntry
);
522 availableIDs
.setDeleter(uprv_deleteUObject
);
523 availableIDs
.setComparer(uhash_compareCaselessUnicodeString
);
524 specDAG
.setValueDeleter(uhash_deleteHashtable
);
527 TransliteratorRegistry::~TransliteratorRegistry() {
528 // Through the magic of C++, everything cleans itself up
531 Transliterator
* TransliteratorRegistry::get(const UnicodeString
& ID
,
532 TransliteratorAlias
*& aliasReturn
,
533 UErrorCode
& status
) {
534 U_ASSERT(aliasReturn
== NULL
);
535 TransliteratorEntry
*entry
= find(ID
);
536 return (entry
== 0) ? 0
537 : instantiateEntry(ID
, entry
, aliasReturn
, status
);
540 Transliterator
* TransliteratorRegistry::reget(const UnicodeString
& ID
,
541 TransliteratorParser
& parser
,
542 TransliteratorAlias
*& aliasReturn
,
543 UErrorCode
& status
) {
544 U_ASSERT(aliasReturn
== NULL
);
545 TransliteratorEntry
*entry
= find(ID
);
548 // We get to this point if there are two threads, one of which
549 // is instantiating an ID, and another of which is removing
550 // the same ID from the registry, and the timing is just right.
554 // The usage model for the caller is that they will first call
555 // reg->get() inside the mutex, they'll get back an alias, they call
556 // alias->isRuleBased(), and if they get TRUE, they call alias->parse()
557 // outside the mutex, then reg->reget() inside the mutex again. A real
558 // mess, but it gets things working for ICU 3.0. [alan].
560 // Note: It's possible that in between the caller calling
561 // alias->parse() and reg->reget(), that another thread will have
562 // called reg->reget(), and the entry will already have been fixed up.
563 // We have to detect this so we don't stomp over existing entry
564 // data members and potentially leak memory (u.data and compoundFilter).
566 if (entry
->entryType
== TransliteratorEntry::RULES_FORWARD
||
567 entry
->entryType
== TransliteratorEntry::RULES_REVERSE
||
568 entry
->entryType
== TransliteratorEntry::LOCALE_RULES
) {
570 if (parser
.idBlockVector
.isEmpty() && parser
.dataVector
.isEmpty()) {
572 entry
->entryType
= TransliteratorEntry::ALIAS
;
573 entry
->stringArg
= UNICODE_STRING_SIMPLE("Any-NULL");
575 else if (parser
.idBlockVector
.isEmpty() && parser
.dataVector
.size() == 1) {
576 entry
->u
.data
= (TransliterationRuleData
*)parser
.dataVector
.orphanElementAt(0);
577 entry
->entryType
= TransliteratorEntry::RBT_DATA
;
579 else if (parser
.idBlockVector
.size() == 1 && parser
.dataVector
.isEmpty()) {
580 entry
->stringArg
= *(UnicodeString
*)(parser
.idBlockVector
.elementAt(0));
581 entry
->compoundFilter
= parser
.orphanCompoundFilter();
582 entry
->entryType
= TransliteratorEntry::ALIAS
;
585 entry
->entryType
= TransliteratorEntry::COMPOUND_RBT
;
586 entry
->compoundFilter
= parser
.orphanCompoundFilter();
587 entry
->u
.dataVector
= new UVector(status
);
588 entry
->stringArg
.remove();
590 int32_t limit
= parser
.idBlockVector
.size();
591 if (parser
.dataVector
.size() > limit
)
592 limit
= parser
.dataVector
.size();
594 for (int32_t i
= 0; i
< limit
; i
++) {
595 if (i
< parser
.idBlockVector
.size()) {
596 UnicodeString
* idBlock
= (UnicodeString
*)parser
.idBlockVector
.elementAt(i
);
597 if (!idBlock
->isEmpty())
598 entry
->stringArg
+= *idBlock
;
600 if (!parser
.dataVector
.isEmpty()) {
601 TransliterationRuleData
* data
= (TransliterationRuleData
*)parser
.dataVector
.orphanElementAt(0);
602 entry
->u
.dataVector
->addElement(data
, status
);
603 entry
->stringArg
+= (UChar
)0xffff; // use U+FFFF to mark position of RBTs in ID block
610 instantiateEntry(ID
, entry
, aliasReturn
, status
);
614 void TransliteratorRegistry::put(Transliterator
* adoptedProto
,
618 TransliteratorEntry
*entry
= new TransliteratorEntry();
620 ec
= U_MEMORY_ALLOCATION_ERROR
;
623 entry
->adoptPrototype(adoptedProto
);
624 registerEntry(adoptedProto
->getID(), entry
, visible
);
627 void TransliteratorRegistry::put(const UnicodeString
& ID
,
628 Transliterator::Factory factory
,
629 Transliterator::Token context
,
632 TransliteratorEntry
*entry
= new TransliteratorEntry();
634 ec
= U_MEMORY_ALLOCATION_ERROR
;
637 entry
->setFactory(factory
, context
);
638 registerEntry(ID
, entry
, visible
);
641 void TransliteratorRegistry::put(const UnicodeString
& ID
,
642 const UnicodeString
& resourceName
,
644 UBool readonlyResourceAlias
,
647 TransliteratorEntry
*entry
= new TransliteratorEntry();
649 ec
= U_MEMORY_ALLOCATION_ERROR
;
652 entry
->entryType
= (dir
== UTRANS_FORWARD
) ? TransliteratorEntry::RULES_FORWARD
653 : TransliteratorEntry::RULES_REVERSE
;
654 if (readonlyResourceAlias
) {
655 entry
->stringArg
.setTo(TRUE
, resourceName
.getBuffer(), -1);
658 entry
->stringArg
= resourceName
;
660 registerEntry(ID
, entry
, visible
);
663 void TransliteratorRegistry::put(const UnicodeString
& ID
,
664 const UnicodeString
& alias
,
665 UBool readonlyAliasAlias
,
667 UErrorCode
& /*ec*/) {
668 TransliteratorEntry
*entry
= new TransliteratorEntry();
669 // Null pointer check
671 entry
->entryType
= TransliteratorEntry::ALIAS
;
672 if (readonlyAliasAlias
) {
673 entry
->stringArg
.setTo(TRUE
, alias
.getBuffer(), -1);
676 entry
->stringArg
= alias
;
678 registerEntry(ID
, entry
, visible
);
682 void TransliteratorRegistry::remove(const UnicodeString
& ID
) {
683 UnicodeString source
, target
, variant
;
685 TransliteratorIDParser::IDtoSTV(ID
, source
, target
, variant
, sawSource
);
686 // Only need to do this if ID.indexOf('-') < 0
688 TransliteratorIDParser::STVtoID(source
, target
, variant
, id
);
690 removeSTV(source
, target
, variant
);
691 availableIDs
.removeElement((void*) &id
);
694 //----------------------------------------------------------------------
695 // class TransliteratorRegistry: Public ID and spec management
696 //----------------------------------------------------------------------
699 * == OBSOLETE - remove in ICU 3.4 ==
700 * Return the number of IDs currently registered with the system.
701 * To retrieve the actual IDs, call getAvailableID(i) with
702 * i from 0 to countAvailableIDs() - 1.
704 int32_t TransliteratorRegistry::countAvailableIDs(void) const {
705 return availableIDs
.size();
709 * == OBSOLETE - remove in ICU 3.4 ==
710 * Return the index-th available ID. index must be between 0
711 * and countAvailableIDs() - 1, inclusive. If index is out of
712 * range, the result of getAvailableID(0) is returned.
714 const UnicodeString
& TransliteratorRegistry::getAvailableID(int32_t index
) const {
715 if (index
< 0 || index
>= availableIDs
.size()) {
718 return *(const UnicodeString
*) availableIDs
[index
];
721 StringEnumeration
* TransliteratorRegistry::getAvailableIDs() const {
722 return new Enumeration(*this);
725 int32_t TransliteratorRegistry::countAvailableSources(void) const {
726 return specDAG
.count();
729 UnicodeString
& TransliteratorRegistry::getAvailableSource(int32_t index
,
730 UnicodeString
& result
) const {
732 const UHashElement
*e
= 0;
733 while (index
-- >= 0) {
734 e
= specDAG
.nextElement(pos
);
742 result
= *(UnicodeString
*) e
->key
.pointer
;
747 int32_t TransliteratorRegistry::countAvailableTargets(const UnicodeString
& source
) const {
748 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
749 return (targets
== 0) ? 0 : targets
->count();
752 UnicodeString
& TransliteratorRegistry::getAvailableTarget(int32_t index
,
753 const UnicodeString
& source
,
754 UnicodeString
& result
) const {
755 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
757 result
.truncate(0); // invalid source
761 const UHashElement
*e
= 0;
762 while (index
-- >= 0) {
763 e
= targets
->nextElement(pos
);
769 result
.truncate(0); // invalid index
771 result
= *(UnicodeString
*) e
->key
.pointer
;
776 int32_t TransliteratorRegistry::countAvailableVariants(const UnicodeString
& source
,
777 const UnicodeString
& target
) const {
778 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
782 UVector
*variants
= (UVector
*) targets
->get(target
);
783 // variants may be 0 if the source/target are invalid
784 return (variants
== 0) ? 0 : variants
->size();
787 UnicodeString
& TransliteratorRegistry::getAvailableVariant(int32_t index
,
788 const UnicodeString
& source
,
789 const UnicodeString
& target
,
790 UnicodeString
& result
) const {
791 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
793 result
.truncate(0); // invalid source
796 UVector
*variants
= (UVector
*) targets
->get(target
);
798 result
.truncate(0); // invalid target
801 UnicodeString
*v
= (UnicodeString
*) variants
->elementAt(index
);
803 result
.truncate(0); // invalid index
810 //----------------------------------------------------------------------
811 // class TransliteratorRegistry::Enumeration
812 //----------------------------------------------------------------------
814 TransliteratorRegistry::Enumeration::Enumeration(const TransliteratorRegistry
& _reg
) :
815 index(0), reg(_reg
) {
818 TransliteratorRegistry::Enumeration::~Enumeration() {
821 int32_t TransliteratorRegistry::Enumeration::count(UErrorCode
& /*status*/) const {
822 return reg
.availableIDs
.size();
825 const UnicodeString
* TransliteratorRegistry::Enumeration::snext(UErrorCode
& status
) {
826 // This is sloppy but safe -- if we get out of sync with the underlying
827 // registry, we will still return legal strings, but they might not
828 // correspond to the snapshot at construction time. So there could be
829 // duplicate IDs or omitted IDs if insertions or deletions occur in one
830 // thread while another is iterating. To be more rigorous, add a timestamp,
831 // which is incremented with any modification, and validate this iterator
832 // against the timestamp at construction time. This probably isn't worth
833 // doing as long as there is some possibility of removing this code in favor
834 // of some new code based on Doug's service framework.
835 if (U_FAILURE(status
)) {
838 int32_t n
= reg
.availableIDs
.size();
840 status
= U_ENUM_OUT_OF_SYNC_ERROR
;
842 // index == n is okay -- this means we've reached the end
844 // Copy the string! This avoids lifetime problems.
845 unistr
= *(const UnicodeString
*)reg
.availableIDs
[index
++];
852 void TransliteratorRegistry::Enumeration::reset(UErrorCode
& /*status*/) {
856 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TransliteratorRegistry::Enumeration
)
858 //----------------------------------------------------------------------
859 // class TransliteratorRegistry: internal
860 //----------------------------------------------------------------------
863 * Convenience method. Calls 6-arg registerEntry().
865 void TransliteratorRegistry::registerEntry(const UnicodeString
& source
,
866 const UnicodeString
& target
,
867 const UnicodeString
& variant
,
868 TransliteratorEntry
* adopted
,
871 UnicodeString
s(source
);
872 if (s
.length() == 0) {
873 s
.setTo(TRUE
, ANY
, 3);
875 TransliteratorIDParser::STVtoID(source
, target
, variant
, ID
);
876 registerEntry(ID
, s
, target
, variant
, adopted
, visible
);
880 * Convenience method. Calls 6-arg registerEntry().
882 void TransliteratorRegistry::registerEntry(const UnicodeString
& ID
,
883 TransliteratorEntry
* adopted
,
885 UnicodeString source
, target
, variant
;
887 TransliteratorIDParser::IDtoSTV(ID
, source
, target
, variant
, sawSource
);
888 // Only need to do this if ID.indexOf('-') < 0
890 TransliteratorIDParser::STVtoID(source
, target
, variant
, id
);
891 registerEntry(id
, source
, target
, variant
, adopted
, visible
);
895 * Register an entry object (adopted) with the given ID, source,
896 * target, and variant strings.
898 void TransliteratorRegistry::registerEntry(const UnicodeString
& ID
,
899 const UnicodeString
& source
,
900 const UnicodeString
& target
,
901 const UnicodeString
& variant
,
902 TransliteratorEntry
* adopted
,
904 UErrorCode status
= U_ZERO_ERROR
;
905 registry
.put(ID
, adopted
, status
);
907 registerSTV(source
, target
, variant
);
908 if (!availableIDs
.contains((void*) &ID
)) {
909 UnicodeString
*newID
= (UnicodeString
*)ID
.clone();
910 // Check to make sure newID was created.
912 // NUL-terminate the ID string
913 newID
->getTerminatedBuffer();
914 availableIDs
.addElement(newID
, status
);
918 removeSTV(source
, target
, variant
);
919 availableIDs
.removeElement((void*) &ID
);
924 * Register a source-target/variant in the specDAG. Variant may be
925 * empty, but source and target must not be. If variant is empty then
926 * the special variant NO_VARIANT is stored in slot zero of the
927 * UVector of variants.
929 void TransliteratorRegistry::registerSTV(const UnicodeString
& source
,
930 const UnicodeString
& target
,
931 const UnicodeString
& variant
) {
932 // assert(source.length() > 0);
933 // assert(target.length() > 0);
934 UErrorCode status
= U_ZERO_ERROR
;
935 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
937 targets
= new Hashtable(TRUE
, status
);
938 if (U_FAILURE(status
) || targets
== 0) {
941 targets
->setValueDeleter(uprv_deleteUObject
);
942 specDAG
.put(source
, targets
, status
);
944 UVector
*variants
= (UVector
*) targets
->get(target
);
946 variants
= new UVector(uprv_deleteUObject
,
947 uhash_compareCaselessUnicodeString
, status
);
951 targets
->put(target
, variants
, status
);
953 // assert(NO_VARIANT == "");
954 // We add the variant string. If it is the special "no variant"
955 // string, that is, the empty string, we add it at position zero.
956 if (!variants
->contains((void*) &variant
)) {
957 UnicodeString
*tempus
; // Used for null pointer check.
958 if (variant
.length() > 0) {
959 tempus
= new UnicodeString(variant
);
960 if (tempus
!= NULL
) {
961 variants
->addElement(tempus
, status
);
964 tempus
= new UnicodeString(); // = NO_VARIANT
965 if (tempus
!= NULL
) {
966 variants
->insertElementAt(tempus
, 0, status
);
973 * Remove a source-target/variant from the specDAG.
975 void TransliteratorRegistry::removeSTV(const UnicodeString
& source
,
976 const UnicodeString
& target
,
977 const UnicodeString
& variant
) {
978 // assert(source.length() > 0);
979 // assert(target.length() > 0);
980 // UErrorCode status = U_ZERO_ERROR;
981 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
983 return; // should never happen for valid s-t/v
985 UVector
*variants
= (UVector
*) targets
->get(target
);
987 return; // should never happen for valid s-t/v
989 variants
->removeElement((void*) &variant
);
990 if (variants
->size() == 0) {
991 targets
->remove(target
); // should delete variants
992 if (targets
->count() == 0) {
993 specDAG
.remove(source
); // should delete targets
999 * Attempt to find a source-target/variant in the dynamic registry
1000 * store. Return 0 on failure.
1002 * Caller does NOT own returned object.
1004 TransliteratorEntry
* TransliteratorRegistry::findInDynamicStore(const TransliteratorSpec
& src
,
1005 const TransliteratorSpec
& trg
,
1006 const UnicodeString
& variant
) const {
1008 TransliteratorIDParser::STVtoID(src
, trg
, variant
, ID
);
1009 TransliteratorEntry
*e
= (TransliteratorEntry
*) registry
.get(ID
);
1015 * Attempt to find a source-target/variant in the static locale
1016 * resource store. Do not perform fallback. Return 0 on failure.
1018 * On success, create a new entry object, register it in the dynamic
1019 * store, and return a pointer to it, but do not make it public --
1020 * just because someone requested something, we do not expand the
1021 * available ID list (or spec DAG).
1023 * Caller does NOT own returned object.
1025 TransliteratorEntry
* TransliteratorRegistry::findInStaticStore(const TransliteratorSpec
& src
,
1026 const TransliteratorSpec
& trg
,
1027 const UnicodeString
& variant
) {
1028 TransliteratorEntry
* entry
= 0;
1029 if (src
.isLocale()) {
1030 entry
= findInBundle(src
, trg
, variant
, UTRANS_FORWARD
);
1031 } else if (trg
.isLocale()) {
1032 entry
= findInBundle(trg
, src
, variant
, UTRANS_REVERSE
);
1035 // If we found an entry, store it in the Hashtable for next
1038 registerEntry(src
.getTop(), trg
.getTop(), variant
, entry
, FALSE
);
1044 // As of 2.0, resource bundle keys cannot contain '_'
1045 static const UChar TRANSLITERATE_TO
[] = {84,114,97,110,115,108,105,116,101,114,97,116,101,84,111,0}; // "TransliterateTo"
1047 static const UChar TRANSLITERATE_FROM
[] = {84,114,97,110,115,108,105,116,101,114,97,116,101,70,114,111,109,0}; // "TransliterateFrom"
1049 static const UChar TRANSLITERATE
[] = {84,114,97,110,115,108,105,116,101,114,97,116,101,0}; // "Transliterate"
1052 * Attempt to find an entry in a single resource bundle. This is
1053 * a one-sided lookup. findInStaticStore() performs up to two such
1054 * lookups, one for the source, and one for the target.
1056 * Do not perform fallback. Return 0 on failure.
1058 * On success, create a new Entry object, populate it, and return it.
1059 * The caller owns the returned object.
1061 TransliteratorEntry
* TransliteratorRegistry::findInBundle(const TransliteratorSpec
& specToOpen
,
1062 const TransliteratorSpec
& specToFind
,
1063 const UnicodeString
& variant
,
1064 UTransDirection direction
)
1067 UnicodeString resStr
;
1070 for (pass
=0; pass
<2; ++pass
) {
1072 // First try either TransliterateTo_xxx or
1073 // TransliterateFrom_xxx, then try the bidirectional
1074 // Transliterate_xxx. This precedence order is arbitrary
1075 // but must be consistent and documented.
1077 utag
.append(direction
== UTRANS_FORWARD
?
1078 TRANSLITERATE_TO
: TRANSLITERATE_FROM
, -1);
1080 utag
.append(TRANSLITERATE
, -1);
1082 UnicodeString
s(specToFind
.get());
1083 utag
.append(s
.toUpper(""));
1084 UErrorCode status
= U_ZERO_ERROR
;
1085 ResourceBundle
subres(specToOpen
.getBundle().get(
1086 CharString().appendInvariantChars(utag
, status
).data(), status
));
1087 if (U_FAILURE(status
) || status
== U_USING_DEFAULT_WARNING
) {
1092 if (specToOpen
.get() != LocaleUtility::initNameFromLocale(subres
.getLocale(), s
)) {
1096 if (variant
.length() != 0) {
1097 status
= U_ZERO_ERROR
;
1098 resStr
= subres
.getStringEx(
1099 CharString().appendInvariantChars(variant
, status
).data(), status
);
1100 if (U_SUCCESS(status
)) {
1101 // Exit loop successfully
1105 // Variant is empty, which means match the first variant listed.
1106 status
= U_ZERO_ERROR
;
1107 resStr
= subres
.getStringEx(1, status
);
1108 if (U_SUCCESS(status
)) {
1109 // Exit loop successfully
1120 // We have succeeded in loading a string from the locale
1121 // resources. Create a new registry entry to hold it and return it.
1122 TransliteratorEntry
*entry
= new TransliteratorEntry();
1124 // The direction is always forward for the
1125 // TransliterateTo_xxx and TransliterateFrom_xxx
1126 // items; those are unidirectional forward rules.
1127 // For the bidirectional Transliterate_xxx items,
1128 // the direction is the value passed in to this
1130 int32_t dir
= (pass
== 0) ? UTRANS_FORWARD
: direction
;
1131 entry
->entryType
= TransliteratorEntry::LOCALE_RULES
;
1132 entry
->stringArg
= resStr
;
1133 entry
->intArg
= dir
;
1140 * Convenience method. Calls 3-arg find().
1142 TransliteratorEntry
* TransliteratorRegistry::find(const UnicodeString
& ID
) {
1143 UnicodeString source
, target
, variant
;
1145 TransliteratorIDParser::IDtoSTV(ID
, source
, target
, variant
, sawSource
);
1146 return find(source
, target
, variant
);
1150 * Top-level find method. Attempt to find a source-target/variant in
1151 * either the dynamic or the static (locale resource) store. Perform
1154 * Lookup sequence for ss_SS_SSS-tt_TT_TTT/v:
1156 * ss_SS_SSS-tt_TT_TTT/v -- in hashtable
1157 * ss_SS_SSS-tt_TT_TTT/v -- in ss_SS_SSS (no fallback)
1159 * repeat with t = tt_TT_TTT, tt_TT, tt, and tscript
1166 * Here * matches the first variant listed.
1168 * Caller does NOT own returned object. Return 0 on failure.
1170 TransliteratorEntry
* TransliteratorRegistry::find(UnicodeString
& source
,
1171 UnicodeString
& target
,
1172 UnicodeString
& variant
) {
1174 TransliteratorSpec
src(source
);
1175 TransliteratorSpec
trg(target
);
1176 TransliteratorEntry
* entry
;
1178 // Seek exact match in hashtable. Temporary fix for ICU 4.6.
1179 // TODO: The general logic for finding a matching transliterator needs to be reviewed.
1182 TransliteratorIDParser::STVtoID(source
, target
, variant
, ID
);
1183 entry
= (TransliteratorEntry
*) registry
.get(ID
);
1186 // std::cout << ID.toUTF8String(ss) << std::endl;
1190 if (variant
.length() != 0) {
1192 // Seek exact match in hashtable
1193 entry
= findInDynamicStore(src
, trg
, variant
);
1198 // Seek exact match in locale resources
1199 entry
= findInStaticStore(src
, trg
, variant
);
1208 // Seek match in hashtable
1209 entry
= findInDynamicStore(src
, trg
, NO_VARIANT
);
1214 // Seek match in locale resources
1215 entry
= findInStaticStore(src
, trg
, NO_VARIANT
);
1219 if (!src
.hasFallback()) {
1224 if (!trg
.hasFallback()) {
1234 * Given an Entry object, instantiate it. Caller owns result. Return
1237 * Return a non-empty aliasReturn value if the ID points to an alias.
1238 * We cannot instantiate it ourselves because the alias may contain
1239 * filters or compounds, which we do not understand. Caller should
1240 * make aliasReturn empty before calling.
1242 * The entry object is assumed to reside in the dynamic store. It may be
1245 Transliterator
* TransliteratorRegistry::instantiateEntry(const UnicodeString
& ID
,
1246 TransliteratorEntry
*entry
,
1247 TransliteratorAlias
* &aliasReturn
,
1248 UErrorCode
& status
) {
1249 Transliterator
*t
= 0;
1250 U_ASSERT(aliasReturn
== 0);
1252 switch (entry
->entryType
) {
1253 case TransliteratorEntry::RBT_DATA
:
1254 t
= new RuleBasedTransliterator(ID
, entry
->u
.data
);
1256 status
= U_MEMORY_ALLOCATION_ERROR
;
1259 case TransliteratorEntry::PROTOTYPE
:
1260 t
= entry
->u
.prototype
->clone();
1262 status
= U_MEMORY_ALLOCATION_ERROR
;
1265 case TransliteratorEntry::ALIAS
:
1266 aliasReturn
= new TransliteratorAlias(entry
->stringArg
, entry
->compoundFilter
);
1267 if (aliasReturn
== 0) {
1268 status
= U_MEMORY_ALLOCATION_ERROR
;
1271 case TransliteratorEntry::FACTORY
:
1272 t
= entry
->u
.factory
.function(ID
, entry
->u
.factory
.context
);
1274 status
= U_MEMORY_ALLOCATION_ERROR
;
1277 case TransliteratorEntry::COMPOUND_RBT
:
1279 UVector
* rbts
= new UVector(entry
->u
.dataVector
->size(), status
);
1280 // Check for null pointer
1282 status
= U_MEMORY_ALLOCATION_ERROR
;
1285 int32_t passNumber
= 1;
1286 for (int32_t i
= 0; U_SUCCESS(status
) && i
< entry
->u
.dataVector
->size(); i
++) {
1287 // TODO: Should passNumber be turned into a decimal-string representation (1 -> "1")?
1288 Transliterator
* t
= new RuleBasedTransliterator(UnicodeString(CompoundTransliterator::PASS_STRING
) + UnicodeString(passNumber
++),
1289 (TransliterationRuleData
*)(entry
->u
.dataVector
->elementAt(i
)), FALSE
);
1291 status
= U_MEMORY_ALLOCATION_ERROR
;
1293 rbts
->addElement(t
, status
);
1295 if (U_FAILURE(status
)) {
1299 aliasReturn
= new TransliteratorAlias(ID
, entry
->stringArg
, rbts
, entry
->compoundFilter
);
1301 if (aliasReturn
== 0) {
1302 status
= U_MEMORY_ALLOCATION_ERROR
;
1305 case TransliteratorEntry::LOCALE_RULES
:
1306 aliasReturn
= new TransliteratorAlias(ID
, entry
->stringArg
,
1307 (UTransDirection
) entry
->intArg
);
1308 if (aliasReturn
== 0) {
1309 status
= U_MEMORY_ALLOCATION_ERROR
;
1312 case TransliteratorEntry::RULES_FORWARD
:
1313 case TransliteratorEntry::RULES_REVERSE
:
1314 // Process the rule data into a TransliteratorRuleData object,
1315 // and possibly also into an ::id header and/or footer. Then
1316 // we modify the registry with the parsed data and retry.
1318 TransliteratorParser
parser(status
);
1320 // We use the file name, taken from another resource bundle
1321 // 2-d array at static init time, as a locale language. We're
1322 // just using the locale mechanism to map through to a file
1323 // name; this in no way represents an actual locale.
1324 //CharString ch(entry->stringArg);
1325 //UResourceBundle *bundle = ures_openDirect(0, ch, &status);
1326 UnicodeString rules
= entry
->stringArg
;
1327 //ures_close(bundle);
1329 //if (U_FAILURE(status)) {
1330 // We have a failure of some kind. Remove the ID from the
1331 // registry so we don't keep trying. NOTE: This will throw off
1332 // anyone who is, at the moment, trying to iterate over the
1333 // available IDs. That's acceptable since we should never
1334 // really get here except under installation, configuration,
1335 // or unrecoverable run time memory failures.
1339 // If the status indicates a failure, then we don't have any
1340 // rules -- there is probably an installation error. The list
1341 // in the root locale should correspond to all the installed
1342 // transliterators; if it lists something that's not
1343 // installed, we'll get an error from ResourceBundle.
1344 aliasReturn
= new TransliteratorAlias(ID
, rules
,
1345 ((entry
->entryType
== TransliteratorEntry::RULES_REVERSE
) ?
1346 UTRANS_REVERSE
: UTRANS_FORWARD
));
1347 if (aliasReturn
== 0) {
1348 status
= U_MEMORY_ALLOCATION_ERROR
;
1354 U_ASSERT(FALSE
); // can't get here
1360 #endif /* #if !UCONFIG_NO_TRANSLITERATION */