1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (c) 2001-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 08/10/2001 aliu Creation.
10 **********************************************************************
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_TRANSLITERATION
17 #include "unicode/translit.h"
18 #include "unicode/resbund.h"
19 #include "unicode/uniset.h"
20 #include "unicode/uscript.h"
32 // Enable the following symbol to add debugging code that tracks the
33 // allocation, deletion, and use of Entry objects. BoundsChecker has
34 // reported dangling pointer errors with these objects, but I have
35 // been unable to confirm them. I suspect BoundsChecker is getting
36 // confused with pointers going into and coming out of a UHashtable,
37 // despite the hinting code that is designed to help it.
44 static const UChar LOCALE_SEP
= 95; // '_'
45 //static const UChar ID_SEP = 0x002D; /*-*/
46 //static const UChar VARIANT_SEP = 0x002F; // '/'
49 static const UChar ANY
[] = { 0x41, 0x6E, 0x79, 0 }; // Any
50 static const UChar LAT
[] = { 0x4C, 0x61, 0x74, 0 }; // Lat
53 #define NO_VARIANT UnicodeString()
55 // initial estimate for specDAG size
56 // ICU 60 Transliterator::countAvailableSources()
57 //#define SPECDAG_INIT_SIZE 149
59 #define SPECDAG_INIT_SIZE 134
61 // initial estimate for number of variant names
62 #define VARIANT_LIST_INIT_SIZE 11
63 #define VARIANT_LIST_MAX_SIZE 31
65 // initial estimate for availableIDs count (default estimate is 8 => multiple reallocs)
66 // ICU 60 Transliterator::countAvailableIDs()
67 //#define AVAILABLE_IDS_INIT_SIZE 641
69 #define AVAILABLE_IDS_INIT_SIZE 493
71 // initial estimate for number of targets for source "Any", "Lat"
72 // ICU 60 Transliterator::countAvailableTargets("Any")/("Latn")
73 //#define ANY_TARGETS_INIT_SIZE 125
75 #define ANY_TARGETS_INIT_SIZE 102
76 #define LAT_TARGETS_INIT_SIZE 23
79 * Resource bundle key for the RuleBasedTransliterator rule.
81 //static const char RB_RULE[] = "Rule";
85 //------------------------------------------------------------------
87 //------------------------------------------------------------------
89 TransliteratorAlias::TransliteratorAlias(const UnicodeString
& theAliasID
,
90 const UnicodeSet
* cpdFilter
) :
92 aliasesOrRules(theAliasID
),
94 compoundFilter(cpdFilter
),
95 direction(UTRANS_FORWARD
),
96 type(TransliteratorAlias::SIMPLE
) {
99 TransliteratorAlias::TransliteratorAlias(const UnicodeString
& theID
,
100 const UnicodeString
& idBlocks
,
101 UVector
* adoptedTransliterators
,
102 const UnicodeSet
* cpdFilter
) :
104 aliasesOrRules(idBlocks
),
105 transes(adoptedTransliterators
),
106 compoundFilter(cpdFilter
),
107 direction(UTRANS_FORWARD
),
108 type(TransliteratorAlias::COMPOUND
) {
111 TransliteratorAlias::TransliteratorAlias(const UnicodeString
& theID
,
112 const UnicodeString
& rules
,
113 UTransDirection dir
) :
115 aliasesOrRules(rules
),
119 type(TransliteratorAlias::RULES
) {
122 TransliteratorAlias::~TransliteratorAlias() {
127 Transliterator
* TransliteratorAlias::create(UParseError
& pe
,
132 Transliterator
*t
= NULL
;
135 t
= Transliterator::createInstance(aliasesOrRules
, UTRANS_FORWARD
, pe
, ec
);
139 if (compoundFilter
!= 0)
140 t
->adoptFilter((UnicodeSet
*)compoundFilter
->clone());
144 // the total number of transliterators in the compound is the total number of anonymous transliterators
145 // plus the total number of ID blocks-- we start by assuming the list begins and ends with an ID
146 // block and that each pair of anonymous transliterators has an ID block between them. Then we go back
147 // to see whether there really are ID blocks at the beginning and end (by looking for U+FFFF, which
148 // marks the position where an anonymous transliterator goes) and adjust accordingly
149 int32_t anonymousRBTs
= transes
->size();
150 int32_t transCount
= anonymousRBTs
* 2 + 1;
151 if (!aliasesOrRules
.isEmpty() && aliasesOrRules
[0] == (UChar
)(0xffff))
153 if (aliasesOrRules
.length() >= 2 && aliasesOrRules
[aliasesOrRules
.length() - 1] == (UChar
)(0xffff))
155 UnicodeString
noIDBlock((UChar
)(0xffff));
156 noIDBlock
+= ((UChar
)(0xffff));
157 int32_t pos
= aliasesOrRules
.indexOf(noIDBlock
);
160 pos
= aliasesOrRules
.indexOf(noIDBlock
, pos
+ 1);
163 UVector
transliterators(ec
);
164 UnicodeString idBlock
;
165 int32_t blockSeparatorPos
= aliasesOrRules
.indexOf((UChar
)(0xffff));
166 while (blockSeparatorPos
>= 0) {
167 aliasesOrRules
.extract(0, blockSeparatorPos
, idBlock
);
168 aliasesOrRules
.remove(0, blockSeparatorPos
+ 1);
169 if (!idBlock
.isEmpty())
170 transliterators
.addElement(Transliterator::createInstance(idBlock
, UTRANS_FORWARD
, pe
, ec
), ec
);
171 if (!transes
->isEmpty())
172 transliterators
.addElement(transes
->orphanElementAt(0), ec
);
173 blockSeparatorPos
= aliasesOrRules
.indexOf((UChar
)(0xffff));
175 if (!aliasesOrRules
.isEmpty())
176 transliterators
.addElement(Transliterator::createInstance(aliasesOrRules
, UTRANS_FORWARD
, pe
, ec
), ec
);
177 while (!transes
->isEmpty())
178 transliterators
.addElement(transes
->orphanElementAt(0), ec
);
181 t
= new CompoundTransliterator(ID
, transliterators
,
182 (compoundFilter
? (UnicodeSet
*)(compoundFilter
->clone()) : 0),
183 anonymousRBTs
, pe
, ec
);
185 ec
= U_MEMORY_ALLOCATION_ERROR
;
189 for (int32_t i
= 0; i
< transliterators
.size(); i
++)
190 delete (Transliterator
*)(transliterators
.elementAt(i
));
195 UPRV_UNREACHABLE
; // don't call create() if isRuleBased() returns TRUE!
200 UBool
TransliteratorAlias::isRuleBased() const {
201 return type
== RULES
;
204 void TransliteratorAlias::parse(TransliteratorParser
& parser
,
205 UParseError
& pe
, UErrorCode
& ec
) const {
206 U_ASSERT(type
== RULES
);
211 parser
.parse(aliasesOrRules
, direction
, pe
, ec
);
214 //----------------------------------------------------------------------
215 // class TransliteratorSpec
216 //----------------------------------------------------------------------
219 * A TransliteratorSpec is a string specifying either a source or a target. In more
220 * general terms, it may also specify a variant, but we only use the
221 * Spec class for sources and targets.
223 * A Spec may be a locale or a script. If it is a locale, it has a
224 * fallback chain that goes xx_YY_ZZZ -> xx_YY -> xx -> ssss, where
225 * ssss is the script mapping of xx_YY_ZZZ. The Spec API methods
226 * hasFallback(), next(), and reset() iterate over this fallback
229 * The Spec class canonicalizes itself, so the locale is put into
230 * canonical form, or the script is transformed from an abbreviation
233 class TransliteratorSpec
: public UMemory
{
235 TransliteratorSpec(const UnicodeString
& spec
);
236 ~TransliteratorSpec();
238 const UnicodeString
& get() const;
239 UBool
hasFallback() const;
240 const UnicodeString
& next();
243 UBool
isLocale() const;
244 ResourceBundle
& getBundle() const;
246 operator const UnicodeString
&() const { return get(); }
247 const UnicodeString
& getTop() const { return top
; }
254 UnicodeString nextSpec
;
255 UnicodeString scriptName
;
256 UBool isSpecLocale
; // TRUE if spec is a locale
257 UBool isNextLocale
; // TRUE if nextSpec is a locale
260 TransliteratorSpec(const TransliteratorSpec
&other
); // forbid copying of this class
261 TransliteratorSpec
&operator=(const TransliteratorSpec
&other
); // forbid copying of this class
264 TransliteratorSpec::TransliteratorSpec(const UnicodeString
& theSpec
)
268 UErrorCode status
= U_ZERO_ERROR
;
270 LocaleUtility::initLocaleFromName(theSpec
, topLoc
);
271 if (!topLoc
.isBogus()) {
272 res
= new ResourceBundle(U_ICUDATA_TRANSLIT
, topLoc
, status
);
277 if (U_FAILURE(status
) || status
== U_USING_DEFAULT_WARNING
) {
283 // Canonicalize script name -or- do locale->script mapping
284 status
= U_ZERO_ERROR
;
285 static const int32_t capacity
= 10;
286 UScriptCode script
[capacity
]={USCRIPT_INVALID_CODE
};
287 int32_t num
= uscript_getCode(CharString().appendInvariantChars(theSpec
, status
).data(),
288 script
, capacity
, &status
);
289 if (num
> 0 && script
[0] != USCRIPT_INVALID_CODE
) {
290 scriptName
= UnicodeString(uscript_getName(script
[0]), -1, US_INV
);
295 // Canonicalize locale name
296 UnicodeString locStr
;
297 LocaleUtility::initNameFromLocale(topLoc
, locStr
);
298 if (!locStr
.isBogus()) {
301 } else if (scriptName
.length() != 0) {
302 // We are a script; use canonical name
306 // assert(spec != top);
310 TransliteratorSpec::~TransliteratorSpec() {
314 UBool
TransliteratorSpec::hasFallback() const {
315 return nextSpec
.length() != 0;
318 void TransliteratorSpec::reset() {
321 isSpecLocale
= (res
!= 0);
326 void TransliteratorSpec::setupNext() {
327 isNextLocale
= FALSE
;
330 int32_t i
= nextSpec
.lastIndexOf(LOCALE_SEP
);
331 // If i == 0 then we have _FOO, so we fall through
332 // to the scriptName.
334 nextSpec
.truncate(i
);
337 nextSpec
= scriptName
; // scriptName may be empty
340 // spec is a script, so we are at the end
341 nextSpec
.truncate(0);
346 // for(const UnicodeString& s(spec.get());
347 // spec.hasFallback(); s(spec.next())) { ...
349 const UnicodeString
& TransliteratorSpec::next() {
351 isSpecLocale
= isNextLocale
;
356 const UnicodeString
& TransliteratorSpec::get() const {
360 UBool
TransliteratorSpec::isLocale() const {
364 ResourceBundle
& TransliteratorSpec::getBundle() const {
368 //----------------------------------------------------------------------
372 // Vector of Entry pointers currently in use
373 static UVector
* DEBUG_entries
= NULL
;
375 static void DEBUG_setup() {
376 if (DEBUG_entries
== NULL
) {
377 UErrorCode ec
= U_ZERO_ERROR
;
378 DEBUG_entries
= new UVector(ec
);
382 // Caller must call DEBUG_setup first. Return index of given Entry,
383 // if it is in use (not deleted yet), or -1 if not found.
384 static int DEBUG_findEntry(TransliteratorEntry
* e
) {
385 for (int i
=0; i
<DEBUG_entries
->size(); ++i
) {
386 if (e
== (TransliteratorEntry
*) DEBUG_entries
->elementAt(i
)) {
393 // Track object creation
394 static void DEBUG_newEntry(TransliteratorEntry
* e
) {
396 if (DEBUG_findEntry(e
) >= 0) {
397 // This should really never happen unless the heap is broken
398 printf("ERROR DEBUG_newEntry duplicate new pointer %08X\n", e
);
401 UErrorCode ec
= U_ZERO_ERROR
;
402 DEBUG_entries
->addElement(e
, ec
);
405 // Track object deletion
406 static void DEBUG_delEntry(TransliteratorEntry
* e
) {
408 int i
= DEBUG_findEntry(e
);
410 printf("ERROR DEBUG_delEntry possible double deletion %08X\n", e
);
413 DEBUG_entries
->removeElementAt(i
);
416 // Track object usage
417 static void DEBUG_useEntry(TransliteratorEntry
* e
) {
418 if (e
== NULL
) return;
420 int i
= DEBUG_findEntry(e
);
422 printf("ERROR DEBUG_useEntry possible dangling pointer %08X\n", e
);
427 // If we're not debugging then make these macros into NOPs
428 #define DEBUG_newEntry(x)
429 #define DEBUG_delEntry(x)
430 #define DEBUG_useEntry(x)
433 //----------------------------------------------------------------------
435 //----------------------------------------------------------------------
438 * The Entry object stores objects of different types and
439 * singleton objects as placeholders for rule-based transliterators to
440 * be built as needed. Instances of this struct can be placeholders,
441 * can represent prototype transliterators to be cloned, or can
442 * represent TransliteratorData objects. We don't support storing
443 * classes in the registry because we don't have the rtti infrastructure
444 * for it. We could easily add this if there is a need for it in the
447 class TransliteratorEntry
: public UMemory
{
458 NONE
// Only used for uninitialized entries
460 // NOTE: stringArg cannot go inside the union because
461 // it has a copy constructor
462 UnicodeString stringArg
; // For RULES_*, ALIAS, COMPOUND_RBT
463 int32_t intArg
; // For COMPOUND_RBT, LOCALE_RULES
464 UnicodeSet
* compoundFilter
; // For COMPOUND_RBT
466 Transliterator
* prototype
; // For PROTOTYPE
467 TransliterationRuleData
* data
; // For RBT_DATA
468 UVector
* dataVector
; // For COMPOUND_RBT
470 Transliterator::Factory function
;
471 Transliterator::Token context
;
472 } factory
; // For FACTORY
474 TransliteratorEntry();
475 ~TransliteratorEntry();
476 void adoptPrototype(Transliterator
* adopted
);
477 void setFactory(Transliterator::Factory factory
,
478 Transliterator::Token context
);
482 TransliteratorEntry(const TransliteratorEntry
&other
); // forbid copying of this class
483 TransliteratorEntry
&operator=(const TransliteratorEntry
&other
); // forbid copying of this class
486 TransliteratorEntry::TransliteratorEntry() {
488 compoundFilter
= NULL
;
490 DEBUG_newEntry(this);
493 TransliteratorEntry::~TransliteratorEntry() {
494 DEBUG_delEntry(this);
495 if (entryType
== PROTOTYPE
) {
497 } else if (entryType
== RBT_DATA
) {
498 // The data object is shared between instances of RBT. The
499 // entry object owns it. It should only be deleted when the
500 // transliterator component is being cleaned up. Doing so
501 // invalidates any RBTs that the user has instantiated.
503 } else if (entryType
== COMPOUND_RBT
) {
504 while (u
.dataVector
!= NULL
&& !u
.dataVector
->isEmpty())
505 delete (TransliterationRuleData
*)u
.dataVector
->orphanElementAt(0);
508 delete compoundFilter
;
511 void TransliteratorEntry::adoptPrototype(Transliterator
* adopted
) {
512 if (entryType
== PROTOTYPE
) {
515 entryType
= PROTOTYPE
;
516 u
.prototype
= adopted
;
519 void TransliteratorEntry::setFactory(Transliterator::Factory factory
,
520 Transliterator::Token context
) {
521 if (entryType
== PROTOTYPE
) {
525 u
.factory
.function
= factory
;
526 u
.factory
.context
= context
;
529 // UObjectDeleter for Hashtable::setValueDeleter
531 static void U_CALLCONV
532 deleteEntry(void* obj
) {
533 delete (TransliteratorEntry
*) obj
;
537 //----------------------------------------------------------------------
538 // class TransliteratorRegistry: Basic public API
539 //----------------------------------------------------------------------
541 TransliteratorRegistry::TransliteratorRegistry(UErrorCode
& status
) :
542 registry(TRUE
, status
),
543 specDAG(TRUE
, SPECDAG_INIT_SIZE
, status
),
544 variantList(VARIANT_LIST_INIT_SIZE
, status
),
545 availableIDs(AVAILABLE_IDS_INIT_SIZE
, status
)
547 registry
.setValueDeleter(deleteEntry
);
548 variantList
.setDeleter(uprv_deleteUObject
);
549 variantList
.setComparer(uhash_compareCaselessUnicodeString
);
550 UnicodeString
*emptyString
= new UnicodeString();
551 if (emptyString
!= NULL
) {
552 variantList
.addElement(emptyString
, status
);
554 availableIDs
.setDeleter(uprv_deleteUObject
);
555 availableIDs
.setComparer(uhash_compareCaselessUnicodeString
);
556 specDAG
.setValueDeleter(uhash_deleteHashtable
);
559 TransliteratorRegistry::~TransliteratorRegistry() {
560 // Through the magic of C++, everything cleans itself up
563 Transliterator
* TransliteratorRegistry::get(const UnicodeString
& ID
,
564 TransliteratorAlias
*& aliasReturn
,
565 UErrorCode
& status
) {
566 U_ASSERT(aliasReturn
== NULL
);
567 TransliteratorEntry
*entry
= find(ID
);
568 return (entry
== 0) ? 0
569 : instantiateEntry(ID
, entry
, aliasReturn
, status
);
572 Transliterator
* TransliteratorRegistry::reget(const UnicodeString
& ID
,
573 TransliteratorParser
& parser
,
574 TransliteratorAlias
*& aliasReturn
,
575 UErrorCode
& status
) {
576 U_ASSERT(aliasReturn
== NULL
);
577 TransliteratorEntry
*entry
= find(ID
);
580 // We get to this point if there are two threads, one of which
581 // is instantiating an ID, and another of which is removing
582 // the same ID from the registry, and the timing is just right.
586 // The usage model for the caller is that they will first call
587 // reg->get() inside the mutex, they'll get back an alias, they call
588 // alias->isRuleBased(), and if they get TRUE, they call alias->parse()
589 // outside the mutex, then reg->reget() inside the mutex again. A real
590 // mess, but it gets things working for ICU 3.0. [alan].
592 // Note: It's possible that in between the caller calling
593 // alias->parse() and reg->reget(), that another thread will have
594 // called reg->reget(), and the entry will already have been fixed up.
595 // We have to detect this so we don't stomp over existing entry
596 // data members and potentially leak memory (u.data and compoundFilter).
598 if (entry
->entryType
== TransliteratorEntry::RULES_FORWARD
||
599 entry
->entryType
== TransliteratorEntry::RULES_REVERSE
||
600 entry
->entryType
== TransliteratorEntry::LOCALE_RULES
) {
602 if (parser
.idBlockVector
.isEmpty() && parser
.dataVector
.isEmpty()) {
604 entry
->entryType
= TransliteratorEntry::ALIAS
;
605 entry
->stringArg
= UNICODE_STRING_SIMPLE("Any-NULL");
607 else if (parser
.idBlockVector
.isEmpty() && parser
.dataVector
.size() == 1) {
608 entry
->u
.data
= (TransliterationRuleData
*)parser
.dataVector
.orphanElementAt(0);
609 entry
->entryType
= TransliteratorEntry::RBT_DATA
;
611 else if (parser
.idBlockVector
.size() == 1 && parser
.dataVector
.isEmpty()) {
612 entry
->stringArg
= *(UnicodeString
*)(parser
.idBlockVector
.elementAt(0));
613 entry
->compoundFilter
= parser
.orphanCompoundFilter();
614 entry
->entryType
= TransliteratorEntry::ALIAS
;
617 entry
->entryType
= TransliteratorEntry::COMPOUND_RBT
;
618 entry
->compoundFilter
= parser
.orphanCompoundFilter();
619 entry
->u
.dataVector
= new UVector(status
);
620 entry
->stringArg
.remove();
622 int32_t limit
= parser
.idBlockVector
.size();
623 if (parser
.dataVector
.size() > limit
)
624 limit
= parser
.dataVector
.size();
626 for (int32_t i
= 0; i
< limit
; i
++) {
627 if (i
< parser
.idBlockVector
.size()) {
628 UnicodeString
* idBlock
= (UnicodeString
*)parser
.idBlockVector
.elementAt(i
);
629 if (!idBlock
->isEmpty())
630 entry
->stringArg
+= *idBlock
;
632 if (!parser
.dataVector
.isEmpty()) {
633 TransliterationRuleData
* data
= (TransliterationRuleData
*)parser
.dataVector
.orphanElementAt(0);
634 entry
->u
.dataVector
->addElement(data
, status
);
635 entry
->stringArg
+= (UChar
)0xffff; // use U+FFFF to mark position of RBTs in ID block
642 instantiateEntry(ID
, entry
, aliasReturn
, status
);
646 void TransliteratorRegistry::put(Transliterator
* adoptedProto
,
650 TransliteratorEntry
*entry
= new TransliteratorEntry();
652 ec
= U_MEMORY_ALLOCATION_ERROR
;
655 entry
->adoptPrototype(adoptedProto
);
656 registerEntry(adoptedProto
->getID(), entry
, visible
);
659 void TransliteratorRegistry::put(const UnicodeString
& ID
,
660 Transliterator::Factory factory
,
661 Transliterator::Token context
,
664 TransliteratorEntry
*entry
= new TransliteratorEntry();
666 ec
= U_MEMORY_ALLOCATION_ERROR
;
669 entry
->setFactory(factory
, context
);
670 registerEntry(ID
, entry
, visible
);
673 void TransliteratorRegistry::put(const UnicodeString
& ID
,
674 const UnicodeString
& resourceName
,
676 UBool readonlyResourceAlias
,
679 TransliteratorEntry
*entry
= new TransliteratorEntry();
681 ec
= U_MEMORY_ALLOCATION_ERROR
;
684 entry
->entryType
= (dir
== UTRANS_FORWARD
) ? TransliteratorEntry::RULES_FORWARD
685 : TransliteratorEntry::RULES_REVERSE
;
686 if (readonlyResourceAlias
) {
687 entry
->stringArg
.setTo(TRUE
, resourceName
.getBuffer(), -1);
690 entry
->stringArg
= resourceName
;
692 registerEntry(ID
, entry
, visible
);
695 void TransliteratorRegistry::put(const UnicodeString
& ID
,
696 const UnicodeString
& alias
,
697 UBool readonlyAliasAlias
,
699 UErrorCode
& /*ec*/) {
700 TransliteratorEntry
*entry
= new TransliteratorEntry();
701 // Null pointer check
703 entry
->entryType
= TransliteratorEntry::ALIAS
;
704 if (readonlyAliasAlias
) {
705 entry
->stringArg
.setTo(TRUE
, alias
.getBuffer(), -1);
708 entry
->stringArg
= alias
;
710 registerEntry(ID
, entry
, visible
);
714 void TransliteratorRegistry::remove(const UnicodeString
& ID
) {
715 UnicodeString source
, target
, variant
;
717 TransliteratorIDParser::IDtoSTV(ID
, source
, target
, variant
, sawSource
);
718 // Only need to do this if ID.indexOf('-') < 0
720 TransliteratorIDParser::STVtoID(source
, target
, variant
, id
);
722 removeSTV(source
, target
, variant
);
723 availableIDs
.removeElement((void*) &id
);
726 //----------------------------------------------------------------------
727 // class TransliteratorRegistry: Public ID and spec management
728 //----------------------------------------------------------------------
731 * == OBSOLETE - remove in ICU 3.4 ==
732 * Return the number of IDs currently registered with the system.
733 * To retrieve the actual IDs, call getAvailableID(i) with
734 * i from 0 to countAvailableIDs() - 1.
736 int32_t TransliteratorRegistry::countAvailableIDs(void) const {
737 return availableIDs
.size();
741 * == OBSOLETE - remove in ICU 3.4 ==
742 * Return the index-th available ID. index must be between 0
743 * and countAvailableIDs() - 1, inclusive. If index is out of
744 * range, the result of getAvailableID(0) is returned.
746 const UnicodeString
& TransliteratorRegistry::getAvailableID(int32_t index
) const {
747 if (index
< 0 || index
>= availableIDs
.size()) {
750 return *(const UnicodeString
*) availableIDs
[index
];
753 StringEnumeration
* TransliteratorRegistry::getAvailableIDs() const {
754 return new Enumeration(*this);
757 int32_t TransliteratorRegistry::countAvailableSources(void) const {
758 return specDAG
.count();
761 UnicodeString
& TransliteratorRegistry::getAvailableSource(int32_t index
,
762 UnicodeString
& result
) const {
763 int32_t pos
= UHASH_FIRST
;
764 const UHashElement
*e
= 0;
765 while (index
-- >= 0) {
766 e
= specDAG
.nextElement(pos
);
774 result
= *(UnicodeString
*) e
->key
.pointer
;
779 int32_t TransliteratorRegistry::countAvailableTargets(const UnicodeString
& source
) const {
780 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
781 return (targets
== 0) ? 0 : targets
->count();
784 UnicodeString
& TransliteratorRegistry::getAvailableTarget(int32_t index
,
785 const UnicodeString
& source
,
786 UnicodeString
& result
) const {
787 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
789 result
.truncate(0); // invalid source
792 int32_t pos
= UHASH_FIRST
;
793 const UHashElement
*e
= 0;
794 while (index
-- >= 0) {
795 e
= targets
->nextElement(pos
);
801 result
.truncate(0); // invalid index
803 result
= *(UnicodeString
*) e
->key
.pointer
;
808 int32_t TransliteratorRegistry::countAvailableVariants(const UnicodeString
& source
,
809 const UnicodeString
& target
) const {
810 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
814 uint32_t varMask
= targets
->geti(target
);
815 int32_t varCount
= 0;
816 while (varMask
> 0) {
825 UnicodeString
& TransliteratorRegistry::getAvailableVariant(int32_t index
,
826 const UnicodeString
& source
,
827 const UnicodeString
& target
,
828 UnicodeString
& result
) const {
829 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
831 result
.truncate(0); // invalid source
834 uint32_t varMask
= targets
->geti(target
);
835 int32_t varCount
= 0;
836 int32_t varListIndex
= 0;
837 while (varMask
> 0) {
839 if (varCount
== index
) {
840 UnicodeString
*v
= (UnicodeString
*) variantList
.elementAt(varListIndex
);
852 result
.truncate(0); // invalid target or index
856 //----------------------------------------------------------------------
857 // class TransliteratorRegistry::Enumeration
858 //----------------------------------------------------------------------
860 TransliteratorRegistry::Enumeration::Enumeration(const TransliteratorRegistry
& _reg
) :
861 index(0), reg(_reg
) {
864 TransliteratorRegistry::Enumeration::~Enumeration() {
867 int32_t TransliteratorRegistry::Enumeration::count(UErrorCode
& /*status*/) const {
868 return reg
.availableIDs
.size();
871 const UnicodeString
* TransliteratorRegistry::Enumeration::snext(UErrorCode
& status
) {
872 // This is sloppy but safe -- if we get out of sync with the underlying
873 // registry, we will still return legal strings, but they might not
874 // correspond to the snapshot at construction time. So there could be
875 // duplicate IDs or omitted IDs if insertions or deletions occur in one
876 // thread while another is iterating. To be more rigorous, add a timestamp,
877 // which is incremented with any modification, and validate this iterator
878 // against the timestamp at construction time. This probably isn't worth
879 // doing as long as there is some possibility of removing this code in favor
880 // of some new code based on Doug's service framework.
881 if (U_FAILURE(status
)) {
884 int32_t n
= reg
.availableIDs
.size();
886 status
= U_ENUM_OUT_OF_SYNC_ERROR
;
888 // index == n is okay -- this means we've reached the end
890 // Copy the string! This avoids lifetime problems.
891 unistr
= *(const UnicodeString
*)reg
.availableIDs
[index
++];
898 void TransliteratorRegistry::Enumeration::reset(UErrorCode
& /*status*/) {
902 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TransliteratorRegistry::Enumeration
)
904 //----------------------------------------------------------------------
905 // class TransliteratorRegistry: internal
906 //----------------------------------------------------------------------
909 * Convenience method. Calls 6-arg registerEntry().
911 void TransliteratorRegistry::registerEntry(const UnicodeString
& source
,
912 const UnicodeString
& target
,
913 const UnicodeString
& variant
,
914 TransliteratorEntry
* adopted
,
917 UnicodeString
s(source
);
918 if (s
.length() == 0) {
919 s
.setTo(TRUE
, ANY
, 3);
921 TransliteratorIDParser::STVtoID(source
, target
, variant
, ID
);
922 registerEntry(ID
, s
, target
, variant
, adopted
, visible
);
926 * Convenience method. Calls 6-arg registerEntry().
928 void TransliteratorRegistry::registerEntry(const UnicodeString
& ID
,
929 TransliteratorEntry
* adopted
,
931 UnicodeString source
, target
, variant
;
933 TransliteratorIDParser::IDtoSTV(ID
, source
, target
, variant
, sawSource
);
934 // Only need to do this if ID.indexOf('-') < 0
936 TransliteratorIDParser::STVtoID(source
, target
, variant
, id
);
937 registerEntry(id
, source
, target
, variant
, adopted
, visible
);
941 * Register an entry object (adopted) with the given ID, source,
942 * target, and variant strings.
944 void TransliteratorRegistry::registerEntry(const UnicodeString
& ID
,
945 const UnicodeString
& source
,
946 const UnicodeString
& target
,
947 const UnicodeString
& variant
,
948 TransliteratorEntry
* adopted
,
950 UErrorCode status
= U_ZERO_ERROR
;
951 registry
.put(ID
, adopted
, status
);
953 registerSTV(source
, target
, variant
);
954 if (!availableIDs
.contains((void*) &ID
)) {
955 UnicodeString
*newID
= (UnicodeString
*)ID
.clone();
956 // Check to make sure newID was created.
958 // NUL-terminate the ID string
959 newID
->getTerminatedBuffer();
960 availableIDs
.addElement(newID
, status
);
964 removeSTV(source
, target
, variant
);
965 availableIDs
.removeElement((void*) &ID
);
970 * Register a source-target/variant in the specDAG. Variant may be
971 * empty, but source and target must not be.
973 void TransliteratorRegistry::registerSTV(const UnicodeString
& source
,
974 const UnicodeString
& target
,
975 const UnicodeString
& variant
) {
976 // assert(source.length() > 0);
977 // assert(target.length() > 0);
978 UErrorCode status
= U_ZERO_ERROR
;
979 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
982 if (source
.compare(ANY
,3) == 0) {
983 size
= ANY_TARGETS_INIT_SIZE
;
984 } else if (source
.compare(LAT
,3) == 0) {
985 size
= LAT_TARGETS_INIT_SIZE
;
987 targets
= new Hashtable(TRUE
, size
, status
);
988 if (U_FAILURE(status
) || targets
== NULL
) {
991 specDAG
.put(source
, targets
, status
);
993 int32_t variantListIndex
= variantList
.indexOf((void*) &variant
, 0);
994 if (variantListIndex
< 0) {
995 if (variantList
.size() >= VARIANT_LIST_MAX_SIZE
) {
996 // can't handle any more variants
999 UnicodeString
*variantEntry
= new UnicodeString(variant
);
1000 if (variantEntry
!= NULL
) {
1001 variantList
.addElement(variantEntry
, status
);
1002 if (U_SUCCESS(status
)) {
1003 variantListIndex
= variantList
.size() - 1;
1006 if (variantListIndex
< 0) {
1010 uint32_t addMask
= 1 << variantListIndex
;
1011 uint32_t varMask
= targets
->geti(target
);
1012 targets
->puti(target
, varMask
| addMask
, status
);
1016 * Remove a source-target/variant from the specDAG.
1018 void TransliteratorRegistry::removeSTV(const UnicodeString
& source
,
1019 const UnicodeString
& target
,
1020 const UnicodeString
& variant
) {
1021 // assert(source.length() > 0);
1022 // assert(target.length() > 0);
1023 UErrorCode status
= U_ZERO_ERROR
;
1024 Hashtable
*targets
= (Hashtable
*) specDAG
.get(source
);
1025 if (targets
== NULL
) {
1026 return; // should never happen for valid s-t/v
1028 uint32_t varMask
= targets
->geti(target
);
1030 return; // should never happen for valid s-t/v
1032 int32_t variantListIndex
= variantList
.indexOf((void*) &variant
, 0);
1033 if (variantListIndex
< 0) {
1034 return; // should never happen for valid s-t/v
1036 int32_t remMask
= 1 << variantListIndex
;
1037 varMask
&= (~remMask
);
1039 targets
->puti(target
, varMask
, status
);
1041 targets
->remove(target
); // should delete variants
1042 if (targets
->count() == 0) {
1043 specDAG
.remove(source
); // should delete targets
1049 * Attempt to find a source-target/variant in the dynamic registry
1050 * store. Return 0 on failure.
1052 * Caller does NOT own returned object.
1054 TransliteratorEntry
* TransliteratorRegistry::findInDynamicStore(const TransliteratorSpec
& src
,
1055 const TransliteratorSpec
& trg
,
1056 const UnicodeString
& variant
) const {
1058 TransliteratorIDParser::STVtoID(src
, trg
, variant
, ID
);
1059 TransliteratorEntry
*e
= (TransliteratorEntry
*) registry
.get(ID
);
1065 * Attempt to find a source-target/variant in the static locale
1066 * resource store. Do not perform fallback. Return 0 on failure.
1068 * On success, create a new entry object, register it in the dynamic
1069 * store, and return a pointer to it, but do not make it public --
1070 * just because someone requested something, we do not expand the
1071 * available ID list (or spec DAG).
1073 * Caller does NOT own returned object.
1075 TransliteratorEntry
* TransliteratorRegistry::findInStaticStore(const TransliteratorSpec
& src
,
1076 const TransliteratorSpec
& trg
,
1077 const UnicodeString
& variant
) {
1078 TransliteratorEntry
* entry
= 0;
1079 if (src
.isLocale()) {
1080 entry
= findInBundle(src
, trg
, variant
, UTRANS_FORWARD
);
1081 } else if (trg
.isLocale()) {
1082 entry
= findInBundle(trg
, src
, variant
, UTRANS_REVERSE
);
1085 // If we found an entry, store it in the Hashtable for next
1088 registerEntry(src
.getTop(), trg
.getTop(), variant
, entry
, FALSE
);
1094 // As of 2.0, resource bundle keys cannot contain '_'
1095 static const UChar TRANSLITERATE_TO
[] = {84,114,97,110,115,108,105,116,101,114,97,116,101,84,111,0}; // "TransliterateTo"
1097 static const UChar TRANSLITERATE_FROM
[] = {84,114,97,110,115,108,105,116,101,114,97,116,101,70,114,111,109,0}; // "TransliterateFrom"
1099 static const UChar TRANSLITERATE
[] = {84,114,97,110,115,108,105,116,101,114,97,116,101,0}; // "Transliterate"
1102 * Attempt to find an entry in a single resource bundle. This is
1103 * a one-sided lookup. findInStaticStore() performs up to two such
1104 * lookups, one for the source, and one for the target.
1106 * Do not perform fallback. Return 0 on failure.
1108 * On success, create a new Entry object, populate it, and return it.
1109 * The caller owns the returned object.
1111 TransliteratorEntry
* TransliteratorRegistry::findInBundle(const TransliteratorSpec
& specToOpen
,
1112 const TransliteratorSpec
& specToFind
,
1113 const UnicodeString
& variant
,
1114 UTransDirection direction
)
1117 UnicodeString resStr
;
1120 for (pass
=0; pass
<2; ++pass
) {
1122 // First try either TransliteratorTo_xxx or
1123 // TransliterateFrom_xxx, then try the bidirectional
1124 // Transliterate_xxx. This precedence order is arbitrary
1125 // but must be consistent and documented.
1127 utag
.append(direction
== UTRANS_FORWARD
?
1128 TRANSLITERATE_TO
: TRANSLITERATE_FROM
, -1);
1130 utag
.append(TRANSLITERATE
, -1);
1132 UnicodeString
s(specToFind
.get());
1133 utag
.append(s
.toUpper(""));
1134 UErrorCode status
= U_ZERO_ERROR
;
1135 ResourceBundle
subres(specToOpen
.getBundle().get(
1136 CharString().appendInvariantChars(utag
, status
).data(), status
));
1137 if (U_FAILURE(status
) || status
== U_USING_DEFAULT_WARNING
) {
1142 if (specToOpen
.get() != LocaleUtility::initNameFromLocale(subres
.getLocale(), s
)) {
1146 if (variant
.length() != 0) {
1147 status
= U_ZERO_ERROR
;
1148 resStr
= subres
.getStringEx(
1149 CharString().appendInvariantChars(variant
, status
).data(), status
);
1150 if (U_SUCCESS(status
)) {
1151 // Exit loop successfully
1155 // Variant is empty, which means match the first variant listed.
1156 status
= U_ZERO_ERROR
;
1157 resStr
= subres
.getStringEx(1, status
);
1158 if (U_SUCCESS(status
)) {
1159 // Exit loop successfully
1170 // We have succeeded in loading a string from the locale
1171 // resources. Create a new registry entry to hold it and return it.
1172 TransliteratorEntry
*entry
= new TransliteratorEntry();
1174 // The direction is always forward for the
1175 // TransliterateTo_xxx and TransliterateFrom_xxx
1176 // items; those are unidirectional forward rules.
1177 // For the bidirectional Transliterate_xxx items,
1178 // the direction is the value passed in to this
1180 int32_t dir
= (pass
== 0) ? UTRANS_FORWARD
: direction
;
1181 entry
->entryType
= TransliteratorEntry::LOCALE_RULES
;
1182 entry
->stringArg
= resStr
;
1183 entry
->intArg
= dir
;
1190 * Convenience method. Calls 3-arg find().
1192 TransliteratorEntry
* TransliteratorRegistry::find(const UnicodeString
& ID
) {
1193 UnicodeString source
, target
, variant
;
1195 TransliteratorIDParser::IDtoSTV(ID
, source
, target
, variant
, sawSource
);
1196 return find(source
, target
, variant
);
1200 * Top-level find method. Attempt to find a source-target/variant in
1201 * either the dynamic or the static (locale resource) store. Perform
1204 * Lookup sequence for ss_SS_SSS-tt_TT_TTT/v:
1206 * ss_SS_SSS-tt_TT_TTT/v -- in hashtable
1207 * ss_SS_SSS-tt_TT_TTT/v -- in ss_SS_SSS (no fallback)
1209 * repeat with t = tt_TT_TTT, tt_TT, tt, and tscript
1216 * Here * matches the first variant listed.
1218 * Caller does NOT own returned object. Return 0 on failure.
1220 TransliteratorEntry
* TransliteratorRegistry::find(UnicodeString
& source
,
1221 UnicodeString
& target
,
1222 UnicodeString
& variant
) {
1224 TransliteratorSpec
src(source
);
1225 TransliteratorSpec
trg(target
);
1226 TransliteratorEntry
* entry
;
1228 // Seek exact match in hashtable. Temporary fix for ICU 4.6.
1229 // TODO: The general logic for finding a matching transliterator needs to be reviewed.
1232 TransliteratorIDParser::STVtoID(source
, target
, variant
, ID
);
1233 entry
= (TransliteratorEntry
*) registry
.get(ID
);
1236 // std::cout << ID.toUTF8String(ss) << std::endl;
1240 if (variant
.length() != 0) {
1242 // Seek exact match in hashtable
1243 entry
= findInDynamicStore(src
, trg
, variant
);
1248 // Seek exact match in locale resources
1249 entry
= findInStaticStore(src
, trg
, variant
);
1258 // Seek match in hashtable
1259 entry
= findInDynamicStore(src
, trg
, NO_VARIANT
);
1264 // Seek match in locale resources
1265 entry
= findInStaticStore(src
, trg
, NO_VARIANT
);
1269 if (!src
.hasFallback()) {
1274 if (!trg
.hasFallback()) {
1284 * Given an Entry object, instantiate it. Caller owns result. Return
1287 * Return a non-empty aliasReturn value if the ID points to an alias.
1288 * We cannot instantiate it ourselves because the alias may contain
1289 * filters or compounds, which we do not understand. Caller should
1290 * make aliasReturn empty before calling.
1292 * The entry object is assumed to reside in the dynamic store. It may be
1295 Transliterator
* TransliteratorRegistry::instantiateEntry(const UnicodeString
& ID
,
1296 TransliteratorEntry
*entry
,
1297 TransliteratorAlias
* &aliasReturn
,
1298 UErrorCode
& status
) {
1299 Transliterator
*t
= 0;
1300 U_ASSERT(aliasReturn
== 0);
1302 switch (entry
->entryType
) {
1303 case TransliteratorEntry::RBT_DATA
:
1304 t
= new RuleBasedTransliterator(ID
, entry
->u
.data
);
1306 status
= U_MEMORY_ALLOCATION_ERROR
;
1309 case TransliteratorEntry::PROTOTYPE
:
1310 t
= entry
->u
.prototype
->clone();
1312 status
= U_MEMORY_ALLOCATION_ERROR
;
1315 case TransliteratorEntry::ALIAS
:
1316 aliasReturn
= new TransliteratorAlias(entry
->stringArg
, entry
->compoundFilter
);
1317 if (aliasReturn
== 0) {
1318 status
= U_MEMORY_ALLOCATION_ERROR
;
1321 case TransliteratorEntry::FACTORY
:
1322 t
= entry
->u
.factory
.function(ID
, entry
->u
.factory
.context
);
1324 status
= U_MEMORY_ALLOCATION_ERROR
;
1327 case TransliteratorEntry::COMPOUND_RBT
:
1329 UVector
* rbts
= new UVector(entry
->u
.dataVector
->size(), status
);
1330 // Check for null pointer
1332 status
= U_MEMORY_ALLOCATION_ERROR
;
1335 int32_t passNumber
= 1;
1336 for (int32_t i
= 0; U_SUCCESS(status
) && i
< entry
->u
.dataVector
->size(); i
++) {
1337 // TODO: Should passNumber be turned into a decimal-string representation (1 -> "1")?
1338 Transliterator
* tl
= new RuleBasedTransliterator(UnicodeString(CompoundTransliterator::PASS_STRING
) + UnicodeString(passNumber
++),
1339 (TransliterationRuleData
*)(entry
->u
.dataVector
->elementAt(i
)), FALSE
);
1341 status
= U_MEMORY_ALLOCATION_ERROR
;
1343 rbts
->addElement(tl
, status
);
1345 if (U_FAILURE(status
)) {
1349 aliasReturn
= new TransliteratorAlias(ID
, entry
->stringArg
, rbts
, entry
->compoundFilter
);
1351 if (aliasReturn
== 0) {
1352 status
= U_MEMORY_ALLOCATION_ERROR
;
1355 case TransliteratorEntry::LOCALE_RULES
:
1356 aliasReturn
= new TransliteratorAlias(ID
, entry
->stringArg
,
1357 (UTransDirection
) entry
->intArg
);
1358 if (aliasReturn
== 0) {
1359 status
= U_MEMORY_ALLOCATION_ERROR
;
1362 case TransliteratorEntry::RULES_FORWARD
:
1363 case TransliteratorEntry::RULES_REVERSE
:
1364 // Process the rule data into a TransliteratorRuleData object,
1365 // and possibly also into an ::id header and/or footer. Then
1366 // we modify the registry with the parsed data and retry.
1368 TransliteratorParser
parser(status
);
1370 // We use the file name, taken from another resource bundle
1371 // 2-d array at static init time, as a locale language. We're
1372 // just using the locale mechanism to map through to a file
1373 // name; this in no way represents an actual locale.
1374 //CharString ch(entry->stringArg);
1375 //UResourceBundle *bundle = ures_openDirect(0, ch, &status);
1376 UnicodeString rules
= entry
->stringArg
;
1377 //ures_close(bundle);
1379 //if (U_FAILURE(status)) {
1380 // We have a failure of some kind. Remove the ID from the
1381 // registry so we don't keep trying. NOTE: This will throw off
1382 // anyone who is, at the moment, trying to iterate over the
1383 // available IDs. That's acceptable since we should never
1384 // really get here except under installation, configuration,
1385 // or unrecoverable run time memory failures.
1389 // If the status indicates a failure, then we don't have any
1390 // rules -- there is probably an installation error. The list
1391 // in the root locale should correspond to all the installed
1392 // transliterators; if it lists something that's not
1393 // installed, we'll get an error from ResourceBundle.
1394 aliasReturn
= new TransliteratorAlias(ID
, rules
,
1395 ((entry
->entryType
== TransliteratorEntry::RULES_REVERSE
) ?
1396 UTRANS_REVERSE
: UTRANS_FORWARD
));
1397 if (aliasReturn
== 0) {
1398 status
= U_MEMORY_ALLOCATION_ERROR
;
1404 UPRV_UNREACHABLE
; // can't get here
1409 #endif /* #if !UCONFIG_NO_TRANSLITERATION */