X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..3d1f044b704633e2e541231cd17ae9ecf9ad5c7a:/icuSources/i18n/translit.cpp diff --git a/icuSources/i18n/translit.cpp b/icuSources/i18n/translit.cpp index f5e8a56e..aaaee8c9 100644 --- a/icuSources/i18n/translit.cpp +++ b/icuSources/i18n/translit.cpp @@ -1,12 +1,16 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* -********************************************************************** -* Copyright (C) 1999-2003, International Business Machines -* Corporation and others. All Rights Reserved. -********************************************************************** -* Date Name Description -* 11/17/99 aliu Creation. -********************************************************************** -*/ + ********************************************************************** + * Copyright (C) 1999-2016, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + * Date Name Description + * 11/17/99 aliu Creation. + ********************************************************************** + */ + +#include "utypeinfo.h" // for 'typeid' to work #include "unicode/utypes.h" @@ -19,9 +23,10 @@ #include "unicode/rep.h" #include "unicode/resbund.h" #include "unicode/unifilt.h" -#include "unicode/unifltlg.h" #include "unicode/uniset.h" #include "unicode/uscript.h" +#include "unicode/strenum.h" +#include "unicode/utf16.h" #include "cpdtrans.h" #include "nultrans.h" #include "rbt_data.h" @@ -35,6 +40,7 @@ #include "tolowtrn.h" #include "toupptrn.h" #include "uni2name.h" +#include "brktrans.h" #include "esctrn.h" #include "unesctrn.h" #include "tridpars.h" @@ -46,6 +52,7 @@ #include "uassert.h" #include "cmemory.h" #include "cstring.h" +#include "uinvchar.h" static const UChar TARGET_SEP = 0x002D; /*-*/ static const UChar ID_DELIM = 0x003B; /*;*/ @@ -84,33 +91,29 @@ static const char RB_RULE_BASED_IDS[] = "RuleBasedTransliteratorIDs"; /** * The mutex controlling access to registry object. */ -static UMTX registryMutex = 0; +static icu::UMutex *registryMutex() { + static icu::UMutex *m = STATIC_NEW(icu::UMutex); + return m; +} /** * System transliterator registry; non-null when initialized. */ -static TransliteratorRegistry* registry = 0; +static icu::TransliteratorRegistry* registry = 0; // Macro to check/initialize the registry. ONLY USE WITHIN // MUTEX. Avoids function call when registry is initialized. -#define HAVE_REGISTRY (registry!=0 || initializeRegistry()) - -// Empty string -static const UChar EMPTY[] = {0}; //"" +#define HAVE_REGISTRY(status) (registry!=0 || initializeRegistry(status)) U_NAMESPACE_BEGIN -/** - * Class identifier for subclasses of Transliterator that do not - * define their class (anonymous subclasses). - */ -const char Transliterator::fgClassID = 0; // Value is irrelevant +UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(Transliterator) /** * Return TRUE if the given UTransPosition is valid for text of * the given length. */ -inline UBool positionIsValid(UTransPosition& index, int32_t len) { +static inline UBool positionIsValid(UTransPosition& index, int32_t len) { return !(index.contextStart < 0 || index.start < index.contextStart || index.limit < index.start || @@ -129,13 +132,20 @@ inline UBool positionIsValid(UTransPosition& index, int32_t len) { Transliterator::Transliterator(const UnicodeString& theID, UnicodeFilter* adoptedFilter) : UObject(), ID(theID), filter(adoptedFilter), - maximumContextLength(0) {} + maximumContextLength(0) +{ + // NUL-terminate the ID string, which is a non-aliased copy. + ID.append((UChar)0); + ID.truncate(ID.length()-1); +} /** * Destructor. */ Transliterator::~Transliterator() { - delete filter; + if (filter) { + delete filter; + } } /** @@ -143,18 +153,30 @@ Transliterator::~Transliterator() { */ Transliterator::Transliterator(const Transliterator& other) : UObject(other), ID(other.ID), filter(0), - maximumContextLength(other.maximumContextLength) { + maximumContextLength(other.maximumContextLength) +{ + // NUL-terminate the ID string, which is a non-aliased copy. + ID.append((UChar)0); + ID.truncate(ID.length()-1); + if (other.filter != 0) { // We own the filter, so we must have our own copy filter = (UnicodeFilter*) other.filter->clone(); } } +Transliterator* Transliterator::clone() const { + return NULL; +} + /** * Assignment operator. */ Transliterator& Transliterator::operator=(const Transliterator& other) { ID = other.ID; + // NUL-terminate the ID string + ID.getTerminatedBuffer(); + maximumContextLength = other.maximumContextLength; adoptFilter((other.filter == 0) ? 0 : (UnicodeFilter*) other.filter->clone()); return *this; @@ -352,7 +374,7 @@ void Transliterator::_transliterate(Replaceable& text, } if (index.limit > 0 && - UTF_IS_LEAD(text.charAt(index.limit - 1))) { + U16_IS_LEAD(text.charAt(index.limit - 1))) { // Oops, there is a dangling lead surrogate in the buffer. // This will break most transliterators, since they will // assume it is part of a pair. Don't transliterate until @@ -391,7 +413,7 @@ void Transliterator::_transliterate(Replaceable& text, int32_t n = getMaximumContextLength(); while (newCS > originalStart && n-- > 0) { --newCS; - newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1; + newCS -= U16_LENGTH(text.char32At(newCS)) - 1; } index.contextStart = uprv_max(newCS, originalStart); #endif @@ -462,14 +484,14 @@ void Transliterator::filteredTransliterate(Replaceable& text, UChar32 c; while (index.start < globalLimit && !filter->contains(c=text.char32At(index.start))) { - index.start += UTF_CHAR_LENGTH(c); + index.start += U16_LENGTH(c); } // Find the end of this run of unfiltered chars index.limit = index.start; while (index.limit < globalLimit && filter->contains(c=text.char32At(index.limit))) { - index.limit += UTF_CHAR_LENGTH(c); + index.limit += U16_LENGTH(c); } } @@ -552,8 +574,7 @@ void Transliterator::filteredTransliterate(Replaceable& text, // transliterations and commit complete transliterations. for (;;) { // Length of additional code point, either one or two - int32_t charLength = - UTF_CHAR_LENGTH(text.char32At(passLimit)); + int32_t charLength = U16_LENGTH(text.char32At(passLimit)); passLimit += charLength; if (passLimit > runLimit) { break; @@ -579,7 +600,7 @@ void Transliterator::filteredTransliterate(Replaceable& text, int32_t rs = rollbackStart + delta - (index.limit - passStart); // Delete the partially transliterated text - text.handleReplaceBetween(passStart, index.limit, EMPTY); + text.handleReplaceBetween(passStart, index.limit, UnicodeString()); // Copy the rollback text back text.copy(rs, rs + uncommittedLength, passStart); @@ -617,7 +638,7 @@ void Transliterator::filteredTransliterate(Replaceable& text, globalLimit += totalDelta; // Delete the rollback copy - text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, EMPTY); + text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, UnicodeString()); // Move start past committed text index.start = passStart; @@ -690,7 +711,7 @@ const UnicodeString& Transliterator::getID(void) const { * display to the user in the default locale. See {@link * #getDisplayName(Locale)} for details. */ -UnicodeString& Transliterator::getDisplayName(const UnicodeString& ID, +UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& ID, UnicodeString& result) { return getDisplayName(ID, Locale::getDefault(), result); } @@ -713,12 +734,12 @@ UnicodeString& Transliterator::getDisplayName(const UnicodeString& ID, * localized. * @see java.text.MessageFormat */ -UnicodeString& Transliterator::getDisplayName(const UnicodeString& id, +UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& id, const Locale& inLocale, UnicodeString& result) { UErrorCode status = U_ZERO_ERROR; - ResourceBundle bundle(u_getDataDirectory(), inLocale, status); + ResourceBundle bundle(U_ICUDATA_TRANSLIT, inLocale, status); // Suspend checking status until later... @@ -739,65 +760,69 @@ UnicodeString& Transliterator::getDisplayName(const UnicodeString& id, ID.append(TARGET_SEP).append(target).append(variant); // build the char* key - char key[200]; - uprv_strcpy(key, RB_DISPLAY_NAME_PREFIX); - int32_t length=(int32_t)uprv_strlen(RB_DISPLAY_NAME_PREFIX); - ID.extract(0, (int32_t)(sizeof(key)-length), key+length, ""); + if (uprv_isInvariantUString(ID.getBuffer(), ID.length())) { + char key[200]; + uprv_strcpy(key, RB_DISPLAY_NAME_PREFIX); + int32_t length=(int32_t)uprv_strlen(RB_DISPLAY_NAME_PREFIX); + ID.extract(0, (int32_t)(sizeof(key)-length), key+length, (int32_t)(sizeof(key)-length), US_INV); - // Try to retrieve a UnicodeString from the bundle. - UnicodeString resString = bundle.getStringEx(key, status); + // Try to retrieve a UnicodeString from the bundle. + UnicodeString resString = bundle.getStringEx(key, status); - if (U_SUCCESS(status) && resString.length() != 0) { - return result = resString; // [sic] assign & return - } + if (U_SUCCESS(status) && resString.length() != 0) { + return result = resString; // [sic] assign & return + } #if !UCONFIG_NO_FORMATTING - // We have failed to get a name from the locale data. This is - // typical, since most transliterators will not have localized - // name data. The next step is to retrieve the MessageFormat - // pattern from the locale data and to use it to synthesize the - // name from the ID. - - status = U_ZERO_ERROR; - resString = bundle.getStringEx(RB_DISPLAY_NAME_PATTERN, status); - - if (U_SUCCESS(status) && resString.length() != 0) { - MessageFormat msg(resString, inLocale, status); - // Suspend checking status until later... - - // We pass either 2 or 3 Formattable objects to msg. - Formattable args[3]; - int32_t nargs; - args[0].setLong(2); // # of args to follow - args[1].setString(source); - args[2].setString(target); - nargs = 3; - - // Use display names for the scripts, if they exist - UnicodeString s; - length=(int32_t)uprv_strlen(RB_SCRIPT_DISPLAY_NAME_PREFIX); - for (int j=1; j<=2; ++j) { - status = U_ZERO_ERROR; - uprv_strcpy(key, RB_SCRIPT_DISPLAY_NAME_PREFIX); - args[j].getString(s); - s.extract(0, sizeof(key)-length-1, key+length, ""); + // We have failed to get a name from the locale data. This is + // typical, since most transliterators will not have localized + // name data. The next step is to retrieve the MessageFormat + // pattern from the locale data and to use it to synthesize the + // name from the ID. - resString = bundle.getStringEx(key, status); + status = U_ZERO_ERROR; + resString = bundle.getStringEx(RB_DISPLAY_NAME_PATTERN, status); + + if (U_SUCCESS(status) && resString.length() != 0) { + MessageFormat msg(resString, inLocale, status); + // Suspend checking status until later... + + // We pass either 2 or 3 Formattable objects to msg. + Formattable args[3]; + int32_t nargs; + args[0].setLong(2); // # of args to follow + args[1].setString(source); + args[2].setString(target); + nargs = 3; + + // Use display names for the scripts, if they exist + UnicodeString s; + length=(int32_t)uprv_strlen(RB_SCRIPT_DISPLAY_NAME_PREFIX); + for (int j=1; j<=2; ++j) { + status = U_ZERO_ERROR; + uprv_strcpy(key, RB_SCRIPT_DISPLAY_NAME_PREFIX); + args[j].getString(s); + if (uprv_isInvariantUString(s.getBuffer(), s.length())) { + s.extract(0, sizeof(key)-length-1, key+length, (int32_t)sizeof(key)-length-1, US_INV); + + resString = bundle.getStringEx(key, status); + + if (U_SUCCESS(status)) { + args[j] = resString; + } + } + } + status = U_ZERO_ERROR; + FieldPosition pos; // ignored by msg + msg.format(args, nargs, result, pos, status); if (U_SUCCESS(status)) { - args[j] = resString; + result.append(variant); + return result; } } - - status = U_ZERO_ERROR; - FieldPosition pos; // ignored by msg - msg.format(args, nargs, result, pos, status); - if (U_SUCCESS(status)) { - result.append(variant); - return result; - } - } #endif + } // We should not reach this point unless there is something // wrong with the build or the RB_DISPLAY_NAME_PATTERN has @@ -865,9 +890,11 @@ Transliterator* Transliterator::createInverse(UErrorCode& status) const { return Transliterator::createInstance(ID, UTRANS_REVERSE,parseError,status); } -Transliterator* Transliterator::createInstance(const UnicodeString& ID, - UTransDirection dir, - UErrorCode& status) { +Transliterator* U_EXPORT2 +Transliterator::createInstance(const UnicodeString& ID, + UTransDirection dir, + UErrorCode& status) +{ UParseError parseError; return createInstance(ID, dir, parseError, status); } @@ -883,10 +910,12 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID, * @see #getAvailableIDs * @see #getID */ -Transliterator* Transliterator::createInstance(const UnicodeString& ID, - UTransDirection dir, - UParseError& parseError, - UErrorCode& status) { +Transliterator* U_EXPORT2 +Transliterator::createInstance(const UnicodeString& ID, + UTransDirection dir, + UParseError& parseError, + UErrorCode& status) +{ if (U_FAILURE(status)) { return 0; } @@ -905,33 +934,34 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID, return NULL; } - TransliteratorIDParser::instantiateList(list, NULL, -1, status); + TransliteratorIDParser::instantiateList(list, status); if (U_FAILURE(status)) { return NULL; } U_ASSERT(list.size() > 0); Transliterator* t = NULL; - switch (list.size()) { - case 1: - t = (Transliterator*) list.elementAt(0); - break; - default: + + if (list.size() > 1 || canonID.indexOf(ID_DELIM) >= 0) { + // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only + // has one child transliterator. This is so that toRules() will return the right thing + // (without any inactive ID), but our main ID still comes out correct. That is, if we + // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;" + // even though the ID is "(Lower);Latin-Greek;". t = new CompoundTransliterator(list, parseError, status); - /* test for NULL */ - if (t == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - if (U_FAILURE(status)) { - delete t; - return NULL; + } + else { + t = (Transliterator*)list.elementAt(0); + } + // Check null pointer + if (t != NULL) { + t->setID(canonID); + if (globalFilter != NULL) { + t->adoptFilter(globalFilter); } - break; } - t->setID(canonID); - if (globalFilter != NULL) { - t->adoptFilter(globalFilter); + else if (U_SUCCESS(status)) { + status = U_MEMORY_ALLOCATION_ERROR; } return t; } @@ -950,28 +980,57 @@ Transliterator* Transliterator::createBasicInstance(const UnicodeString& id, UErrorCode ec = U_ZERO_ERROR; TransliteratorAlias* alias = 0; Transliterator* t = 0; - - umtx_init(®istryMutex); - umtx_lock(®istryMutex); - if (HAVE_REGISTRY) { - t = registry->get(id, alias, pe, ec); + + umtx_lock(registryMutex()); + if (HAVE_REGISTRY(ec)) { + t = registry->get(id, alias, ec); } - umtx_unlock(®istryMutex); + umtx_unlock(registryMutex()); if (U_FAILURE(ec)) { delete t; delete alias; - return NULL; + return 0; } - if (alias != 0) { - // Instantiate an alias + // We may have not gotten a transliterator: Because we can't + // instantiate a transliterator from inside TransliteratorRegistry:: + // get() (that would deadlock), we sometimes pass back an alias. This + // contains the data we need to finish the instantiation outside the + // registry mutex. The alias may, in turn, generate another alias, so + // we handle aliases in a loop. The max times through the loop is two. + // [alan] + while (alias != 0) { U_ASSERT(t==0); - t = alias->create(pe, ec); - delete alias; + // Rule-based aliases are handled with TransliteratorAlias:: + // parse(), followed by TransliteratorRegistry::reget(). + // Other aliases are handled with TransliteratorAlias::create(). + if (alias->isRuleBased()) { + // Step 1. parse + TransliteratorParser parser(ec); + alias->parse(parser, pe, ec); + delete alias; + alias = 0; + + // Step 2. reget + umtx_lock(registryMutex()); + if (HAVE_REGISTRY(ec)) { + t = registry->reget(id, parser, alias, ec); + } + umtx_unlock(registryMutex()); + + // Step 3. Loop back around! + } else { + t = alias->create(pe, ec); + delete alias; + alias = 0; + break; + } if (U_FAILURE(ec)) { delete t; + delete alias; t = NULL; + break; } } @@ -990,14 +1049,16 @@ Transliterator* Transliterator::createBasicInstance(const UnicodeString& id, * NullTransliterator, if it contains ID blocks which parse as * empty for the given direction. */ -Transliterator* Transliterator::createFromRules(const UnicodeString& ID, - const UnicodeString& rules, - UTransDirection dir, - UParseError& parseError, - UErrorCode& status) { +Transliterator* U_EXPORT2 +Transliterator::createFromRules(const UnicodeString& ID, + const UnicodeString& rules, + UTransDirection dir, + UParseError& parseError, + UErrorCode& status) +{ Transliterator* t = NULL; - TransliteratorParser parser; + TransliteratorParser parser(status); parser.parse(rules, dir, parseError, status); if (U_FAILURE(status)) { @@ -1005,59 +1066,74 @@ Transliterator* Transliterator::createFromRules(const UnicodeString& ID, } // NOTE: The logic here matches that in TransliteratorRegistry. - if (parser.idBlock.length() == 0) { - if (parser.data == NULL) { - // No idBlock, no data -- this is just an - // alias for Null - t = new NullTransliterator(); - } else { - // No idBlock, data != 0 -- this is an - // ordinary RBT_DATA. - t = new RuleBasedTransliterator(ID, parser.orphanData(), TRUE); // TRUE == adopt data object + if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) { + t = new NullTransliterator(); + } + else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) { + t = new RuleBasedTransliterator(ID, (TransliterationRuleData*)parser.dataVector.orphanElementAt(0), TRUE); + } + else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) { + // idBlock, no data -- this is an alias. The ID has + // been munged from reverse into forward mode, if + // necessary, so instantiate the ID in the forward + // direction. + if (parser.compoundFilter != NULL) { + UnicodeString filterPattern; + parser.compoundFilter->toPattern(filterPattern, FALSE); + t = createInstance(filterPattern + UnicodeString(ID_DELIM) + + *((UnicodeString*)parser.idBlockVector.elementAt(0)), UTRANS_FORWARD, parseError, status); } - /* test for NULL */ - if (t == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; + else + t = createInstance(*((UnicodeString*)parser.idBlockVector.elementAt(0)), UTRANS_FORWARD, parseError, status); + + + if (t != NULL) { + t->setID(ID); } - } else { - if (parser.data == NULL) { - // idBlock, no data -- this is an alias. The ID has - // been munged from reverse into forward mode, if - // necessary, so instantiate the ID in the forward - // direction. - t = createInstance(parser.idBlock, UTRANS_FORWARD, parseError, status); - if (t != NULL) { - t->setID(ID); - } - } else { - // idBlock and data -- this is a compound - // RBT - UnicodeString id("_", ""); - t = new RuleBasedTransliterator(id, parser.orphanData(), TRUE); // TRUE == adopt data object - /* test for NULL */ - if (t == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - t = new CompoundTransliterator(ID, parser.idBlock, parser.idSplitPoint, - t, status); - /* test for NULL */ - if (t == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - if (U_FAILURE(status)) { - delete t; - t = 0; + } + else { + UVector transliterators(status); + int32_t passNumber = 1; + + int32_t limit = parser.idBlockVector.size(); + if (parser.dataVector.size() > limit) + limit = parser.dataVector.size(); + + for (int32_t i = 0; i < limit; i++) { + if (i < parser.idBlockVector.size()) { + UnicodeString* idBlock = (UnicodeString*)parser.idBlockVector.elementAt(i); + if (!idBlock->isEmpty()) { + Transliterator* temp = createInstance(*idBlock, UTRANS_FORWARD, parseError, status); + if (temp != NULL && typeid(*temp) != typeid(NullTransliterator)) + transliterators.addElement(temp, status); + else + delete temp; + } } - if (parser.compoundFilter != NULL) { - t->adoptFilter(parser.orphanCompoundFilter()); + if (!parser.dataVector.isEmpty()) { + TransliterationRuleData* data = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); + // TODO: Should passNumber be turned into a decimal-string representation (1 -> "1")? + RuleBasedTransliterator* temprbt = new RuleBasedTransliterator(UnicodeString(CompoundTransliterator::PASS_STRING) + UnicodeString(passNumber++), + data, TRUE); + // Check if NULL before adding it to transliterators to avoid future usage of NULL pointer. + if (temprbt == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return t; + } + transliterators.addElement(temprbt, status); } - return t; } - } + t = new CompoundTransliterator(transliterators, passNumber - 1, parseError, status); + // Null pointer check + if (t != NULL) { + t->setID(ID); + t->adoptFilter(parser.orphanCompoundFilter()); + } + } + if (U_SUCCESS(status) && t == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } return t; } @@ -1073,7 +1149,7 @@ UnicodeString& Transliterator::toRules(UnicodeString& rulesSource, if (!ICU_Utility::escapeUnprintable(rulesSource, c)) { rulesSource.append(c); } - i += UTF_CHAR_LENGTH(c); + i += U16_LENGTH(c); } } else { rulesSource = getID(); @@ -1084,24 +1160,45 @@ UnicodeString& Transliterator::toRules(UnicodeString& rulesSource, return rulesSource; } +int32_t Transliterator::countElements() const { + const CompoundTransliterator* ct = dynamic_cast(this); + return ct != NULL ? ct->getCount() : 0; +} + +const Transliterator& Transliterator::getElement(int32_t index, UErrorCode& ec) const { + if (U_FAILURE(ec)) { + return *this; + } + const CompoundTransliterator* cpd = dynamic_cast(this); + int32_t n = (cpd == NULL) ? 1 : cpd->getCount(); + if (index < 0 || index >= n) { + ec = U_INDEX_OUTOFBOUNDS_ERROR; + return *this; + } else { + return (n == 1) ? *this : cpd->getTransliterator(index); + } +} + UnicodeSet& Transliterator::getSourceSet(UnicodeSet& result) const { handleGetSourceSet(result); if (filter != NULL) { - UnicodeSet* filterSet; - UBool deleteFilterSet = FALSE; - // Most, but not all filters will be UnicodeSets. Optimize for - // the high-runner case. - if (filter->getDynamicClassID() == UnicodeSet::getStaticClassID()) { - filterSet = (UnicodeSet*) filter; - } else { - filterSet = new UnicodeSet(); - deleteFilterSet = TRUE; - filter->addMatchSetTo(*filterSet); - } - result.retainAll(*filterSet); - if (deleteFilterSet) { - delete filterSet; - } + UnicodeSet* filterSet = dynamic_cast(filter); + UBool deleteFilterSet = FALSE; + // Most, but not all filters will be UnicodeSets. Optimize for + // the high-runner case. + if (filterSet == NULL) { + filterSet = new UnicodeSet(); + // Check null pointer + if (filterSet == NULL) { + return result; + } + deleteFilterSet = TRUE; + filter->addMatchSetTo(*filterSet); + } + result.retainAll(*filterSet); + if (deleteFilterSet) { + delete filterSet; + } } return result; } @@ -1115,12 +1212,12 @@ UnicodeSet& Transliterator::getTargetSet(UnicodeSet& result) const { } // For public consumption -void Transliterator::registerFactory(const UnicodeString& id, +void U_EXPORT2 Transliterator::registerFactory(const UnicodeString& id, Transliterator::Factory factory, Transliterator::Token context) { - umtx_init(®istryMutex); - Mutex lock(®istryMutex); - if (HAVE_REGISTRY) { + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + if (HAVE_REGISTRY(ec)) { _registerFactory(id, factory, context); } } @@ -1130,7 +1227,8 @@ void Transliterator::registerFactory(const UnicodeString& id, void Transliterator::_registerFactory(const UnicodeString& id, Transliterator::Factory factory, Transliterator::Token context) { - registry->put(id, factory, context, TRUE); + UErrorCode ec = U_ZERO_ERROR; + registry->put(id, factory, context, TRUE, ec); } // To be called only by Transliterator subclasses that are called @@ -1138,7 +1236,8 @@ void Transliterator::_registerFactory(const UnicodeString& id, void Transliterator::_registerSpecialInverse(const UnicodeString& target, const UnicodeString& inverseTarget, UBool bidirectional) { - TransliteratorIDParser::registerSpecialInverse(target, inverseTarget, bidirectional); + UErrorCode status = U_ZERO_ERROR; + TransliteratorIDParser::registerSpecialInverse(target, inverseTarget, bidirectional, status); } /** @@ -1154,109 +1253,145 @@ void Transliterator::_registerSpecialInverse(const UnicodeString& target, * @see #getInstance * @see #unregister */ -void Transliterator::registerInstance(Transliterator* adoptedPrototype) { - umtx_init(®istryMutex); - Mutex lock(®istryMutex); - if (HAVE_REGISTRY) { +void U_EXPORT2 Transliterator::registerInstance(Transliterator* adoptedPrototype) { + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + if (HAVE_REGISTRY(ec)) { _registerInstance(adoptedPrototype); } } void Transliterator::_registerInstance(Transliterator* adoptedPrototype) { - registry->put(adoptedPrototype, TRUE); + UErrorCode ec = U_ZERO_ERROR; + registry->put(adoptedPrototype, TRUE, ec); +} + +void U_EXPORT2 Transliterator::registerAlias(const UnicodeString& aliasID, + const UnicodeString& realID) { + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + if (HAVE_REGISTRY(ec)) { + _registerAlias(aliasID, realID); + } +} + +void Transliterator::_registerAlias(const UnicodeString& aliasID, + const UnicodeString& realID) { + UErrorCode ec = U_ZERO_ERROR; + registry->put(aliasID, realID, FALSE, TRUE, ec); } /** * Unregisters a transliterator or class. This may be either * a system transliterator or a user transliterator or class. - * + * * @param ID the ID of the transliterator or class * @see #registerInstance */ -void Transliterator::unregister(const UnicodeString& ID) { - umtx_init(®istryMutex); - Mutex lock(®istryMutex); - if (HAVE_REGISTRY) { +void U_EXPORT2 Transliterator::unregister(const UnicodeString& ID) { + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + if (HAVE_REGISTRY(ec)) { registry->remove(ID); } } /** + * == OBSOLETE - remove in ICU 3.4 == * Return the number of IDs currently registered with the system. * To retrieve the actual IDs, call getAvailableID(i) with * i from 0 to countAvailableIDs() - 1. */ -int32_t Transliterator::countAvailableIDs(void) { - umtx_init(®istryMutex); - Mutex lock(®istryMutex); - return HAVE_REGISTRY ? registry->countAvailableIDs() : 0; +int32_t U_EXPORT2 Transliterator::countAvailableIDs(void) { + int32_t retVal = 0; + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + if (HAVE_REGISTRY(ec)) { + retVal = registry->countAvailableIDs(); + } + return retVal; } /** + * == OBSOLETE - remove in ICU 3.4 == * Return the index-th available ID. index must be between 0 * and countAvailableIDs() - 1, inclusive. If index is out of * range, the result of getAvailableID(0) is returned. */ -const UnicodeString& Transliterator::getAvailableID(int32_t index) { +const UnicodeString& U_EXPORT2 Transliterator::getAvailableID(int32_t index) { const UnicodeString* result = NULL; - umtx_init(®istryMutex); - umtx_lock(®istryMutex); - if (HAVE_REGISTRY) { + umtx_lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + if (HAVE_REGISTRY(ec)) { result = ®istry->getAvailableID(index); } - umtx_unlock(®istryMutex); + umtx_unlock(registryMutex()); U_ASSERT(result != NULL); // fail if no registry return *result; } -int32_t Transliterator::countAvailableSources(void) { - umtx_init(®istryMutex); - Mutex lock(®istryMutex); - return HAVE_REGISTRY ? _countAvailableSources() : 0; +StringEnumeration* U_EXPORT2 Transliterator::getAvailableIDs(UErrorCode& ec) { + if (U_FAILURE(ec)) return NULL; + StringEnumeration* result = NULL; + umtx_lock(registryMutex()); + if (HAVE_REGISTRY(ec)) { + result = registry->getAvailableIDs(); + } + umtx_unlock(registryMutex()); + if (result == NULL) { + ec = U_INTERNAL_TRANSLITERATOR_ERROR; + } + return result; +} + +int32_t U_EXPORT2 Transliterator::countAvailableSources(void) { + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + return HAVE_REGISTRY(ec) ? _countAvailableSources() : 0; } -UnicodeString& Transliterator::getAvailableSource(int32_t index, +UnicodeString& U_EXPORT2 Transliterator::getAvailableSource(int32_t index, UnicodeString& result) { - umtx_init(®istryMutex); - Mutex lock(®istryMutex); - if (HAVE_REGISTRY) { + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + if (HAVE_REGISTRY(ec)) { _getAvailableSource(index, result); } return result; } -int32_t Transliterator::countAvailableTargets(const UnicodeString& source) { - umtx_init(®istryMutex); - Mutex lock(®istryMutex); - return HAVE_REGISTRY ? _countAvailableTargets(source) : 0; +int32_t U_EXPORT2 Transliterator::countAvailableTargets(const UnicodeString& source) { + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + return HAVE_REGISTRY(ec) ? _countAvailableTargets(source) : 0; } -UnicodeString& Transliterator::getAvailableTarget(int32_t index, +UnicodeString& U_EXPORT2 Transliterator::getAvailableTarget(int32_t index, const UnicodeString& source, UnicodeString& result) { - umtx_init(®istryMutex); - Mutex lock(®istryMutex); - if (HAVE_REGISTRY) { + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + if (HAVE_REGISTRY(ec)) { _getAvailableTarget(index, source, result); } return result; } -int32_t Transliterator::countAvailableVariants(const UnicodeString& source, +int32_t U_EXPORT2 Transliterator::countAvailableVariants(const UnicodeString& source, const UnicodeString& target) { - umtx_init(®istryMutex); - Mutex lock(®istryMutex); - return HAVE_REGISTRY ? _countAvailableVariants(source, target) : 0; + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + return HAVE_REGISTRY(ec) ? _countAvailableVariants(source, target) : 0; } -UnicodeString& Transliterator::getAvailableVariant(int32_t index, +UnicodeString& U_EXPORT2 Transliterator::getAvailableVariant(int32_t index, const UnicodeString& source, const UnicodeString& target, UnicodeString& result) { - umtx_init(®istryMutex); - Mutex lock(®istryMutex); - if (HAVE_REGISTRY) { + Mutex lock(registryMutex()); + UErrorCode ec = U_ZERO_ERROR; + if (HAVE_REGISTRY(ec)) { _getAvailableVariant(index, source, target, result); } return result; @@ -1315,18 +1450,16 @@ UChar Transliterator::filteredCharAt(const Replaceable& text, int32_t i) const { * and return TRUE. If the registry cannot be initialized, return * FALSE (rare). * - * IMPORTANT: Upon entry, registryMutex must be LOCKED. The entirely + * IMPORTANT: Upon entry, registryMutex must be LOCKED. The entire * initialization is done with the lock held. There is NO REASON to * unlock, since no other thread that is waiting on the registryMutex * cannot itself proceed until the registry is initialized. */ -UBool Transliterator::initializeRegistry() { +UBool Transliterator::initializeRegistry(UErrorCode &status) { if (registry != 0) { return TRUE; } - UErrorCode status = U_ZERO_ERROR; - registry = new TransliteratorRegistry(status); if (registry == 0 || U_FAILURE(status)) { delete registry; @@ -1335,71 +1468,88 @@ UBool Transliterator::initializeRegistry() { } /* The following code parses the index table located in - * icu/data/translit_index.txt. The index is an n x 4 table + * icu/data/translit/root.txt. The index is an n x 4 table * that follows this format: - * - * :file:: - * :internal:: - * :alias:: - * + * { + * file{ + * resource{""} + * direction{""} + * } + * } + * { + * internal{ + * resource{""} + * direction{"{ + * alias{" is the ID of the system transliterator being defined. These * are public IDs enumerated by Transliterator.getAvailableIDs(), * unless the second field is "internal". - * + * * is a ResourceReader resource name. Currently these refer * to file names under com/ibm/text/resources. This string is passed * directly to ResourceReader, together with . - * + * * is either "FORWARD" or "REVERSE". - * + * * is a string to be passed directly to * Transliterator.getInstance(). The returned Transliterator object * then has its ID changed to and is returned. * * The extra blank field on "alias" lines is to make the array square. */ - static const char translit_index[] = "translit_index"; - - UResourceBundle *bundle, *transIDs, *colBund; - bundle = ures_openDirect(0, translit_index, &status); - transIDs = ures_getByKey(bundle, RB_RULE_BASED_IDS, 0, &status); + //static const char translit_index[] = "translit_index"; - int32_t row, maxRows; + UResourceBundle *bundle = ures_open(U_ICUDATA_TRANSLIT, NULL/*open default locale*/, &status); + UResourceBundle *transIDs = ures_getByKey(bundle, RB_RULE_BASED_IDS, 0, &status); if (U_SUCCESS(status)) { - maxRows = ures_getSize(transIDs); + UResourceBundle *colBund = NULL; + UResourceBundle* res = NULL; + int32_t row, maxRows = ures_getSize(transIDs); for (row = 0; row < maxRows; row++) { - colBund = ures_getByIndex(transIDs, row, 0, &status); - - if (U_SUCCESS(status) && ures_getSize(colBund) == 4) { - UnicodeString id = ures_getUnicodeStringByIndex(colBund, 0, &status); - UChar type = ures_getUnicodeStringByIndex(colBund, 1, &status).charAt(0); - UnicodeString resString = ures_getUnicodeStringByIndex(colBund, 2, &status); - - if (U_SUCCESS(status)) { - switch (type) { - case 0x66: // 'f' - case 0x69: // 'i' - // 'file' or 'internal'; - // row[2]=resource, row[3]=direction - { - UBool visible = (type == 0x0066 /*f*/); - UTransDirection dir = - (ures_getUnicodeStringByIndex(colBund, 3, &status).charAt(0) == - 0x0046 /*F*/) ? - UTRANS_FORWARD : UTRANS_REVERSE; - registry->put(id, resString, dir, visible); - } - break; - case 0x61: // 'a' - // 'alias'; row[2]=createInstance argument - registry->put(id, resString, TRUE); - break; + colBund = ures_getByIndex(transIDs, row, colBund, &status); + if (U_FAILURE(status)) { + break; + } + const char *tridKey = ures_getKey(colBund); + if (tridKey == NULL || uprv_strstr(tridKey, "-t-") != NULL) { + continue; // Apple version should not get any of these, eliminated the root.txt entries + } + res = ures_getNextResource(colBund, res, &status); + if (U_FAILURE(status)) { + break; + } + UnicodeString trID(tridKey, -1, US_INV); + const char* typeStr = ures_getKey(res); + int32_t len = 0, dlen = 0; + UBool visible = FALSE; + const UChar *resString; + switch (typeStr[0]) { + case 'f': // "file" + visible = TRUE; + // FALLTHROUGH + case 'i': // "internal" => visible = FALSE + // child resources are resource and direction + { + resString = ures_getStringByKey(res, "resource", &len, &status); + const UChar* dirString = ures_getStringByKey(res, "direction", &dlen, &status); + UTransDirection dir = (dlen <= 0 || dirString[0] == 0x0046 /*F*/)? UTRANS_FORWARD : UTRANS_REVERSE; + registry->put(trID, UnicodeString(TRUE, resString, len), dir, TRUE, visible, status); } - } + break; + case 'a': // "alias", string argument is alias + resString = ures_getString(res, &len, &status); + registry->put(trID, UnicodeString(TRUE, resString, len), TRUE, TRUE, status); + break; + default: // do nothing + break; } - - ures_close(colBund); } + ures_close(res); + ures_close(colBund); } ures_close(transIDs); @@ -1409,12 +1559,51 @@ UBool Transliterator::initializeRegistry() { // cache. This is how new non-rule-based transliterators are // added to the system. - registry->put(new NullTransliterator(), TRUE); - registry->put(new LowercaseTransliterator(), TRUE); - registry->put(new UppercaseTransliterator(), TRUE); - registry->put(new TitlecaseTransliterator(), TRUE); - registry->put(new UnicodeNameTransliterator(), TRUE); - registry->put(new NameUnicodeTransliterator(), TRUE); + // This is to allow for null pointer check + NullTransliterator* tempNullTranslit = new NullTransliterator(); + LowercaseTransliterator* tempLowercaseTranslit = new LowercaseTransliterator(); + UppercaseTransliterator* tempUppercaseTranslit = new UppercaseTransliterator(); + TitlecaseTransliterator* tempTitlecaseTranslit = new TitlecaseTransliterator(); + UnicodeNameTransliterator* tempUnicodeTranslit = new UnicodeNameTransliterator(); + NameUnicodeTransliterator* tempNameUnicodeTranslit = new NameUnicodeTransliterator(); +#if !UCONFIG_NO_BREAK_ITERATION + // TODO: could or should these transliterators be referenced polymorphically once constructed? + BreakTransliterator* tempBreakTranslit = new BreakTransliterator(); +#endif + // Check for null pointers + if (tempNullTranslit == NULL || tempLowercaseTranslit == NULL || tempUppercaseTranslit == NULL || + tempTitlecaseTranslit == NULL || tempUnicodeTranslit == NULL || +#if !UCONFIG_NO_BREAK_ITERATION + tempBreakTranslit == NULL || +#endif + tempNameUnicodeTranslit == NULL ) + { + delete tempNullTranslit; + delete tempLowercaseTranslit; + delete tempUppercaseTranslit; + delete tempTitlecaseTranslit; + delete tempUnicodeTranslit; + delete tempNameUnicodeTranslit; +#if !UCONFIG_NO_BREAK_ITERATION + delete tempBreakTranslit; +#endif + // Since there was an error, remove registry + delete registry; + registry = NULL; + + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + + registry->put(tempNullTranslit, TRUE, status); + registry->put(tempLowercaseTranslit, TRUE, status); + registry->put(tempUppercaseTranslit, TRUE, status); + registry->put(tempTitlecaseTranslit, TRUE, status); + registry->put(tempUnicodeTranslit, TRUE, status); + registry->put(tempNameUnicodeTranslit, TRUE, status); +#if !UCONFIG_NO_BREAK_ITERATION + registry->put(tempBreakTranslit, FALSE, status); // FALSE means invisible. +#endif RemoveTransliterator::registerIDs(); // Must be within mutex EscapeTransliterator::registerIDs(); @@ -1422,33 +1611,34 @@ UBool Transliterator::initializeRegistry() { NormalizationTransliterator::registerIDs(); AnyTransliterator::registerIDs(); - _registerSpecialInverse(NullTransliterator::SHORT_ID, - NullTransliterator::SHORT_ID, FALSE); - _registerSpecialInverse("Upper", "Lower", TRUE); - _registerSpecialInverse("Title", "Lower", FALSE); + _registerSpecialInverse(UNICODE_STRING_SIMPLE("Null"), + UNICODE_STRING_SIMPLE("Null"), FALSE); + _registerSpecialInverse(UNICODE_STRING_SIMPLE("Upper"), + UNICODE_STRING_SIMPLE("Lower"), TRUE); + _registerSpecialInverse(UNICODE_STRING_SIMPLE("Title"), + UNICODE_STRING_SIMPLE("Lower"), FALSE); - ucln_i18n_registerCleanup(); + ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup); return TRUE; } U_NAMESPACE_END -// Defined in ucln_in.h: +// Defined in transreg.h: /** * Release all static memory held by transliterator. This will * necessarily invalidate any rule-based transliterators held by the * user, because RBTs hold pointers to common data objects. */ -U_CFUNC UBool transliterator_cleanup(void) { - TitlecaseTransliterator::cleanup(); +U_CFUNC UBool utrans_transliterator_cleanup(void) { + U_NAMESPACE_USE TransliteratorIDParser::cleanup(); if (registry) { delete registry; registry = NULL; } - umtx_destroy(®istryMutex); return TRUE; }