+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
- * Copyright (C) 1999-2008, International Business Machines
+ * Copyright (C) 1999-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
**********************************************************************
*/
+#include "utypeinfo.h" // for 'typeid' to work
+
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION
#include "unicode/uniset.h"
#include "unicode/uscript.h"
#include "unicode/strenum.h"
+#include "unicode/utf16.h"
#include "cpdtrans.h"
#include "nultrans.h"
#include "rbt_data.h"
/**
* The mutex controlling access to the registry object. The functions in
* this file lock it before evaluating HAVE_REGISTRY or using `registry`.
*/
-static UMTX registryMutex = 0;
+static UMutex registryMutex = U_MUTEX_INITIALIZER;
/**
* System transliterator registry; non-null when initialized.
* Starts out null; the cleanup function at the end of this file deletes
* the object and resets the pointer to null.
*/
-static U_NAMESPACE_QUALIFIER TransliteratorRegistry* registry = 0;
+static icu::TransliteratorRegistry* registry = 0;
// Macro to check/initialize the registry. ONLY USE WITHIN
// MUTEX. Avoids function call when registry is initialized:
// the || short-circuits, so initializeRegistry(status) is called only
// while `registry` is still null, and its result is then the value of
// the whole expression.
#define HAVE_REGISTRY(status) (registry!=0 || initializeRegistry(status))
-// Empty string
-static const UChar EMPTY[] = {0}; //""
-
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(Transliterator)
}
if (index.limit > 0 &&
- UTF_IS_LEAD(text.charAt(index.limit - 1))) {
+ U16_IS_LEAD(text.charAt(index.limit - 1))) {
// Oops, there is a dangling lead surrogate in the buffer.
// This will break most transliterators, since they will
// assume it is part of a pair. Don't transliterate until
int32_t n = getMaximumContextLength();
while (newCS > originalStart && n-- > 0) {
--newCS;
- newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1;
+ newCS -= U16_LENGTH(text.char32At(newCS)) - 1;
}
index.contextStart = uprv_max(newCS, originalStart);
#endif
UChar32 c;
while (index.start < globalLimit &&
!filter->contains(c=text.char32At(index.start))) {
- index.start += UTF_CHAR_LENGTH(c);
+ index.start += U16_LENGTH(c);
}
// Find the end of this run of unfiltered chars
index.limit = index.start;
while (index.limit < globalLimit &&
filter->contains(c=text.char32At(index.limit))) {
- index.limit += UTF_CHAR_LENGTH(c);
+ index.limit += U16_LENGTH(c);
}
}
// transliterations and commit complete transliterations.
for (;;) {
// Length of additional code point, either one or two
- int32_t charLength =
- UTF_CHAR_LENGTH(text.char32At(passLimit));
+ int32_t charLength = U16_LENGTH(text.char32At(passLimit));
passLimit += charLength;
if (passLimit > runLimit) {
break;
int32_t rs = rollbackStart + delta - (index.limit - passStart);
// Delete the partially transliterated text
- text.handleReplaceBetween(passStart, index.limit, EMPTY);
+ text.handleReplaceBetween(passStart, index.limit, UnicodeString());
// Copy the rollback text back
text.copy(rs, rs + uncommittedLength, passStart);
globalLimit += totalDelta;
// Delete the rollback copy
- text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, EMPTY);
+ text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, UnicodeString());
// Move start past committed text
index.start = passStart;
TransliteratorAlias* alias = 0;
Transliterator* t = 0;
- umtx_init(®istryMutex);
umtx_lock(®istryMutex);
if (HAVE_REGISTRY(ec)) {
t = registry->get(id, alias, ec);
UnicodeString* idBlock = (UnicodeString*)parser.idBlockVector.elementAt(i);
if (!idBlock->isEmpty()) {
Transliterator* temp = createInstance(*idBlock, UTRANS_FORWARD, parseError, status);
- if (temp != NULL && temp->getDynamicClassID() != NullTransliterator::getStaticClassID())
+ if (temp != NULL && typeid(*temp) != typeid(NullTransliterator))
transliterators.addElement(temp, status);
else
delete temp;
}
if (!parser.dataVector.isEmpty()) {
TransliterationRuleData* data = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
- RuleBasedTransliterator* temprbt = new RuleBasedTransliterator(UnicodeString(CompoundTransliterator::PASS_STRING) + (passNumber++),
+ // TODO: Should passNumber be turned into a decimal-string representation (1 -> "1")?
+ RuleBasedTransliterator* temprbt = new RuleBasedTransliterator(UnicodeString(CompoundTransliterator::PASS_STRING) + UnicodeString(passNumber++),
data, TRUE);
// Check if NULL before adding it to transliterators to avoid future usage of NULL pointer.
if (temprbt == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return t;
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return t;
}
transliterators.addElement(temprbt, status);
}
if (!ICU_Utility::escapeUnprintable(rulesSource, c)) {
rulesSource.append(c);
}
- i += UTF_CHAR_LENGTH(c);
+ i += U16_LENGTH(c);
}
} else {
rulesSource = getID();
}
// Returns getCount() of the CompoundTransliterator when this object is
// one, and 0 otherwise.
int32_t Transliterator::countElements() const {
- return (this->getDynamicClassID() ==
- CompoundTransliterator::getStaticClassID()) ?
- ((const CompoundTransliterator*) this)->getCount() : 0;
// The class-ID comparison (removed lines) is replaced by dynamic_cast,
// which yields NULL when this object is not a CompoundTransliterator
// (or a class derived from it).
+ const CompoundTransliterator* ct = dynamic_cast<const CompoundTransliterator*>(this);
+ return ct != NULL ? ct->getCount() : 0;
}
const Transliterator& Transliterator::getElement(int32_t index, UErrorCode& ec) const {
if (U_FAILURE(ec)) {
return *this;
}
- const CompoundTransliterator* cpd =
- (this->getDynamicClassID() == CompoundTransliterator::getStaticClassID()) ?
- (const CompoundTransliterator*) this : 0;
+ const CompoundTransliterator* cpd = dynamic_cast<const CompoundTransliterator*>(this);
int32_t n = (cpd == NULL) ? 1 : cpd->getCount();
if (index < 0 || index >= n) {
ec = U_INDEX_OUTOFBOUNDS_ERROR;
UnicodeSet& Transliterator::getSourceSet(UnicodeSet& result) const {
handleGetSourceSet(result);
if (filter != NULL) {
- UnicodeSet* filterSet;
+ UnicodeSet* filterSet = dynamic_cast<UnicodeSet*>(filter);
UBool deleteFilterSet = FALSE;
// Most, but not all filters will be UnicodeSets. Optimize for
// the high-runner case.
- if (filter->getDynamicClassID() == UnicodeSet::getStaticClassID()) {
- filterSet = (UnicodeSet*) filter;
- } else {
+ if (filterSet == NULL) {
filterSet = new UnicodeSet();
// Check null pointer
if (filterSet == NULL) {
void U_EXPORT2 Transliterator::registerFactory(const UnicodeString& id,
Transliterator::Factory factory,
Transliterator::Token context) {
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
if (HAVE_REGISTRY(ec)) {
* @see #unregister
*/
void U_EXPORT2 Transliterator::registerInstance(Transliterator* adoptedPrototype) {
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
if (HAVE_REGISTRY(ec)) {
void U_EXPORT2 Transliterator::registerAlias(const UnicodeString& aliasID,
const UnicodeString& realID) {
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
if (HAVE_REGISTRY(ec)) {
/**
* Unregisters a transliterator or class. This may be either
* a system transliterator or a user transliterator or class.
- *
+ *
* @param ID the ID of the transliterator or class
* @see #registerInstance
*/
void U_EXPORT2 Transliterator::unregister(const UnicodeString& ID) {
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
if (HAVE_REGISTRY(ec)) {
*/
int32_t U_EXPORT2 Transliterator::countAvailableIDs(void) {
int32_t retVal = 0;
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
if (HAVE_REGISTRY(ec)) {
*/
const UnicodeString& U_EXPORT2 Transliterator::getAvailableID(int32_t index) {
const UnicodeString* result = NULL;
- umtx_init(®istryMutex);
umtx_lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
if (HAVE_REGISTRY(ec)) {
StringEnumeration* U_EXPORT2 Transliterator::getAvailableIDs(UErrorCode& ec) {
if (U_FAILURE(ec)) return NULL;
StringEnumeration* result = NULL;
- umtx_init(®istryMutex);
umtx_lock(®istryMutex);
if (HAVE_REGISTRY(ec)) {
result = registry->getAvailableIDs();
}
int32_t U_EXPORT2 Transliterator::countAvailableSources(void) {
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
return HAVE_REGISTRY(ec) ? _countAvailableSources() : 0;
UnicodeString& U_EXPORT2 Transliterator::getAvailableSource(int32_t index,
UnicodeString& result) {
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
if (HAVE_REGISTRY(ec)) {
}
int32_t U_EXPORT2 Transliterator::countAvailableTargets(const UnicodeString& source) {
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
return HAVE_REGISTRY(ec) ? _countAvailableTargets(source) : 0;
UnicodeString& U_EXPORT2 Transliterator::getAvailableTarget(int32_t index,
const UnicodeString& source,
UnicodeString& result) {
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
if (HAVE_REGISTRY(ec)) {
int32_t U_EXPORT2 Transliterator::countAvailableVariants(const UnicodeString& source,
const UnicodeString& target) {
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
return HAVE_REGISTRY(ec) ? _countAvailableVariants(source, target) : 0;
const UnicodeString& source,
const UnicodeString& target,
UnicodeString& result) {
- umtx_init(®istryMutex);
Mutex lock(®istryMutex);
UErrorCode ec = U_ZERO_ERROR;
if (HAVE_REGISTRY(ec)) {
* and return TRUE. If the registry cannot be initialized, return
* FALSE (rare).
*
- * IMPORTANT: Upon entry, registryMutex must be LOCKED. The entirely
+ * IMPORTANT: Upon entry, registryMutex must be LOCKED. The entire
* initialization is done with the lock held. There is NO REASON to
* unlock, since any other thread that is waiting on the registryMutex
* cannot itself proceed until the registry is initialized.
* <id> is the ID of the system transliterator being defined. These
* are public IDs enumerated by Transliterator.getAvailableIDs(),
* unless the second field is "internal".
- *
+ *
* <resource> is a ResourceReader resource name. Currently these refer
* to file names under com/ibm/text/resources. This string is passed
* directly to ResourceReader, together with <encoding>.
- *
+ *
* <direction> is either "FORWARD" or "REVERSE".
- *
+ *
* <getInstanceArg> is a string to be passed directly to
* Transliterator.getInstance(). The returned Transliterator object
* then has its ID changed to <id> and is returned.
*/
//static const char translit_index[] = "translit_index";
- UResourceBundle *bundle, *transIDs, *colBund;
- bundle = ures_open(U_ICUDATA_TRANSLIT, NULL/*open default locale*/, &status);
- transIDs = ures_getByKey(bundle, RB_RULE_BASED_IDS, 0, &status);
-
- int32_t row, maxRows;
+ UResourceBundle *bundle = ures_open(U_ICUDATA_TRANSLIT, NULL/*open default locale*/, &status);
+ UResourceBundle *transIDs = ures_getByKey(bundle, RB_RULE_BASED_IDS, 0, &status);
if (U_SUCCESS(status)) {
- maxRows = ures_getSize(transIDs);
+ UResourceBundle *colBund = NULL;
+ UResourceBundle* res = NULL;
+ int32_t row, maxRows = ures_getSize(transIDs);
for (row = 0; row < maxRows; row++) {
- colBund = ures_getByIndex(transIDs, row, 0, &status);
- if (U_SUCCESS(status)) {
- UnicodeString id(ures_getKey(colBund), -1, US_INV);
- UResourceBundle* res = ures_getNextResource(colBund, NULL, &status);
- const char* typeStr = ures_getKey(res);
- UChar type;
- u_charsToUChars(typeStr, &type, 1);
-
- if (U_SUCCESS(status)) {
- int32_t len = 0;
- const UChar *resString;
- switch (type) {
- case 0x66: // 'f'
- case 0x69: // 'i'
- // 'file' or 'internal';
- // row[2]=resource, row[3]=direction
- {
-
- resString = ures_getStringByKey(res, "resource", &len, &status);
- UBool visible = (type == 0x0066 /*f*/);
- UTransDirection dir =
- (ures_getUnicodeStringByKey(res, "direction", &status).charAt(0) ==
- 0x0046 /*F*/) ?
- UTRANS_FORWARD : UTRANS_REVERSE;
- registry->put(id, UnicodeString(TRUE, resString, len), dir, TRUE, visible, status);
- }
- break;
- case 0x61: // 'a'
- // 'alias'; row[2]=createInstance argument
- resString = ures_getString(res, &len, &status);
- registry->put(id, UnicodeString(TRUE, resString, len), TRUE, TRUE, status);
- break;
+ colBund = ures_getByIndex(transIDs, row, colBund, &status);
+ if (U_FAILURE(status)) {
+ break;
+ }
+ const char *tridKey = ures_getKey(colBund);
+ if (tridKey == NULL || uprv_strstr(tridKey, "-t-") != NULL) {
+ continue; // Apple version should not get any of these, eliminated the root.txt entries
+ }
+ res = ures_getNextResource(colBund, res, &status);
+ if (U_FAILURE(status)) {
+ break;
+ }
+ UnicodeString trID(tridKey, -1, US_INV);
+ const char* typeStr = ures_getKey(res);
+ int32_t len = 0, dlen = 0;
+ UBool visible = FALSE;
+ const UChar *resString;
+ switch (typeStr[0]) {
+ case 'f': // "file"
+ visible = TRUE;
+ // FALLTHROUGH
+ case 'i': // "internal" => visible = FALSE
+ // child resources are resource and direction
+ {
+ resString = ures_getStringByKey(res, "resource", &len, &status);
+ const UChar* dirString = ures_getStringByKey(res, "direction", &dlen, &status);
+ UTransDirection dir = (dlen <= 0 || dirString[0] == 0x0046 /*F*/)? UTRANS_FORWARD : UTRANS_REVERSE;
+ registry->put(trID, UnicodeString(TRUE, resString, len), dir, TRUE, visible, status);
}
- }
- ures_close(res);
+ break;
+ case 'a': // "alias", string argument is alias
+ resString = ures_getString(res, &len, &status);
+ registry->put(trID, UnicodeString(TRUE, resString, len), TRUE, TRUE, status);
+ break;
+ default: // do nothing
+ break;
}
- ures_close(colBund);
}
+ ures_close(res);
+ ures_close(colBund);
}
ures_close(transIDs);
// Manually add prototypes that the system knows about to the
// cache. This is how new non-rule-based transliterators are
// added to the system.
-
+
// This is to allow for null pointer check
NullTransliterator* tempNullTranslit = new NullTransliterator();
LowercaseTransliterator* tempLowercaseTranslit = new LowercaseTransliterator();
#endif
// Check for null pointers
if (tempNullTranslit == NULL || tempLowercaseTranslit == NULL || tempUppercaseTranslit == NULL ||
- tempTitlecaseTranslit == NULL || tempUnicodeTranslit == NULL ||
+ tempTitlecaseTranslit == NULL || tempUnicodeTranslit == NULL ||
#if !UCONFIG_NO_BREAK_ITERATION
tempBreakTranslit == NULL ||
#endif
_registerSpecialInverse(UNICODE_STRING_SIMPLE("Title"),
UNICODE_STRING_SIMPLE("Lower"), FALSE);
- ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, transliterator_cleanup);
+ ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup);
return TRUE;
}
U_NAMESPACE_END
-// Defined in ucln_in.h:
+// Defined in transreg.h:
/**
* Release all static memory held by transliterator. This will
* necessarily invalidate any rule-based transliterators held by the
* user, because RBTs hold pointers to common data objects.
*/
-U_CFUNC UBool transliterator_cleanup(void) {
+U_CFUNC UBool utrans_transliterator_cleanup(void) {
U_NAMESPACE_USE
// This function is the callback handed to
// ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, ...) earlier in
// this file. It always returns TRUE.
TransliteratorIDParser::cleanup();
// Delete the system registry and null the pointer, so that a later
// HAVE_REGISTRY check sees registry == 0 and initializes it again.
if (registry) {
delete registry;
registry = NULL;
}
// Removed line: the statically initialized UMutex is no longer destroyed
// explicitly. The argument is the address of registryMutex.
- umtx_destroy(&registryMutex);
return TRUE;
}