+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
- * Copyright (C) 1996-2012, International Business Machines Corporation and
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
* Normalizer::EMode
* 11/23/9 srl Inlining of some critical functions
* 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h)
+ * 2012-2014 markus Rewritten in C++ again.
-#include "utypeinfo.h" // for 'typeid' to work
+#include "utypeinfo.h" // for 'typeid' to work
#include "unicode/utypes.h"
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
+#include "collationdata.h"
+#include "collationroot.h"
+#include "collationtailoring.h"
#include "ucol_imp.h"
#include "cstring.h"
#include "cmemory.h"
#include "umutex.h"
#include "servloc.h"
+#include "uassert.h"
#include "ustrenum.h"
#include "uresimp.h"
#include "ucln_in.h"
static icu::Locale* availableLocaleList = NULL;  // array allocated in initAvailableLocaleList(); released with delete[] by the cleanup code
static int32_t availableLocaleListCount;  // number of entries in availableLocaleList; set from ures_getSize() when the list is loaded
static icu::ICULocaleService* gService = NULL;  // created in initService(); deleted by the cleanup code
+static icu::UInitOnce gServiceInitOnce = U_INITONCE_INITIALIZER;  // guard passed to umtx_initOnce() in getService(); reset by the cleanup code
+static icu::UInitOnce gAvailableLocaleListInitOnce;  // guard for loading availableLocaleList. NOTE(review): unlike gServiceInitOnce there is no explicit U_INITONCE_INITIALIZER here — confirm static zero-initialization is sufficient
* Release all static memory held by collator.
delete gService;
gService = NULL;
+ gServiceInitOnce.reset();  // keep the guard in step with the deleted service: hasService() tests isReset() before it looks at the service
if (availableLocaleList) {
delete []availableLocaleList;
availableLocaleList = NULL;
availableLocaleListCount = 0;
+ gAvailableLocaleListInitOnce.reset();  // initAvailableLocaleList() asserts an empty list and a zero count, so the guard is reset together with them
return TRUE;
if (actualReturn == NULL) {
actualReturn = &ar;  // caller supplied no output string: point at `ar` instead (presumably a local declared above this view)
- Collator* result = (Collator*)ICULocaleService::getKey(key, actualReturn, status);
- // Ugly Hack Alert! If the actualReturn length is zero, this
- // means we got a default object, not a "real" service-created
- // object. We don't call setLocales() on a default object,
- // because that will overwrite its correct built-in locale
- // metadata (valid & actual) with our incorrect data (all we
- // have is the requested locale). (TODO remove in 3.0) [aliu]
- if (result && actualReturn->length() > 0) {
- const LocaleKey& lkey = (const LocaleKey&)key;
- Locale canonicalLocale("");
- Locale currentLocale("");
- LocaleUtility::initLocaleFromName(*actualReturn, currentLocale);
- result->setLocales(lkey.canonicalLocale(canonicalLocale), currentLocale, currentLocale);
- }
- return result;
+ return (Collator*)ICULocaleService::getKey(key, actualReturn, status);  // result is handed back unchanged; the setLocales() fix-up removed above is no longer done here (registerInstance() sets the locales when a collator is registered)
virtual UBool isDefault() const {
// -------------------------------------
+static void U_CALLCONV initService() {  // initializer handed to umtx_initOnce(gServiceInitOnce, ...) by getService()
+ gService = new ICUCollatorService();  // NOTE(review): the allocation result is not checked here; hasService() compares getService() against NULL afterwards
+ ucln_i18n_registerCleanup(UCLN_I18N_COLLATOR, collator_cleanup);  // registers collator_cleanup under UCLN_I18N_COLLATOR so the statics above get released
static ICULocaleService*
- UBool needInit;
- UMTX_CHECK(NULL, (UBool)(gService == NULL), needInit);
- if(needInit) {
- ICULocaleService *newservice = new ICUCollatorService();
- if(newservice) {
- umtx_lock(NULL);
- if(gService == NULL) {
- gService = newservice;
- newservice = NULL;
- }
- umtx_unlock(NULL);
- }
- if(newservice) {
- delete newservice;
- }
- else {
- ucln_i18n_registerCleanup(UCLN_I18N_COLLATOR, collator_cleanup);
- }
- }
+ umtx_initOnce(gServiceInitOnce, &initService);  // creation and cleanup registration now live in initService(); this replaces the check / lock / re-check sequence removed above
return gService;
static inline UBool
- UBool retVal;
- UMTX_CHECK(NULL, gService != NULL, retVal);
+ UBool retVal = !gServiceInitOnce.isReset() && (getService() != NULL);  // && short-circuits: getService() is only called when the guard is not in its reset state, so this query alone does not trigger initService()
return retVal;
-// -------------------------------------
+#endif /* UCONFIG_NO_SERVICE */
-Collator::createUCollator(const char *loc,
- UErrorCode *status)
- UCollator *result = 0;
- if (status && U_SUCCESS(*status) && hasService()) {
- Locale desiredLocale(loc);
- Collator *col = (Collator*)gService->get(desiredLocale, *status);
- RuleBasedCollator *rbc;
- if (col && (rbc = dynamic_cast<RuleBasedCollator *>(col))) {
- if (!rbc->dataIsOwned) {
- result = ucol_safeClone(rbc->ucollator, NULL, NULL, status);
- } else {
- result = rbc->ucollator;
- rbc->ucollator = NULL; // to prevent free on delete
+static void U_CALLCONV
+initAvailableLocaleList(UErrorCode &status) {  // fills availableLocaleList / availableLocaleListCount; passed to umtx_initOnce() by isAvailableLocaleListInitialized()
+ U_ASSERT(availableLocaleListCount == 0);  // precondition: nothing has been loaded yet
+ U_ASSERT(availableLocaleList == NULL);
+ // for now, there is a hardcoded list, so just walk through that list and set it up.
+ UResourceBundle *index = NULL;
+ UResourceBundle installed;
+ int32_t i = 0;  // number of locales copied so far
+ ures_initStackObject(&installed);  // `installed` is a stack object; it is closed with ures_close(&installed) further down
+ index = ures_openDirect(U_ICUDATA_COLL, "res_index", &status);
+ ures_getByKey(index, "InstalledLocales", &installed, &status);
+ if(U_SUCCESS(status)) {
+ availableLocaleListCount = ures_getSize(&installed);
+ availableLocaleList = new Locale[availableLocaleListCount];  // NOTE(review): on the lines visible here a NULL result leaves status untouched and the count non-zero — confirm how callers and the assertion below cope
+ if (availableLocaleList != NULL) {
+ ures_resetIterator(&installed);
+ while(ures_hasNext(&installed)) {
+ const char *tempKey = NULL;
+ ures_getNextString(&installed, NULL, &tempKey, &status);  // the entry's key is used as the locale ID; the returned string itself is ignored
+ availableLocaleList[i++] = Locale(tempKey);
- } else {
- // should go in a function- ucol_initDelegate(delegate)
- result = (UCollator *)uprv_malloc(sizeof(UCollator));
- if(result == NULL) {
- } else {
- uprv_memset(result, 0, sizeof(UCollator));
- result->delegate = col;
- result->freeOnClose = TRUE; // do free on close.
- col = NULL; // to prevent free on delete.
- }
- delete col;
+ U_ASSERT(availableLocaleListCount == i);  // every installed entry must have been copied
+ ures_close(&installed);
- return result;
+ ures_close(index);
+ ucln_i18n_registerCleanup(UCLN_I18N_COLLATOR, collator_cleanup);  // same cleanup function that initService() registers
-#endif /* UCONFIG_NO_SERVICE */
static UBool isAvailableLocaleListInitialized(UErrorCode &status) {
- // for now, there is a hardcoded list, so just walk through that list and set it up.
- UBool needInit;
- UMTX_CHECK(NULL, availableLocaleList == NULL, needInit);
- if (needInit) {
- UResourceBundle *index = NULL;
- UResourceBundle installed;
- Locale * temp;
- int32_t i = 0;
- int32_t localeCount;
- ures_initStackObject(&installed);
- index = ures_openDirect(U_ICUDATA_COLL, "res_index", &status);
- ures_getByKey(index, "InstalledLocales", &installed, &status);
- if(U_SUCCESS(status)) {
- localeCount = ures_getSize(&installed);
- temp = new Locale[localeCount];
- if (temp != NULL) {
- ures_resetIterator(&installed);
- while(ures_hasNext(&installed)) {
- const char *tempKey = NULL;
- ures_getNextString(&installed, NULL, &tempKey, &status);
- temp[i++] = Locale(tempKey);
- }
- umtx_lock(NULL);
- if (availableLocaleList == NULL)
- {
- availableLocaleListCount = localeCount;
- availableLocaleList = temp;
- temp = NULL;
- ucln_i18n_registerCleanup(UCLN_I18N_COLLATOR, collator_cleanup);
- }
- umtx_unlock(NULL);
- needInit = FALSE;
- if (temp) {
- delete []temp;
- }
- }
+ umtx_initOnce(gAvailableLocaleListInitOnce, &initAvailableLocaleList, status);  // the loading logic removed above now lives in initAvailableLocaleList(); status is forwarded to it
+ return U_SUCCESS(status);  // the result is now derived from status instead of the removed needInit flag
+// Collator public methods -----------------------------------------------
+namespace {
+static const struct {
+ const char *name;  // locale keyword as looked up with Locale::getKeywordValue()
+ UColAttribute attr;  // attribute passed to Collator::setAttribute() for that keyword
+} collAttributes[] = {  // keyword -> attribute table walked by setAttributesFromKeywords()
+ { "colStrength", UCOL_STRENGTH },
+ { "colBackwards", UCOL_FRENCH_COLLATION },
+ { "colCaseLevel", UCOL_CASE_LEVEL },
+ { "colCaseFirst", UCOL_CASE_FIRST },
+ { "colAlternate", UCOL_ALTERNATE_HANDLING },
+ { "colNormalization", UCOL_NORMALIZATION_MODE },
+ { "colNumeric", UCOL_NUMERIC_COLLATION }
- ures_close(&installed);
+static const struct {
+ const char *name;  // keyword value text, compared with uprv_stricmp() in setAttributesFromKeywords()
+ UColAttributeValue value;  // value passed to Collator::setAttribute() on a match
+} collAttributeValues[] = {  // one shared value table for all keywords in collAttributes
+ { "primary", UCOL_PRIMARY },
+ { "secondary", UCOL_SECONDARY },
+ { "tertiary", UCOL_TERTIARY },
+ { "quaternary", UCOL_QUATERNARY },
+ // Note: Not supporting typo "quarternary" because it was never supported in locale IDs.
+ { "identical", UCOL_IDENTICAL },
+ { "no", UCOL_OFF },
+ { "yes", UCOL_ON },
+ { "shifted", UCOL_SHIFTED },
+ { "non-ignorable", UCOL_NON_IGNORABLE },
+ { "lower", UCOL_LOWER_FIRST },
+ { "upper", UCOL_UPPER_FIRST }
+static const char *collReorderCodes[UCOL_REORDER_CODE_LIMIT - UCOL_REORDER_CODE_FIRST] = {  // one name slot per special reorder code; presumably indexed by (code - UCOL_REORDER_CODE_FIRST) — confirm
+ "space", "punct", "symbol", "currency", "digit"
+int32_t getReorderCode(const char *s) {  // looks s up in collReorderCodes using uprv_stricmp(); returns -1 when no name matches
+ for (int32_t i = 0; i < UPRV_LENGTHOF(collReorderCodes); ++i) {
+ if (uprv_stricmp(s, collReorderCodes[i]) == 0) {  // NOTE(review): the statement executed on a match is not visible in this view
- ures_close(index);
- return !needInit;
+ // Not supporting "others" = UCOL_REORDER_CODE_OTHERS
+ // as a synonym for Zzzz = USCRIPT_UNKNOWN for now:
+ // Avoid introducing synonyms/aliases.
+ return -1;
-// Collator public methods -----------------------------------------------
+ * Sets collation attributes according to locale keywords. See
+ * http://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Settings
+ *
+ * Using "alias" keywords and values where defined:
+ * http://www.unicode.org/reports/tr35/tr35.html#Old_Locale_Extension_Syntax
+ * http://unicode.org/repos/cldr/trunk/common/bcp47/collation.xml
+ */
+void setAttributesFromKeywords(const Locale &loc, Collator &coll, UErrorCode &errorCode) {  // reads collation keywords from loc and applies them to coll through setAttribute(), setReorderCodes() and setMaxVariable()
+ if (U_FAILURE(errorCode)) {
+ return;  // nothing is read or applied when the incoming error code already signals failure
+ }
+ if (uprv_strcmp(loc.getName(), loc.getBaseName()) == 0) {
+ // No keywords.
+ return;
+ }
+ char value[1024]; // The reordering value could be long.
+ // Check for collation keywords that were already deprecated
+ // before any were supported in createInstance() (except for "collation").
+ int32_t length = loc.getKeywordValue("colHiraganaQuaternary", value, UPRV_LENGTHOF(value), errorCode);
+ if (U_FAILURE(errorCode)) {
+ return;
+ }
+ if (length != 0) {  // keyword is present
+ return;  // NOTE(review): on the lines visible here no error code is set before returning — confirm against the full file
+ }
+ length = loc.getKeywordValue("variableTop", value, UPRV_LENGTHOF(value), errorCode);
+ if (U_FAILURE(errorCode)) {
+ return;
+ }
+ if (length != 0) {  // keyword is present; handled the same way as "colHiraganaQuaternary" above
+ return;
+ }
+ // Parse known collation keywords, ignore others.
+ errorCode = U_ZERO_ERROR;  // NOTE(review): the condition guarding this reset is not visible in this view
+ }
+ for (int32_t i = 0; i < UPRV_LENGTHOF(collAttributes); ++i) {  // one pass per supported keyword name
+ length = loc.getKeywordValue(collAttributes[i].name, value, UPRV_LENGTHOF(value), errorCode);
+ if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {  // the not-terminated warning is treated like a failure here
+ return;
+ }
+ if (length == 0) { continue; }  // this keyword is absent from loc
+ for (int32_t j = 0;; ++j) {  // no loop condition: leaves via the return (no match) or the break (match) below
+ if (j == UPRV_LENGTHOF(collAttributeValues)) {  // walked past the last table entry without a match
+ return;
+ }
+ if (uprv_stricmp(value, collAttributeValues[j].name) == 0) {
+ coll.setAttribute(collAttributes[i].attr, collAttributeValues[j].value, errorCode);
+ break;
+ }
+ }
+ }
+ length = loc.getKeywordValue("colReorder", value, UPRV_LENGTHOF(value), errorCode);
+ if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
+ return;
+ }
+ if (length != 0) {
+ int32_t codesLength = 0;  // number of entries written to codes (NOTE(review): the declaration of codes is not visible in this view)
+ char *scriptName = value;  // start of the item currently being parsed inside value
+ for (;;) {
+ if (codesLength == UPRV_LENGTHOF(codes)) {  // codes is full, no room for another entry
+ return;
+ }
+ char *limit = scriptName;
+ char c;
+ while ((c = *limit) != 0 && c != '-') { ++limit; }  // advance to the end of this '-'-separated item
+ *limit = 0;  // terminate the item in place so it can be passed on as a C string; c keeps the overwritten character
+ int32_t code;
+ if ((limit - scriptName) == 4) {
+ // Strict parsing, accept only 4-letter script codes, not long names.
+ code = u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName);
+ } else {
+ code = getReorderCode(scriptName);  // any other length is tried against the special reorder-code names
+ }
+ if (code < 0) {  // neither lookup recognized the item
+ return;
+ }
+ codes[codesLength++] = code;
+ if (c == 0) { break; }  // the item ended at the end of the value: that was the last one
+ scriptName = limit + 1;  // continue after the '-' delimiter
+ }
+ coll.setReorderCodes(codes, codesLength, errorCode);
+ }
+ length = loc.getKeywordValue("kv", value, UPRV_LENGTHOF(value), errorCode);
+ if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
+ return;
+ }
+ if (length != 0) {
+ int32_t code = getReorderCode(value);  // "kv" accepts only the names known to getReorderCode()
+ if (code < 0) {
+ return;
+ }
+ coll.setMaxVariable((UColReorderCode)code, errorCode);
+ }
+ if (U_FAILURE(errorCode)) {  // NOTE(review): the body of this branch is not visible in this view
+ }
+} // namespace
Collator* U_EXPORT2 Collator::createInstance(UErrorCode& success)
if (U_FAILURE(status))
return 0;
+ if (desiredLocale.isBogus()) {
+ // Locale constructed from malformed locale ID or language tag.
+ return NULL;
+ }
+ Collator* coll;  // assigned by exactly one of the two branches below
if (hasService()) {
Locale actualLoc;
- Collator *result =
- (Collator*)gService->get(desiredLocale, &actualLoc, status);
- // Ugly Hack Alert! If the returned locale is empty (not root,
- // but empty -- getName() == "") then that means the service
- // returned a default object, not a "real" service object. In
- // that case, the locale metadata (valid & actual) is setup
- // correctly already, and we don't want to overwrite it. (TODO
- // remove in 3.0) [aliu]
- if (*actualLoc.getName() != 0) {
- result->setLocales(desiredLocale, actualLoc, actualLoc);
- }
- return result;
- }
+ coll = (Collator*)gService->get(desiredLocale, &actualLoc, status);  // a service is in use: ask it for the collator; actualLoc is filled in but not used afterwards on the lines visible here
+ } else
- return makeInstance(desiredLocale, status);
-Collator* Collator::makeInstance(const Locale& desiredLocale,
- UErrorCode& status)
- // A bit of explanation is required here. Although in the current
- // implementation
- // Collator::createInstance() is just turning around and calling
- // RuleBasedCollator(Locale&), this will not necessarily always be the
- // case. For example, suppose we modify this code to handle a
- // non-table-based Collator, such as that for Thai. In this case,
- // createInstance() will have to be modified to somehow determine this fact
- // (perhaps a field in the resource bundle). Then it can construct the
- // non-table-based Collator in some other way, when it sees that it needs
- // to.
- // The specific caution is this: RuleBasedCollator(Locale&) will ALWAYS
- // return a valid collation object, if the system is functioning properly.
- // The reason is that it will fall back, use the default locale, and even
- // use the built-in default collation rules. THEREFORE, createInstance()
- // should in general ONLY CALL RuleBasedCollator(Locale&) IF IT KNOWS IN
- // ADVANCE that the given locale's collation is properly implemented as a
- // RuleBasedCollator.
- // Currently, we don't do this...we always return a RuleBasedCollator,
- // whether it is strictly correct to do so or not, without checking, because
- // we currently have no way of checking.
- RuleBasedCollator* collation = new RuleBasedCollator(desiredLocale,
- status);
- /* test for NULL */
- if (collation == 0) {
- return 0;
- }
- if (U_FAILURE(status))
- delete collation;
- collation = 0;
+ coll = makeInstance(desiredLocale, status);  // no service in use: build the collator directly
+ }
+ setAttributesFromKeywords(desiredLocale, *coll, status);  // NOTE(review): coll is dereferenced without a NULL check; this is only harmless if a NULL coll always comes with a failing status, in which case the callee returns before touching it — confirm
+ if (U_FAILURE(status)) {
+ delete coll;  // reached for a failure from either creation or keyword processing
+ return NULL;
- return collation;
+ return coll;
-// !!! dlf the following is obsolete, ignore registration for this
-Collator *
-Collator::createInstance(const Locale &loc,
- UVersionInfo version,
- UErrorCode &status)
- Collator *collator;
- UVersionInfo info;
- collator=new RuleBasedCollator(loc, status);
- /* test for NULL */
- if (collator == 0) {
+Collator* Collator::makeInstance(const Locale& desiredLocale, UErrorCode& status) {  // loads the tailoring for desiredLocale and wraps it in a new RuleBasedCollator; returns NULL otherwise
+ const CollationCacheEntry *entry = CollationLoader::loadTailoring(desiredLocale, status);  // per the comments below, the entry arrives with a reference taken by the cache's get()
+ if (U_SUCCESS(status)) {
+ Collator *result = new RuleBasedCollator(entry);
+ if (result != NULL) {
+ // Both the unified cache's get() and the RBC constructor
+ // did addRef(). Undo one of them.
+ entry->removeRef();
+ return result;
+ }
- return 0;
- if(U_SUCCESS(status)) {
- collator->getVersion(info);
- if(0!=uprv_memcmp(version, info, sizeof(UVersionInfo))) {
- delete collator;
- return 0;
- }
+ if (entry != NULL) {  // reached when loading failed or no collator object was created
+ // Undo the addRef() from the cache.get().
+ entry->removeRef();
- return collator;
+ return NULL;
Collator *
Collator::safeClone() const {
Collator::registerInstance(Collator* toAdopt, const Locale& locale, UErrorCode& status)
if (U_SUCCESS(status)) {
+ // Set the collator locales while registering so that createInstance()
+ // need not guess whether the collator's locales are already set properly
+ // (as they are by the data loader).
+ toAdopt->setLocales(locale, locale, locale);  // the registered locale is used for all three arguments. NOTE(review): toAdopt is dereferenced without a NULL check — confirm callers never pass NULL
return getService()->registerInstance(toAdopt, locale, status);
return NULL;
StringEnumeration* U_EXPORT2
Collator::getKeywords(UErrorCode& status) {
- // This is a wrapper over ucol_getKeywords
- UEnumeration* uenum = ucol_getKeywords(&status);
- if (U_FAILURE(status)) {
- uenum_close(uenum);
- return NULL;
- }
- return new UStringEnumeration(uenum);
+ return UStringEnumeration::fromUEnumeration(  // still a wrapper over ucol_getKeywords(); the status check and uenum_close() removed above are presumably done inside fromUEnumeration() — confirm
+ ucol_getKeywords(&status), status);
StringEnumeration* U_EXPORT2
Collator::getKeywordValues(const char *keyword, UErrorCode& status) {
- // This is a wrapper over ucol_getKeywordValues
- UEnumeration* uenum = ucol_getKeywordValues(keyword, &status);
- if (U_FAILURE(status)) {
- uenum_close(uenum);
- return NULL;
- }
- return new UStringEnumeration(uenum);
+ return UStringEnumeration::fromUEnumeration(  // still a wrapper over ucol_getKeywordValues(); the status check and uenum_close() removed above are presumably done inside fromUEnumeration() — confirm
+ ucol_getKeywordValues(keyword, &status), status);
StringEnumeration* U_EXPORT2
Collator::getKeywordValuesForLocale(const char* key, const Locale& locale,
UBool commonlyUsed, UErrorCode& status) {
- // This is a wrapper over ucol_getKeywordValuesForLocale
- UEnumeration *uenum = ucol_getKeywordValuesForLocale(key, locale.getName(),
- commonlyUsed, &status);
- if (U_FAILURE(status)) {
- uenum_close(uenum);
- return NULL;
- }
- return new UStringEnumeration(uenum);
+ return UStringEnumeration::fromUEnumeration(  // still a wrapper over ucol_getKeywordValuesForLocale(); the status check and uenum_close() removed above are presumably done inside fromUEnumeration() — confirm
+ ucol_getKeywordValuesForLocale(
+ key, locale.getName(), commonlyUsed, &status),  // the C API takes the locale as its name string
+ status);
Locale U_EXPORT2
setAttribute(UCOL_STRENGTH, (UColAttributeValue)newStrength, intStatus);
+Collator &
+Collator::setMaxVariable(UColReorderCode /*group*/, UErrorCode &errorCode) {  // base-class implementation: the group argument is unused (its name is commented out)
+ if (U_SUCCESS(errorCode)) {  // NOTE(review): the body of this branch is not visible in this view
+ }
+ return *this;  // returns the collator itself
+Collator::getMaxVariable() const {
Collator::getReorderCodes(int32_t* /* dest*/,
int32_t /* destCapacity*/,
-int32_t U_EXPORT2
-Collator::getEquivalentReorderCodes(int32_t /* reorderCode */,
- int32_t* /* dest */,
- int32_t /* destCapacity */,
- UErrorCode& status)
- if (U_SUCCESS(status)) {
+Collator::getEquivalentReorderCodes(int32_t reorderCode,
+ int32_t *dest, int32_t capacity,
+ UErrorCode &errorCode) {  // forwards to CollationData::getEquivalentScripts() on the root collation data; returns 0 on any early exit
+ if(U_FAILURE(errorCode)) { return 0; }
+ if(capacity < 0 || (dest == NULL && capacity > 0)) {  // invalid buffer: negative capacity, or a NULL dest combined with a non-zero capacity
+ return 0;
- return 0;
+ const CollationData *baseData = CollationRoot::getData(errorCode);
+ if(U_FAILURE(errorCode)) { return 0; }  // root data could not be obtained
+ return baseData->getEquivalentScripts(reorderCode, dest, capacity, errorCode);
return 0;
+Collator::internalCompareUTF8(const char *left, int32_t leftLength,
+ const char *right, int32_t rightLength,
+ UErrorCode &errorCode) const {  // pointer-and-length front end for compareUTF8(StringPiece, StringPiece, errorCode)
+ if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
+ if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {  // a NULL pointer is only accepted together with length 0
+ return UCOL_EQUAL;
+ }
+ return compareUTF8(
+ StringPiece(left, (leftLength < 0) ? uprv_strlen(left) : leftLength),  // negative length: the string is measured with uprv_strlen()
+ StringPiece(right, (rightLength < 0) ? uprv_strlen(right) : rightLength),
+ errorCode);
+Collator::internalNextSortKeyPart(UCharIterator * /*iter*/, uint32_t /*state*/[2],
+ uint8_t * /*dest*/, int32_t /*count*/, UErrorCode &errorCode) const {  // base-class implementation: every argument except errorCode is unused (names commented out)
+ if (U_SUCCESS(errorCode)) {  // NOTE(review): the body of this branch is not visible in this view
+ }
+ return 0;  // always returns 0
// UCollator private data members ----------------------------------------
/* This is useless information */