X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..57a6839dcb3bba09e8228b822b290604668416fe:/icuSources/i18n/ucol_sit.cpp diff --git a/icuSources/i18n/ucol_sit.cpp b/icuSources/i18n/ucol_sit.cpp index 7fa8fb54..15e0981b 100644 --- a/icuSources/i18n/ucol_sit.cpp +++ b/icuSources/i18n/ucol_sit.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2004-2006, International Business Machines +* Copyright (C) 2004-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: ucol_sit.cpp @@ -14,23 +14,33 @@ */ #include "unicode/ustring.h" - +#include "unicode/udata.h" +#include "unicode/utf16.h" #include "utracimp.h" #include "ucol_imp.h" -#include "ucol_tok.h" -#include "unormimp.h" #include "cmemory.h" #include "cstring.h" +#include "uresimp.h" +#include "unicode/coll.h" + +#ifdef UCOL_TRACE_SIT +# include +#endif #if !UCONFIG_NO_COLLATION +#include "unicode/tblcoll.h" + enum OptionsList { UCOL_SIT_LANGUAGE = 0, - UCOL_SIT_SCRIPT, - UCOL_SIT_REGION, - UCOL_SIT_VARIANT, - UCOL_SIT_KEYWORD, - UCOL_SIT_RFC3166BIS, + UCOL_SIT_SCRIPT = 1, + UCOL_SIT_REGION = 2, + UCOL_SIT_VARIANT = 3, + UCOL_SIT_KEYWORD = 4, + UCOL_SIT_PROVIDER = 5, + UCOL_SIT_LOCELEMENT_MAX = UCOL_SIT_PROVIDER, /* the last element that's part of LocElements */ + + UCOL_SIT_BCP47, UCOL_SIT_STRENGTH, UCOL_SIT_CASE_LEVEL, UCOL_SIT_CASE_FIRST, @@ -44,34 +54,6 @@ enum OptionsList { UCOL_SIT_ITEMS_COUNT }; -/* list of locales for packing of a collator to an integer. - * This list corresponds to ICU 3.0. If more collation bearing - * locales are added in the future, this won't be a simple array - * but a mapping allowing forward and reverse lookup would have to - * be established. Currently, the mapping is from locale name to - * index. - */ -static const char* const locales[] = { -/* 00 - 09 */ "ar", "be", "bg", "ca", "cs", "da", "de", "de__PHONEBOOK", "el", "en", -/* 10 - 19 */ "en_BE", "eo", "es", "es__TRADITIONAL", "et", "fa", "fa_AF", "fi", "fo", "fr", -/* 20 - 29 */ "gu", "he", "hi", "hi__DIRECT", "hr", "hu", "is", "it", "ja", "kk", -/* 30 - 39 */ "kl", "kn", "ko", "lt", "lv", "mk", "mr", "mt", "nb", "nn", -/* 40 - 49 */ "om", "pa", "pl", "ps", "ro", "root", "ru", "sh", "sk", "sl", -/* 50 - 59 */ "sq", "sr", "sv", "ta", "te", "th", "tr", "uk", "vi", "zh", -/* 60 - 64 */ "zh_HK", "zh_MO", "zh_TW", "zh_TW_STROKE", "zh__PINYIN" -}; - -static const char* const keywords[] = { -/* 00 */ "", -/* 01 */ "direct", -/* 02 */ "phonebook", -/* 03 */ "pinyin", -/* 04 */ "standard", -/* 05 */ "stroke", -/* 06 */ "traditional" -}; - - /* option starters chars. */ static const char alternateHArg = 'A'; static const char variableTopValArg = 'B'; @@ -83,6 +65,7 @@ static const char hiraganaQArg = 'H'; static const char keywordArg = 'K'; static const char languageArg = 'L'; static const char normArg = 'N'; +static const char providerArg = 'P'; static const char regionArg = 'R'; static const char strengthArg = 'S'; static const char variableTopArg = 'T'; @@ -91,10 +74,13 @@ static const char RFC3066Arg = 'X'; static const char scriptArg = 'Z'; static const char collationKeyword[] = "@collation="; +static const char providerKeyword[] = "@sp="; -static const int32_t locElementCount = 5; + +static const int32_t locElementCount = UCOL_SIT_LOCELEMENT_MAX+1; static const int32_t locElementCapacity = 32; static const int32_t loc3066Capacity = 256; +static const int32_t locProviderCapacity = 10; static const int32_t internalBufferSize = 512; /* structure containing specification of a collator. Initialized @@ -104,6 +90,7 @@ static const int32_t internalBufferSize = 512; struct CollatorSpec { char locElements[locElementCount][locElementCapacity]; char locale[loc3066Capacity]; + char provider[locProviderCapacity]; UColAttributeValue options[UCOL_ATTRIBUTE_COUNT]; uint32_t variableTopValue; UChar variableTopString[locElementCapacity]; @@ -140,19 +127,7 @@ static const AttributeConversion conversions[12] = { }; -static char -ucol_sit_attributeValueToLetter(UColAttributeValue value, UErrorCode *status) { - uint32_t i = 0; - for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { - if(conversions[i].value == value) { - return conversions[i].letter; - } - } - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; -} - -static UColAttributeValue +static UColAttributeValue ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) { uint32_t i = 0; for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { @@ -161,6 +136,9 @@ ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) { } } *status = U_ILLEGAL_ARGUMENT_ERROR; +#ifdef UCOL_TRACE_SIT + fprintf(stderr, "%s:%d: unknown letter %c: %s\n", __FILE__, __LINE__, letter, u_errorName(*status)); +#endif return UCOL_DEFAULT; } @@ -173,12 +151,12 @@ U_CDECL_END U_CDECL_BEGIN static const char* U_CALLCONV -_processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string, - UErrorCode *status) +_processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string, + UErrorCode *status) { int32_t len = 0; do { - if(value == 0 || value == 4) { + if(value == UCOL_SIT_LANGUAGE || value == UCOL_SIT_KEYWORD || value == UCOL_SIT_PROVIDER) { spec->locElements[value][len++] = uprv_tolower(*string); } else { spec->locElements[value][len++] = *string; @@ -195,8 +173,8 @@ U_CDECL_END U_CDECL_BEGIN static const char* U_CALLCONV -_processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string, - UErrorCode *status) +_processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string, + UErrorCode *status) { char terminator = *string; string++; @@ -214,11 +192,14 @@ U_CDECL_END U_CDECL_BEGIN static const char* U_CALLCONV -_processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string, - UErrorCode *status) +_processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string, + UErrorCode *status) { spec->options[option] = ucol_sit_letterToAttributeValue(*string, status); if((*(++string) != '_' && *string) || U_FAILURE(*status)) { +#ifdef UCOL_TRACE_SIT + fprintf(stderr, "%s:%d: unknown collator option at '%s': %s\n", __FILE__, __LINE__, string, u_errorName(*status)); +#endif *status = U_ILLEGAL_ARGUMENT_ERROR; } return string; @@ -226,8 +207,8 @@ _processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string, U_CDECL_END -static UChar -readHexCodeUnit(const char **string, UErrorCode *status) +static UChar +readHexCodeUnit(const char **string, UErrorCode *status) { UChar result = 0; int32_t value = 0; @@ -242,6 +223,9 @@ readHexCodeUnit(const char **string, UErrorCode *status) value = c - 'A' + 10; } else { *status = U_ILLEGAL_ARGUMENT_ERROR; +#ifdef UCOL_TRACE_SIT + fprintf(stderr, "%s:%d: Bad hex char at '%s': %s\n", __FILE__, __LINE__, *string, u_errorName(*status)); +#endif return 0; } result = (result << 4) | (UChar)value; @@ -251,13 +235,16 @@ readHexCodeUnit(const char **string, UErrorCode *status) // if the string was terminated before we read 4 digits, set an error if(noDigits < 4) { *status = U_ILLEGAL_ARGUMENT_ERROR; +#ifdef UCOL_TRACE_SIT + fprintf(stderr, "%s:%d: Short (only %d digits, wanted 4) at '%s': %s\n", __FILE__, __LINE__, noDigits,*string, u_errorName(*status)); +#endif } return result; } U_CDECL_BEGIN static const char* U_CALLCONV -_processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status) +_processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status) { // get four digits int32_t i = 0; @@ -266,7 +253,7 @@ _processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UEr spec->variableTopString[i++] = readHexCodeUnit(&string, status); } spec->variableTopStringLen = i; - if(i == locElementCapacity && (*string != 0 || *string != '_')) { + if(i == locElementCapacity && *string != 0 && *string != '_') { *status = U_BUFFER_OVERFLOW_ERROR; } } else { @@ -274,7 +261,7 @@ _processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UEr } if(U_SUCCESS(*status)) { spec->variableTopSet = TRUE; - } + } return string; } U_CDECL_END @@ -289,28 +276,29 @@ struct ShortStringOptions { static const ShortStringOptions options[UCOL_SIT_ITEMS_COUNT] = { -/* 10 ALTERNATE_HANDLING */ {alternateHArg, _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate N, S, D +/* 10 ALTERNATE_HANDLING */ {alternateHArg, _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate N, S, D /* 15 VARIABLE_TOP_VALUE */ {variableTopValArg, _processVariableTop, 1 }, /* 08 CASE_FIRST */ {caseFirstArg, _processCollatorOption, UCOL_CASE_FIRST }, // case first L, U, X, D /* 09 NUMERIC_COLLATION */ {numericCollArg, _processCollatorOption, UCOL_NUMERIC_COLLATION }, // codan O, X, D /* 07 CASE_LEVEL */ {caseLevelArg, _processCollatorOption, UCOL_CASE_LEVEL }, // case level O, X, D /* 12 FRENCH_COLLATION */ {frenchCollArg, _processCollatorOption, UCOL_FRENCH_COLLATION }, // french O, X, D /* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg, _processCollatorOption, UCOL_HIRAGANA_QUATERNARY_MODE }, // hiragana O, X, D -/* 04 KEYWORD */ {keywordArg, _processLocaleElement, 4 }, // keyword -/* 00 LANGUAGE */ {languageArg, _processLocaleElement, 0 }, // language +/* 04 KEYWORD */ {keywordArg, _processLocaleElement, UCOL_SIT_KEYWORD }, // keyword +/* 00 LANGUAGE */ {languageArg, _processLocaleElement, UCOL_SIT_LANGUAGE }, // language /* 11 NORMALIZATION_MODE */ {normArg, _processCollatorOption, UCOL_NORMALIZATION_MODE }, // norm O, X, D -/* 02 REGION */ {regionArg, _processLocaleElement, 2 }, // region +/* 02 REGION */ {regionArg, _processLocaleElement, UCOL_SIT_REGION }, // region /* 06 STRENGTH */ {strengthArg, _processCollatorOption, UCOL_STRENGTH }, // strength 1, 2, 3, 4, I, D /* 14 VARIABLE_TOP */ {variableTopArg, _processVariableTop, 0 }, -/* 03 VARIANT */ {variantArg, _processLocaleElement, 3 }, // variant +/* 03 VARIANT */ {variantArg, _processLocaleElement, UCOL_SIT_VARIANT }, // variant /* 05 RFC3066BIS */ {RFC3066Arg, _processRFC3066Locale, 0 }, // rfc3066bis locale name -/* 01 SCRIPT */ {scriptArg, _processLocaleElement, 1 } // script +/* 01 SCRIPT */ {scriptArg, _processLocaleElement, UCOL_SIT_SCRIPT }, // script +/* PROVIDER */ {providerArg, _processLocaleElement, UCOL_SIT_PROVIDER } }; static -const char* ucol_sit_readOption(const char *start, CollatorSpec *spec, - UErrorCode *status) +const char* ucol_sit_readOption(const char *start, CollatorSpec *spec, + UErrorCode *status) { int32_t i = 0; @@ -318,16 +306,19 @@ const char* ucol_sit_readOption(const char *start, CollatorSpec *spec, if(*start == options[i].optionStart) { spec->entries[i].start = start; const char* end = options[i].action(spec, options[i].attr, start+1, status); - spec->entries[i].len = end - start; + spec->entries[i].len = (int32_t)(end - start); return end; } } *status = U_ILLEGAL_ARGUMENT_ERROR; +#ifdef UCOL_TRACE_SIT + fprintf(stderr, "%s:%d: Unknown option at '%s': %s\n", __FILE__, __LINE__, start, u_errorName(*status)); +#endif return start; } static -void ucol_sit_initCollatorSpecs(CollatorSpec *spec) +void ucol_sit_initCollatorSpecs(CollatorSpec *spec) { // reset everything uprv_memset(spec, 0, sizeof(CollatorSpec)); @@ -338,12 +329,12 @@ void ucol_sit_initCollatorSpecs(CollatorSpec *spec) } } -static const char* -ucol_sit_readSpecs(CollatorSpec *s, const char *string, +static const char* +ucol_sit_readSpecs(CollatorSpec *s, const char *string, UParseError *parseError, UErrorCode *status) { const char *definition = string; - while(U_SUCCESS(*status) && *string) { + while(U_SUCCESS(*status) && *string) { string = ucol_sit_readOption(string, s, status); // advance over '_' while(*string && *string == '_') { @@ -351,7 +342,7 @@ ucol_sit_readSpecs(CollatorSpec *s, const char *string, } } if(U_FAILURE(*status)) { - parseError->offset = string - definition; + parseError->offset = (int32_t)(string - definition); } return string; } @@ -361,7 +352,7 @@ int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, { int32_t i = 0, j = 0; int32_t len = 0; - char optName; + char optName; if(U_SUCCESS(*status)) { for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { if(s->entries[i].start) { @@ -370,7 +361,7 @@ int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, uprv_strcat(destination, "_"); } len++; - } + } optName = *(s->entries[i].start); if(optName == languageArg || optName == regionArg || optName == variantArg || optName == keywordArg) { for(j = 0; j < s->entries[i].len; j++) { @@ -399,29 +390,35 @@ ucol_sit_calculateWholeLocale(CollatorSpec *s) { // locale if(s->locale[0] == 0) { // first the language - uprv_strcat(s->locale, s->locElements[0]); + uprv_strcat(s->locale, s->locElements[UCOL_SIT_LANGUAGE]); // then the script, if present - if(*(s->locElements[1])) { + if(*(s->locElements[UCOL_SIT_SCRIPT])) { uprv_strcat(s->locale, "_"); - uprv_strcat(s->locale, s->locElements[1]); + uprv_strcat(s->locale, s->locElements[UCOL_SIT_SCRIPT]); } // then the region, if present - if(*(s->locElements[2])) { + if(*(s->locElements[UCOL_SIT_REGION])) { uprv_strcat(s->locale, "_"); - uprv_strcat(s->locale, s->locElements[2]); - } else if(*(s->locElements[3])) { // if there is a variant, we need an underscore + uprv_strcat(s->locale, s->locElements[UCOL_SIT_REGION]); + } else if(*(s->locElements[UCOL_SIT_VARIANT])) { // if there is a variant, we need an underscore uprv_strcat(s->locale, "_"); } // add variant, if there - if(*(s->locElements[3])) { + if(*(s->locElements[UCOL_SIT_VARIANT])) { uprv_strcat(s->locale, "_"); - uprv_strcat(s->locale, s->locElements[3]); + uprv_strcat(s->locale, s->locElements[UCOL_SIT_VARIANT]); } // if there is a collation keyword, add that too - if(*(s->locElements[4])) { + if(*(s->locElements[UCOL_SIT_KEYWORD])) { uprv_strcat(s->locale, collationKeyword); - uprv_strcat(s->locale, s->locElements[4]); + uprv_strcat(s->locale, s->locElements[UCOL_SIT_KEYWORD]); + } + + // if there is a provider keyword, add that too + if(*(s->locElements[UCOL_SIT_PROVIDER])) { + uprv_strcat(s->locale, providerKeyword); + uprv_strcat(s->locale, s->locElements[UCOL_SIT_PROVIDER]); } } } @@ -451,12 +448,11 @@ ucol_prepareShortStringOpen( const char *definition, // settings // analyse the string in order to get everything we need. - const char *string = definition; CollatorSpec s; ucol_sit_initCollatorSpecs(&s); - string = ucol_sit_readSpecs(&s, definition, parseError, status); + ucol_sit_readSpecs(&s, definition, parseError, status); ucol_sit_calculateWholeLocale(&s); - + char buffer[internalBufferSize]; uprv_memset(buffer, 0, internalBufferSize); uloc_canonicalize(s.locale, buffer, internalBufferSize, status); @@ -520,7 +516,7 @@ ucol_openFromShortString( const char *definition, ucol_sit_initCollatorSpecs(&s); string = ucol_sit_readSpecs(&s, definition, parseError, status); ucol_sit_calculateWholeLocale(&s); - + char buffer[internalBufferSize]; uprv_memset(buffer, 0, internalBufferSize); uloc_canonicalize(s.locale, buffer, internalBufferSize, status); @@ -535,7 +531,7 @@ ucol_openFromShortString( const char *definition, } if(U_FAILURE(*status)) { - parseError->offset = string - definition; + parseError->offset = (int32_t)(string - definition); ucol_close(result); return NULL; } @@ -561,23 +557,6 @@ ucol_openFromShortString( const char *definition, } -static void appendShortStringElement(const char *src, int32_t len, char *result, int32_t *resultSize, int32_t capacity, char arg) -{ - if(len) { - if(*resultSize) { - if(*resultSize < capacity) { - uprv_strcat(result, "_"); - } - (*resultSize)++; - } - *resultSize += len + 1; - if(*resultSize < capacity) { - uprv_strncat(result, &arg, 1); - uprv_strncat(result, src, len); - } - } -} - U_CAPI int32_t U_EXPORT2 ucol_getShortDefinitionString(const UCollator *coll, const char *locale, @@ -586,56 +565,11 @@ ucol_getShortDefinitionString(const UCollator *coll, UErrorCode *status) { if(U_FAILURE(*status)) return 0; - char buffer[internalBufferSize]; - uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); - int32_t resultSize = 0; - char tempbuff[internalBufferSize]; - char locBuff[internalBufferSize]; - uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); - int32_t elementSize = 0; - UBool isAvailable = 0; - CollatorSpec s; - ucol_sit_initCollatorSpecs(&s); - - if(!locale) { - locale = ucol_getLocale(coll, ULOC_VALID_LOCALE, status); - } - elementSize = ucol_getFunctionalEquivalent(locBuff, internalBufferSize, "collation", locale, &isAvailable, status); - - if(elementSize) { - // we should probably canonicalize here... - elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, languageArg); - elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, regionArg); - elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, scriptArg); - elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variantArg); - elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, keywordArg); - } - - int32_t i = 0; - UColAttributeValue attribute = UCOL_DEFAULT; - for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { - if(options[i].action == _processCollatorOption) { - attribute = ucol_getAttributeOrDefault(coll, (UColAttribute)options[i].attr, status); - if(attribute != UCOL_DEFAULT) { - char letter = ucol_sit_attributeValueToLetter(attribute, status); - appendShortStringElement(&letter, 1, - buffer, &resultSize, capacity, options[i].optionStart); - } - } - } - if(coll->variableTopValueisDefault == FALSE) { - //s.variableTopValue = ucol_getVariableTop(coll, status); - elementSize = T_CString_integerToString(tempbuff, coll->variableTopValue, 16); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variableTopValArg); + if(coll == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; } - - UParseError parseError; - return ucol_normalizeShortDefinitionString(buffer, dst, capacity, &parseError, status); + return ((icu::Collator*)coll)->internalGetShortDefinitionString(locale,dst,capacity,*status); } U_CAPI int32_t U_EXPORT2 @@ -666,406 +600,6 @@ ucol_normalizeShortDefinitionString(const char *definition, return ucol_sit_dumpSpecs(&s, destination, capacity, status); } -// structure for packing the bits of the attributes in the -// identifier number. -// locale is packed separately -struct bitPacking { - char letter; - uint32_t offset; - uint32_t width; - UColAttribute attribute; - UColAttributeValue values[6]; -}; - -static const bitPacking attributesToBits[UCOL_ATTRIBUTE_COUNT] = { - /* french */ { frenchCollArg, 29, 2, UCOL_FRENCH_COLLATION, { UCOL_DEFAULT, UCOL_OFF, UCOL_ON }}, - /* alternate */ { alternateHArg, 27, 2, UCOL_ALTERNATE_HANDLING, { UCOL_DEFAULT, UCOL_NON_IGNORABLE, UCOL_SHIFTED }}, - /* case first */ { caseFirstArg, 25, 2, UCOL_CASE_FIRST, { UCOL_DEFAULT, UCOL_OFF, UCOL_LOWER_FIRST, UCOL_UPPER_FIRST }}, - /* case level */ { caseLevelArg, 23, 2, UCOL_CASE_LEVEL, { UCOL_DEFAULT, UCOL_OFF, UCOL_ON }}, - /* normalization */ { normArg, 21, 2, UCOL_NORMALIZATION_MODE, { UCOL_DEFAULT, UCOL_OFF, UCOL_ON }}, - /* strength */ { strengthArg, 18, 3, UCOL_STRENGTH, { UCOL_DEFAULT, UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL }}, - /* hiragana */ { hiraganaQArg, 16, 2, UCOL_HIRAGANA_QUATERNARY_MODE, { UCOL_DEFAULT, UCOL_OFF, UCOL_ON }}, - /* numeric coll */ { numericCollArg, 14, 2, UCOL_NUMERIC_COLLATION, { UCOL_DEFAULT, UCOL_OFF, UCOL_ON }} -}; - -static const uint32_t keywordShift = 9; -static const uint32_t keywordWidth = 5; -static const uint32_t localeShift = 0; -static const uint32_t localeWidth = 7; - - -static uint32_t ucol_sit_putLocaleInIdentifier(uint32_t result, const char* locale, UErrorCode* status) { - char buffer[internalBufferSize], keywordBuffer[internalBufferSize], - baseName[internalBufferSize], localeBuffer[internalBufferSize]; - int32_t len = 0, keywordLen = 0, - baseNameLen = 0, localeLen = 0; - uint32_t i = 0; - UBool isAvailable = FALSE; - if(locale) { - len = uloc_canonicalize(locale, buffer, internalBufferSize, status); - localeLen = ucol_getFunctionalEquivalent(localeBuffer, internalBufferSize, "collation", buffer, &isAvailable, status); - keywordLen = uloc_getKeywordValue(buffer, "collation", keywordBuffer, internalBufferSize, status); - baseNameLen = uloc_getBaseName(buffer, baseName, internalBufferSize, status); - - /*Binary search for the map entry for normal cases */ - - uint32_t low = 0; - uint32_t high = sizeof(locales)/sizeof(locales[0]); - uint32_t mid = high; - uint32_t oldmid = 0; - int32_t compVal = 0; - - - while (high > low) /*binary search*/{ - - mid = (high+low) >> 1; /*Finds median*/ - - if (mid == oldmid) - return UCOL_SIT_COLLATOR_NOT_ENCODABLE; // we didn't find it - - compVal = uprv_strcmp(baseName, locales[mid]); - if (compVal < 0){ - high = mid; - } - else if (compVal > 0){ - low = mid; - } - else /*we found it*/{ - break; - } - oldmid = mid; - } - - result |= (mid & ((1 << localeWidth) - 1)) << localeShift; - } - - if(keywordLen) { - for(i = 1; i < sizeof(keywords)/sizeof(keywords[0]); i++) { - if(uprv_strcmp(keywords[i], keywordBuffer) == 0) { - result |= (i & ((1 << keywordWidth) - 1)) << keywordShift; - break; - } - } - } - return result; -} - -U_CAPI uint32_t U_EXPORT2 -ucol_collatorToIdentifier(const UCollator *coll, - const char *locale, - UErrorCode *status) -{ - uint32_t result = 0; - uint32_t i = 0, j = 0; - UColAttributeValue attrValue = UCOL_DEFAULT; - - // if variable top is not default, we need to use strings - if(coll->variableTopValueisDefault != TRUE) { - return UCOL_SIT_COLLATOR_NOT_ENCODABLE; - } - - if(locale == NULL) { - locale = ucol_getLocale(coll, ULOC_VALID_LOCALE, status); - } - - result = ucol_sit_putLocaleInIdentifier(result, locale, status); - - for(i = 0; i < sizeof(attributesToBits)/sizeof(attributesToBits[0]); i++) { - attrValue = ucol_getAttributeOrDefault(coll, attributesToBits[i].attribute, status); - j = 0; - while(attributesToBits[i].values[j] != attrValue) { - j++; - } - result |= (j & ((1 << attributesToBits[i].width) - 1)) << attributesToBits[i].offset; - } - - return result; -} - -U_CAPI UCollator* U_EXPORT2 -ucol_openFromIdentifier(uint32_t identifier, - UBool forceDefaults, - UErrorCode *status) -{ - uint32_t i = 0; - int32_t value = 0, keyword = 0; - char locale[internalBufferSize]; - - value = (identifier >> localeShift) & ((1 << localeWidth) - 1); - keyword = (identifier >> keywordShift) & ((1 << keywordWidth) - 1); - - uprv_strcpy(locale, locales[value]); - - if(keyword) { - uprv_strcat(locale, collationKeyword); - uprv_strcat(locale, keywords[keyword]); - } - - UColAttributeValue attrValue = UCOL_DEFAULT; - - UCollator *result = ucol_open(locale, status); - - // variable top is not set in the identifier, so we can easily skip that on - - for(i = 0; i < sizeof(attributesToBits)/sizeof(attributesToBits[0]); i++) { - value = (identifier >> attributesToBits[i].offset) & ((1 << attributesToBits[i].width) - 1); - attrValue = attributesToBits[i].values[value]; - // the collator is all default, so we will set only the values that will differ from - // the default values. - if(attrValue != UCOL_DEFAULT) { - if(forceDefaults || - ucol_getAttribute(result, attributesToBits[i].attribute, status) != attrValue) { - ucol_setAttribute(result, attributesToBits[i].attribute, attrValue, status); - } - } - } - - return result; -} - -U_CAPI int32_t U_EXPORT2 -ucol_identifierToShortString(uint32_t identifier, - char *buffer, - int32_t capacity, - UBool forceDefaults, - UErrorCode *status) -{ - int32_t locIndex = (identifier >> localeShift) & ((1 << localeWidth) - 1); - int32_t keywordIndex = (identifier >> keywordShift) & ((1 << keywordWidth) - 1); - CollatorSpec s; - ucol_sit_initCollatorSpecs(&s); - uprv_strcpy(s.locale, locales[locIndex]); - if(keywordIndex) { - uprv_strcat(s.locale, collationKeyword); - uprv_strcat(s.locale, keywords[keywordIndex]); - } - UCollator *coll = ucol_openFromIdentifier(identifier, forceDefaults, status); - int32_t resultLen = ucol_getShortDefinitionString(coll, s.locale, buffer, capacity, status); - ucol_close(coll); - return resultLen; - -#if 0 - // TODO: Crumy, crumy, crumy... Very hard to currently go algorithmically from - // identifier to short string. Do rethink - if(forceDefaults == FALSE) { - UCollator *coll = ucol_openFromIdentifier(identifier, FALSE, status); - int32_t resultLen = ucol_getShortDefinitionString(coll, s.locale, buffer, capacity, status); - ucol_close(coll); - return resultLen; - } else { // forceDefaults == TRUE - char letter; - UColAttributeValue value; - int32_t i = 0; - for(i = 0; i < sizeof(attributesToBits)/sizeof(attributesToBits[0]); i++) { - value = attributesToBits[i].values[(identifier >> attributesToBits[i].offset) & ((1 << attributesToBits[i].width) - 1)]; - if(value != UCOL_DEFAULT) { - uprv_strcat(buffer, "_"); - uprv_strncat(buffer, &attributesToBits[i].letter, 1); - letter = ucol_sit_attributeValueToLetter(value, status); - uprv_strncat(buffer, &letter, 1); - } - } - return ucol_sit_dumpSpecs(&s, buffer, capacity, status); - } -#endif -} - -U_CAPI uint32_t U_EXPORT2 -ucol_shortStringToIdentifier(const char *definition, - UBool forceDefaults, - UErrorCode *status) -{ - UParseError parseError; - CollatorSpec s; - uint32_t result = 0; - uint32_t i = 0, j = 0; - ucol_sit_initCollatorSpecs(&s); - - ucol_sit_readSpecs(&s, definition, &parseError, status); - ucol_sit_calculateWholeLocale(&s); - - char locBuffer[internalBufferSize]; - UBool isAvailable = FALSE; - UColAttributeValue attrValue = UCOL_DEFAULT; - - ucol_getFunctionalEquivalent(locBuffer, internalBufferSize, "collation", s.locale, &isAvailable, status); - - if(forceDefaults == FALSE) { - UCollator *coll = ucol_openFromShortString(definition, FALSE, &parseError, status); - result = ucol_collatorToIdentifier(coll, locBuffer, status); - ucol_close(coll); - } else { // forceDefaults == TRUE - result = ucol_sit_putLocaleInIdentifier(result, locBuffer, status); - - for(i = 0; i < sizeof(attributesToBits)/sizeof(attributesToBits[0]); i++) { - attrValue = s.options[i]; - j = 0; - while(attributesToBits[i].values[j] != attrValue) { - j++; - } - result |= (j & ((1 << attributesToBits[i].width) - 1)) << attributesToBits[i].offset; - } - - } - return result; - -} - -U_CAPI UColAttributeValue U_EXPORT2 -ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status) -{ - if(U_FAILURE(*status) || coll == NULL) { - return UCOL_DEFAULT; - } - switch(attr) { - case UCOL_NUMERIC_COLLATION: - return coll->numericCollationisDefault?UCOL_DEFAULT:coll->numericCollation; - case UCOL_HIRAGANA_QUATERNARY_MODE: - return coll->hiraganaQisDefault?UCOL_DEFAULT:coll->hiraganaQ; - case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ - return coll->frenchCollationisDefault?UCOL_DEFAULT:coll->frenchCollation; - case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ - return coll->alternateHandlingisDefault?UCOL_DEFAULT:coll->alternateHandling; - case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ - return coll->caseFirstisDefault?UCOL_DEFAULT:coll->caseFirst; - case UCOL_CASE_LEVEL: /* do we have an extra case level */ - return coll->caseLevelisDefault?UCOL_DEFAULT:coll->caseLevel; - case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ - return coll->normalizationModeisDefault?UCOL_DEFAULT:coll->normalizationMode; - case UCOL_STRENGTH: /* attribute for strength */ - return coll->strengthisDefault?UCOL_DEFAULT:coll->strength; - case UCOL_ATTRIBUTE_COUNT: - default: - *status = U_ILLEGAL_ARGUMENT_ERROR; - break; - } - return UCOL_DEFAULT; -} - - -struct contContext { - const UCollator *coll; - USet *conts; - USet *expansions; - USet *removedContractions; - UBool addPrefixes; - UErrorCode *status; -}; - - - -static void -addSpecial(contContext *context, UChar *buffer, int32_t bufLen, - uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status) -{ - const UCollator *coll = context->coll; - USet *contractions = context->conts; - USet *expansions = context->expansions; - UBool addPrefixes = context->addPrefixes; - - const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); - uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); - // we might have a contraction that ends from previous level - if(newCE != UCOL_NOT_FOUND) { - if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) { - addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); - } - if(contractions && rightIndex-leftIndex > 1) { - uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); - if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) { - uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); - } - } - } - - UCharOffset++; - // check whether we're doing contraction or prefix - if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) { - if(leftIndex == 0) { - *status = U_INTERNAL_PROGRAM_ERROR; - return; - } - --leftIndex; - while(*UCharOffset != 0xFFFF) { - newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); - buffer[leftIndex] = *UCharOffset; - if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { - addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); - } else { - if(contractions) { - uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); - } - if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { - uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); - } - } - UCharOffset++; - } - } else if(getCETag(CE) == CONTRACTION_TAG) { - if(rightIndex == bufLen-1) { - *status = U_INTERNAL_PROGRAM_ERROR; - return; - } - while(*UCharOffset != 0xFFFF) { - newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); - buffer[rightIndex] = *UCharOffset; - if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { - addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status); - } else { - if(contractions) { - uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex); - } - if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { - uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex); - } - } - UCharOffset++; - } - } - -} - -U_CDECL_BEGIN -static UBool U_CALLCONV -_processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE) -{ - UErrorCode *status = ((contContext *)context)->status; - USet *expansions = ((contContext *)context)->expansions; - USet *removed = ((contContext *)context)->removedContractions; - UBool addPrefixes = ((contContext *)context)->addPrefixes; - UChar contraction[internalBufferSize]; - if(isSpecial(CE)) { - if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) { - while(start < limit && U_SUCCESS(*status)) { - // if there are suppressed contractions, we don't - // want to add them. - if(removed && uset_contains(removed, start)) { - start++; - continue; - } - // we start our contraction from middle, since we don't know if it - // will grow toward right or left - contraction[internalBufferSize/2] = (UChar)start; - addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status); - start++; - } - } else if(expansions && getCETag(CE) == EXPANSION_TAG) { - while(start < limit && U_SUCCESS(*status)) { - uset_add(expansions, start++); - } - } - } - if(U_FAILURE(*status)) { - return FALSE; - } else { - return TRUE; - } -} - -U_CDECL_END - - - /** * Get a set containing the contractions defined by the collator. The set includes * both the UCA contractions and the contractions defined by the collator @@ -1073,8 +607,6 @@ U_CDECL_END * @param conts the set to hold the result * @param status to hold the error code * @return the size of the contraction set - * - * @draft ICU 3.0 */ U_CAPI int32_t U_EXPORT2 ucol_getContractions( const UCollator *coll, @@ -1109,77 +641,14 @@ ucol_getContractionsAndExpansions( const UCollator *coll, *status = U_ILLEGAL_ARGUMENT_ERROR; return; } - - if(contractions) { - uset_clear(contractions); - } - if(expansions) { - uset_clear(expansions); - } - int32_t rulesLen = 0; - const UChar* rules = ucol_getRules(coll, &rulesLen); - UColTokenParser src; - ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status); - - contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status }; - - // Add the UCA contractions - c.coll = coll->UCA; - utrie_enum(&coll->UCA->mapping, NULL, _processSpecials, &c); - - // This is collator specific. Add contractions from a collator - c.coll = coll; - c.removedContractions = NULL; - utrie_enum(&coll->mapping, NULL, _processSpecials, &c); - ucol_tok_closeTokenList(&src); -} - -U_CAPI int32_t U_EXPORT2 -ucol_getUnsafeSet( const UCollator *coll, - USet *unsafe, - UErrorCode *status) -{ - UChar buffer[internalBufferSize]; - int32_t len = 0; - - uset_clear(unsafe); - - // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant - static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, - 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; - - // add chars that fail the fcd check - uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); - - // add Thai/Lao prevowels - uset_addRange(unsafe, 0xe40, 0xe44); - uset_addRange(unsafe, 0xec0, 0xec4); - // add lead/trail surrogates - uset_addRange(unsafe, 0xd800, 0xdfff); - - USet *contractions = uset_open(0,0); - - int32_t i = 0, j = 0; - int32_t contsSize = ucol_getContractions(coll, contractions, status); - UChar32 c = 0; - // Contraction set consists only of strings - // to get unsafe code points, we need to - // break the strings apart and add them to the unsafe set - for(i = 0; i < contsSize; i++) { - len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); - if(len > 0) { - j = 0; - while(j < len) { - U16_NEXT(buffer, j, len, c); - if(j < len) { - uset_add(unsafe, c); - } - } - } + const icu::RuleBasedCollator *rbc = icu::RuleBasedCollator::rbcFromUCollator(coll); + if(rbc == NULL) { + *status = U_UNSUPPORTED_ERROR; + return; } - - uset_close(contractions); - - return uset_size(unsafe); + rbc->internalGetContractionsAndExpansions( + icu::UnicodeSet::fromUSet(contractions), + icu::UnicodeSet::fromUSet(expansions), + addPrefixes, *status); } #endif