X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/46f4442e9a5a4f3b98b7c1083586332f6a8a99a4..0f5d89e82340278ed3d7d50029f37cab2c41a57e:/icuSources/i18n/ucol_sit.cpp diff --git a/icuSources/i18n/ucol_sit.cpp b/icuSources/i18n/ucol_sit.cpp index 96fc7b8a..43c1f5d6 100644 --- a/icuSources/i18n/ucol_sit.cpp +++ b/icuSources/i18n/ucol_sit.cpp @@ -1,10 +1,12 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* -* Copyright (C) 2004-2008, International Business Machines +* Copyright (C) 2004-2016, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: ucol_sit.cpp -* encoding: US-ASCII +* encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * @@ -15,23 +17,31 @@ #include "unicode/ustring.h" #include "unicode/udata.h" - +#include "unicode/utf16.h" #include "utracimp.h" #include "ucol_imp.h" -#include "ucol_tok.h" -#include "unormimp.h" #include "cmemory.h" #include "cstring.h" #include "uresimp.h" +#include "unicode/coll.h" + +#ifdef UCOL_TRACE_SIT +# include +#endif #if !UCONFIG_NO_COLLATION +#include "unicode/tblcoll.h" + enum OptionsList { UCOL_SIT_LANGUAGE = 0, - UCOL_SIT_SCRIPT, - UCOL_SIT_REGION, - UCOL_SIT_VARIANT, - UCOL_SIT_KEYWORD, + UCOL_SIT_SCRIPT = 1, + UCOL_SIT_REGION = 2, + UCOL_SIT_VARIANT = 3, + UCOL_SIT_KEYWORD = 4, + UCOL_SIT_PROVIDER = 5, + UCOL_SIT_LOCELEMENT_MAX = UCOL_SIT_PROVIDER, /* the last element that's part of LocElements */ + UCOL_SIT_BCP47, UCOL_SIT_STRENGTH, UCOL_SIT_CASE_LEVEL, @@ -57,6 +67,7 @@ static const char hiraganaQArg = 'H'; static const char keywordArg = 'K'; static const char languageArg = 'L'; static const char normArg = 'N'; +static const char providerArg = 'P'; static const char regionArg = 'R'; static const char strengthArg = 'S'; static const char variableTopArg = 'T'; @@ -65,10 +76,13 @@ static const char RFC3066Arg = 'X'; static const char scriptArg = 'Z'; static const char collationKeyword[] = "@collation="; +static const char providerKeyword[] = "@sp="; + -static const int32_t locElementCount = 5; +static const int32_t locElementCount = UCOL_SIT_LOCELEMENT_MAX+1; static const int32_t locElementCapacity = 32; static const int32_t loc3066Capacity = 256; +static const int32_t locProviderCapacity = 10; static const int32_t internalBufferSize = 512; /* structure containing specification of a collator. Initialized @@ -78,6 +92,7 @@ static const int32_t internalBufferSize = 512; struct CollatorSpec { char locElements[locElementCount][locElementCapacity]; char locale[loc3066Capacity]; + char provider[locProviderCapacity]; UColAttributeValue options[UCOL_ATTRIBUTE_COUNT]; uint32_t variableTopValue; UChar variableTopString[locElementCapacity]; @@ -114,27 +129,18 @@ static const AttributeConversion conversions[12] = { }; -static char -ucol_sit_attributeValueToLetter(UColAttributeValue value, UErrorCode *status) { - uint32_t i = 0; - for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { - if(conversions[i].value == value) { - return conversions[i].letter; - } - } - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; -} - -static UColAttributeValue +static UColAttributeValue ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) { uint32_t i = 0; - for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { + for(i = 0; i < UPRV_LENGTHOF(conversions); i++) { if(conversions[i].letter == letter) { return conversions[i].value; } } *status = U_ILLEGAL_ARGUMENT_ERROR; +#ifdef UCOL_TRACE_SIT + fprintf(stderr, "%s:%d: unknown letter %c: %s\n", __FILE__, __LINE__, letter, u_errorName(*status)); +#endif return UCOL_DEFAULT; } @@ -147,12 +153,12 @@ U_CDECL_END U_CDECL_BEGIN static const char* U_CALLCONV -_processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string, - UErrorCode *status) +_processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string, + UErrorCode *status) { int32_t len = 0; do { - if(value == 0 || value == 4) { + if(value == UCOL_SIT_LANGUAGE || value == UCOL_SIT_KEYWORD || value == UCOL_SIT_PROVIDER) { spec->locElements[value][len++] = uprv_tolower(*string); } else { spec->locElements[value][len++] = *string; @@ -169,8 +175,8 @@ U_CDECL_END U_CDECL_BEGIN static const char* U_CALLCONV -_processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string, - UErrorCode *status) +_processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string, + UErrorCode *status) { char terminator = *string; string++; @@ -188,11 +194,14 @@ U_CDECL_END U_CDECL_BEGIN static const char* U_CALLCONV -_processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string, - UErrorCode *status) +_processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string, + UErrorCode *status) { spec->options[option] = ucol_sit_letterToAttributeValue(*string, status); if((*(++string) != '_' && *string) || U_FAILURE(*status)) { +#ifdef UCOL_TRACE_SIT + fprintf(stderr, "%s:%d: unknown collator option at '%s': %s\n", __FILE__, __LINE__, string, u_errorName(*status)); +#endif *status = U_ILLEGAL_ARGUMENT_ERROR; } return string; @@ -200,8 +209,8 @@ _processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string, U_CDECL_END -static UChar -readHexCodeUnit(const char **string, UErrorCode *status) +static UChar +readHexCodeUnit(const char **string, UErrorCode *status) { UChar result = 0; int32_t value = 0; @@ -216,6 +225,9 @@ readHexCodeUnit(const char **string, UErrorCode *status) value = c - 'A' + 10; } else { *status = U_ILLEGAL_ARGUMENT_ERROR; +#ifdef UCOL_TRACE_SIT + fprintf(stderr, "%s:%d: Bad hex char at '%s': %s\n", __FILE__, __LINE__, *string, u_errorName(*status)); +#endif return 0; } result = (result << 4) | (UChar)value; @@ -225,13 +237,16 @@ readHexCodeUnit(const char **string, UErrorCode *status) // if the string was terminated before we read 4 digits, set an error if(noDigits < 4) { *status = U_ILLEGAL_ARGUMENT_ERROR; +#ifdef UCOL_TRACE_SIT + fprintf(stderr, "%s:%d: Short (only %d digits, wanted 4) at '%s': %s\n", __FILE__, __LINE__, noDigits,*string, u_errorName(*status)); +#endif } return result; } U_CDECL_BEGIN static const char* U_CALLCONV -_processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status) +_processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status) { // get four digits int32_t i = 0; @@ -240,7 +255,7 @@ _processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UEr spec->variableTopString[i++] = readHexCodeUnit(&string, status); } spec->variableTopStringLen = i; - if(i == locElementCapacity && (*string != 0 || *string != '_')) { + if(i == locElementCapacity && *string != 0 && *string != '_') { *status = U_BUFFER_OVERFLOW_ERROR; } } else { @@ -248,7 +263,7 @@ _processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UEr } if(U_SUCCESS(*status)) { spec->variableTopSet = TRUE; - } + } return string; } U_CDECL_END @@ -263,28 +278,29 @@ struct ShortStringOptions { static const ShortStringOptions options[UCOL_SIT_ITEMS_COUNT] = { -/* 10 ALTERNATE_HANDLING */ {alternateHArg, _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate N, S, D +/* 10 ALTERNATE_HANDLING */ {alternateHArg, _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate N, S, D /* 15 VARIABLE_TOP_VALUE */ {variableTopValArg, _processVariableTop, 1 }, /* 08 CASE_FIRST */ {caseFirstArg, _processCollatorOption, UCOL_CASE_FIRST }, // case first L, U, X, D /* 09 NUMERIC_COLLATION */ {numericCollArg, _processCollatorOption, UCOL_NUMERIC_COLLATION }, // codan O, X, D /* 07 CASE_LEVEL */ {caseLevelArg, _processCollatorOption, UCOL_CASE_LEVEL }, // case level O, X, D /* 12 FRENCH_COLLATION */ {frenchCollArg, _processCollatorOption, UCOL_FRENCH_COLLATION }, // french O, X, D /* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg, _processCollatorOption, UCOL_HIRAGANA_QUATERNARY_MODE }, // hiragana O, X, D -/* 04 KEYWORD */ {keywordArg, _processLocaleElement, 4 }, // keyword -/* 00 LANGUAGE */ {languageArg, _processLocaleElement, 0 }, // language +/* 04 KEYWORD */ {keywordArg, _processLocaleElement, UCOL_SIT_KEYWORD }, // keyword +/* 00 LANGUAGE */ {languageArg, _processLocaleElement, UCOL_SIT_LANGUAGE }, // language /* 11 NORMALIZATION_MODE */ {normArg, _processCollatorOption, UCOL_NORMALIZATION_MODE }, // norm O, X, D -/* 02 REGION */ {regionArg, _processLocaleElement, 2 }, // region +/* 02 REGION */ {regionArg, _processLocaleElement, UCOL_SIT_REGION }, // region /* 06 STRENGTH */ {strengthArg, _processCollatorOption, UCOL_STRENGTH }, // strength 1, 2, 3, 4, I, D /* 14 VARIABLE_TOP */ {variableTopArg, _processVariableTop, 0 }, -/* 03 VARIANT */ {variantArg, _processLocaleElement, 3 }, // variant +/* 03 VARIANT */ {variantArg, _processLocaleElement, UCOL_SIT_VARIANT }, // variant /* 05 RFC3066BIS */ {RFC3066Arg, _processRFC3066Locale, 0 }, // rfc3066bis locale name -/* 01 SCRIPT */ {scriptArg, _processLocaleElement, 1 } // script +/* 01 SCRIPT */ {scriptArg, _processLocaleElement, UCOL_SIT_SCRIPT }, // script +/* PROVIDER */ {providerArg, _processLocaleElement, UCOL_SIT_PROVIDER } }; static -const char* ucol_sit_readOption(const char *start, CollatorSpec *spec, - UErrorCode *status) +const char* ucol_sit_readOption(const char *start, CollatorSpec *spec, + UErrorCode *status) { int32_t i = 0; @@ -292,16 +308,19 @@ const char* ucol_sit_readOption(const char *start, CollatorSpec *spec, if(*start == options[i].optionStart) { spec->entries[i].start = start; const char* end = options[i].action(spec, options[i].attr, start+1, status); - spec->entries[i].len = end - start; + spec->entries[i].len = (int32_t)(end - start); return end; } } *status = U_ILLEGAL_ARGUMENT_ERROR; +#ifdef UCOL_TRACE_SIT + fprintf(stderr, "%s:%d: Unknown option at '%s': %s\n", __FILE__, __LINE__, start, u_errorName(*status)); +#endif return start; } static -void ucol_sit_initCollatorSpecs(CollatorSpec *spec) +void ucol_sit_initCollatorSpecs(CollatorSpec *spec) { // reset everything uprv_memset(spec, 0, sizeof(CollatorSpec)); @@ -312,12 +331,12 @@ void ucol_sit_initCollatorSpecs(CollatorSpec *spec) } } -static const char* -ucol_sit_readSpecs(CollatorSpec *s, const char *string, +static const char* +ucol_sit_readSpecs(CollatorSpec *s, const char *string, UParseError *parseError, UErrorCode *status) { const char *definition = string; - while(U_SUCCESS(*status) && *string) { + while(U_SUCCESS(*status) && *string) { string = ucol_sit_readOption(string, s, status); // advance over '_' while(*string && *string == '_') { @@ -325,7 +344,7 @@ ucol_sit_readSpecs(CollatorSpec *s, const char *string, } } if(U_FAILURE(*status)) { - parseError->offset = string - definition; + parseError->offset = (int32_t)(string - definition); } return string; } @@ -335,7 +354,7 @@ int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, { int32_t i = 0, j = 0; int32_t len = 0; - char optName; + char optName; if(U_SUCCESS(*status)) { for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { if(s->entries[i].start) { @@ -344,7 +363,7 @@ int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, uprv_strcat(destination, "_"); } len++; - } + } optName = *(s->entries[i].start); if(optName == languageArg || optName == regionArg || optName == variantArg || optName == keywordArg) { for(j = 0; j < s->entries[i].len; j++) { @@ -373,29 +392,35 @@ ucol_sit_calculateWholeLocale(CollatorSpec *s) { // locale if(s->locale[0] == 0) { // first the language - uprv_strcat(s->locale, s->locElements[0]); + uprv_strcat(s->locale, s->locElements[UCOL_SIT_LANGUAGE]); // then the script, if present - if(*(s->locElements[1])) { + if(*(s->locElements[UCOL_SIT_SCRIPT])) { uprv_strcat(s->locale, "_"); - uprv_strcat(s->locale, s->locElements[1]); + uprv_strcat(s->locale, s->locElements[UCOL_SIT_SCRIPT]); } // then the region, if present - if(*(s->locElements[2])) { + if(*(s->locElements[UCOL_SIT_REGION])) { uprv_strcat(s->locale, "_"); - uprv_strcat(s->locale, s->locElements[2]); - } else if(*(s->locElements[3])) { // if there is a variant, we need an underscore + uprv_strcat(s->locale, s->locElements[UCOL_SIT_REGION]); + } else if(*(s->locElements[UCOL_SIT_VARIANT])) { // if there is a variant, we need an underscore uprv_strcat(s->locale, "_"); } // add variant, if there - if(*(s->locElements[3])) { + if(*(s->locElements[UCOL_SIT_VARIANT])) { uprv_strcat(s->locale, "_"); - uprv_strcat(s->locale, s->locElements[3]); + uprv_strcat(s->locale, s->locElements[UCOL_SIT_VARIANT]); } // if there is a collation keyword, add that too - if(*(s->locElements[4])) { + if(*(s->locElements[UCOL_SIT_KEYWORD])) { uprv_strcat(s->locale, collationKeyword); - uprv_strcat(s->locale, s->locElements[4]); + uprv_strcat(s->locale, s->locElements[UCOL_SIT_KEYWORD]); + } + + // if there is a provider keyword, add that too + if(*(s->locElements[UCOL_SIT_PROVIDER])) { + uprv_strcat(s->locale, providerKeyword); + uprv_strcat(s->locale, s->locElements[UCOL_SIT_PROVIDER]); } } } @@ -429,7 +454,7 @@ ucol_prepareShortStringOpen( const char *definition, ucol_sit_initCollatorSpecs(&s); ucol_sit_readSpecs(&s, definition, parseError, status); ucol_sit_calculateWholeLocale(&s); - + char buffer[internalBufferSize]; uprv_memset(buffer, 0, internalBufferSize); uloc_canonicalize(s.locale, buffer, internalBufferSize, status); @@ -440,8 +465,15 @@ ucol_prepareShortStringOpen( const char *definition, UResourceBundle *collElem = NULL; char keyBuffer[256]; // if there is a keyword, we pick it up and try to get elements - if(!uloc_getKeywordValue(buffer, "collation", keyBuffer, 256, status)) { - // no keyword. we try to find the default setting, which will give us the keyword value + int32_t keyLen = uloc_getKeywordValue(buffer, "collation", keyBuffer, sizeof(keyBuffer), status); + // Treat too long a value as no keyword. + if(keyLen >= (int32_t)sizeof(keyBuffer)) { + keyLen = 0; + *status = U_ZERO_ERROR; + } + if(keyLen == 0) { + // no keyword + // we try to find the default setting, which will give us the keyword value UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status); if(U_SUCCESS(*status)) { int32_t defaultKeyLen = 0; @@ -493,7 +525,7 @@ ucol_openFromShortString( const char *definition, ucol_sit_initCollatorSpecs(&s); string = ucol_sit_readSpecs(&s, definition, parseError, status); ucol_sit_calculateWholeLocale(&s); - + char buffer[internalBufferSize]; uprv_memset(buffer, 0, internalBufferSize); uloc_canonicalize(s.locale, buffer, internalBufferSize, status); @@ -508,7 +540,7 @@ ucol_openFromShortString( const char *definition, } if(U_FAILURE(*status)) { - parseError->offset = string - definition; + parseError->offset = (int32_t)(string - definition); ucol_close(result); return NULL; } @@ -534,23 +566,6 @@ ucol_openFromShortString( const char *definition, } -static void appendShortStringElement(const char *src, int32_t len, char *result, int32_t *resultSize, int32_t capacity, char arg) -{ - if(len) { - if(*resultSize) { - if(*resultSize < capacity) { - uprv_strcat(result, "_"); - } - (*resultSize)++; - } - *resultSize += len + 1; - if(*resultSize < capacity) { - uprv_strncat(result, &arg, 1); - uprv_strncat(result, src, len); - } - } -} - U_CAPI int32_t U_EXPORT2 ucol_getShortDefinitionString(const UCollator *coll, const char *locale, @@ -559,56 +574,11 @@ ucol_getShortDefinitionString(const UCollator *coll, UErrorCode *status) { if(U_FAILURE(*status)) return 0; - char buffer[internalBufferSize]; - uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); - int32_t resultSize = 0; - char tempbuff[internalBufferSize]; - char locBuff[internalBufferSize]; - uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); - int32_t elementSize = 0; - UBool isAvailable = 0; - CollatorSpec s; - ucol_sit_initCollatorSpecs(&s); - - if(!locale) { - locale = ucol_getLocale(coll, ULOC_VALID_LOCALE, status); - } - elementSize = ucol_getFunctionalEquivalent(locBuff, internalBufferSize, "collation", locale, &isAvailable, status); - - if(elementSize) { - // we should probably canonicalize here... - elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, languageArg); - elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, regionArg); - elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, scriptArg); - elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variantArg); - elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, keywordArg); - } - - int32_t i = 0; - UColAttributeValue attribute = UCOL_DEFAULT; - for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { - if(options[i].action == _processCollatorOption) { - attribute = ucol_getAttributeOrDefault(coll, (UColAttribute)options[i].attr, status); - if(attribute != UCOL_DEFAULT) { - char letter = ucol_sit_attributeValueToLetter(attribute, status); - appendShortStringElement(&letter, 1, - buffer, &resultSize, capacity, options[i].optionStart); - } - } - } - if(coll->variableTopValueisDefault == FALSE) { - //s.variableTopValue = ucol_getVariableTop(coll, status); - elementSize = T_CString_integerToString(tempbuff, coll->variableTopValue, 16); - appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variableTopValArg); + if(coll == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; } - - UParseError parseError; - return ucol_normalizeShortDefinitionString(buffer, dst, capacity, &parseError, status); + return ((icu::Collator*)coll)->internalGetShortDefinitionString(locale,dst,capacity,*status); } U_CAPI int32_t U_EXPORT2 @@ -639,161 +609,6 @@ ucol_normalizeShortDefinitionString(const char *definition, return ucol_sit_dumpSpecs(&s, destination, capacity, status); } -U_CAPI UColAttributeValue U_EXPORT2 -ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status) -{ - if(U_FAILURE(*status) || coll == NULL) { - return UCOL_DEFAULT; - } - switch(attr) { - case UCOL_NUMERIC_COLLATION: - return coll->numericCollationisDefault?UCOL_DEFAULT:coll->numericCollation; - case UCOL_HIRAGANA_QUATERNARY_MODE: - return coll->hiraganaQisDefault?UCOL_DEFAULT:coll->hiraganaQ; - case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ - return coll->frenchCollationisDefault?UCOL_DEFAULT:coll->frenchCollation; - case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ - return coll->alternateHandlingisDefault?UCOL_DEFAULT:coll->alternateHandling; - case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ - return coll->caseFirstisDefault?UCOL_DEFAULT:coll->caseFirst; - case UCOL_CASE_LEVEL: /* do we have an extra case level */ - return coll->caseLevelisDefault?UCOL_DEFAULT:coll->caseLevel; - case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ - return coll->normalizationModeisDefault?UCOL_DEFAULT:coll->normalizationMode; - case UCOL_STRENGTH: /* attribute for strength */ - return coll->strengthisDefault?UCOL_DEFAULT:coll->strength; - case UCOL_ATTRIBUTE_COUNT: - default: - *status = U_ILLEGAL_ARGUMENT_ERROR; - break; - } - return UCOL_DEFAULT; -} - - -struct contContext { - const UCollator *coll; - USet *conts; - USet *expansions; - USet *removedContractions; - UBool addPrefixes; - UErrorCode *status; -}; - - - -static void -addSpecial(contContext *context, UChar *buffer, int32_t bufLen, - uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status) -{ - const UCollator *coll = context->coll; - USet *contractions = context->conts; - USet *expansions = context->expansions; - UBool addPrefixes = context->addPrefixes; - - const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); - uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); - // we might have a contraction that ends from previous level - if(newCE != UCOL_NOT_FOUND) { - if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) { - addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); - } - if(contractions && rightIndex-leftIndex > 1) { - uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); - if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) { - uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); - } - } - } - - UCharOffset++; - // check whether we're doing contraction or prefix - if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) { - if(leftIndex == 0) { - *status = U_INTERNAL_PROGRAM_ERROR; - return; - } - --leftIndex; - while(*UCharOffset != 0xFFFF) { - newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); - buffer[leftIndex] = *UCharOffset; - if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { - addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); - } else { - if(contractions) { - uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); - } - if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { - uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); - } - } - UCharOffset++; - } - } else if(getCETag(CE) == CONTRACTION_TAG) { - if(rightIndex == bufLen-1) { - *status = U_INTERNAL_PROGRAM_ERROR; - return; - } - while(*UCharOffset != 0xFFFF) { - newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); - buffer[rightIndex] = *UCharOffset; - if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { - addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status); - } else { - if(contractions) { - uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex); - } - if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { - uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex); - } - } - UCharOffset++; - } - } - -} - -U_CDECL_BEGIN -static UBool U_CALLCONV -_processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE) -{ - UErrorCode *status = ((contContext *)context)->status; - USet *expansions = ((contContext *)context)->expansions; - USet *removed = ((contContext *)context)->removedContractions; - UBool addPrefixes = ((contContext *)context)->addPrefixes; - UChar contraction[internalBufferSize]; - if(isSpecial(CE)) { - if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) { - while(start < limit && U_SUCCESS(*status)) { - // if there are suppressed contractions, we don't - // want to add them. - if(removed && uset_contains(removed, start)) { - start++; - continue; - } - // we start our contraction from middle, since we don't know if it - // will grow toward right or left - contraction[internalBufferSize/2] = (UChar)start; - addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status); - start++; - } - } else if(expansions && getCETag(CE) == EXPANSION_TAG) { - while(start < limit && U_SUCCESS(*status)) { - uset_add(expansions, start++); - } - } - } - if(U_FAILURE(*status)) { - return FALSE; - } else { - return TRUE; - } -} - -U_CDECL_END - - - /** * Get a set containing the contractions defined by the collator. The set includes * both the UCA contractions and the contractions defined by the collator @@ -801,8 +616,6 @@ U_CDECL_END * @param conts the set to hold the result * @param status to hold the error code * @return the size of the contraction set - * - * @draft ICU 3.0 */ U_CAPI int32_t U_EXPORT2 ucol_getContractions( const UCollator *coll, @@ -837,77 +650,14 @@ ucol_getContractionsAndExpansions( const UCollator *coll, *status = U_ILLEGAL_ARGUMENT_ERROR; return; } - - if(contractions) { - uset_clear(contractions); - } - if(expansions) { - uset_clear(expansions); - } - int32_t rulesLen = 0; - const UChar* rules = ucol_getRules(coll, &rulesLen); - UColTokenParser src; - ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status); - - contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status }; - - // Add the UCA contractions - c.coll = coll->UCA; - utrie_enum(&coll->UCA->mapping, NULL, _processSpecials, &c); - - // This is collator specific. Add contractions from a collator - c.coll = coll; - c.removedContractions = NULL; - utrie_enum(&coll->mapping, NULL, _processSpecials, &c); - ucol_tok_closeTokenList(&src); -} - -U_CAPI int32_t U_EXPORT2 -ucol_getUnsafeSet( const UCollator *coll, - USet *unsafe, - UErrorCode *status) -{ - UChar buffer[internalBufferSize]; - int32_t len = 0; - - uset_clear(unsafe); - - // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant - static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, - 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; - - // add chars that fail the fcd check - uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); - - // add Thai/Lao prevowels - uset_addRange(unsafe, 0xe40, 0xe44); - uset_addRange(unsafe, 0xec0, 0xec4); - // add lead/trail surrogates - uset_addRange(unsafe, 0xd800, 0xdfff); - - USet *contractions = uset_open(0,0); - - int32_t i = 0, j = 0; - int32_t contsSize = ucol_getContractions(coll, contractions, status); - UChar32 c = 0; - // Contraction set consists only of strings - // to get unsafe code points, we need to - // break the strings apart and add them to the unsafe set - for(i = 0; i < contsSize; i++) { - len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); - if(len > 0) { - j = 0; - while(j < len) { - U16_NEXT(buffer, j, len, c); - if(j < len) { - uset_add(unsafe, c); - } - } - } + const icu::RuleBasedCollator *rbc = icu::RuleBasedCollator::rbcFromUCollator(coll); + if(rbc == NULL) { + *status = U_UNSUPPORTED_ERROR; + return; } - - uset_close(contractions); - - return uset_size(unsafe); + rbc->internalGetContractionsAndExpansions( + icu::UnicodeSet::fromUSet(contractions), + icu::UnicodeSet::fromUSet(expansions), + addPrefixes, *status); } #endif