X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..48b980fed3435926e0b3a8d72ecb58be703a1c7a:/icuSources/i18n/ucol.cpp diff --git a/icuSources/i18n/ucol.cpp b/icuSources/i18n/ucol.cpp index acc2591d..384a2e9c 100644 --- a/icuSources/i18n/ucol.cpp +++ b/icuSources/i18n/ucol.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 1996-2004, International Business Machines +* Copyright (C) 1996-2009, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: ucol.cpp @@ -17,35 +17,26 @@ */ #include "unicode/utypes.h" -#include "ustrenum.h" -#include "uassert.h" #if !UCONFIG_NO_COLLATION -#include "unicode/uloc.h" -#include "unicode/coll.h" -#include "unicode/tblcoll.h" #include "unicode/coleitr.h" #include "unicode/unorm.h" #include "unicode/udata.h" -#include "unicode/uchar.h" -#include "unicode/caniter.h" +#include "unicode/ustring.h" -#include "ucol_bld.h" #include "ucol_imp.h" -#include "ucol_tok.h" -#include "ucol_elm.h" #include "bocsu.h" #include "unormimp.h" #include "unorm_it.h" -#include "uresimp.h" #include "umutex.h" -#include "uhash.h" +#include "cmemory.h" #include "ucln_in.h" #include "cstring.h" #include "utracimp.h" #include "putilimp.h" +#include "uassert.h" #ifdef UCOL_DEBUG #include @@ -63,54 +54,28 @@ U_NAMESPACE_USE #define ZERO_CC_LIMIT_ 0xC0 -// static UCA. There is only one. Collators don't use it. -// It is referenced only in ucol_initUCA and ucol_cleanup -static UCollator* _staticUCA = NULL; -// static pointer to udata memory. Inited in ucol_initUCA -// used for cleanup in ucol_cleanup -static UDataMemory* UCA_DATA_MEM = NULL; - // this is static pointer to the normalizer fcdTrieIndex // it is always the same between calls to u_cleanup // and therefore writing to it is not synchronized. // It is cleaned in ucol_cleanup static const uint16_t *fcdTrieIndex=NULL; +// These are values from UCA required for +// implicit generation and supressing sort key compression +// they should regularly be in the UCA, but if one +// is running without UCA, it could be a problem +static const int32_t maxRegularPrimary = 0xA0; +static const int32_t minImplicitPrimary = 0xE0; +static const int32_t maxImplicitPrimary = 0xE4; + U_CDECL_BEGIN static UBool U_CALLCONV -isAcceptableUCA(void * /*context*/, - const char * /*type*/, const char * /*name*/, - const UDataInfo *pInfo){ - /* context, type & name are intentionally not used */ - if( pInfo->size>=20 && - pInfo->isBigEndian==U_IS_BIG_ENDIAN && - pInfo->charsetFamily==U_CHARSET_FAMILY && - pInfo->dataFormat[0]==UCA_DATA_FORMAT_0 && /* dataFormat="UCol" */ - pInfo->dataFormat[1]==UCA_DATA_FORMAT_1 && - pInfo->dataFormat[2]==UCA_DATA_FORMAT_2 && - pInfo->dataFormat[3]==UCA_DATA_FORMAT_3 && - pInfo->formatVersion[0]==UCA_FORMAT_VERSION_0 && - pInfo->formatVersion[1]>=UCA_FORMAT_VERSION_1// && - //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 && - //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh - //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh - ) { - UVersionInfo UCDVersion; - u_getUnicodeVersion(UCDVersion); - if(pInfo->dataVersion[0]==UCDVersion[0] && - pInfo->dataVersion[1]==UCDVersion[1]) { // && - //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] && - //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) { - return TRUE; - } else { - return FALSE; - } - } else { - return FALSE; - } +ucol_cleanup(void) +{ + fcdTrieIndex = NULL; + return TRUE; } - static int32_t U_CALLCONV _getFoldingOffset(uint32_t data) { return (int32_t)(data&0xFFFFFF); @@ -119,8 +84,9 @@ _getFoldingOffset(uint32_t data) { U_CDECL_END static -inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, - int32_t sourceLen, collIterate *s) { +inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, + int32_t sourceLen, collIterate *s) +{ (s)->string = (s)->pos = (UChar *)(sourceString); (s)->origFlags = 0; (s)->flags = 0; @@ -132,7 +98,13 @@ inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStr /* change to enable easier checking for end of string for fcdpositon */ (s)->endp = NULL; } + (s)->extendCEs = NULL; + (s)->extendCEsSize = 0; (s)->CEpos = (s)->toReturn = (s)->CEs; + (s)->offsetBuffer = NULL; + (s)->offsetBufferSize = 0; + (s)->offsetReturn = (s)->offsetStore = NULL; + (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; (s)->writableBuffer = (s)->stackWritableBuffer; (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; (s)->coll = (collator); @@ -141,7 +113,7 @@ inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStr (s)->flags |= UCOL_ITER_NORM; } if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { - (s)->flags |= UCOL_HIRAGANA_Q; + (s)->flags |= UCOL_HIRAGANA_Q; } (s)->iterator = NULL; //(s)->iteratorIndex = 0; @@ -169,18 +141,19 @@ inline void backupState(const collIterate *data, collIterateState *backup) backup->pos = data->pos; backup->bufferaddress = data->writableBuffer; backup->buffersize = data->writableBufSize; + backup->iteratorMove = 0; + backup->iteratorIndex = 0; if(data->iterator != NULL) { - //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); - backup->iteratorIndex = data->iterator->getState(data->iterator); - // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE - backup->iteratorMove = 0; - if(backup->iteratorIndex == UITER_NO_STATE) { - while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { - backup->iteratorMove++; - data->iterator->move(data->iterator, -1, UITER_CURRENT); - } - data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); - } + //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); + backup->iteratorIndex = data->iterator->getState(data->iterator); + // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE + if(backup->iteratorIndex == UITER_NO_STATE) { + while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { + backup->iteratorMove++; + data->iterator->move(data->iterator, -1, UITER_CURRENT); + } + data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); + } } } @@ -195,17 +168,18 @@ static inline void loadState(collIterate *data, const collIterateState *backup, UBool forwards) { - UErrorCode status = U_ZERO_ERROR; + UErrorCode status = U_ZERO_ERROR; data->flags = backup->flags; data->origFlags = backup->origFlags; if(data->iterator != NULL) { - //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); - data->iterator->setState(data->iterator, backup->iteratorIndex, &status); - if(backup->iteratorMove != 0) { - data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); - } + //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); + data->iterator->setState(data->iterator, backup->iteratorIndex, &status); + if(backup->iteratorMove != 0) { + data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); + } } data->pos = backup->pos; + if ((data->flags & UCOL_ITER_INNORMBUF) && data->writableBuffer != backup->bufferaddress) { /* @@ -306,7 +280,7 @@ inline UBool collIter_bos(collIterate *source) { return FALSE; } -static +/*static inline UBool collIter_SimpleBos(collIterate *source) { // if we're going backwards, we need to know whether there is more in the // iterator, even if we are in the side buffer @@ -317,7 +291,7 @@ inline UBool collIter_SimpleBos(collIterate *source) { return TRUE; } return FALSE; -} +}*/ //return (data->pos == data->string) || @@ -339,437 +313,358 @@ inline void freeHeapWritableBuffer(collIterate *data) /* Following are the open/close functions */ /* */ /****************************************************************************/ -static UCollator* -tryOpeningFromRules(UResourceBundle *collElem, UErrorCode *status) { - int32_t rulesLen = 0; - const UChar *rules = ures_getStringByKey(collElem, "Sequence", &rulesLen, status); - return ucol_openRules(rules, rulesLen, UCOL_DEFAULT, UCOL_DEFAULT, NULL, status); +static UCollator* +ucol_initFromBinary(const uint8_t *bin, int32_t length, + const UCollator *base, + UCollator *fillIn, + UErrorCode *status) +{ + UCollator *result = fillIn; + if(U_FAILURE(*status)) { + return NULL; + } + /* + if(base == NULL) { + // we don't support null base yet + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + */ + // We need these and we could be running without UCA + uprv_uca_initImplicitConstants(status); + UCATableHeader *colData = (UCATableHeader *)bin; + // do we want version check here? We're trying to figure out whether collators are compatible + if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || + uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || + colData->version[0] != UCOL_BUILDER_VERSION) + { + *status = U_COLLATOR_VERSION_MISMATCH; + return NULL; + } + else { + if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { + result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); + if(U_FAILURE(*status)){ + return NULL; + } + result->hasRealData = TRUE; + } + else { + if(base) { + result = ucol_initCollator(base->image, result, base, status); + ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); + if(U_FAILURE(*status)){ + return NULL; + } + result->hasRealData = FALSE; + } + else { + *status = U_USELESS_COLLATOR_ERROR; + return NULL; + } + } + result->freeImageOnClose = FALSE; + } + result->actualLocale = NULL; + result->validLocale = NULL; + result->requestedLocale = NULL; + result->rules = NULL; + result->rulesLength = 0; + result->freeRulesOnClose = FALSE; + result->ucaRules = NULL; + return result; } - -U_CAPI UCollator* -ucol_open(const char *loc, - UErrorCode *status) +U_CAPI UCollator* U_EXPORT2 +ucol_openBinary(const uint8_t *bin, int32_t length, + const UCollator *base, + UErrorCode *status) { - UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN); - UTRACE_DATA1(UTRACE_INFO, "locale = \"%s\"", loc); - UCollator *result = NULL; - - u_init(status); -#if !UCONFIG_NO_SERVICE - result = Collator::createUCollator(loc, status); - if (result == NULL) -#endif - { - result = ucol_open_internal(loc, status); - } - UTRACE_EXIT_PTR_STATUS(result, *status); - return result; + return ucol_initFromBinary(bin, length, base, NULL, status); } -// API in ucol_imp.h - -U_CFUNC UCollator* -ucol_open_internal(const char *loc, - UErrorCode *status) +U_CAPI int32_t U_EXPORT2 +ucol_cloneBinary(const UCollator *coll, + uint8_t *buffer, int32_t capacity, + UErrorCode *status) { - const UCollator* UCA = ucol_initUCA(status); + int32_t length = 0; + if(U_FAILURE(*status)) { + return length; + } + if(capacity < 0) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return length; + } + if(coll->hasRealData == TRUE) { + length = coll->image->size; + if(length <= capacity) { + uprv_memcpy(buffer, coll->image, length); + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + } + } else { + length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); + if(length <= capacity) { + /* build the UCATableHeader with minimal entries */ + /* do not copy the header from the UCA file because its values are wrong! */ + /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ - /* New version */ - if(U_FAILURE(*status)) return 0; + /* reset everything */ + uprv_memset(buffer, 0, length); + /* set the tailoring-specific values */ + UCATableHeader *myData = (UCATableHeader *)buffer; + myData->size = length; + /* offset for the options, the only part of the data that is present after the header */ + myData->options = sizeof(UCATableHeader); - UCollator *result = NULL; - UResourceBundle *b = ures_open(U_ICUDATA_COLL, loc, status); + /* need to always set the expansion value for an upper bound of the options */ + myData->expansion = myData->options + sizeof(UColOptionSet); - /* we try to find stuff from keyword */ - UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status); - UResourceBundle *collElem = NULL; - char keyBuffer[256]; - // if there is a keyword, we pick it up and try to get elements - if(!uloc_getKeywordValue(loc, "collation", keyBuffer, 256, status)) { - // no keyword. we try to find the default setting, which will give us the keyword value - UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status); - if(U_SUCCESS(*status)) { - int32_t defaultKeyLen = 0; - const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status); - u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen); - keyBuffer[defaultKeyLen] = 0; - } else { - *status = U_INTERNAL_PROGRAM_ERROR; - return NULL; - } - ures_close(defaultColl); - } - collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status); + myData->magic = UCOL_HEADER_MAGIC; + myData->isBigEndian = U_IS_BIG_ENDIAN; + myData->charSetFamily = U_CHARSET_FAMILY; - UResourceBundle *binary = NULL; - UErrorCode binaryStatus = U_ZERO_ERROR; + /* copy UCA's version; genrb will override all but the builder version with tailoring data */ + uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); - if(*status == U_MISSING_RESOURCE_ERROR) { /* We didn't find the tailoring data, we fallback to the UCA */ - *status = U_USING_DEFAULT_WARNING; - result = ucol_initCollator(UCA->image, result, UCA, status); - // if we use UCA, real locale is root - result->rb = ures_open(U_ICUDATA_COLL, "", status); - result->elements = ures_open(U_ICUDATA_COLL, "", status); - if(U_FAILURE(*status)) { - goto clean; - } - ures_close(b); - result->hasRealData = FALSE; - } else if(U_SUCCESS(*status)) { - binary = ures_getByKey(collElem, "%%CollationBin", NULL, &binaryStatus); - - if(binaryStatus == U_MISSING_RESOURCE_ERROR) { /* we didn't find the binary image, we should use the rules */ - binary = NULL; - result = tryOpeningFromRules(collElem, status); - if(U_FAILURE(*status)) { - goto clean; - } - } else if(U_SUCCESS(*status)) { /* otherwise, we'll pick a collation data that exists */ - int32_t len = 0; - const uint8_t *inData = ures_getBinary(binary, &len, status); - UCATableHeader *colData = (UCATableHeader *)inData; - if(uprv_memcmp(colData->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0 || - uprv_memcmp(colData->UCDVersion, UCA->image->UCDVersion, sizeof(UVersionInfo)) != 0 || - colData->version[0] != UCOL_BUILDER_VERSION) { - *status = U_DIFFERENT_UCA_VERSION; - result = tryOpeningFromRules(collElem, status); - } else { - if(U_FAILURE(*status)){ - goto clean; - } - if((uint32_t)len > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { - result = ucol_initCollator((const UCATableHeader *)inData, result, UCA, status); - if(U_FAILURE(*status)){ - goto clean; - } - result->hasRealData = TRUE; + uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); + uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); + uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); + myData->jamoSpecial = coll->image->jamoSpecial; + + /* copy the collator options */ + uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); } else { - result = ucol_initCollator(UCA->image, result, UCA, status); - ucol_setOptionsFromHeader(result, (UColOptionSet *)(inData+((const UCATableHeader *)inData)->options), status); - if(U_FAILURE(*status)){ - goto clean; - } - result->hasRealData = FALSE; + *status = U_BUFFER_OVERFLOW_ERROR; } - result->freeImageOnClose = FALSE; - } } - result->rb = b; - result->elements = collElem; - } else { /* There is another error, and we're just gonna clean up */ -clean: - ures_close(b); - ures_close(collElem); - ures_close(collations); - ures_close(binary); - return NULL; - } + return length; +} - result->validLocale = NULL; // default is to use rb info +U_CAPI UCollator* U_EXPORT2 +ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) +{ + UCollator * localCollator; + int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); + char *stackBufferChars = (char *)stackBuffer; + int32_t imageSize = 0; + int32_t rulesSize = 0; + int32_t rulesPadding = 0; + uint8_t *image; + UChar *rules; + UBool colAllocated = FALSE; + UBool imageAllocated = FALSE; - if(loc == NULL) { - loc = ures_getLocale(result->rb, status); - } - result->requestedLocale = (char *)uprv_malloc((uprv_strlen(loc)+1)*sizeof(char)); - /* test for NULL */ - if (result->requestedLocale == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - ures_close(b); // ??? appears needed - ures_close(collElem); - ures_close(collations); - ures_close(binary); // ??? appears needed - return NULL; - } - uprv_strcpy(result->requestedLocale, loc); + if (status == NULL || U_FAILURE(*status)){ + return 0; + } + if ((stackBuffer && !pBufferSize) || !coll){ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + if (coll->rules && coll->freeRulesOnClose) { + rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); + rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); + bufferSizeNeeded += rulesSize + rulesPadding; + } - ures_close(binary); - ures_close(collations); //??? we have to decide on that. Probably affects something :) - return result; -} + if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ + *pBufferSize = bufferSizeNeeded; + return 0; + } + + /* Pointers on 64-bit platforms need to be aligned + * on a 64-bit boundry in memory. + */ + if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { + int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); + if (*pBufferSize > offsetUp) { + *pBufferSize -= offsetUp; + stackBufferChars += offsetUp; + } + else { + /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */ + *pBufferSize = 1; + } + } + stackBuffer = (void *)stackBufferChars; + if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) { + /* allocate one here...*/ + stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); + // Null pointer check. + if (stackBufferChars == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + colAllocated = TRUE; + if (U_SUCCESS(*status)) { + *status = U_SAFECLONE_ALLOCATED_WARNING; + } + } + localCollator = (UCollator *)stackBufferChars; + rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); + { + UErrorCode tempStatus = U_ZERO_ERROR; + imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); + } + if (coll->freeImageOnClose) { + image = (uint8_t *)uprv_malloc(imageSize); + // Null pointer check + if (image == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + ucol_cloneBinary(coll, image, imageSize, status); + imageAllocated = TRUE; + } + else { + image = (uint8_t *)coll->image; + } + localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); + if (U_FAILURE(*status)) { + return NULL; + } -U_CAPI void U_EXPORT2 -ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *validLocaleToAdopt) -{ - if (coll) { - if (coll->validLocale) { - uprv_free(coll->validLocale); + if (coll->rules) { + if (coll->freeRulesOnClose) { + localCollator->rules = u_strcpy(rules, coll->rules); + //bufferEnd += rulesSize; + } + else { + localCollator->rules = coll->rules; + } + localCollator->freeRulesOnClose = FALSE; + localCollator->rulesLength = coll->rulesLength; } - coll->validLocale = validLocaleToAdopt; - if (coll->requestedLocale) { // should always have - uprv_free(coll->requestedLocale); + + int32_t i; + for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { + ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); } - coll->requestedLocale = requestedLocaleToAdopt; - } + // zero copies of pointers + localCollator->actualLocale = NULL; + localCollator->validLocale = NULL; + localCollator->requestedLocale = NULL; + localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. + localCollator->freeOnClose = colAllocated; + localCollator->freeImageOnClose = imageAllocated; + return localCollator; } U_CAPI void U_EXPORT2 ucol_close(UCollator *coll) { - UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); - UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); - if(coll != NULL) { - // these are always owned by each UCollator struct, - // so we always free them - if(coll->validLocale != NULL) { - uprv_free(coll->validLocale); - } - if(coll->requestedLocale != NULL) { - uprv_free(coll->requestedLocale); - } + UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); + UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); + if(coll != NULL) { + // these are always owned by each UCollator struct, + // so we always free them + if(coll->validLocale != NULL) { + uprv_free(coll->validLocale); + } + if(coll->actualLocale != NULL) { + uprv_free(coll->actualLocale); + } + if(coll->requestedLocale != NULL) { + uprv_free(coll->requestedLocale); + } + if(coll->latinOneCEs != NULL) { + uprv_free(coll->latinOneCEs); + } + if(coll->options != NULL && coll->freeOptionsOnClose) { + uprv_free(coll->options); + } + if(coll->rules != NULL && coll->freeRulesOnClose) { + uprv_free((UChar *)coll->rules); + } + if(coll->image != NULL && coll->freeImageOnClose) { + uprv_free((UCATableHeader *)coll->image); + } - /* Here, it would be advisable to close: */ - /* - UData for UCA (unless we stuff it in the root resb */ - /* Again, do we need additional housekeeping... HMMM! */ - UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); - if(coll->freeOnClose){ - /* for safeClone, if freeOnClose is FALSE, - don't free the other instance data */ - if(coll->freeOptionsOnClose != FALSE) { - if(coll->options != NULL) { - uprv_free(coll->options); - } - } - if(coll->mapping != NULL) { - /*ucmpe32_close(coll->mapping);*/ - uprv_free(coll->mapping); - } - if(coll->rules != NULL && coll->freeRulesOnClose) { - uprv_free((UChar *)coll->rules); - } - if(coll->rb != NULL) { /* pointing to read-only memory */ - ures_close(coll->rb); - } - if(coll->freeImageOnClose == TRUE) { - uprv_free((UCATableHeader *)coll->image); - } - if(coll->elements != NULL) { - ures_close(coll->elements); - } - if(coll->latinOneCEs != NULL) { - uprv_free(coll->latinOneCEs); - } - uprv_free(coll); - } - } - UTRACE_EXIT(); + /* Here, it would be advisable to close: */ + /* - UData for UCA (unless we stuff it in the root resb */ + /* Again, do we need additional housekeeping... HMMM! */ + UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); + if(coll->freeOnClose){ + /* for safeClone, if freeOnClose is FALSE, + don't free the other instance data */ + uprv_free(coll); + } + } + UTRACE_EXIT(); } -U_CAPI UCollator* U_EXPORT2 -ucol_openRules( const UChar *rules, - int32_t rulesLength, - UColAttributeValue normalizationMode, - UCollationStrength strength, - UParseError *parseError, - UErrorCode *status) +/* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/ +/* you should be able to get the binary chunk to write out... Doesn't look very full now */ +U_CFUNC uint8_t* U_EXPORT2 +ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) { - uint32_t listLen = 0; - UColTokenParser src; - UColAttributeValue norm; - UParseError tErr; - - if(status == NULL || U_FAILURE(*status)){ - return 0; - } + uint8_t *result = NULL; + if(U_FAILURE(*status)) { + return NULL; + } + if(coll->hasRealData == TRUE) { + *length = coll->image->size; + result = (uint8_t *)uprv_malloc(*length); + /* test for NULL */ + if (result == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + uprv_memcpy(result, coll->image, *length); + } else { + *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); + result = (uint8_t *)uprv_malloc(*length); + /* test for NULL */ + if (result == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } - u_init(status); - if (U_FAILURE(*status)) { - return NULL; - } + /* build the UCATableHeader with minimal entries */ + /* do not copy the header from the UCA file because its values are wrong! */ + /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ - if(rulesLength < -1 || (rules == NULL && rulesLength != 0)) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } + /* reset everything */ + uprv_memset(result, 0, *length); - if(rulesLength == -1) { - rulesLength = u_strlen(rules); - } + /* set the tailoring-specific values */ + UCATableHeader *myData = (UCATableHeader *)result; + myData->size = *length; - if(parseError == NULL){ - parseError = &tErr; - } + /* offset for the options, the only part of the data that is present after the header */ + myData->options = sizeof(UCATableHeader); - switch(normalizationMode) { - case UCOL_OFF: - case UCOL_ON: - case UCOL_DEFAULT: - norm = normalizationMode; - break; - default: - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } + /* need to always set the expansion value for an upper bound of the options */ + myData->expansion = myData->options + sizeof(UColOptionSet); - UCollator *UCA = ucol_initUCA(status); + myData->magic = UCOL_HEADER_MAGIC; + myData->isBigEndian = U_IS_BIG_ENDIAN; + myData->charSetFamily = U_CHARSET_FAMILY; - if(U_FAILURE(*status)){ - return NULL; - } + /* copy UCA's version; genrb will override all but the builder version with tailoring data */ + uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); - ucol_tok_initTokenList(&src, rules, rulesLength, UCA, status); - listLen = ucol_tok_assembleTokenList(&src,parseError, status); + uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); + uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); + uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); + myData->jamoSpecial = coll->image->jamoSpecial; - if(U_FAILURE(*status)) { - /* if status is U_ILLEGAL_ARGUMENT_ERROR, src->current points at the offending option */ - /* if status is U_INVALID_FORMAT_ERROR, src->current points after the problematic part of the rules */ - /* so something might be done here... or on lower level */ -#ifdef UCOL_DEBUG - if(*status == U_ILLEGAL_ARGUMENT_ERROR) { - fprintf(stderr, "bad option starting at offset %i\n", src.current-src.source); - } else { - fprintf(stderr, "invalid rule just before offset %i\n", src.current-src.source); + /* copy the collator options */ + uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); } -#endif - ucol_tok_closeTokenList(&src); - return NULL; - } - UCollator *result = NULL; - UCATableHeader *table = NULL; + return result; +} - if(src.resultLen > 0 || src.removeSet != NULL) { /* we have a set of rules, let's make something of it */ - /* also, if we wanted to remove some contractions, we should make a tailoring */ - table = ucol_assembleTailoringTable(&src, status); - if(U_SUCCESS(*status)) { - // builder version - table->version[0] = UCOL_BUILDER_VERSION; - // no tailoring information on this level - table->version[1] = table->version[2] = table->version[3] = 0; - // set UCD version - u_getUnicodeVersion(table->UCDVersion); - // set UCA version - uprv_memcpy(table->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)); - result = ucol_initCollator(table, 0, UCA, status); - result->hasRealData = TRUE; - result->freeImageOnClose = TRUE; - } - } else { /* no rules, but no error either */ - // must be only options - // We will init the collator from UCA - result = ucol_initCollator(UCA->image, 0, UCA, status); - // And set only the options - UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet)); - /* test for NULL */ - if (opts == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - goto cleanup; +void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { + if(U_FAILURE(*status)) { + return; } - uprv_memcpy(opts, src.opts, sizeof(UColOptionSet)); - ucol_setOptionsFromHeader(result, opts, status); - result->freeOptionsOnClose = TRUE; - result->hasRealData = FALSE; - result->freeImageOnClose = FALSE; - } - - if(U_SUCCESS(*status)) { - UChar *newRules; - result->dataInfo.dataVersion[0] = UCOL_BUILDER_VERSION; - if(rulesLength > 0) { - newRules = (UChar *)uprv_malloc((rulesLength+1)*U_SIZEOF_UCHAR); - /* test for NULL */ - if (newRules == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - uprv_memcpy(newRules, rules, rulesLength*U_SIZEOF_UCHAR); - newRules[rulesLength]=0; - result->rules = newRules; - result->rulesLength = rulesLength; - result->freeRulesOnClose = TRUE; - } - result->rb = NULL; - result->elements = NULL; - result->validLocale = NULL; - result->requestedLocale = NULL; - ucol_setAttribute(result, UCOL_STRENGTH, strength, status); - ucol_setAttribute(result, UCOL_NORMALIZATION_MODE, norm, status); - } else { -cleanup: - if(result != NULL) { - ucol_close(result); - } else { - if(table != NULL) { - uprv_free(table); - } - } - result = NULL; - } - - ucol_tok_closeTokenList(&src); - - return result; -} - -/* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/ -/* you should be able to get the binary chunk to write out... Doesn't look very full now */ -U_CAPI uint8_t* U_EXPORT2 -ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) -{ - uint8_t *result = NULL; - if(U_FAILURE(*status)) { - return NULL; - } - if(coll->hasRealData == TRUE) { - *length = coll->image->size; - result = (uint8_t *)uprv_malloc(*length); - /* test for NULL */ - if (result == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - uprv_memcpy(result, coll->image, *length); - } else { - *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); - result = (uint8_t *)uprv_malloc(*length); - /* test for NULL */ - if (result == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - - /* build the UCATableHeader with minimal entries */ - /* do not copy the header from the UCA file because its values are wrong! */ - /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ - - /* reset everything */ - uprv_memset(result, 0, *length); - - /* set the tailoring-specific values */ - UCATableHeader *myData = (UCATableHeader *)result; - myData->size = *length; - - /* offset for the options, the only part of the data that is present after the header */ - myData->options = sizeof(UCATableHeader); - - /* need to always set the expansion value for an upper bound of the options */ - myData->expansion = myData->options + sizeof(UColOptionSet); - - myData->magic = UCOL_HEADER_MAGIC; - myData->isBigEndian = U_IS_BIG_ENDIAN; - myData->charSetFamily = U_CHARSET_FAMILY; - - /* copy UCA's version; genrb will override all but the builder version with tailoring data */ - uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); - - uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); - uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); - uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); - myData->jamoSpecial = coll->image->jamoSpecial; - - /* copy the collator options */ - uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); - } - return result; -} - -void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { - if(U_FAILURE(*status)) { - return; - } result->caseFirst = (UColAttributeValue)opts->caseFirst; result->caseLevel = (UColAttributeValue)opts->caseLevel; result->frenchCollation = (UColAttributeValue)opts->frenchCollation; @@ -794,24 +689,6 @@ void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCo result->options = opts; } -#if 0 -// doesn't look like anybody is using this -void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { - if(U_FAILURE(*status)) { - return; - } - opts->caseFirst = result->caseFirst; - opts->caseLevel = result->caseLevel; - opts->frenchCollation = result->frenchCollation; - opts->normalizationMode = result->normalizationMode; - opts->strength = result->strength; - opts->variableTopValue = result->variableTopValue; - opts->alternateHandling = result->alternateHandling; - opts->hiraganaQ = result->hiraganaQ; - opts->numericCollation = result->numericCollation; -} -#endif - /** * Approximate determination if a character is at a contraction end. @@ -822,10 +699,6 @@ void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode */ static inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { - if (UTF_IS_TRAIL(c)) { - return TRUE; - } - if (c < coll->minContrEndCP) { return FALSE; } @@ -833,6 +706,9 @@ inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { int32_t hash = c; uint8_t htbyte; if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { + if (U16_IS_TRAIL(c)) { + return TRUE; + } hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; } htbyte = coll->contrEndCP[hash>>3]; @@ -848,15 +724,14 @@ inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { * in contraction processing. */ static -inline uint8_t i_getCombiningClass(UChar c, const UCollator *coll) { +inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { uint8_t sCC = 0; - if (c >= 0x300 && ucol_unsafeCP(c, coll)) { + if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { sCC = u_getCombiningClass(c); } return sCC; } - UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { UChar c; UCollator *result = fillIn; @@ -875,33 +750,27 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con result->freeOnClose = FALSE; } + // init FCD data + if (fcdTrieIndex == NULL) { + // The result is constant, until the library is reloaded. + fcdTrieIndex = unorm_getFCDTrie(status); + ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); + } + result->image = image; + result->mapping.getFoldingOffset = _getFoldingOffset; const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; - /*CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status);*/ - UTrie *newUCAmapping = (UTrie *)uprv_malloc(sizeof(UTrie)); - if(newUCAmapping != NULL) { - utrie_unserialize(newUCAmapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); - } else { - *status = U_MEMORY_ALLOCATION_ERROR; - if(result->freeOnClose == TRUE) { - uprv_free(result); - result = NULL; - } - return result; - } - if(U_SUCCESS(*status)) { - result->mapping = newUCAmapping; - } else { + utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); + if(U_FAILURE(*status)) { if(result->freeOnClose == TRUE) { uprv_free(result); result = NULL; } - uprv_free(newUCAmapping); return result; } /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/ - result->latinOneMapping = UTRIE_GET32_LATIN1(result->mapping); + result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); @@ -930,14 +799,17 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con result->hiraganaQisDefault = TRUE; result->numericCollationisDefault = TRUE; - result->scriptOrder = NULL; + /*result->scriptOrder = NULL;*/ result->rules = NULL; result->rulesLength = 0; + result->freeRulesOnClose = FALSE; /* get the version info from UCATableHeader and populate the Collator struct*/ - result->dataInfo.dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ - result->dataInfo.dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ + result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ + result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ + result->dataVersion[2] = 0; + result->dataVersion[3] = 0; result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; result->minUnsafeCP = 0; @@ -972,6 +844,13 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con ucol_updateInternalState(result, status); + /* Normally these will be set correctly later. This is the default if you use UCA or the default. */ + result->ucaRules = NULL; + result->actualLocale = NULL; + result->validLocale = NULL; + result->requestedLocale = NULL; + result->hasRealData = FALSE; // real data lives in .dat file... + result->freeImageOnClose = FALSE; return result; } @@ -997,30 +876,30 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con */ /** - * Function used to: - * a) collapse the 2 different Han ranges from UCA into one (in the right order), and - * b) bump any non-CJK characters by 10FFFF. - * The relevant blocks are: - * A: 4E00..9FFF; CJK Unified Ideographs - * F900..FAFF; CJK Compatibility Ideographs - * B: 3400..4DBF; CJK Unified Ideographs Extension A - * 20000..XX; CJK Unified Ideographs Extension B (and others later on) - * As long as - * no new B characters are allocated between 4E00 and FAFF, and - * no new A characters are outside of this range, - * (very high probability) this simple code will work. - * The reordered blocks are: - * Block1 is CJK - * Block2 is CJK_COMPAT_USED - * Block3 is CJK_A - * (all contiguous) - * Any other CJK gets its normal code point - * Any non-CJK gets +10FFFF - * When we reorder Block1, we make sure that it is at the very start, - * so that it will use a 3-byte form. - * Warning: the we only pick up the compatibility characters that are - * NOT decomposed, so that block is smaller! - */ + * Function used to: + * a) collapse the 2 different Han ranges from UCA into one (in the right order), and + * b) bump any non-CJK characters by 10FFFF. + * The relevant blocks are: + * A: 4E00..9FFF; CJK Unified Ideographs + * F900..FAFF; CJK Compatibility Ideographs + * B: 3400..4DBF; CJK Unified Ideographs Extension A + * 20000..XX; CJK Unified Ideographs Extension B (and others later on) + * As long as + * no new B characters are allocated between 4E00 and FAFF, and + * no new A characters are outside of this range, + * (very high probability) this simple code will work. + * The reordered blocks are: + * Block1 is CJK + * Block2 is CJK_COMPAT_USED + * Block3 is CJK_A + * (all contiguous) + * Any other CJK gets its normal code point + * Any non-CJK gets +10FFFF + * When we reorder Block1, we make sure that it is at the very start, + * so that it will use a 3-byte form. + * Warning: the we only pick up the compatibility characters that are + * NOT decomposed, so that block is smaller! + */ // CONSTANTS static const UChar32 @@ -1154,7 +1033,7 @@ uprv_uca_getImplicitFromRaw(UChar32 cp) { } } -U_CAPI uint32_t U_EXPORT2 +static uint32_t U_EXPORT2 uprv_uca_getImplicitPrimary(UChar32 cp) { //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); @@ -1176,41 +1055,44 @@ U_CAPI UChar32 U_EXPORT2 uprv_uca_getRawFromImplicit(uint32_t implicit) { UChar32 result; UChar32 b3 = implicit & 0xFF; - implicit >>= 8; - UChar32 b2 = implicit & 0xFF; - implicit >>= 8; - UChar32 b1 = implicit & 0xFF; - implicit >>= 8; - UChar32 b0 = implicit & 0xFF; + UChar32 b2 = (implicit >> 8) & 0xFF; + UChar32 b1 = (implicit >> 16) & 0xFF; + UChar32 b0 = (implicit >> 24) & 0xFF; // simple parameter checks if (b0 < min3Primary || b0 > max4Primary - || b1 < minTrail || b1 > maxTrail) return -1; + || b1 < minTrail || b1 > maxTrail) + return -1; // normal offsets b1 -= minTrail; // take care of the final values, and compose if (b0 < min4Primary) { - if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1; + if (b2 < minTrail || b2 > max3Trail || b3 != 0) + return -1; b2 -= minTrail; UChar32 remainder = b2 % final3Multiplier; - if (remainder != 0) return -1; + if (remainder != 0) + return -1; b0 -= min3Primary; b2 /= final3Multiplier; result = ((b0 * medialCount) + b1) * final3Count + b2; } else { - if (b2 < minTrail || b2 > maxTrail - || b3 < minTrail || b3 > max4Trail) return -1; + if (b2 < minTrail || b2 > maxTrail + || b3 < minTrail || b3 > max4Trail) + return -1; b2 -= minTrail; b3 -= minTrail; UChar32 remainder = b3 % final4Multiplier; - if (remainder != 0) return -1; + if (remainder != 0) + return -1; b3 /= final4Multiplier; b0 -= min4Primary; result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; } // final check - if (result < 0 || result > UCOL_MAX_INPUT) return -1; + if (result < 0 || result > UCOL_MAX_INPUT) + return -1; return result; } @@ -1237,15 +1119,10 @@ static void initImplicitConstants(int minPrimary, int maxPrimary, int gap3, int primaries3count, UErrorCode *status) { // some simple parameter checks - if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return; - }; - if (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return; - }; - if (primaries3count < 1) { + if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) + || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) + || (primaries3count < 1)) + { *status = U_ILLEGAL_ARGUMENT_ERROR; return; }; @@ -1282,11 +1159,8 @@ static void initImplicitConstants(int minPrimary, int maxPrimary, int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); - //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte); int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); - //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte); int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; - //if (DEBUG) System.out.println("expandedGap: " + gap4); if (gap4 < 1) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; @@ -1294,107 +1168,16 @@ static void initImplicitConstants(int minPrimary, int maxPrimary, final4Multiplier = gap4 + 1; final4Count = neededPerFinalByte; max4Trail = minTrail + (final4Count - 1) * final4Multiplier; - /* - if (DEBUG) { - System.out.println("final4Count: " + final4Count); - for (int counter = 0; counter <= final4Count; ++counter) { - int value = minTrail + (1 + counter)*final4Multiplier; - System.out.println(counter + "\t" + value + "\t" + Utility.hex(value)); - } - } - */ } /** * Supply parameters for generating implicit CEs */ U_CAPI void U_EXPORT2 -uprv_uca_initImplicitConstants(int32_t minPrimary, int32_t maxPrimary, UErrorCode *status) { +uprv_uca_initImplicitConstants(UErrorCode *status) { // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. - initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); -} - -U_CDECL_BEGIN -static UBool U_CALLCONV -ucol_cleanup(void) -{ - if (UCA_DATA_MEM) { - udata_close(UCA_DATA_MEM); - UCA_DATA_MEM = NULL; - } - if (_staticUCA) { - ucol_close(_staticUCA); - _staticUCA = NULL; - } - fcdTrieIndex = NULL; - return TRUE; -} -U_CDECL_END - -/* do not close UCA returned by ucol_initUCA! */ -UCollator * -ucol_initUCA(UErrorCode *status) { - if(U_FAILURE(*status)) { - return NULL; - } - umtx_lock(NULL); - UBool f = (_staticUCA == NULL); - umtx_unlock(NULL); - - if(f) { - UCollator *newUCA = NULL; - UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status); - - if(U_FAILURE(*status)) { - if (result) { - udata_close(result); - } - uprv_free(newUCA); - } - - // init FCD data - if (fcdTrieIndex == NULL) { - fcdTrieIndex = unorm_getFCDTrie(status); - ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); - } - - if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ - newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, newUCA, status); - if(U_SUCCESS(*status)){ - newUCA->rb = NULL; - newUCA->elements = NULL; - newUCA->validLocale = NULL; - newUCA->requestedLocale = NULL; - newUCA->hasRealData = FALSE; // real data lives in .dat file... - newUCA->freeImageOnClose = FALSE; - umtx_lock(NULL); - if(_staticUCA == NULL) { - _staticUCA = newUCA; - UCA_DATA_MEM = result; - result = NULL; - newUCA = NULL; - } - umtx_unlock(NULL); - - if(newUCA != NULL) { - udata_close(result); - uprv_free(newUCA); - } - else { - ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); - } - // Initalize variables for implicit generation - const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)_staticUCA->image + _staticUCA->image->UCAConsts); - uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN, UCAconsts->UCA_PRIMARY_IMPLICIT_MAX, status); - _staticUCA->mapping->getFoldingOffset = _getFoldingOffset; - }else{ - udata_close(result); - uprv_free(newUCA); - _staticUCA= NULL; - } - } - } - return _staticUCA; + //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); + initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); } @@ -1441,20 +1224,20 @@ void collIterNormalize(collIterate *collationSource) return; } - if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { - collationSource->flags |= UCOL_ITER_ALLOCATED; - } - collationSource->pos = collationSource->writableBuffer; - collationSource->origFlags = collationSource->flags; - collationSource->flags |= UCOL_ITER_INNORMBUF; - collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); + if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { + collationSource->flags |= UCOL_ITER_ALLOCATED; + } + collationSource->pos = collationSource->writableBuffer; + collationSource->origFlags = collationSource->flags; + collationSource->flags |= UCOL_ITER_INNORMBUF; + collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); } // This function takes the iterator and extracts normalized stuff up to the next boundary // It is similar in the end results to the collIterNormalize, but for the cases when we // use an iterator -static +/*static inline void normalizeIterator(collIterate *collationSource) { UErrorCode status = U_ZERO_ERROR; UBool wasNormalized = FALSE; @@ -1489,7 +1272,7 @@ inline void normalizeIterator(collIterate *collationSource) { collationSource->origFlags = collationSource->flags; collationSource->flags |= UCOL_ITER_INNORMBUF; collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); -} +}*/ /* Incremental FCD check and normalize */ @@ -1525,8 +1308,8 @@ inline UBool collIterFCD(collIterate *collationSource) { /* trie access */ fcd = unorm_getFCD16(fcdTrieIndex, c); if (fcd != 0) { - if (UTF_IS_FIRST_SURROGATE(c)) { - if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) { + if (U16_IS_LEAD(c)) { + if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) { ++srcP; fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2); } else { @@ -1546,8 +1329,8 @@ inline UBool collIterFCD(collIterate *collationSource) { c = *srcP++; /* trie access */ fcd = unorm_getFCD16(fcdTrieIndex, c); - if (fcd != 0 && UTF_IS_FIRST_SURROGATE(c)) { - if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) { + if (fcd != 0 && U16_IS_LEAD(c)) { + if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) { ++srcP; fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2); } else { @@ -1580,6 +1363,9 @@ inline UBool collIterFCD(collIterate *collationSource) { /* */ /****************************************************************************/ +static uint32_t getImplicit(UChar32 cp, collIterate *collationSource); +static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource); + /* there should be a macro version of this function in the header file */ /* This is the first function that tries to fetch a collation element */ /* If it's not succesfull or it encounters a more difficult situation */ @@ -1588,18 +1374,19 @@ static inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { uint32_t order = 0; if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */ - order = *(collationSource->toReturn++); /* if so, return them */ - if(collationSource->CEpos == collationSource->toReturn) { - collationSource->CEpos = collationSource->toReturn = collationSource->CEs; - } - return order; + order = *(collationSource->toReturn++); /* if so, return them */ + if(collationSource->CEpos == collationSource->toReturn) { + collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs; + } + return order; } UChar ch = 0; + collationSource->offsetReturn = NULL; for (;;) /* Loop handles case when incremental normalize switches */ { /* to or from the side buffer / original string, and we */ - /* need to start again to get the next character. */ + /* need to start again to get the next character. */ if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) { @@ -1627,7 +1414,7 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou else if(collationSource->flags & UCOL_USE_ITERATOR) { UChar32 iterCh = collationSource->iterator->next(collationSource->iterator); if(iterCh == U_SENTINEL) { - return UCOL_NO_MORE_CES; + return UCOL_NO_MORE_CES; } ch = (UChar)iterCh; } @@ -1663,11 +1450,15 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou } if(collationSource->flags&UCOL_HIRAGANA_Q) { - if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) { - collationSource->flags |= UCOL_WAS_HIRAGANA; - } else { - collationSource->flags &= ~UCOL_WAS_HIRAGANA; - } + /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag + * based on whether the previous codepoint was Hiragana or Katakana. + */ + if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) || + ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) { + collationSource->flags |= UCOL_WAS_HIRAGANA; + } else { + collationSource->flags &= ~UCOL_WAS_HIRAGANA; + } } // We've got a character. See if there's any fcd and/or normalization stuff to do. @@ -1717,29 +1508,32 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou } // end for (;;) - if (ch <= 0xFF) { - /* For latin-1 characters we never need to fall back to the UCA table */ - /* because all of the UCA data is replicated in the latinOneMapping array */ - order = coll->latinOneMapping[ch]; - if (order > UCOL_NOT_FOUND) { - order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); - } - } - else - { - order = UTRIE_GET32_FROM_LEAD(coll->mapping, ch); - if(order > UCOL_NOT_FOUND) { /* if a CE is special */ - order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */ - } - if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */ + if (ch <= 0xFF) { + /* For latin-1 characters we never need to fall back to the UCA table */ + /* because all of the UCA data is replicated in the latinOneMapping array */ + order = coll->latinOneMapping[ch]; + if (order > UCOL_NOT_FOUND) { + order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); + } + } + else + { + order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); + if(order > UCOL_NOT_FOUND) { /* if a CE is special */ + order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */ + } + if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */ /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */ - order = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch); + order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ - order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status); + order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status); } - } - } + } + } + if(order == UCOL_NOT_FOUND) { + order = getImplicit(ch, collationSource); + } return order; /* return the CE */ } @@ -1781,7 +1575,8 @@ void collPrevIterNormalize(collIterate *data) data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) * sizeof(UChar)); if(data->writableBuffer == NULL) { // something is wrong here, return - return; + data->writableBufSize = 0; // Reset writableBufSize + return; } data->flags |= UCOL_ITER_ALLOCATED; /* to handle the zero termination */ @@ -1797,6 +1592,83 @@ void collPrevIterNormalize(collIterate *data) unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm, normLen, &status); + if (data->offsetBuffer == NULL) { + int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE; + + data->offsetBufferSize = len; + data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len); + data->offsetStore = data->offsetBuffer; + } else if(data->offsetBufferSize < (int32_t) normLen) { + int32_t storeIX = data->offsetStore - data->offsetBuffer; + int32_t *tob = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1)); + + if (tob != NULL) { + data->offsetBuffer = tob; + data->offsetStore = &data->offsetBuffer[storeIX]; + data->offsetBufferSize = normLen + 1; + } + } + + /* + * The usual case at this point is that we've got a base + * character followed by marks that were normalized. If + * fcdPosition is NULL, that means that we backed up to + * the beginning of the string and there's no base character. + * + * Forward processing will usually normalize when it sees + * the first mark, so that mark will get it's natural offset + * and the rest will get the offset of the character following + * the marks. The base character will also get its natural offset. + * + * We write the offset of the base character, if there is one, + * followed by the offset of the first mark and then the offsets + * of the rest of the marks. + */ + int32_t firstMarkOffset = 0; + int32_t trailOffset = data->pos - data->string + 1; + int32_t trailCount = normLen - 1; + + if (data->fcdPosition != NULL) { + int32_t baseOffset = data->fcdPosition - data->string; + UChar baseChar = *data->fcdPosition; + + firstMarkOffset = baseOffset + 1; + + /* + * If the base character is the start of a contraction, forward processing + * will normalize the marks while checking for the contraction, which means + * that the offset of the first mark will the same as the other marks. + * + * **** THIS IS PROBABLY NOT A COMPLETE TEST **** + */ + if (baseChar >= 0x100) { + uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar); + + if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { + baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar); + } + + if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) { + firstMarkOffset = trailOffset; + } + } + + *(data->offsetStore++) = baseOffset; + } + + *(data->offsetStore++) = firstMarkOffset; + + for (int32_t i = 0; i < trailCount; i += 1) { + *(data->offsetStore++) = trailOffset; + } + + data->offsetRepeatValue = trailOffset; + + data->offsetReturn = data->offsetStore - 1; + if (data->offsetReturn == data->offsetBuffer) { + data->offsetStore = data->offsetBuffer; + } + data->pos = data->writableBuffer + data->writableBufSize; data->origFlags = data->flags; data->flags |= UCOL_ITER_INNORMBUF; @@ -1834,9 +1706,9 @@ inline UBool collPrevIterFCD(collIterate *data) /* Get the trailing combining class of the current character. */ c = *--src; - if (!UTF_IS_SURROGATE(c)) { + if (!U16_IS_SURROGATE(c)) { fcd = unorm_getFCD16(fcdTrieIndex, c); - } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) { + } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) { --src; fcd = unorm_getFCD16(fcdTrieIndex, c2); if (fcd != 0) { @@ -1861,9 +1733,9 @@ inline UBool collPrevIterFCD(collIterate *data) } c = *--src; - if (!UTF_IS_SURROGATE(c)) { + if (!U16_IS_SURROGATE(c)) { fcd = unorm_getFCD16(fcdTrieIndex, c); - } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) { + } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) { --src; fcd = unorm_getFCD16(fcdTrieIndex, c2); if (fcd != 0) { @@ -1898,20 +1770,20 @@ inline UBool collPrevIterFCD(collIterate *data) */ inline static UChar peekCharacter(collIterate *source, int32_t offset) { - if(source->pos != NULL) { - return *(source->pos + offset); - } else if(source->iterator != NULL) { - if(offset != 0) { - source->iterator->move(source->iterator, offset, UITER_CURRENT); - UChar toReturn = (UChar)source->iterator->next(source->iterator); - source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); - return toReturn; + if(source->pos != NULL) { + return *(source->pos + offset); + } else if(source->iterator != NULL) { + if(offset != 0) { + source->iterator->move(source->iterator, offset, UITER_CURRENT); + UChar toReturn = (UChar)source->iterator->next(source->iterator); + source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); + return toReturn; + } else { + return (UChar)source->iterator->current(source->iterator); + } } else { - return (UChar)source->iterator->current(source->iterator); + return (UChar)U_SENTINEL; } - } else { - return (UChar)U_SENTINEL; - } } /** @@ -1922,33 +1794,33 @@ UChar peekCharacter(collIterate *source, int32_t offset) { */ static inline UBool isAtStartPrevIterate(collIterate *data) { - if(data->pos == NULL && data->iterator != NULL) { - return !data->iterator->hasPrevious(data->iterator); - } - //return (collIter_bos(data)) || - return (data->pos == data->string) || - ((data->flags & UCOL_ITER_INNORMBUF) && - *(data->pos - 1) == 0 && data->fcdPosition == NULL); + if(data->pos == NULL && data->iterator != NULL) { + return !data->iterator->hasPrevious(data->iterator); + } + //return (collIter_bos(data)) || + return (data->pos == data->string) || + ((data->flags & UCOL_ITER_INNORMBUF) && + *(data->pos - 1) == 0 && data->fcdPosition == NULL); } static inline void goBackOne(collIterate *data) { # if 0 - // somehow, it looks like we need to keep iterator synced up - // at all times, as above. - if(data->pos) { - data->pos--; - } - if(data->iterator) { - data->iterator->previous(data->iterator); - } + // somehow, it looks like we need to keep iterator synced up + // at all times, as above. + if(data->pos) { + data->pos--; + } + if(data->iterator) { + data->iterator->previous(data->iterator); + } #endif - if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { - data->iterator->previous(data->iterator); - } - if(data->pos) { - data->pos --; - } + if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { + data->iterator->previous(data->iterator); + } + if(data->pos) { + data->pos --; + } } /** @@ -1967,15 +1839,32 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, UErrorCode *status) { uint32_t result = (uint32_t)UCOL_NULLORDER; - if (data->toReturn > data->CEs) { - data->toReturn --; + + if (data->offsetReturn != NULL) { + if (data->offsetRepeatCount > 0) { + data->offsetRepeatCount -= 1; + } else { + if (data->offsetReturn == data->offsetBuffer) { + data->offsetReturn = NULL; + data->offsetStore = data->offsetBuffer; + } else { + data->offsetReturn -= 1; + } + } + } + + if ((data->extendCEs && data->toReturn > data->extendCEs) || + (!data->extendCEs && data->toReturn > data->CEs)) + { + data->toReturn -= 1; result = *(data->toReturn); - if (data->CEs == data->toReturn) { + if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) { data->CEpos = data->toReturn; } } else { UChar ch = 0; + /* Loop handles case when incremental normalize switches to or from the side buffer / original string, and we need to start again to get the @@ -2014,15 +1903,18 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, Because pointer points to the last accessed character, hence we have to increment it by one here. */ - if (data->fcdPosition == NULL) { + data->flags = data->origFlags; + data->offsetRepeatValue = 0; + + if (data->fcdPosition == NULL) { data->pos = data->string; return UCOL_NO_MORE_CES; } else { data->pos = data->fcdPosition + 1; } - data->flags = data->origFlags; - continue; + + continue; } } @@ -2084,80 +1976,46 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, */ if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) { result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status); - } - else { - // TODO: fix me for THAI - I reference *(data->pos-1) - if ((data->flags & UCOL_ITER_INNORMBUF) == 0 && - /*UCOL_ISTHAIBASECONSONANT(ch) &&*/ // This is from the old specs - we now rearrange unconditionally - // makes sure that we're not at the beggining of the string - //data->pos > data->string && - !collIter_bos(data) && - UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) - //UCOL_ISTHAIPREVOWEL(*(data->pos -1))) + } else { + if (ch <= 0xFF) { + result = coll->latinOneMapping[ch]; + } + else { + result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); + } + if (result > UCOL_NOT_FOUND) { + result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status); + } + if (result == UCOL_NOT_FOUND) { // Not found in master list + if (!isAtStartPrevIterate(data) && + ucol_contractionEndCP(ch, data->coll)) { - collIterateState entryState; - backupState(data, &entryState); - // we have to check if the previous character is also Thai - // if not, we can just set the result - goBackOne(data); - if(collIter_bos(data) || !UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) { - loadState(data, &entryState, FALSE); - result = UCOL_THAI; - } else { // previous is also reordered - // we need to go back as long as they are being reordered - // count over the range of reorderable characters and see - // if there is an even or odd number of them - // if even, we should not reorder. If odd we should reorder. - int32_t noReordered = 1; // the one we already detected - while(!collIter_bos(data) && UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) { - noReordered++; - goBackOne(data); - } - if(noReordered & 1) { // odd number of reorderables - result = UCOL_THAI; - } else { - result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch); - } - loadState(data, &entryState, FALSE); + result = UCOL_CONTRACTION; + } else { + if(coll->UCA) { + result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); } } - else if (ch <= 0xFF) { - result = coll->latinOneMapping[ch]; - //if (result > UCOL_NOT_FOUND) { - //result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status); - //} - } - else { - /*result = ucmpe32_get(coll->mapping, ch);*/ - result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch); - } - if (result > UCOL_NOT_FOUND) { - result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status); - } - if (result == UCOL_NOT_FOUND) { - if (!isAtStartPrevIterate(data) && - ucol_contractionEndCP(ch, data->coll)) { - result = UCOL_CONTRACTION; - } - else { - /*result = ucmpe32_get(UCA->mapping, ch);*/ - if(coll->UCA) { - result = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch); - } - } - if (result > UCOL_NOT_FOUND && coll->UCA) { - result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status); - } + if (result > UCOL_NOT_FOUND) { + if(coll->UCA) { + result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status); + } } } } + + if(result == UCOL_NOT_FOUND) { + result = getPrevImplicit(ch, data); + } + } + return result; } /* ucol_getPrevCE, out-of-line version for use from other files. */ -U_CAPI uint32_t U_EXPORT2 +U_CFUNC uint32_t U_EXPORT2 ucol_getPrevCE(const UCollator *coll, collIterate *data, UErrorCode *status) { return ucol_IGetPrevCE(coll, data, status); @@ -2165,14 +2023,14 @@ ucol_getPrevCE(const UCollator *coll, collIterate *data, /* this should be connected to special Jamo handling */ -U_CAPI uint32_t U_EXPORT2 +U_CFUNC uint32_t U_EXPORT2 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { - collIterate colIt; - uint32_t order; - IInit_collIterate(coll, &u, 1, &colIt); - order = ucol_IGetNextCE(coll, &colIt, status); - /*UCOL_GETNEXTCE(order, coll, colIt, status);*/ - return order; + collIterate colIt; + uint32_t order; + IInit_collIterate(coll, &u, 1, &colIt); + order = ucol_IGetNextCE(coll, &colIt, status); + /*UCOL_GETNEXTCE(order, coll, colIt, status);*/ + return order; } /** @@ -2186,9 +2044,9 @@ ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { static inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch) { - uint32_t size = data->writableBufSize; - UChar *newbuffer; - const uint32_t incsize = 5; + uint32_t size = data->writableBufSize; + UChar *newbuffer; + static const uint32_t INCSIZE = 5; if ((data->writableBuffer + size) > (pNull + 1)) { *pNull = ch; @@ -2200,19 +2058,19 @@ inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch) buffer will always be null terminated at the end. giving extra space since it is likely that more characters will be added. */ - size += incsize; + size += INCSIZE; newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size); if(newbuffer != NULL) { // something wrong, but no status - uprv_memcpy(newbuffer, data->writableBuffer, - data->writableBufSize * sizeof(UChar)); + uprv_memcpy(newbuffer, data->writableBuffer, + data->writableBufSize * sizeof(UChar)); - freeHeapWritableBuffer(data); - data->writableBufSize = size; - data->writableBuffer = newbuffer; + freeHeapWritableBuffer(data); + data->writableBufSize = size; + data->writableBuffer = newbuffer; - newbuffer = newbuffer + data->writableBufSize; - *newbuffer = ch; - *(newbuffer + 1) = 0; + newbuffer = newbuffer + data->writableBufSize; + *newbuffer = ch; + *(newbuffer + 1) = 0; } return newbuffer; } @@ -2300,6 +2158,8 @@ inline void normalizeNextContraction(collIterate *data) data->writableBuffer = temp; data->writableBufSize = size; data->flags |= UCOL_ITER_ALLOCATED; + } else { + return; // Avoid writing past bound of buffer->writableBuffer. } } @@ -2390,6 +2250,10 @@ inline UChar getNextNormalizedChar(collIterate *data) /* at the end of the string, dump it into the normalizer */ data->pos = insertBufferEnd(data, data->pos, *(data->fcdPosition)) + 1; + // Check if data->pos received a null pointer + if (data->pos == NULL) { + return (UChar)-1; // Return to indicate error. + } return *(data->fcdPosition ++); } pEndWritableBuffer = data->pos; @@ -2435,6 +2299,10 @@ inline UChar getNextNormalizedChar(collIterate *data) int32_t length = data->fcdPosition - data->pos + 1; data->pos = insertBufferEnd(data, pEndWritableBuffer, data->pos - 1, length); + // Check if data->pos received a null pointer + if (data->pos == NULL) { + return (UChar)-1; // Return to indicate error. + } return *(data->pos ++); } } @@ -2445,6 +2313,10 @@ inline UChar getNextNormalizedChar(collIterate *data) appended to the buffer. */ data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1; + // Check if data->pos received a null pointer + if (data->pos == NULL) { + return (UChar)-1; // Return to indicate error. + } } /* points back to the pos in string */ @@ -2489,7 +2361,8 @@ inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer, source->writableBuffer = (UChar *)uprv_malloc((length + 1) * sizeof(UChar)); if(source->writableBuffer == NULL) { - return; + source->writableBufSize = 0; // Reset size + return; } source->writableBufSize = length; } @@ -2524,9 +2397,9 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source, backupState(source, &discState); //*tempdb = *(source->pos - 1); - *tempdb = peekCharacter(source, -1); - tempdb ++; - while (TRUE) { + *tempdb = peekCharacter(source, -1); + tempdb++; + for (;;) { UChar *UCharOffset; UChar schar, tchar; @@ -2618,21 +2491,19 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source, static inline UBool isNonChar(UChar32 cp) { - if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)) { - return TRUE; - } - return FALSE; + return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)); } /* now uses Mark's getImplicitPrimary code */ static inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) { - if(isNonChar(cp)) { - return 0; - } - uint32_t r = uprv_uca_getImplicitPrimary(cp); - *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0; - return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order' + if(isNonChar(cp)) { + return 0; + } + uint32_t r = uprv_uca_getImplicitPrimary(cp); + *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0; + collationSource->offsetRepeatCount += 1; + return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order' } /** @@ -2646,10 +2517,10 @@ inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) { static inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch) { - uint32_t size = data->writableBufSize; - UChar *end; - UChar *newbuffer; - const uint32_t incsize = 5; + uint32_t size = data->writableBufSize; + UChar *end; + UChar *newbuffer; + static const uint32_t INCSIZE = 5; if (pNull > data->writableBuffer + 1) { *pNull = ch; @@ -2661,12 +2532,12 @@ inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch) buffer will always be null terminated infront. giving extra space since it is likely that more characters will be added. */ - size += incsize; + size += INCSIZE; newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size); if(newbuffer == NULL) { - return NULL; + return NULL; } - end = newbuffer + incsize; + end = newbuffer + INCSIZE; uprv_memcpy(end, data->writableBuffer, data->writableBufSize * sizeof(UChar)); *end = ch; @@ -2689,12 +2560,10 @@ inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch) * @param data collation iterator data */ static -inline void normalizePrevContraction(collIterate *data) +inline void normalizePrevContraction(collIterate *data, UErrorCode *status) { - UChar *buffer = data->writableBuffer; - uint32_t buffersize = data->writableBufSize; uint32_t nulltermsize; - UErrorCode status = U_ZERO_ERROR; + UErrorCode localstatus = U_ZERO_ERROR; UChar *pEnd = data->pos + 1; /* End normalize + 1 */ UChar *pStart; uint32_t normLen; @@ -2705,12 +2574,12 @@ inline void normalizePrevContraction(collIterate *data) normalization buffer not used yet, we'll pull down the next character into the end of the buffer */ - *(buffer + (buffersize - 1)) = *(data->pos + 1); - nulltermsize = buffersize - 1; + *(data->writableBuffer + (data->writableBufSize - 1)) = *(data->pos + 1); + nulltermsize = data->writableBufSize - 1; } else { - nulltermsize = buffersize; - UChar *temp = buffer + (nulltermsize - 1); + nulltermsize = data->writableBufSize; + UChar *temp = data->writableBuffer + (nulltermsize - 1); while (*(temp --) != 0) { nulltermsize --; } @@ -2724,31 +2593,32 @@ inline void normalizePrevContraction(collIterate *data) pStart = data->fcdPosition + 1; } - normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0, - &status); + normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, data->writableBuffer, 0, + &localstatus); if (nulltermsize <= normLen) { - uint32_t size = buffersize - nulltermsize + normLen + 1; + uint32_t size = data->writableBufSize - nulltermsize + normLen + 1; UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar)); - if(temp != NULL) { - nulltermsize = normLen + 1; - uprv_memcpy(temp + normLen, buffer, - sizeof(UChar) * (buffersize - nulltermsize)); - freeHeapWritableBuffer(data); - data->writableBuffer = temp; - data->writableBufSize = size; + if (temp == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return; } + nulltermsize = normLen + 1; + uprv_memcpy(temp + normLen, data->writableBuffer, + sizeof(UChar) * (data->writableBufSize - nulltermsize)); + freeHeapWritableBuffer(data); + data->writableBuffer = temp; + data->writableBufSize = size; } - status = U_ZERO_ERROR; /* this puts the null termination infront of the normalized string instead of the end */ - pStartNorm = buffer + (nulltermsize - normLen); + pStartNorm = data->writableBuffer + (nulltermsize - normLen); *(pStartNorm - 1) = 0; unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen, - &status); + status); data->pos = data->writableBuffer + nulltermsize; data->origFlags = data->flags; @@ -2770,7 +2640,7 @@ inline void normalizePrevContraction(collIterate *data) * @return previous character */ static -inline UChar getPrevNormalizedChar(collIterate *data) +inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status) { UChar prevch; UChar ch; @@ -2793,7 +2663,7 @@ inline UChar getPrevNormalizedChar(collIterate *data) } start = data->pos; - if (data->flags & UCOL_ITER_HASLEN) { + if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) { /* in data string */ if ((start - 1) == data->string) { return *(start - 1); @@ -2832,7 +2702,7 @@ inline UChar getPrevNormalizedChar(collIterate *data) UChar *backuppos = data->pos; data->pos = start; if (collPrevIterFCD(data)) { - normalizePrevContraction(data); + normalizePrevContraction(data, status); return *(data->pos - 1); } data->pos = backuppos; @@ -2854,761 +2724,614 @@ inline UChar getPrevNormalizedChar(collIterate *data) /* This function handles the special CEs like contractions, expansions, surrogates, Thai */ /* It is called by getNextCE */ +/* The following should be even */ +#define UCOL_MAX_DIGITS_FOR_NUMBER 254 + uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) { - collIterateState entryState; - backupState(source, &entryState); - UChar32 cp = ch; - - for (;;) { - // This loop will repeat only in the case of contractions, and only when a contraction - // is found and the first CE resulting from that contraction is itself a special - // (an expansion, for example.) All other special CE types are fully handled the - // first time through, and the loop exits. - - const uint32_t *CEOffset = NULL; - switch(getCETag(CE)) { - case NOT_FOUND_TAG: - /* This one is not found, and we'll let somebody else bother about it... no more games */ - return CE; - case SURROGATE_TAG: - /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ - /* two things can happen here: next code point can be a trailing surrogate - we will use it */ - /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ - /* we return 0 (completely ignorable - per UCA specification */ - { - UChar trail; - collIterateState state; - backupState(source, &state); - if (collIter_eos(source) || !(UTF16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { - // we chould have stepped one char forward and it might have turned that it - // was not a trail surrogate. In that case, we have to backup. - loadState(source, &state, TRUE); - return 0; - } else { - /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ - CE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, CE&0xFFFFFF, trail); - if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. - // We need to backup - loadState(source, &state, TRUE); + collIterateState entryState; + backupState(source, &entryState); + UChar32 cp = ch; + + for (;;) { + // This loop will repeat only in the case of contractions, and only when a contraction + // is found and the first CE resulting from that contraction is itself a special + // (an expansion, for example.) All other special CE types are fully handled the + // first time through, and the loop exits. + + const uint32_t *CEOffset = NULL; + switch(getCETag(CE)) { + case NOT_FOUND_TAG: + /* This one is not found, and we'll let somebody else bother about it... no more games */ return CE; - } - // calculate the supplementary code point value, if surrogate was not tailored - cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); - } - } - break; - case THAI_TAG: - /* Thai/Lao reordering */ - if (((source->flags) & UCOL_ITER_INNORMBUF) /* Already Swapped || */ - || collIter_eos(source)) /* At end of string. No swap possible */ - { - // Treat Thai as a length one expansion */ - CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ - CE = *CEOffset++; - } - else - { - // Move the prevowel and the following base Consonant into the normalization buffer - // with their order swapped - // Note: this operation might activate the normalization buffer. We have to check for - // that and act accordingly. - UChar thCh = getNextNormalizedChar(source); - UChar32 cp = 0; - if(U16_IS_LEAD(thCh)) { - if(!collIter_eos(source)) { - collIterateState thaiState; - backupState(source, &thaiState); - UChar trailCh = getNextNormalizedChar(source); - if(U16_IS_TRAIL(trailCh)) { - cp = U16_GET_SUPPLEMENTARY(thCh, trailCh); - } else { - loadState(source, &thaiState, TRUE); - cp = (UChar32)thCh; - } - } else { - cp = (UChar32)thCh; - } - } else { - cp = (UChar32)thCh; - } - // Now we have the character that needs to be decomposed - // if the normalizing buffer was not used, we can just use our structure and be happy. - if((source->flags & UCOL_ITER_INNORMBUF) == 0) { - // decompose into writable buffer - int32_t decompLen = unorm_getDecomposition(cp, FALSE, &(source->writableBuffer[1]), UCOL_WRITABLE_BUFFER_SIZE-1); - if(decompLen < 0) { - decompLen = -decompLen; - } - // reorder Thai and the character after it - if(decompLen >= 2 && U16_IS_LEAD(source->writableBuffer[1]) && U16_IS_TRAIL(source->writableBuffer[2])) { - source->writableBuffer[0] = source->writableBuffer[1]; - source->writableBuffer[1] = source->writableBuffer[2]; - source->writableBuffer[2] = ch; - } else { - source->writableBuffer[0] = source->writableBuffer[1]; - source->writableBuffer[1] = ch; - } - // zero terminate, since normalization buffer is always zero terminated - source->writableBuffer[decompLen+1] = 0; // we added the prevowel - if(source->pos) { - source->fcdPosition = source->pos; // Indicate where to continue in main input string - // after exhausting the writableBuffer - } - source->pos = source->writableBuffer; - source->origFlags = source->flags; - source->flags |= UCOL_ITER_INNORMBUF; - source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); - } - else { - // stuff is already normalized... what to do here??? - - // if we are in the normalization buffer, thCh must be in it - // prove by contradiction - // if thCh is not in the normalization buffer, - // that means that trailCh is the normalization buffer - // that means that trailCh is a trail surrogate by the above - // bounding if block, this is a contradiction because there - // are no characters at the moment that decomposes to an - // unmatched surrogate. qed. - if (cp >= 0x10000) { - source->writableBuffer[0] = source->writableBuffer[1]; - source->writableBuffer[1] = source->writableBuffer[2]; - source->writableBuffer[2] = ch; - } - else { - source->writableBuffer[0] = source->writableBuffer[1]; - source->writableBuffer[1] = ch; - } - source->pos = source->writableBuffer; - } - CE = UCOL_IGNORABLE; - } - break; - case SPEC_PROC_TAG: - { - // Special processing is getting a CE that is preceded by a certain prefix - // Currently this is only needed for optimizing Japanese length and iteration marks. - // When we encouter a special processing tag, we go backwards and try to see if - // we have a match. - // Contraction tables are used - so the whole process is not unlike contraction. - // prefix data is stored backwards in the table. - const UChar *UCharOffset; - UChar schar, tchar; - collIterateState prefixState; - backupState(source, &prefixState); - loadState(source, &entryState, TRUE); - goBackOne(source); // We want to look at the point where we entered - actually one - // before that... + case SPEC_PROC_TAG: + { + // Special processing is getting a CE that is preceded by a certain prefix + // Currently this is only needed for optimizing Japanese length and iteration marks. + // When we encouter a special processing tag, we go backwards and try to see if + // we have a match. + // Contraction tables are used - so the whole process is not unlike contraction. + // prefix data is stored backwards in the table. + const UChar *UCharOffset; + UChar schar, tchar; + collIterateState prefixState; + backupState(source, &prefixState); + loadState(source, &entryState, TRUE); + goBackOne(source); // We want to look at the point where we entered - actually one + // before that... + + for(;;) { + // This loop will run once per source string character, for as long as we + // are matching a potential contraction sequence + + // First we position ourselves at the begining of contraction sequence + const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); + if (collIter_bos(source)) { + CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); + break; + } + schar = getPrevNormalizedChar(source, status); + goBackOne(source); - for(;;) { - // This loop will run once per source string character, for as long as we - // are matching a potential contraction sequence + while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ + UCharOffset++; + } - // First we position ourselves at the begining of contraction sequence - const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); - if (collIter_bos(source)) { - CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); - break; - } - schar = getPrevNormalizedChar(source); - goBackOne(source); - - while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ - UCharOffset++; - } + if (schar == tchar) { + // Found the source string char in the table. + // Pick up the corresponding CE from the table. + CE = *(coll->contractionCEs + + (UCharOffset - coll->contractionIndex)); + } + else + { + // Source string char was not in the table. + // We have not found the prefix. + CE = *(coll->contractionCEs + + (ContractionStart - coll->contractionIndex)); + } - if (schar == tchar) { - // Found the source string char in the table. - // Pick up the corresponding CE from the table. - CE = *(coll->contractionCEs + - (UCharOffset - coll->contractionIndex)); - } - else - { - // if there is a completely ignorable code point in the middle of - // a prefix, we need to act as if it's not there - // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) - // lone surrogates cannot be set to zero as it would break other processing - uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar); - // it's easy for BMP code points - if(isZeroCE == 0) { - continue; - } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) { - // for supplementary code points, we have to check the next one - // situations where we are going to ignore - // 1. beginning of the string: schar is a lone surrogate - // 2. schar is a lone surrogate - // 3. schar is a trail surrogate in a valid surrogate sequence - // that is explicitly set to zero. - if (!collIter_bos(source)) { - UChar lead; - if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) { - isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead); - if(getCETag(isZeroCE) == SURROGATE_TAG) { - uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar); - if(finalCE == 0) { - // this is a real, assigned completely ignorable code point - goBackOne(source); - continue; - } + if(!isPrefix(CE)) { + // The source string char was in the contraction table, and the corresponding + // CE is not a prefix CE. We found the prefix, break + // out of loop, this CE will end up being returned. This is the normal + // way out of prefix handling when the source actually contained + // the prefix. + break; } - } else { - // lone surrogate, completely ignorable - continue; - } - } else { - // lone surrogate at the beggining, completely ignorable - continue; } - } - // Source string char was not in the table. - // We have not found the prefix. - CE = *(coll->contractionCEs + - (ContractionStart - coll->contractionIndex)); - } - - if(!isPrefix(CE)) { - // The source string char was in the contraction table, and the corresponding - // CE is not a prefix CE. We found the prefix, break - // out of loop, this CE will end up being returned. This is the normal - // way out of prefix handling when the source actually contained - // the prefix. - break; - } - } - if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue - loadState(source, &prefixState, TRUE); - if(source->origFlags & UCOL_USE_ITERATOR) { - source->flags = source->origFlags; - } - } else { // prefix search was a failure, we have to backup all the way to the start - loadState(source, &entryState, TRUE); - } - break; - } - case CONTRACTION_TAG: - { - /* This should handle contractions */ - collIterateState state; - backupState(source, &state); - uint32_t firstCE = UCOL_NOT_FOUND; - const UChar *UCharOffset; - UChar schar, tchar; - - for (;;) { - /* This loop will run once per source string character, for as long as we */ - /* are matching a potential contraction sequence */ - - /* First we position ourselves at the begining of contraction sequence */ - const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); - - if (collIter_eos(source)) { - // Ran off the end of the source string. - CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); - // So we'll pick whatever we have at the point... - if (CE == UCOL_NOT_FOUND) { - // back up the source over all the chars we scanned going into this contraction. - CE = firstCE; - loadState(source, &state, TRUE); - if(source->origFlags & UCOL_USE_ITERATOR) { - source->flags = source->origFlags; + if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue + loadState(source, &prefixState, TRUE); + if(source->origFlags & UCOL_USE_ITERATOR) { + source->flags = source->origFlags; + } + } else { // prefix search was a failure, we have to backup all the way to the start + loadState(source, &entryState, TRUE); } + break; } - break; - } - - uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ - uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); - - schar = getNextNormalizedChar(source); - while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ - UCharOffset++; - } - - if (schar == tchar) { - // Found the source string char in the contraction table. - // Pick up the corresponding CE from the table. - CE = *(coll->contractionCEs + - (UCharOffset - coll->contractionIndex)); - } - else - { - // if there is a completely ignorable code point in the middle of - // contraction, we need to act as if it's not there - uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar); - // it's easy for BMP code points - if(isZeroCE == 0) { - continue; - } else if(UTF_IS_LEAD(schar)) { - if(!collIter_eos(source)) { + case CONTRACTION_TAG: + { + /* This should handle contractions */ + collIterateState state; backupState(source, &state); - UChar trail = getNextNormalizedChar(source); - if(UTF_IS_TRAIL(trail)) { // do stuff with trail - if(getCETag(isZeroCE) == SURROGATE_TAG) { - uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, trail); - if(finalCE == 0) { - continue; - } - } - } else { - // broken surrogate sequence, thus completely ignorable - loadState(source, &state, TRUE); - continue; - } - loadState(source, &state, TRUE); - } else { // no more characters, so broken surrogate pair... - // this contraction will ultimately fail, but not because of us - continue; - } - } // else if(UTF_IS_LEAD(schar)) - - // Source string char was not in contraction table. - // Unless we have a discontiguous contraction, we have finished - // with this contraction. - uint8_t sCC; - if (schar < 0x300 || - maxCC == 0 || - (sCC = i_getCombiningClass(schar, coll)) == 0 || - sCC>maxCC || - (allSame != 0 && sCC == maxCC) || - collIter_eos(source)) { - // Contraction can not be discontiguous. - goBackOne(source); // back up the source string by one, - // because the character we just looked at was - // not part of the contraction. */ - CE = *(coll->contractionCEs + - (ContractionStart - coll->contractionIndex)); - } else { - // - // Contraction is possibly discontiguous. - // Scan more of source string looking for a match - // - UChar tempchar; - /* find the next character if schar is not a base character - and we are not yet at the end of the string */ - tempchar = getNextNormalizedChar(source); - goBackOne(source); - if (i_getCombiningClass(tempchar, coll) == 0) { - goBackOne(source); - /* Spit out the last char of the string, wasn't tasty enough */ - CE = *(coll->contractionCEs + - (ContractionStart - coll->contractionIndex)); - } else { - CE = getDiscontiguous(coll, source, ContractionStart); - } - } - } // else after if(schar == tchar) + uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; + const UChar *UCharOffset; + UChar schar, tchar; + + for (;;) { + /* This loop will run once per source string character, for as long as we */ + /* are matching a potential contraction sequence */ + + /* First we position ourselves at the begining of contraction sequence */ + const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); + + if (collIter_eos(source)) { + // Ran off the end of the source string. + CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); + // So we'll pick whatever we have at the point... + if (CE == UCOL_NOT_FOUND) { + // back up the source over all the chars we scanned going into this contraction. + CE = firstCE; + loadState(source, &state, TRUE); + if(source->origFlags & UCOL_USE_ITERATOR) { + source->flags = source->origFlags; + } + } + break; + } - if(CE == UCOL_NOT_FOUND) { - /* The Source string did not match the contraction that we were checking. */ - /* Back up the source position to undo the effects of having partially */ - /* scanned through what ultimately proved to not be a contraction. */ - loadState(source, &state, TRUE); - CE = firstCE; - break; - } + uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ + uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); - if(!isContraction(CE)) { - // The source string char was in the contraction table, and the corresponding - // CE is not a contraction CE. We completed the contraction, break - // out of loop, this CE will end up being returned. This is the normal - // way out of contraction handling when the source actually contained - // the contraction. - break; - } + schar = getNextNormalizedChar(source); + while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ + UCharOffset++; + } + if (schar == tchar) { + // Found the source string char in the contraction table. + // Pick up the corresponding CE from the table. + CE = *(coll->contractionCEs + + (UCharOffset - coll->contractionIndex)); + } + else + { + // Source string char was not in contraction table. + // Unless we have a discontiguous contraction, we have finished + // with this contraction. + // in order to do the proper detection, we + // need to see if we're dealing with a supplementary + /* We test whether the next two char are surrogate pairs. + * This test is done if the iterator is not NULL. + * If there is no surrogate pair, the iterator + * goes back one if needed. */ + UChar32 miss = schar; + if (source->iterator) { + UChar32 surrNextChar; /* the next char in the iteration to test */ + int32_t prevPos; /* holds the previous position before move forward of the source iterator */ + if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { + prevPos = source->iterator->index; + surrNextChar = getNextNormalizedChar(source); + if (U16_IS_TRAIL(surrNextChar)) { + miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); + } else if (prevPos < source->iterator->index){ + goBackOne(source); + } + } + } else if (U16_IS_LEAD(schar)) { + miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); + } - // The source string char was in the contraction table, and the corresponding - // CE is IS a contraction CE. We will continue looping to check the source - // string for the remaining chars in the contraction. - uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); - if(tempCE != UCOL_NOT_FOUND) { - // We have scanned a a section of source string for which there is a - // CE from the contraction table. Remember the CE and scan position, so - // that we can return to this point if further scanning fails to - // match a longer contraction sequence. - firstCE = tempCE; - - goBackOne(source); - backupState(source, &state); - getNextNormalizedChar(source); - - // Another way to do this is: - //collIterateState tempState; - //backupState(source, &tempState); - //goBackOne(source); - //backupState(source, &state); - //loadState(source, &tempState, TRUE); - - // The problem is that for incomplete contractions we have to remember the previous - // position. Before, the only thing I needed to do was state.pos--; - // After iterator introduction and especially after introduction of normalizing - // iterators, it became much more difficult to decrease the saved state. - // I'm not yet sure which of the two methods above is faster. - } - } // for(;;) - break; - } // case CONTRACTION_TAG: - case LONG_PRIMARY_TAG: - { - *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; - CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; - return CE; - } - case EXPANSION_TAG: - { - /* This should handle expansion. */ - /* NOTE: we can encounter both continuations and expansions in an expansion! */ - /* I have to decide where continuations are going to be dealt with */ - uint32_t size; - uint32_t i; /* general counter */ - CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ - size = getExpansionCount(CE); - CE = *CEOffset++; - if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ - for(i = 1; iCEpos++) = *CEOffset++; - } - } else { /* else, we do */ - while(*CEOffset != 0) { - *(source->CEpos++) = *CEOffset++; - } - } - return CE; - } - case DIGIT_TAG: - { - /* - We do a check to see if we want to collate digits as numbers; if so we generate - a custom collation key. Otherwise we pull out the value stored in the expansion table. - */ - uint32_t size; - uint32_t i; /* general counter */ - collIterateState digitState; + uint8_t sCC; + if (miss < 0x300 || + maxCC == 0 || + (sCC = i_getCombiningClass(miss, coll)) == 0 || + sCC>maxCC || + (allSame != 0 && sCC == maxCC) || + collIter_eos(source)) + { + // Contraction can not be discontiguous. + goBackOne(source); // back up the source string by one, + // because the character we just looked at was + // not part of the contraction. */ + if(U_IS_SUPPLEMENTARY(miss)) { + goBackOne(source); + } + CE = *(coll->contractionCEs + + (ContractionStart - coll->contractionIndex)); + } else { + // + // Contraction is possibly discontiguous. + // Scan more of source string looking for a match + // + UChar tempchar; + /* find the next character if schar is not a base character + and we are not yet at the end of the string */ + tempchar = getNextNormalizedChar(source); + // probably need another supplementary thingie here + goBackOne(source); + if (i_getCombiningClass(tempchar, coll) == 0) { + goBackOne(source); + if(U_IS_SUPPLEMENTARY(miss)) { + goBackOne(source); + } + /* Spit out the last char of the string, wasn't tasty enough */ + CE = *(coll->contractionCEs + + (ContractionStart - coll->contractionIndex)); + } else { + CE = getDiscontiguous(coll, source, ContractionStart); + } + } + } // else after if(schar == tchar) + + if(CE == UCOL_NOT_FOUND) { + /* The Source string did not match the contraction that we were checking. */ + /* Back up the source position to undo the effects of having partially */ + /* scanned through what ultimately proved to not be a contraction. */ + loadState(source, &state, TRUE); + CE = firstCE; + break; + } - if (source->coll->numericCollation == UCOL_ON){ - UChar32 char32 = 0; + if(!isContraction(CE)) { + // The source string char was in the contraction table, and the corresponding + // CE is not a contraction CE. We completed the contraction, break + // out of loop, this CE will end up being returned. This is the normal + // way out of contraction handling when the source actually contained + // the contraction. + break; + } - uint32_t digIndx = 0; - uint32_t endIndex = 0; - uint32_t trailingZeroIndex = 0; - uint32_t primWeight = 0; + // The source string char was in the contraction table, and the corresponding + // CE is IS a contraction CE. We will continue looping to check the source + // string for the remaining chars in the contraction. + uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); + if(tempCE != UCOL_NOT_FOUND) { + // We have scanned a a section of source string for which there is a + // CE from the contraction table. Remember the CE and scan position, so + // that we can return to this point if further scanning fails to + // match a longer contraction sequence. + firstCE = tempCE; - int32_t digVal = 0; - uint8_t collateVal = 0; + goBackOne(source); + backupState(source, &state); + getNextNormalizedChar(source); + + // Another way to do this is: + //collIterateState tempState; + //backupState(source, &tempState); + //goBackOne(source); + //backupState(source, &state); + //loadState(source, &tempState, TRUE); + + // The problem is that for incomplete contractions we have to remember the previous + // position. Before, the only thing I needed to do was state.pos--; + // After iterator introduction and especially after introduction of normalizing + // iterators, it became much more difficult to decrease the saved state. + // I'm not yet sure which of the two methods above is faster. + } + } // for(;;) + break; + } // case CONTRACTION_TAG: + case LONG_PRIMARY_TAG: + { + *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; + CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; + source->offsetRepeatCount += 1; + return CE; + } + case EXPANSION_TAG: + { + /* This should handle expansion. */ + /* NOTE: we can encounter both continuations and expansions in an expansion! */ + /* I have to decide where continuations are going to be dealt with */ + uint32_t size; + uint32_t i; /* general counter */ - UBool nonZeroValReached = FALSE; + CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ + size = getExpansionCount(CE); + CE = *CEOffset++; + //source->offsetRepeatCount = -1; - uint8_t *numTempBuf; - uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs. - uint32_t numTempBufSize = UCOL_MAX_BUFFER; + if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ + for(i = 1; iCEpos++) = *CEOffset++; + source->offsetRepeatCount += 1; + } + } else { /* else, we do */ + while(*CEOffset != 0) { + *(source->CEpos++) = *CEOffset++; + source->offsetRepeatCount += 1; + } + } - numTempBuf = stackNumTempBuf; - /* - We parse the source string until we hit a char that's NOT a digit. - Use this u_charDigitValue. This might be slow because we have to - handle surrogates... - */ -/* - if (U16_IS_LEAD(ch)){ - if (!collIter_eos(source)) { - backupState(source, &digitState); - UChar trail = getNextNormalizedChar(source); - if(U16_IS_TRAIL(trail)) { - char32 = U16_GET_SUPPLEMENTARY(ch, trail); - } else { - loadState(source, &digitState, TRUE); - char32 = ch; + return CE; } - } else { - char32 = ch; - } - } else { - char32 = ch; - } - digVal = u_charDigitValue(char32); -*/ - digVal = u_charDigitValue(cp); // if we have arrived here, we have - // already processed possible supplementaries that trigered the digit tag - - // all supplementaries are marked in the UCA. - /* - We pad a zero in front of the first element anyways. This takes - care of the (probably) most common case where people are sorting things followed - by a single digit - */ - digIndx++; - for(;;){ - // Make sure we have enough space. - if (digIndx >= ((numTempBufSize - 2) * 2) + 1) - { - numTempBufSize *= 2; - if (numTempBuf == stackNumTempBuf){ - numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize); - uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER); - }else - uprv_realloc(numTempBuf, numTempBufSize); - } - - // Skipping over leading zeroes. - if (digVal != 0 || nonZeroValReached){ - if (digVal != 0 && !nonZeroValReached) - nonZeroValReached = TRUE; - + case DIGIT_TAG: + { /* - We parse the digit string into base 100 numbers (this fits into a byte). - We only add to the buffer in twos, thus if we are parsing an odd character, - that serves as the 'tens' digit while the if we are parsing an even one, that - is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into - a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid - overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less - than all the other bytes. - */ + We do a check to see if we want to collate digits as numbers; if so we generate + a custom collation key. Otherwise we pull out the value stored in the expansion table. + */ + //uint32_t size; + uint32_t i; /* general counter */ - if (digIndx % 2 == 1){ - collateVal += (uint8_t)digVal; + if (source->coll->numericCollation == UCOL_ON){ + collIterateState digitState = {0,0,0,0,0,0,0,0,0}; + UChar32 char32 = 0; + int32_t digVal = 0; - // We don't enter the low-order-digit case unless we've already seen - // the high order, or for the first digit, which is always non-zero. - if (collateVal != 0) - trailingZeroIndex = 0; + uint32_t digIndx = 0; + uint32_t endIndex = 0; + uint32_t trailingZeroIndex = 0; - numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; - collateVal = 0; - } - else{ - // We drop the collation value into the buffer so if we need to do - // a "front patch" we don't have to check to see if we're hitting the - // last element. - collateVal = (uint8_t)(digVal * 10); + uint8_t collateVal = 0; - // Check for trailing zeroes. - if (collateVal == 0) - { - if (!trailingZeroIndex) - trailingZeroIndex = (digIndx/2) + 2; - } - else - trailingZeroIndex = 0; + UBool nonZeroValReached = FALSE; - numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; - } - digIndx++; - } - - // Get next character. - if (!collIter_eos(source)){ - ch = getNextNormalizedChar(source); - if (U16_IS_LEAD(ch)){ - if (!collIter_eos(source)) { - backupState(source, &digitState); - UChar trail = getNextNormalizedChar(source); - if(U16_IS_TRAIL(trail)) { - char32 = U16_GET_SUPPLEMENTARY(ch, trail); + uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. + /* + We parse the source string until we hit a char that's NOT a digit. + Use this u_charDigitValue. This might be slow because we have to + handle surrogates... + */ + /* + if (U16_IS_LEAD(ch)){ + if (!collIter_eos(source)) { + backupState(source, &digitState); + UChar trail = getNextNormalizedChar(source); + if(U16_IS_TRAIL(trail)) { + char32 = U16_GET_SUPPLEMENTARY(ch, trail); + } else { + loadState(source, &digitState, TRUE); + char32 = ch; + } + } else { + char32 = ch; + } } else { - loadState(source, &digitState, TRUE); char32 = ch; } - } - } else { - char32 = ch; - } - - if ((digVal = u_charDigitValue(char32)) == -1){ - // Resetting position to point to the next unprocessed char. We - // overshot it when doing our test/set for numbers. - if (char32 > 0xFFFF) { // For surrogates. - loadState(source, &digitState, TRUE); - //goBackOne(source); - } - goBackOne(source); - break; - } - } else { - break; - } - } + digVal = u_charDigitValue(char32); + */ + digVal = u_charDigitValue(cp); // if we have arrived here, we have + // already processed possible supplementaries that trigered the digit tag - + // all supplementaries are marked in the UCA. + /* + We pad a zero in front of the first element anyways. This takes + care of the (probably) most common case where people are sorting things followed + by a single digit + */ + digIndx++; + for(;;){ + // Make sure we have enough space. No longer needed; + // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER + // (it has been pre-incremented) so we just ensure that numTempBuf is big enough + // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). + + // Skipping over leading zeroes. + if (digVal != 0) { + nonZeroValReached = TRUE; + } + if (nonZeroValReached) { + /* + We parse the digit string into base 100 numbers (this fits into a byte). + We only add to the buffer in twos, thus if we are parsing an odd character, + that serves as the 'tens' digit while the if we are parsing an even one, that + is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into + a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid + overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less + than all the other bytes. + */ + + if (digIndx % 2 == 1){ + collateVal += (uint8_t)digVal; + + // We don't enter the low-order-digit case unless we've already seen + // the high order, or for the first digit, which is always non-zero. + if (collateVal != 0) + trailingZeroIndex = 0; + + numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; + collateVal = 0; + } + else{ + // We drop the collation value into the buffer so if we need to do + // a "front patch" we don't have to check to see if we're hitting the + // last element. + collateVal = (uint8_t)(digVal * 10); + + // Check for trailing zeroes. + if (collateVal == 0) + { + if (!trailingZeroIndex) + trailingZeroIndex = (digIndx/2) + 2; + } + else + trailingZeroIndex = 0; + + numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; + } + digIndx++; + } - if (nonZeroValReached == FALSE){ - digIndx = 2; - numTempBuf[2] = 6; - } + // Get next character. + if (!collIter_eos(source)){ + ch = getNextNormalizedChar(source); + if (U16_IS_LEAD(ch)){ + if (!collIter_eos(source)) { + backupState(source, &digitState); + UChar trail = getNextNormalizedChar(source); + if(U16_IS_TRAIL(trail)) { + char32 = U16_GET_SUPPLEMENTARY(ch, trail); + } else { + loadState(source, &digitState, TRUE); + char32 = ch; + } + } + } else { + char32 = ch; + } + + if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){ + // Resetting position to point to the next unprocessed char. We + // overshot it when doing our test/set for numbers. + if (char32 > 0xFFFF) { // For surrogates. + loadState(source, &digitState, TRUE); + //goBackOne(source); + } + goBackOne(source); + break; + } + } else { + break; + } + } - endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; - if (digIndx % 2 != 0){ - /* - We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what - we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. - Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a - single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. - */ + if (nonZeroValReached == FALSE){ + digIndx = 2; + numTempBuf[2] = 6; + } - for(i = 2; i < endIndex; i++){ - numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + - (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; - } - --digIndx; - } + endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; + if (digIndx % 2 != 0){ + /* + We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what + we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. + Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a + single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. + */ + + for(i = 2; i < endIndex; i++){ + numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + + (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; + } + --digIndx; + } - // Subtract one off of the last byte. - numTempBuf[endIndex-1] -= 1; + // Subtract one off of the last byte. + numTempBuf[endIndex-1] -= 1; - /* - We want to skip over the first two slots in the buffer. The first slot - is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the - sign/exponent byte: 0x80 + (decimalPos/2) & 7f. - */ - numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; - numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); - - // Now transfer the collation key to our collIterate struct. - // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. - size = ((endIndex+1) & ~1)/2; - CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight - (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight - UCOL_BYTE_COMMON; // Tertiary weight. - i = 2; // Reset the index into the buffer. - while(i < endIndex) - { - primWeight = numTempBuf[i++] << 8; - if ( i < endIndex) - primWeight |= numTempBuf[i++]; - *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; - } + /* + We want to skip over the first two slots in the buffer. The first slot + is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the + sign/exponent byte: 0x80 + (decimalPos/2) & 7f. + */ + numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; + numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); + + // Now transfer the collation key to our collIterate struct. + // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. + //size = ((endIndex+1) & ~1)/2; + CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight + (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight + UCOL_BYTE_COMMON; // Tertiary weight. + i = 2; // Reset the index into the buffer. + while(i < endIndex) + { + uint32_t primWeight = numTempBuf[i++] << 8; + if ( i < endIndex) + primWeight |= numTempBuf[i++]; + *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; + } - if (numTempBuf != stackNumTempBuf) - uprv_free(numTempBuf); - } else { - // no numeric mode, we'll just switch to whatever we stashed and continue - CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ - CE = *CEOffset++; - break; -#if 0 - CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ - size = getExpansionCount(CE); - CE = *CEOffset++; - if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ - for(i = 1; iCEpos++) = *CEOffset++; - } - } else { /* else, we do */ - while(*CEOffset != 0) { - *(source->CEpos++) = *CEOffset++; - } - } -#endif - } - return CE; - } - /* various implicits optimization */ - // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit - case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ - //return getImplicit(cp, source, 0x04000000); - return getImplicit(cp, source); - case IMPLICIT_TAG: /* everything that is not defined otherwise */ - /* UCA is filled with these. Tailorings are NOT_FOUND */ - //return getImplicit(cp, source, 0); - return getImplicit(cp, source); - case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ - return 0; /* broken surrogate sequence */ - case LEAD_SURROGATE_TAG: /* D800-DBFF*/ - UChar nextChar; - if( source->flags & UCOL_USE_ITERATOR) { - if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { - cp = U16_GET_SUPPLEMENTARY(ch, nextChar); - source->iterator->next(source->iterator); - return getImplicit(cp, source); - } else { - return 0; - } - } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->posendp)) && - U_IS_TRAIL((nextChar=*source->pos))) { - cp = U16_GET_SUPPLEMENTARY(ch, nextChar); - source->pos++; - return getImplicit(cp, source); - } else { - return 0; /* completely ignorable */ - } - case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ - { - const uint32_t - SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; - //const uint32_t LCount = 19; - const uint32_t VCount = 21; - const uint32_t TCount = 28; - //const uint32_t NCount = VCount * TCount; // 588 - //const uint32_t SCount = LCount * NCount; // 11172 - uint32_t L = ch - SBase; - - // divide into pieces - - uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation - L /= TCount; - uint32_t V = L % VCount; - L /= VCount; - - // offset them - - L += LBase; - V += VBase; - T += TBase; - - // return the first CE, but first put the rest into the expansion buffer - if (!source->coll->image->jamoSpecial) { // FAST PATH - - /**(source->CEpos++) = ucmpe32_get(UCA->mapping, V);*/ - /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/ - *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V); - if (T != TBase) { - /**(source->CEpos++) = ucmpe32_get(UCA->mapping, T);*/ - /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/ - *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T); - } + } else { + // no numeric mode, we'll just switch to whatever we stashed and continue + CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ + CE = *CEOffset++; + break; + } + return CE; + } + /* various implicits optimization */ + case IMPLICIT_TAG: /* everything that is not defined otherwise */ + /* UCA is filled with these. Tailorings are NOT_FOUND */ + return getImplicit(cp, source); + case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ + // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit + return getImplicit(cp, source); + case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ + { + static const uint32_t + SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; + //const uint32_t LCount = 19; + static const uint32_t VCount = 21; + static const uint32_t TCount = 28; + //const uint32_t NCount = VCount * TCount; // 588 + //const uint32_t SCount = LCount * NCount; // 11172 + uint32_t L = ch - SBase; + + // divide into pieces + + uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation + L /= TCount; + uint32_t V = L % VCount; + L /= VCount; + + // offset them + + L += LBase; + V += VBase; + T += TBase; + + // return the first CE, but first put the rest into the expansion buffer + if (!source->coll->image->jamoSpecial) { // FAST PATH + + *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); + if (T != TBase) { + *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); + } - /*return ucmpe32_get(UCA->mapping, L);*/ // return first one - /*return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/ - return UTRIE_GET32_FROM_LEAD(coll->mapping, L); - - } else { // Jamo is Special - // Since Hanguls pass the FCD check, it is - // guaranteed that we won't be in - // the normalization buffer if something like this happens - // However, if we are using a uchar iterator and normalization - // is ON, the Hangul that lead us here is going to be in that - // normalization buffer. Here we want to restore the uchar - // iterator state and pull out of the normalization buffer - if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { - source->flags = source->origFlags; // restore the iterator - source->pos = NULL; - } - // Move Jamos into normalization buffer - source->writableBuffer[0] = (UChar)L; - source->writableBuffer[1] = (UChar)V; - if (T != TBase) { - source->writableBuffer[2] = (UChar)T; - source->writableBuffer[3] = 0; - } else { - source->writableBuffer[2] = 0; - } + return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); + + } else { // Jamo is Special + // Since Hanguls pass the FCD check, it is + // guaranteed that we won't be in + // the normalization buffer if something like this happens + // However, if we are using a uchar iterator and normalization + // is ON, the Hangul that lead us here is going to be in that + // normalization buffer. Here we want to restore the uchar + // iterator state and pull out of the normalization buffer + if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { + source->flags = source->origFlags; // restore the iterator + source->pos = NULL; + } + // Move Jamos into normalization buffer + source->writableBuffer[0] = (UChar)L; + source->writableBuffer[1] = (UChar)V; + if (T != TBase) { + source->writableBuffer[2] = (UChar)T; + source->writableBuffer[3] = 0; + } else { + source->writableBuffer[2] = 0; + } - source->fcdPosition = source->pos; // Indicate where to continue in main input string - // after exhausting the writableBuffer - source->pos = source->writableBuffer; - source->origFlags = source->flags; - source->flags |= UCOL_ITER_INNORMBUF; - source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); + source->fcdPosition = source->pos; // Indicate where to continue in main input string + // after exhausting the writableBuffer + source->pos = source->writableBuffer; + source->origFlags = source->flags; + source->flags |= UCOL_ITER_INNORMBUF; + source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); - return(UCOL_IGNORABLE); - } - } - case CHARSET_TAG: - /* not yet implemented */ - /* probably after 1.8 */ - return UCOL_NOT_FOUND; - default: - *status = U_INTERNAL_PROGRAM_ERROR; - CE=0; - break; + return(UCOL_IGNORABLE); + } + } + case SURROGATE_TAG: + /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ + /* two things can happen here: next code point can be a trailing surrogate - we will use it */ + /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ + /* we return 0 (completely ignorable - per UCA specification */ + { + UChar trail; + collIterateState state; + backupState(source, &state); + if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { + // we chould have stepped one char forward and it might have turned that it + // was not a trail surrogate. In that case, we have to backup. + loadState(source, &state, TRUE); + return 0; + } else { + /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ + CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail); + if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. + // We need to backup + loadState(source, &state, TRUE); + return CE; + } + // calculate the supplementary code point value, if surrogate was not tailored + cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); + } + } + break; + case LEAD_SURROGATE_TAG: /* D800-DBFF*/ + UChar nextChar; + if( source->flags & UCOL_USE_ITERATOR) { + if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { + cp = U16_GET_SUPPLEMENTARY(ch, nextChar); + source->iterator->next(source->iterator); + return getImplicit(cp, source); + } else { + return 0; + } + } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->posendp)) && + U_IS_TRAIL((nextChar=*source->pos))) { + cp = U16_GET_SUPPLEMENTARY(ch, nextChar); + source->pos++; + return getImplicit(cp, source); + } else { + return 0; /* completely ignorable */ + } + case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ + return 0; /* broken surrogate sequence */ + case CHARSET_TAG: + /* not yet implemented */ + /* probably after 1.8 */ + return UCOL_NOT_FOUND; + default: + *status = U_INTERNAL_PROGRAM_ERROR; + CE=0; + break; } if (CE <= UCOL_NOT_FOUND) break; } @@ -3619,15 +3342,38 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col /* now uses Mark's getImplicitPrimary code */ static inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { - if(isNonChar(cp)) { - return 0; - } + if(isNonChar(cp)) { + return 0; + } + + uint32_t r = uprv_uca_getImplicitPrimary(cp); + + *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; + collationSource->toReturn = collationSource->CEpos; + + if (collationSource->offsetBuffer == NULL) { + collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; + collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); + collationSource->offsetStore = collationSource->offsetBuffer; + } + + // **** doesn't work if using iterator **** + if (collationSource->flags & UCOL_ITER_INNORMBUF) { + collationSource->offsetRepeatCount = 1; + } else { + int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string); - uint32_t r = uprv_uca_getImplicitPrimary(cp); + *(collationSource->offsetStore++) = firstOffset; + *(collationSource->offsetStore++) = firstOffset + 1; - *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505; - collationSource->toReturn = collationSource->CEpos; - return ((r & 0x0000FFFF)<<16) | 0x000000C0; + collationSource->offsetReturn = collationSource->offsetStore - 1; + *(collationSource->offsetBuffer) = firstOffset; + if (collationSource->offsetReturn == collationSource->offsetBuffer) { + collationSource->offsetStore = collationSource->offsetBuffer; + } + } + + return ((r & 0x0000FFFF)<<16) | 0x000000C0; } /** @@ -3639,705 +3385,867 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) { - const uint32_t *CEOffset = NULL; - UChar *UCharOffset = NULL; - UChar schar; - const UChar *constart = NULL; - uint32_t size; - UChar buffer[UCOL_MAX_BUFFER]; - uint32_t *endCEBuffer; - UChar *strbuffer; - int32_t noChars = 0; - - for(;;) - { - /* the only ces that loops are thai and contractions */ - switch (getCETag(CE)) + const uint32_t *CEOffset = NULL; + UChar *UCharOffset = NULL; + UChar schar; + const UChar *constart = NULL; + uint32_t size; + UChar buffer[UCOL_MAX_BUFFER]; + uint32_t *endCEBuffer; + UChar *strbuffer; + int32_t noChars = 0; + int32_t CECount = 0; + + for(;;) { - case NOT_FOUND_TAG: /* this tag always returns */ - return CE; - case SURROGATE_TAG: /* This is a surrogate pair */ - /* essentialy an engaged lead surrogate. */ - /* if you have encountered it here, it means that a */ - /* broken sequence was encountered and this is an error */ - return 0; - case THAI_TAG: - if ((source->flags & UCOL_ITER_INNORMBUF) || /* Already Swapped || */ - source->string == source->pos || /* At start of string.|| */ - /* previous char not Thai prevowel */ - /*UCOL_ISTHAIBASECONSONANT(*(source->pos)) == FALSE ||*/ // This is from the old specs - we now rearrange unconditionally - UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1)) == FALSE) - //UCOL_ISTHAIPREVOWEL(*(source->pos - 1)) == FALSE) - { - /* Treat Thai as a length one expansion */ - /* find the offset to expansion table */ - CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); - CE = *CEOffset ++; - } - else - { - /* - Move the prevowel and the following base Consonant into the - normalization buffer with their order swapped - */ - UChar32 cp = (UChar32)peekCharacter(source, 0); - UBool reorder = TRUE; - - int32_t decompLen = unorm_getDecomposition(cp, FALSE, source->writableBuffer, UCOL_WRITABLE_BUFFER_SIZE-1); - if(decompLen < 0) { - decompLen = -decompLen; // there was no decomposition - } else { // we need to check if we will hit a contraction trigger because of decomposition - int32_t i = decompLen; - for(i = 0; i < decompLen; i++) { - if(ucol_contractionEndCP(source->writableBuffer[i], coll)) { - reorder = FALSE; + /* the only ces that loops are thai and contractions */ + switch (getCETag(CE)) + { + case NOT_FOUND_TAG: /* this tag always returns */ + return CE; + + case SPEC_PROC_TAG: + { + // Special processing is getting a CE that is preceded by a certain prefix + // Currently this is only needed for optimizing Japanese length and iteration marks. + // When we encouter a special processing tag, we go backwards and try to see if + // we have a match. + // Contraction tables are used - so the whole process is not unlike contraction. + // prefix data is stored backwards in the table. + const UChar *UCharOffset; + UChar schar, tchar; + collIterateState prefixState; + backupState(source, &prefixState); + for(;;) { + // This loop will run once per source string character, for as long as we + // are matching a potential contraction sequence + + // First we position ourselves at the begining of contraction sequence + const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); + + if (collIter_bos(source)) { + CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); + break; + } + schar = getPrevNormalizedChar(source, status); + goBackOne(source); + + while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ + UCharOffset++; + } + + if (schar == tchar) { + // Found the source string char in the table. + // Pick up the corresponding CE from the table. + CE = *(coll->contractionCEs + + (UCharOffset - coll->contractionIndex)); + } + else + { + // if there is a completely ignorable code point in the middle of + // a prefix, we need to act as if it's not there + // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) + // lone surrogates cannot be set to zero as it would break other processing + uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); + // it's easy for BMP code points + if(isZeroCE == 0) { + continue; + } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) { + // for supplementary code points, we have to check the next one + // situations where we are going to ignore + // 1. beginning of the string: schar is a lone surrogate + // 2. schar is a lone surrogate + // 3. schar is a trail surrogate in a valid surrogate sequence + // that is explicitly set to zero. + if (!collIter_bos(source)) { + UChar lead; + if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) { + isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead); + if(getCETag(isZeroCE) == SURROGATE_TAG) { + uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); + if(finalCE == 0) { + // this is a real, assigned completely ignorable code point + goBackOne(source); + continue; + } + } + } else { + // lone surrogate, completely ignorable + continue; + } + } else { + // lone surrogate at the beggining, completely ignorable + continue; + } + } + // Source string char was not in the table. + // We have not found the prefix. + CE = *(coll->contractionCEs + + (ContractionStart - coll->contractionIndex)); + } + + if(!isPrefix(CE)) { + // The source string char was in the contraction table, and the corresponding + // CE is not a prefix CE. We found the prefix, break + // out of loop, this CE will end up being returned. This is the normal + // way out of prefix handling when the source actually contained + // the prefix. + break; + } + } + loadState(source, &prefixState, TRUE); + break; + } + + case CONTRACTION_TAG: + /* to ensure that the backwards and forwards iteration matches, we + take the current region of most possible match and pass it through + the forward iteration. this will ensure that the obstinate problem of + overlapping contractions will not occur. + */ + schar = peekCharacter(source, 0); + constart = (UChar *)coll->image + getContractOffset(CE); + if (isAtStartPrevIterate(source) + /* commented away contraction end checks after adding the checks + in getPrevCE */) { + /* start of string or this is not the end of any contraction */ + CE = *(coll->contractionCEs + + (constart - coll->contractionIndex)); + break; + } + strbuffer = buffer; + UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); + *(UCharOffset --) = 0; + noChars = 0; + // have to swap thai characters + while (ucol_unsafeCP(schar, coll)) { + *(UCharOffset) = schar; + noChars++; + UCharOffset --; + schar = getPrevNormalizedChar(source, status); + goBackOne(source); + // TODO: when we exhaust the contraction buffer, + // it needs to get reallocated. The problem is + // that the size depends on the string which is + // not iterated over. However, since we're travelling + // backwards, we already had to set the iterator at + // the end - so we might as well know where we are? + if (UCharOffset + 1 == buffer) { + /* we have exhausted the buffer */ + int32_t newsize = 0; + if(source->pos) { // actually dealing with a position + newsize = source->pos - source->string + 1; + } else { // iterator + newsize = 4 * UCOL_MAX_BUFFER; + } + strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * + (newsize + UCOL_MAX_BUFFER)); + /* test for NULL */ + if (strbuffer == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return UCOL_NO_MORE_CES; + } + UCharOffset = strbuffer + newsize; + uprv_memcpy(UCharOffset, buffer, + UCOL_MAX_BUFFER * sizeof(UChar)); + UCharOffset --; + } + if ((source->pos && (source->pos == source->string || + ((source->flags & UCOL_ITER_INNORMBUF) && + *(source->pos - 1) == 0 && source->fcdPosition == NULL))) + || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { + break; + } + } + /* adds the initial base character to the string */ + *(UCharOffset) = schar; + noChars++; + + int32_t offsetBias; + +#if 0 + if (source->offsetReturn != NULL) { + source->offsetStore = source->offsetReturn - noChars; + } + + // **** doesn't work if using iterator **** + if (source->flags & UCOL_ITER_INNORMBUF) { + if (source->fcdPosition == NULL) { + offsetBias = 0; + } else { + offsetBias = (int32_t)(source->fcdPosition - source->string); + } + } else { + offsetBias = (int32_t)(source->pos - source->string); + } + +#else + // **** doesn't work if using iterator **** + if (source->flags & UCOL_ITER_INNORMBUF) { +#if 1 + offsetBias = -1; +#else + if (source->fcdPosition == NULL) { + offsetBias = 0; + } else { + offsetBias = (int32_t)(source->fcdPosition - source->string); } +#endif + } else { + offsetBias = (int32_t)(source->pos - source->string); } - } +#endif - UChar *tempbuffer = source->writableBuffer + - (source->writableBufSize - 1); - uprv_memcpy(tempbuffer-decompLen + 1, source->writableBuffer, sizeof(UChar)*decompLen); - if(reorder) { - *(tempbuffer - decompLen) = *(tempbuffer - decompLen + 1); - *(tempbuffer - decompLen + 1) = peekCharacter(source, -1); - } else { - *(tempbuffer - decompLen) = peekCharacter(source, -1); - } - *(tempbuffer - decompLen - 1) = 0; + /* a new collIterate is used to simplify things, since using the current + collIterate will mean that the forward and backwards iteration will + share and change the same buffers. we don't want to get into that. */ + collIterate temp; + int32_t rawOffset; + //IInit_collIterate(coll, UCharOffset, -1, &temp); + IInit_collIterate(coll, UCharOffset, noChars, &temp); + temp.flags &= ~UCOL_ITER_NORM; -/* - UChar *tempbuffer = source->writableBuffer + - (source->writableBufSize - 1); - *(tempbuffer - 2) = 0; - *(tempbuffer - 1) = peekCharacter(source, 0); - *(tempbuffer) = peekCharacter(source, -1); -*/ - /* - Indicate where to continue in main input string after exhausting - the writableBuffer - */ - if (source->pos - 1 == source->string) { - source->fcdPosition = NULL; - } else { - source->fcdPosition = source->pos-2; - } + rawOffset = temp.pos - temp.string; // should always be zero? + CE = ucol_IGetNextCE(coll, &temp, status); - source->pos = tempbuffer+1; // we're doing predecrement, right? - source->origFlags = source->flags; - source->flags |= UCOL_ITER_INNORMBUF; - source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); + if (source->extendCEs) { + endCEBuffer = source->extendCEs + source->extendCEsSize; + CECount = (source->CEpos - source->extendCEs)/sizeof(uint32_t); + } else { + endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; + CECount = (source->CEpos - source->CEs)/sizeof(uint32_t); + } - //CE = UCOL_IGNORABLE; - return(UCOL_IGNORABLE); - } - break; - case SPEC_PROC_TAG: - { - // Special processing is getting a CE that is preceded by a certain prefix - // Currently this is only needed for optimizing Japanese length and iteration marks. - // When we encouter a special processing tag, we go backwards and try to see if - // we have a match. - // Contraction tables are used - so the whole process is not unlike contraction. - // prefix data is stored backwards in the table. - const UChar *UCharOffset; - UChar schar, tchar; - collIterateState prefixState; - backupState(source, &prefixState); - for(;;) { - // This loop will run once per source string character, for as long as we - // are matching a potential contraction sequence + if (source->offsetBuffer == NULL) { + source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; + source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); + source->offsetStore = source->offsetBuffer; + } - // First we position ourselves at the begining of contraction sequence - const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); + while (CE != UCOL_NO_MORE_CES) { + *(source->CEpos ++) = CE; - if (collIter_bos(source)) { - CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); - break; - } - schar = getPrevNormalizedChar(source); - goBackOne(source); + if (offsetBias >= 0) { + *(source->offsetStore ++) = rawOffset + offsetBias; + } - while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ - UCharOffset++; - } + CECount++; + if (source->CEpos == endCEBuffer) { + /* ran out of CE space, reallocate to new buffer. + If reallocation fails, reset pointers and bail out, + there's no guarantee of the right character position after + this bail*/ + if (source->extendCEs == NULL) { + source->extendCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t) * + (source->extendCEsSize =UCOL_EXPAND_CE_BUFFER_SIZE + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE)); + if (source->extendCEs == NULL) { + // Handle error later. + CECount = -1; + } else { + source->extendCEs = (uint32_t *)uprv_memcpy(source->extendCEs, source->CEs, UCOL_EXPAND_CE_BUFFER_SIZE * sizeof(uint32_t)); + } + } else { + uint32_t *tempBufCE = (uint32_t *)uprv_realloc(source->extendCEs, + sizeof(uint32_t) * (source->extendCEsSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE)); + if (tempBufCE == NULL) { + // Handle error later. + CECount = -1; + } + else { + source->extendCEs = tempBufCE; + } + } - if (schar == tchar) { - // Found the source string char in the table. - // Pick up the corresponding CE from the table. - CE = *(coll->contractionCEs + - (UCharOffset - coll->contractionIndex)); - } - else - { - // if there is a completely ignorable code point in the middle of - // a prefix, we need to act as if it's not there - // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) - // lone surrogates cannot be set to zero as it would break other processing - uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar); - // it's easy for BMP code points - if(isZeroCE == 0) { - continue; - } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) { - // for supplementary code points, we have to check the next one - // situations where we are going to ignore - // 1. beginning of the string: schar is a lone surrogate - // 2. schar is a lone surrogate - // 3. schar is a trail surrogate in a valid surrogate sequence - // that is explicitly set to zero. - if (!collIter_bos(source)) { - UChar lead; - if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) { - isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead); - if(getCETag(isZeroCE) == SURROGATE_TAG) { - uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar); - if(finalCE == 0) { - // this is a real, assigned completely ignorable code point - goBackOne(source); - continue; - } + if (CECount == -1) { + *status = U_MEMORY_ALLOCATION_ERROR; + source->extendCEsSize = 0; + source->CEpos = source->CEs; + freeHeapWritableBuffer(&temp); + + if (strbuffer != buffer) { + uprv_free(strbuffer); + } + + return (uint32_t)UCOL_NULLORDER; } - } else { - // lone surrogate, completely ignorable - continue; - } - } else { - // lone surrogate at the beggining, completely ignorable - continue; + + source->CEpos = source->extendCEs + CECount; + endCEBuffer = source->extendCEs + source->extendCEsSize; } - } - // Source string char was not in the table. - // We have not found the prefix. - CE = *(coll->contractionCEs + - (ContractionStart - coll->contractionIndex)); - } - if(!isPrefix(CE)) { - // The source string char was in the contraction table, and the corresponding - // CE is not a prefix CE. We found the prefix, break - // out of loop, this CE will end up being returned. This is the normal - // way out of prefix handling when the source actually contained - // the prefix. - break; - } - } - loadState(source, &prefixState, TRUE); - break; - } + if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) { + int32_t storeIX = source->offsetStore - source->offsetBuffer; + int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer, + sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE)); - case CONTRACTION_TAG: - /* to ensure that the backwards and forwards iteration matches, we - take the current region of most possible match and pass it through - the forward iteration. this will ensure that the obstinate problem of - overlapping contractions will not occur. - */ - schar = peekCharacter(source, 0); - constart = (UChar *)coll->image + getContractOffset(CE); - if (isAtStartPrevIterate(source) - /* commented away contraction end checks after adding the checks - in getPrevCE */) { - /* start of string or this is not the end of any contraction */ - CE = *(coll->contractionCEs + - (constart - coll->contractionIndex)); - break; - } - strbuffer = buffer; - UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); - *(UCharOffset --) = 0; - noChars = 0; - // have to swap thai characters - while (ucol_unsafeCP(schar, coll) || UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1))) { - // we might have ended here after trying to reorder Thai, but seeing that there are unsafe points - // in the backward processing - *(UCharOffset) = schar; - noChars++; - UCharOffset --; - schar = getPrevNormalizedChar(source); - goBackOne(source); - // TODO: when we exhaust the contraction buffer, - // it needs to get reallocated. The problem is - // that the size depends on the string which is - // not iterated over. However, since we're travelling - // backwards, we already had to set the iterator at - // the end - so we might as well know where we are? - if (UCharOffset + 1 == buffer) { - /* we have exhausted the buffer */ - int32_t newsize = 0; - if(source->pos) { // actually dealing with a position - newsize = source->pos - source->string + 1; - } else { // iterator - newsize = 4 * UCOL_MAX_BUFFER; - } - strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * - (newsize + UCOL_MAX_BUFFER)); - /* test for NULL */ - if (strbuffer == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return UCOL_NO_MORE_CES; + if (tob != NULL) { + source->offsetBuffer = tob; + source->offsetStore = &source->offsetBuffer[storeIX]; + source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE; + } else { + // memory error... + *status = U_MEMORY_ALLOCATION_ERROR; + source->CEpos = source->CEs; + freeHeapWritableBuffer(&temp); + + if (strbuffer != buffer) { + uprv_free(strbuffer); + } + + return (uint32_t) UCOL_NULLORDER; + } } - UCharOffset = strbuffer + newsize; - uprv_memcpy(UCharOffset, buffer, - UCOL_MAX_BUFFER * sizeof(UChar)); - UCharOffset --; + + rawOffset = temp.pos - temp.string; + CE = ucol_IGetNextCE(coll, &temp, status); } - if ((source->pos && (source->pos == source->string || - ((source->flags & UCOL_ITER_INNORMBUF) && - *(source->pos - 1) == 0 && source->fcdPosition == NULL))) - || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { - break; + + if (source->offsetRepeatValue != 0) { + if (CECount > noChars) { + source->offsetRepeatCount += temp.offsetRepeatCount; + } else { + // **** does this really skip the right offsets? **** + source->offsetReturn -= (noChars - CECount); + } } - } - /* adds the initial base character to the string */ - *(UCharOffset) = schar; - noChars++; - - /* a new collIterate is used to simplify things, since using the current - collIterate will mean that the forward and backwards iteration will - share and change the same buffers. we don't want to get into that. */ - collIterate temp; - //IInit_collIterate(coll, UCharOffset, -1, &temp); - IInit_collIterate(coll, UCharOffset, noChars, &temp); - temp.flags &= ~UCOL_ITER_NORM; - - CE = ucol_IGetNextCE(coll, &temp, status); - endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; - while (CE != UCOL_NO_MORE_CES) { - *(source->CEpos ++) = CE; - if (source->CEpos == endCEBuffer) { - /* ran out of CE space, bail. - there's no guarantee of the right character position after - this bail*/ - *status = U_BUFFER_OVERFLOW_ERROR; - source->CEpos = source->CEs; - freeHeapWritableBuffer(&temp); - if (strbuffer != buffer) { - uprv_free(strbuffer); + + freeHeapWritableBuffer(&temp); + + if (strbuffer != buffer) { + uprv_free(strbuffer); + } + + if (offsetBias >= 0) { + source->offsetReturn = source->offsetStore - 1; + if (source->offsetReturn == source->offsetBuffer) { + source->offsetStore = source->offsetBuffer; } - return (uint32_t)UCOL_NULLORDER; } - CE = ucol_IGetNextCE(coll, &temp, status); - } - freeHeapWritableBuffer(&temp); - if (strbuffer != buffer) { - uprv_free(strbuffer); - } - source->toReturn = source->CEpos - 1; - if (source->toReturn == source->CEs) { - source->CEpos = source->CEs; - } - return *(source->toReturn); - case LONG_PRIMARY_TAG: - { - *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; - *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; - source->toReturn = source->CEpos - 1; - return *(source->toReturn); - } - case EXPANSION_TAG: /* this tag always returns */ - /* - This should handle expansion. - NOTE: we can encounter both continuations and expansions in an expansion! - I have to decide where continuations are going to be dealt with - */ - /* find the offset to expansion table */ - CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); - size = getExpansionCount(CE); - if (size != 0) { - /* - if there are less than 16 elements in expansion, we don't terminate - */ - uint32_t count; - for (count = 0; count < size; count++) { - *(source->CEpos ++) = *CEOffset++; - } - } - else { - /* else, we do */ - while (*CEOffset != 0) { - *(source->CEpos ++) = *CEOffset ++; - } - } - source->toReturn = source->CEpos - 1; - // in case of one element expansion, we - // want to immediately return CEpos - if(source->toReturn == source->CEs) { - source->CEpos = source->CEs; - } - return *(source->toReturn); - case DIGIT_TAG: - { - /* - We do a check to see if we want to collate digits as numbers; if so we generate - a custom collation key. Otherwise we pull out the value stored in the expansion table. - */ - //uint32_t size; - uint32_t i; /* general counter */ - collIterateState state; - if (source->coll->numericCollation == UCOL_ON){ - UChar32 char32 = 0; + source->toReturn = source->CEpos - 1; + if (source->toReturn == source->CEs) { + source->CEpos = source->CEs; + } - uint32_t digIndx = 0; - uint32_t endIndex = 0; - uint32_t leadingZeroIndex = 0; - uint32_t trailingZeroCount = 0; + return *(source->toReturn); - uint32_t primWeight = 0; + case LONG_PRIMARY_TAG: + { + *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; + *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; + source->toReturn = source->CEpos - 1; + + if (source->offsetBuffer == NULL) { + source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; + source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); + source->offsetStore = source->offsetBuffer; + } - int32_t digVal = 0; - uint8_t collateVal = 0; + if (source->flags & UCOL_ITER_INNORMBUF) { + source->offsetRepeatCount = 1; + } else { + int32_t firstOffset = (int32_t)(source->pos - source->string); - UBool nonZeroValReached = FALSE; + *(source->offsetStore++) = firstOffset; + *(source->offsetStore++) = firstOffset + 1; - uint8_t *numTempBuf; - uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs. - uint32_t numTempBufSize = UCOL_MAX_BUFFER; + source->offsetReturn = source->offsetStore - 1; + *(source->offsetBuffer) = firstOffset; + if (source->offsetReturn == source->offsetBuffer) { + source->offsetStore = source->offsetBuffer; + } + } - numTempBuf = stackNumTempBuf; - /* - We parse the source string until we hit a char that's NOT a digit. - Use this u_charDigitValue. This might be slow because we have to - handle surrogates... - */ - if (U16_IS_TRAIL (ch)){ - if (!collIter_bos(source)){ - UChar lead = getPrevNormalizedChar(source); - if(U16_IS_LEAD(lead)) { - char32 = U16_GET_SUPPLEMENTARY(lead,ch); - goBackOne(source); - } else { - char32 = ch; - } - } else { - char32 = ch; + return *(source->toReturn); } - } else { - char32 = ch; - } - digVal = u_charDigitValue(char32); - for(;;){ - // Make sure we have enough space. - if (digIndx >= ((numTempBufSize - 2) * 2) + 1) - { - numTempBufSize *= 2; - if (numTempBuf == stackNumTempBuf){ - numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize); - uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER); - }else - uprv_realloc(numTempBuf, numTempBufSize); - } - - // Skip over trailing zeroes, and keep a count of them. - if (digVal != 0) - nonZeroValReached = TRUE; - if (nonZeroValReached){ - /* - We parse the digit string into base 100 numbers (this fits into a byte). - We only add to the buffer in twos, thus if we are parsing an odd character, - that serves as the 'tens' digit while the if we are parsing an even one, that - is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into - a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid - overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less - than all the other bytes. - - Since we're doing in this reverse we want to put the first digit encountered into the - ones place and the second digit encountered into the tens place. - */ + case EXPANSION_TAG: /* this tag always returns */ + { + /* + This should handle expansion. + NOTE: we can encounter both continuations and expansions in an expansion! + I have to decide where continuations are going to be dealt with + */ + int32_t firstOffset = (int32_t)(source->pos - source->string); + + // **** doesn't work if using iterator **** + if (source->offsetReturn != NULL) { + if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) { + source->offsetStore = source->offsetBuffer; + }else { + firstOffset = -1; + } + } + + if (source->offsetBuffer == NULL) { + source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; + source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); + source->offsetStore = source->offsetBuffer; + } - if ((digIndx + trailingZeroCount) % 2 == 1){ - // High-order digit case (tens place) - collateVal += (uint8_t)(digVal * 10); + /* find the offset to expansion table */ + CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); + size = getExpansionCount(CE); + if (size != 0) { + /* + if there are less than 16 elements in expansion, we don't terminate + */ + uint32_t count; - // We cannot set leadingZeroIndex unless it has been set for the - // low-order digit. Therefore, all we can do for the high-order - // digit is turn it off, never on. - // The only time we will have a high digit without a low is for - // the very first non-zero digit, so no zero check is necessary. - if (collateVal != 0) - leadingZeroIndex = 0; + for (count = 0; count < size; count++) { + *(source->CEpos ++) = *CEOffset++; - numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; - collateVal = 0; + if (firstOffset >= 0) { + *(source->offsetStore ++) = firstOffset + 1; + } } - else{ - // Low-order digit case (ones place) - collateVal = (uint8_t)digVal; + } else { + /* else, we do */ + while (*CEOffset != 0) { + *(source->CEpos ++) = *CEOffset ++; - // Check for leading zeroes. - if (collateVal == 0) - { - if (!leadingZeroIndex) - leadingZeroIndex = (digIndx/2) + 2; + if (firstOffset >= 0) { + *(source->offsetStore ++) = firstOffset + 1; } - else - leadingZeroIndex = 0; + } + } - // No need to write to buffer; the case of a last odd digit - // is handled below. + if (firstOffset >= 0) { + source->offsetReturn = source->offsetStore - 1; + *(source->offsetBuffer) = firstOffset; + if (source->offsetReturn == source->offsetBuffer) { + source->offsetStore = source->offsetBuffer; } - ++digIndx; + } else { + source->offsetRepeatCount += size - 1; } - else - ++trailingZeroCount; - if (!collIter_bos(source)){ - ch = getPrevNormalizedChar(source); - //goBackOne(source); - if (U16_IS_TRAIL(ch)){ - backupState(source, &state); - if (!collIter_bos(source)) - { - goBackOne(source); - UChar lead = getPrevNormalizedChar(source); - if(U16_IS_LEAD(lead)) { - char32 = U16_GET_SUPPLEMENTARY(lead,ch); + source->toReturn = source->CEpos - 1; + // in case of one element expansion, we + // want to immediately return CEpos + if(source->toReturn == source->CEs) { + source->CEpos = source->CEs; + } + + return *(source->toReturn); + } + + case DIGIT_TAG: + { + /* + We do a check to see if we want to collate digits as numbers; if so we generate + a custom collation key. Otherwise we pull out the value stored in the expansion table. + */ + //uint32_t size; + uint32_t i; /* general counter */ + + if (source->coll->numericCollation == UCOL_ON){ + uint32_t digIndx = 0; + uint32_t endIndex = 0; + uint32_t leadingZeroIndex = 0; + uint32_t trailingZeroCount = 0; + + uint8_t collateVal = 0; + + UBool nonZeroValReached = FALSE; + + uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs. + /* + We parse the source string until we hit a char that's NOT a digit. + Use this u_charDigitValue. This might be slow because we have to + handle surrogates... + */ + /* + We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less, + with any chunks smaller than that being on the right end of the digit string - i.e. the first collation + element we process when going backward. To determine how long that chunk might be, we may need to make + two passes through the loop that collects digits - one to see how long the string is (and how much is + leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has + more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation + element chunk after resetting the state to the initialState at the right side of the digit string. + */ + uint32_t ceLimit = 0; + UChar initial_ch = ch; + collIterateState initialState = {0,0,0,0,0,0,0,0,0}; + backupState(source, &initialState); + + for(;;) { + collIterateState state = {0,0,0,0,0,0,0,0,0}; + UChar32 char32 = 0; + int32_t digVal = 0; + + if (U16_IS_TRAIL (ch)) { + if (!collIter_bos(source)){ + UChar lead = getPrevNormalizedChar(source, status); + if(U16_IS_LEAD(lead)) { + char32 = U16_GET_SUPPLEMENTARY(lead,ch); + goBackOne(source); + } else { + char32 = ch; + } + } else { + char32 = ch; + } } else { - loadState(source, &state, FALSE); - char32 = ch; + char32 = ch; + } + digVal = u_charDigitValue(char32); + + for(;;) { + // Make sure we have enough space. No longer needed; + // at this point the largest value of digIndx when we need to save data in numTempBuf + // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure + // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2). + + // Skip over trailing zeroes, and keep a count of them. + if (digVal != 0) + nonZeroValReached = TRUE; + + if (nonZeroValReached) { + /* + We parse the digit string into base 100 numbers (this fits into a byte). + We only add to the buffer in twos, thus if we are parsing an odd character, + that serves as the 'tens' digit while the if we are parsing an even one, that + is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into + a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid + overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less + than all the other bytes. + + Since we're doing in this reverse we want to put the first digit encountered into the + ones place and the second digit encountered into the tens place. + */ + + if ((digIndx + trailingZeroCount) % 2 == 1) { + // High-order digit case (tens place) + collateVal += (uint8_t)(digVal * 10); + + // We cannot set leadingZeroIndex unless it has been set for the + // low-order digit. Therefore, all we can do for the high-order + // digit is turn it off, never on. + // The only time we will have a high digit without a low is for + // the very first non-zero digit, so no zero check is necessary. + if (collateVal != 0) + leadingZeroIndex = 0; + + // The first pass through, digIndx may exceed the limit, but in that case + // we no longer care about numTempBuf contents since they will be discarded + if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) { + numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; + } + collateVal = 0; + } else { + // Low-order digit case (ones place) + collateVal = (uint8_t)digVal; + + // Check for leading zeroes. + if (collateVal == 0) { + if (!leadingZeroIndex) + leadingZeroIndex = (digIndx/2) + 2; + } else + leadingZeroIndex = 0; + + // No need to write to buffer; the case of a last odd digit + // is handled below. + } + ++digIndx; + } else + ++trailingZeroCount; + + if (!collIter_bos(source)) { + ch = getPrevNormalizedChar(source, status); + //goBackOne(source); + if (U16_IS_TRAIL(ch)) { + backupState(source, &state); + if (!collIter_bos(source)) { + goBackOne(source); + UChar lead = getPrevNormalizedChar(source, status); + + if(U16_IS_LEAD(lead)) { + char32 = U16_GET_SUPPLEMENTARY(lead,ch); + } else { + loadState(source, &state, FALSE); + char32 = ch; + } + } + } else + char32 = ch; + + if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { + if (char32 > 0xFFFF) {// For surrogates. + loadState(source, &state, FALSE); + } + // Don't need to "reverse" the goBackOne call, + // as this points to the next position to process.. + //if (char32 > 0xFFFF) // For surrogates. + //getNextNormalizedChar(source); + break; + } + + goBackOne(source); + }else + break; + } + + if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) { + // our collation element is not too big, go ahead and finish with it + break; } + // our digit string is too long for a collation element; + // set the limit for it, reset the state and begin again + ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER; + if ( ceLimit == 0 ) { + ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; + } + ch = initial_ch; + loadState(source, &initialState, FALSE); + digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0; + collateVal = 0; + nonZeroValReached = FALSE; } - } - else - char32 = ch; - - if ((digVal = u_charDigitValue(char32)) == -1){ - if (char32 > 0xFFFF) {// For surrogates. - loadState(source, &state, FALSE); - } - // Don't need to "reverse" the goBackOne call, - // as this points to the next position to process.. - //if (char32 > 0xFFFF) // For surrogates. - //getNextNormalizedChar(source); + + if (! nonZeroValReached) { + digIndx = 2; + trailingZeroCount = 0; + numTempBuf[2] = 6; + } + + if ((digIndx + trailingZeroCount) % 2 != 0) { + numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; + digIndx += 1; // The implicit leading zero + } + if (trailingZeroCount % 2 != 0) { + // We had to consume one trailing zero for the low digit + // of the least significant byte + digIndx += 1; // The trailing zero not in the exponent + trailingZeroCount -= 1; + } + + endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; + + // Subtract one off of the last byte. Really the first byte here, but it's reversed... + numTempBuf[2] -= 1; + + /* + We want to skip over the first two slots in the buffer. The first slot + is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the + sign/exponent byte: 0x80 + (decimalPos/2) & 7f. + The exponent must be adjusted by the number of leading zeroes, and the number of + trailing zeroes. + */ + numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; + uint32_t exponent = (digIndx+trailingZeroCount)/2; + if (leadingZeroIndex) + exponent -= ((digIndx/2) + 2 - leadingZeroIndex); + numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); + + // Now transfer the collation key to our collIterate struct. + // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. + //size = ((endIndex+1) & ~1)/2; + *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight + (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight + UCOL_BYTE_COMMON; // Tertiary weight. + i = endIndex - 1; // Reset the index into the buffer. + while(i >= 2) { + uint32_t primWeight = numTempBuf[i--] << 8; + if ( i >= 2) + primWeight |= numTempBuf[i--]; + *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; + } + + source->toReturn = source->CEpos -1; + return *(source->toReturn); + } else { + CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); + CE = *(CEOffset++); break; } - goBackOne(source); - }else - break; - } + } - if (nonZeroValReached == FALSE){ - digIndx = 2; - trailingZeroCount = 0; - numTempBuf[2] = 6; - } + case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ + { + static const uint32_t + SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; + //const uint32_t LCount = 19; + static const uint32_t VCount = 21; + static const uint32_t TCount = 28; + //const uint32_t NCount = VCount * TCount; /* 588 */ + //const uint32_t SCount = LCount * NCount; /* 11172 */ + + uint32_t L = ch - SBase; + /* + divide into pieces. + we do it in this order since some compilers can do % and / in one + operation + */ + uint32_t T = L % TCount; + L /= TCount; + uint32_t V = L % VCount; + L /= VCount; + + /* offset them */ + L += LBase; + V += VBase; + T += TBase; + + if (source->offsetBuffer == NULL) { + source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; + source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); + source->offsetStore = source->offsetBuffer; + } - if ((digIndx + trailingZeroCount) % 2 != 0){ - numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; - digIndx += 1; // The implicit leading zero - } - if (trailingZeroCount % 2 != 0){ - // We had to consume one trailing zero for the low digit - // of the least significant byte - digIndx += 1; // The trailing zero not in the exponent - trailingZeroCount -= 1; - } + int32_t firstOffset = (int32_t)(source->pos - source->string); - endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; + *(source->offsetStore++) = firstOffset; - // Subtract one off of the last byte. Really the first byte here, but it's reversed... - numTempBuf[2] -= 1; + /* + * return the first CE, but first put the rest into the expansion buffer + */ + if (!source->coll->image->jamoSpecial) { + *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L); + *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); + *(source->offsetStore++) = firstOffset + 1; + + if (T != TBase) { + *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); + *(source->offsetStore++) = firstOffset + 1; + } - /* - We want to skip over the first two slots in the buffer. The first slot - is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the - sign/exponent byte: 0x80 + (decimalPos/2) & 7f. - The exponent must be adjusted by the number of leading zeroes, and the number of - trailing zeroes. - */ - numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; - uint32_t exponent = (digIndx+trailingZeroCount)/2; - if (leadingZeroIndex) - exponent -= ((digIndx/2) + 2 - leadingZeroIndex); - numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); - - // Now transfer the collation key to our collIterate struct. - // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. - //size = ((endIndex+1) & ~1)/2; - *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight - (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight - UCOL_BYTE_COMMON; // Tertiary weight. - i = endIndex - 1; // Reset the index into the buffer. - while(i >= 2) - { - primWeight = numTempBuf[i--] << 8; - if ( i >= 2) - primWeight |= numTempBuf[i--]; - *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; - } - if (numTempBuf != stackNumTempBuf) - uprv_free(numTempBuf); + source->toReturn = source->CEpos - 1; - source->toReturn = source->CEpos -1; - return *(source->toReturn); - } - else { - CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); - CE = *(CEOffset++); - break; + source->offsetReturn = source->offsetStore - 1; + if (source->offsetReturn == source->offsetBuffer) { + source->offsetStore = source->offsetBuffer; + } + + return *(source->toReturn); + } else { + // Since Hanguls pass the FCD check, it is + // guaranteed that we won't be in + // the normalization buffer if something like this happens + // Move Jamos into normalization buffer + /* + Move the Jamos into the + normalization buffer + */ + UChar *tempbuffer = source->writableBuffer + + (source->writableBufSize - 1); + *(tempbuffer) = 0; + if (T != TBase) { + *(tempbuffer - 1) = (UChar)T; + *(tempbuffer - 2) = (UChar)V; + *(tempbuffer - 3) = (UChar)L; + *(tempbuffer - 4) = 0; + } else { + *(tempbuffer - 1) = (UChar)V; + *(tempbuffer - 2) = (UChar)L; + *(tempbuffer - 3) = 0; + } + + /* + Indicate where to continue in main input string after exhausting + the writableBuffer + */ + if (source->pos == source->string) { + source->fcdPosition = NULL; + } else { + source->fcdPosition = source->pos-1; + } + + source->pos = tempbuffer; + source->origFlags = source->flags; + source->flags |= UCOL_ITER_INNORMBUF; + source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); + + return(UCOL_IGNORABLE); + } + } + + case IMPLICIT_TAG: /* everything that is not defined otherwise */ #if 0 - /* find the offset to expansion table */ - CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); - size = getExpansionCount(CE); - if (size != 0) { - /* - if there are less than 16 elements in expansion, we don't terminate - */ - uint32_t count; - for (count = 0; count < size; count++) { - *(source->CEpos ++) = *CEOffset++; + if (source->offsetBuffer == NULL) { + source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; + source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); + source->offsetStore = source->offsetBuffer; } - } - else { - /* else, we do */ - while (*CEOffset != 0) { - *(source->CEpos ++) = *CEOffset ++; + + // **** doesn't work if using iterator **** + if (source->flags & UCOL_ITER_INNORMBUF) { + source->offsetRepeatCount = 1; + } else { + int32_t firstOffset = (int32_t)(source->pos - source->string); + + *(source->offsetStore++) = firstOffset; + *(source->offsetStore++) = firstOffset + 1; + + source->offsetReturn = source->offsetStore - 1; + if (source->offsetReturn == source->offsetBuffer) { + source->offsetStore = source->offsetBuffer; + } } - } - source->toReturn = source->CEpos - 1; - // in case of one element expansion, we - // want to immediately return CEpos - if(source->toReturn == source->CEs) { - source->CEpos = source->CEs; - } - return *(source->toReturn); #endif - } - } - case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ - { - const uint32_t - SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; - //const uint32_t LCount = 19; - const uint32_t VCount = 21; - const uint32_t TCount = 28; - //const uint32_t NCount = VCount * TCount; /* 588 */ - //const uint32_t SCount = LCount * NCount; /* 11172 */ - - uint32_t L = ch - SBase; - /* - divide into pieces. - we do it in this order since some compilers can do % and / in one - operation - */ - uint32_t T = L % TCount; - L /= TCount; - uint32_t V = L % VCount; - L /= VCount; - /* offset them */ - L += LBase; - V += VBase; - T += TBase; + return getPrevImplicit(ch, source); - /* - return the first CE, but first put the rest into the expansion buffer - */ - if (!source->coll->image->jamoSpecial) - { - /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, L);*/ - /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/ - *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, L); - /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, V);*/ - /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/ - *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V); - if (T != TBase) - /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, T);*/ - /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/ - *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T); - - source->toReturn = source->CEpos - 1; - return *(source->toReturn); - } else { - // Since Hanguls pass the FCD check, it is - // guaranteed that we won't be in - // the normalization buffer if something like this happens - // Move Jamos into normalization buffer - /* - Move the Jamos into the - normalization buffer - */ - UChar *tempbuffer = source->writableBuffer + - (source->writableBufSize - 1); - *(tempbuffer) = 0; - if (T != TBase) { - *(tempbuffer - 1) = (UChar)T; - *(tempbuffer - 2) = (UChar)V; - *(tempbuffer - 3) = (UChar)L; - *(tempbuffer - 4) = 0; - } else { - *(tempbuffer - 1) = (UChar)V; - *(tempbuffer - 2) = (UChar)L; - *(tempbuffer - 3) = 0; - } + // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function + case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ + return getPrevImplicit(ch, source); - /* - Indicate where to continue in main input string after exhausting - the writableBuffer - */ - if (source->pos == source->string) { - source->fcdPosition = NULL; - } else { - source->fcdPosition = source->pos-1; - } + case SURROGATE_TAG: /* This is a surrogate pair */ + /* essentialy an engaged lead surrogate. */ + /* if you have encountered it here, it means that a */ + /* broken sequence was encountered and this is an error */ + return 0; + + case LEAD_SURROGATE_TAG: /* D800-DBFF*/ + return 0; /* broken surrogate sequence */ + + case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ + { + UChar32 cp = 0; + UChar prevChar; + UChar *prev; + if (isAtStartPrevIterate(source)) { + /* we are at the start of the string, wrong place to be at */ + return 0; + } + if (source->pos != source->writableBuffer) { + prev = source->pos - 1; + } else { + prev = source->fcdPosition; + } + prevChar = *prev; + + /* Handles Han and Supplementary characters here.*/ + if (U16_IS_LEAD(prevChar)) { + cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); + source->pos = prev; + } else { + return 0; /* completely ignorable */ + } + + return getPrevImplicit(cp, source); + } - source->pos = tempbuffer; - source->origFlags = source->flags; - source->flags |= UCOL_ITER_INNORMBUF; - source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); + /* UCA is filled with these. Tailorings are NOT_FOUND */ + /* not yet implemented */ + case CHARSET_TAG: /* this tag always returns */ + /* probably after 1.8 */ + return UCOL_NOT_FOUND; - return(UCOL_IGNORABLE); + default: /* this tag always returns */ + *status = U_INTERNAL_PROGRAM_ERROR; + CE=0; + break; } - } - case LEAD_SURROGATE_TAG: /* D800-DBFF*/ - return 0; /* broken surrogate sequence */ - case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ - { - UChar32 cp = 0; - UChar prevChar; - UChar *prev; - if (isAtStartPrevIterate(source)) { - /* we are at the start of the string, wrong place to be at */ - return 0; - } - if (source->pos != source->writableBuffer) { - prev = source->pos - 1; - } else { - prev = source->fcdPosition; - } - prevChar = *prev; - /* Handles Han and Supplementary characters here.*/ - if (UTF_IS_FIRST_SURROGATE(prevChar)) { - cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); - source->pos = prev; - } else { - return 0; /* completely ignorable */ - } - return getPrevImplicit(cp, source); - } - // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function - case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ - return getPrevImplicit(ch, source); - case IMPLICIT_TAG: /* everything that is not defined otherwise */ - return getPrevImplicit(ch, source); - /* UCA is filled with these. Tailorings are NOT_FOUND */ - /* not yet implemented */ - case CHARSET_TAG: /* this tag always returns */ - /* probably after 1.8 */ - return UCOL_NOT_FOUND; - default: /* this tag always returns */ - *status = U_INTERNAL_PROGRAM_ERROR; - CE=0; - break; - } - if (CE <= UCOL_NOT_FOUND) { - break; + if (CE <= UCOL_NOT_FOUND) { + break; + } } - } - return CE; + + return CE; } /* This should really be a macro */ @@ -4346,28 +4254,29 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, static uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) { #ifdef UCOL_DEBUG - fprintf(stderr, "."); + fprintf(stderr, "."); #endif - uint8_t *newStart = NULL; - uint32_t offset = *secondaries-secStart; + uint8_t *newStart = NULL; + uint32_t offset = *secondaries-secStart; - if(secStart==second) { - newStart=(uint8_t*)uprv_malloc(newSize); - if(newStart==NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - uprv_memcpy(newStart, secStart, *secondaries-secStart); - } else { - newStart=(uint8_t*)uprv_realloc(secStart, newSize); - if(newStart==NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; + if(secStart==second) { + newStart=(uint8_t*)uprv_malloc(newSize); + if(newStart==NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + uprv_memcpy(newStart, secStart, *secondaries-secStart); + } else { + newStart=(uint8_t*)uprv_realloc(secStart, newSize); + if(newStart==NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + /* Since we're reallocating, return original reference so we don't loose it. */ + return secStart; + } } - } - *secondaries=newStart+offset; - *secSize=newSize; - return newStart; + *secondaries=newStart+offset; + *secSize=newSize; + return newStart; } @@ -4508,34 +4417,38 @@ ucol_getSortKey(const UCollator *coll, uint8_t *result, int32_t resultLength) { - UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); - if (UTRACE_LEVEL(UTRACE_VERBOSE)) { - int32_t actualSrcLen = sourceLength; - if (actualSrcLen==-1 && source!=NULL) { - actualSrcLen = u_strlen(source); - } - UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, actualSrcLen); - } - - UErrorCode status = U_ZERO_ERROR; - int32_t keySize = 0; - - if(source != NULL) { - // source == NULL is actually an error situation, but we would need to - // have an error code to return it. Until we introduce a new - // API, it stays like this - - /* this uses the function pointer that is set in updateinternalstate */ - /* currently, there are two funcs: */ - /*ucol_calcSortKey(...);*/ - /*ucol_calcSortKeySimpleTertiary(...);*/ + UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); + if (UTRACE_LEVEL(UTRACE_VERBOSE)) { + UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, + ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); + } - keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status); - //((UCollator *)coll)->errorCode = status; /*semantically const */ - } - UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); - UTRACE_EXIT_STATUS(status); - return keySize; + UErrorCode status = U_ZERO_ERROR; + int32_t keySize = 0; + + if(source != NULL) { + // source == NULL is actually an error situation, but we would need to + // have an error code to return it. Until we introduce a new + // API, it stays like this + + /* this uses the function pointer that is set in updateinternalstate */ + /* currently, there are two funcs: */ + /*ucol_calcSortKey(...);*/ + /*ucol_calcSortKeySimpleTertiary(...);*/ + + keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status); + //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) { + // That's not good. Something unusual happened. + // We don't know how much we initialized before we failed. + // NULL terminate for safety. + // We have no way say that we have generated a partial sort key. + //result[0] = 0; + //keySize = 0; + //} + } + UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); + UTRACE_EXIT_STATUS(status); + return keySize; } /* this function is called by the C++ API for sortkey generation */ @@ -4554,7 +4467,7 @@ ucol_getSortKeyWithAllocation(const UCollator *coll, /* or if we run out of space while making a sortkey and want to return ASAP */ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) { UErrorCode status = U_ZERO_ERROR; - const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); + //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); @@ -4572,8 +4485,8 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre uint32_t variableTopValue = coll->variableTopValue; uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); if(doHiragana) { - UCOL_COMMON_BOT4++; - /* allocate one more space for hiragana */ + UCOL_COMMON_BOT4++; + /* allocate one more space for hiragana */ } uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); @@ -4595,296 +4508,305 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre for(;;) { - order = ucol_IGetNextCE(coll, s, &status); - if(order == UCOL_NO_MORE_CES) { - break; - } + order = ucol_IGetNextCE(coll, s, &status); + if(order == UCOL_NO_MORE_CES) { + break; + } - if(order == 0) { + if(order == 0) { continue; - } + } - notIsContinuation = !isContinuation(order); + notIsContinuation = !isContinuation(order); - if(notIsContinuation) { + if(notIsContinuation) { tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK)); - } else { + } else { tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); - } - secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); - primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); - primary1 = (uint8_t)(order >> 8); + } + secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); + primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); + primary1 = (uint8_t)(order >> 8); - if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) + if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) || (!notIsContinuation && wasShifted)) || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */ - /* and other ignorables should be removed if following a shifted code point */ - if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ - /* we should just completely ignore it */ - continue; - } - if(compareQuad == 0) { - if(c4 > 0) { - currentSize += (c2/UCOL_BOT_COUNT4)+1; - c4 = 0; - } - currentSize++; - if(primary2 != 0) { - currentSize++; - } - } - wasShifted = TRUE; - } else { + /* and other ignorables should be removed if following a shifted code point */ + if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ + /* we should just completely ignore it */ + continue; + } + if(compareQuad == 0) { + if(c4 > 0) { + currentSize += (c2/UCOL_BOT_COUNT4)+1; + c4 = 0; + } + currentSize++; + if(primary2 != 0) { + currentSize++; + } + } + wasShifted = TRUE; + } else { wasShifted = FALSE; /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ /* calculate sortkey size */ if(primary1 != UCOL_IGNORABLE) { - if(notIsContinuation) { - if(leadPrimary == primary1) { - currentSize++; - } else { - if(leadPrimary != 0) { + if(notIsContinuation) { + if(leadPrimary == primary1) { + currentSize++; + } else { + if(leadPrimary != 0) { + currentSize++; + } + if(primary2 == UCOL_IGNORABLE) { + /* one byter, not compressed */ + currentSize++; + leadPrimary = 0; + } + else if(primary1 (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) { + //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { + (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) + { + /* not compressible */ + leadPrimary = 0; + currentSize+=2; + } + else { /* compress */ + leadPrimary = primary1; + currentSize+=2; + } + } + } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ currentSize++; - } - if(primary2 == UCOL_IGNORABLE) { - /* one byter, not compressed */ - currentSize++; - leadPrimary = 0; - } else if(primary1 (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) { - (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { - /* not compressible */ - leadPrimary = 0; - currentSize+=2; - } else { /* compress */ - leadPrimary = primary1; - currentSize+=2; - } - } - } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ - currentSize++; - if(primary2 != UCOL_IGNORABLE) { - currentSize++; + if(primary2 != UCOL_IGNORABLE) { + currentSize++; + } } - } } if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */ - if(!isFrenchSec){ - if (secondary == UCOL_COMMON2 && notIsContinuation) { - c2++; - } else { - if(c2 > 0) { - if (secondary > UCOL_COMMON2) { // not necessary for 4th level. - currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1; + if(!isFrenchSec){ + if (secondary == UCOL_COMMON2 && notIsContinuation) { + c2++; } else { - currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1; + if(c2 > 0) { + if (secondary > UCOL_COMMON2) { // not necessary for 4th level. + currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1; + } else { + currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1; + } + c2 = 0; + } + currentSize++; } - c2 = 0; - } - currentSize++; - } - } else { - fSecs[fSecsLen++] = secondary; - if(fSecsLen == fSecsMaxLen) { - if(fSecs == fSecsBuff) { - fSecs = (uint8_t *)uprv_malloc(2*fSecsLen); - } else { - fSecs = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen); - } - if(fSecs == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return -1; - } - fSecsMaxLen *= 2; - } - if(notIsContinuation) { - if (frenchStartPtr != NULL) { - /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ - uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); - frenchStartPtr = NULL; - } } else { - if (frenchStartPtr == NULL) { - frenchStartPtr = fSecs+fSecsLen-2; - } - frenchEndPtr = fSecs+fSecsLen-1; + fSecs[fSecsLen++] = secondary; + if(fSecsLen == fSecsMaxLen) { + uint8_t *fSecsTemp; + if(fSecs == fSecsBuff) { + fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen); + } else { + fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen); + } + if(fSecsTemp == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + fSecs = fSecsTemp; + fSecsMaxLen *= 2; + } + if(notIsContinuation) { + if (frenchStartPtr != NULL) { + /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ + uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); + frenchStartPtr = NULL; + } + } else { + if (frenchStartPtr == NULL) { + frenchStartPtr = fSecs+fSecsLen-2; + } + frenchEndPtr = fSecs+fSecsLen-1; + } } - } } - if(doCase) { - if (caseShift == 0) { - currentSize++; - caseShift = UCOL_CASE_SHIFT_START; - } - if((tertiary&0x3F) > 0 && notIsContinuation) { - caseShift--; - if((tertiary &0xC0) != 0) { - if (caseShift == 0) { + if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { + // do the case level if we need to do it. We don't want to calculate + // case level for primary ignorables if we have only primary strength and case level + // otherwise we would break well formedness of CEs + if (caseShift == 0) { currentSize++; caseShift = UCOL_CASE_SHIFT_START; - } - caseShift--; } - } + if((tertiary&0x3F) > 0 && notIsContinuation) { + caseShift--; + if((tertiary &0xC0) != 0) { + if (caseShift == 0) { + currentSize++; + caseShift = UCOL_CASE_SHIFT_START; + } + caseShift--; + } + } } else { - if(notIsContinuation) { - tertiary ^= caseSwitch; - } + if(notIsContinuation) { + tertiary ^= caseSwitch; + } } tertiary &= tertiaryMask; if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */ - if (tertiary == tertiaryCommon && notIsContinuation) { - c3++; - } else { - if(c3 > 0) { - if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) - || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) { - currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1; - } else { - currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1; - } - c3 = 0; + if (tertiary == tertiaryCommon && notIsContinuation) { + c3++; + } else { + if(c3 > 0) { + if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) + || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) { + currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1; + } else { + currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1; + } + c3 = 0; + } + currentSize++; } - currentSize++; - } } if(/*qShifted*/(compareQuad==0) && notIsContinuation) { - if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it - if(c4>0) { // Close this part - currentSize += (c4/UCOL_BOT_COUNT4)+1; - c4 = 0; - } - currentSize++; // Add the Hiragana - } else { // This wasn't Hiragana, so we can continue adding stuff - c4++; - } + if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it + if(c4>0) { // Close this part + currentSize += (c4/UCOL_BOT_COUNT4)+1; + c4 = 0; + } + currentSize++; // Add the Hiragana + } else { // This wasn't Hiragana, so we can continue adding stuff + c4++; + } } - - } + } } if(!isFrenchSec){ - if(c2 > 0) { - currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); - } + if(c2 > 0) { + currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); + } } else { - uint32_t i = 0; - if(frenchStartPtr != NULL) { - uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); - } - for(i = 0; i 0) { - if (secondary > UCOL_COMMON2) { // not necessary for 4th level. - currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0); + uint32_t i = 0; + if(frenchStartPtr != NULL) { + uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); + } + for(i = 0; i 0) { + if (secondary > UCOL_COMMON2) { // not necessary for 4th level. + currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0); + } else { + currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); + } + c2 = 0; + } + currentSize++; } - c2 = 0; - } - currentSize++; } - } - if(c2 > 0) { - currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); - } - if(fSecs != fSecsBuff) { - uprv_free(fSecs); - } + if(c2 > 0) { + currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); + } + if(fSecs != fSecsBuff) { + uprv_free(fSecs); + } } if(c3 > 0) { - currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0); + currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0); } if(c4 > 0 && compareQuad == 0) { - currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0); + currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0); } if(compareIdent) { - currentSize += u_lengthOfIdenticalLevelRun(s->string, len); + currentSize += u_lengthOfIdenticalLevelRun(s->string, len); } return currentSize; - } static inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) { - if (caseShift == 0) { - *(*cases)++ = UCOL_CASE_BYTE_START; - caseShift = UCOL_CASE_SHIFT_START; - } + if (caseShift == 0) { + *(*cases)++ = UCOL_CASE_BYTE_START; + caseShift = UCOL_CASE_SHIFT_START; + } } // Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we // know how many values we wanted to add, even if we didn't add them all static inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) { - size++; - if(primaries < limit) { - *(primaries)++ = value; - } + size++; + if(primaries < limit) { + *(primaries)++ = value; + } } // Packs the secondary buffer when processing French locale. Adds the terminator. static inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) { - uint8_t secondary; - int32_t count2 = 0; - uint32_t i = 0, size = 0; - // we use i here since the key size already accounts for terminators, so we'll discard the increment - addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR); - /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */ - if(frenchStartPtr != NULL) { - uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); - } - for(i = 0; i<*secsize; i++) { - secondary = *(secondaries-i-1); - /* This is compression code. */ - if (secondary == UCOL_COMMON2) { - ++count2; - } else { - if (count2 > 0) { - if (secondary > UCOL_COMMON2) { // not necessary for 4th level. - while (count2 > UCOL_TOP_COUNT2) { - addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2)); - count2 -= (uint32_t)UCOL_TOP_COUNT2; - } - addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1))); + uint8_t secondary; + int32_t count2 = 0; + uint32_t i = 0, size = 0; + // we use i here since the key size already accounts for terminators, so we'll discard the increment + addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR); + /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */ + if(frenchStartPtr != NULL) { + uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); + } + for(i = 0; i<*secsize; i++) { + secondary = *(secondaries-i-1); + /* This is compression code. */ + if (secondary == UCOL_COMMON2) { + ++count2; } else { - while (count2 > UCOL_BOT_COUNT2) { + if (count2 > 0) { + if (secondary > UCOL_COMMON2) { // not necessary for 4th level. + while (count2 > UCOL_TOP_COUNT2) { + addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2)); + count2 -= (uint32_t)UCOL_TOP_COUNT2; + } + addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1))); + } else { + while (count2 > UCOL_BOT_COUNT2) { + addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); + count2 -= (uint32_t)UCOL_BOT_COUNT2; + } + addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); + } + count2 = 0; + } + addWithIncrement(primaries, primEnd, size, secondary); + } + } + if (count2 > 0) { + while (count2 > UCOL_BOT_COUNT2) { addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); count2 -= (uint32_t)UCOL_BOT_COUNT2; - } - addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); } - count2 = 0; - } - addWithIncrement(primaries, primEnd, size, secondary); - } - } - if (count2 > 0) { - while (count2 > UCOL_BOT_COUNT2) { - addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); - count2 -= (uint32_t)UCOL_BOT_COUNT2; + addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); } - addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); - } - *secsize = size; - return primaries; + *secsize = size; + return primaries; } +#define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 + /* This is the sortkey work horse function */ U_CFUNC int32_t U_CALLCONV ucol_calcSortKey(const UCollator *coll, @@ -4895,7 +4817,7 @@ ucol_calcSortKey(const UCollator *coll, UBool allocateSKBuffer, UErrorCode *status) { - const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); + //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); uint32_t i = 0; /* general purpose counter */ @@ -4905,7 +4827,7 @@ ucol_calcSortKey(const UCollator *coll, uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad; if(U_FAILURE(*status)) { - return 0; + return 0; } if(primaries == NULL && allocateSKBuffer == TRUE) { @@ -4935,7 +4857,7 @@ ucol_calcSortKey(const UCollator *coll, UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); //UBool qShifted = shifted && (compareQuad == 0); UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); - const uint8_t *scriptOrder = coll->scriptOrder; + /*const uint8_t *scriptOrder = coll->scriptOrder;*/ uint32_t variableTopValue = coll->variableTopValue; // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no @@ -4943,8 +4865,8 @@ ucol_calcSortKey(const UCollator *coll, uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); uint8_t UCOL_HIRAGANA_QUAD = 0; if(doHiragana) { - UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; - /* allocate one more space for hiragana, value for hiragana */ + UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; + /* allocate one more space for hiragana, value for hiragana */ } uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); @@ -4997,13 +4919,16 @@ ucol_calcSortKey(const UCollator *coll, } if(resultLength == 0 || primaries == NULL) { - int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); - if(normSource != normBuffer) { - uprv_free(normSource); - } - return keyLen; + int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); + if(normSource != normBuffer) { + uprv_free(normSource); + } + return keyLen; + } + uint8_t *primarySafeEnd = primaries + resultLength - 1; + if(strength > UCOL_PRIMARY) { + primarySafeEnd--; } - uint8_t *primarySafeEnd = primaries + resultLength - 2; uint32_t minBufferSize = UCOL_MAX_BUFFER; @@ -5021,7 +4946,7 @@ ucol_calcSortKey(const UCollator *coll, uint8_t tertiary = 0; uint8_t caseSwitch = coll->caseSwitch; uint8_t tertiaryMask = coll->tertiaryMask; - int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition; + int8_t tertiaryAddition = coll->tertiaryAddition; uint8_t tertiaryTop = coll->tertiaryTop; uint8_t tertiaryBottom = coll->tertiaryBottom; uint8_t tertiaryCommon = coll->tertiaryCommon; @@ -5046,422 +4971,485 @@ ucol_calcSortKey(const UCollator *coll, } if(order == 0) { - continue; + continue; } notIsContinuation = !isContinuation(order); if(notIsContinuation) { - tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); + tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); } else { - tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); + tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); } secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); primary1 = (uint8_t)(order >> 8); - if(notIsContinuation) { - if(scriptOrder != NULL) { - primary1 = scriptOrder[primary1]; - } - } + /*if(notIsContinuation && scriptOrder != NULL) { + primary1 = scriptOrder[primary1]; + }*/ if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) - || (!notIsContinuation && wasShifted)) - || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */ - /* and other ignorables should be removed if following a shifted code point */ - if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ - /* we should just completely ignore it */ - continue; - } - if(compareQuad == 0) { - if(count4 > 0) { - while (count4 > UCOL_BOT_COUNT4) { - *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); - count4 -= UCOL_BOT_COUNT4; - } - *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); - count4 = 0; - } - /* We are dealing with a variable and we're treating them as shifted */ - /* This is a shifted ignorable */ - if(primary1 != 0) { /* we need to check this since we could be in continuation */ - *quads++ = primary1; + || (!notIsContinuation && wasShifted)) + || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ + { + /* and other ignorables should be removed if following a shifted code point */ + if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ + /* we should just completely ignore it */ + continue; } - if(primary2 != 0) { - *quads++ = primary2; + if(compareQuad == 0) { + if(count4 > 0) { + while (count4 > UCOL_BOT_COUNT4) { + *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); + count4 -= UCOL_BOT_COUNT4; + } + *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); + count4 = 0; + } + /* We are dealing with a variable and we're treating them as shifted */ + /* This is a shifted ignorable */ + if(primary1 != 0) { /* we need to check this since we could be in continuation */ + *quads++ = primary1; + } + if(primary2 != 0) { + *quads++ = primary2; + } } - } - wasShifted = TRUE; + wasShifted = TRUE; } else { - wasShifted = FALSE; - /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ - /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ - /* regular and simple sortkey calc */ - if(primary1 != UCOL_IGNORABLE) { - if(notIsContinuation) { - if(leadPrimary == primary1) { - *primaries++ = primary2; - } else { - if(leadPrimary != 0) { - *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); - } - if(primary2 == UCOL_IGNORABLE) { - /* one byter, not compressed */ - *primaries++ = primary1; - leadPrimary = 0; - } else if(primary1 (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { - /* not compressible */ - leadPrimary = 0; + wasShifted = FALSE; + /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ + /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ + /* regular and simple sortkey calc */ + if(primary1 != UCOL_IGNORABLE) { + if(notIsContinuation) { + if(leadPrimary == primary1) { + *primaries++ = primary2; + } else { + if(leadPrimary != 0) { + *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); + } + if(primary2 == UCOL_IGNORABLE) { + /* one byter, not compressed */ + *primaries++ = primary1; + leadPrimary = 0; + } else if(primary1 (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { + (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) { + /* not compressible */ + leadPrimary = 0; + *primaries++ = primary1; + if(primaries <= primarySafeEnd) { + *primaries++ = primary2; + } + } else { /* compress */ + *primaries++ = leadPrimary = primary1; + if(primaries <= primarySafeEnd) { + *primaries++ = primary2; + } + } + } + } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ *primaries++ = primary1; - *primaries++ = primary2; - } else { /* compress */ - *primaries++ = leadPrimary = primary1; - *primaries++ = primary2; + if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) { + *primaries++ = primary2; /* second part */ + } } - } - } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ - *primaries++ = primary1; - if(primary2 != UCOL_IGNORABLE) { - *primaries++ = primary2; /* second part */ - } } - } - if(secondary > compareSec) { - if(!isFrenchSec) { - /* This is compression code. */ - if (secondary == UCOL_COMMON2 && notIsContinuation) { - ++count2; - } else { - if (count2 > 0) { - if (secondary > UCOL_COMMON2) { // not necessary for 4th level. - while (count2 > UCOL_TOP_COUNT2) { - *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); - count2 -= (uint32_t)UCOL_TOP_COUNT2; - } - *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); + if(secondary > compareSec) { + if(!isFrenchSec) { + /* This is compression code. */ + if (secondary == UCOL_COMMON2 && notIsContinuation) { + ++count2; + } else { + if (count2 > 0) { + if (secondary > UCOL_COMMON2) { // not necessary for 4th level. + while (count2 > UCOL_TOP_COUNT2) { + *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); + count2 -= (uint32_t)UCOL_TOP_COUNT2; + } + *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); + } else { + while (count2 > UCOL_BOT_COUNT2) { + *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); + count2 -= (uint32_t)UCOL_BOT_COUNT2; + } + *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); + } + count2 = 0; + } + *secondaries++ = secondary; + } } else { - while (count2 > UCOL_BOT_COUNT2) { - *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); - count2 -= (uint32_t)UCOL_BOT_COUNT2; - } - *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); + *secondaries++ = secondary; + /* Do the special handling for French secondaries */ + /* We need to get continuation elements and do intermediate restore */ + /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ + if(notIsContinuation) { + if (frenchStartPtr != NULL) { + /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ + uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); + frenchStartPtr = NULL; + } + } else { + if (frenchStartPtr == NULL) { + frenchStartPtr = secondaries - 2; + } + frenchEndPtr = secondaries-1; + } } - count2 = 0; - } - *secondaries++ = secondary; - } - } else { - *secondaries++ = secondary; - /* Do the special handling for French secondaries */ - /* We need to get continuation elements and do intermediate restore */ - /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ - if(notIsContinuation) { - if (frenchStartPtr != NULL) { - /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ - uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); - frenchStartPtr = NULL; - } - } else { - if (frenchStartPtr == NULL) { - frenchStartPtr = secondaries - 2; - } - frenchEndPtr = secondaries-1; - } } - } - if(doCase) { - doCaseShift(&cases, caseShift); - if(notIsContinuation) { - caseBits = (uint8_t)(tertiary & 0xC0); + if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { + // do the case level if we need to do it. We don't want to calculate + // case level for primary ignorables if we have only primary strength and case level + // otherwise we would break well formedness of CEs + doCaseShift(&cases, caseShift); + if(notIsContinuation) { + caseBits = (uint8_t)(tertiary & 0xC0); + + if(tertiary != 0) { + if(coll->caseFirst == UCOL_UPPER_FIRST) { + if((caseBits & 0xC0) == 0) { + *(cases-1) |= 1 << (--caseShift); + } else { + *(cases-1) |= 0 << (--caseShift); + /* second bit */ + doCaseShift(&cases, caseShift); + *(cases-1) |= ((caseBits>>6)&1) << (--caseShift); + } + } else { + if((caseBits & 0xC0) == 0) { + *(cases-1) |= 0 << (--caseShift); + } else { + *(cases-1) |= 1 << (--caseShift); + /* second bit */ + doCaseShift(&cases, caseShift); + *(cases-1) |= ((caseBits>>7)&1) << (--caseShift); + } + } + } - if(tertiary != 0) { - if(coll->caseFirst == UCOL_UPPER_FIRST) { - if((caseBits & 0xC0) == 0) { - *(cases-1) |= 1 << (--caseShift); - } else { - *(cases-1) |= 0 << (--caseShift); - /* second bit */ - doCaseShift(&cases, caseShift); - *(cases-1) |= ((caseBits>>6)&1) << (--caseShift); - } - } else { - if((caseBits & 0xC0) == 0) { - *(cases-1) |= 0 << (--caseShift); - } else { - *(cases-1) |= 1 << (--caseShift); - /* second bit */ - doCaseShift(&cases, caseShift); - *(cases-1) |= ((caseBits>>7)&1) << (--caseShift); - } } - } - - } - } else { - if(notIsContinuation) { - tertiary ^= caseSwitch; + } else { + if(notIsContinuation) { + tertiary ^= caseSwitch; + } } - } - tertiary &= tertiaryMask; - if(tertiary > compareTer) { - /* This is compression code. */ - /* sequence size check is included in the if clause */ - if (tertiary == tertiaryCommon && notIsContinuation) { - ++count3; - } else { - if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) - || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) { - tertiary += tertiaryAddition; - } - if (count3 > 0) { - if ((tertiary > tertiaryCommon)) { - while (count3 > coll->tertiaryTopCount) { - *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); - count3 -= (uint32_t)coll->tertiaryTopCount; - } - *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); + tertiary &= tertiaryMask; + if(tertiary > compareTer) { + /* This is compression code. */ + /* sequence size check is included in the if clause */ + if (tertiary == tertiaryCommon && notIsContinuation) { + ++count3; } else { - while (count3 > coll->tertiaryBottomCount) { - *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); - count3 -= (uint32_t)coll->tertiaryBottomCount; - } - *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); + if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { + tertiary += tertiaryAddition; + } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { + tertiary -= tertiaryAddition; + } + if (count3 > 0) { + if ((tertiary > tertiaryCommon)) { + while (count3 > coll->tertiaryTopCount) { + *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); + count3 -= (uint32_t)coll->tertiaryTopCount; + } + *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); + } else { + while (count3 > coll->tertiaryBottomCount) { + *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); + count3 -= (uint32_t)coll->tertiaryBottomCount; + } + *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); + } + count3 = 0; + } + *tertiaries++ = tertiary; } - count3 = 0; - } - *tertiaries++ = tertiary; } - } - if(/*qShifted*/(compareQuad==0) && notIsContinuation) { - if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it - if(count4>0) { // Close this part - while (count4 > UCOL_BOT_COUNT4) { - *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); - count4 -= UCOL_BOT_COUNT4; + if(/*qShifted*/(compareQuad==0) && notIsContinuation) { + if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it + if(count4>0) { // Close this part + while (count4 > UCOL_BOT_COUNT4) { + *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); + count4 -= UCOL_BOT_COUNT4; + } + *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); + count4 = 0; + } + *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana + } else { // This wasn't Hiragana, so we can continue adding stuff + count4++; } - *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); - count4 = 0; - } - *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana - } else { // This wasn't Hiragana, so we can continue adding stuff - count4++; } - } } if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ - if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ - IInit_collIterate(coll, (UChar *)source, len, &s); - if(source == normSource) { - s.flags &= ~UCOL_ITER_NORM; - } - sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); - *status = U_BUFFER_OVERFLOW_ERROR; - finished = TRUE; - break; - } else { /* It's much nicer if we can actually reallocate */ - int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart); - primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); - if(U_SUCCESS(*status)) { - *result = primStart; - primarySafeEnd = primStart + resultLength - 2; - } else { - IInit_collIterate(coll, (UChar *)source, len, &s); - if(source == normSource) { - s.flags &= ~UCOL_ITER_NORM; - } - sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); - finished = TRUE; - break; + if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ + IInit_collIterate(coll, (UChar *)source, len, &s); + if(source == normSource) { + s.flags &= ~UCOL_ITER_NORM; + } + sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); + *status = U_BUFFER_OVERFLOW_ERROR; + finished = TRUE; + break; + } else { /* It's much nicer if we can actually reallocate */ + int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart); + primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); + if(U_SUCCESS(*status)) { + *result = primStart; + primarySafeEnd = primStart + resultLength - 1; + if(strength > UCOL_PRIMARY) { + primarySafeEnd--; + } + } else { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + finished = TRUE; + break; + } } - } } } if(finished) { break; } else { - prevBuffSize = minBufferSize; - secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); - terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); - caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status); - quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status); - minBufferSize *= 2; - if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size - IInit_collIterate(coll, (UChar *)source, len, &s); - if(source == normSource) { - s.flags &= ~UCOL_ITER_NORM; - } - sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); - break; - } - } - } - - /* Here, we are generally done with processing */ - /* bailing out would not be too productive */ - - if(U_SUCCESS(*status)) { - sortKeySize += (primaries - primStart); - /* we have done all the CE's, now let's put them together to form a key */ - if(compareSec == 0) { - if (count2 > 0) { - while (count2 > UCOL_BOT_COUNT2) { - *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); - count2 -= (uint32_t)UCOL_BOT_COUNT2; - } - *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); - } - uint32_t secsize = secondaries-secStart; - if(!isFrenchSec) { // Regular situation, we know the length of secondaries - sortKeySize += secsize; - if(sortKeySize <= resultLength) { - *(primaries++) = UCOL_LEVELTERMINATOR; - uprv_memcpy(primaries, secStart, secsize); - primaries += secsize; - } else { - if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ - primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); - if(U_SUCCESS(*status)) { - *result = primStart; - *(primaries++) = UCOL_LEVELTERMINATOR; - uprv_memcpy(primaries, secStart, secsize); - primaries += secsize; - } - } else { - *status = U_BUFFER_OVERFLOW_ERROR; - } - } - } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator - uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); - sortKeySize += secsize; - if(sortKeySize <= resultLength) { // if we managed to pack fine - primaries = newPrim; // update the primary pointer - } else { // overflow, need to reallocate and redo - if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ - primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); - if(U_SUCCESS(*status)) { - primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); - } - } else { - *status = U_BUFFER_OVERFLOW_ERROR; - } - } - } - } - - if(doCase) { - uint32_t casesize = cases - caseStart; - sortKeySize += casesize; - if(sortKeySize <= resultLength) { - *(primaries++) = UCOL_LEVELTERMINATOR; - uprv_memcpy(primaries, caseStart, casesize); - primaries += casesize; - } else { - if(allocateSKBuffer == TRUE) { - primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); - if(U_SUCCESS(*status)) { - *result = primStart; - *(primaries++) = UCOL_LEVELTERMINATOR; - uprv_memcpy(primaries, caseStart, casesize); - } - } else { - *status = U_BUFFER_OVERFLOW_ERROR; - } - } - } - - if(compareTer == 0) { - if (count3 > 0) { - if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { - while (count3 >= coll->tertiaryTopCount) { - *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); - count3 -= (uint32_t)coll->tertiaryTopCount; - } - *tertiaries++ = (uint8_t)(tertiaryTop - count3); - } else { - while (count3 > coll->tertiaryBottomCount) { - *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); - count3 -= (uint32_t)coll->tertiaryBottomCount; - } - *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); - } - } - uint32_t tersize = tertiaries - terStart; - sortKeySize += tersize; - if(sortKeySize <= resultLength) { - *(primaries++) = UCOL_LEVELTERMINATOR; - uprv_memcpy(primaries, terStart, tersize); - primaries += tersize; - } else { - if(allocateSKBuffer == TRUE) { - primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); - if(U_SUCCESS(*status)) { - *result = primStart; - *(primaries++) = UCOL_LEVELTERMINATOR; - uprv_memcpy(primaries, terStart, tersize); - } - } else { - *status = U_BUFFER_OVERFLOW_ERROR; - } + prevBuffSize = minBufferSize; + + uint32_t frenchStartOffset = 0, frenchEndOffset = 0; + if (frenchStartPtr != NULL) { + frenchStartOffset = frenchStartPtr - secStart; + frenchEndOffset = frenchEndPtr - secStart; + } + secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); + terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); + caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status); + quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status); + if(U_FAILURE(*status)) { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + break; + } + if (frenchStartPtr != NULL) { + frenchStartPtr = secStart + frenchStartOffset; + frenchEndPtr = secStart + frenchEndOffset; + } + minBufferSize *= 2; } + } - if(compareQuad == 0/*qShifted == TRUE*/) { - if(count4 > 0) { - while (count4 > UCOL_BOT_COUNT4) { - *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); - count4 -= UCOL_BOT_COUNT4; - } - *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); + /* Here, we are generally done with processing */ + /* bailing out would not be too productive */ + + if(U_SUCCESS(*status)) { + sortKeySize += (primaries - primStart); + /* we have done all the CE's, now let's put them together to form a key */ + if(compareSec == 0) { + if (count2 > 0) { + while (count2 > UCOL_BOT_COUNT2) { + *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); + count2 -= (uint32_t)UCOL_BOT_COUNT2; + } + *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); + } + uint32_t secsize = secondaries-secStart; + if(!isFrenchSec) { // Regular situation, we know the length of secondaries + sortKeySize += secsize; + if(sortKeySize <= resultLength) { + *(primaries++) = UCOL_LEVELTERMINATOR; + uprv_memcpy(primaries, secStart, secsize); + primaries += secsize; + } else { + if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ + primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); + if(U_SUCCESS(*status)) { + *result = primStart; + *(primaries++) = UCOL_LEVELTERMINATOR; + uprv_memcpy(primaries, secStart, secsize); + primaries += secsize; + } + else { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + goto cleanup; + } + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + } + } + } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator + uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); + sortKeySize += secsize; + if(sortKeySize <= resultLength) { // if we managed to pack fine + primaries = newPrim; // update the primary pointer + } else { // overflow, need to reallocate and redo + if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ + primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); + if(U_SUCCESS(*status)) { + primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); + } + else { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + goto cleanup; + } + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + } + } } - uint32_t quadsize = quads - quadStart; - sortKeySize += quadsize; + } + + if(doCase) { + uint32_t casesize = cases - caseStart; + sortKeySize += casesize; if(sortKeySize <= resultLength) { - *(primaries++) = UCOL_LEVELTERMINATOR; - uprv_memcpy(primaries, quadStart, quadsize); - primaries += quadsize; + *(primaries++) = UCOL_LEVELTERMINATOR; + uprv_memcpy(primaries, caseStart, casesize); + primaries += casesize; } else { - if(allocateSKBuffer == TRUE) { - primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); - if(U_SUCCESS(*status)) { - *result = primStart; - *(primaries++) = UCOL_LEVELTERMINATOR; - uprv_memcpy(primaries, quadStart, quadsize); + if(allocateSKBuffer == TRUE) { + primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); + if(U_SUCCESS(*status)) { + *result = primStart; + *(primaries++) = UCOL_LEVELTERMINATOR; + uprv_memcpy(primaries, caseStart, casesize); + } + else { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + goto cleanup; + } + } else { + *status = U_BUFFER_OVERFLOW_ERROR; } - } else { - *status = U_BUFFER_OVERFLOW_ERROR; - } } } - if(compareIdent) { - sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len); - if(sortKeySize <= resultLength) { - *(primaries++) = UCOL_LEVELTERMINATOR; - primaries += u_writeIdenticalLevelRun(s.string, len, primaries); - } else { - if(allocateSKBuffer == TRUE) { - primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status); - if(U_SUCCESS(*status)) { - *result = primStart; + if(compareTer == 0) { + if (count3 > 0) { + if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { + while (count3 >= coll->tertiaryTopCount) { + *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); + count3 -= (uint32_t)coll->tertiaryTopCount; + } + *tertiaries++ = (uint8_t)(tertiaryTop - count3); + } else { + while (count3 > coll->tertiaryBottomCount) { + *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); + count3 -= (uint32_t)coll->tertiaryBottomCount; + } + *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); + } + } + uint32_t tersize = tertiaries - terStart; + sortKeySize += tersize; + if(sortKeySize <= resultLength) { *(primaries++) = UCOL_LEVELTERMINATOR; - u_writeIdenticalLevelRun(s.string, len, primaries); - } + uprv_memcpy(primaries, terStart, tersize); + primaries += tersize; } else { - *status = U_BUFFER_OVERFLOW_ERROR; + if(allocateSKBuffer == TRUE) { + primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); + if(U_SUCCESS(*status)) { + *result = primStart; + *(primaries++) = UCOL_LEVELTERMINATOR; + uprv_memcpy(primaries, terStart, tersize); + } + else { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + goto cleanup; + } + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + } + } + + if(compareQuad == 0/*qShifted == TRUE*/) { + if(count4 > 0) { + while (count4 > UCOL_BOT_COUNT4) { + *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); + count4 -= UCOL_BOT_COUNT4; + } + *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); + } + uint32_t quadsize = quads - quadStart; + sortKeySize += quadsize; + if(sortKeySize <= resultLength) { + *(primaries++) = UCOL_LEVELTERMINATOR; + uprv_memcpy(primaries, quadStart, quadsize); + primaries += quadsize; + } else { + if(allocateSKBuffer == TRUE) { + primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); + if(U_SUCCESS(*status)) { + *result = primStart; + *(primaries++) = UCOL_LEVELTERMINATOR; + uprv_memcpy(primaries, quadStart, quadsize); + } + else { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + goto cleanup; + } + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + } + } + } + + if(compareIdent) { + sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len); + if(sortKeySize <= resultLength) { + *(primaries++) = UCOL_LEVELTERMINATOR; + primaries += u_writeIdenticalLevelRun(s.string, len, primaries); + } else { + if(allocateSKBuffer == TRUE) { + primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status); + if(U_SUCCESS(*status)) { + *result = primStart; + *(primaries++) = UCOL_LEVELTERMINATOR; + u_writeIdenticalLevelRun(s.string, len, primaries); + } + else { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + goto cleanup; + } + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + } + } } - } } - } - *(primaries++) = '\0'; + *(primaries++) = '\0'; + } + + if(allocateSKBuffer == TRUE) { + *result = (uint8_t*)uprv_malloc(sortKeySize); + /* test for NULL */ + if (*result == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + goto cleanup; + } + uprv_memcpy(*result, primStart, sortKeySize); + if(primStart != prim) { + uprv_free(primStart); + } } +cleanup: + if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { + /* NULL terminate for safety */ + **result = 0; + } if(terStart != tert) { uprv_free(terStart); uprv_free(secStart); @@ -5469,23 +5457,13 @@ ucol_calcSortKey(const UCollator *coll, uprv_free(quadStart); } + /* To avoid memory leak, free the offset buffer if necessary. */ + freeOffsetBuffer(&s); + if(normSource != normBuffer) { uprv_free(normSource); } - if(allocateSKBuffer == TRUE) { - *result = (uint8_t*)uprv_malloc(sortKeySize); - /* test for NULL */ - if (*result == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return sortKeySize; - } - uprv_memcpy(*result, primStart, sortKeySize); - if(primStart != prim) { - uprv_free(primStart); - } - } - return sortKeySize; } @@ -5501,7 +5479,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll, { U_ALIGN_CODE(16); - const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); + //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); uint32_t i = 0; /* general purpose counter */ /* Stack allocated buffers for buffers we use */ @@ -5510,7 +5488,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll, uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert; if(U_FAILURE(*status)) { - return 0; + return 0; } if(primaries == NULL && allocateSKBuffer == TRUE) { @@ -5546,6 +5524,11 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll, source, len, UNORM_FCD, FALSE, status); + if(U_FAILURE(*status)) { + /* Should never happen. */ + uprv_free(normSource); + normSource = normBuffer; + } } if(U_FAILURE(*status)) { @@ -5584,7 +5567,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll, uint8_t tertiary = 0; uint8_t caseSwitch = coll->caseSwitch; uint8_t tertiaryMask = coll->tertiaryMask; - int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition; + int8_t tertiaryAddition = coll->tertiaryAddition; uint8_t tertiaryTop = coll->tertiaryTop; uint8_t tertiaryBottom = coll->tertiaryBottom; uint8_t tertiaryCommon = coll->tertiaryCommon; @@ -5603,7 +5586,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll, order = ucol_IGetNextCE(coll, &s, status); if(order == 0) { - continue; + continue; } if(order == UCOL_NO_MORE_CES) { @@ -5614,9 +5597,9 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll, notIsContinuation = !isContinuation(order); if(notIsContinuation) { - tertiary = (uint8_t)((order & tertiaryMask)); + tertiary = (uint8_t)((order & tertiaryMask)); } else { - tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); + tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); } secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); @@ -5627,261 +5610,275 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll, /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ /* regular and simple sortkey calc */ if(primary1 != UCOL_IGNORABLE) { - if(notIsContinuation) { - if(leadPrimary == primary1) { - *primaries++ = primary2; - } else { - if(leadPrimary != 0) { - *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); - } - if(primary2 == UCOL_IGNORABLE) { - /* one byter, not compressed */ - *primaries++ = primary1; - leadPrimary = 0; - } else if(primary1 (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) - (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { - /* not compressible */ - leadPrimary = 0; - *primaries++ = primary1; - *primaries++ = primary2; - } else { /* compress */ - *primaries++ = leadPrimary = primary1; - *primaries++ = primary2; - } - } - } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ - *primaries++ = primary1; - if(primary2 != UCOL_IGNORABLE) { - *primaries++ = primary2; /* second part */ + if(notIsContinuation) { + if(leadPrimary == primary1) { + *primaries++ = primary2; + } else { + if(leadPrimary != 0) { + *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); + } + if(primary2 == UCOL_IGNORABLE) { + /* one byter, not compressed */ + *primaries++ = primary1; + leadPrimary = 0; + } else if(primary1 (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) + //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { + (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) { + /* not compressible */ + leadPrimary = 0; + *primaries++ = primary1; + *primaries++ = primary2; + } else { /* compress */ + *primaries++ = leadPrimary = primary1; + *primaries++ = primary2; + } + } + } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ + *primaries++ = primary1; + if(primary2 != UCOL_IGNORABLE) { + *primaries++ = primary2; /* second part */ + } } - } } if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ - /* This is compression code. */ - if (secondary == UCOL_COMMON2 && notIsContinuation) { - ++count2; - } else { - if (count2 > 0) { - if (secondary > UCOL_COMMON2) { // not necessary for 4th level. - while (count2 > UCOL_TOP_COUNT2) { - *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); - count2 -= (uint32_t)UCOL_TOP_COUNT2; - } - *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); - } else { - while (count2 > UCOL_BOT_COUNT2) { - *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); - count2 -= (uint32_t)UCOL_BOT_COUNT2; + /* This is compression code. */ + if (secondary == UCOL_COMMON2 && notIsContinuation) { + ++count2; + } else { + if (count2 > 0) { + if (secondary > UCOL_COMMON2) { // not necessary for 4th level. + while (count2 > UCOL_TOP_COUNT2) { + *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); + count2 -= (uint32_t)UCOL_TOP_COUNT2; + } + *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); + } else { + while (count2 > UCOL_BOT_COUNT2) { + *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); + count2 -= (uint32_t)UCOL_BOT_COUNT2; + } + *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); + } + count2 = 0; } - *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); - } - count2 = 0; + *secondaries++ = secondary; } - *secondaries++ = secondary; - } } if(notIsContinuation) { - tertiary ^= caseSwitch; + tertiary ^= caseSwitch; } - if(tertiary > 0) { - /* This is compression code. */ - /* sequence size check is included in the if clause */ - if (tertiary == tertiaryCommon && notIsContinuation) { - ++count3; - } else { - if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { - tertiary += tertiaryAddition; - } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { - tertiary -= tertiaryAddition; - } - if (count3 > 0) { - if ((tertiary > tertiaryCommon)) { - while (count3 > coll->tertiaryTopCount) { - *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); - count3 -= (uint32_t)coll->tertiaryTopCount; - } - *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); - } else { - while (count3 > coll->tertiaryBottomCount) { - *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); - count3 -= (uint32_t)coll->tertiaryBottomCount; + if(tertiary > 0) { + /* This is compression code. */ + /* sequence size check is included in the if clause */ + if (tertiary == tertiaryCommon && notIsContinuation) { + ++count3; + } else { + if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { + tertiary += tertiaryAddition; + } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { + tertiary -= tertiaryAddition; } - *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); - } - count3 = 0; + if (count3 > 0) { + if ((tertiary > tertiaryCommon)) { + while (count3 > coll->tertiaryTopCount) { + *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); + count3 -= (uint32_t)coll->tertiaryTopCount; + } + *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); + } else { + while (count3 > coll->tertiaryBottomCount) { + *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); + count3 -= (uint32_t)coll->tertiaryBottomCount; + } + *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); + } + count3 = 0; + } + *tertiaries++ = tertiary; } - *tertiaries++ = tertiary; - } } if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ - if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ - IInit_collIterate(coll, (UChar *)source, len, &s); - if(source == normSource) { - s.flags &= ~UCOL_ITER_NORM; - } - sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); - *status = U_BUFFER_OVERFLOW_ERROR; - finished = TRUE; - break; - } else { /* It's much nicer if we can actually reallocate */ - int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart); - primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); - if(U_SUCCESS(*status)) { - *result = primStart; - primarySafeEnd = primStart + resultLength - 2; - } else { - IInit_collIterate(coll, (UChar *)source, len, &s); - if(source == normSource) { - s.flags &= ~UCOL_ITER_NORM; - } - sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); - finished = TRUE; - break; + if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ + IInit_collIterate(coll, (UChar *)source, len, &s); + if(source == normSource) { + s.flags &= ~UCOL_ITER_NORM; + } + sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); + *status = U_BUFFER_OVERFLOW_ERROR; + finished = TRUE; + break; + } else { /* It's much nicer if we can actually reallocate */ + int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart); + primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); + if(U_SUCCESS(*status)) { + *result = primStart; + primarySafeEnd = primStart + resultLength - 2; + } else { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + finished = TRUE; + break; + } } - } } } if(finished) { break; } else { - prevBuffSize = minBufferSize; - secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); - terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); - minBufferSize *= 2; - if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size - IInit_collIterate(coll, (UChar *)source, len, &s); - if(source == normSource) { - s.flags &= ~UCOL_ITER_NORM; - } - sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); - break; - } + prevBuffSize = minBufferSize; + secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); + terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); + minBufferSize *= 2; + if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + break; + } } } if(U_SUCCESS(*status)) { - sortKeySize += (primaries - primStart); - /* we have done all the CE's, now let's put them together to form a key */ - if (count2 > 0) { - while (count2 > UCOL_BOT_COUNT2) { - *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); - count2 -= (uint32_t)UCOL_BOT_COUNT2; + sortKeySize += (primaries - primStart); + /* we have done all the CE's, now let's put them together to form a key */ + if (count2 > 0) { + while (count2 > UCOL_BOT_COUNT2) { + *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); + count2 -= (uint32_t)UCOL_BOT_COUNT2; + } + *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); } - *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); - } - uint32_t secsize = secondaries-secStart; - sortKeySize += secsize; - if(sortKeySize <= resultLength) { - *(primaries++) = UCOL_LEVELTERMINATOR; - uprv_memcpy(primaries, secStart, secsize); - primaries += secsize; - } else { - if(allocateSKBuffer == TRUE) { - primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); - if(U_SUCCESS(*status)) { + uint32_t secsize = secondaries-secStart; + sortKeySize += secsize; + if(sortKeySize <= resultLength) { *(primaries++) = UCOL_LEVELTERMINATOR; - *result = primStart; uprv_memcpy(primaries, secStart, secsize); - } + primaries += secsize; } else { - *status = U_BUFFER_OVERFLOW_ERROR; + if(allocateSKBuffer == TRUE) { + primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); + if(U_SUCCESS(*status)) { + *(primaries++) = UCOL_LEVELTERMINATOR; + *result = primStart; + uprv_memcpy(primaries, secStart, secsize); + } + else { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + goto cleanup; + } + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + } } - } - if (count3 > 0) { - if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { - while (count3 >= coll->tertiaryTopCount) { - *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); - count3 -= (uint32_t)coll->tertiaryTopCount; - } - *tertiaries++ = (uint8_t)(tertiaryTop - count3); - } else { - while (count3 > coll->tertiaryBottomCount) { - *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); - count3 -= (uint32_t)coll->tertiaryBottomCount; - } - *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); + if (count3 > 0) { + if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { + while (count3 >= coll->tertiaryTopCount) { + *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); + count3 -= (uint32_t)coll->tertiaryTopCount; + } + *tertiaries++ = (uint8_t)(tertiaryTop - count3); + } else { + while (count3 > coll->tertiaryBottomCount) { + *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); + count3 -= (uint32_t)coll->tertiaryBottomCount; + } + *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); + } } - } - uint32_t tersize = tertiaries - terStart; - sortKeySize += tersize; - if(sortKeySize <= resultLength) { - *(primaries++) = UCOL_LEVELTERMINATOR; - uprv_memcpy(primaries, terStart, tersize); - primaries += tersize; - } else { - if(allocateSKBuffer == TRUE) { - primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); - if(U_SUCCESS(*status)) { - *result = primStart; + uint32_t tersize = tertiaries - terStart; + sortKeySize += tersize; + if(sortKeySize <= resultLength) { *(primaries++) = UCOL_LEVELTERMINATOR; uprv_memcpy(primaries, terStart, tersize); - } + primaries += tersize; } else { - *status = U_MEMORY_ALLOCATION_ERROR; + if(allocateSKBuffer == TRUE) { + primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); + if(U_SUCCESS(*status)) { + *result = primStart; + *(primaries++) = UCOL_LEVELTERMINATOR; + uprv_memcpy(primaries, terStart, tersize); + } + else { + /* We ran out of memory!? We can't recover. */ + sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; + goto cleanup; + } + } else { + *status = U_MEMORY_ALLOCATION_ERROR; + } } - } - *(primaries++) = '\0'; + *(primaries++) = '\0'; + } + + if(allocateSKBuffer == TRUE) { + *result = (uint8_t*)uprv_malloc(sortKeySize); + /* test for NULL */ + if (*result == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + goto cleanup; + } + uprv_memcpy(*result, primStart, sortKeySize); + if(primStart != prim) { + uprv_free(primStart); + } } +cleanup: + if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { + /* NULL terminate for safety */ + **result = 0; + } if(terStart != tert) { uprv_free(terStart); uprv_free(secStart); } + /* To avoid memory leak, free the offset buffer if necessary. */ + freeOffsetBuffer(&s); + if(normSource != normBuffer) { uprv_free(normSource); } - if(allocateSKBuffer == TRUE) { - *result = (uint8_t*)uprv_malloc(sortKeySize); - /* test for NULL */ - if (*result == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return sortKeySize; - } - uprv_memcpy(*result, primStart, sortKeySize); - if(primStart != prim) { - uprv_free(primStart); - } - } - return sortKeySize; } static inline UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { - UBool notIsContinuation = !isContinuation(CE); - uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); - if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) - || (!notIsContinuation && *wasShifted)) - || (*wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */ - // The stuff below should probably be in the sortkey code... maybe not... - if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ - /* we should just completely ignore it */ - *wasShifted = TRUE; - //continue; - } - //*wasShifted = TRUE; - return TRUE; - } else { - *wasShifted = FALSE; - return FALSE; - } + UBool notIsContinuation = !isContinuation(CE); + uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); + if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) + || (!notIsContinuation && *wasShifted)) + || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ + { + // The stuff below should probably be in the sortkey code... maybe not... + if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ + /* we should just completely ignore it */ + *wasShifted = TRUE; + //continue; + } + //*wasShifted = TRUE; + return TRUE; + } else { + *wasShifted = FALSE; + return FALSE; + } } static inline void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { - if(level < maxLevel) { - dest[i++] = UCOL_LEVELTERMINATOR; - } else { - dest[i++] = 0; - } + if(level < maxLevel) { + dest[i++] = UCOL_LEVELTERMINATOR; + } else { + dest[i++] = 0; + } } /** enumeration of level identifiers for partial sort key generation */ @@ -5917,12 +5914,15 @@ enum { /** When we do French we need to reverse secondary values. However, continuations * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba */ - UCOL_PSK_USED_ELEMENTS_SHIFT = 7, - UCOL_PSK_USED_ELEMENTS_MASK = 0x3FF, - UCOL_PSK_ITER_SKIP_SHIFT = 17, - UCOL_PSK_ITER_SKIP_MASK = 0x7FFF + UCOL_PSK_BOCSU_BYTES_SHIFT = 7, + UCOL_PSK_BOCSU_BYTES_MASK = 3, + UCOL_PSK_CONSUMED_CES_SHIFT = 9, + UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF }; +// macro calculating the number of expansion CEs available +#define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn + /** main sortkey part procedure. On the first call, * you should pass in a collator, an iterator, empty state @@ -5960,21 +5960,18 @@ enum { * 4 - was shifted. Whether the previous iteration finished in the * shifted state. * 5, 6 - French continuation bytes written. See the comment in the enum - * 7..16 - Used elements. Number of CEs that were already used from the - * expansion buffer or number of bytes from a bocu sequence on + * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on * the identical level. - * 17..31 - iterator skip. Number of move operations iterator needs to - * skip from the current state in order to continue. This is used - * only if normalization is turned on, since the normalizing iterator - * can return undefined state, which means that it's in the middle - * of normalizing sequence. + * 9..31 - CEs consumed. Number of getCE or next32 operations performed + * since thes last successful update of the iterator state. */ U_CAPI int32_t U_EXPORT2 ucol_nextSortKeyPart(const UCollator *coll, UCharIterator *iter, uint32_t state[2], uint8_t *dest, int32_t count, - UErrorCode *status) { + UErrorCode *status) +{ /* error checking */ if(status==NULL || U_FAILURE(*status)) { return 0; @@ -5985,6 +5982,8 @@ ucol_nextSortKeyPart(const UCollator *coll, count<0 || (count>0 && dest==NULL) ) { *status=U_ILLEGAL_ARGUMENT_ERROR; + UTRACE_EXIT_STATUS(status); + return 0; } UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", @@ -5995,7 +5994,6 @@ ucol_nextSortKeyPart(const UCollator *coll, UTRACE_EXIT_VALUE(0); return 0; } - /** Setting up situation according to the state we got from the previous iteration */ // The state of the iterator from the previous invocation uint32_t iterState = state[0]; @@ -6008,13 +6006,13 @@ ucol_nextSortKeyPart(const UCollator *coll, int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; // number of bytes in the continuation buffer for French int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; - // Skip the CEs that we got from an extraction - // and delivered in the previous call - int32_t usedElements = (state[1] >> UCOL_PSK_USED_ELEMENTS_SHIFT) & UCOL_PSK_USED_ELEMENTS_MASK; - // Number of times to skip because the iterator returned - // UITER_NO_STATE when it was stopped in the last iteration, so we had to save the - // last valid state. - int32_t iterSkips = (state[1] >> UCOL_PSK_ITER_SKIP_SHIFT) & UCOL_PSK_ITER_SKIP_MASK; + // Number of bytes already written from a bocsu sequence. Since + // the longes bocsu sequence is 4 long, this can be up to 3. + int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK; + // Number of elements that need to be consumed in this iteration because + // the iterator returned UITER_NO_STATE at the end of the last iteration, + // so we had to save the last valid state. + int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK; /** values that depend on the collator attributes */ // strength of the collator. @@ -6022,18 +6020,18 @@ ucol_nextSortKeyPart(const UCollator *coll, // maximal level of the partial sortkey. Need to take whether case level is done int32_t maxLevel = 0; if(strength < UCOL_TERTIARY) { - if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { - maxLevel = UCOL_PSK_CASE; - } else { - maxLevel = strength; - } + if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { + maxLevel = UCOL_PSK_CASE; + } else { + maxLevel = strength; + } } else { if(strength == UCOL_TERTIARY) { - maxLevel = UCOL_PSK_TERTIARY; + maxLevel = UCOL_PSK_TERTIARY; } else if(strength == UCOL_QUATERNARY) { - maxLevel = UCOL_PSK_QUATERNARY; + maxLevel = UCOL_PSK_QUATERNARY; } else { // identical - maxLevel = UCOL_IDENTICAL; + maxLevel = UCOL_IDENTICAL; } } // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation @@ -6065,24 +6063,24 @@ ucol_nextSortKeyPart(const UCollator *coll, // If the normalization is turned on for the collator and we are below identical level // we will use a FCD normalizing iterator if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { - normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); - s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); - s.flags &= ~UCOL_ITER_NORM; - if(U_FAILURE(*status)) { - UTRACE_EXIT_STATUS(*status); - return 0; - } + normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); + s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); + s.flags &= ~UCOL_ITER_NORM; + if(U_FAILURE(*status)) { + UTRACE_EXIT_STATUS(*status); + return 0; + } } else if(level == UCOL_PSK_IDENTICAL) { - // for identical level, we need a NFD iterator. We need to instantiate it here, since we - // will be updating the state - and this cannot be done on an ordinary iterator. - normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); - s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); - s.flags &= ~UCOL_ITER_NORM; - if(U_FAILURE(*status)) { - UTRACE_EXIT_STATUS(*status); - return 0; - } - doingIdenticalFromStart = TRUE; + // for identical level, we need a NFD iterator. We need to instantiate it here, since we + // will be updating the state - and this cannot be done on an ordinary iterator. + normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); + s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); + s.flags &= ~UCOL_ITER_NORM; + if(U_FAILURE(*status)) { + UTRACE_EXIT_STATUS(*status); + return 0; + } + doingIdenticalFromStart = TRUE; } // This is the tentative new state of the iterator. The problem @@ -6094,80 +6092,62 @@ ucol_nextSortKeyPart(const UCollator *coll, // First, we set the iterator to the last valid position // from the last iteration. This was saved in state[0]. if(iterState == 0) { - /* initial state */ - if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { - s.iterator->move(s.iterator, 0, UITER_LIMIT); - } else { - s.iterator->move(s.iterator, 0, UITER_START); - } + /* initial state */ + if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { + s.iterator->move(s.iterator, 0, UITER_LIMIT); + } else { + s.iterator->move(s.iterator, 0, UITER_START); + } } else { /* reset to previous state */ - s.iterator->setState(s.iterator, iterState, status); - if(U_FAILURE(*status)) { - UTRACE_EXIT_STATUS(*status); - return 0; - } + s.iterator->setState(s.iterator, iterState, status); + if(U_FAILURE(*status)) { + UTRACE_EXIT_STATUS(*status); + return 0; + } } - // Then, we may have to move more, if the normalizing iterator - // was going through a normalizing sequence. - if(iterSkips) { - // if we are on secondary level AND we do French, we need to go backward instead of forward - if(level == UCOL_PSK_SECONDARY && doingFrench) { - s.iterator->move(s.iterator, -iterSkips, UITER_CURRENT); - } else { - s.iterator->move(s.iterator, iterSkips, UITER_CURRENT); - } - } - // Number of expansion CEs that were already consumed in the - // previous iteration for the last code point processed. We - // want to clean out the expansion buffer, so that we can - // get correct CEs. This value is persistent over iterations, - // since we can have several iterations on the one expansion - // buffer. - int32_t consumedExpansionCEs = usedElements; - // Number of bytes already writted from a bocsu sequence. Since - // the longes bocsu sequence is 4 long, this can be up to 3. It - // shares the state field with consumedExpansionCEs value, since - // they cannot simultanously appear on the same level - int32_t bocsuBytesUsed = 0; - // Clean out the expansion buffer unless we are on - // identical level. In that case we use this field - // to store the number of bytes already written - // from the previous bocsu sequence. - if(level < UCOL_PSK_IDENTICAL && usedElements != 0) { - while(usedElements-->0) { - // If we're doing French and we are on the secondary level, - // we go backwards. - if(level == UCOL_PSK_SECONDARY && doingFrench) { - CE = ucol_IGetPrevCE(coll, &s, status); - } else { - CE = ucol_IGetNextCE(coll, &s, status); - } - if(CE==UCOL_NO_MORE_CES) { - /* should not happen */ - *status=U_INTERNAL_PROGRAM_ERROR; - UTRACE_EXIT_STATUS(*status); - return 0; + // This variable tells us whether we can attempt to update the state + // of iterator. Situations where we don't want to update iterator state + // are the existence of expansion CEs that are not yet processed, and + // finishing the case level without enough space in the buffer to insert + // a level terminator. + UBool canUpdateState = TRUE; + + // Consume all the CEs that were consumed at the end of the previous + // iteration without updating the iterator state. On identical level, + // consume the code points. + int32_t counter = cces; + if(level < UCOL_PSK_IDENTICAL) { + while(counter-->0) { + // If we're doing French and we are on the secondary level, + // we go backwards. + if(level == UCOL_PSK_SECONDARY && doingFrench) { + CE = ucol_IGetPrevCE(coll, &s, status); + } else { + CE = ucol_IGetNextCE(coll, &s, status); + } + if(CE==UCOL_NO_MORE_CES) { + /* should not happen */ + *status=U_INTERNAL_PROGRAM_ERROR; + UTRACE_EXIT_STATUS(*status); + return 0; + } + if(uprv_numAvailableExpCEs(s)) { + canUpdateState = FALSE; + } } - } } else { - bocsuBytesUsed = usedElements; + while(counter-->0) { + uiter_next32(s.iterator); + } } - // This variable prevents the adjusting of iterator - // skip variable when we are the first time on a - // level. I hope there is a better way to do it, but - // I could not think of it. - UBool firstTimeOnLevel = TRUE; // French secondary needs to know whether the iterator state of zero came from previous level OR // from a new invocation... UBool wasDoingPrimary = FALSE; - // Case level is kind of goofy. This variable tells us that - // we are still not done with the case level. - UBool dontAdvanceIteratorBecauseWeNeedALevelTerminator = FALSE; // destination buffer byte counter. When this guy // gets to count, we're done with the iteration int32_t i = 0; @@ -6184,93 +6164,23 @@ ucol_nextSortKeyPart(const UCollator *coll, // out our buffer. switch(level) { case UCOL_PSK_PRIMARY: - wasDoingPrimary = TRUE; - for(;;) { - if(i==count) { - goto saveState; - } - // We should save the state only if we - // are sure that we are done with the - // previous iterator state - if(consumedExpansionCEs == 0 && byteCountOrFrenchDone == 0) { - newState = s.iterator->getState(s.iterator); - if(newState != UITER_NO_STATE) { - iterState = newState; - iterSkips = 0; - } else { - if(!firstTimeOnLevel && !byteCountOrFrenchDone) { - iterSkips++; - } - } - } - firstTimeOnLevel = FALSE; - CE = ucol_IGetNextCE(coll, &s, status); - if(CE==UCOL_NO_MORE_CES) { - // Add the level separator - terminatePSKLevel(level, maxLevel, i, dest); - byteCountOrFrenchDone=0; - // Restart the iteration an move to the - // second level - s.iterator->move(s.iterator, 0, UITER_START); - level = UCOL_PSK_SECONDARY; - break; - } - if(!isShiftedCE(CE, LVT, &wasShifted)) { - CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ - if(CE != 0) { - if(byteCountOrFrenchDone == 0) { - // get the second byte of primary - dest[i++]=(uint8_t)(CE >> 8); - } else { - byteCountOrFrenchDone = 0; - } - if((CE &=0xff)!=0) { - if(i==count) { - /* overflow */ - byteCountOrFrenchDone=1; - goto saveState; - } - dest[i++]=(uint8_t)CE; - } - } - } - if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { - // s.pos != NULL means there is a normalization buffer in effect - // in iterative case, this means that we are doing Thai (maybe discontiguos) - consumedExpansionCEs++; - } else { - consumedExpansionCEs = 0; - } - if(s.pos && *s.pos == 0) { - // maybe it is the end of Thai - we have to have - // an extra skip - iterSkips++; - } - } - /* fall through to next level */ - case UCOL_PSK_SECONDARY: - if(strength >= UCOL_SECONDARY) { - if(!doingFrench) { - for(;;) { - if(i == count) { - goto saveState; + wasDoingPrimary = TRUE; + for(;;) { + if(i==count) { + goto saveState; } // We should save the state only if we // are sure that we are done with the // previous iterator state - if(consumedExpansionCEs == 0) { - newState = s.iterator->getState(s.iterator); - if(newState != UITER_NO_STATE) { - iterState = newState; - iterSkips = 0; - } else { - if(!firstTimeOnLevel) { - iterSkips++; + if(canUpdateState && byteCountOrFrenchDone == 0) { + newState = s.iterator->getState(s.iterator); + if(newState != UITER_NO_STATE) { + iterState = newState; + cces = 0; } - } } - firstTimeOnLevel = FALSE; CE = ucol_IGetNextCE(coll, &s, status); + cces++; if(CE==UCOL_NO_MORE_CES) { // Add the level separator terminatePSKLevel(level, maxLevel, i, dest); @@ -6278,491 +6188,512 @@ ucol_nextSortKeyPart(const UCollator *coll, // Restart the iteration an move to the // second level s.iterator->move(s.iterator, 0, UITER_START); - level = UCOL_PSK_CASE; + cces = 0; + level = UCOL_PSK_SECONDARY; break; } if(!isShiftedCE(CE, LVT, &wasShifted)) { - CE >>= 8; /* get secondary */ - if(CE != 0) { - dest[i++]=(uint8_t)CE; - } + CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ + if(CE != 0) { + if(byteCountOrFrenchDone == 0) { + // get the second byte of primary + dest[i++]=(uint8_t)(CE >> 8); + } else { + byteCountOrFrenchDone = 0; + } + if((CE &=0xff)!=0) { + if(i==count) { + /* overflow */ + byteCountOrFrenchDone = 1; + cces--; + goto saveState; + } + dest[i++]=(uint8_t)CE; + } + } } - if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { - consumedExpansionCEs++; + if(uprv_numAvailableExpCEs(s)) { + canUpdateState = FALSE; } else { - consumedExpansionCEs = 0; - } - if(s.pos && *s.pos == 0) { - iterSkips++; + canUpdateState = TRUE; } - } - } else { // French secondary processing - uint8_t frenchBuff[UCOL_MAX_BUFFER]; - int32_t frenchIndex = 0; - // Here we are going backwards. - // If the iterator is at the beggining, it should be - // moved to end. - if(wasDoingPrimary) { - s.iterator->move(s.iterator, 0, UITER_LIMIT); - } - for(;;) { - if(i == count) { - goto saveState; - } - if(consumedExpansionCEs == 0) { - newState = s.iterator->getState(s.iterator); - if(newState != UITER_NO_STATE) { - iterState = newState; - iterSkips = 0; - } else { - if(!firstTimeOnLevel) { - iterSkips++; + } + /* fall through to next level */ + case UCOL_PSK_SECONDARY: + if(strength >= UCOL_SECONDARY) { + if(!doingFrench) { + for(;;) { + if(i == count) { + goto saveState; + } + // We should save the state only if we + // are sure that we are done with the + // previous iterator state + if(canUpdateState) { + newState = s.iterator->getState(s.iterator); + if(newState != UITER_NO_STATE) { + iterState = newState; + cces = 0; + } + } + CE = ucol_IGetNextCE(coll, &s, status); + cces++; + if(CE==UCOL_NO_MORE_CES) { + // Add the level separator + terminatePSKLevel(level, maxLevel, i, dest); + byteCountOrFrenchDone = 0; + // Restart the iteration an move to the + // second level + s.iterator->move(s.iterator, 0, UITER_START); + cces = 0; + level = UCOL_PSK_CASE; + break; + } + if(!isShiftedCE(CE, LVT, &wasShifted)) { + CE >>= 8; /* get secondary */ + if(CE != 0) { + dest[i++]=(uint8_t)CE; + } + } + if(uprv_numAvailableExpCEs(s)) { + canUpdateState = FALSE; + } else { + canUpdateState = TRUE; + } } - } - } - firstTimeOnLevel = FALSE; - CE = ucol_IGetPrevCE(coll, &s, status); - if(CE==UCOL_NO_MORE_CES) { - // Add the level separator - terminatePSKLevel(level, maxLevel, i, dest); - byteCountOrFrenchDone=0; - // Restart the iteration an move to the next level - s.iterator->move(s.iterator, 0, UITER_START); - level = UCOL_PSK_CASE; - break; - } - if(isContinuation(CE)) { // if it's a continuation, we want to save it and - // reverse when we get a first non-continuation CE. - CE >>= 8; - frenchBuff[frenchIndex++] = (uint8_t)CE; - } else if(!isShiftedCE(CE, LVT, &wasShifted)) { - CE >>= 8; /* get secondary */ - if(!frenchIndex) { - if(CE != 0) { - dest[i++]=(uint8_t)CE; + } else { // French secondary processing + uint8_t frenchBuff[UCOL_MAX_BUFFER]; + int32_t frenchIndex = 0; + // Here we are going backwards. + // If the iterator is at the beggining, it should be + // moved to end. + if(wasDoingPrimary) { + s.iterator->move(s.iterator, 0, UITER_LIMIT); + cces = 0; } - } else { - frenchBuff[frenchIndex++] = (uint8_t)CE; - frenchIndex -= usedFrench; - usedFrench = 0; - while(i < count && frenchIndex) { - dest[i++] = frenchBuff[--frenchIndex]; - usedFrench++; + for(;;) { + if(i == count) { + goto saveState; + } + if(canUpdateState) { + newState = s.iterator->getState(s.iterator); + if(newState != UITER_NO_STATE) { + iterState = newState; + cces = 0; + } + } + CE = ucol_IGetPrevCE(coll, &s, status); + cces++; + if(CE==UCOL_NO_MORE_CES) { + // Add the level separator + terminatePSKLevel(level, maxLevel, i, dest); + byteCountOrFrenchDone = 0; + // Restart the iteration an move to the next level + s.iterator->move(s.iterator, 0, UITER_START); + level = UCOL_PSK_CASE; + break; + } + if(isContinuation(CE)) { // if it's a continuation, we want to save it and + // reverse when we get a first non-continuation CE. + CE >>= 8; + frenchBuff[frenchIndex++] = (uint8_t)CE; + } else if(!isShiftedCE(CE, LVT, &wasShifted)) { + CE >>= 8; /* get secondary */ + if(!frenchIndex) { + if(CE != 0) { + dest[i++]=(uint8_t)CE; + } + } else { + frenchBuff[frenchIndex++] = (uint8_t)CE; + frenchIndex -= usedFrench; + usedFrench = 0; + while(i < count && frenchIndex) { + dest[i++] = frenchBuff[--frenchIndex]; + usedFrench++; + } + } + } + if(uprv_numAvailableExpCEs(s)) { + canUpdateState = FALSE; + } else { + canUpdateState = TRUE; + } } - } - } - if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { - consumedExpansionCEs++; - } else { - consumedExpansionCEs = 0; } - if(s.pos && *s.pos == 0) { - iterSkips++; - } - } + } else { + level = UCOL_PSK_CASE; } - } else { - level = UCOL_PSK_CASE; - } /* fall through to next level */ case UCOL_PSK_CASE: - if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { - uint32_t caseShift = UCOL_CASE_SHIFT_START; - uint8_t caseByte = UCOL_CASE_BYTE_START; - uint8_t caseBits = 0; + if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { + uint32_t caseShift = UCOL_CASE_SHIFT_START; + uint8_t caseByte = UCOL_CASE_BYTE_START; + uint8_t caseBits = 0; + + for(;;) { + if(i == count) { + goto saveState; + } + // We should save the state only if we + // are sure that we are done with the + // previous iterator state + if(canUpdateState) { + newState = s.iterator->getState(s.iterator); + if(newState != UITER_NO_STATE) { + iterState = newState; + cces = 0; + } + } + CE = ucol_IGetNextCE(coll, &s, status); + cces++; + if(CE==UCOL_NO_MORE_CES) { + // On the case level we might have an unfinished + // case byte. Add one if it's started. + if(caseShift != UCOL_CASE_SHIFT_START) { + dest[i++] = caseByte; + } + cces = 0; + // We have finished processing CEs on this level. + // However, we don't know if we have enough space + // to add a case level terminator. + if(i < count) { + // Add the level separator + terminatePSKLevel(level, maxLevel, i, dest); + // Restart the iteration and move to the + // next level + s.iterator->move(s.iterator, 0, UITER_START); + level = UCOL_PSK_TERTIARY; + } else { + canUpdateState = FALSE; + } + break; + } - for(;;) { - if(i == count) { - goto saveState; - } - // We should save the state only if we - // are sure that we are done with the - // previous iterator state - if(consumedExpansionCEs == 0) { - newState = s.iterator->getState(s.iterator); - if(newState != UITER_NO_STATE) { - iterState = newState; - iterSkips = 0; - } else { - if(!firstTimeOnLevel) { - iterSkips++; - } - } - } - firstTimeOnLevel = FALSE; - CE = ucol_IGetNextCE(coll, &s, status); - if(CE==UCOL_NO_MORE_CES) { - // On the case level we might have an unfinished - // case byte. Add one if it's started. - if(caseShift != UCOL_CASE_SHIFT_START) { - dest[i++] = caseByte; - } - // This is kind of tricky - situation where - // we need to keep the iterator in the old - // state, but don't need to bring anything - // to the next invocation - if(i < count) { - // Add the level separator - terminatePSKLevel(level, maxLevel, i, dest); - // Restart the iteration and move to the - // next level - s.iterator->move(s.iterator, 0, UITER_START); - level = UCOL_PSK_TERTIARY; - } else { - dontAdvanceIteratorBecauseWeNeedALevelTerminator = TRUE; - } - break; - } + if(!isShiftedCE(CE, LVT, &wasShifted)) { + if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) { + // do the case level if we need to do it. We don't want to calculate + // case level for primary ignorables if we have only primary strength and case level + // otherwise we would break well formedness of CEs + CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); + caseBits = (uint8_t)(CE & 0xC0); + // this copies the case level logic from the + // sort key generation code + if(CE != 0) { + if(coll->caseFirst == UCOL_UPPER_FIRST) { + if((caseBits & 0xC0) == 0) { + caseByte |= 1 << (--caseShift); + } else { + caseByte |= 0 << (--caseShift); + /* second bit */ + if(caseShift == 0) { + dest[i++] = caseByte; + caseShift = UCOL_CASE_SHIFT_START; + caseByte = UCOL_CASE_BYTE_START; + } + caseByte |= ((caseBits>>6)&1) << (--caseShift); + } + } else { + if((caseBits & 0xC0) == 0) { + caseByte |= 0 << (--caseShift); + } else { + caseByte |= 1 << (--caseShift); + /* second bit */ + if(caseShift == 0) { + dest[i++] = caseByte; + caseShift = UCOL_CASE_SHIFT_START; + caseByte = UCOL_CASE_BYTE_START; + } + caseByte |= ((caseBits>>7)&1) << (--caseShift); + } + } + } - if(!isShiftedCE(CE, LVT, &wasShifted)) { - if(!isContinuation(CE)) { - CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); - caseBits = (uint8_t)(CE & 0xC0); - // this copies the case level logic from the - // sort key generation code - if(CE != 0) { - if(coll->caseFirst == UCOL_UPPER_FIRST) { - if((caseBits & 0xC0) == 0) { - caseByte |= 1 << (--caseShift); - } else { - caseByte |= 0 << (--caseShift); - /* second bit */ - if(caseShift == 0) { - dest[i++] = caseByte; - caseShift = UCOL_CASE_SHIFT_START; - caseByte = UCOL_CASE_BYTE_START; - } - caseByte |= ((caseBits>>6)&1) << (--caseShift); - } + } + } + // Not sure this is correct for the case level - revisit + if(uprv_numAvailableExpCEs(s)) { + canUpdateState = FALSE; } else { - if((caseBits & 0xC0) == 0) { - caseByte |= 0 << (--caseShift); - } else { - caseByte |= 1 << (--caseShift); - /* second bit */ - if(caseShift == 0) { - dest[i++] = caseByte; - caseShift = UCOL_CASE_SHIFT_START; - caseByte = UCOL_CASE_BYTE_START; - } - caseByte |= ((caseBits>>7)&1) << (--caseShift); - } + canUpdateState = TRUE; } - } - } - } - // Not sure this is correct for the case level - revisit - if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { - consumedExpansionCEs++; - } else { - consumedExpansionCEs = 0; - } - if(s.pos && *s.pos == 0) { - iterSkips++; - } + } else { + level = UCOL_PSK_TERTIARY; } - } else { - level = UCOL_PSK_TERTIARY; - } /* fall through to next level */ case UCOL_PSK_TERTIARY: - if(strength >= UCOL_TERTIARY) { - for(;;) { - if(i == count) { - goto saveState; - } - // We should save the state only if we - // are sure that we are done with the - // previous iterator state - if(consumedExpansionCEs == 0) { - newState = s.iterator->getState(s.iterator); - if(newState != UITER_NO_STATE) { - iterState = newState; - iterSkips = 0; - } else { - if(!firstTimeOnLevel) { - iterSkips++; - } - } - } - firstTimeOnLevel = FALSE; - CE = ucol_IGetNextCE(coll, &s, status); - if(CE==UCOL_NO_MORE_CES) { - // Add the level separator - terminatePSKLevel(level, maxLevel, i, dest); - byteCountOrFrenchDone=0; - // Restart the iteration an move to the - // second level - s.iterator->move(s.iterator, 0, UITER_START); - level = UCOL_PSK_QUATERNARY; - break; - } - if(!isShiftedCE(CE, LVT, &wasShifted)) { - notIsContinuation = !isContinuation(CE); + if(strength >= UCOL_TERTIARY) { + for(;;) { + if(i == count) { + goto saveState; + } + // We should save the state only if we + // are sure that we are done with the + // previous iterator state + if(canUpdateState) { + newState = s.iterator->getState(s.iterator); + if(newState != UITER_NO_STATE) { + iterState = newState; + cces = 0; + } + } + CE = ucol_IGetNextCE(coll, &s, status); + cces++; + if(CE==UCOL_NO_MORE_CES) { + // Add the level separator + terminatePSKLevel(level, maxLevel, i, dest); + byteCountOrFrenchDone = 0; + // Restart the iteration an move to the + // second level + s.iterator->move(s.iterator, 0, UITER_START); + cces = 0; + level = UCOL_PSK_QUATERNARY; + break; + } + if(!isShiftedCE(CE, LVT, &wasShifted)) { + notIsContinuation = !isContinuation(CE); - if(notIsContinuation) { - CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); - CE ^= coll->caseSwitch; - CE &= coll->tertiaryMask; - } else { - CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); - } + if(notIsContinuation) { + CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); + CE ^= coll->caseSwitch; + CE &= coll->tertiaryMask; + } else { + CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); + } - if(CE != 0) { - dest[i++]=(uint8_t)CE; + if(CE != 0) { + dest[i++]=(uint8_t)CE; + } + } + if(uprv_numAvailableExpCEs(s)) { + canUpdateState = FALSE; + } else { + canUpdateState = TRUE; + } } - } - if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { - consumedExpansionCEs++; - } else { - consumedExpansionCEs = 0; - } - if(s.pos && *s.pos == 0) { - iterSkips++; - } + } else { + // if we're not doing tertiary + // skip to the end + level = UCOL_PSK_NULL; } - } else { - // if we're not doing tertiary - // skip to the end - level = UCOL_PSK_NULL; - } /* fall through to next level */ case UCOL_PSK_QUATERNARY: - if(strength >= UCOL_QUATERNARY) { - for(;;) { - if(i == count) { - goto saveState; - } - // We should save the state only if we - // are sure that we are done with the - // previous iterator state - if(consumedExpansionCEs == 0) { - newState = s.iterator->getState(s.iterator); - if(newState != UITER_NO_STATE) { - iterState = newState; - iterSkips = 0; - } else { - if(!firstTimeOnLevel) { - iterSkips++; - } - } - } - firstTimeOnLevel = FALSE; - CE = ucol_IGetNextCE(coll, &s, status); - if(CE==UCOL_NO_MORE_CES) { - // Add the level separator - terminatePSKLevel(level, maxLevel, i, dest); - //dest[i++] = UCOL_LEVELTERMINATOR; - byteCountOrFrenchDone=0; - // Restart the iteration an move to the - // second level - s.iterator->move(s.iterator, 0, UITER_START); - level = UCOL_PSK_QUIN; - break; - } - if(isShiftedCE(CE, LVT, &wasShifted)) { - CE >>= 16; /* get primary */ - if(CE != 0) { - if(byteCountOrFrenchDone == 0) { - dest[i++]=(uint8_t)(CE >> 8); - } else { - byteCountOrFrenchDone = 0; - } - if((CE &=0xff)!=0) { - if(i==count) { - /* overflow */ - byteCountOrFrenchDone=1; - goto saveState; - } - dest[i++]=(uint8_t)CE; - } - } - } else { - notIsContinuation = !isContinuation(CE); - if(notIsContinuation) { - if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it - dest[i++] = UCOL_HIRAGANA_QUAD; - } else { - dest[i++] = 0xFF; - } + if(strength >= UCOL_QUATERNARY) { + for(;;) { + if(i == count) { + goto saveState; + } + // We should save the state only if we + // are sure that we are done with the + // previous iterator state + if(canUpdateState) { + newState = s.iterator->getState(s.iterator); + if(newState != UITER_NO_STATE) { + iterState = newState; + cces = 0; + } + } + CE = ucol_IGetNextCE(coll, &s, status); + cces++; + if(CE==UCOL_NO_MORE_CES) { + // Add the level separator + terminatePSKLevel(level, maxLevel, i, dest); + //dest[i++] = UCOL_LEVELTERMINATOR; + byteCountOrFrenchDone = 0; + // Restart the iteration an move to the + // second level + s.iterator->move(s.iterator, 0, UITER_START); + cces = 0; + level = UCOL_PSK_QUIN; + break; + } + if(CE==0) + continue; + if(isShiftedCE(CE, LVT, &wasShifted)) { + CE >>= 16; /* get primary */ + if(CE != 0) { + if(byteCountOrFrenchDone == 0) { + dest[i++]=(uint8_t)(CE >> 8); + } else { + byteCountOrFrenchDone = 0; + } + if((CE &=0xff)!=0) { + if(i==count) { + /* overflow */ + byteCountOrFrenchDone = 1; + goto saveState; + } + dest[i++]=(uint8_t)CE; + } + } + } else { + notIsContinuation = !isContinuation(CE); + if(notIsContinuation) { + if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it + dest[i++] = UCOL_HIRAGANA_QUAD; + } else { + dest[i++] = 0xFF; + } + } + } + if(uprv_numAvailableExpCEs(s)) { + canUpdateState = FALSE; + } else { + canUpdateState = TRUE; + } } - } - if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) { - consumedExpansionCEs++; - } else { - consumedExpansionCEs = 0; - } - if(s.pos && *s.pos == 0) { - iterSkips++; - } + } else { + // if we're not doing quaternary + // skip to the end + level = UCOL_PSK_NULL; } - } else { - // if we're not doing quaternary - // skip to the end - level = UCOL_PSK_NULL; - } /* fall through to next level */ case UCOL_PSK_QUIN: - level = UCOL_PSK_IDENTICAL; + level = UCOL_PSK_IDENTICAL; /* fall through to next level */ case UCOL_PSK_IDENTICAL: - if(strength >= UCOL_IDENTICAL) { - UChar32 first, second; - int32_t bocsuBytesWritten = 0; - // We always need to do identical on - // the NFD form of the string. - if(normIter == NULL) { - // we arrived from the level below and - // normalization was not turned on. - // therefore, we need to make a fresh NFD iterator - normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); - s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); - } else if(!doingIdenticalFromStart) { - // there is an iterator, but we did some other levels. - // therefore, we have a FCD iterator - need to make - // a NFD one. - // normIter being at the beginning does not guarantee - // that the underlying iterator is at the beginning - iter->move(iter, 0, UITER_START); - s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); - } - // At this point we have a NFD iterator that is positioned - // in the right place - if(U_FAILURE(*status)) { - UTRACE_EXIT_STATUS(*status); - return 0; - } - first = uiter_previous32(s.iterator); - // maybe we're at the start of the string - if(first == U_SENTINEL) { - first = 0; - } else { - uiter_next32(s.iterator); - } - - j = 0; - for(;;) { - if(i == count) { - if(j+1 < bocsuBytesWritten) { - bocsuBytesUsed = j+1; + if(strength >= UCOL_IDENTICAL) { + UChar32 first, second; + int32_t bocsuBytesWritten = 0; + // We always need to do identical on + // the NFD form of the string. + if(normIter == NULL) { + // we arrived from the level below and + // normalization was not turned on. + // therefore, we need to make a fresh NFD iterator + normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); + s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); + } else if(!doingIdenticalFromStart) { + // there is an iterator, but we did some other levels. + // therefore, we have a FCD iterator - need to make + // a NFD one. + // normIter being at the beginning does not guarantee + // that the underlying iterator is at the beginning + iter->move(iter, 0, UITER_START); + s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); + } + // At this point we have a NFD iterator that is positioned + // in the right place + if(U_FAILURE(*status)) { + UTRACE_EXIT_STATUS(*status); + return 0; + } + first = uiter_previous32(s.iterator); + // maybe we're at the start of the string + if(first == U_SENTINEL) { + first = 0; + } else { + uiter_next32(s.iterator); } - goto saveState; - } - // On identical level, we will always save - // the state if we reach this point, since - // we don't depend on getNextCE for content - // all the content is in our buffer and we - // already either stored the full buffer OR - // otherwise we won't arrive here. - newState = s.iterator->getState(s.iterator); - if(newState != UITER_NO_STATE) { - iterState = newState; - iterSkips = 0; - } else { - iterSkips++; - } + j = 0; + for(;;) { + if(i == count) { + if(j+1 < bocsuBytesWritten) { + bocsuBytesUsed = j+1; + } + goto saveState; + } - uint8_t buff[4]; - second = uiter_next32(s.iterator); + // On identical level, we will always save + // the state if we reach this point, since + // we don't depend on getNextCE for content + // all the content is in our buffer and we + // already either stored the full buffer OR + // otherwise we won't arrive here. + newState = s.iterator->getState(s.iterator); + if(newState != UITER_NO_STATE) { + iterState = newState; + cces = 0; + } - // end condition for identical level - if(second == U_SENTINEL) { - terminatePSKLevel(level, maxLevel, i, dest); - level = UCOL_PSK_NULL; - break; - } - bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); - first = second; + uint8_t buff[4]; + second = uiter_next32(s.iterator); + cces++; + + // end condition for identical level + if(second == U_SENTINEL) { + terminatePSKLevel(level, maxLevel, i, dest); + level = UCOL_PSK_NULL; + break; + } + bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); + first = second; - j = 0; - if(bocsuBytesUsed != 0) { - while(bocsuBytesUsed-->0) { - j++; + j = 0; + if(bocsuBytesUsed != 0) { + while(bocsuBytesUsed-->0) { + j++; + } + } + + while(i < count && j < bocsuBytesWritten) { + dest[i++] = buff[j++]; + } } - } - while(i < count && j < bocsuBytesWritten) { - dest[i++] = buff[j++]; - } + } else { + level = UCOL_PSK_NULL; } - - } else { - level = UCOL_PSK_NULL; - } /* fall through to next level */ case UCOL_PSK_NULL: - j = i; - while(jgetState(s.iterator)) == UITER_NO_STATE) + { + // Any of above mean that the previous transaction + // wasn't finished and that we should store the + // previous iterator state. + state[0] = iterState; } else { - // The transaction is complete. We will continue in - // next iteration. - if((newState = s.iterator->getState(s.iterator))!= UITER_NO_STATE) { + // The transaction is complete. We will continue in the next iteration. state[0] = s.iterator->getState(s.iterator); - iterSkips = 0; - } else { - state[0] = iterState; - iterSkips++; - } + cces = 0; } - // Store the number of elements processed. On CE levels, this is - // the number of expansion CEs processed. On identical level, this - // is the number of bocsu bytes written. - if(level < UCOL_PSK_IDENTICAL) { - if((consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) != consumedExpansionCEs) { - *status = U_INDEX_OUTOFBOUNDS_ERROR; - } - state[1] = (consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT; - } else { - if((bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) != bocsuBytesUsed) { + // Store the number of bocsu bytes written. + if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { *status = U_INDEX_OUTOFBOUNDS_ERROR; - } - state[1] = (bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT; } + state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT; // Next we put in the level of comparison state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); // If we are doing French, we need to store whether we have just finished the French level if(level == UCOL_PSK_SECONDARY && doingFrench) { - state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); + state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); } else { - state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); + state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); } // Was the latest CE shifted if(wasShifted) { - state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; + state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; } - // Check for iterSkips overflow - if((iterSkips & UCOL_PSK_ITER_SKIP_MASK) != iterSkips) { - *status = U_INDEX_OUTOFBOUNDS_ERROR; + // Check for cces overflow + if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { + *status = U_INDEX_OUTOFBOUNDS_ERROR; } - // Store iterSkips - state[1] |= ((iterSkips & UCOL_PSK_ITER_SKIP_MASK) << UCOL_PSK_ITER_SKIP_SHIFT); + // Store cces + state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT); // Check for French overflow if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { - *status = U_INDEX_OUTOFBOUNDS_ERROR; + *status = U_INDEX_OUTOFBOUNDS_ERROR; } // Store number of bytes written in the French secondary continuation sequence state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); @@ -6770,9 +6701,12 @@ saveState: // If we have used normalizing iterator, get rid of it if(normIter != NULL) { - unorm_closeIter(normIter); + unorm_closeIter(normIter); } + /* To avoid memory leak, free the offset buffer if necessary. */ + freeOffsetBuffer(&s); + // Return number of meaningful sortkey bytes. UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", dest,i, state[0], state[1]); @@ -6790,120 +6724,64 @@ ucol_getBound(const uint8_t *source, uint32_t noOfLevels, uint8_t *result, int32_t resultLength, - UErrorCode *status) { - // consistency checks - if(status == NULL || U_FAILURE(*status)) { - return 0; - } - if(source == NULL) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - int32_t sourceIndex = 0; - // Scan the string until we skip enough of the key OR reach the end of the key - do { - sourceIndex++; - if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { - noOfLevels--; + UErrorCode *status) +{ + // consistency checks + if(status == NULL || U_FAILURE(*status)) { + return 0; } - } while (noOfLevels > 0 - && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); - - if((source[sourceIndex] == 0 || sourceIndex == sourceLength) - && noOfLevels > 0) { - *status = U_SORT_KEY_TOO_SHORT_WARNING; - } - - - // READ ME: this code assumes that the values for boundType - // enum will not changes. They are set so that the enum value - // corresponds to the number of extra bytes each bound type - // needs. - if(result != NULL && resultLength >= sourceIndex+boundType) { - uprv_memcpy(result, source, sourceIndex); - switch(boundType) { - // Lower bound just gets terminated. No extra bytes - case UCOL_BOUND_LOWER: // = 0 - break; - // Upper bound needs one extra byte - case UCOL_BOUND_UPPER: // = 1 - result[sourceIndex++] = 2; - break; - // Upper long bound needs two extra bytes - case UCOL_BOUND_UPPER_LONG: // = 2 - result[sourceIndex++] = 0xFF; - result[sourceIndex++] = 0xFF; - break; - default: - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; + if(source == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; } - result[sourceIndex++] = 0; - - return sourceIndex; - } else { - return sourceIndex+boundType+1; - } -} -static -inline void uprv_appendByteToHexString(char *dst, uint8_t val) { - uint32_t len = (uint32_t)uprv_strlen(dst); - *(dst+len) = T_CString_itosOffset((val >> 4)); - *(dst+len+1) = T_CString_itosOffset((val & 0xF)); - *(dst+len+2) = 0; -} - -/* this function makes a string with representation of a sortkey */ -U_CAPI char* U_EXPORT2 ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len) { - int32_t strength = UCOL_PRIMARY; - uint32_t res_size = 0; - UBool doneCase = FALSE; - - char *current = buffer; - const uint8_t *currentSk = sortkey; - - uprv_strcpy(current, "["); - - while(strength <= UCOL_QUATERNARY && strength <= coll->strength) { - if(strength > UCOL_PRIMARY) { - uprv_strcat(current, " . "); - } - while(*currentSk != 0x01 && *currentSk != 0x00) { /* print a level */ - uprv_appendByteToHexString(current, *currentSk++); - uprv_strcat(current, " "); - } - if(coll->caseLevel == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) { - doneCase = TRUE; - } else if(coll->caseLevel == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) { - strength ++; - } - uprv_appendByteToHexString(current, *currentSk++); /* This should print '01' */ - if(strength == UCOL_QUATERNARY && coll->alternateHandling == UCOL_NON_IGNORABLE) { - break; - } - } + int32_t sourceIndex = 0; + // Scan the string until we skip enough of the key OR reach the end of the key + do { + sourceIndex++; + if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { + noOfLevels--; + } + } while (noOfLevels > 0 + && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); - if(coll->strength == UCOL_IDENTICAL) { - uprv_strcat(current, " . "); - while(*currentSk != 0) { - uprv_appendByteToHexString(current, *currentSk++); - uprv_strcat(current, " "); + if((source[sourceIndex] == 0 || sourceIndex == sourceLength) + && noOfLevels > 0) { + *status = U_SORT_KEY_TOO_SHORT_WARNING; } - uprv_appendByteToHexString(current, *currentSk++); - } - uprv_strcat(current, "]"); - if(res_size > *len) { - return NULL; - } + // READ ME: this code assumes that the values for boundType + // enum will not changes. They are set so that the enum value + // corresponds to the number of extra bytes each bound type + // needs. + if(result != NULL && resultLength >= sourceIndex+boundType) { + uprv_memcpy(result, source, sourceIndex); + switch(boundType) { + // Lower bound just gets terminated. No extra bytes + case UCOL_BOUND_LOWER: // = 0 + break; + // Upper bound needs one extra byte + case UCOL_BOUND_UPPER: // = 1 + result[sourceIndex++] = 2; + break; + // Upper long bound needs two extra bytes + case UCOL_BOUND_UPPER_LONG: // = 2 + result[sourceIndex++] = 0xFF; + result[sourceIndex++] = 0xFF; + break; + default: + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + result[sourceIndex++] = 0; - return buffer; + return sourceIndex; + } else { + return sourceIndex+boundType+1; + } } - /****************************************************************************/ /* Following are the functions that deal with the properties of a collator */ /* there are new APIs and some compatibility APIs */ @@ -6911,50 +6789,51 @@ U_CAPI char* U_EXPORT2 ucol_sortKeyToString(const UCollator *coll, const uint8_t static inline void ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, - int32_t *primShift, int32_t *secShift, int32_t *terShift) { - uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; - UBool reverseSecondary = FALSE; - if(!isContinuation(CE)) { - tertiary = (uint8_t)((CE & coll->tertiaryMask)); - tertiary ^= coll->caseSwitch; - reverseSecondary = TRUE; - } else { - tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); - tertiary &= UCOL_REMOVE_CASE; - reverseSecondary = FALSE; - } + int32_t *primShift, int32_t *secShift, int32_t *terShift) +{ + uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; + UBool reverseSecondary = FALSE; + if(!isContinuation(CE)) { + tertiary = (uint8_t)((CE & coll->tertiaryMask)); + tertiary ^= coll->caseSwitch; + reverseSecondary = TRUE; + } else { + tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); + tertiary &= UCOL_REMOVE_CASE; + reverseSecondary = FALSE; + } - secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); - primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); - primary1 = (uint8_t)(CE >> 8); + secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); + primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); + primary1 = (uint8_t)(CE >> 8); - if(primary1 != 0) { - coll->latinOneCEs[ch] |= (primary1 << *primShift); - *primShift -= 8; - } - if(primary2 != 0) { - if(*primShift < 0) { - coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; - coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; - coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; - return; + if(primary1 != 0) { + coll->latinOneCEs[ch] |= (primary1 << *primShift); + *primShift -= 8; + } + if(primary2 != 0) { + if(*primShift < 0) { + coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; + coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; + coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; + return; + } + coll->latinOneCEs[ch] |= (primary2 << *primShift); + *primShift -= 8; + } + if(secondary != 0) { + if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary + coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary + coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); + } else { // normal case + coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift); + } + *secShift -= 8; + } + if(tertiary != 0) { + coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift); + *terShift -= 8; } - coll->latinOneCEs[ch] |= (primary2 << *primShift); - *primShift -= 8; - } - if(secondary != 0) { - if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary - coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary - coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); - } else { // normal case - coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift); - } - *secShift -= 8; - } - if(tertiary != 0) { - coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift); - *terShift -= 8; - } } static inline UBool @@ -6978,168 +6857,186 @@ ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { static UBool ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { - UBool result = TRUE; - if(coll->latinOneCEs == NULL) { - coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3); + UBool result = TRUE; if(coll->latinOneCEs == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return FALSE; + coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3); + if(coll->latinOneCEs == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + coll->latinOneTableLen = UCOL_LATINONETABLELEN; } - coll->latinOneTableLen = UCOL_LATINONETABLELEN; - } - UChar ch = 0; - UCollationElements *it = ucol_openElements(coll, &ch, 1, status); - uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3); - - int32_t primShift = 24, secShift = 24, terShift = 24; - uint32_t CE = 0; - int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; - - // TODO: make safe if you get more than you wanted... - for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { - primShift = 24; secShift = 24; terShift = 24; - if(ch < 0x100) { - CE = coll->latinOneMapping[ch]; - } else { - CE = UTRIE_GET32_FROM_LEAD(coll->mapping, ch); - if(CE == UCOL_NOT_FOUND && coll->UCA) { - CE = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch); - } + UChar ch = 0; + UCollationElements *it = ucol_openElements(coll, &ch, 1, status); + // Check for null pointer + if (U_FAILURE(*status)) { + return FALSE; } - if(CE < UCOL_NOT_FOUND) { - ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); - } else { - switch (getCETag(CE)) { - case EXPANSION_TAG: - case DIGIT_TAG: - ucol_setText(it, &ch, 1, status); - while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { - if(primShift < 0 || secShift < 0 || terShift < 0) { - coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; - coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; - coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; - break; - } - ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); - } - break; - case CONTRACTION_TAG: - // here is the trick - // F2 is contraction. We do something very similar to contractions - // but have two indices, one in the real contraction table and the - // other to where we stuffed things. This hopes that we don't have - // many contractions (this should work for latin-1 tables). - { - if((CE & 0x00FFF000) != 0) { - *status = U_UNSUPPORTED_ERROR; - coll->latinOneFailed = TRUE; - return FALSE; - } - - const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); + uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3); - CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table + int32_t primShift = 24, secShift = 24, terShift = 24; + uint32_t CE = 0; + int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; - coll->latinOneCEs[ch] = CE; - coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; - coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; - - // We're going to jump into contraction table, pick the elements - // and use them - do { - CE = *(coll->contractionCEs + - (UCharOffset - coll->contractionIndex)); - if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) { - uint32_t size; - uint32_t i; /* general counter */ - uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ - size = getExpansionCount(CE); - //CE = *CEOffset++; - if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ - for(i = 0; ilatinOneMapping[ch]; + } else { + CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); + if(CE == UCOL_NOT_FOUND && coll->UCA) { + CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); + } + } + if(CE < UCOL_NOT_FOUND) { + ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); + } else { + switch (getCETag(CE)) { + case EXPANSION_TAG: + case DIGIT_TAG: + ucol_setText(it, &ch, 1, status); + while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { if(primShift < 0 || secShift < 0 || terShift < 0) { - coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; - coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; - coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; - break; + coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; + coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; + coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; + break; } - ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); - } - } else { /* else, we do */ - while(*CEOffset != 0) { - if(primShift < 0 || secShift < 0 || terShift < 0) { - coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; - coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; - coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; - break; + ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); + } + break; + case CONTRACTION_TAG: + // here is the trick + // F2 is contraction. We do something very similar to contractions + // but have two indices, one in the real contraction table and the + // other to where we stuffed things. This hopes that we don't have + // many contractions (this should work for latin-1 tables). + { + if((CE & 0x00FFF000) != 0) { + *status = U_UNSUPPORTED_ERROR; + goto cleanup_after_failure; } - ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); - } + + const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); + + CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table + + coll->latinOneCEs[ch] = CE; + coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; + coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; + + // We're going to jump into contraction table, pick the elements + // and use them + do { + CE = *(coll->contractionCEs + + (UCharOffset - coll->contractionIndex)); + if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) { + uint32_t size; + uint32_t i; /* general counter */ + uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ + size = getExpansionCount(CE); + //CE = *CEOffset++; + if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ + for(i = 0; ilatinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; + coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; + coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; + break; + } + ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); + } + } else { /* else, we do */ + while(*CEOffset != 0) { + if(primShift < 0 || secShift < 0 || terShift < 0) { + coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; + coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; + coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; + break; + } + ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); + } + } + contractionOffset++; + } else if(CE < UCOL_NOT_FOUND) { + ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift); + } else { + coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; + coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; + coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; + contractionOffset++; + } + UCharOffset++; + primShift = 24; secShift = 24; terShift = 24; + if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate + if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) { + goto cleanup_after_failure; + } + } + } while(*UCharOffset != 0xFFFF); } - contractionOffset++; - } else if(CE < UCOL_NOT_FOUND) { - ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift); - } else { - coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; - coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; - coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; - contractionOffset++; - } - UCharOffset++; - primShift = 24; secShift = 24; terShift = 24; - if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate - if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) { - coll->latinOneFailed = TRUE; - return FALSE; + break;; + case SPEC_PROC_TAG: + { + // 0xB7 is a precontext character defined in UCA5.1, a special + // handle is implemeted in order to save LatinOne table for + // most locales. + if (ch==0xb7) { + ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); + } + else { + goto cleanup_after_failure; + } } - } - } while(*UCharOffset != 0xFFFF); + break; + default: + goto cleanup_after_failure; + } } - break; - default: - coll->latinOneFailed = TRUE; - result = FALSE; - break; - } } - } - ucol_closeElements(it); - // compact table - if(contractionOffset < coll->latinOneTableLen) { - if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { - coll->latinOneFailed = TRUE; - return FALSE; + // compact table + if(contractionOffset < coll->latinOneTableLen) { + if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { + goto cleanup_after_failure; + } } - } - return result; + ucol_closeElements(it); + return result; + +cleanup_after_failure: + // status should already be set before arriving here. + coll->latinOneFailed = TRUE; + ucol_closeElements(it); + return FALSE; } void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { - if(U_SUCCESS(*status)) { + if(U_SUCCESS(*status)) { if(coll->caseFirst == UCOL_UPPER_FIRST) { - coll->caseSwitch = UCOL_CASE_SWITCH; + coll->caseSwitch = UCOL_CASE_SWITCH; } else { - coll->caseSwitch = UCOL_NO_CASE_SWITCH; + coll->caseSwitch = UCOL_NO_CASE_SWITCH; } if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { - coll->tertiaryMask = UCOL_REMOVE_CASE; - coll->tertiaryCommon = UCOL_COMMON3_NORMAL; - coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF; - coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; - coll->tertiaryBottom = UCOL_COMMON_BOT3; - } else { - coll->tertiaryMask = UCOL_KEEP_CASE; - coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; - if(coll->caseFirst == UCOL_UPPER_FIRST) { - coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; - coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; - coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; - } else { + coll->tertiaryMask = UCOL_REMOVE_CASE; coll->tertiaryCommon = UCOL_COMMON3_NORMAL; - coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; - coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; - } + coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */ + coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; + coll->tertiaryBottom = UCOL_COMMON_BOT3; + } else { + coll->tertiaryMask = UCOL_KEEP_CASE; + coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; + if(coll->caseFirst == UCOL_UPPER_FIRST) { + coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; + coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; + coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; + } else { + coll->tertiaryCommon = UCOL_COMMON3_NORMAL; + coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; + coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; + } } /* Set the compression values */ @@ -7148,90 +7045,94 @@ void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount); if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY - && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) { - coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; + && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) + { + coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; } else { - coll->sortKeyGen = ucol_calcSortKey; + coll->sortKeyGen = ucol_calcSortKey; } if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF - && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) { - if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { - if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it - //fprintf(stderr, "F"); - coll->latinOneUse = TRUE; - } else { - coll->latinOneUse = FALSE; - } - if(*status == U_UNSUPPORTED_ERROR) { - *status = U_ZERO_ERROR; + && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) + { + if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { + if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it + //fprintf(stderr, "F"); + coll->latinOneUse = TRUE; + } else { + coll->latinOneUse = FALSE; + } + if(*status == U_UNSUPPORTED_ERROR) { + *status = U_ZERO_ERROR; + } + } else { // latin1Table exists and it doesn't need to be regenerated, just use it + coll->latinOneUse = TRUE; } - } else { // latin1Table exists and it doesn't need to be regenerated, just use it - coll->latinOneUse = TRUE; - } } else { - coll->latinOneUse = FALSE; + coll->latinOneUse = FALSE; } - } - + } } U_CAPI uint32_t U_EXPORT2 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { - if(U_FAILURE(*status) || coll == NULL) { - return 0; - } - if(len == -1) { - len = u_strlen(varTop); - } - if(len == 0) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } + if(U_FAILURE(*status) || coll == NULL) { + return 0; + } + if(len == -1) { + len = u_strlen(varTop); + } + if(len == 0) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } - collIterate s; - IInit_collIterate(coll, varTop, len, &s); + collIterate s; + IInit_collIterate(coll, varTop, len, &s); - uint32_t CE = ucol_IGetNextCE(coll, &s, status); + uint32_t CE = ucol_IGetNextCE(coll, &s, status); - /* here we check if we have consumed all characters */ - /* you can put in either one character or a contraction */ - /* you shouldn't put more... */ - if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { - *status = U_CE_NOT_FOUND_ERROR; - return 0; - } + /* here we check if we have consumed all characters */ + /* you can put in either one character or a contraction */ + /* you shouldn't put more... */ + if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { + *status = U_CE_NOT_FOUND_ERROR; + return 0; + } - uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); + uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); - if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { - *status = U_PRIMARY_TOO_LONG_ERROR; - return 0; - } - if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { - coll->variableTopValueisDefault = FALSE; - coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; - } + if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { + *status = U_PRIMARY_TOO_LONG_ERROR; + return 0; + } + if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { + coll->variableTopValueisDefault = FALSE; + coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; + } + + /* To avoid memory leak, free the offset buffer if necessary. */ + freeOffsetBuffer(&s); - return CE & UCOL_PRIMARYMASK; + return CE & UCOL_PRIMARYMASK; } U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { - if(U_FAILURE(*status) || coll == NULL) { - return 0; - } - return coll->variableTopValue<<16; + if(U_FAILURE(*status) || coll == NULL) { + return 0; + } + return coll->variableTopValue<<16; } U_CAPI void U_EXPORT2 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { - if(U_FAILURE(*status) || coll == NULL) { - return; - } + if(U_FAILURE(*status) || coll == NULL) { + return; + } - if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { - coll->variableTopValueisDefault = FALSE; - coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; - } + if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { + coll->variableTopValueisDefault = FALSE; + coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; + } } /* Attribute setter API */ U_CAPI void U_EXPORT2 @@ -7243,33 +7144,33 @@ ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UColAttributeValue oldCaseFirst = coll->caseFirst; switch(attr) { case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ - if(value == UCOL_ON) { - coll->numericCollation = UCOL_ON; - coll->numericCollationisDefault = FALSE; - } else if (value == UCOL_OFF) { - coll->numericCollation = UCOL_OFF; - coll->numericCollationisDefault = FALSE; - } else if (value == UCOL_DEFAULT) { - coll->numericCollationisDefault = TRUE; - coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; - } else { - *status = U_ILLEGAL_ARGUMENT_ERROR; - } - break; + if(value == UCOL_ON) { + coll->numericCollation = UCOL_ON; + coll->numericCollationisDefault = FALSE; + } else if (value == UCOL_OFF) { + coll->numericCollation = UCOL_OFF; + coll->numericCollationisDefault = FALSE; + } else if (value == UCOL_DEFAULT) { + coll->numericCollationisDefault = TRUE; + coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; + } else { + *status = U_ILLEGAL_ARGUMENT_ERROR; + } + break; case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ - if(value == UCOL_ON) { - coll->hiraganaQ = UCOL_ON; - coll->hiraganaQisDefault = FALSE; - } else if (value == UCOL_OFF) { - coll->hiraganaQ = UCOL_OFF; - coll->hiraganaQisDefault = FALSE; - } else if (value == UCOL_DEFAULT) { - coll->hiraganaQisDefault = TRUE; - coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ; - } else { - *status = U_ILLEGAL_ARGUMENT_ERROR; - } - break; + if(value == UCOL_ON) { + coll->hiraganaQ = UCOL_ON; + coll->hiraganaQisDefault = FALSE; + } else if (value == UCOL_OFF) { + coll->hiraganaQ = UCOL_OFF; + coll->hiraganaQisDefault = FALSE; + } else if (value == UCOL_DEFAULT) { + coll->hiraganaQisDefault = TRUE; + coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ; + } else { + *status = U_ILLEGAL_ARGUMENT_ERROR; + } + break; case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ if(value == UCOL_ON) { coll->frenchCollation = UCOL_ON; @@ -7306,8 +7207,8 @@ ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, coll->caseFirst = UCOL_UPPER_FIRST; coll->caseFirstisDefault = FALSE; } else if (value == UCOL_OFF) { - coll->caseFirst = UCOL_OFF; - coll->caseFirstisDefault = FALSE; + coll->caseFirst = UCOL_OFF; + coll->caseFirstisDefault = FALSE; } else if (value == UCOL_DEFAULT) { coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; coll->caseFirstisDefault = TRUE; @@ -7352,268 +7253,70 @@ ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, coll->strength = value; } else { *status = U_ILLEGAL_ARGUMENT_ERROR ; - } - break; - case UCOL_ATTRIBUTE_COUNT: - default: - *status = U_ILLEGAL_ARGUMENT_ERROR; - break; - } - if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { - coll->latinOneRegenTable = TRUE; - } else { - coll->latinOneRegenTable = FALSE; - } - ucol_updateInternalState(coll, status); -} - -U_CAPI UColAttributeValue U_EXPORT2 -ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { - if(U_FAILURE(*status) || coll == NULL) { - return UCOL_DEFAULT; - } - switch(attr) { - case UCOL_NUMERIC_COLLATION: - return coll->numericCollation; - case UCOL_HIRAGANA_QUATERNARY_MODE: - return coll->hiraganaQ; - case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ - return coll->frenchCollation; - case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ - return coll->alternateHandling; - case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ - return coll->caseFirst; - case UCOL_CASE_LEVEL: /* do we have an extra case level */ - return coll->caseLevel; - case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ - return coll->normalizationMode; - case UCOL_STRENGTH: /* attribute for strength */ - return coll->strength; - case UCOL_ATTRIBUTE_COUNT: - default: - *status = U_ILLEGAL_ARGUMENT_ERROR; - break; - } - return UCOL_DEFAULT; -} - -U_CAPI void U_EXPORT2 -ucol_setStrength( UCollator *coll, - UCollationStrength strength) -{ - UErrorCode status = U_ZERO_ERROR; - ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); -} - -U_CAPI UCollationStrength U_EXPORT2 -ucol_getStrength(const UCollator *coll) -{ - UErrorCode status = U_ZERO_ERROR; - return ucol_getAttribute(coll, UCOL_STRENGTH, &status); -} - -/****************************************************************************/ -/* Following are misc functions */ -/* there are new APIs and some compatibility APIs */ -/****************************************************************************/ - -U_CAPI UCollator* U_EXPORT2 -ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) -{ - UCollator * localCollator; - int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); - char *stackBufferChars = (char *)stackBuffer; - - if (status == NULL || U_FAILURE(*status)){ - return 0; - } - if ((stackBuffer && !pBufferSize) || !coll){ - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - /* Pointers on 64-bit platforms need to be aligned - * on a 64-bit boundry in memory. - */ - if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { - int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); - *pBufferSize -= offsetUp; - stackBufferChars += offsetUp; - } - stackBuffer = (void *)stackBufferChars; - - if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ - *pBufferSize = bufferSizeNeeded; - return 0; - } - if (!stackBuffer || *pBufferSize < bufferSizeNeeded) { - /* allocate one here...*/ - int32_t length; - const UChar * rules = ucol_getRules(coll, &length); - - localCollator = ucol_openRules(rules, - length, - ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status), - ucol_getStrength(coll), - NULL, - status); - if (U_SUCCESS(*status)) - { - *status = U_SAFECLONE_ALLOCATED_WARNING; - } - } else { - localCollator = (UCollator *)stackBuffer; - uprv_memcpy(localCollator, coll, sizeof(UCollator)); - localCollator->freeOnClose = FALSE; - localCollator->requestedLocale = NULL; // zero copies of pointers - localCollator->validLocale = NULL; - } - return localCollator; -} - -U_CAPI int32_t U_EXPORT2 -ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) { - UErrorCode status = U_ZERO_ERROR; - int32_t len = 0; - int32_t UCAlen = 0; - const UChar* ucaRules = 0; - const UChar *rules = ucol_getRules(coll, &len); - if(delta == UCOL_FULL_RULES) { - /* take the UCA rules and append real rules at the end */ - /* UCA rules will be probably coming from the root RB */ - ucaRules = ures_getStringByKey(coll->rb,"%%UCARULES",&UCAlen,&status); - /* - UResourceBundle* cresb = ures_getByKeyWithFallback(coll->rb, "collations", NULL, &status); - UResourceBundle* uca = ures_getByKeyWithFallback(cresb, "UCA", NULL, &status); - ucaRules = ures_getStringByKey(uca,"Sequence",&UCAlen,&status); - ures_close(uca); - ures_close(cresb); - */ - } - if(U_FAILURE(status)) { - return 0; - } - if(buffer!=0 && bufferLen>0){ - *buffer=0; - if(UCAlen > 0) { - u_memcpy(buffer, ucaRules, uprv_min(UCAlen, bufferLen)); - } - if(len > 0 && bufferLen > UCAlen) { - u_memcpy(buffer+UCAlen, rules, uprv_min(len, bufferLen-UCAlen)); - } - } - return u_terminateUChars(buffer, bufferLen, len+UCAlen, &status); -} - -static const UChar _NUL = 0; - -U_CAPI const UChar* U_EXPORT2 -ucol_getRules( const UCollator *coll, - int32_t *length) -{ - if(coll->rules != NULL) { - *length = coll->rulesLength; - return coll->rules; - } else { - UErrorCode status = U_ZERO_ERROR; - if(coll->elements != NULL) { - if(U_SUCCESS(status)) { - /*Semantic const */ - ((UCollator *)coll)->rules = ures_getStringByKey(coll->elements, "Sequence", length, &status); - ((UCollator *)coll)->rulesLength = *length; - ((UCollator *)coll)->freeRulesOnClose = FALSE; - return coll->rules; - } + } + break; + case UCOL_ATTRIBUTE_COUNT: + default: + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; } - *length = 0; - return &_NUL; - } -} - -U_CAPI int32_t U_EXPORT2 -ucol_getDisplayName( const char *objLoc, - const char *dispLoc, - UChar *result, - int32_t resultLength, - UErrorCode *status) -{ - - if(U_FAILURE(*status)) return -1; - UnicodeString dst; - if(!(result==NULL && resultLength==0)) { - // NULL destination for pure preflighting: empty dummy string - // otherwise, alias the destination buffer - dst.setTo(result, 0, resultLength); - } - Collator::getDisplayName(Locale(objLoc), Locale(dispLoc), dst); - return dst.extract(result, resultLength, *status); -} - -U_CAPI const char* U_EXPORT2 -ucol_getAvailable(int32_t index) -{ - return uloc_getAvailable(index); -} - -U_CAPI int32_t U_EXPORT2 -ucol_countAvailable() -{ - return uloc_countAvailable(); + if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { + coll->latinOneRegenTable = TRUE; + } else { + coll->latinOneRegenTable = FALSE; + } + ucol_updateInternalState(coll, status); } -#if !UCONFIG_NO_SERVICE -U_CAPI UEnumeration* U_EXPORT2 -ucol_openAvailableLocales(UErrorCode *status) { - // This is a wrapper over Collator::getAvailableLocales() - if (U_FAILURE(*status)) { - return NULL; +U_CAPI UColAttributeValue U_EXPORT2 +ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { + if(U_FAILURE(*status) || coll == NULL) { + return UCOL_DEFAULT; } - StringEnumeration *s = Collator::getAvailableLocales(); - if (s == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; + switch(attr) { + case UCOL_NUMERIC_COLLATION: + return coll->numericCollation; + case UCOL_HIRAGANA_QUATERNARY_MODE: + return coll->hiraganaQ; + case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ + return coll->frenchCollation; + case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ + return coll->alternateHandling; + case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ + return coll->caseFirst; + case UCOL_CASE_LEVEL: /* do we have an extra case level */ + return coll->caseLevel; + case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ + return coll->normalizationMode; + case UCOL_STRENGTH: /* attribute for strength */ + return coll->strength; + case UCOL_ATTRIBUTE_COUNT: + default: + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; } - return uenum_openStringEnumeration(s, status); + return UCOL_DEFAULT; } -#endif - -// Note: KEYWORDS[0] != RESOURCE_NAME - alan -static const char* RESOURCE_NAME = "collations"; - -static const char* KEYWORDS[] = { "collation" }; - -#define KEYWORD_COUNT (sizeof(KEYWORDS)/sizeof(KEYWORDS[0])) - -U_CAPI UEnumeration* U_EXPORT2 -ucol_getKeywords(UErrorCode *status) { - UEnumeration *result = NULL; - if (U_SUCCESS(*status)) { - return uenum_openCharStringsEnumeration(KEYWORDS, KEYWORD_COUNT, status); - } - return result; +U_CAPI void U_EXPORT2 +ucol_setStrength( UCollator *coll, + UCollationStrength strength) +{ + UErrorCode status = U_ZERO_ERROR; + ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); } -U_CAPI UEnumeration* U_EXPORT2 -ucol_getKeywordValues(const char *keyword, UErrorCode *status) { - // hard-coded to accept exactly one collation keyword - // modify if additional collation keyword is added later - if (U_SUCCESS(*status) && - keyword==NULL || uprv_strcmp(keyword, KEYWORDS[0])!=0) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - return ures_getKeywordValues(U_ICUDATA_COLL, RESOURCE_NAME, status); +U_CAPI UCollationStrength U_EXPORT2 +ucol_getStrength(const UCollator *coll) +{ + UErrorCode status = U_ZERO_ERROR; + return ucol_getAttribute(coll, UCOL_STRENGTH, &status); } -U_CAPI int32_t U_EXPORT2 -ucol_getFunctionalEquivalent(char* result, int32_t resultCapacity, - const char* keyword, const char* locale, - UBool* isAvailable, UErrorCode* status) { - // N.B.: Resource name is "collations" but keyword is "collation" - return ures_getFunctionalEquivalent(result, resultCapacity, U_ICUDATA_COLL, - "collations", keyword, locale, - isAvailable, TRUE, status); -} +/****************************************************************************/ +/* Following are misc functions */ +/* there are new APIs and some compatibility APIs */ +/****************************************************************************/ U_CAPI void U_EXPORT2 ucol_getVersion(const UCollator* coll, @@ -7648,35 +7351,27 @@ ucol_getVersion(const UCollator* coll, /* This internal API checks whether a character is tailored or not */ U_CAPI UBool U_EXPORT2 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { - uint32_t CE = UCOL_NOT_FOUND; - const UChar *ContractionStart = NULL; - if(U_SUCCESS(*status) && coll != NULL) { - if(coll == coll->UCA) { - return FALSE; - } else if(u < 0x100) { /* latin-1 */ - CE = coll->latinOneMapping[u]; - if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { + if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { return FALSE; - } - } else { /* regular */ - /*CE = ucmpe32_get(coll->mapping, u);*/ - CE = UTRIE_GET32_FROM_LEAD(coll->mapping, u); + } + uint32_t CE = UCOL_NOT_FOUND; + const UChar *ContractionStart = NULL; + if(u < 0x100) { /* latin-1 */ + CE = coll->latinOneMapping[u]; + if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { + return FALSE; + } + } else { /* regular */ + CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); } if(isContraction(CE)) { - ContractionStart = (UChar *)coll->image+getContractOffset(CE); - CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); + ContractionStart = (UChar *)coll->image+getContractOffset(CE); + CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); } - if(CE == UCOL_NOT_FOUND) { - return FALSE; - } else { - return TRUE; - } - } else { - return FALSE; - } + return (UBool)(CE != UCOL_NOT_FOUND); } @@ -7724,100 +7419,102 @@ UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBoo UBool freeSBuf = FALSE, freeTBuf = FALSE; if (sColl->flags & UCOL_USE_ITERATOR) { - UNormIterator *sNIt = NULL, *tNIt = NULL; - sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); - tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); - sColl->iterator->move(sColl->iterator, 0, UITER_START); - tColl->iterator->move(tColl->iterator, 0, UITER_START); - UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); - UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); - comparison = u_strCompareIter(sIt, tIt, TRUE); - unorm_closeIter(sNIt); - unorm_closeIter(tNIt); + UNormIterator *sNIt = NULL, *tNIt = NULL; + sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); + tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); + sColl->iterator->move(sColl->iterator, 0, UITER_START); + tColl->iterator->move(tColl->iterator, 0, UITER_START); + UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); + UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); + comparison = u_strCompareIter(sIt, tIt, TRUE); + unorm_closeIter(sNIt); + unorm_closeIter(tNIt); } else { - sLen = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1; - sBuf = sColl->string; - tLen = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1; - tBuf = tColl->string; - - if (normalize) { - *status = U_ZERO_ERROR; - if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) { - sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize, - sBuf, sLen, - FALSE, 0, - status); - if(*status == U_BUFFER_OVERFLOW_ERROR) { - if(!u_growBufferFromStatic(sColl->stackWritableBuffer, - &sColl->writableBuffer, - (int32_t *)&sColl->writableBufSize, sLen, - 0) - ) { - *status = U_MEMORY_ALLOCATION_ERROR; - return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */ - } - *status = U_ZERO_ERROR; - sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize, - sBuf, sLen, - FALSE, 0, - status); - } - if(freeSBuf) { - uprv_free(sBuf); - freeSBuf = FALSE; - } - sBuf = sColl->writableBuffer; - if (sBuf != sColl->stackWritableBuffer) { - sColl->flags |= UCOL_ITER_ALLOCATED; - } - } + sLen = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1; + sBuf = sColl->string; + tLen = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1; + tBuf = tColl->string; - *status = U_ZERO_ERROR; - if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) { - tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize, - tBuf, tLen, - FALSE, 0, - status); - if(*status == U_BUFFER_OVERFLOW_ERROR) { - if(!u_growBufferFromStatic(tColl->stackWritableBuffer, - &tColl->writableBuffer, - (int32_t *)&tColl->writableBufSize, tLen, - 0) - ) { - *status = U_MEMORY_ALLOCATION_ERROR; - return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */ - } - *status = U_ZERO_ERROR; - tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize, - tBuf, tLen, - FALSE, 0, - status); - } - if(freeTBuf) { - uprv_free(tBuf); - freeTBuf = FALSE; - } - tBuf = tColl->writableBuffer; - if (tBuf != tColl->stackWritableBuffer) { - tColl->flags |= UCOL_ITER_ALLOCATED; - } - } - } + if (normalize) { + *status = U_ZERO_ERROR; + if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) { + sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize, + sBuf, sLen, + FALSE, 0, + status); + if(*status == U_BUFFER_OVERFLOW_ERROR) { + if(!u_growBufferFromStatic(sColl->stackWritableBuffer, + &sColl->writableBuffer, + (int32_t *)&sColl->writableBufSize, sLen, + 0) + ) + { + *status = U_MEMORY_ALLOCATION_ERROR; + return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */ + } + *status = U_ZERO_ERROR; + sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize, + sBuf, sLen, + FALSE, 0, + status); + } + if(freeSBuf) { + uprv_free(sBuf); + freeSBuf = FALSE; + } + sBuf = sColl->writableBuffer; + if (sBuf != sColl->stackWritableBuffer) { + sColl->flags |= UCOL_ITER_ALLOCATED; + } + } - if (sLen == -1 && tLen == -1) { - comparison = u_strcmpCodePointOrder(sBuf, tBuf); - } else { - if (sLen == -1) { - sLen = u_strlen(sBuf); - } - if (tLen == -1) { - tLen = u_strlen(tBuf); - } - comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen)); - if (comparison == 0) { - comparison = sLen - tLen; - } - } + *status = U_ZERO_ERROR; + if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) { + tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize, + tBuf, tLen, + FALSE, 0, + status); + if(*status == U_BUFFER_OVERFLOW_ERROR) { + if(!u_growBufferFromStatic(tColl->stackWritableBuffer, + &tColl->writableBuffer, + (int32_t *)&tColl->writableBufSize, tLen, + 0) + ) + { + *status = U_MEMORY_ALLOCATION_ERROR; + return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */ + } + *status = U_ZERO_ERROR; + tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize, + tBuf, tLen, + FALSE, 0, + status); + } + if(freeTBuf) { + uprv_free(tBuf); + freeTBuf = FALSE; + } + tBuf = tColl->writableBuffer; + if (tBuf != tColl->stackWritableBuffer) { + tColl->flags |= UCOL_ITER_ALLOCATED; + } + } + } + + if (sLen == -1 && tLen == -1) { + comparison = u_strcmpCodePointOrder(sBuf, tBuf); + } else { + if (sLen == -1) { + sLen = u_strlen(sBuf); + } + if (tLen == -1) { + tLen = u_strlen(tBuf); + } + comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen)); + if (comparison == 0) { + comparison = sLen - tLen; + } + } } if (comparison < 0) { @@ -7845,10 +7542,10 @@ static inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { (b)->buf = (b)->pos = (b)->localArray; (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; -}; +} static -void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) { +void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { uint32_t oldSize; uint32_t newSize; uint32_t *newBuf; @@ -7857,29 +7554,35 @@ void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) { oldSize = b->pos - b->buf; newSize = oldSize * 2; newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); - if(newBuf != NULL) { - uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); - if (b->buf != b->localArray) { - uprv_free(b->buf); - } - b->buf = newBuf; - b->endp = b->buf + newSize; - b->pos = b->buf + oldSize; + if(newBuf == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + } + else { + uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); + if (b->buf != b->localArray) { + uprv_free(b->buf); + } + b->buf = newBuf; + b->endp = b->buf + newSize; + b->pos = b->buf + oldSize; } } static -inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci) { +inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) { if (b->pos == b->endp) { - ucol_CEBuf_Expand(b, ci); + ucol_CEBuf_Expand(b, ci, status); + } + if (U_SUCCESS(*status)) { + *(b)->pos++ = ce; + } } - *(b)->pos++ = ce; -}; /* This is a trick string compare function that goes in and uses sortkeys to compare */ /* It is used when compare gets in trouble and needs to bail out */ static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, - collIterate *tColl) + collIterate *tColl, + UErrorCode *status) { uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; uint8_t *sourceKeyP = sourceKey; @@ -7888,31 +7591,32 @@ static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, const UCollator *coll = sColl->coll; UChar *source = NULL; UChar *target = NULL; + int32_t result = UCOL_EQUAL; UChar sStackBuf[256], tStackBuf[256]; int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1; int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1; // TODO: Handle long strings. Do the same in ucol_checkIdent. if(sColl->flags & UCOL_USE_ITERATOR) { - sColl->iterator->move(sColl->iterator, 0, UITER_START); - tColl->iterator->move(tColl->iterator, 0, UITER_START); - source = sStackBuf; - UChar *sBufp = source; - target = tStackBuf; - UChar *tBufp = target; - while(sColl->iterator->hasNext(sColl->iterator)) { - *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator); - } - while(tColl->iterator->hasNext(tColl->iterator)) { - *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator); - } - sourceLength = sBufp - source; - targetLength = tBufp - target; + sColl->iterator->move(sColl->iterator, 0, UITER_START); + tColl->iterator->move(tColl->iterator, 0, UITER_START); + source = sStackBuf; + UChar *sBufp = source; + target = tStackBuf; + UChar *tBufp = target; + while(sColl->iterator->hasNext(sColl->iterator)) { + *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator); + } + while(tColl->iterator->hasNext(tColl->iterator)) { + *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator); + } + sourceLength = sBufp - source; + targetLength = tBufp - target; } else { // no iterators - sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1; - targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1; - source = sColl->string; - target = tColl->string; + sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1; + targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1; + source = sColl->string; + target = tColl->string; } @@ -7920,26 +7624,31 @@ static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); if(sourceKeyLen > UCOL_MAX_BUFFER) { sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); - if(sourceKeyP != NULL) { - sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); + if(sourceKeyP == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + goto cleanup_and_do_compare; } + sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); } targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); if(targetKeyLen > UCOL_MAX_BUFFER) { targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); - if(targetKeyP != NULL) { - targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); + if(targetKeyP == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + goto cleanup_and_do_compare; } + targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); } - int32_t result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); + result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); - if(sourceKeyP != sourceKey) { +cleanup_and_do_compare: + if(sourceKeyP != NULL && sourceKeyP != sourceKey) { uprv_free(sourceKeyP); } - if(targetKeyP != targetKey) { + if(targetKeyP != NULL && targetKeyP != targetKey) { uprv_free(targetKeyP); } @@ -7982,7 +7691,7 @@ ucol_strcollRegular( collIterate *sColl, collIterate *tColl, UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; if(doHiragana && shifted) { - return (ucol_compareUsingSortKeys(sColl, tColl)); + return (ucol_compareUsingSortKeys(sColl, tColl, status)); } uint8_t caseSwitch = coll->caseSwitch; uint8_t tertiaryMask = coll->tertiaryMask; @@ -8004,165 +7713,164 @@ ucol_strcollRegular( collIterate *sColl, collIterate *tColl, // Non shifted primary processing is quite simple if(!shifted) { - for(;;) { - - // We fetch CEs until we hit a non ignorable primary or end. - do { - // We get the next CE - sOrder = ucol_IGetNextCE(coll, sColl, status); - // Stuff it in the buffer - UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); - // And keep just the primary part. - sOrder &= UCOL_PRIMARYMASK; - } while(sOrder == 0); - - // see the comments on the above block - do { - tOrder = ucol_IGetNextCE(coll, tColl, status); - UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); - tOrder &= UCOL_PRIMARYMASK; - } while(tOrder == 0); - - // if both primaries are the same - if(sOrder == tOrder) { - // and there are no more CEs, we advance to the next level - if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { - break; - } - if(doHiragana && hirResult == UCOL_EQUAL) { - if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) { - hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA)) - ? UCOL_LESS:UCOL_GREATER; - } - } - } else { - // if two primaries are different, we are done - result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER; - goto commonReturn; - } - } // no primary difference... do the rest from the buffers - } else { // shifted - do a slightly more complicated processing :) - for(;;) { - UBool sInShifted = FALSE; - UBool tInShifted = FALSE; - // This version of code can be refactored. However, it seems easier to understand this way. - // Source loop. Sam as the target loop. for(;;) { - sOrder = ucol_IGetNextCE(coll, sColl, status); - if(sOrder == UCOL_NO_MORE_CES) { - UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); - break; - } else if(sOrder == 0 - || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) { - /* UCA amendment - ignore ignorables that follow shifted code points */ - continue; - } else if(isContinuation(sOrder)) { - if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ - if(sInShifted) { - sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ - UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); - continue; - } else { - UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); - break; - } - } else { /* Just lower level values */ - if(sInShifted) { - continue; - } else { - UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); - continue; - } - } - } else { /* regular */ - if((sOrder & UCOL_PRIMARYMASK) > LVT) { - UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); - break; - } else { - if((sOrder & UCOL_PRIMARYMASK) > 0) { - sInShifted = TRUE; + + // We fetch CEs until we hit a non ignorable primary or end. + do { + // We get the next CE + sOrder = ucol_IGetNextCE(coll, sColl, status); + // Stuff it in the buffer + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); + // And keep just the primary part. sOrder &= UCOL_PRIMARYMASK; - UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); - continue; - } else { - UCOL_CEBUF_PUT(&sCEs, sOrder, sColl); - sInShifted = FALSE; - continue; - } - } - } - } - sOrder &= UCOL_PRIMARYMASK; - sInShifted = FALSE; + } while(sOrder == 0); + + // see the comments on the above block + do { + tOrder = ucol_IGetNextCE(coll, tColl, status); + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); + tOrder &= UCOL_PRIMARYMASK; + } while(tOrder == 0); + // if both primaries are the same + if(sOrder == tOrder) { + // and there are no more CEs, we advance to the next level + if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { + break; + } + if(doHiragana && hirResult == UCOL_EQUAL) { + if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) { + hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA)) + ? UCOL_LESS:UCOL_GREATER; + } + } + } else { + // if two primaries are different, we are done + result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER; + goto commonReturn; + } + } // no primary difference... do the rest from the buffers + } else { // shifted - do a slightly more complicated processing :) for(;;) { - tOrder = ucol_IGetNextCE(coll, tColl, status); - if(tOrder == UCOL_NO_MORE_CES) { - UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); - break; - } else if(tOrder == 0 - || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) { - /* UCA amendment - ignore ignorables that follow shifted code points */ - continue; - } else if(isContinuation(tOrder)) { - if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ - if(tInShifted) { - tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ - UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); - continue; - } else { - UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); - break; - } - } else { /* Just lower level values */ - if(tInShifted) { - continue; - } else { - UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); - continue; - } + UBool sInShifted = FALSE; + UBool tInShifted = FALSE; + // This version of code can be refactored. However, it seems easier to understand this way. + // Source loop. Sam as the target loop. + for(;;) { + sOrder = ucol_IGetNextCE(coll, sColl, status); + if(sOrder == UCOL_NO_MORE_CES) { + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); + break; + } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) { + /* UCA amendment - ignore ignorables that follow shifted code points */ + continue; + } else if(isContinuation(sOrder)) { + if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ + if(sInShifted) { + sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); + continue; + } else { + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); + break; + } + } else { /* Just lower level values */ + if(sInShifted) { + continue; + } else { + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); + continue; + } + } + } else { /* regular */ + if((sOrder & UCOL_PRIMARYMASK) > LVT) { + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); + break; + } else { + if((sOrder & UCOL_PRIMARYMASK) > 0) { + sInShifted = TRUE; + sOrder &= UCOL_PRIMARYMASK; + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); + continue; + } else { + UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); + sInShifted = FALSE; + continue; + } + } + } } - } else { /* regular */ - if((tOrder & UCOL_PRIMARYMASK) > LVT) { - UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); - break; - } else { - if((tOrder & UCOL_PRIMARYMASK) > 0) { - tInShifted = TRUE; - tOrder &= UCOL_PRIMARYMASK; - UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); - continue; - } else { - UCOL_CEBUF_PUT(&tCEs, tOrder, tColl); - tInShifted = FALSE; - continue; - } + sOrder &= UCOL_PRIMARYMASK; + sInShifted = FALSE; + + for(;;) { + tOrder = ucol_IGetNextCE(coll, tColl, status); + if(tOrder == UCOL_NO_MORE_CES) { + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); + break; + } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) { + /* UCA amendment - ignore ignorables that follow shifted code points */ + continue; + } else if(isContinuation(tOrder)) { + if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ + if(tInShifted) { + tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); + continue; + } else { + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); + break; + } + } else { /* Just lower level values */ + if(tInShifted) { + continue; + } else { + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); + continue; + } + } + } else { /* regular */ + if((tOrder & UCOL_PRIMARYMASK) > LVT) { + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); + break; + } else { + if((tOrder & UCOL_PRIMARYMASK) > 0) { + tInShifted = TRUE; + tOrder &= UCOL_PRIMARYMASK; + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); + continue; + } else { + UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); + tInShifted = FALSE; + continue; + } + } + } } - } - } - tOrder &= UCOL_PRIMARYMASK; - tInShifted = FALSE; + tOrder &= UCOL_PRIMARYMASK; + tInShifted = FALSE; - if(sOrder == tOrder) { - /* - if(doHiragana && hirResult == UCOL_EQUAL) { - if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) { + if(sOrder == tOrder) { + /* + if(doHiragana && hirResult == UCOL_EQUAL) { + if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) { hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA)) - ? UCOL_LESS:UCOL_GREATER; - } - } - */ - if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { - break; + ? UCOL_LESS:UCOL_GREATER; + } + } + */ + if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { + break; + } else { + sOrder = 0; + tOrder = 0; + continue; + } } else { - sOrder = 0; tOrder = 0; - continue; + result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; + goto commonReturn; } - } else { - result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; - goto commonReturn; - } - } /* no primary difference... do the rest from the buffers */ + } /* no primary difference... do the rest from the buffers */ } /* now, we're gonna reexamine collected CEs */ @@ -8171,229 +7879,245 @@ ucol_strcollRegular( collIterate *sColl, collIterate *tColl, /* This is the secondary level of comparison */ if(checkSecTer) { - if(!isFrenchSec) { /* normal */ - sCE = sCEs.buf; - tCE = tCEs.buf; - for(;;) { - while (secS == 0) { - secS = *(sCE++) & UCOL_SECONDARYMASK; - } + if(!isFrenchSec) { /* normal */ + sCE = sCEs.buf; + tCE = tCEs.buf; + for(;;) { + while (secS == 0) { + secS = *(sCE++) & UCOL_SECONDARYMASK; + } - while(secT == 0) { - secT = *(tCE++) & UCOL_SECONDARYMASK; - } + while(secT == 0) { + secT = *(tCE++) & UCOL_SECONDARYMASK; + } - if(secS == secT) { - if(secS == UCOL_NO_MORE_CES_SECONDARY) { - break; - } else { - secS = 0; secT = 0; - continue; - } - } else { - result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; - goto commonReturn; - } - } - } else { /* do the French */ - uint32_t *sCESave = NULL; - uint32_t *tCESave = NULL; - sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */ - tCE = tCEs.pos-2; - for(;;) { - while (secS == 0 && sCE >= sCEs.buf) { - if(sCESave == 0) { - secS = *(sCE--); - if(isContinuation(secS)) { - while(isContinuation(secS = *(sCE--))); - /* after this, secS has the start of continuation, and sCEs points before that */ - sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */ - sCE+=2; /* need to point to the first continuation CP */ - /* However, now you can just continue doing stuff */ - } - } else { - secS = *(sCE++); - if(!isContinuation(secS)) { /* This means we have finished with this cont */ - sCE = sCESave; /* reset the pointer to before continuation */ - sCESave = 0; - continue; - } + if(secS == secT) { + if(secS == UCOL_NO_MORE_CES_SECONDARY) { + break; + } else { + secS = 0; secT = 0; + continue; + } + } else { + result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; + goto commonReturn; + } } - secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */ - } + } else { /* do the French */ + uint32_t *sCESave = NULL; + uint32_t *tCESave = NULL; + sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */ + tCE = tCEs.pos-2; + for(;;) { + while (secS == 0 && sCE >= sCEs.buf) { + if(sCESave == 0) { + secS = *(sCE--); + if(isContinuation(secS)) { + while(isContinuation(secS = *(sCE--))) + ; + /* after this, secS has the start of continuation, and sCEs points before that */ + sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */ + sCE+=2; /* need to point to the first continuation CP */ + /* However, now you can just continue doing stuff */ + } + } else { + secS = *(sCE++); + if(!isContinuation(secS)) { /* This means we have finished with this cont */ + sCE = sCESave; /* reset the pointer to before continuation */ + sCESave = 0; + continue; + } + } + secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */ + } - while(secT == 0 && tCE >= tCEs.buf) { - if(tCESave == 0) { - secT = *(tCE--); - if(isContinuation(secT)) { - while(isContinuation(secT = *(tCE--))); - /* after this, secS has the start of continuation, and sCEs points before that */ - tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */ - tCE+=2; /* need to point to the first continuation CP */ - /* However, now you can just continue doing stuff */ - } - } else { - secT = *(tCE++); - if(!isContinuation(secT)) { /* This means we have finished with this cont */ - tCE = tCESave; /* reset the pointer to before continuation */ - tCESave = 0; - continue; - } - } - secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */ - } + while(secT == 0 && tCE >= tCEs.buf) { + if(tCESave == 0) { + secT = *(tCE--); + if(isContinuation(secT)) { + while(isContinuation(secT = *(tCE--))) + ; + /* after this, secS has the start of continuation, and sCEs points before that */ + tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */ + tCE+=2; /* need to point to the first continuation CP */ + /* However, now you can just continue doing stuff */ + } + } else { + secT = *(tCE++); + if(!isContinuation(secT)) { /* This means we have finished with this cont */ + tCE = tCESave; /* reset the pointer to before continuation */ + tCESave = 0; + continue; + } + } + secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */ + } - if(secS == secT) { - if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) { - break; - } else { - secS = 0; secT = 0; - continue; + if(secS == secT) { + if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) { + break; + } else { + secS = 0; secT = 0; + continue; + } + } else { + result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; + goto commonReturn; + } } - } else { - result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; - goto commonReturn; - } } - } } /* doing the case bit */ if(checkCase) { - sCE = sCEs.buf; - tCE = tCEs.buf; - for(;;) { - while((secS & UCOL_REMOVE_CASE) == 0) { - if(!isContinuation(*sCE++)) { - secS =*(sCE-1) & UCOL_TERT_CASE_MASK; - secS ^= caseSwitch; - } else { - secS = 0; - } - } + sCE = sCEs.buf; + tCE = tCEs.buf; + for(;;) { + while((secS & UCOL_REMOVE_CASE) == 0) { + if(!isContinuation(*sCE++)) { + secS =*(sCE-1); + if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { + // primary ignorables should not be considered on the case level when the strength is primary + // otherwise, the CEs stop being well-formed + secS &= UCOL_TERT_CASE_MASK; + secS ^= caseSwitch; + } else { + secS = 0; + } + } else { + secS = 0; + } + } - while((secT & UCOL_REMOVE_CASE) == 0) { - if(!isContinuation(*tCE++)) { - secT = *(tCE-1) & UCOL_TERT_CASE_MASK; - secT ^= caseSwitch; - } else { - secT = 0; - } - } + while((secT & UCOL_REMOVE_CASE) == 0) { + if(!isContinuation(*tCE++)) { + secT = *(tCE-1); + if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { + // primary ignorables should not be considered on the case level when the strength is primary + // otherwise, the CEs stop being well-formed + secT &= UCOL_TERT_CASE_MASK; + secT ^= caseSwitch; + } else { + secT = 0; + } + } else { + secT = 0; + } + } - if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { - result = UCOL_LESS; - goto commonReturn; - } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) { - result = UCOL_GREATER; - goto commonReturn; - } + if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { + result = UCOL_LESS; + goto commonReturn; + } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) { + result = UCOL_GREATER; + goto commonReturn; + } - if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { - break; - } else { - secS = 0; - secT = 0; + if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { + break; + } else { + secS = 0; + secT = 0; + } } - } } /* Tertiary level */ if(checkTertiary) { - secS = 0; - secT = 0; - sCE = sCEs.buf; - tCE = tCEs.buf; - for(;;) { - while((secS & UCOL_REMOVE_CASE) == 0) { - secS = *(sCE++) & tertiaryMask; - if(!isContinuation(secS)) { - secS ^= caseSwitch; - } else { - secS &= UCOL_REMOVE_CASE; - } - } + secS = 0; + secT = 0; + sCE = sCEs.buf; + tCE = tCEs.buf; + for(;;) { + while((secS & UCOL_REMOVE_CASE) == 0) { + secS = *(sCE++) & tertiaryMask; + if(!isContinuation(secS)) { + secS ^= caseSwitch; + } else { + secS &= UCOL_REMOVE_CASE; + } + } - while((secT & UCOL_REMOVE_CASE) == 0) { - secT = *(tCE++) & tertiaryMask; - if(!isContinuation(secT)) { - secT ^= caseSwitch; - } else { - secT &= UCOL_REMOVE_CASE; - } - } + while((secT & UCOL_REMOVE_CASE) == 0) { + secT = *(tCE++) & tertiaryMask; + if(!isContinuation(secT)) { + secT ^= caseSwitch; + } else { + secT &= UCOL_REMOVE_CASE; + } + } - if(secS == secT) { - if((secS & UCOL_REMOVE_CASE) == 1) { - break; - } else { - secS = 0; secT = 0; - continue; - } - } else { - result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; - goto commonReturn; + if(secS == secT) { + if((secS & UCOL_REMOVE_CASE) == 1) { + break; + } else { + secS = 0; secT = 0; + continue; + } + } else { + result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; + goto commonReturn; + } } - } } if(qShifted /*checkQuad*/) { - UBool sInShifted = TRUE; - UBool tInShifted = TRUE; - secS = 0; - secT = 0; - sCE = sCEs.buf; - tCE = tCEs.buf; - for(;;) { - while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) { - secS = *(sCE++); - if(isContinuation(secS)) { - if(!sInShifted) { - continue; - } - } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */ - secS = UCOL_PRIMARYMASK; - sInShifted = FALSE; - } else { - sInShifted = TRUE; - } - } - secS &= UCOL_PRIMARYMASK; + UBool sInShifted = TRUE; + UBool tInShifted = TRUE; + secS = 0; + secT = 0; + sCE = sCEs.buf; + tCE = tCEs.buf; + for(;;) { + while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) { + secS = *(sCE++); + if(isContinuation(secS)) { + if(!sInShifted) { + continue; + } + } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */ + secS = UCOL_PRIMARYMASK; + sInShifted = FALSE; + } else { + sInShifted = TRUE; + } + } + secS &= UCOL_PRIMARYMASK; - while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) { - secT = *(tCE++); - if(isContinuation(secT)) { - if(!tInShifted) { - continue; + while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) { + secT = *(tCE++); + if(isContinuation(secT)) { + if(!tInShifted) { + continue; + } + } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { + secT = UCOL_PRIMARYMASK; + tInShifted = FALSE; + } else { + tInShifted = TRUE; + } } - } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { - secT = UCOL_PRIMARYMASK; - tInShifted = FALSE; - } else { - tInShifted = TRUE; - } - } - secT &= UCOL_PRIMARYMASK; + secT &= UCOL_PRIMARYMASK; - if(secS == secT) { - if(secS == UCOL_NO_MORE_CES_PRIMARY) { - break; - } else { - secS = 0; secT = 0; - continue; - } - } else { - result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; - goto commonReturn; + if(secS == secT) { + if(secS == UCOL_NO_MORE_CES_PRIMARY) { + break; + } else { + secS = 0; secT = 0; + continue; + } + } else { + result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; + goto commonReturn; + } } - } } else if(doHiragana && hirResult != UCOL_EQUAL) { - // If we're fine on quaternaries, we might be different - // on Hiragana. This, however, might fail us in shifted. - result = hirResult; - goto commonReturn; + // If we're fine on quaternaries, we might be different + // on Hiragana. This, however, might fail us in shifted. + result = hirResult; + goto commonReturn; } /* For IDENTICAL comparisons, we use a bitwise character comparison */ @@ -8425,50 +8149,51 @@ commonReturn: static inline uint32_t ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, - uint32_t CE, const UChar *s, int32_t *index, int32_t len) { - const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); - int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; - int32_t offset = 1; - UChar schar = 0, tchar = 0; + uint32_t CE, const UChar *s, int32_t *index, int32_t len) +{ + const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); + int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; + int32_t offset = 1; + UChar schar = 0, tchar = 0; - for(;;) { - if(len == -1) { - if(s[*index] == 0) { // end of string - return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); - } else { - schar = s[*index]; - } - } else { - if(*index == len) { - return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); - } else { - schar = s[*index]; - } - } + for(;;) { + if(len == -1) { + if(s[*index] == 0) { // end of string + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); + } else { + schar = s[*index]; + } + } else { + if(*index == len) { + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); + } else { + schar = s[*index]; + } + } - while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ - offset++; - } + while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ + offset++; + } - if (schar == tchar) { - (*index)++; - return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); - } - else - { - if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { - return UCOL_BAIL_OUT_CE; - } - // skip completely ignorables - uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar); - if(isZeroCE == 0) { // we have to ignore completely ignorables - (*index)++; - continue; - } + if (schar == tchar) { + (*index)++; + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); + } + else + { + if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { + return UCOL_BAIL_OUT_CE; + } + // skip completely ignorables + uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); + if(isZeroCE == 0) { // we have to ignore completely ignorables + (*index)++; + continue; + } - return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); + return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); + } } - } } @@ -8496,7 +8221,7 @@ ucol_strcollUseLatin1( const UCollator *coll, UChar sChar = 0, tChar = 0; uint32_t sOrder=0, tOrder=0; - UBool endOfSource = FALSE, endOfTarget = FALSE; + UBool endOfSource = FALSE; uint32_t *elements = coll->latinOneCEs; @@ -8505,112 +8230,112 @@ ucol_strcollUseLatin1( const UCollator *coll, // Do the primary level for(;;) { - while(sOrder==0) { // this loop skips primary ignorables - // sOrder=getNextlatinOneCE(source); - if(sLen==-1) { // handling zero terminated strings - sChar=source[sIndex++]; - if(sChar==0) { - endOfSource = TRUE; - break; - } - } else { // handling strings with known length - if(sIndex==sLen) { - endOfSource = TRUE; - break; - } - sChar=source[sIndex++]; - } - if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) - //fprintf(stderr, "R"); - goto returnRegular; - //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); - } - sOrder = elements[sChar]; - if(sOrder >= UCOL_NOT_FOUND) { // if we got a special - // specials can basically be either contractions or bail-out signs. If we get anything - // else, we'll bail out anywasy - if(getCETag(sOrder) == CONTRACTION_TAG) { - sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); - haveContractions = TRUE; // if there are contractions, we cannot do French secondary - // However, if there are contractions in the table, but we always use just one char, - // we might be able to do French. This should be checked out. - } - if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { - //fprintf(stderr, "S"); - goto returnRegular; - //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); - } + while(sOrder==0) { // this loop skips primary ignorables + // sOrder=getNextlatinOneCE(source); + if(sLen==-1) { // handling zero terminated strings + sChar=source[sIndex++]; + if(sChar==0) { + endOfSource = TRUE; + break; + } + } else { // handling strings with known length + if(sIndex==sLen) { + endOfSource = TRUE; + break; + } + sChar=source[sIndex++]; + } + if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) + //fprintf(stderr, "R"); + goto returnRegular; + //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); + } + sOrder = elements[sChar]; + if(sOrder >= UCOL_NOT_FOUND) { // if we got a special + // specials can basically be either contractions or bail-out signs. If we get anything + // else, we'll bail out anywasy + if(getCETag(sOrder) == CONTRACTION_TAG) { + sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); + haveContractions = TRUE; // if there are contractions, we cannot do French secondary + // However, if there are contractions in the table, but we always use just one char, + // we might be able to do French. This should be checked out. + } + if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { + //fprintf(stderr, "S"); + goto returnRegular; + //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); + } + } } - } - while(tOrder==0) { // this loop skips primary ignorables - // tOrder=getNextlatinOneCE(target); - if(tLen==-1) { // handling zero terminated strings - tChar=target[tIndex++]; - if(tChar==0) { - if(endOfSource) { // this is different than source loop, - // as we already know that source loop is done here, - // so we can either finish the primary loop if both - // strings are done or anounce the result if only - // target is done. Same below. - goto endOfPrimLoop; - } else { - return UCOL_GREATER; + while(tOrder==0) { // this loop skips primary ignorables + // tOrder=getNextlatinOneCE(target); + if(tLen==-1) { // handling zero terminated strings + tChar=target[tIndex++]; + if(tChar==0) { + if(endOfSource) { // this is different than source loop, + // as we already know that source loop is done here, + // so we can either finish the primary loop if both + // strings are done or anounce the result if only + // target is done. Same below. + goto endOfPrimLoop; + } else { + return UCOL_GREATER; + } + } + } else { // handling strings with known length + if(tIndex==tLen) { + if(endOfSource) { + goto endOfPrimLoop; + } else { + return UCOL_GREATER; + } + } + tChar=target[tIndex++]; } - } - } else { // handling strings with known length - if(tIndex==tLen) { - if(endOfSource) { - goto endOfPrimLoop; - } else { - return UCOL_GREATER; + if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) + //fprintf(stderr, "R"); + goto returnRegular; + //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); + } + tOrder = elements[tChar]; + if(tOrder >= UCOL_NOT_FOUND) { + // Handling specials, see the comments for source + if(getCETag(tOrder) == CONTRACTION_TAG) { + tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); + haveContractions = TRUE; + } + if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { + //fprintf(stderr, "S"); + goto returnRegular; + //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); + } } - } - tChar=target[tIndex++]; - } - if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) - //fprintf(stderr, "R"); - goto returnRegular; - //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); - } - tOrder = elements[tChar]; - if(tOrder >= UCOL_NOT_FOUND) { - // Handling specials, see the comments for source - if(getCETag(tOrder) == CONTRACTION_TAG) { - tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); - haveContractions = TRUE; - } - if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { - //fprintf(stderr, "S"); - goto returnRegular; - //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); - } } - } - if(endOfSource) { // source is finished, but target is not, say the result. - return UCOL_LESS; - } - - if(sOrder == tOrder) { // if we have same CEs, we continue the loop - sOrder = 0; tOrder = 0; - continue; - } else { - // compare current top bytes - if(((sOrder^tOrder)&0xFF000000)!=0) { - // top bytes differ, return difference - if(sOrder < tOrder) { + if(endOfSource) { // source is finished, but target is not, say the result. return UCOL_LESS; - } else if(sOrder > tOrder) { - return UCOL_GREATER; - } - // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); - // since we must return enum value } - // top bytes match, continue with following bytes - sOrder<<=8; - tOrder<<=8; - } + if(sOrder == tOrder) { // if we have same CEs, we continue the loop + sOrder = 0; tOrder = 0; + continue; + } else { + // compare current top bytes + if(((sOrder^tOrder)&0xFF000000)!=0) { + // top bytes differ, return difference + if(sOrder < tOrder) { + return UCOL_LESS; + } else if(sOrder > tOrder) { + return UCOL_GREATER; + } + // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); + // since we must return enum value + } + + // top bytes match, continue with following bytes + sOrder<<=8; + tOrder<<=8; + } } endOfPrimLoop: @@ -8618,167 +8343,167 @@ endOfPrimLoop: // so we set it and use simpler loop for secondaries and tertiaries sLen = sIndex; tLen = tIndex; if(strength >= UCOL_SECONDARY) { - // adjust the table beggining - elements += coll->latinOneTableLen; - endOfSource = FALSE; endOfTarget = FALSE; - - if(coll->frenchCollation == UCOL_OFF) { // non French - // This loop is a simplified copy of primary loop - // at this point we know that whole strings are latin-1, so we don't - // check for that. We also know that we only have contractions as - // specials. - sIndex = 0; tIndex = 0; - for(;;) { - while(sOrder==0) { - if(sIndex==sLen) { - endOfSource = TRUE; - break; - } - sChar=source[sIndex++]; - sOrder = elements[sChar]; - if(sOrder > UCOL_NOT_FOUND) { - sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); - } - } + // adjust the table beggining + elements += coll->latinOneTableLen; + endOfSource = FALSE; + + if(coll->frenchCollation == UCOL_OFF) { // non French + // This loop is a simplified copy of primary loop + // at this point we know that whole strings are latin-1, so we don't + // check for that. We also know that we only have contractions as + // specials. + sIndex = 0; tIndex = 0; + for(;;) { + while(sOrder==0) { + if(sIndex==sLen) { + endOfSource = TRUE; + break; + } + sChar=source[sIndex++]; + sOrder = elements[sChar]; + if(sOrder > UCOL_NOT_FOUND) { + sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); + } + } - while(tOrder==0) { - if(tIndex==tLen) { - if(endOfSource) { - goto endOfSecLoop; - } else { - return UCOL_GREATER; - } - } - tChar=target[tIndex++]; - tOrder = elements[tChar]; - if(tOrder > UCOL_NOT_FOUND) { - tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); - } - } - if(endOfSource) { - return UCOL_LESS; - } + while(tOrder==0) { + if(tIndex==tLen) { + if(endOfSource) { + goto endOfSecLoop; + } else { + return UCOL_GREATER; + } + } + tChar=target[tIndex++]; + tOrder = elements[tChar]; + if(tOrder > UCOL_NOT_FOUND) { + tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); + } + } + if(endOfSource) { + return UCOL_LESS; + } - if(sOrder == tOrder) { - sOrder = 0; tOrder = 0; - continue; - } else { - // see primary loop for comments on this - if(((sOrder^tOrder)&0xFF000000)!=0) { - if(sOrder < tOrder) { - return UCOL_LESS; - } else if(sOrder > tOrder) { - return UCOL_GREATER; - } - } - sOrder<<=8; - tOrder<<=8; - } - } - } else { // French - if(haveContractions) { // if we have contractions, we have to bail out - // since we don't really know how to handle them here - goto returnRegular; - //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); - } - // For French, we go backwards - sIndex = sLen; tIndex = tLen; - for(;;) { - while(sOrder==0) { - if(sIndex==0) { - endOfSource = TRUE; - break; + if(sOrder == tOrder) { + sOrder = 0; tOrder = 0; + continue; + } else { + // see primary loop for comments on this + if(((sOrder^tOrder)&0xFF000000)!=0) { + if(sOrder < tOrder) { + return UCOL_LESS; + } else if(sOrder > tOrder) { + return UCOL_GREATER; + } + } + sOrder<<=8; + tOrder<<=8; + } } - sChar=source[--sIndex]; - sOrder = elements[sChar]; - // don't even look for contractions - } + } else { // French + if(haveContractions) { // if we have contractions, we have to bail out + // since we don't really know how to handle them here + goto returnRegular; + //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); + } + // For French, we go backwards + sIndex = sLen; tIndex = tLen; + for(;;) { + while(sOrder==0) { + if(sIndex==0) { + endOfSource = TRUE; + break; + } + sChar=source[--sIndex]; + sOrder = elements[sChar]; + // don't even look for contractions + } - while(tOrder==0) { - if(tIndex==0) { - if(endOfSource) { - goto endOfSecLoop; - } else { - return UCOL_GREATER; - } - } - tChar=target[--tIndex]; - tOrder = elements[tChar]; - // don't even look for contractions - } - if(endOfSource) { - return UCOL_LESS; - } + while(tOrder==0) { + if(tIndex==0) { + if(endOfSource) { + goto endOfSecLoop; + } else { + return UCOL_GREATER; + } + } + tChar=target[--tIndex]; + tOrder = elements[tChar]; + // don't even look for contractions + } + if(endOfSource) { + return UCOL_LESS; + } - if(sOrder == tOrder) { - sOrder = 0; tOrder = 0; - continue; - } else { - // see the primary loop for comments - if(((sOrder^tOrder)&0xFF000000)!=0) { - if(sOrder < tOrder) { - return UCOL_LESS; - } else if(sOrder > tOrder) { - return UCOL_GREATER; - } - } - sOrder<<=8; - tOrder<<=8; - } + if(sOrder == tOrder) { + sOrder = 0; tOrder = 0; + continue; + } else { + // see the primary loop for comments + if(((sOrder^tOrder)&0xFF000000)!=0) { + if(sOrder < tOrder) { + return UCOL_LESS; + } else if(sOrder > tOrder) { + return UCOL_GREATER; + } + } + sOrder<<=8; + tOrder<<=8; + } + } } - } } endOfSecLoop: if(strength >= UCOL_TERTIARY) { - // tertiary loop is the same as secondary (except no French) - elements += coll->latinOneTableLen; - sIndex = 0; tIndex = 0; - endOfSource = FALSE; endOfTarget = FALSE; - for(;;) { - while(sOrder==0) { - if(sIndex==sLen) { - endOfSource = TRUE; - break; - } - sChar=source[sIndex++]; - sOrder = elements[sChar]; - if(sOrder > UCOL_NOT_FOUND) { - sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); - } - } - while(tOrder==0) { - if(tIndex==tLen) { + // tertiary loop is the same as secondary (except no French) + elements += coll->latinOneTableLen; + sIndex = 0; tIndex = 0; + endOfSource = FALSE; + for(;;) { + while(sOrder==0) { + if(sIndex==sLen) { + endOfSource = TRUE; + break; + } + sChar=source[sIndex++]; + sOrder = elements[sChar]; + if(sOrder > UCOL_NOT_FOUND) { + sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); + } + } + while(tOrder==0) { + if(tIndex==tLen) { + if(endOfSource) { + return UCOL_EQUAL; // if both strings are at the end, they are equal + } else { + return UCOL_GREATER; + } + } + tChar=target[tIndex++]; + tOrder = elements[tChar]; + if(tOrder > UCOL_NOT_FOUND) { + tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); + } + } if(endOfSource) { - return UCOL_EQUAL; // if both strings are at the end, they are equal - } else { - return UCOL_GREATER; + return UCOL_LESS; } - } - tChar=target[tIndex++]; - tOrder = elements[tChar]; - if(tOrder > UCOL_NOT_FOUND) { - tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); - } - } - if(endOfSource) { - return UCOL_LESS; - } - if(sOrder == tOrder) { - sOrder = 0; tOrder = 0; - continue; - } else { - if(((sOrder^tOrder)&0xff000000)!=0) { - if(sOrder < tOrder) { - return UCOL_LESS; - } else if(sOrder > tOrder) { - return UCOL_GREATER; + if(sOrder == tOrder) { + sOrder = 0; tOrder = 0; + continue; + } else { + if(((sOrder^tOrder)&0xff000000)!=0) { + if(sOrder < tOrder) { + return UCOL_LESS; + } else if(sOrder > tOrder) { + return UCOL_GREATER; + } + } + sOrder<<=8; + tOrder<<=8; } - } - sOrder<<=8; - tOrder<<=8; } - } } return UCOL_EQUAL; @@ -8796,106 +8521,103 @@ U_CAPI UCollationResult U_EXPORT2 ucol_strcollIter( const UCollator *coll, UCharIterator *sIter, UCharIterator *tIter, - UErrorCode *status) { - if(!status || U_FAILURE(*status)) { - return UCOL_EQUAL; - } + UErrorCode *status) +{ + if(!status || U_FAILURE(*status)) { + return UCOL_EQUAL; + } - UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); - UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter); + UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); + UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter); - if (sIter == tIter) { - UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) - return UCOL_EQUAL; - } - if(sIter == NULL || tIter == NULL || coll == NULL) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) - return UCOL_EQUAL; - } + if (sIter == tIter) { + UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) + return UCOL_EQUAL; + } + if(sIter == NULL || tIter == NULL || coll == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) + return UCOL_EQUAL; + } - UCollationResult result = UCOL_EQUAL; - - // Preparing the context objects for iterating over strings - collIterate sColl, tColl; - // The division for the array length may truncate the array size to - // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high - // for all platforms anyway. - UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; - UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; - UNormIterator *sNormIter = NULL, *tNormIter = NULL; - - IInit_collIterate(coll, NULL, -1, &sColl); - sColl.iterator = sIter; - sColl.flags |= UCOL_USE_ITERATOR; - IInit_collIterate(coll, NULL, -1, &tColl); - tColl.flags |= UCOL_USE_ITERATOR; - tColl.iterator = tIter; - - if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { - sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); - sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); - sColl.flags &= ~UCOL_ITER_NORM; - - tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); - tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); - tColl.flags &= ~UCOL_ITER_NORM; - } + UCollationResult result = UCOL_EQUAL; + + // Preparing the context objects for iterating over strings + collIterate sColl, tColl; + // The division for the array length may truncate the array size to + // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high + // for all platforms anyway. + UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; + UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; + UNormIterator *sNormIter = NULL, *tNormIter = NULL; + + IInit_collIterate(coll, NULL, -1, &sColl); + sColl.iterator = sIter; + sColl.flags |= UCOL_USE_ITERATOR; + IInit_collIterate(coll, NULL, -1, &tColl); + tColl.flags |= UCOL_USE_ITERATOR; + tColl.iterator = tIter; - UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; + if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { + sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); + sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); + sColl.flags &= ~UCOL_ITER_NORM; - while((sChar = sColl.iterator->next(sColl.iterator)) == - (tChar = tColl.iterator->next(tColl.iterator))) { - if(UCOL_ISTHAIPREVOWEL(sChar)) { - break; + tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); + tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); + tColl.flags &= ~UCOL_ITER_NORM; } + + UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; + + while((sChar = sColl.iterator->next(sColl.iterator)) == + (tChar = tColl.iterator->next(tColl.iterator))) { + if(sChar == U_SENTINEL) { + result = UCOL_EQUAL; + goto end_compare; + } + } + if(sChar == U_SENTINEL) { - result = UCOL_EQUAL; - goto end_compare; + tChar = tColl.iterator->previous(tColl.iterator); } - } - if(sChar == U_SENTINEL) { - tChar = tColl.iterator->previous(tColl.iterator); - } + if(tChar == U_SENTINEL) { + sChar = sColl.iterator->previous(sColl.iterator); + } - if(tChar == U_SENTINEL) { sChar = sColl.iterator->previous(sColl.iterator); - } - - sChar = sColl.iterator->previous(sColl.iterator); - tChar = tColl.iterator->previous(tColl.iterator); + tChar = tColl.iterator->previous(tColl.iterator); - if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) - { - // We are stopped in the middle of a contraction. - // Scan backwards through the == part of the string looking for the start of the contraction. - // It doesn't matter which string we scan, since they are the same in this region. - do - { - sChar = sColl.iterator->previous(sColl.iterator); - tChar = tColl.iterator->previous(tColl.iterator); - } - while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); - } + if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) + { + // We are stopped in the middle of a contraction. + // Scan backwards through the == part of the string looking for the start of the contraction. + // It doesn't matter which string we scan, since they are the same in this region. + do + { + sChar = sColl.iterator->previous(sColl.iterator); + tChar = tColl.iterator->previous(tColl.iterator); + } + while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); + } - if(U_SUCCESS(*status)) { - result = ucol_strcollRegular(&sColl, &tColl, status); - } + if(U_SUCCESS(*status)) { + result = ucol_strcollRegular(&sColl, &tColl, status); + } end_compare: - if(sNormIter || tNormIter) { - unorm_closeIter(sNormIter); - unorm_closeIter(tNormIter); - } + if(sNormIter || tNormIter) { + unorm_closeIter(sNormIter); + unorm_closeIter(tNormIter); + } - UTRACE_EXIT_VALUE_STATUS(result, *status) - return result; + UTRACE_EXIT_VALUE_STATUS(result, *status) + return result; } - /* */ /* ucol_strcoll Main public API string comparison function */ /* */ @@ -8904,24 +8626,30 @@ ucol_strcoll( const UCollator *coll, const UChar *source, int32_t sourceLength, const UChar *target, - int32_t targetLength) { + int32_t targetLength) +{ U_ALIGN_CODE(16); UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); if (UTRACE_LEVEL(UTRACE_VERBOSE)) { - UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); - UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); - UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); + UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); + UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); + UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); } - UErrorCode status = U_ZERO_ERROR; if(source == NULL || target == NULL) { - // do not crash, but return. Should have - // status argument to return error. - UTRACE_EXIT_VALUE(UTRACE_UCOL_STRCOLL); - return UCOL_EQUAL; + // do not crash, but return. Should have + // status argument to return error. + UTRACE_EXIT_VALUE(UCOL_EQUAL); + return UCOL_EQUAL; + } + + /* Quick check if source and target are same strings. */ + /* They should either both be NULL terminated or the explicit length should be set on both. */ + if (source==target && sourceLength==targetLength) { + UTRACE_EXIT_VALUE(UCOL_EQUAL); + return UCOL_EQUAL; } - collIterate sColl, tColl; /* Scan the strings. Find: */ /* The length of any leading portion that is equal */ @@ -8932,20 +8660,8 @@ ucol_strcoll( const UCollator *coll, if (sourceLength == -1 && targetLength == -1) { // Both strings are null terminated. - // Check for them being the same string, and scan through - // any leading equal portion. - if (source==target) { - UTRACE_EXIT_VALUE(UCOL_EQUAL); - return UCOL_EQUAL; - } - - for (;;) { - if ( *pSrc != *pTarg || *pSrc == 0) { - break; - } - if(UCOL_ISTHAIPREVOWEL(*pSrc)) { - break; - } + // Scan through any leading equal portion. + while (*pSrc == *pTarg && *pSrc != 0) { pSrc++; pTarg++; } @@ -8958,48 +8674,41 @@ ucol_strcoll( const UCollator *coll, else { // One or both strings has an explicit length. - /* check if source and target are same strings */ - - if (source==target && sourceLength==targetLength) { - UTRACE_EXIT_VALUE(UCOL_EQUAL); - return UCOL_EQUAL; - } const UChar *pSrcEnd = source + sourceLength; const UChar *pTargEnd = target + targetLength; - // Scan while the strings are bitwise ==, or until one is exhausted. - for (;;) { - if (pSrc == pSrcEnd || pTarg == pTargEnd) { - break; - } - if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { - break; - } - if (*pSrc != *pTarg) { - break; - } - if(UCOL_ISTHAIPREVOWEL(*pSrc)) { // they are the same here, so any will do - break; - } - pSrc++; - pTarg++; + for (;;) { + if (pSrc == pSrcEnd || pTarg == pTargEnd) { + break; } - equalLength = pSrc - source; - - // If we made it all the way through both strings, we are done. They are == - if ((pSrc ==pSrcEnd || (pSrcEnd 0) { /* There is an identical portion at the beginning of the two strings. */ /* If the identical portion ends within a contraction or a comibining */ /* character sequence, back up to the start of that sequence. */ - pSrc = source + equalLength; /* point to the first differing chars */ - pTarg = target + equalLength; + + // These values should already be set by the code above. + //pSrc = source + equalLength; /* point to the first differing chars */ + //pTarg = target + equalLength; if (pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll) || pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)) { @@ -9024,14 +8733,16 @@ ucol_strcoll( const UCollator *coll, } } - UCollationResult returnVal; + UErrorCode status = U_ZERO_ERROR; + UCollationResult returnVal; if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { - // Preparing the context objects for iterating over strings - IInit_collIterate(coll, source, sourceLength, &sColl); - IInit_collIterate(coll, target, targetLength, &tColl); - returnVal = ucol_strcollRegular(&sColl, &tColl, &status); + collIterate sColl, tColl; + // Preparing the context objects for iterating over strings + IInit_collIterate(coll, source, sourceLength, &sColl); + IInit_collIterate(coll, target, targetLength, &tColl); + returnVal = ucol_strcollRegular(&sColl, &tColl, &status); } else { - returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); + returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); } UTRACE_EXIT_VALUE(returnVal); return returnVal; @@ -9045,8 +8756,8 @@ ucol_greater( const UCollator *coll, const UChar *target, int32_t targetLength) { - return (ucol_strcoll(coll, source, sourceLength, target, targetLength) - == UCOL_GREATER); + return (ucol_strcoll(coll, source, sourceLength, target, targetLength) + == UCOL_GREATER); } /* convenience function for comparing strings */ @@ -9057,8 +8768,8 @@ ucol_greaterOrEqual( const UCollator *coll, const UChar *target, int32_t targetLength) { - return (ucol_strcoll(coll, source, sourceLength, target, targetLength) - != UCOL_LESS); + return (ucol_strcoll(coll, source, sourceLength, target, targetLength) + != UCOL_LESS); } /* convenience function for comparing strings */ @@ -9069,322 +8780,15 @@ ucol_equal( const UCollator *coll, const UChar *target, int32_t targetLength) { - return (ucol_strcoll(coll, source, sourceLength, target, targetLength) - == UCOL_EQUAL); -} - -/* returns the locale name the collation data comes from */ -U_CAPI const char * U_EXPORT2 -ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) { - return ucol_getLocaleByType(coll, type, status); -} - -U_CAPI const char * U_EXPORT2 -ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) { - const char *result = NULL; - if(status == NULL || U_FAILURE(*status)) { - return NULL; - } - UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE); - UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll); - - switch(type) { - case ULOC_ACTUAL_LOCALE: - // validLocale is set only if service registration has explicitly set the - // requested and valid locales. if this is the case, the actual locale - // is considered to be the valid locale. - if (coll->validLocale != NULL) { - result = coll->validLocale; - } else if(coll->elements != NULL) { - result = ures_getLocale(coll->elements, status); - } - break; - case ULOC_VALID_LOCALE: - if (coll->validLocale != NULL) { - result = coll->validLocale; - } else if(coll->rb != NULL) { - result = ures_getLocale(coll->rb, status); - } - break; - case ULOC_REQUESTED_LOCALE: - result = coll->requestedLocale; - break; - default: - *status = U_ILLEGAL_ARGUMENT_ERROR; - } - UTRACE_DATA1(UTRACE_INFO, "result = %s", result); - UTRACE_EXIT_STATUS(*status); - return result; -} - -U_CAPI USet * U_EXPORT2 -ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) -{ - if(status == NULL || U_FAILURE(*status)) { - return NULL; - } - if(coll == NULL || coll->UCA == NULL) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - } - UParseError parseError; - UColTokenParser src; - int32_t rulesLen = 0; - const UChar *rules = ucol_getRules(coll, &rulesLen); - const UChar *current = NULL; - UBool startOfRules = TRUE; - // we internally use the C++ class, for the following reasons: - // 1. we need to utilize canonical iterator, which is a C++ only class - // 2. canonical iterator returns UnicodeStrings - USet cannot take them - // 3. USet is internally really UnicodeSet, C is just a wrapper - UnicodeSet *tailored = new UnicodeSet(); - UnicodeString pattern; - UnicodeString empty; - CanonicalIterator it(empty, *status); - - - // The idea is to tokenize the rule set. For each non-reset token, - // we add all the canonicaly equivalent FCD sequences - ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status); - while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError, status)) != NULL) { - startOfRules = FALSE; - if(src.parsedToken.strength != UCOL_TOK_RESET) { - const UChar *stuff = src.source+(src.parsedToken.charsOffset); - it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status); - pattern = it.next(); - while(!pattern.isBogus()) { - if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) { - tailored->add(pattern); - } - pattern = it.next(); - } - } - } - ucol_tok_closeTokenList(&src); - return (USet *)tailored; -} - -U_CAPI UBool U_EXPORT2 -ucol_equals(const UCollator *source, const UCollator *target) { - UErrorCode status = U_ZERO_ERROR; - // if pointers are equal, collators are equal - if(source == target) { - return TRUE; - } - int32_t i = 0, j = 0; - // if any of attributes are different, collators are not equal - for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { - if(ucol_getAttribute(source, (UColAttribute)i, &status) != ucol_getAttribute(target, (UColAttribute)i, &status) || U_FAILURE(status)) { - return FALSE; - } - } - - int32_t sourceRulesLen = 0, targetRulesLen = 0; - const UChar *sourceRules = ucol_getRules(source, &sourceRulesLen); - const UChar *targetRules = ucol_getRules(target, &targetRulesLen); - - if(sourceRulesLen == targetRulesLen && u_strncmp(sourceRules, targetRules, sourceRulesLen) == 0) { - // all the attributes are equal and the rules are equal - collators are equal - return(TRUE); - } - // hard part, need to construct tree from rules and see if they yield the same tailoring - UBool result = TRUE; - UParseError parseError; - UColTokenParser sourceParser, targetParser; - int32_t sourceListLen = 0, targetListLen = 0; - ucol_tok_initTokenList(&sourceParser, sourceRules, sourceRulesLen, source->UCA, &status); - ucol_tok_initTokenList(&targetParser, targetRules, targetRulesLen, target->UCA, &status); - sourceListLen = ucol_tok_assembleTokenList(&sourceParser, &parseError, &status); - targetListLen = ucol_tok_assembleTokenList(&targetParser, &parseError, &status); - - if(sourceListLen != targetListLen) { - // different number of resets - result = FALSE; - } else { - UColToken *sourceReset = NULL, *targetReset = NULL; - UChar *sourceResetString = NULL, *targetResetString = NULL; - int32_t sourceStringLen = 0, targetStringLen = 0; - for(i = 0; i < sourceListLen; i++) { - sourceReset = sourceParser.lh[i].reset; - sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF); - sourceStringLen = sourceReset->source >> 24; - for(j = 0; j < sourceListLen; j++) { - targetReset = targetParser.lh[j].reset; - targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF); - targetStringLen = targetReset->source >> 24; - if(sourceStringLen == targetStringLen && (u_strncmp(sourceResetString, targetResetString, sourceStringLen) == 0)) { - sourceReset = sourceParser.lh[i].first; - targetReset = targetParser.lh[j].first; - while(sourceReset != NULL && targetReset != NULL) { - sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF); - sourceStringLen = sourceReset->source >> 24; - targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF); - targetStringLen = targetReset->source >> 24; - if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) { - result = FALSE; - goto returnResult; - } - // probably also need to check the expansions - if(sourceReset->expansion) { - if(!targetReset->expansion) { - result = FALSE; - goto returnResult; - } else { - // compare expansions - sourceResetString = sourceParser.source+(sourceReset->expansion& 0xFFFFFF); - sourceStringLen = sourceReset->expansion >> 24; - targetResetString = targetParser.source+(targetReset->expansion & 0xFFFFFF); - targetStringLen = targetReset->expansion >> 24; - if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) { - result = FALSE; - goto returnResult; - } - } - } else { - if(targetReset->expansion) { - result = FALSE; - goto returnResult; - } - } - sourceReset = sourceReset->next; - targetReset = targetReset->next; - } - if(sourceReset != targetReset) { // at least one is not NULL - // there are more tailored elements in one list - result = FALSE; - goto returnResult; - } - - - break; - } - } - // couldn't find the reset anchor, so the collators are not equal - if(j == sourceListLen) { - result = FALSE; - goto returnResult; - } - } - } - -returnResult: - ucol_tok_closeTokenList(&sourceParser); - ucol_tok_closeTokenList(&targetParser); - return result; - + return (ucol_strcoll(coll, source, sourceLength, target, targetLength) + == UCOL_EQUAL); } U_CAPI void U_EXPORT2 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { - if(coll && coll->UCA) { - uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); - } -} - -U_CAPI int32_t U_EXPORT2 -ucol_cloneBinary(const UCollator *coll, - uint8_t *buffer, int32_t capacity, - UErrorCode *status) -{ - int32_t length = 0; - if(U_FAILURE(*status)) { - return length; - } - if(coll->hasRealData == TRUE) { - length = coll->image->size; - if(length <= capacity) { - uprv_memcpy(buffer, coll->image, length); - } - } else { - length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); - if(length <= capacity) { - /* build the UCATableHeader with minimal entries */ - /* do not copy the header from the UCA file because its values are wrong! */ - /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ - - /* reset everything */ - uprv_memset(buffer, 0, length); - - /* set the tailoring-specific values */ - UCATableHeader *myData = (UCATableHeader *)buffer; - myData->size = length; - - /* offset for the options, the only part of the data that is present after the header */ - myData->options = sizeof(UCATableHeader); - - /* need to always set the expansion value for an upper bound of the options */ - myData->expansion = myData->options + sizeof(UColOptionSet); - - myData->magic = UCOL_HEADER_MAGIC; - myData->isBigEndian = U_IS_BIG_ENDIAN; - myData->charSetFamily = U_CHARSET_FAMILY; - - /* copy UCA's version; genrb will override all but the builder version with tailoring data */ - uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); - - uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); - uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); - uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); - myData->jamoSpecial = coll->image->jamoSpecial; - - /* copy the collator options */ - uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); - } - } - return length; -} - -U_CAPI UCollator* U_EXPORT2 -ucol_openBinary(const uint8_t *bin, int32_t length, - const UCollator *base, - UErrorCode *status) -{ - UCollator *result = NULL; - if(U_FAILURE(*status)){ - return NULL; - } - if(base == NULL) { - // we don't support null base yet - *status = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; + if(coll && coll->UCA) { + uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); } - UCATableHeader *colData = (UCATableHeader *)bin; - // do we want version check here? We're trying to figure out whether collators are compatible - if(uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || - uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0 || - colData->version[0] != UCOL_BUILDER_VERSION) { - *status = U_COLLATOR_VERSION_MISMATCH; - return NULL; - } else { - if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { - result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); - if(U_FAILURE(*status)){ - return NULL; - } - result->hasRealData = TRUE; - } else { - if(base) { - result = ucol_initCollator(base->image, result, base, status); - ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); - if(U_FAILURE(*status)){ - return NULL; - } - result->hasRealData = FALSE; - } else { - *status = U_USELESS_COLLATOR_ERROR; - return NULL; - } - } - result->freeImageOnClose = FALSE; - } - result->validLocale = NULL; - result->requestedLocale = NULL; - result->rules = NULL; - result->rulesLength = 0; - result->freeRulesOnClose = FALSE; - result->rb = NULL; - result->elements = NULL; - return result; } #endif /* #if !UCONFIG_NO_COLLATION */ -