X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..48b980fed3435926e0b3a8d72ecb58be703a1c7a:/icuSources/i18n/ucoleitr.cpp diff --git a/icuSources/i18n/ucoleitr.cpp b/icuSources/i18n/ucoleitr.cpp index f386fb4e..0b7751e4 100644 --- a/icuSources/i18n/ucoleitr.cpp +++ b/icuSources/i18n/ucoleitr.cpp @@ -1,6 +1,6 @@ /* ****************************************************************************** -* Copyright (C) 2001-2003, International Business Machines +* Copyright (C) 2001-2008, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * @@ -20,6 +20,7 @@ #include "unicode/ucoleitr.h" #include "unicode/ustring.h" #include "unicode/sortkey.h" +#include "unicode/uobject.h" #include "ucol_imp.h" #include "cmemory.h" @@ -27,120 +28,488 @@ U_NAMESPACE_USE #define BUFFER_LENGTH 100 +#define DEFAULT_BUFFER_SIZE 16 +#define BUFFER_GROW 8 + +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) + +#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) + +#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) + +#define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0]) + +#define DELETE_ARRAY(array) uprv_free((void *) (array)) + typedef struct collIterate collIterator; -/* public methods ---------------------------------------------------- */ +struct RCEI +{ + uint32_t ce; + int32_t low; + int32_t high; +}; -/** -* Since this is going to be deprecated, I'll leave it as it is -*/ -U_CAPI int32_t U_EXPORT2 -ucol_keyHashCode(const uint8_t *key, - int32_t length) +U_NAMESPACE_BEGIN + +struct RCEBuffer +{ + RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; + RCEI *buffer; + int32_t bufferIndex; + int32_t bufferSize; + + RCEBuffer(); + ~RCEBuffer(); + + UBool empty() const; + void put(uint32_t ce, int32_t ixLow, int32_t ixHigh); + const RCEI *get(); +}; + +RCEBuffer::RCEBuffer() { + buffer = defaultBuffer; + bufferIndex = 0; + bufferSize = DEFAULT_BUFFER_SIZE; +} + +RCEBuffer::~RCEBuffer() +{ + if (buffer != defaultBuffer) { + DELETE_ARRAY(buffer); + } +} + +UBool RCEBuffer::empty() const +{ + return bufferIndex <= 0; +} + +void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh) +{ + if (bufferIndex >= bufferSize) { + RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); + + ARRAY_COPY(newBuffer, buffer, bufferSize); - CollationKey newKey(key, length); - return newKey.hashCode(); + if (buffer != defaultBuffer) { + DELETE_ARRAY(buffer); + } + + buffer = newBuffer; + bufferSize += BUFFER_GROW; + } + + buffer[bufferIndex].ce = ce; + buffer[bufferIndex].low = ixLow; + buffer[bufferIndex].high = ixHigh; + + bufferIndex += 1; +} + +const RCEI *RCEBuffer::get() +{ + if (bufferIndex > 0) { + return &buffer[--bufferIndex]; + } + + return NULL; +} + +struct PCEI +{ + uint64_t ce; + int32_t low; + int32_t high; +}; + +struct PCEBuffer +{ + PCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; + PCEI *buffer; + int32_t bufferIndex; + int32_t bufferSize; + + PCEBuffer(); + ~PCEBuffer(); + + void reset(); + UBool empty() const; + void put(uint64_t ce, int32_t ixLow, int32_t ixHigh); + const PCEI *get(); +}; + +PCEBuffer::PCEBuffer() +{ + buffer = defaultBuffer; + bufferIndex = 0; + bufferSize = DEFAULT_BUFFER_SIZE; +} + +PCEBuffer::~PCEBuffer() +{ + if (buffer != defaultBuffer) { + DELETE_ARRAY(buffer); + } +} + +void PCEBuffer::reset() +{ + bufferIndex = 0; +} + +UBool PCEBuffer::empty() const +{ + return bufferIndex <= 0; +} + +void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh) +{ + if (bufferIndex >= bufferSize) { + PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); + + ARRAY_COPY(newBuffer, buffer, bufferSize); + + if (buffer != defaultBuffer) { + DELETE_ARRAY(buffer); + } + + buffer = newBuffer; + bufferSize += BUFFER_GROW; + } + + buffer[bufferIndex].ce = ce; + buffer[bufferIndex].low = ixLow; + buffer[bufferIndex].high = ixHigh; + + bufferIndex += 1; +} + +const PCEI *PCEBuffer::get() +{ + if (bufferIndex > 0) { + return &buffer[--bufferIndex]; + } + + return NULL; +} + +/* + * This inherits from UObject so that + * it can be allocated by new and the + * constructor for PCEBuffer is called. + */ +struct UCollationPCE : public UObject +{ + PCEBuffer pceBuffer; + UCollationStrength strength; + UBool toShift; + UBool isShifted; + uint32_t variableTop; + + UCollationPCE(UCollationElements *elems); + ~UCollationPCE(); + + void init(const UCollator *coll); + + virtual UClassID getDynamicClassID() const; + static UClassID getStaticClassID(); +}; + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE) + +UCollationPCE::UCollationPCE(UCollationElements *elems) +{ + init(elems->iteratordata_.coll); +} + +void UCollationPCE::init(const UCollator *coll) +{ + UErrorCode status = U_ZERO_ERROR; + + strength = ucol_getStrength(coll); + toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; + isShifted = FALSE; + variableTop = coll->variableTopValue << 16; +} + +UCollationPCE::~UCollationPCE() +{ + // nothing to do +} + + +U_NAMESPACE_END + + +inline uint64_t processCE(UCollationElements *elems, uint32_t ce) +{ + uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; + + // This is clean, but somewhat slow... + // We could apply the mask to ce and then + // just get all three orders... + switch(elems->pce->strength) { + default: + tertiary = ucol_tertiaryOrder(ce); + /* note fall-through */ + + case UCOL_SECONDARY: + secondary = ucol_secondaryOrder(ce); + /* note fall-through */ + + case UCOL_PRIMARY: + primary = ucol_primaryOrder(ce); + } + + // Continuation? + if (elems->pce->toShift && (elems->pce->variableTop > ce && primary != 0) + || (elems->pce->isShifted && primary == 0)) { + + if (primary == 0) { + return UCOL_IGNORABLE; + } + + if (elems->pce->strength >= UCOL_QUATERNARY) { + quaternary = primary; + } + + primary = secondary = tertiary = 0; + elems->pce->isShifted = TRUE; + } else { + if (elems->pce->strength >= UCOL_QUATERNARY) { + quaternary = 0xFFFF; + } + + elems->pce->isShifted = FALSE; + } + + + return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; +} + +U_CAPI void U_EXPORT2 +uprv_init_pce(const UCollationElements *elems) +{ + if (elems->pce != NULL) { + elems->pce->init(elems->iteratordata_.coll); + } } + +/* public methods ---------------------------------------------------- */ + U_CAPI UCollationElements* U_EXPORT2 ucol_openElements(const UCollator *coll, const UChar *text, int32_t textLength, UErrorCode *status) { - UCollationElements *result; - - if (U_FAILURE(*status)) { - return NULL; - } + UCollationElements *result; - result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements)); - /* test for NULL */ - if (result == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; - } + if (U_FAILURE(*status)) { + return NULL; + } - result->reset_ = TRUE; - result->isWritable = FALSE; + result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements)); + /* test for NULL */ + if (result == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + + result->reset_ = TRUE; + result->isWritable = FALSE; + result->pce = NULL; - if (text == NULL) { - textLength = 0; - } - uprv_init_collIterate(coll, text, textLength, &result->iteratordata_); + if (text == NULL) { + textLength = 0; + } + uprv_init_collIterate(coll, text, textLength, &result->iteratordata_); - return result; + return result; } U_CAPI void U_EXPORT2 ucol_closeElements(UCollationElements *elems) { - collIterate *ci = &elems->iteratordata_; - if (ci->writableBuffer != ci->stackWritableBuffer) { - uprv_free(ci->writableBuffer); - } - if (elems->isWritable && elems->iteratordata_.string != NULL) - { - uprv_free(elems->iteratordata_.string); - } - uprv_free(elems); + if (elems != NULL) { + collIterate *ci = &elems->iteratordata_; + + if (ci != NULL) { + if (ci->writableBuffer != ci->stackWritableBuffer) { + uprv_free(ci->writableBuffer); + } + + if (ci->extendCEs) { + uprv_free(ci->extendCEs); + } + + if (ci->offsetBuffer) { + uprv_free(ci->offsetBuffer); + } + } + + if (elems->isWritable && elems->iteratordata_.string != NULL) + { + uprv_free(elems->iteratordata_.string); + } + + if (elems->pce != NULL) { + delete elems->pce; + } + + uprv_free(elems); + } } U_CAPI void U_EXPORT2 ucol_reset(UCollationElements *elems) { - collIterate *ci = &(elems->iteratordata_); - elems->reset_ = TRUE; - ci->pos = ci->string; - if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) { - ci->endp = ci->string + u_strlen(ci->string); - } - ci->CEpos = ci->toReturn = ci->CEs; - ci->flags = UCOL_ITER_HASLEN; - if (ci->coll->normalizationMode == UCOL_ON) { - ci->flags |= UCOL_ITER_NORM; - } - - if (ci->stackWritableBuffer != ci->writableBuffer) { - uprv_free(ci->writableBuffer); - ci->writableBuffer = ci->stackWritableBuffer; - ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; - } - ci->fcdPosition = NULL; + collIterate *ci = &(elems->iteratordata_); + elems->reset_ = TRUE; + ci->pos = ci->string; + if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) { + ci->endp = ci->string + u_strlen(ci->string); + } + ci->CEpos = ci->toReturn = ci->CEs; + ci->flags = UCOL_ITER_HASLEN; + if (ci->coll->normalizationMode == UCOL_ON) { + ci->flags |= UCOL_ITER_NORM; + } + + if (ci->stackWritableBuffer != ci->writableBuffer) { + uprv_free(ci->writableBuffer); + ci->writableBuffer = ci->stackWritableBuffer; + ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; + } + ci->fcdPosition = NULL; + + //ci->offsetReturn = ci->offsetStore = NULL; + ci->offsetRepeatCount = ci->offsetRepeatValue = 0; } U_CAPI int32_t U_EXPORT2 ucol_next(UCollationElements *elems, UErrorCode *status) { - uint32_t result; - if (U_FAILURE(*status)) { - return UCOL_NULLORDER; - } + int32_t result; + if (U_FAILURE(*status)) { + return UCOL_NULLORDER; + } - elems->reset_ = FALSE; + elems->reset_ = FALSE; - result = ucol_getNextCE(elems->iteratordata_.coll, &elems->iteratordata_, - status); - - if (result == UCOL_NO_MORE_CES) { - result = UCOL_NULLORDER; - } - return result; + result = (int32_t)ucol_getNextCE(elems->iteratordata_.coll, + &elems->iteratordata_, + status); + + if (result == UCOL_NO_MORE_CES) { + result = UCOL_NULLORDER; + } + return result; +} + +U_CAPI int64_t U_EXPORT2 +ucol_nextProcessed(UCollationElements *elems, + int32_t *ixLow, + int32_t *ixHigh, + UErrorCode *status) +{ + const UCollator *coll = elems->iteratordata_.coll; + int64_t result = UCOL_IGNORABLE; + uint32_t low = 0, high = 0; + + if (U_FAILURE(*status)) { + return UCOL_PROCESSED_NULLORDER; + } + + if (elems->pce == NULL) { + elems->pce = new UCollationPCE(elems); + } else { + elems->pce->pceBuffer.reset(); + } + + elems->reset_ = FALSE; + + do { + low = ucol_getOffset(elems); + uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status); + high = ucol_getOffset(elems); + + if (ce == UCOL_NO_MORE_CES) { + result = UCOL_PROCESSED_NULLORDER; + break; + } + + result = processCE(elems, ce); + } while (result == UCOL_IGNORABLE); + + if (ixLow != NULL) { + *ixLow = low; + } + + if (ixHigh != NULL) { + *ixHigh = high; + } + + return result; } U_CAPI int32_t U_EXPORT2 ucol_previous(UCollationElements *elems, UErrorCode *status) { - if(U_FAILURE(*status)) { - return UCOL_NULLORDER; - } - else - { - uint32_t result; + if(U_FAILURE(*status)) { + return UCOL_NULLORDER; + } + else + { + int32_t result; + + if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) { + if (elems->iteratordata_.endp == NULL) { + elems->iteratordata_.endp = elems->iteratordata_.string + + u_strlen(elems->iteratordata_.string); + elems->iteratordata_.flags |= UCOL_ITER_HASLEN; + } + elems->iteratordata_.pos = elems->iteratordata_.endp; + elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; + } + + elems->reset_ = FALSE; + + result = (int32_t)ucol_getPrevCE(elems->iteratordata_.coll, + &(elems->iteratordata_), + status); + + if (result == UCOL_NO_MORE_CES) { + result = UCOL_NULLORDER; + } + + return result; + } +} + +U_CAPI int64_t U_EXPORT2 +ucol_previousProcessed(UCollationElements *elems, + int32_t *ixLow, + int32_t *ixHigh, + UErrorCode *status) +{ + const UCollator *coll = elems->iteratordata_.coll; + int64_t result = UCOL_IGNORABLE; + // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; + // UCollationStrength strength = ucol_getStrength(coll); + // UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; + // uint32_t variableTop = coll->variableTopValue; + int32_t low = 0, high = 0; + + if (U_FAILURE(*status)) { + return UCOL_PROCESSED_NULLORDER; + } if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) { @@ -149,30 +518,138 @@ ucol_previous(UCollationElements *elems, u_strlen(elems->iteratordata_.string); elems->iteratordata_.flags |= UCOL_ITER_HASLEN; } + elems->iteratordata_.pos = elems->iteratordata_.endp; elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; } + if (elems->pce == NULL) { + elems->pce = new UCollationPCE(elems); + } else { + //elems->pce->pceBuffer.reset(); + } + elems->reset_ = FALSE; - result = ucol_getPrevCE(elems->iteratordata_.coll, &(elems->iteratordata_), - status); + while (elems->pce->pceBuffer.empty()) { + // buffer raw CEs up to non-ignorable primary + RCEBuffer rceb; + uint32_t ce; + + // **** do we need to reset rceb, or will it always be empty at this point **** + do { + high = ucol_getOffset(elems); + ce = ucol_getPrevCE(coll, &elems->iteratordata_, status); + low = ucol_getOffset(elems); + + if (ce == UCOL_NO_MORE_CES) { + if (! rceb.empty()) { + break; + } + + goto finish; + } + + rceb.put(ce, low, high); + } while ((ce & UCOL_PRIMARYMASK) == 0); + + // process the raw CEs + while (! rceb.empty()) { + const RCEI *rcei = rceb.get(); + + result = processCE(elems, rcei->ce); + + if (result != UCOL_IGNORABLE) { + elems->pce->pceBuffer.put(result, rcei->low, rcei->high); + } + } + } - if (result == UCOL_NO_MORE_CES) { - result = UCOL_NULLORDER; +finish: + if (elems->pce->pceBuffer.empty()) { + // **** Is -1 the right value for ixLow, ixHigh? **** + if (ixLow != NULL) { + *ixLow = -1; + } + + if (ixHigh != NULL) { + *ixHigh = -1 + ; + } + return UCOL_PROCESSED_NULLORDER; } - return result; - } + const PCEI *pcei = elems->pce->pceBuffer.get(); + + if (ixLow != NULL) { + *ixLow = pcei->low; + } + + if (ixHigh != NULL) { + *ixHigh = pcei->high; + } + + return pcei->ce; } U_CAPI int32_t U_EXPORT2 ucol_getMaxExpansion(const UCollationElements *elems, int32_t order) { - uint8_t result; - UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result); - return result; + uint8_t result; + +#if 0 + UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result); +#else + const UCollator *coll = elems->iteratordata_.coll; + const uint32_t *start; + const uint32_t *limit; + const uint32_t *mid; + uint32_t strengthMask = 0; + uint32_t mOrder = (uint32_t) order; + + switch (coll->strength) + { + default: + strengthMask |= UCOL_TERTIARYORDERMASK; + /* fall through */ + + case UCOL_SECONDARY: + strengthMask |= UCOL_SECONDARYORDERMASK; + /* fall through */ + + case UCOL_PRIMARY: + strengthMask |= UCOL_PRIMARYORDERMASK; + } + + mOrder &= strengthMask; + start = (coll)->endExpansionCE; + limit = (coll)->lastEndExpansionCE; + + while (start < limit - 1) { + mid = start + ((limit - start) >> 1); + if (mOrder <= (*mid & strengthMask)) { + limit = mid; + } else { + start = mid; + } + } + + // FIXME: with a masked search, there might be more than one hit, + // so we need to look forward and backward from the match to find all + // of the hits... + if ((*start & strengthMask) == mOrder) { + result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE)); + } else if ((*limit & strengthMask) == mOrder) { + result = *(coll->expansionCESize + (limit - coll->endExpansionCE)); + } else if ((mOrder & 0xFFFF) == 0x00C0) { + result = 2; + } else { + result = 1; + } +#endif + + return result; } U_CAPI void U_EXPORT2 @@ -181,30 +658,42 @@ ucol_setText( UCollationElements *elems, int32_t textLength, UErrorCode *status) { - if (U_FAILURE(*status)) { - return; - } + if (U_FAILURE(*status)) { + return; + } - if (elems->isWritable && elems->iteratordata_.string != NULL) - { - uprv_free(elems->iteratordata_.string); - } - - if (text == NULL) { - textLength = 0; - } + if (elems->isWritable && elems->iteratordata_.string != NULL) + { + uprv_free(elems->iteratordata_.string); + } - elems->isWritable = FALSE; - uprv_init_collIterate(elems->iteratordata_.coll, text, textLength, - &elems->iteratordata_); + if (text == NULL) { + textLength = 0; + } + + elems->isWritable = FALSE; + + /* free offset buffer to avoid memory leak before initializing. */ + freeOffsetBuffer(&(elems->iteratordata_)); + uprv_init_collIterate(elems->iteratordata_.coll, text, textLength, + &elems->iteratordata_); - elems->reset_ = TRUE; + elems->reset_ = TRUE; } U_CAPI int32_t U_EXPORT2 ucol_getOffset(const UCollationElements *elems) { const collIterate *ci = &(elems->iteratordata_); + + if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) { + return ci->offsetRepeatValue; + } + + if (ci->offsetReturn != NULL) { + return *ci->offsetReturn; + } + // while processing characters in normalization buffer getOffset will // return the next non-normalized character. // should be inline with the old implementation since the old codes uses @@ -226,44 +715,48 @@ ucol_setOffset(UCollationElements *elems, int32_t offset, UErrorCode *status) { - if (U_FAILURE(*status)) { - return; - } + if (U_FAILURE(*status)) { + return; + } - // this methods will clean up any use of the writable buffer and points to - // the original string - collIterate *ci = &(elems->iteratordata_); - ci->pos = ci->string + offset; - ci->CEpos = ci->toReturn = ci->CEs; - if (ci->flags & UCOL_ITER_INNORMBUF) { - ci->flags = ci->origFlags; - } - if ((ci->flags & UCOL_ITER_HASLEN) == 0) { - ci->endp = ci->string + u_strlen(ci->string); - ci->flags |= UCOL_ITER_HASLEN; - } - ci->fcdPosition = NULL; - elems->reset_ = FALSE; + // this methods will clean up any use of the writable buffer and points to + // the original string + collIterate *ci = &(elems->iteratordata_); + ci->pos = ci->string + offset; + ci->CEpos = ci->toReturn = ci->CEs; + if (ci->flags & UCOL_ITER_INNORMBUF) { + ci->flags = ci->origFlags; + } + if ((ci->flags & UCOL_ITER_HASLEN) == 0) { + ci->endp = ci->string + u_strlen(ci->string); + ci->flags |= UCOL_ITER_HASLEN; + } + ci->fcdPosition = NULL; + elems->reset_ = FALSE; + + ci->offsetReturn = NULL; + ci->offsetStore = ci->offsetBuffer; + ci->offsetRepeatCount = ci->offsetRepeatValue = 0; } U_CAPI int32_t U_EXPORT2 ucol_primaryOrder (int32_t order) { - order &= UCOL_PRIMARYMASK; - return (order >> UCOL_PRIMARYORDERSHIFT); + order &= UCOL_PRIMARYMASK; + return (order >> UCOL_PRIMARYORDERSHIFT); } U_CAPI int32_t U_EXPORT2 ucol_secondaryOrder (int32_t order) { - order &= UCOL_SECONDARYMASK; - return (order >> UCOL_SECONDARYORDERSHIFT); + order &= UCOL_SECONDARYMASK; + return (order >> UCOL_SECONDARYORDERSHIFT); } U_CAPI int32_t U_EXPORT2 ucol_tertiaryOrder (int32_t order) { - return (order & UCOL_TERTIARYMASK); + return (order & UCOL_TERTIARYMASK); } #endif /* #if !UCONFIG_NO_COLLATION */