X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..0f5d89e82340278ed3d7d50029f37cab2c41a57e:/icuSources/i18n/ucoleitr.cpp?ds=sidebyside diff --git a/icuSources/i18n/ucoleitr.cpp b/icuSources/i18n/ucoleitr.cpp index f386fb4e..e56ea1ef 100644 --- a/icuSources/i18n/ucoleitr.cpp +++ b/icuSources/i18n/ucoleitr.cpp @@ -1,6 +1,8 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** -* Copyright (C) 2001-2003, International Business Machines +* Copyright (C) 2001-2016, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * @@ -11,38 +13,262 @@ * Date Name Description * 02/15/2001 synwee Modified all methods to process its own function * instead of calling the equivalent c++ api (coleitr.h) +* 2012-2014 markus Rewritten in C++ again. ******************************************************************************/ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION +#include "unicode/coleitr.h" +#include "unicode/tblcoll.h" #include "unicode/ucoleitr.h" #include "unicode/ustring.h" #include "unicode/sortkey.h" -#include "ucol_imp.h" +#include "unicode/uobject.h" #include "cmemory.h" +#include "usrchimp.h" U_NAMESPACE_USE #define BUFFER_LENGTH 100 -typedef struct collIterate collIterator; +#define DEFAULT_BUFFER_SIZE 16 +#define BUFFER_GROW 8 -/* public methods ---------------------------------------------------- */ +#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0]) -/** -* Since this is going to be deprecated, I'll leave it as it is -*/ -U_CAPI int32_t U_EXPORT2 -ucol_keyHashCode(const uint8_t *key, - int32_t length) +#define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type)) + +#define DELETE_ARRAY(array) uprv_free((void *) (array)) + +struct RCEI +{ + uint32_t ce; + int32_t low; + int32_t high; +}; + +U_NAMESPACE_BEGIN + +struct RCEBuffer +{ + RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; + RCEI *buffer; + int32_t bufferIndex; + int32_t bufferSize; + + RCEBuffer(); + ~RCEBuffer(); + + UBool isEmpty() const; + void put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); + const RCEI *get(); +}; + +RCEBuffer::RCEBuffer() +{ + buffer = defaultBuffer; + bufferIndex = 0; + bufferSize = UPRV_LENGTHOF(defaultBuffer); +} + +RCEBuffer::~RCEBuffer() +{ + if (buffer != defaultBuffer) { + DELETE_ARRAY(buffer); + } +} + +UBool RCEBuffer::isEmpty() const +{ + return bufferIndex <= 0; +} + +void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode) +{ + if (U_FAILURE(errorCode)) { + return; + } + if (bufferIndex >= bufferSize) { + RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); + if (newBuffer == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + + ARRAY_COPY(newBuffer, buffer, bufferSize); + + if (buffer != defaultBuffer) { + DELETE_ARRAY(buffer); + } + + buffer = newBuffer; + bufferSize += BUFFER_GROW; + } + + buffer[bufferIndex].ce = ce; + buffer[bufferIndex].low = ixLow; + buffer[bufferIndex].high = ixHigh; + + bufferIndex += 1; +} + +const RCEI *RCEBuffer::get() +{ + if (bufferIndex > 0) { + return &buffer[--bufferIndex]; + } + + return NULL; +} + +PCEBuffer::PCEBuffer() +{ + buffer = defaultBuffer; + bufferIndex = 0; + bufferSize = UPRV_LENGTHOF(defaultBuffer); +} + +PCEBuffer::~PCEBuffer() { + if (buffer != defaultBuffer) { + DELETE_ARRAY(buffer); + } +} - CollationKey newKey(key, length); - return newKey.hashCode(); +void PCEBuffer::reset() +{ + bufferIndex = 0; } +UBool PCEBuffer::isEmpty() const +{ + return bufferIndex <= 0; +} + +void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode) +{ + if (U_FAILURE(errorCode)) { + return; + } + if (bufferIndex >= bufferSize) { + PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); + if (newBuffer == NULL) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + + ARRAY_COPY(newBuffer, buffer, bufferSize); + + if (buffer != defaultBuffer) { + DELETE_ARRAY(buffer); + } + + buffer = newBuffer; + bufferSize += BUFFER_GROW; + } + + buffer[bufferIndex].ce = ce; + buffer[bufferIndex].low = ixLow; + buffer[bufferIndex].high = ixHigh; + + bufferIndex += 1; +} + +const PCEI *PCEBuffer::get() +{ + if (bufferIndex > 0) { + return &buffer[--bufferIndex]; + } + + return NULL; +} + +UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); } + +UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); } + +void UCollationPCE::init(UCollationElements *elems) { + init(CollationElementIterator::fromUCollationElements(elems)); +} + +void UCollationPCE::init(CollationElementIterator *iter) +{ + cei = iter; + init(*iter->rbc_); +} + +void UCollationPCE::init(const Collator &coll) +{ + UErrorCode status = U_ZERO_ERROR; + + strength = coll.getAttribute(UCOL_STRENGTH, status); + toShift = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; + isShifted = FALSE; + variableTop = coll.getVariableTop(status); +} + +UCollationPCE::~UCollationPCE() +{ + // nothing to do +} + +uint64_t UCollationPCE::processCE(uint32_t ce) +{ + uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; + + // This is clean, but somewhat slow... + // We could apply the mask to ce and then + // just get all three orders... + switch(strength) { + default: + tertiary = ucol_tertiaryOrder(ce); + U_FALLTHROUGH; + + case UCOL_SECONDARY: + secondary = ucol_secondaryOrder(ce); + U_FALLTHROUGH; + + case UCOL_PRIMARY: + primary = ucol_primaryOrder(ce); + } + + // **** This should probably handle continuations too. **** + // **** That means that we need 24 bits for the primary **** + // **** instead of the 16 that we're currently using. **** + // **** So we can lay out the 64 bits as: 24.12.12.16. **** + // **** Another complication with continuations is that **** + // **** the *second* CE is marked as a continuation, so **** + // **** we always have to peek ahead to know how long **** + // **** the primary is... **** + if ((toShift && variableTop > ce && primary != 0) + || (isShifted && primary == 0)) { + + if (primary == 0) { + return UCOL_IGNORABLE; + } + + if (strength >= UCOL_QUATERNARY) { + quaternary = primary; + } + + primary = secondary = tertiary = 0; + isShifted = TRUE; + } else { + if (strength >= UCOL_QUATERNARY) { + quaternary = 0xFFFF; + } + + isShifted = FALSE; + } + + return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; +} + +U_NAMESPACE_END + +/* public methods ---------------------------------------------------- */ U_CAPI UCollationElements* U_EXPORT2 ucol_openElements(const UCollator *coll, @@ -50,175 +276,247 @@ ucol_openElements(const UCollator *coll, int32_t textLength, UErrorCode *status) { - UCollationElements *result; - - if (U_FAILURE(*status)) { - return NULL; - } - - result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements)); - /* test for NULL */ - if (result == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - - result->reset_ = TRUE; - result->isWritable = FALSE; + if (U_FAILURE(*status)) { + return NULL; + } + if (coll == NULL || (text == NULL && textLength != 0)) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); + if (rbc == NULL) { + *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator + return NULL; + } - if (text == NULL) { - textLength = 0; - } - uprv_init_collIterate(coll, text, textLength, &result->iteratordata_); + UnicodeString s((UBool)(textLength < 0), text, textLength); + CollationElementIterator *cei = rbc->createCollationElementIterator(s); + if (cei == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } - return result; + return cei->toUCollationElements(); } + U_CAPI void U_EXPORT2 ucol_closeElements(UCollationElements *elems) { - collIterate *ci = &elems->iteratordata_; - if (ci->writableBuffer != ci->stackWritableBuffer) { - uprv_free(ci->writableBuffer); - } - if (elems->isWritable && elems->iteratordata_.string != NULL) - { - uprv_free(elems->iteratordata_.string); - } - uprv_free(elems); + delete CollationElementIterator::fromUCollationElements(elems); } U_CAPI void U_EXPORT2 ucol_reset(UCollationElements *elems) { - collIterate *ci = &(elems->iteratordata_); - elems->reset_ = TRUE; - ci->pos = ci->string; - if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) { - ci->endp = ci->string + u_strlen(ci->string); - } - ci->CEpos = ci->toReturn = ci->CEs; - ci->flags = UCOL_ITER_HASLEN; - if (ci->coll->normalizationMode == UCOL_ON) { - ci->flags |= UCOL_ITER_NORM; - } - - if (ci->stackWritableBuffer != ci->writableBuffer) { - uprv_free(ci->writableBuffer); - ci->writableBuffer = ci->stackWritableBuffer; - ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; - } - ci->fcdPosition = NULL; + CollationElementIterator::fromUCollationElements(elems)->reset(); } U_CAPI int32_t U_EXPORT2 ucol_next(UCollationElements *elems, UErrorCode *status) { - uint32_t result; - if (U_FAILURE(*status)) { - return UCOL_NULLORDER; - } + if (U_FAILURE(*status)) { + return UCOL_NULLORDER; + } - elems->reset_ = FALSE; + return CollationElementIterator::fromUCollationElements(elems)->next(*status); +} - result = ucol_getNextCE(elems->iteratordata_.coll, &elems->iteratordata_, - status); - - if (result == UCOL_NO_MORE_CES) { - result = UCOL_NULLORDER; - } - return result; +// temporarily restore the following removed internal function which is used by Spotlight +U_CAPI int64_t U_EXPORT2 +ucol_nextProcessed(UCollationElements *elems, + int32_t *ixLow, + int32_t *ixHigh, + UErrorCode *status) +{ + return (UCollationPCE(elems)).nextProcessed(ixLow, ixHigh, status); } + +U_NAMESPACE_BEGIN + +int64_t +UCollationPCE::nextProcessed( + int32_t *ixLow, + int32_t *ixHigh, + UErrorCode *status) +{ + int64_t result = UCOL_IGNORABLE; + uint32_t low = 0, high = 0; + + if (U_FAILURE(*status)) { + return UCOL_PROCESSED_NULLORDER; + } + + pceBuffer.reset(); + + do { + low = cei->getOffset(); + int32_t ce = cei->next(*status); + high = cei->getOffset(); + + if (ce == UCOL_NULLORDER) { + result = UCOL_PROCESSED_NULLORDER; + break; + } + + result = processCE((uint32_t)ce); + } while (result == UCOL_IGNORABLE); + + if (ixLow != NULL) { + *ixLow = low; + } + + if (ixHigh != NULL) { + *ixHigh = high; + } + + return result; +} + +U_NAMESPACE_END + U_CAPI int32_t U_EXPORT2 ucol_previous(UCollationElements *elems, UErrorCode *status) { - if(U_FAILURE(*status)) { - return UCOL_NULLORDER; - } - else - { - uint32_t result; - - if (elems->reset_ && - (elems->iteratordata_.pos == elems->iteratordata_.string)) { - if (elems->iteratordata_.endp == NULL) { - elems->iteratordata_.endp = elems->iteratordata_.string + - u_strlen(elems->iteratordata_.string); - elems->iteratordata_.flags |= UCOL_ITER_HASLEN; + if(U_FAILURE(*status)) { + return UCOL_NULLORDER; + } + return CollationElementIterator::fromUCollationElements(elems)->previous(*status); +} + +// temporarily restore the following removed internal function which is used by Spotlight +U_CAPI int64_t U_EXPORT2 +ucol_previousProcessed(UCollationElements *elems, + int32_t *ixLow, + int32_t *ixHigh, + UErrorCode *status) +{ + return (UCollationPCE(elems)).previousProcessed(ixLow, ixHigh, status); +} + +U_NAMESPACE_BEGIN + +int64_t +UCollationPCE::previousProcessed( + int32_t *ixLow, + int32_t *ixHigh, + UErrorCode *status) +{ + int64_t result = UCOL_IGNORABLE; + int32_t low = 0, high = 0; + + if (U_FAILURE(*status)) { + return UCOL_PROCESSED_NULLORDER; + } + + // pceBuffer.reset(); + + while (pceBuffer.isEmpty()) { + // buffer raw CEs up to non-ignorable primary + RCEBuffer rceb; + int32_t ce; + + // **** do we need to reset rceb, or will it always be empty at this point **** + do { + high = cei->getOffset(); + ce = cei->previous(*status); + low = cei->getOffset(); + + if (ce == UCOL_NULLORDER) { + if (!rceb.isEmpty()) { + break; + } + + goto finish; + } + + rceb.put((uint32_t)ce, low, high, *status); + } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce))); + + // process the raw CEs + while (U_SUCCESS(*status) && !rceb.isEmpty()) { + const RCEI *rcei = rceb.get(); + + result = processCE(rcei->ce); + + if (result != UCOL_IGNORABLE) { + pceBuffer.put(result, rcei->low, rcei->high, *status); + } + } + if (U_FAILURE(*status)) { + return UCOL_PROCESSED_NULLORDER; } - elems->iteratordata_.pos = elems->iteratordata_.endp; - elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; } - elems->reset_ = FALSE; +finish: + if (pceBuffer.isEmpty()) { + // **** Is -1 the right value for ixLow, ixHigh? **** + if (ixLow != NULL) { + *ixLow = -1; + } + + if (ixHigh != NULL) { + *ixHigh = -1 + ; + } + return UCOL_PROCESSED_NULLORDER; + } - result = ucol_getPrevCE(elems->iteratordata_.coll, &(elems->iteratordata_), - status); + const PCEI *pcei = pceBuffer.get(); - if (result == UCOL_NO_MORE_CES) { - result = UCOL_NULLORDER; + if (ixLow != NULL) { + *ixLow = pcei->low; } - return result; - } + if (ixHigh != NULL) { + *ixHigh = pcei->high; + } + + return pcei->ce; } +U_NAMESPACE_END + U_CAPI int32_t U_EXPORT2 ucol_getMaxExpansion(const UCollationElements *elems, int32_t order) { - uint8_t result; - UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result); - return result; + return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order); + + // TODO: The old code masked the order according to strength and then did a binary search. + // However this was probably at least partially broken because of the following comment. + // Still, it might have found a match when this version may not. + + // FIXME: with a masked search, there might be more than one hit, + // so we need to look forward and backward from the match to find all + // of the hits... } - + U_CAPI void U_EXPORT2 ucol_setText( UCollationElements *elems, const UChar *text, int32_t textLength, UErrorCode *status) { - if (U_FAILURE(*status)) { - return; - } - - if (elems->isWritable && elems->iteratordata_.string != NULL) - { - uprv_free(elems->iteratordata_.string); - } - - if (text == NULL) { - textLength = 0; - } - - elems->isWritable = FALSE; - uprv_init_collIterate(elems->iteratordata_.coll, text, textLength, - &elems->iteratordata_); + if (U_FAILURE(*status)) { + return; + } - elems->reset_ = TRUE; + if ((text == NULL && textLength != 0)) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + UnicodeString s((UBool)(textLength < 0), text, textLength); + return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status); } U_CAPI int32_t U_EXPORT2 ucol_getOffset(const UCollationElements *elems) { - const collIterate *ci = &(elems->iteratordata_); - // while processing characters in normalization buffer getOffset will - // return the next non-normalized character. - // should be inline with the old implementation since the old codes uses - // nextDecomp in normalizer which also decomposes the string till the - // first base character is found. - if (ci->flags & UCOL_ITER_INNORMBUF) { - if (ci->fcdPosition == NULL) { - return 0; - } - return (int32_t)(ci->fcdPosition - ci->string); - } - else { - return (int32_t)(ci->pos - ci->string); - } + return CollationElementIterator::fromUCollationElements(elems)->getOffset(); } U_CAPI void U_EXPORT2 @@ -226,44 +524,29 @@ ucol_setOffset(UCollationElements *elems, int32_t offset, UErrorCode *status) { - if (U_FAILURE(*status)) { - return; - } - - // this methods will clean up any use of the writable buffer and points to - // the original string - collIterate *ci = &(elems->iteratordata_); - ci->pos = ci->string + offset; - ci->CEpos = ci->toReturn = ci->CEs; - if (ci->flags & UCOL_ITER_INNORMBUF) { - ci->flags = ci->origFlags; - } - if ((ci->flags & UCOL_ITER_HASLEN) == 0) { - ci->endp = ci->string + u_strlen(ci->string); - ci->flags |= UCOL_ITER_HASLEN; - } - ci->fcdPosition = NULL; - elems->reset_ = FALSE; + if (U_FAILURE(*status)) { + return; + } + + CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status); } U_CAPI int32_t U_EXPORT2 ucol_primaryOrder (int32_t order) { - order &= UCOL_PRIMARYMASK; - return (order >> UCOL_PRIMARYORDERSHIFT); + return (order >> 16) & 0xffff; } U_CAPI int32_t U_EXPORT2 ucol_secondaryOrder (int32_t order) { - order &= UCOL_SECONDARYMASK; - return (order >> UCOL_SECONDARYORDERSHIFT); + return (order >> 8) & 0xff; } U_CAPI int32_t U_EXPORT2 ucol_tertiaryOrder (int32_t order) { - return (order & UCOL_TERTIARYMASK); + return order & 0xff; } #endif /* #if !UCONFIG_NO_COLLATION */