+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
-* Copyright (C) 2001-2003, International Business Machines
+* Copyright (C) 2001-2016, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
*
* Date Name Description
* 02/15/2001 synwee Modified all methods to process its own function
* instead of calling the equivalent c++ api (coleitr.h)
+* 2012-2014 markus Rewritten in C++ again.
******************************************************************************/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
+#include "unicode/coleitr.h"
+#include "unicode/tblcoll.h"
#include "unicode/ucoleitr.h"
#include "unicode/ustring.h"
#include "unicode/sortkey.h"
-#include "ucol_imp.h"
+#include "unicode/uobject.h"
#include "cmemory.h"
+#include "usrchimp.h"
U_NAMESPACE_USE
+// NOTE(review): no use of BUFFER_LENGTH is visible in this part of the file.
#define BUFFER_LENGTH 100
-typedef struct collIterate collIterator;
+// Number of entries in the defaultBuffer array embedded in RCEBuffer.
+#define DEFAULT_BUFFER_SIZE 16
+// Number of extra entries added each time RCEBuffer::put() or PCEBuffer::put()
+// finds its buffer full.
+#define BUFFER_GROW 8
-/* public methods ---------------------------------------------------- */
+// Copies `count` elements from src to dst with uprv_memcpy. The byte count is
+// count * sizeof(src[0]), so the element size is taken from src.
+#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])
-/**
-* Since this is going to be deprecated, I'll leave it as it is
-*/
-U_CAPI int32_t U_EXPORT2
-ucol_keyHashCode(const uint8_t *key,
- int32_t length)
+// Allocates room for `count` objects of `type` with uprv_malloc and casts the
+// result to type *. The memory is raw: nothing here constructs the elements.
+#define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))
+
+// Releases an array with uprv_free (the counterpart of the uprv_malloc in NEW_ARRAY).
+#define DELETE_ARRAY(array) uprv_free((void *) (array))
+
+// One raw collation element recorded by RCEBuffer::put(), together with the
+// two iterator offsets that were passed in with it.
+struct RCEI
+{
+ uint32_t ce; // the collation element value as stored by put()
+ int32_t low; // put()'s ixLow argument
+ int32_t high; // put()'s ixHigh argument
+};
+
+U_NAMESPACE_BEGIN
+
+// A last-in, first-out store of RCEI entries. It starts out using the embedded
+// defaultBuffer and switches to a heap array once that is full (see put()).
+struct RCEBuffer
+{
+ RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; // in-object storage used until it overflows
+ RCEI *buffer; // current storage: defaultBuffer or a heap array
+ int32_t bufferIndex; // number of stored entries; also the next write position
+ int32_t bufferSize; // capacity of `buffer`, in entries
+
+ RCEBuffer();
+ ~RCEBuffer();
+
+ // TRUE when no entries are stored.
+ UBool isEmpty() const;
+ // Appends one entry; sets errorCode to U_MEMORY_ALLOCATION_ERROR if growing fails.
+ void put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
+ // Removes and returns the most recently stored entry, or NULL when empty.
+ const RCEI *get();
+};
+
+// Starts empty, using the embedded defaultBuffer as storage.
+RCEBuffer::RCEBuffer()
+    : buffer(defaultBuffer),
+      bufferIndex(0),
+      bufferSize(UPRV_LENGTHOF(defaultBuffer))
+{
+}
+
+// Frees the heap array, if put() ever had to allocate one.
+RCEBuffer::~RCEBuffer()
+{
+    if (buffer == defaultBuffer) {
+        // Still on the embedded storage: nothing was allocated.
+        return;
+    }
+
+    DELETE_ARRAY(buffer);
+}
+
+// Reports whether the buffer currently holds no entries.
+UBool RCEBuffer::isEmpty() const
+{
+    return !(bufferIndex > 0);
+}
+
+// Appends (ce, ixLow, ixHigh) as the newest entry.
+// Does nothing if errorCode already indicates failure. When the storage is
+// full it is enlarged by BUFFER_GROW entries first; if that allocation fails,
+// errorCode is set to U_MEMORY_ALLOCATION_ERROR and nothing is stored.
+void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
+{
+    if (U_FAILURE(errorCode)) {
+        return;
+    }
+
+    if (bufferIndex >= bufferSize) {
+        const int32_t grownSize = bufferSize + BUFFER_GROW;
+        RCEI *grown = NEW_ARRAY(RCEI, grownSize);
+
+        if (grown == NULL) {
+            errorCode = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+
+        // Carry the existing entries over before releasing the old storage.
+        ARRAY_COPY(grown, buffer, bufferSize);
+
+        // The embedded defaultBuffer must never be passed to DELETE_ARRAY.
+        if (buffer != defaultBuffer) {
+            DELETE_ARRAY(buffer);
+        }
+
+        buffer = grown;
+        bufferSize = grownSize;
+    }
+
+    RCEI &slot = buffer[bufferIndex++];
+
+    slot.ce   = ce;
+    slot.low  = ixLow;
+    slot.high = ixHigh;
+}
+
+// Removes the newest entry and returns a pointer to it, or NULL when empty.
+// The pointer refers to the buffer's own storage.
+const RCEI *RCEBuffer::get()
+{
+    if (bufferIndex <= 0) {
+        return NULL;
+    }
+
+    bufferIndex -= 1;
+    return &buffer[bufferIndex];
+}
+
+// Starts empty, using the embedded defaultBuffer as storage.
+PCEBuffer::PCEBuffer()
+{
+    // No entries yet.
+    bufferIndex = 0;
+
+    // Storage is the in-object array, and the capacity is its length.
+    bufferSize = UPRV_LENGTHOF(defaultBuffer);
+    buffer = defaultBuffer;
+}
+
+// Frees the heap array, if put() ever had to allocate one.
+PCEBuffer::~PCEBuffer()
{
+    // The embedded defaultBuffer must never be passed to DELETE_ARRAY.
+    const UBool usesHeapStorage = (buffer != defaultBuffer);
+
+    if (usesHeapStorage) {
+        DELETE_ARRAY(buffer);
+    }
+}
- CollationKey newKey(key, length);
- return newKey.hashCode();
+// Discards all stored entries by rewinding the write index.
+// The storage itself (embedded or heap) is kept as it is.
+void PCEBuffer::reset()
+{
+ bufferIndex = 0;
}
+// Reports whether the buffer currently holds no entries.
+UBool PCEBuffer::isEmpty() const
+{
+    return !(bufferIndex > 0);
+}
+
+// Appends (ce, ixLow, ixHigh) as the newest entry.
+// Does nothing if errorCode already indicates failure. When the storage is
+// full it is enlarged by BUFFER_GROW entries first; if that allocation fails,
+// errorCode is set to U_MEMORY_ALLOCATION_ERROR and nothing is stored.
+void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
+{
+    if (U_FAILURE(errorCode)) {
+        return;
+    }
+
+    if (bufferIndex >= bufferSize) {
+        const int32_t grownSize = bufferSize + BUFFER_GROW;
+        PCEI *grown = NEW_ARRAY(PCEI, grownSize);
+
+        if (grown == NULL) {
+            errorCode = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+
+        // Carry the existing entries over before releasing the old storage.
+        ARRAY_COPY(grown, buffer, bufferSize);
+
+        // The embedded defaultBuffer must never be passed to DELETE_ARRAY.
+        if (buffer != defaultBuffer) {
+            DELETE_ARRAY(buffer);
+        }
+
+        buffer = grown;
+        bufferSize = grownSize;
+    }
+
+    PCEI &slot = buffer[bufferIndex++];
+
+    slot.ce   = ce;
+    slot.low  = ixLow;
+    slot.high = ixHigh;
+}
+
+// Removes the newest entry and returns a pointer to it, or NULL when empty.
+// The pointer refers to the buffer's own storage.
+const PCEI *PCEBuffer::get()
+{
+    if (bufferIndex <= 0) {
+        return NULL;
+    }
+
+    bufferIndex -= 1;
+    return &buffer[bufferIndex];
+}
+
+// Builds the processed-CE helper from a C API iterator handle.
+UCollationPCE::UCollationPCE(UCollationElements *elems)
+{
+    init(elems);
+}
+
+// Builds the processed-CE helper from a C++ iterator.
+UCollationPCE::UCollationPCE(CollationElementIterator *iter)
+{
+    init(iter);
+}
+
+// Converts the C handle to its C++ iterator and initializes from that.
+void UCollationPCE::init(UCollationElements *elems)
+{
+    CollationElementIterator *iter = CollationElementIterator::fromUCollationElements(elems);
+    init(iter);
+}
+
+// Remembers the iterator and reads the collation settings from the
+// collator it refers to (iter->rbc_). iter must not be NULL.
+void UCollationPCE::init(CollationElementIterator *iter)
+{
+    const Collator &owner = *iter->rbc_;
+
+    cei = iter;
+    init(owner);
+}
+
+// Caches the collator settings that processCE() consults: the strength,
+// whether alternate handling is UCOL_SHIFTED, and the variable top.
+// Errors from the attribute queries land in a local status and are not reported.
+void UCollationPCE::init(const Collator &coll)
+{
+    UErrorCode status = U_ZERO_ERROR;
+
+    strength = coll.getAttribute(UCOL_STRENGTH, status);
+
+    const UColAttributeValue alternateHandling = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status);
+    toShift = (alternateHandling == UCOL_SHIFTED);
+
+    // No shifted element has been seen yet.
+    isShifted = FALSE;
+
+    variableTop = coll.getVariableTop(status);
+}
+
+// The destructor body releases nothing itself; in particular it does not
+// delete cei, which init() only stores.
+UCollationPCE::~UCollationPCE()
+{
+ // nothing to do
+}
+
+// Converts one 32-bit collation element into a 64-bit "processed" element.
+//
+// The result packs the weights as primary << 48 | secondary << 32 |
+// tertiary << 16 | quaternary. Only the levels selected by `strength` are
+// extracted; the others stay 0. When the element is treated as shifted, the
+// first three weights are zeroed and (at quaternary strength or above) the
+// primary moves into the quaternary field.
+//
+// Returns UCOL_IGNORABLE for an element with a zero primary that follows a
+// shifted element. Updates isShifted as a side effect, except on that early
+// return, where it is left unchanged.
+uint64_t UCollationPCE::processCE(uint32_t ce)
+{
+ uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
+
+ // This is clean, but somewhat slow...
+ // We could apply the mask to ce and then
+ // just get all three orders...
+ // The cases fall through on purpose: each strength also extracts every
+ // weight of the levels below it.
+ switch(strength) {
+ default:
+ tertiary = ucol_tertiaryOrder(ce);
+ U_FALLTHROUGH;
+
+ case UCOL_SECONDARY:
+ secondary = ucol_secondaryOrder(ce);
+ U_FALLTHROUGH;
+
+ case UCOL_PRIMARY:
+ primary = ucol_primaryOrder(ce);
+ }
+
+ // **** This should probably handle continuations too. ****
+ // **** That means that we need 24 bits for the primary ****
+ // **** instead of the 16 that we're currently using. ****
+ // **** So we can lay out the 64 bits as: 24.12.12.16. ****
+ // **** Another complication with continuations is that ****
+ // **** the *second* CE is marked as a continuation, so ****
+ // **** we always have to peek ahead to know how long ****
+ // **** the primary is... ****
+ // Shifted handling applies in two situations: shifting is enabled and this
+ // element has a non-zero primary and compares below variableTop, or the
+ // previous element was shifted and this one has no primary weight.
+ if ((toShift && variableTop > ce && primary != 0)
+ || (isShifted && primary == 0)) {
+
+ // Only the second situation can reach here with a zero primary.
+ if (primary == 0) {
+ return UCOL_IGNORABLE;
+ }
+
+ if (strength >= UCOL_QUATERNARY) {
+ quaternary = primary;
+ }
+
+ primary = secondary = tertiary = 0;
+ isShifted = TRUE;
+ } else {
+ // Not shifted: at quaternary strength the quaternary field is 0xFFFF.
+ if (strength >= UCOL_QUATERNARY) {
+ quaternary = 0xFFFF;
+ }
+
+ isShifted = FALSE;
+ }
+
+ return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
+}
+
+U_NAMESPACE_END
+
+/* public methods ---------------------------------------------------- */
+/**
+ * Opens a collation element iterator over `text` for the collator `coll`.
+ *
+ * @param coll       the collator; must not be NULL and must be a RuleBasedCollator
+ * @param text       the string to iterate over; may be NULL only if textLength is 0
+ * @param textLength length of text; a negative value is passed on to
+ *                   UnicodeString as "terminated" text
+ * @param status     in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR for bad
+ *                   arguments, U_UNSUPPORTED_ERROR if coll is not rule-based,
+ *                   U_MEMORY_ALLOCATION_ERROR if the iterator cannot be created.
+ * @return the new iterator handle, or NULL on failure
+ */
U_CAPI UCollationElements* U_EXPORT2
ucol_openElements(const UCollator *coll,
const UChar *text,
int32_t textLength,
UErrorCode *status)
{
- UCollationElements *result;
-
- if (U_FAILURE(*status)) {
- return NULL;
- }
-
- result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements));
- /* test for NULL */
- if (result == NULL) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- return NULL;
- }
-
- result->reset_ = TRUE;
- result->isWritable = FALSE;
+ if (U_FAILURE(*status)) {
+ return NULL;
+ }
+ // A NULL text pointer is acceptable only for an empty string.
+ if (coll == NULL || (text == NULL && textLength != 0)) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return NULL;
+ }
+ const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
+ if (rbc == NULL) {
+ *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator
+ return NULL;
+ }
- if (text == NULL) {
- textLength = 0;
- }
- uprv_init_collIterate(coll, text, textLength, &result->iteratordata_);
+ // Wrap the caller's buffer in a UnicodeString for the C++ iterator factory.
+ UnicodeString s((UBool)(textLength < 0), text, textLength);
+ CollationElementIterator *cei = rbc->createCollationElementIterator(s);
+ if (cei == NULL) {
+ *status = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
+ }
- return result;
+ return cei->toUCollationElements();
}
+
U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements *elems)
{
- collIterate *ci = &elems->iteratordata_;
- if (ci->writableBuffer != ci->stackWritableBuffer) {
- uprv_free(ci->writableBuffer);
- }
- if (elems->isWritable && elems->iteratordata_.string != NULL)
- {
- uprv_free(elems->iteratordata_.string);
- }
- uprv_free(elems);
+    // The handle is a C++ CollationElementIterator in disguise; destroy it as such.
+    CollationElementIterator *iter = CollationElementIterator::fromUCollationElements(elems);
+    delete iter;
}
U_CAPI void U_EXPORT2
ucol_reset(UCollationElements *elems)
{
- collIterate *ci = &(elems->iteratordata_);
- elems->reset_ = TRUE;
- ci->pos = ci->string;
- if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) {
- ci->endp = ci->string + u_strlen(ci->string);
- }
- ci->CEpos = ci->toReturn = ci->CEs;
- ci->flags = UCOL_ITER_HASLEN;
- if (ci->coll->normalizationMode == UCOL_ON) {
- ci->flags |= UCOL_ITER_NORM;
- }
-
- if (ci->stackWritableBuffer != ci->writableBuffer) {
- uprv_free(ci->writableBuffer);
- ci->writableBuffer = ci->stackWritableBuffer;
- ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
- }
- ci->fcdPosition = NULL;
+    // Forward to the C++ iterator behind the handle.
+    CollationElementIterator *iter = CollationElementIterator::fromUCollationElements(elems);
+    iter->reset();
}
U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements *elems,
UErrorCode *status)
{
- uint32_t result;
- if (U_FAILURE(*status)) {
- return UCOL_NULLORDER;
- }
+    // An incoming failure short-circuits the call.
+    if (U_FAILURE(*status)) {
+        return UCOL_NULLORDER;
+    }
- elems->reset_ = FALSE;
+    CollationElementIterator *iter = CollationElementIterator::fromUCollationElements(elems);
+    return iter->next(*status);
+}
- result = ucol_getNextCE(elems->iteratordata_.coll, &elems->iteratordata_,
- status);
-
- if (result == UCOL_NO_MORE_CES) {
- result = UCOL_NULLORDER;
- }
- return result;
+// Temporarily restored internal function (it had been removed) because
+// Spotlight still uses it.
+// A fresh UCollationPCE is built on every call, so no processed-CE state is
+// carried over from one call to the next.
+U_CAPI int64_t U_EXPORT2
+ucol_nextProcessed(UCollationElements *elems,
+                   int32_t            *ixLow,
+                   int32_t            *ixHigh,
+                   UErrorCode         *status)
+{
+    UCollationPCE pce(elems);
+    return pce.nextProcessed(ixLow, ixHigh, status);
}
+
+U_NAMESPACE_BEGIN
+
+/**
+ * Returns the next processed collation element that is not ignorable.
+ *
+ * Clears pceBuffer, then keeps advancing the underlying iterator until
+ * processCE() yields something other than UCOL_IGNORABLE or the iterator
+ * reports UCOL_NULLORDER.
+ *
+ * @param ixLow  if not NULL, receives the iterator offset taken just before
+ *               the last element was read
+ * @param ixHigh if not NULL, receives the iterator offset taken just after
+ *               the last element was read
+ * @param status in/out error code; if it already indicates failure the
+ *               function returns at once and leaves ixLow/ixHigh untouched
+ * @return the processed element, or UCOL_PROCESSED_NULLORDER at the end of
+ *         the iteration or on an incoming failure
+ */
+int64_t
+UCollationPCE::nextProcessed(
+                   int32_t            *ixLow,
+                   int32_t            *ixHigh,
+                   UErrorCode         *status)
+{
+    int64_t result = UCOL_IGNORABLE;
+    // Signed, like the out-parameters they are copied to and like the
+    // corresponding locals in previousProcessed().
+    int32_t low = 0, high = 0;
+
+    if (U_FAILURE(*status)) {
+        return UCOL_PROCESSED_NULLORDER;
+    }
+
+    pceBuffer.reset();
+
+    do {
+        low = cei->getOffset();
+        int32_t ce = cei->next(*status);
+        high = cei->getOffset();
+
+        if (ce == UCOL_NULLORDER) {
+            result = UCOL_PROCESSED_NULLORDER;
+            break;
+        }
+
+        result = processCE((uint32_t)ce);
+    } while (result == UCOL_IGNORABLE);
+
+    if (ixLow != NULL) {
+        *ixLow = low;
+    }
+
+    if (ixHigh != NULL) {
+        *ixHigh = high;
+    }
+
+    return result;
+}
+
+U_NAMESPACE_END
+
U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements *elems,
UErrorCode *status)
{
- if(U_FAILURE(*status)) {
- return UCOL_NULLORDER;
- }
- else
- {
- uint32_t result;
-
- if (elems->reset_ &&
- (elems->iteratordata_.pos == elems->iteratordata_.string)) {
- if (elems->iteratordata_.endp == NULL) {
- elems->iteratordata_.endp = elems->iteratordata_.string +
- u_strlen(elems->iteratordata_.string);
- elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
+    // An incoming failure short-circuits the call.
+    if (U_FAILURE(*status)) {
+        return UCOL_NULLORDER;
+    }
+
+    CollationElementIterator *iter = CollationElementIterator::fromUCollationElements(elems);
+    return iter->previous(*status);
+}
+
+// Temporarily restored internal function (it had been removed) because
+// Spotlight still uses it.
+// A fresh UCollationPCE is built on every call, so no processed-CE state is
+// carried over from one call to the next.
+U_CAPI int64_t U_EXPORT2
+ucol_previousProcessed(UCollationElements *elems,
+                   int32_t            *ixLow,
+                   int32_t            *ixHigh,
+                   UErrorCode         *status)
+{
+    UCollationPCE pce(elems);
+    return pce.previousProcessed(ixLow, ixHigh, status);
+}
+
+U_NAMESPACE_BEGIN
+
+// Returns the previous processed collation element that is not ignorable.
+//
+// While pceBuffer is empty, a batch of raw elements is read backwards from
+// the iterator, processed, and the non-ignorable results are pushed onto
+// pceBuffer. The newest entry of pceBuffer is then popped and returned.
+//
+// ixLow / ixHigh (each may be NULL) receive the offsets stored with the
+// returned element, or -1 when there is nothing left to return.
+// Returns UCOL_PROCESSED_NULLORDER when status indicates failure (on entry
+// or during the call) or when no element is left.
+int64_t
+UCollationPCE::previousProcessed(
+ int32_t *ixLow,
+ int32_t *ixHigh,
+ UErrorCode *status)
+{
+ int64_t result = UCOL_IGNORABLE;
+ int32_t low = 0, high = 0;
+
+ if (U_FAILURE(*status)) {
+ return UCOL_PROCESSED_NULLORDER;
+ }
+
+ // Unlike nextProcessed(), the buffer is deliberately not reset here:
+ // entries left in it are handed out before anything new is read.
+ // pceBuffer.reset();
+
+ while (pceBuffer.isEmpty()) {
+ // buffer raw CEs up to non-ignorable primary
+ RCEBuffer rceb;
+ int32_t ce;
+
+ // **** do we need to reset rceb, or will it always be empty at this point ****
+ // (rceb is declared inside this loop body, so each pass starts with a new one.)
+ do {
+ high = cei->getOffset();
+ ce = cei->previous(*status);
+ low = cei->getOffset();
+
+ if (ce == UCOL_NULLORDER) {
+ // Start of iteration reached: process what was collected, if anything.
+ if (!rceb.isEmpty()) {
+ break;
+ }
+
+ goto finish;
+ }
+
+ rceb.put((uint32_t)ce, low, high, *status);
+ } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce)));
+
+ // process the raw CEs
+ // rceb.get() pops the newest entry first, so the elements read backwards
+ // above are processed here in the opposite order.
+ while (U_SUCCESS(*status) && !rceb.isEmpty()) {
+ const RCEI *rcei = rceb.get();
+
+ result = processCE(rcei->ce);
+
+ if (result != UCOL_IGNORABLE) {
+ pceBuffer.put(result, rcei->low, rcei->high, *status);
+ }
+ }
+ if (U_FAILURE(*status)) {
+ return UCOL_PROCESSED_NULLORDER;
}
- elems->iteratordata_.pos = elems->iteratordata_.endp;
- elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
}
- elems->reset_ = FALSE;
+finish:
+ if (pceBuffer.isEmpty()) {
+ // **** Is -1 the right value for ixLow, ixHigh? ****
+ if (ixLow != NULL) {
+ *ixLow = -1;
+ }
+
+ if (ixHigh != NULL) {
+ *ixHigh = -1
+ ;
+ }
+ return UCOL_PROCESSED_NULLORDER;
+ }
- result = ucol_getPrevCE(elems->iteratordata_.coll, &(elems->iteratordata_),
- status);
+ // Hand out the newest buffered element together with its offsets.
+ const PCEI *pcei = pceBuffer.get();
- if (result == UCOL_NO_MORE_CES) {
- result = UCOL_NULLORDER;
+ if (ixLow != NULL) {
+ *ixLow = pcei->low;
}
- return result;
- }
+ if (ixHigh != NULL) {
+ *ixHigh = pcei->high;
+ }
+
+ return pcei->ce;
}
+U_NAMESPACE_END
+
+// Forwards to getMaxExpansion(order) on the C++ iterator behind elems and
+// returns its result unchanged.
U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements *elems,
int32_t order)
{
- uint8_t result;
- UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result);
- return result;
+ return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
+
+ // TODO: The old code masked the order according to strength and then did a binary search.
+ // However, this was probably at least partially broken because of the FIXME below.
+ // Still, it might have found a match in cases where this version does not.
+
+ // FIXME: with a masked search, there might be more than one hit,
+ // so we need to look forward and backward from the match to find all
+ // of the hits...
}
-
+
U_CAPI void U_EXPORT2
ucol_setText( UCollationElements *elems,
const UChar *text,
int32_t textLength,
UErrorCode *status)
{
- if (U_FAILURE(*status)) {
- return;
- }
-
- if (elems->isWritable && elems->iteratordata_.string != NULL)
- {
- uprv_free(elems->iteratordata_.string);
- }
-
- if (text == NULL) {
- textLength = 0;
- }
-
- elems->isWritable = FALSE;
- uprv_init_collIterate(elems->iteratordata_.coll, text, textLength,
- &elems->iteratordata_);
+    if (U_FAILURE(*status)) {
+        return;
+    }
- elems->reset_ = TRUE;
+    // A NULL text pointer is acceptable only for an empty string.
+    if (text == NULL && textLength != 0) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    // A negative length is passed on to UnicodeString as "terminated" text.
+    const UBool isTerminated = (UBool)(textLength < 0);
+    UnicodeString wrappedText(isTerminated, text, textLength);
+
+    CollationElementIterator *iter = CollationElementIterator::fromUCollationElements(elems);
+    iter->setText(wrappedText, *status);
}
+// Forwards to getOffset() on the C++ iterator behind elems and returns its
+// result unchanged.
U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements *elems)
{
- const collIterate *ci = &(elems->iteratordata_);
- // while processing characters in normalization buffer getOffset will
- // return the next non-normalized character.
- // should be inline with the old implementation since the old codes uses
- // nextDecomp in normalizer which also decomposes the string till the
- // first base character is found.
- if (ci->flags & UCOL_ITER_INNORMBUF) {
- if (ci->fcdPosition == NULL) {
- return 0;
- }
- return (int32_t)(ci->fcdPosition - ci->string);
- }
- else {
- return (int32_t)(ci->pos - ci->string);
- }
+ return CollationElementIterator::fromUCollationElements(elems)->getOffset();
}
+/**
+ * Moves the iterator behind elems to the given offset.
+ *
+ * @param elems  the collation element iterator handle
+ * @param offset the new offset, passed unchanged to the C++ iterator's setOffset()
+ * @param status in/out error code; nothing is done if it already indicates
+ *               failure, and setOffset() may set it
+ */
U_CAPI void U_EXPORT2
ucol_setOffset(UCollationElements *elems,
int32_t offset,
UErrorCode *status)
{
- if (U_FAILURE(*status)) {
- return;
- }
-
- // this methods will clean up any use of the writable buffer and points to
- // the original string
- collIterate *ci = &(elems->iteratordata_);
- ci->pos = ci->string + offset;
- ci->CEpos = ci->toReturn = ci->CEs;
- if (ci->flags & UCOL_ITER_INNORMBUF) {
- ci->flags = ci->origFlags;
- }
- if ((ci->flags & UCOL_ITER_HASLEN) == 0) {
- ci->endp = ci->string + u_strlen(ci->string);
- ci->flags |= UCOL_ITER_HASLEN;
- }
- ci->fcdPosition = NULL;
- elems->reset_ = FALSE;
+ if (U_FAILURE(*status)) {
+ return;
+ }
+
+ CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
}
U_CAPI int32_t U_EXPORT2
ucol_primaryOrder (int32_t order)
{
- order &= UCOL_PRIMARYMASK;
- return (order >> UCOL_PRIMARYORDERSHIFT);
+    // The primary weight is the upper 16 bits of the order.
+    const int32_t upperHalf = order >> 16;
+    return upperHalf & 0xffff;
}
U_CAPI int32_t U_EXPORT2
ucol_secondaryOrder (int32_t order)
{
- order &= UCOL_SECONDARYMASK;
- return (order >> UCOL_SECONDARYORDERSHIFT);
+    // The secondary weight is bits 8..15 of the order.
+    const int32_t withoutLowByte = order >> 8;
+    return withoutLowByte & 0xff;
}
U_CAPI int32_t U_EXPORT2
ucol_tertiaryOrder (int32_t order)
{
- return (order & UCOL_TERTIARYMASK);
+    // The tertiary weight is the lowest 8 bits of the order.
+    const int32_t lowByteMask = 0xff;
+    return order & lowByteMask;
}
#endif /* #if !UCONFIG_NO_COLLATION */