ICU-62107.0.1.tar.gz

[apple/icu.git] / icuSources / i18n / ucoleitr.cpp
diff --git a/icuSources/i18n/ucoleitr.cpp b/icuSources/i18n/ucoleitr.cpp

index f386fb4ef71869a60616269c4c6aaa2718b35c23..e56ea1efe0d452f95fef60594dfe0c782f76fd70 100644 (file)
--- a/icuSources/i18n/ucoleitr.cpp
+++ b/icuSources/i18n/ucoleitr.cpp
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
  ******************************************************************************
-*   Copyright (C) 2001-2003, International Business Machines
+*   Copyright (C) 2001-2016, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  ******************************************************************************
  *
@@ -11,38 +13,262 @@
  * Date        Name        Description
  * 02/15/2001  synwee      Modified all methods to process its own function 
  *                         instead of calling the equivalent c++ api (coleitr.h)
+* 2012-2014   markus      Rewritten in C++ again.
  ******************************************************************************/
  
  #include "unicode/utypes.h"
  
  #if !UCONFIG_NO_COLLATION
  
+#include "unicode/coleitr.h"
+#include "unicode/tblcoll.h"
  #include "unicode/ucoleitr.h"
  #include "unicode/ustring.h"
  #include "unicode/sortkey.h"
-#include "ucol_imp.h"
+#include "unicode/uobject.h"
  #include "cmemory.h"
+#include "usrchimp.h"
  
  U_NAMESPACE_USE
  
  #define BUFFER_LENGTH             100
  
-typedef struct collIterate collIterator;
+#define DEFAULT_BUFFER_SIZE 16  // number of entries in RCEBuffer's inline defaultBuffer
+#define BUFFER_GROW 8  // number of entries added each time a full buffer is reallocated by put()
  
-/* public methods ---------------------------------------------------- */
+#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])  // copies count elements; the element size is taken from src
  
-/**
-* Since this is going to be deprecated, I'll leave it as it is
-*/
-U_CAPI int32_t U_EXPORT2
-ucol_keyHashCode(const uint8_t *key, 
-                       int32_t  length)
+#define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))  // uninitialized storage for count objects; callers in this file check the result against NULL
+
+#define DELETE_ARRAY(array) uprv_free((void *) (array))  // releases storage obtained with NEW_ARRAY
+
+struct RCEI  // One buffered raw collation element together with the two offsets recorded for it.
+{
+    uint32_t ce;    // raw 32-bit collation element as handed to RCEBuffer::put()
+    int32_t  low;   // ixLow argument of RCEBuffer::put()
+    int32_t  high;  // ixHigh argument of RCEBuffer::put()
+};
+
+U_NAMESPACE_BEGIN
+
+struct RCEBuffer  // Last-in/first-out buffer of raw CEs: put() appends, get() pops the most recently added entry.
+{
+    RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];  // inline storage, used until put() outgrows it
+    RCEI   *buffer;       // current storage: either defaultBuffer or an array obtained with NEW_ARRAY
+    int32_t bufferIndex;  // number of stored entries, i.e. index of the next free slot
+    int32_t bufferSize;   // capacity of buffer, counted in entries
+
+    RCEBuffer();
+    ~RCEBuffer();
+
+    UBool isEmpty() const;
+    void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
+    const RCEI *get();
+};
+
+RCEBuffer::RCEBuffer()  // Starts out empty and backed by the inline defaultBuffer.
+{
+    buffer = defaultBuffer;
+    bufferIndex = 0;
+    bufferSize = UPRV_LENGTHOF(defaultBuffer);  // initial capacity is that of the inline array
+}
+
+RCEBuffer::~RCEBuffer()  // Releases the storage only if put() replaced the inline array.
+{
+    if (buffer != defaultBuffer) {  // defaultBuffer is part of this object and must not be freed
+        DELETE_ARRAY(buffer);
+    }
+}
+
+UBool RCEBuffer::isEmpty() const  // True when no entry is left for get() to return.
+{
+    return bufferIndex <= 0;
+}
+
+void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)  // Appends one entry; does nothing if errorCode already signals failure.
+{
+    if (U_FAILURE(errorCode)) {
+        return;
+    }
+    if (bufferIndex >= bufferSize) {  // storage is full: move to an array that is BUFFER_GROW entries larger
+        RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
+        if (newBuffer == NULL) {
+            errorCode = U_MEMORY_ALLOCATION_ERROR;  // the existing storage and its entries are left as they were
+            return;
+        }
+
+        ARRAY_COPY(newBuffer, buffer, bufferSize);
+
+        if (buffer != defaultBuffer) {  // only a previously allocated array is freed, never the inline one
+            DELETE_ARRAY(buffer);
+        }
+
+        buffer = newBuffer;
+        bufferSize += BUFFER_GROW;
+    }
+
+    buffer[bufferIndex].ce   = ce;
+    buffer[bufferIndex].low  = ixLow;
+    buffer[bufferIndex].high = ixHigh;
+
+    bufferIndex += 1;
+}
+
+const RCEI *RCEBuffer::get()  // Pops and returns the most recently added entry, or NULL when the buffer is empty.
+{
+    if (bufferIndex > 0) {
+     return &buffer[--bufferIndex];  // points into the buffer's own storage; a later put() may overwrite or move it
+    }
+
+    return NULL;
+}
+
+PCEBuffer::PCEBuffer()  // Starts out empty and backed by the inline defaultBuffer.
+{
+    buffer = defaultBuffer;
+    bufferIndex = 0;
+    bufferSize = UPRV_LENGTHOF(defaultBuffer);  // initial capacity is that of the inline array
+}
+
+PCEBuffer::~PCEBuffer()  // Releases the storage only if put() replaced the inline array.
  {
+    if (buffer != defaultBuffer) {  // defaultBuffer is part of this object and must not be freed
+        DELETE_ARRAY(buffer);
+    }
+}
  
-    CollationKey newKey(key, length);
-    return newKey.hashCode();
+void PCEBuffer::reset()  // Discards all buffered entries; the current storage and its capacity are kept.
+{
+    bufferIndex = 0;
  }
  
+UBool PCEBuffer::isEmpty() const  // True when no entry is left for get() to return.
+{
+    return bufferIndex <= 0;
+}
+
+void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)  // Appends one processed CE; does nothing if errorCode already signals failure.
+{
+    if (U_FAILURE(errorCode)) {
+        return;
+    }
+    if (bufferIndex >= bufferSize) {  // storage is full: move to an array that is BUFFER_GROW entries larger
+        PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
+        if (newBuffer == NULL) {
+            errorCode = U_MEMORY_ALLOCATION_ERROR;  // the existing storage and its entries are left as they were
+            return;
+        }
+
+        ARRAY_COPY(newBuffer, buffer, bufferSize);
+
+        if (buffer != defaultBuffer) {  // only a previously allocated array is freed, never the inline one
+            DELETE_ARRAY(buffer);
+        }
+
+        buffer = newBuffer;
+        bufferSize += BUFFER_GROW;
+    }
+
+    buffer[bufferIndex].ce   = ce;
+    buffer[bufferIndex].low  = ixLow;
+    buffer[bufferIndex].high = ixHigh;
+
+    bufferIndex += 1;
+}
+
+const PCEI *PCEBuffer::get()  // Pops and returns the most recently added entry, or NULL when the buffer is empty.
+{
+    if (bufferIndex > 0) {
+     return &buffer[--bufferIndex];  // points into the buffer's own storage; a later put() may overwrite or move it
+    }
+
+    return NULL;
+}
+
+UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }  // construct from the C handle; all setup is delegated to init()
+
+UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }  // construct from the C++ iterator; all setup is delegated to init()
+
+void UCollationPCE::init(UCollationElements *elems) {  // Converts the C handle to its C++ iterator and initializes from that.
+    init(CollationElementIterator::fromUCollationElements(elems));
+}
+
+void UCollationPCE::init(CollationElementIterator *iter)  // Remembers the iterator and reads settings from the collator it references.
+{
+    cei = iter;  // stored, not copied; the caller keeps ownership as far as this file shows
+    init(*iter->rbc_);  // NOTE(review): iter is dereferenced without a NULL check
+}
+
+void UCollationPCE::init(const Collator &coll)  // Caches the collator settings that processCE() consults.
+{
+    UErrorCode status = U_ZERO_ERROR;  // local only: failures from the calls below are not reported to the caller
+
+    strength    = coll.getAttribute(UCOL_STRENGTH, status);
+    toShift     = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
+    isShifted   = FALSE;  // updated by processCE() as CEs are processed
+    variableTop = coll.getVariableTop(status);
+}
+
+UCollationPCE::~UCollationPCE()  // The iterator held in cei is not deleted here.
+{
+    // nothing to do
+}
+
+uint64_t UCollationPCE::processCE(uint32_t ce)  // Converts a raw 32-bit CE into a 64-bit processed CE, honoring strength and shifted handling; updates isShifted.
+{
+    uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
+
+    // This is clean, but somewhat slow...
+    // We could apply the mask to ce and then
+    // just get all three orders...
+    switch(strength) {
+    default:  // every strength other than UCOL_SECONDARY and UCOL_PRIMARY extracts all three weights
+        tertiary = ucol_tertiaryOrder(ce);
+        U_FALLTHROUGH;
+
+    case UCOL_SECONDARY:
+        secondary = ucol_secondaryOrder(ce);
+        U_FALLTHROUGH;
+
+    case UCOL_PRIMARY:
+        primary = ucol_primaryOrder(ce);
+    }
+
+    // **** This should probably handle continuations too.  ****
+    // **** That means that we need 24 bits for the primary ****
+    // **** instead of the 16 that we're currently using.   ****
+    // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
+    // **** Another complication with continuations is that ****
+    // **** the *second* CE is marked as a continuation, so ****
+    // **** we always have to peek ahead to know how long   ****
+    // **** the primary is...                               ****
+    if ((toShift && variableTop > ce && primary != 0)  // a CE below variableTop with a non-zero primary, while shifting is on
+                || (isShifted && primary == 0)) {  // or a zero-primary CE that follows a shifted one
+
+        if (primary == 0) {
+            return UCOL_IGNORABLE;  // isShifted is left unchanged on this path
+        }
+
+        if (strength >= UCOL_QUATERNARY) {
+            quaternary = primary;  // the primary weight is moved down to the quaternary field
+        }
+
+        primary = secondary = tertiary = 0;
+        isShifted = TRUE;
+    } else {
+        if (strength >= UCOL_QUATERNARY) {
+            quaternary = 0xFFFF;
+        }
+
+        isShifted = FALSE;
+    }
+
+    return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;  // fields start at bits 48, 32, 16 and 0 respectively
+}
+
+U_NAMESPACE_END
+
+/* public methods ---------------------------------------------------- */
  
  U_CAPI UCollationElements* U_EXPORT2
  ucol_openElements(const UCollator  *coll,
@@ -50,175 +276,247 @@ ucol_openElements(const UCollator  *coll,
                          int32_t    textLength,
                          UErrorCode *status)
  {
-  UCollationElements *result;
-
-  if (U_FAILURE(*status)) {
-    return NULL;
-  }
-
-  result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements));
-  /* test for NULL */
-  if (result == NULL) {
-      *status = U_MEMORY_ALLOCATION_ERROR;
-      return NULL;
-  }
-
-  result->reset_   = TRUE;
-  result->isWritable = FALSE;
+    if (U_FAILURE(*status)) {
+        return NULL;
+    }
+    if (coll == NULL || (text == NULL && textLength != 0)) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
+    if (rbc == NULL) {
+        *status = U_UNSUPPORTED_ERROR;  // coll is a Collator but not a RuleBasedCollator
+        return NULL;
+    }
  
-  if (text == NULL) {
-      textLength = 0;
-  }
-  uprv_init_collIterate(coll, text, textLength, &result->iteratordata_);
+    UnicodeString s((UBool)(textLength < 0), text, textLength);
+    CollationElementIterator *cei = rbc->createCollationElementIterator(s);
+    if (cei == NULL) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
  
-  return result;
+    return cei->toUCollationElements();
  }
  
+
  U_CAPI void U_EXPORT2
  ucol_closeElements(UCollationElements *elems)
  {
-  collIterate *ci = &elems->iteratordata_;
-  if (ci->writableBuffer != ci->stackWritableBuffer) {
-    uprv_free(ci->writableBuffer);
-  }
-  if (elems->isWritable && elems->iteratordata_.string != NULL)
-  {
-    uprv_free(elems->iteratordata_.string);
-  }
-  uprv_free(elems);
+    delete CollationElementIterator::fromUCollationElements(elems);  // the handle is converted back to the C++ iterator, which is then destroyed
  }
  
  U_CAPI void U_EXPORT2
  ucol_reset(UCollationElements *elems)
  {
-  collIterate *ci = &(elems->iteratordata_);
-  elems->reset_   = TRUE;
-  ci->pos         = ci->string;
-  if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) {
-    ci->endp      = ci->string + u_strlen(ci->string);
-  }
-  ci->CEpos       = ci->toReturn = ci->CEs;
-  ci->flags       = UCOL_ITER_HASLEN;
-  if (ci->coll->normalizationMode == UCOL_ON) {
-    ci->flags |= UCOL_ITER_NORM;
-  }
-  
-  if (ci->stackWritableBuffer != ci->writableBuffer) {
-    uprv_free(ci->writableBuffer);
-    ci->writableBuffer = ci->stackWritableBuffer;
-    ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
-  }
-  ci->fcdPosition = NULL;
+    CollationElementIterator::fromUCollationElements(elems)->reset();  // forwards to the C++ iterator; elems is not checked for NULL here
  }
  
  U_CAPI int32_t U_EXPORT2
  ucol_next(UCollationElements *elems, 
            UErrorCode         *status)
  {
-  uint32_t result;
-  if (U_FAILURE(*status)) {
-    return UCOL_NULLORDER;
-  }
+    if (U_FAILURE(*status)) {  // a failure already present on entry short-circuits the call
+        return UCOL_NULLORDER;
+    }
  
-  elems->reset_ = FALSE;
+    return CollationElementIterator::fromUCollationElements(elems)->next(*status);  // forwards to the C++ iterator
+}
  
-  result = ucol_getNextCE(elems->iteratordata_.coll, &elems->iteratordata_, 
-                          status);
-  
-  if (result == UCOL_NO_MORE_CES) {
-    result = UCOL_NULLORDER;
-  }
-  return result;
+// temporarily restore the following removed internal function which is used by Spotlight
+U_CAPI int64_t U_EXPORT2
+ucol_nextProcessed(UCollationElements *elems,  // C entry point: wraps elems in a temporary UCollationPCE for this single call
+                   int32_t            *ixLow,  // forwarded unchanged to UCollationPCE::nextProcessed()
+                   int32_t            *ixHigh,  // forwarded unchanged to UCollationPCE::nextProcessed()
+                   UErrorCode         *status)  // forwarded unchanged; not examined before the temporary is constructed
+{
+    return (UCollationPCE(elems)).nextProcessed(ixLow, ixHigh, status);  // NOTE(review): a fresh temporary per call means isShifted always starts FALSE and is not carried between calls
  }
  
+
+U_NAMESPACE_BEGIN
+
+int64_t
+UCollationPCE::nextProcessed(
+                   int32_t            *ixLow,
+                   int32_t            *ixHigh,
+                   UErrorCode         *status)
+{
+    int64_t result = UCOL_IGNORABLE;
+    uint32_t low = 0, high = 0;
+
+    if (U_FAILURE(*status)) {
+        return UCOL_PROCESSED_NULLORDER;
+    }
+
+    pceBuffer.reset();
+
+    do {
+        low = cei->getOffset();
+        int32_t ce = cei->next(*status);
+        high = cei->getOffset();
+
+        if (ce == UCOL_NULLORDER) {
+             result = UCOL_PROCESSED_NULLORDER;
+             break;
+        }
+
+        result = processCE((uint32_t)ce);
+    } while (result == UCOL_IGNORABLE);
+
+    if (ixLow != NULL) {
+        *ixLow = low;
+    }
+
+    if (ixHigh != NULL) {
+        *ixHigh = high;
+    }
+
+    return result;
+}
+
+U_NAMESPACE_END
+
  U_CAPI int32_t U_EXPORT2
  ucol_previous(UCollationElements *elems,
                UErrorCode         *status)
  {
-  if(U_FAILURE(*status)) {
-    return UCOL_NULLORDER;
-  }
-  else
-  {
-    uint32_t result;
-
-    if (elems->reset_ && 
-        (elems->iteratordata_.pos == elems->iteratordata_.string)) {
-        if (elems->iteratordata_.endp == NULL) {
-            elems->iteratordata_.endp = elems->iteratordata_.string + 
-                                        u_strlen(elems->iteratordata_.string);
-            elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
+    if(U_FAILURE(*status)) {  // a failure already present on entry short-circuits the call
+        return UCOL_NULLORDER;
+    }
+    return CollationElementIterator::fromUCollationElements(elems)->previous(*status);  // forwards to the C++ iterator
+}
+
+// temporarily restore the following removed internal function which is used by Spotlight
+U_CAPI int64_t U_EXPORT2
+ucol_previousProcessed(UCollationElements *elems,  // C entry point: wraps elems in a temporary UCollationPCE for this single call
+                   int32_t            *ixLow,  // forwarded unchanged to UCollationPCE::previousProcessed()
+                   int32_t            *ixHigh,  // forwarded unchanged to UCollationPCE::previousProcessed()
+                   UErrorCode         *status)  // forwarded unchanged; not examined before the temporary is constructed
+{
+    return (UCollationPCE(elems)).previousProcessed(ixLow, ixHigh, status);  // NOTE(review): previousProcessed() can leave further processed CEs in pceBuffer for later calls; with a per-call temporary those (and isShifted) appear to be lost -- confirm against callers
+}
+
+U_NAMESPACE_BEGIN
+
+int64_t
+UCollationPCE::previousProcessed(  // Returns the previous processed CE, or UCOL_PROCESSED_NULLORDER when none is left or on error.
+                   int32_t            *ixLow,   // out, may be NULL: offset recorded after stepping back over the CE; -1 when iteration is exhausted
+                   int32_t            *ixHigh,  // out, may be NULL: offset recorded before stepping back over the CE; -1 when iteration is exhausted
+                   UErrorCode         *status)
+{
+    int64_t result = UCOL_IGNORABLE;
+    int32_t  low = 0, high = 0;
+
+    if (U_FAILURE(*status)) {
+        return UCOL_PROCESSED_NULLORDER;  // the out-parameters are left unwritten on this path
+    }
+
+    // pceBuffer.reset();
+
+    while (pceBuffer.isEmpty()) {  // refill only when nothing is left over from an earlier refill
+        // buffer raw CEs up to non-ignorable primary
+        RCEBuffer rceb;  // constructed anew on every pass of this loop
+        int32_t ce;
+        
+        // **** do we need to reset rceb, or will it always be empty at this point ****
+        do {
+            high = cei->getOffset();
+            ce   = cei->previous(*status);
+            low  = cei->getOffset();
+
+            if (ce == UCOL_NULLORDER) {
+                if (!rceb.isEmpty()) {  // start of text reached, but some raw CEs still need processing
+                    break;
+                }
+
+                goto finish;  // nothing buffered at all: report exhaustion below
+            }
+
+            rceb.put((uint32_t)ce, low, high, *status);
+        } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce)));  // continue past zero-primary CEs and continuation CEs
+
+        // process the raw CEs
+        while (U_SUCCESS(*status) && !rceb.isEmpty()) {
+            const RCEI *rcei = rceb.get();  // pops in reverse of the order read, i.e. back in text order
+
+            result = processCE(rcei->ce);
+
+            if (result != UCOL_IGNORABLE) {
+                pceBuffer.put(result, rcei->low, rcei->high, *status);
+            }
+        }
+        if (U_FAILURE(*status)) {
+            return UCOL_PROCESSED_NULLORDER;  // the out-parameters are left unwritten on this path
          }
-        elems->iteratordata_.pos = elems->iteratordata_.endp;
-        elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
      }
  
-    elems->reset_ = FALSE;
+finish:  // reached by falling out of the refill loop or by the goto above
+    if (pceBuffer.isEmpty()) {
+        // **** Is -1 the right value for ixLow, ixHigh? ****
+       if (ixLow != NULL) {
+               *ixLow = -1;
+       }
+       
+       if (ixHigh != NULL) {
+               *ixHigh = -1
+               ;
+       }
+        return UCOL_PROCESSED_NULLORDER;
+    }
  
-    result = ucol_getPrevCE(elems->iteratordata_.coll, &(elems->iteratordata_), 
-                            status);
+    const PCEI *pcei = pceBuffer.get();  // non-NULL here because the buffer was just checked to be non-empty
  
-    if (result == UCOL_NO_MORE_CES) {
-      result = UCOL_NULLORDER;
+    if (ixLow != NULL) {
+        *ixLow = pcei->low;
      }
  
-    return result;
-  }
+    if (ixHigh != NULL) {
+        *ixHigh = pcei->high;
+    }
+
+    return pcei->ce;  // any remaining entries stay in pceBuffer for the next call on this object
  }
  
+U_NAMESPACE_END
+
  U_CAPI int32_t U_EXPORT2
  ucol_getMaxExpansion(const UCollationElements *elems,
                             int32_t            order)
  {
-  uint8_t result;
-  UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result);
-  return result;
+    return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);  // forwards to the C++ iterator; the notes below follow the return and are commentary only
+
+    // TODO: The old code masked the order according to strength and then did a binary search.
+    // However this was probably at least partially broken because of the following comment.
+    // Still, it might have found a match when this version may not.
+
+    // FIXME: with a masked search, there might be more than one hit,
+    // so we need to look forward and backward from the match to find all
+    // of the hits...
  }
- 
+
  U_CAPI void U_EXPORT2
  ucol_setText(      UCollationElements *elems,
               const UChar              *text,
                     int32_t            textLength,
                     UErrorCode         *status)
  {
-  if (U_FAILURE(*status)) {
-    return;
-  }
-
-  if (elems->isWritable && elems->iteratordata_.string != NULL)
-  {
-    uprv_free(elems->iteratordata_.string);
-  }
- 
-  if (text == NULL) {
-      textLength = 0;
-  }
-
-  elems->isWritable = FALSE;
-  uprv_init_collIterate(elems->iteratordata_.coll, text, textLength, 
-                   &elems->iteratordata_);
+    if (U_FAILURE(*status)) {  // a failure already present on entry short-circuits the call
+        return;
+    }
  
-  elems->reset_   = TRUE;
+    if ((text == NULL && textLength != 0)) {  // a NULL text is accepted only together with a length of 0
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    UnicodeString s((UBool)(textLength < 0), text, textLength);  // first argument is TRUE when a negative length is passed
+    return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);  // forwards to the C++ iterator
  }
  
  U_CAPI int32_t U_EXPORT2
  ucol_getOffset(const UCollationElements *elems)
  {
-  const collIterate *ci = &(elems->iteratordata_);
-  // while processing characters in normalization buffer getOffset will 
-  // return the next non-normalized character. 
-  // should be inline with the old implementation since the old codes uses
-  // nextDecomp in normalizer which also decomposes the string till the 
-  // first base character is found.
-  if (ci->flags & UCOL_ITER_INNORMBUF) {
-      if (ci->fcdPosition == NULL) {
-        return 0;
-      }
-      return (int32_t)(ci->fcdPosition - ci->string);
-  }
-  else {
-      return (int32_t)(ci->pos - ci->string);
-  }
+    return CollationElementIterator::fromUCollationElements(elems)->getOffset();  // forwards to the C++ iterator
  }
  
  U_CAPI void U_EXPORT2
@@ -226,44 +524,29 @@ ucol_setOffset(UCollationElements    *elems,
                 int32_t           offset,
                 UErrorCode            *status)
  {
-  if (U_FAILURE(*status)) {
-    return;
-  }
-
-  // this methods will clean up any use of the writable buffer and points to 
-  // the original string
-  collIterate *ci = &(elems->iteratordata_);
-  ci->pos         = ci->string + offset;
-  ci->CEpos       = ci->toReturn = ci->CEs;
-  if (ci->flags & UCOL_ITER_INNORMBUF) {
-    ci->flags = ci->origFlags;
-  }
-  if ((ci->flags & UCOL_ITER_HASLEN) == 0) {
-      ci->endp  = ci->string + u_strlen(ci->string);
-      ci->flags |= UCOL_ITER_HASLEN;
-  }
-  ci->fcdPosition = NULL;
-  elems->reset_ = FALSE;
+    if (U_FAILURE(*status)) {
+        return;
+    }
+
+    CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
  }
  
  U_CAPI int32_t U_EXPORT2
  ucol_primaryOrder (int32_t order) 
  {
-  order &= UCOL_PRIMARYMASK;
-  return (order >> UCOL_PRIMARYORDERSHIFT);
+    return (order >> 16) & 0xffff;  // upper 16 bits of the CE; the mask discards any sign extension from the shift
  }
  
  U_CAPI int32_t U_EXPORT2
  ucol_secondaryOrder (int32_t order) 
  {
-  order &= UCOL_SECONDARYMASK;
-  return (order >> UCOL_SECONDARYORDERSHIFT);
+    return (order >> 8) & 0xff;  // bits 8..15 of the CE
  }
  
  U_CAPI int32_t U_EXPORT2
  ucol_tertiaryOrder (int32_t order) 
  {
-  return (order & UCOL_TERTIARYMASK);
+    return order & 0xff;  // lowest 8 bits of the CE
  }
  
  #endif /* #if !UCONFIG_NO_COLLATION */