ICU-59180.0.1.tar.gz

[apple/icu.git] / icuSources / common / normlzr.cpp
diff --git a/icuSources/common/normlzr.cpp b/icuSources/common/normlzr.cpp

index 909bd30b27cb688f950fea4e592ab8fef93b1b42..607660c45f1139b65fbc87405affdfb72410900f 100644 (file)
--- a/icuSources/common/normlzr.cpp
+++ b/icuSources/common/normlzr.cpp
@@ -1,7 +1,9 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
   *************************************************************************
   * COPYRIGHT: 
- * Copyright (c) 1996-2005, International Business Machines Corporation and
+ * Copyright (c) 1996-2012, International Business Machines Corporation and
   * others. All Rights Reserved.
   *************************************************************************
   */
@@ -10,14 +12,22 @@
  
  #if !UCONFIG_NO_NORMALIZATION
  
+#include "unicode/uniset.h"
  #include "unicode/unistr.h"
  #include "unicode/chariter.h"
  #include "unicode/schriter.h"
  #include "unicode/uchriter.h"
-#include "unicode/uiter.h"
  #include "unicode/normlzr.h"
+#include "unicode/utf16.h"
  #include "cmemory.h"
-#include "unormimp.h"
+#include "normalizer2impl.h"
+#include "uprops.h"  // for uniset_getUnicode32Instance()
+
+#if defined(_ARM64_) && defined(move32)
+ // System can define move32 intrinsics, but the char iters define move32 method
+ // using same undef trick in headers, so undef here to re-enable the method.
+#undef move32
+#endif
  
  U_NAMESPACE_BEGIN
  
@@ -28,72 +38,66 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
  //-------------------------------------------------------------------------
  
  Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
-    UObject(), fUMode(mode), fOptions(0),
+    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
+    text(new StringCharacterIterator(str)),
      currentIndex(0), nextIndex(0),
      buffer(), bufferPos(0)
  {
-    init(new StringCharacterIterator(str));
+    init();
  }
  
-Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
-    UObject(), fUMode(mode), fOptions(0),
+Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
+    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
+    text(new UCharCharacterIterator(str, length)),
      currentIndex(0), nextIndex(0),
      buffer(), bufferPos(0)
  {
-    init(new UCharCharacterIterator(str, length));
+    init();
  }
  
  Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
-    UObject(), fUMode(mode), fOptions(0),
+    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
+    text(iter.clone()),
      currentIndex(0), nextIndex(0),
      buffer(), bufferPos(0)
  {
-    init(iter.clone());
+    init();
  }
  
  Normalizer::Normalizer(const Normalizer &copy) :
-    UObject(copy), fUMode(copy.fUMode), fOptions(copy.fOptions),
+    UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
+    text(copy.text->clone()),
      currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
      buffer(copy.buffer), bufferPos(copy.bufferPos)
  {
-    init(((CharacterIterator *)(copy.text->context))->clone());
+    init();
  }
  
-static const UChar _NUL=0;
-
  void
-Normalizer::init(CharacterIterator *iter) {
+Normalizer::init() {
      UErrorCode errorCode=U_ZERO_ERROR;
-
-    text=(UCharIterator *)uprv_malloc(sizeof(UCharIterator));
-    if(text!=NULL) {
-        if(unorm_haveData(&errorCode)) {
-            uiter_setCharacterIterator(text, iter);
-        } else {
-            delete iter;
-            uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0));
-        }
-    } else {
-        delete iter;
+    fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
+    if(fOptions&UNORM_UNICODE_3_2) {
+        delete fFilteredNorm2;
+        fNorm2=fFilteredNorm2=
+            new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
+    }
+    if(U_FAILURE(errorCode)) {
+        errorCode=U_ZERO_ERROR;
+        fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
      }
  }
  
  Normalizer::~Normalizer()
  {
-    if(text!=NULL) {
-        delete (CharacterIterator *)text->context;
-        uprv_free(text);
-    }
+    delete fFilteredNorm2;
+    delete text;
  }
  
  Normalizer* 
  Normalizer::clone() const
  {
-    if(this!=0) {
-        return new Normalizer(*this);
-    } else {
-        return 0;
-    }
+    return new Normalizer(*this);
  }
  
  /**
@@ -101,19 +105,19 @@ Normalizer::clone() const
   */
  int32_t Normalizer::hashCode() const
  {
-    return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
+    return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
  }
      
  UBool Normalizer::operator==(const Normalizer& that) const
  {
      return
          this==&that ||
-        fUMode==that.fUMode &&
+        (fUMode==that.fUMode &&
          fOptions==that.fOptions &&
-        *((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) &&
+        *text==*that.text &&
          buffer==that.buffer &&
          bufferPos==that.bufferPos &&
-        nextIndex==that.nextIndex;
+        nextIndex==that.nextIndex);
  }
  
  //-------------------------------------------------------------------------
@@ -140,29 +144,18 @@ Normalizer::normalize(const UnicodeString& source,
              // the source and result strings are the same object, use a temporary one
              dest=&localDest;
          }
-
-        UChar *buffer=dest->getBuffer(source.length());
-        int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(),
-                                               source.getBuffer(), source.length(),
-                                               mode, options,
-                                               &status);
-        dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
-        if(status==U_BUFFER_OVERFLOW_ERROR) {
-            status=U_ZERO_ERROR;
-            buffer=dest->getBuffer(length);
-            length=unorm_internalNormalize(buffer, dest->getCapacity(),
-                                           source.getBuffer(), source.length(),
-                                           mode, options,
-                                           &status);
-            dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
+        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+        if(U_SUCCESS(status)) {
+            if(options&UNORM_UNICODE_3_2) {
+                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
+                    normalize(source, *dest, status);
+            } else {
+                n2->normalize(source, *dest, status);
+            }
          }
-
-        if(dest==&localDest) {
+        if(dest==&localDest && U_SUCCESS(status)) {
              result=*dest;
          }
-        if(U_FAILURE(status)) {
-            result.setToBogus();
-        }
      }
  }
  
@@ -171,45 +164,7 @@ Normalizer::compose(const UnicodeString& source,
                      UBool compat, int32_t options,
                      UnicodeString& result, 
                      UErrorCode &status) {
-    if(source.isBogus() || U_FAILURE(status)) {
-        result.setToBogus();
-        if(U_SUCCESS(status)) {
-            status=U_ILLEGAL_ARGUMENT_ERROR;
-        }
-    } else {
-        UnicodeString localDest;
-        UnicodeString *dest;
-
-        if(&source!=&result) {
-            dest=&result;
-        } else {
-            // the source and result strings are the same object, use a temporary one
-            dest=&localDest;
-        }
-
-        UChar *buffer=dest->getBuffer(source.length());
-        int32_t length=unorm_compose(buffer, dest->getCapacity(),
-                                     source.getBuffer(), source.length(),
-                                     compat, options,
-                                     &status);
-        dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
-        if(status==U_BUFFER_OVERFLOW_ERROR) {
-            status=U_ZERO_ERROR;
-            buffer=dest->getBuffer(length);
-            length=unorm_compose(buffer, dest->getCapacity(),
-                                 source.getBuffer(), source.length(),
-                                 compat, options,
-                                 &status);
-            dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
-        }
-
-        if(dest==&localDest) {
-            result=*dest;
-        }
-        if(U_FAILURE(status)) {
-            result.setToBogus();
-        }
-    }
+    normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
  }
  
  void U_EXPORT2
@@ -217,49 +172,45 @@ Normalizer::decompose(const UnicodeString& source,
                        UBool compat, int32_t options,
                        UnicodeString& result, 
                        UErrorCode &status) {
-    if(source.isBogus() || U_FAILURE(status)) {
-        result.setToBogus();
-        if(U_SUCCESS(status)) {
-            status=U_ILLEGAL_ARGUMENT_ERROR;
+    normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
+}
+
+UNormalizationCheckResult
+Normalizer::quickCheck(const UnicodeString& source,
+                       UNormalizationMode mode, int32_t options,
+                       UErrorCode &status) {
+    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+    if(U_SUCCESS(status)) {
+        if(options&UNORM_UNICODE_3_2) {
+            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
+                quickCheck(source, status);
+        } else {
+            return n2->quickCheck(source, status);
          }
      } else {
-        UnicodeString localDest;
-        UnicodeString *dest;
+        return UNORM_MAYBE;
+    }
+}
  
-        if(&source!=&result) {
-            dest=&result;
+UBool
+Normalizer::isNormalized(const UnicodeString& source,
+                         UNormalizationMode mode, int32_t options,
+                         UErrorCode &status) {
+    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
+    if(U_SUCCESS(status)) {
+        if(options&UNORM_UNICODE_3_2) {
+            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
+                isNormalized(source, status);
          } else {
-            // the source and result strings are the same object, use a temporary one
-            dest=&localDest;
-        }
-
-        UChar *buffer=dest->getBuffer(source.length());
-        int32_t length=unorm_decompose(buffer, dest->getCapacity(),
-                                     source.getBuffer(), source.length(),
-                                     compat, options,
-                                     &status);
-        dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
-        if(status==U_BUFFER_OVERFLOW_ERROR) {
-            status=U_ZERO_ERROR;
-            buffer=dest->getBuffer(length);
-            length=unorm_decompose(buffer, dest->getCapacity(),
-                                   source.getBuffer(), source.length(),
-                                   compat, options,
-                                   &status);
-            dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
-        }
-
-        if(dest==&localDest) {
-            result=*dest;
-        }
-        if(U_FAILURE(status)) {
-            result.setToBogus();
+            return n2->isNormalized(source, status);
          }
+    } else {
+        return FALSE;
      }
  }
  
  UnicodeString & U_EXPORT2
-Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
+Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
                          UnicodeString &result,
                          UNormalizationMode mode, int32_t options,
                          UErrorCode &errorCode) {
@@ -272,37 +223,25 @@ Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
          UnicodeString localDest;
          UnicodeString *dest;
  
-        if(&left!=&result && &right!=&result) {
+        if(&right!=&result) {
              dest=&result;
          } else {
-            // the source and result strings are the same object, use a temporary one
+            // the right and result strings are the same object, use a temporary one
              dest=&localDest;
          }
-
-        UChar *buffer=dest->getBuffer(left.length()+right.length());
-        int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
-                                         right.getBuffer(), right.length(),
-                                         buffer, dest->getCapacity(),
-                                         mode, options,
-                                         &errorCode);
-        dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
-        if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
-            errorCode=U_ZERO_ERROR;
-            buffer=dest->getBuffer(length);
-            int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
-                                             right.getBuffer(), right.length(),
-                                             buffer, dest->getCapacity(),
-                                             mode, options,
-                                             &errorCode);
-            dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
+        *dest=left;
+        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
+        if(U_SUCCESS(errorCode)) {
+            if(options&UNORM_UNICODE_3_2) {
+                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
+                    append(*dest, right, errorCode);
+            } else {
+                n2->append(*dest, right, errorCode);
+            }
          }
-
-        if(dest==&localDest) {
+        if(dest==&localDest && U_SUCCESS(errorCode)) {
              result=*dest;
          }
-        if(U_FAILURE(errorCode)) {
-            result.setToBogus();
-        }
      }
      return result;
  }
@@ -330,7 +269,7 @@ UChar32 Normalizer::current() {
  UChar32 Normalizer::next() {
      if(bufferPos<buffer.length() ||  nextNormalize()) {
          UChar32 c=buffer.char32At(bufferPos);
-        bufferPos+=UTF_CHAR_LENGTH(c);
+        bufferPos+=U16_LENGTH(c);
          return c;
      } else {
          return DONE;
@@ -345,7 +284,7 @@ UChar32 Normalizer::next() {
  UChar32 Normalizer::previous() {
      if(bufferPos>0 || previousNormalize()) {
          UChar32 c=buffer.char32At(bufferPos-1);
-        bufferPos-=UTF_CHAR_LENGTH(c);
+        bufferPos-=U16_LENGTH(c);
          return c;
      } else {
          return DONE;
@@ -353,19 +292,20 @@ UChar32 Normalizer::previous() {
  }
  
  void Normalizer::reset() {
-    currentIndex=nextIndex=text->move(text, 0, UITER_START);
+    currentIndex=nextIndex=text->setToStart();
      clearBuffer();
  }
  
  void
  Normalizer::setIndexOnly(int32_t index) {
-    currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index
+    text->setIndex(index);  // pins index
+    currentIndex=nextIndex=text->getIndex();
      clearBuffer();
  }
  
  /**
- * Return the first character in the normalized text->  This resets
- * the <tt>Normalizer's</tt> position to the beginning of the text->
+ * Return the first character in the normalized text.  This resets
+ * the <tt>Normalizer's</tt> position to the beginning of the text.
   */
  UChar32 Normalizer::first() {
      reset();
@@ -373,12 +313,12 @@ UChar32 Normalizer::first() {
  }
  
  /**
- * Return the last character in the normalized text->  This resets
+ * Return the last character in the normalized text.  This resets
   * the <tt>Normalizer's</tt> position to be just before the
   * input text corresponding to that normalized character.
   */
  UChar32 Normalizer::last() {
-    currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT);
+    currentIndex=nextIndex=text->setToEnd();
      clearBuffer();
      return previous();
  }
@@ -406,21 +346,21 @@ int32_t Normalizer::getIndex() const {
  }
  
  /**
- * Retrieve the index of the start of the input text->  This is the begin index
+ * Retrieve the index of the start of the input text.  This is the begin index
   * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
   * over which this <tt>Normalizer</tt> is iterating.
   */
  int32_t Normalizer::startIndex() const {
-    return text->getIndex(text, UITER_START);
+    return text->startIndex();
  }
  
  /**
- * Retrieve the index of the end of the input text->  This is the end index
+ * Retrieve the index of the end of the input text.  This is the end index
   * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
   * over which this <tt>Normalizer</tt> is iterating.
   */
  int32_t Normalizer::endIndex() const {
-    return text->getIndex(text, UITER_LIMIT);
+    return text->endIndex();
  }
  
  //-------------------------------------------------------------------------
@@ -431,6 +371,7 @@ void
  Normalizer::setMode(UNormalizationMode newMode) 
  {
      fUMode = newMode;
+    init();
  }
  
  UNormalizationMode
@@ -448,6 +389,7 @@ Normalizer::setOption(int32_t option,
      } else {
          fOptions &= (~option);
      }
+    init();
  }
  
  UBool
@@ -458,7 +400,7 @@ Normalizer::getOption(int32_t option) const
  
  /**
   * Set the input text over which this <tt>Normalizer</tt> will iterate.
- * The iteration position is set to the beginning of the input text->
+ * The iteration position is set to the beginning of the input text.
   */
  void
  Normalizer::setText(const UnicodeString& newText, 
@@ -472,8 +414,8 @@ Normalizer::setText(const UnicodeString& newText,
          status = U_MEMORY_ALLOCATION_ERROR;
          return;
      }
-    delete (CharacterIterator *)(text->context);
-    text->context = newIter;
+    delete text;
+    text = newIter;
      reset();
  }
  
@@ -493,13 +435,13 @@ Normalizer::setText(const CharacterIterator& newText,
          status = U_MEMORY_ALLOCATION_ERROR;
          return;
      }
-    delete (CharacterIterator *)(text->context);
-    text->context = newIter;
+    delete text;
+    text = newIter;
      reset();
  }
  
  void
-Normalizer::setText(const UChar* newText,
+Normalizer::setText(ConstChar16Ptr newText,
                      int32_t length,
                      UErrorCode &status)
  {
@@ -511,8 +453,8 @@ Normalizer::setText(const UChar* newText,
          status = U_MEMORY_ALLOCATION_ERROR;
          return;
      }
-    delete (CharacterIterator *)(text->context);
-    text->context = newIter;
+    delete text;
+    text = newIter;
      reset();
  }
  
@@ -523,7 +465,7 @@ Normalizer::setText(const UChar* newText,
  void
  Normalizer::getText(UnicodeString&  result) 
  {
-    ((CharacterIterator *)(text->context))->getText(result);
+    text->getText(result);
  }
  
  //-------------------------------------------------------------------------
@@ -537,72 +479,48 @@ void Normalizer::clearBuffer() {
  
  UBool
  Normalizer::nextNormalize() {
-    UChar *p;
-    int32_t length;
-    UErrorCode errorCode;
-
      clearBuffer();
      currentIndex=nextIndex;
-    text->move(text, nextIndex, UITER_ZERO);
-    if(!text->hasNext(text)) {
+    text->setIndex(nextIndex);
+    if(!text->hasNext()) {
          return FALSE;
      }
-
-    errorCode=U_ZERO_ERROR;
-    p=buffer.getBuffer(-1);
-    length=unorm_next(text, p, buffer.getCapacity(),
-                      fUMode, fOptions,
-                      TRUE, 0,
-                      &errorCode);
-    buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
-    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
-        errorCode=U_ZERO_ERROR;
-        text->move(text, nextIndex, UITER_ZERO);
-        p=buffer.getBuffer(length);
-        length=unorm_next(text, p, buffer.getCapacity(),
-                          fUMode, fOptions,
-                          TRUE, 0,
-                          &errorCode);
-        buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
+    // Skip at least one character so we make progress.
+    UnicodeString segment(text->next32PostInc());
+    while(text->hasNext()) {
+        UChar32 c;
+        if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
+            text->move32(-1, CharacterIterator::kCurrent);
+            break;
+        }
+        segment.append(c);
      }
-
-    nextIndex=text->getIndex(text, UITER_CURRENT);
+    nextIndex=text->getIndex();
+    UErrorCode errorCode=U_ZERO_ERROR;
+    fNorm2->normalize(segment, buffer, errorCode);
      return U_SUCCESS(errorCode) && !buffer.isEmpty();
  }
  
  UBool
  Normalizer::previousNormalize() {
-    UChar *p;
-    int32_t length;
-    UErrorCode errorCode;
-
      clearBuffer();
      nextIndex=currentIndex;
-    text->move(text, currentIndex, UITER_ZERO);
-    if(!text->hasPrevious(text)) {
+    text->setIndex(currentIndex);
+    if(!text->hasPrevious()) {
          return FALSE;
      }
-
-    errorCode=U_ZERO_ERROR;
-    p=buffer.getBuffer(-1);
-    length=unorm_previous(text, p, buffer.getCapacity(),
-                          fUMode, fOptions,
-                          TRUE, 0,
-                          &errorCode);
-    buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
-    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
-        errorCode=U_ZERO_ERROR;
-        text->move(text, currentIndex, UITER_ZERO);
-        p=buffer.getBuffer(length);
-        length=unorm_previous(text, p, buffer.getCapacity(),
-                              fUMode, fOptions,
-                              TRUE, 0,
-                              &errorCode);
-        buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
+    UnicodeString segment;
+    while(text->hasPrevious()) {
+        UChar32 c=text->previous32();
+        segment.insert(0, c);
+        if(fNorm2->hasBoundaryBefore(c)) {
+            break;
+        }
      }
-
+    currentIndex=text->getIndex();
+    UErrorCode errorCode=U_ZERO_ERROR;
+    fNorm2->normalize(segment, buffer, errorCode);
      bufferPos=buffer.length();
-    currentIndex=text->getIndex(text, UITER_CURRENT);
      return U_SUCCESS(errorCode) && !buffer.isEmpty();
  }