ICU-57163.0.1.tar.gz

[apple/icu.git] / icuSources / common / unormcmp.cpp
diff --git a/icuSources/common/unormcmp.cpp b/icuSources/common/unormcmp.cpp

index 7b3903d898c2db0eb34047e7903e4e6491ee8c48..1b072c4c34d8bedc2a20272486cb36bc55ecf3c6 100644 (file)
--- a/icuSources/common/unormcmp.cpp
+++ b/icuSources/common/unormcmp.cpp
@@ -1,7 +1,7 @@
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2001-2006, International Business Machines
+*   Copyright (C) 2001-2014, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
@@ -22,17 +22,16 @@
  
  #if !UCONFIG_NO_NORMALIZATION
  
-#include "unicode/ustring.h"
  #include "unicode/unorm.h"
-#include "unicode/uniset.h"
-#include "unormimp.h"
-#include "ucase.h"
+#include "unicode/ustring.h"
  #include "cmemory.h"
+#include "normalizer2impl.h"
+#include "ucase.h"
+#include "uprops.h"
+#include "ustr_imp.h"
  
  U_NAMESPACE_USE
  
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
-
  /* compare canonically equivalent ------------------------------------------- */
  
  /*
@@ -123,9 +122,6 @@ U_NAMESPACE_USE
   * Note that all of this is only a problem when case-folding _and_
   * canonical equivalence come together.
   * (Comments in unorm_compare() are more up to date than this TODO.)
- *
- * This function could be moved to a different source file, at increased cost
- * for calling the decomposition access function.
   */
  
  /* stack element for previous-level source/decomposition pointers */
@@ -134,12 +130,19 @@ struct CmpEquivLevel {
  };
  typedef struct CmpEquivLevel CmpEquivLevel;
  
+/**
+ * Internal option for unorm_cmpEquivFold() for decomposing.
+ * If not set, just do strcasecmp().
+ */
+#define _COMPARE_EQUIV 0x80000
+
  /* internal function */
  static int32_t
  unorm_cmpEquivFold(const UChar *s1, int32_t length1,
                     const UChar *s2, int32_t length2,
                     uint32_t options,
                     UErrorCode *pErrorCode) {
+    const Normalizer2Impl *nfcImpl;
      const UCaseProps *csp;
  
      /* current-level start/limit - s1/s2 as current */
@@ -152,7 +155,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
      /* stacks of previous-level start/current/limit */
      CmpEquivLevel stack1[2], stack2[2];
  
-    /* decomposition buffers for Hangul */
+    /* buffers for algorithmic decompositions */
      UChar decomp1[4], decomp2[4];
  
      /* case folding buffers, only use current-level start/limit */
@@ -173,19 +176,19 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
       */
  
      /* normalization/properties data loaded? */
-    if( ((options&_COMPARE_EQUIV)!=0 && !unorm_haveData(pErrorCode)) ||
-        U_FAILURE(*pErrorCode)
-    ) {
-        return 0;
+    if((options&_COMPARE_EQUIV)!=0) {
+        nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode);
+    } else {
+        nfcImpl=NULL;
      }
      if((options&U_COMPARE_IGNORE_CASE)!=0) {
-        csp=ucase_getSingleton(pErrorCode);
-        if(U_FAILURE(*pErrorCode)) {
-            return 0;
-        }
+        csp=ucase_getSingleton();
      } else {
          csp=NULL;
      }
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
  
      /* initialize */
      start1=s1;
@@ -228,10 +231,10 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
                  /* reached end of level buffer, pop one level */
                  do {
                      --level1;
-                    start1=stack1[level1].start;
+                    start1=stack1[level1].start;    /*Not uninitialized*/
                  } while(start1==NULL);
-                s1=stack1[level1].s;
-                limit1=stack1[level1].limit;
+                s1=stack1[level1].s;                /*Not uninitialized*/
+                limit1=stack1[level1].limit;        /*Not uninitialized*/
              }
          }
  
@@ -251,10 +254,10 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
                  /* reached end of level buffer, pop one level */
                  do {
                      --level2;
-                    start2=stack2[level2].start;
+                    start2=stack2[level2].start;    /*Not uninitialized*/
                  } while(start2==NULL);
-                s2=stack2[level2].s;
-                limit2=stack2[level2].limit;
+                s2=stack2[level2].s;                /*Not uninitialized*/
+                limit2=stack2[level2].limit;        /*Not uninitialized*/
              }
          }
  
@@ -404,7 +407,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
          }
  
          if( level1<2 && (options&_COMPARE_EQUIV) &&
-            0!=(p=unorm_getCanonicalDecomposition((UChar32)cp1, decomp1, &length))
+            0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length))
          ) {
              /* cp1 decomposes into p[length] */
              if(U_IS_SURROGATE(c1)) {
@@ -445,7 +448,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
          }
  
          if( level2<2 && (options&_COMPARE_EQUIV) &&
-            0!=(p=unorm_getCanonicalDecomposition((UChar32)cp2, decomp2, &length))
+            0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length))
          ) {
              /* cp2 decomposes into p[length] */
              if(U_IS_SURROGATE(c2)) {
@@ -529,20 +532,42 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
      }
  }
  
+static  /* Helper for unorm_compare(): normalize s with n2 only if s is not already normalized. */
+UBool _normalize(const Normalizer2 *n2, const UChar *s, int32_t length,
+                UnicodeString &normalized, UErrorCode *pErrorCode) {
+    UnicodeString str(length<0, s, length);  // wrap s; length<0 is passed as the is-terminated flag
+
+    // spanQCYes: end of the leading part of s that n2->spanQuickCheckYes() reports as already normalized
+    int32_t spanQCYes=n2->spanQuickCheckYes(str, *pErrorCode);
+    if (U_FAILURE(*pErrorCode)) {
+        return FALSE;  // error: normalized has not been touched
+    }
+    /*
+     * ICU 2.4 had a further optimization:
+     * If both strings were not in FCD, then they were both NFD'ed,
+     * and the _COMPARE_EQUIV option was turned off.
+     * It is not entirely clear that this is valid with the current
+     * definition of the canonical caseless match.
+     * Therefore, ICU 2.6 removes that optimization.
+     */
+    if(spanQCYes<str.length()) {  // a tail of s remains beyond the already-normalized prefix
+        UnicodeString unnormalized=str.tempSubString(spanQCYes);
+        normalized.setTo(FALSE, str.getBuffer(), spanQCYes);  // seed with the first spanQCYes units of s
+        n2->normalizeSecondAndAppend(normalized, unnormalized, *pErrorCode);  // normalize the tail onto it
+        if (U_SUCCESS(*pErrorCode)) {
+            return TRUE;  // caller should use normalized in place of s
+        }
+    }
+    return FALSE;  // s needed no normalization, or normalizeSecondAndAppend() failed
+}
+
  U_CAPI int32_t U_EXPORT2
  unorm_compare(const UChar *s1, int32_t length1,
                const UChar *s2, int32_t length2,
                uint32_t options,
                UErrorCode *pErrorCode) {
-    UChar fcd1[300], fcd2[300];
-    UChar *d1, *d2;
-    const UnicodeSet *nx;
-    UNormalizationMode mode;
-    int32_t normOptions;
-    int32_t result;
-
      /* argument checking */
-    if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
+    if(U_FAILURE(*pErrorCode)) {
          return 0;
      }
      if(s1==0 || length1<-1 || s2==0 || length2<-1) {
@@ -550,22 +575,9 @@ unorm_compare(const UChar *s1, int32_t length1,
          return 0;
      }
  
-    if(!unorm_haveData(pErrorCode)) {
-        return 0;
-    }
-    if(!uprv_haveProperties(pErrorCode)) {
-        return 0;
-    }
-
-    normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
-    nx=unorm_getNX(normOptions, pErrorCode);
-    if(U_FAILURE(*pErrorCode)) {
-        return 0;
-    }
-
-    d1=d2=0;
+    UnicodeString fcd1, fcd2;
+    int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
      options|=_COMPARE_EQUIV;
-    result=0;
  
      /*
       * UAX #21 Case Mappings, as fixed for Unicode version 4
@@ -588,103 +600,45 @@ unorm_compare(const UChar *s1, int32_t length1,
       * are first decomposed or not, so an FCD check - a check only for
       * canonical order - is not sufficient.
       */
-    if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
-        mode=UNORM_NFD;
-        options&=~UNORM_INPUT_IS_FCD;
-    } else {
-        mode=UNORM_FCD;
-    }
-
-    if(!(options&UNORM_INPUT_IS_FCD)) {
-        int32_t _len1, _len2;
-        UBool isFCD1, isFCD2;
-
-        // check if s1 and/or s2 fulfill the FCD conditions
-        isFCD1= UNORM_YES==unorm_internalQuickCheck(s1, length1, mode, TRUE, nx, pErrorCode);
-        isFCD2= UNORM_YES==unorm_internalQuickCheck(s2, length2, mode, TRUE, nx, pErrorCode);
-        if(U_FAILURE(*pErrorCode)) {
+    if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) {
+        const Normalizer2 *n2;
+        if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
+            n2=Normalizer2::getNFDInstance(*pErrorCode);
+        } else {
+            n2=Normalizer2Factory::getFCDInstance(*pErrorCode);
+        }
+        if (U_FAILURE(*pErrorCode)) {
              return 0;
          }
  
-        /*
-         * ICU 2.4 had a further optimization:
-         * If both strings were not in FCD, then they were both NFD'ed,
-         * and the _COMPARE_EQUIV option was turned off.
-         * It is not entirely clear that this is valid with the current
-         * definition of the canonical caseless match.
-         * Therefore, ICU 2.6 removes that optimization.
-         */
-
-        if(!isFCD1) {
-            _len1=unorm_internalNormalizeWithNX(fcd1, LENGTHOF(fcd1),
-                                                s1, length1,
-                                                mode, normOptions, nx,
-                                                pErrorCode);
-            if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-                s1=fcd1;
-            } else {
-                d1=(UChar *)uprv_malloc(_len1*U_SIZEOF_UCHAR);
-                if(d1==0) {
-                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-                    goto cleanup;
-                }
-
-                *pErrorCode=U_ZERO_ERROR;
-                _len1=unorm_internalNormalizeWithNX(d1, _len1,
-                                                    s1, length1,
-                                                    mode, normOptions, nx,
-                                                    pErrorCode);
-                if(U_FAILURE(*pErrorCode)) {
-                    goto cleanup;
-                }
-
-                s1=d1;
+        if(normOptions&UNORM_UNICODE_3_2) {
+            const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
+            FilteredNormalizer2 fn2(*n2, *uni32);
+            if(_normalize(&fn2, s1, length1, fcd1, pErrorCode)) {
+                s1=fcd1.getBuffer();
+                length1=fcd1.length();
              }
-            length1=_len1;
-        }
-
-        if(!isFCD2) {
-            _len2=unorm_internalNormalizeWithNX(fcd2, LENGTHOF(fcd2),
-                                                s2, length2,
-                                                mode, normOptions, nx,
-                                                pErrorCode);
-            if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-                s2=fcd2;
-            } else {
-                d2=(UChar *)uprv_malloc(_len2*U_SIZEOF_UCHAR);
-                if(d2==0) {
-                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-                    goto cleanup;
-                }
-
-                *pErrorCode=U_ZERO_ERROR;
-                _len2=unorm_internalNormalizeWithNX(d2, _len2,
-                                                    s2, length2,
-                                                    mode, normOptions, nx,
-                                                    pErrorCode);
-                if(U_FAILURE(*pErrorCode)) {
-                    goto cleanup;
-                }
-
-                s2=d2;
+            if(_normalize(&fn2, s2, length2, fcd2, pErrorCode)) {
+                s2=fcd2.getBuffer();
+                length2=fcd2.length();
+            }
+        } else {
+            if(_normalize(n2, s1, length1, fcd1, pErrorCode)) {
+                s1=fcd1.getBuffer();
+                length1=fcd1.length();
+            }
+            if(_normalize(n2, s2, length2, fcd2, pErrorCode)) {
+                s2=fcd2.getBuffer();
+                length2=fcd2.length();
              }
-            length2=_len2;
          }
      }
  
      if(U_SUCCESS(*pErrorCode)) {
-        result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
-    }
-
-cleanup:
-    if(d1!=0) {
-        uprv_free(d1);
-    }
-    if(d2!=0) {
-        uprv_free(d2);
+        return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
+    } else {
+        return 0;
      }
-
-    return result;
  }
  
  #endif /* #if !UCONFIG_NO_NORMALIZATION */