]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/unormcmp.cpp
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / common / unormcmp.cpp
index ded2e730f96cf5a15aa50214188161822dd1fc8a..689b0b53b2d5f503b8c56b3174d0318b34abf684 100644 (file)
@@ -1,12 +1,14 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2001-2005, International Business Machines
+*   Copyright (C) 2001-2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  unormcmp.cpp
-*   encoding:   US-ASCII
+*   encoding:   UTF-8
 *   tab size:   8 (not used)
 *   indentation:4
 *
 
 #if !UCONFIG_NO_NORMALIZATION
 
-#include "unicode/ustring.h"
 #include "unicode/unorm.h"
-#include "unicode/uniset.h"
-#include "unormimp.h"
-#include "ucase.h"
+#include "unicode/ustring.h"
 #include "cmemory.h"
+#include "normalizer2impl.h"
+#include "ucase.h"
+#include "uprops.h"
+#include "ustr_imp.h"
 
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+U_NAMESPACE_USE
 
 /* compare canonically equivalent ------------------------------------------- */
 
  * Note that all of this is only a problem when case-folding _and_
  * canonical equivalence come together.
  * (Comments in unorm_compare() are more up to date than this TODO.)
- *
- * This function could be moved to a different source file, at increased cost
- * for calling the decomposition access function.
  */
 
 /* stack element for previous-level source/decomposition pointers */
@@ -132,13 +132,19 @@ struct CmpEquivLevel {
 };
 typedef struct CmpEquivLevel CmpEquivLevel;
 
+/**
+ * Internal option for unorm_cmpEquivFold() for decomposing.
+ * If not set, just do strcasecmp().
+ */
+#define _COMPARE_EQUIV 0x80000
+
 /* internal function */
 static int32_t
 unorm_cmpEquivFold(const UChar *s1, int32_t length1,
                    const UChar *s2, int32_t length2,
                    uint32_t options,
                    UErrorCode *pErrorCode) {
-    const UCaseProps *csp;
+    const Normalizer2Impl *nfcImpl;
 
     /* current-level start/limit - s1/s2 as current */
     const UChar *start1, *start2, *limit1, *limit2;
@@ -150,7 +156,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
     /* stacks of previous-level start/current/limit */
     CmpEquivLevel stack1[2], stack2[2];
 
-    /* decomposition buffers for Hangul */
+    /* buffers for algorithmic decompositions */
     UChar decomp1[4], decomp2[4];
 
     /* case folding buffers, only use current-level start/limit */
@@ -171,18 +177,13 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
      */
 
     /* normalization/properties data loaded? */
-    if( ((options&_COMPARE_EQUIV)!=0 && !unorm_haveData(pErrorCode)) ||
-        U_FAILURE(*pErrorCode)
-    ) {
-        return 0;
-    }
-    if((options&U_COMPARE_IGNORE_CASE)!=0) {
-        csp=ucase_getSingleton(pErrorCode);
-        if(U_FAILURE(*pErrorCode)) {
-            return 0;
-        }
+    if((options&_COMPARE_EQUIV)!=0) {
+        nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode);
     } else {
-        csp=NULL;
+        nfcImpl=NULL;
+    }
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
     }
 
     /* initialize */
@@ -226,10 +227,10 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
                 /* reached end of level buffer, pop one level */
                 do {
                     --level1;
-                    start1=stack1[level1].start;
+                    start1=stack1[level1].start;    /*Not uninitialized*/
                 } while(start1==NULL);
-                s1=stack1[level1].s;
-                limit1=stack1[level1].limit;
+                s1=stack1[level1].s;                /*Not uninitialized*/
+                limit1=stack1[level1].limit;        /*Not uninitialized*/
             }
         }
 
@@ -249,10 +250,10 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
                 /* reached end of level buffer, pop one level */
                 do {
                     --level2;
-                    start2=stack2[level2].start;
+                    start2=stack2[level2].start;    /*Not uninitialized*/
                 } while(start2==NULL);
-                s2=stack2[level2].s;
-                limit2=stack2[level2].limit;
+                s2=stack2[level2].s;                /*Not uninitialized*/
+                limit2=stack2[level2].limit;        /*Not uninitialized*/
             }
         }
 
@@ -312,7 +313,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
          */
 
         if( level1==0 && (options&U_COMPARE_IGNORE_CASE) &&
-            (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
+            (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
         ) {
             /* cp1 case-folds to the code point "length" or to p[length] */
             if(U_IS_SURROGATE(c1)) {
@@ -357,7 +358,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
         }
 
         if( level2==0 && (options&U_COMPARE_IGNORE_CASE) &&
-            (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
+            (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
         ) {
             /* cp2 case-folds to the code point "length" or to p[length] */
             if(U_IS_SURROGATE(c2)) {
@@ -402,7 +403,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
         }
 
         if( level1<2 && (options&_COMPARE_EQUIV) &&
-            0!=(p=unorm_getCanonicalDecomposition((UChar32)cp1, decomp1, &length))
+            0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length))
         ) {
             /* cp1 decomposes into p[length] */
             if(U_IS_SURROGATE(c1)) {
@@ -443,7 +444,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
         }
 
         if( level2<2 && (options&_COMPARE_EQUIV) &&
-            0!=(p=unorm_getCanonicalDecomposition((UChar32)cp2, decomp2, &length))
+            0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length))
         ) {
             /* cp2 decomposes into p[length] */
             if(U_IS_SURROGATE(c2)) {
@@ -527,20 +528,42 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
     }
 }
 
+static
+UBool _normalize(const Normalizer2 *n2, const UChar *s, int32_t length,
+                UnicodeString &normalized, UErrorCode *pErrorCode) {
+    UnicodeString str(length<0, s, length);
+
+    // check if s fulfill the conditions
+    int32_t spanQCYes=n2->spanQuickCheckYes(str, *pErrorCode);
+    if (U_FAILURE(*pErrorCode)) {
+        return FALSE;
+    }
+    /*
+     * ICU 2.4 had a further optimization:
+     * If both strings were not in FCD, then they were both NFD'ed,
+     * and the _COMPARE_EQUIV option was turned off.
+     * It is not entirely clear that this is valid with the current
+     * definition of the canonical caseless match.
+     * Therefore, ICU 2.6 removes that optimization.
+     */
+    if(spanQCYes<str.length()) {
+        UnicodeString unnormalized=str.tempSubString(spanQCYes);
+        normalized.setTo(FALSE, str.getBuffer(), spanQCYes);
+        n2->normalizeSecondAndAppend(normalized, unnormalized, *pErrorCode);
+        if (U_SUCCESS(*pErrorCode)) {
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
 U_CAPI int32_t U_EXPORT2
 unorm_compare(const UChar *s1, int32_t length1,
               const UChar *s2, int32_t length2,
               uint32_t options,
               UErrorCode *pErrorCode) {
-    UChar fcd1[300], fcd2[300];
-    UChar *d1, *d2;
-    const UnicodeSet *nx;
-    UNormalizationMode mode;
-    int32_t normOptions;
-    int32_t result;
-
     /* argument checking */
-    if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
+    if(U_FAILURE(*pErrorCode)) {
         return 0;
     }
     if(s1==0 || length1<-1 || s2==0 || length2<-1) {
@@ -548,22 +571,9 @@ unorm_compare(const UChar *s1, int32_t length1,
         return 0;
     }
 
-    if(!unorm_haveData(pErrorCode)) {
-        return 0;
-    }
-    if(!uprv_haveProperties(pErrorCode)) {
-        return 0;
-    }
-
-    normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
-    nx=unorm_getNX(normOptions, pErrorCode);
-    if(U_FAILURE(*pErrorCode)) {
-        return 0;
-    }
-
-    d1=d2=0;
+    UnicodeString fcd1, fcd2;
+    int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
     options|=_COMPARE_EQUIV;
-    result=0;
 
     /*
      * UAX #21 Case Mappings, as fixed for Unicode version 4
@@ -586,103 +596,45 @@ unorm_compare(const UChar *s1, int32_t length1,
      * are first decomposed or not, so an FCD check - a check only for
      * canonical order - is not sufficient.
      */
-    if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
-        mode=UNORM_NFD;
-        options&=~UNORM_INPUT_IS_FCD;
-    } else {
-        mode=UNORM_FCD;
-    }
-
-    if(!(options&UNORM_INPUT_IS_FCD)) {
-        int32_t _len1, _len2;
-        UBool isFCD1, isFCD2;
-
-        // check if s1 and/or s2 fulfill the FCD conditions
-        isFCD1= UNORM_YES==unorm_internalQuickCheck(s1, length1, mode, TRUE, nx, pErrorCode);
-        isFCD2= UNORM_YES==unorm_internalQuickCheck(s2, length2, mode, TRUE, nx, pErrorCode);
-        if(U_FAILURE(*pErrorCode)) {
+    if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) {
+        const Normalizer2 *n2;
+        if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
+            n2=Normalizer2::getNFDInstance(*pErrorCode);
+        } else {
+            n2=Normalizer2Factory::getFCDInstance(*pErrorCode);
+        }
+        if (U_FAILURE(*pErrorCode)) {
             return 0;
         }
 
-        /*
-         * ICU 2.4 had a further optimization:
-         * If both strings were not in FCD, then they were both NFD'ed,
-         * and the _COMPARE_EQUIV option was turned off.
-         * It is not entirely clear that this is valid with the current
-         * definition of the canonical caseless match.
-         * Therefore, ICU 2.6 removes that optimization.
-         */
-
-        if(!isFCD1) {
-            _len1=unorm_internalNormalizeWithNX(fcd1, LENGTHOF(fcd1),
-                                                s1, length1,
-                                                mode, normOptions, nx,
-                                                pErrorCode);
-            if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-                s1=fcd1;
-            } else {
-                d1=(UChar *)uprv_malloc(_len1*U_SIZEOF_UCHAR);
-                if(d1==0) {
-                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-                    goto cleanup;
-                }
-
-                *pErrorCode=U_ZERO_ERROR;
-                _len1=unorm_internalNormalizeWithNX(d1, _len1,
-                                                    s1, length1,
-                                                    mode, normOptions, nx,
-                                                    pErrorCode);
-                if(U_FAILURE(*pErrorCode)) {
-                    goto cleanup;
-                }
-
-                s1=d1;
+        if(normOptions&UNORM_UNICODE_3_2) {
+            const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode);
+            FilteredNormalizer2 fn2(*n2, *uni32);
+            if(_normalize(&fn2, s1, length1, fcd1, pErrorCode)) {
+                s1=fcd1.getBuffer();
+                length1=fcd1.length();
             }
-            length1=_len1;
-        }
-
-        if(!isFCD2) {
-            _len2=unorm_internalNormalizeWithNX(fcd2, LENGTHOF(fcd2),
-                                                s2, length2,
-                                                mode, normOptions, nx,
-                                                pErrorCode);
-            if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-                s2=fcd2;
-            } else {
-                d2=(UChar *)uprv_malloc(_len2*U_SIZEOF_UCHAR);
-                if(d2==0) {
-                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-                    goto cleanup;
-                }
-
-                *pErrorCode=U_ZERO_ERROR;
-                _len2=unorm_internalNormalizeWithNX(d2, _len2,
-                                                    s2, length2,
-                                                    mode, normOptions, nx,
-                                                    pErrorCode);
-                if(U_FAILURE(*pErrorCode)) {
-                    goto cleanup;
-                }
-
-                s2=d2;
+            if(_normalize(&fn2, s2, length2, fcd2, pErrorCode)) {
+                s2=fcd2.getBuffer();
+                length2=fcd2.length();
+            }
+        } else {
+            if(_normalize(n2, s1, length1, fcd1, pErrorCode)) {
+                s1=fcd1.getBuffer();
+                length1=fcd1.length();
+            }
+            if(_normalize(n2, s2, length2, fcd2, pErrorCode)) {
+                s2=fcd2.getBuffer();
+                length2=fcd2.length();
             }
-            length2=_len2;
         }
     }
 
     if(U_SUCCESS(*pErrorCode)) {
-        result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
-    }
-
-cleanup:
-    if(d1!=0) {
-        uprv_free(d1);
-    }
-    if(d2!=0) {
-        uprv_free(d2);
+        return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
+    } else {
+        return 0;
     }
-
-    return result;
 }
 
 #endif /* #if !UCONFIG_NO_NORMALIZATION */