]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/ustrcase.cpp
ICU-57165.0.1.tar.gz
[apple/icu.git] / icuSources / common / ustrcase.cpp
index fce05c8e69d47b0cbc25b4ca9dbbfede0ff1591d..13f148df6b9103f450e83aa8a5a8f2e99f419357 100644 (file)
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2001-2011, International Business Machines
+*   Copyright (C) 2001-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@@ -28,8 +28,7 @@
 #include "cmemory.h"
 #include "ucase.h"
 #include "ustr_imp.h"
-
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+#include "uassert.h"
 
 U_NAMESPACE_USE
 
@@ -46,24 +45,27 @@ appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
     if(result<0) {
         /* (not) original code point */
         c=~result;
-        length=-1;
+        length=U16_LENGTH(c);
     } else if(result<=UCASE_MAX_STRING_LENGTH) {
         c=U_SENTINEL;
         length=result;
     } else {
         c=result;
-        length=-1;
+        length=U16_LENGTH(c);
+    }
+    if(length>(INT32_MAX-destIndex)) {
+        return -1;  // integer overflow
     }
 
     if(destIndex<destCapacity) {
         /* append the result */
-        if(length<0) {
+        if(c>=0) {
             /* code point */
             UBool isError=FALSE;
             U16_APPEND(dest, destIndex, destCapacity, c, isError);
             if(isError) {
                 /* overflow, nothing written */
-                destIndex+=U16_LENGTH(c);
+                destIndex+=length;
             }
         } else {
             /* string */
@@ -79,15 +81,21 @@ appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
         }
     } else {
         /* preflight */
-        if(length<0) {
-            destIndex+=U16_LENGTH(c);
-        } else {
-            destIndex+=length;
-        }
+        destIndex+=length;
     }
     return destIndex;
 }
 
+static inline int32_t  /* returns destIndex+1, or -1 when destIndex cannot be incremented */
+appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
+    if(destIndex<destCapacity) {
+        dest[destIndex]=c;  /* room left: store the single code unit */
+    } else if(destIndex==INT32_MAX) {  /* no room: nothing is written, only the index is counted */
+        return -1;  // integer overflow
+    }
+    return destIndex+1;
+}
+
 static UChar32 U_CALLCONV
 utf16_caseContextIterator(void *context, int8_t dir) {
     UCaseContext *csc=(UCaseContext *)context;
@@ -150,6 +158,10 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
             dest[destIndex++]=(UChar)c2;
         } else {
             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+            if(destIndex<0) {
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
+            }
         }
     }
 
@@ -237,7 +249,7 @@ ustrcase_internalToTitle(const UCaseMap *csm,
                 length=titleStart-prev;
                 if(length>0) {
                     if((destIndex+length)<=destCapacity) {
-                        uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
+                        u_memcpy(dest+destIndex, src+prev, length);
                     }
                     destIndex+=length;
                 }
@@ -249,15 +261,22 @@ ustrcase_internalToTitle(const UCaseMap *csm,
                 csc.cpLimit=titleLimit;
                 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache);
                 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 
+                if(destIndex<0) {
+                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                    return 0;
+                }
 
                 /* Special case Dutch IJ titlecasing */
-                if ( titleStart+1 < idx && 
-                     ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
-                     ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
-                     ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) { 
-                            c=(UChar32) 0x004A;
-                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
-                            titleLimit++;
+                if (titleStart+1 < idx &&
+                        ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
+                        (src[titleStart] == 0x0049 || src[titleStart] == 0x0069) &&
+                        (src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A)) {
+                    destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
+                    if(destIndex<0) {
+                        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                        return 0;
+                    }
+                    titleLimit++;
                 }
 
                 /* lowercase [titleLimit..index[ */
@@ -271,11 +290,18 @@ ustrcase_internalToTitle(const UCaseMap *csm,
                                 src, &csc,
                                 titleLimit, idx,
                                 pErrorCode);
+                        if(U_FAILURE(*pErrorCode)) {
+                            return destIndex;
+                        }
                     } else {
                         /* Optionally just copy the rest of the word unchanged. */
                         length=idx-titleLimit;
+                        if(length>(INT32_MAX-destIndex)) {
+                            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                            return 0;
+                        }
                         if((destIndex+length)<=destCapacity) {
-                            uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
+                            u_memcpy(dest+destIndex, src+titleLimit, length);
                         }
                         destIndex+=length;
                     }
@@ -347,6 +373,10 @@ ustr_foldCase(const UCaseProps *csp,
             dest[destIndex++]=(UChar)c2;
         } else {
             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+            if(destIndex<0) {
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return 0;
+            }
         }
     }
 
@@ -399,7 +429,7 @@ ustrcase_map(const UCaseMap *csm,
          (dest>=src && dest<(src+srcLength)))
     ) {
         /* overlap: provide a temporary destination buffer and later copy the result */
-        if(destCapacity<=LENGTHOF(buffer)) {
+        if(destCapacity<=UPRV_LENGTHOF(buffer)) {
             /* the stack buffer is large enough */
             temp=buffer;
         } else {
@@ -420,7 +450,7 @@ ustrcase_map(const UCaseMap *csm,
         if(destLength>0) {
             int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
             if(copyLength>0) {
-                uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR);
+                u_memmove(dest, temp, copyLength);
             }
         }
         if(temp!=buffer) {
@@ -465,17 +495,39 @@ struct CmpEquivLevel {
 };
 typedef struct CmpEquivLevel CmpEquivLevel;
 
-/* internal function */
-U_CFUNC int32_t
-u_strcmpFold(const UChar *s1, int32_t length1,
-             const UChar *s2, int32_t length2,
-             uint32_t options,
-             UErrorCode *pErrorCode) {
+/**
+ * Internal implementation code comparing two strings with case folding.
+ * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
+ *
+ * @param s1            input string 1
+ * @param length1       length of string 1, or -1 (NUL-terminated)
+ * @param s2            input string 2
+ * @param length2       length of string 2, or -1 (NUL-terminated)
+ * @param options       compare options
+ * @param matchLen1     (output) length of partial prefix match in s1
+ * @param matchLen2     (output) length of partial prefix match in s2
+ * @param pErrorCode    receives error status
+ * @return The result of comparison
+ */
+static int32_t _cmpFold(
+            const UChar *s1, int32_t length1,
+            const UChar *s2, int32_t length2,
+            uint32_t options,
+            int32_t *matchLen1, int32_t *matchLen2,
+            UErrorCode *pErrorCode) {
+    int32_t cmpRes = 0;
+
     const UCaseProps *csp;
 
     /* current-level start/limit - s1/s2 as current */
     const UChar *start1, *start2, *limit1, *limit2;
 
+    /* points to the original start address */
+    const UChar *org1, *org2;
+
+    /* points to the end of match + 1 */
+    const UChar *m1, *m2;
+
     /* case folding variables */
     const UChar *p;
     int32_t length;
@@ -504,14 +556,20 @@ u_strcmpFold(const UChar *s1, int32_t length1,
     }
 
     /* initialize */
-    start1=s1;
+    if(matchLen1) {
+        U_ASSERT(matchLen2 !=NULL);
+        *matchLen1=0;
+        *matchLen2=0;
+    }
+
+    start1=m1=org1=s1;
     if(length1==-1) {
         limit1=NULL;
     } else {
         limit1=s1+length1;
     }
 
-    start2=s2;
+    start2=m2=org2=s2;
     if(length2==-1) {
         limit2=NULL;
     } else {
@@ -579,15 +637,59 @@ u_strcmpFold(const UChar *s1, int32_t length1,
          * either variable c1, c2 is -1 only if the corresponding string is finished
          */
         if(c1==c2) {
+            const UChar *next1, *next2;
+
             if(c1<0) {
-                return 0;   /* c1==c2==-1 indicating end of strings */
+                cmpRes=0;   /* c1==c2==-1 indicating end of strings */
+                break;
+            }
+
+            /*
+             * Note: Move the match positions in both strings at the same time
+             *      only when corresponding code point(s) in the original strings
+             *      are fully consumed. For example, when comparing s1="Fust" and
+             *      s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
+             *      the first code point in the case-folded data. But the second "s"
+             *      has no matching code point in s1, so this implementation returns
+             *      2 as the prefix match length ("Fu").
+             */
+            next1=next2=NULL;
+            if(level1==0) {
+                next1=s1;
+            } else if(s1==limit1) {
+                /* Note: This implementation only uses a single level of stack.
+                 *      If this code needs to be changed to use multiple stack
+                 *      levels, the code above should check whether the current
+                 *      position is at the end of all stacks.
+                 */
+                U_ASSERT(level1==1);
+
+                /* is s1 at the end of the current stack? */
+                next1=stack1[0].s;
+            }
+
+            if (next1!=NULL) {
+                if(level2==0) {
+                    next2=s2;
+                } else if(s2==limit2) {
+                    U_ASSERT(level2==1);
+
+                    /* is s2 at the end of the current stack? */
+                    next2=stack2[0].s;
+                }
+                if(next2!=NULL) {
+                    m1=next1;
+                    m2=next2;
+                }
             }
             c1=c2=-1;       /* make us fetch new code units */
             continue;
         } else if(c1<0) {
-            return -1;      /* string 1 ends before string 2 */
+            cmpRes=-1;      /* string 1 ends before string 2 */
+            break;
         } else if(c2<0) {
-            return 1;       /* string 2 ends before string 1 */
+            cmpRes=1;       /* string 2 ends before string 1 */
+            break;
         }
         /* c1!=c2 && c1>=0 && c2>=0 */
 
@@ -646,6 +748,7 @@ u_strcmpFold(const UChar *s1, int32_t length1,
                      * the decomposition would replace the entire code point
                      */
                     --s2;
+                    --m2;
                     c2=*(s2-1);
                 }
             }
@@ -691,6 +794,7 @@ u_strcmpFold(const UChar *s1, int32_t length1,
                      * the decomposition would replace the entire code point
                      */
                     --s1;
+                    --m2;
                     c1=*(s1-1);
                 }
             }
@@ -759,8 +863,24 @@ u_strcmpFold(const UChar *s1, int32_t length1,
             }
         }
 
-        return c1-c2;
+        cmpRes=c1-c2;
+        break;
     }
+
+    if(matchLen1) {
+        *matchLen1=m1-org1;
+        *matchLen2=m2-org2;
+    }
+    return cmpRes;
+}
+
+/* internal function: returns the result of _cmpFold() for the same arguments */
+U_CFUNC int32_t
+u_strcmpFold(const UChar *s1, int32_t length1,
+             const UChar *s2, int32_t length2,
+             uint32_t options,
+             UErrorCode *pErrorCode) {
+    return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);  /* NULL, NULL: no match lengths are reported */
 }
 
 /* public API functions */
@@ -806,3 +926,14 @@ u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
                         options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
                         &errorCode);
 }
+
+/* internal API - detect length of shared prefix; _cmpFold() writes the lengths to *matchLen1 and *matchLen2 */
+U_CAPI void
+u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
+                             const UChar *s2, int32_t length2,
+                             uint32_t options,
+                             int32_t *matchLen1, int32_t *matchLen2,
+                             UErrorCode *pErrorCode) {
+    _cmpFold(s1, length1, s2, length2, options,  /* the comparison result itself is discarded */
+        matchLen1, matchLen2, pErrorCode);  /* _cmpFold asserts matchLen2!=NULL whenever matchLen1 is non-NULL */
+}