X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/4388f060552cc537e71e957d32f35e9d75a61233..1a147d096ae81f4c8262f7bfc56bd19fc2dee932:/icuSources/common/ustring.cpp

diff --git a/icuSources/common/ustring.cpp b/icuSources/common/ustring.cpp
index 3210cfc2..762cecb3 100644
--- a/icuSources/common/ustring.cpp
+++ b/icuSources/common/ustring.cpp
@@ -1,7 +1,9 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 /*
 ******************************************************************************
 *
-*   Copyright (C) 1998-2011, International Business Machines
+*   Copyright (C) 1998-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -17,6 +19,7 @@
 
 #include "unicode/utypes.h"
 #include "unicode/putil.h"
+#include "unicode/uchar.h"
 #include "unicode/ustring.h"
 #include "unicode/utf16.h"
 #include "cstring.h"
@@ -991,7 +994,7 @@ U_CAPI int32_t   U_EXPORT2
 u_strlen(const UChar *s) 
 {
 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
-    return (int32_t)uprv_wcslen(s);
+    return (int32_t)uprv_wcslen((const wchar_t *)s);
 #else
     const UChar *t = s;
     while(*t != 0) {
@@ -1110,14 +1113,178 @@ u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
     }
 }
 
+/* ----- String validation functions --- */
+
+/*
+ * Check whether the string is well-formed according to various criteria:
+ * - No code points that are defined as non-characters (e.g. 0xFFFF) or are undefined in
+ *   the version of Unicode currently supported.
+ * - No isolated surrogate code points.
+ * - No overly-long sequences of non-starter combining marks, i.e. more than 30 characters
+ *   in a row with non-zero combining class (which may have category Mn or Mc); this
+ *   violates Stream-Safe Text Format per UAX #15. This test does not ensure that the
+ *   string satisfies Stream-Safe Text Format (because it does not convert to NFKC first),
+ *   but any string that fails this test is certainly not Stream-Safe.
+ * - No emoji variation selectors applied to non-emoji code points. This function may
+ *   also check for other non-standard variation sequences.
+ * - No tag sequences that are ill-formed per definition ED-14a in UTS #51 (e.g. tag
+ *   sequences must have an emoji base and a terminator).
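+ * - No bidi embedding, override or isolate nesting that exceeds embedding level 125 (max_depth in UAX #9).
+ * @internal Apple only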
+ */
+enum { kBidiMaxDepth = 125 }; // highest explicit bidi embedding level accepted (max_depth in UAX #9)
+// Per-code-point step of u_strIsWellFormed: checks c given the previous code point cLast and the caller-owned scan state; FALSE = ill-formed.
+static UBool isWellFormed(UChar32 c, UChar32 cLast, int32_t *nonStarterCountP, UBool *inTagSeqP,
+                        uint8_t* dirStatus, int32_t* dirStatusIndexP, int32_t* validIsolateCountP) {
+    if (*inTagSeqP) {
+        // inside a tag sequence only tag_spec (U+E0020..U+E007E) or tag_term (U+E007F) may follow
+        if (c == 0xE007F) { // tag_term
+            *inTagSeqP = FALSE;
+        } else if (c < 0xE0020 || c > 0xE007E) {
+            return FALSE;
+        }
+    } else if (c < 0x0300) {
+        // Everything in this range (includes ASCII) is a valid character with combining class 0
+        *nonStarterCountP = 0;
+        if (c == 0x000A || c == 0x000D || c == 0x0085 || (c >= 0x001C && c <= 0x001E)) {
+            // paragraph separator: reset the bidi stack index and the open-isolate count
+            *dirStatusIndexP = 0;
+            *validIsolateCountP = 0;
+        }
+    } else if ((c >= 0x2029 && c <= 0x202E) || (c >= 0x2066 && c <= 0x2069)) {
+        // para sep & bidi controls, all have combining class 0. The bidi control actions here
+        // are from [https://www.unicode.org/reports/tr9/#Explicit_Levels_and_Directions]
+        *nonStarterCountP = 0;
+        if (c == 0x2029) { // paragraph sep, reset bidi
+            *dirStatusIndexP = 0;
+            *validIsolateCountP = 0;
+        } else if (c == 0x2069) { // PDI
+            if (*validIsolateCountP > 0) {
+                while (*dirStatusIndexP > 0 && (dirStatus[(*dirStatusIndexP)--] & 0x80) == 0); // pop entries up to and including the nearest isolate entry (bit 0x80)
+                (*validIsolateCountP)--;
+            }
+        } else if (c == 0x202C) { // PDF
+            if (*dirStatusIndexP > 0 && (dirStatus[*dirStatusIndexP] & 0x80) == 0) { // pop only when the top entry is not an isolate
+                (*dirStatusIndexP)--;
+            }
+        } else {
+            // embedding/override initiator. Need to increment the level by at least 1, and possibly 2 if the
+            // embedding/override direction matches the current direction (i.e. R and current odd, or L and current even).
+            // Since we increment first, the test for odd/even is flipped. For FSI, we do not actually determine
+            // whether it should be treated as RLI or LRI, so we just do the minimum increment.
+            uint8_t newEntry = (dirStatus[*dirStatusIndexP] & 0x7F) + 1; // min increment, flips odd/even status compared to current
+            if ( ((c == 0x202B || c == 0x202E || c == 0x2067) && (newEntry & 0x01) == 0) ||  // RLE/RLO/RLI and current was odd
+                 ((c == 0x202A || c == 0x202D || c == 0x2066) && (newEntry & 0x01) != 0) ) { // LRE/LRO/LRI and current was even
+                newEntry++;
+            }
+            if (newEntry > kBidiMaxDepth || *dirStatusIndexP > kBidiMaxDepth) {
+                return FALSE; // nesting would exceed kBidiMaxDepth; checking for this is the whole point of the bidi tracking.
+            }
+            if (c >= 0x2066 &&  c <= 0x2068) { // LRI/RLI/FSI
+                newEntry |= 0x80; // set directional isolate status
+                (*validIsolateCountP)++;
+            }
+            dirStatus[++(*dirStatusIndexP)] = newEntry;
+        }
+    } else if (c == 0xFE0F) { // emoji variation selector
+        if (!u_isEmoji(cLast)) { // previous char must be emoji
+            return FALSE;
+        }
+        // previous character would have set *nonStarterCountP = 0;
+    } else if (c >= 0xE0020 && c <= 0xE007E) { // tag_spec
+        if (!u_isEmoji(cLast) && cLast != 0xFE0F) { // previous char must be emoji or FE0F
+              return FALSE;
+        } 
+        *inTagSeqP = TRUE;
+        // previous character would have set  *nonStarterCountP = 0;
+    } else if (c == 0xE007F) { // tag_term outside of any tag sequence (*inTagSeqP is FALSE here)
+         return FALSE;
+    } else {
+        // we have checked specific ranges/chars, now check general info for others
+        int8_t genCat = u_charType(c);
+        if (genCat == U_UNASSIGNED || genCat == U_SURROGATE) {
+            return FALSE;
+        }
+        if ((genCat == U_NON_SPACING_MARK || genCat == U_COMBINING_SPACING_MARK) && u_getCombiningClass(c) != 0) {
+            // non-starter: count the run and reject once it exceeds 30 in a row
+            if (++(*nonStarterCountP) > 30) {
+                return FALSE;
+            }
+        } else {
+             *nonStarterCountP = 0;
+        }
+    }
+    return TRUE;
+}
+
+U_CAPI UBool U_EXPORT2
+u_strIsWellFormed(const UChar *s, int32_t length) {
+    if (s==NULL || length<-1) {
+        return FALSE;
+    }
+    UChar32 c, c2, cLast = 0;
+    int32_t nonStarterCount = 0;
+    UBool inTagSeq = FALSE;
+    uint8_t dirStatus[kBidiMaxDepth + 3]; // low 7 bits is embed level, high bit (0x80) marks a directional isolate entry
+    int32_t dirStatusIndex = 0;
+    int32_t validIsolateCount = 0;
+    dirStatus[0] = 0; // assume initial paragraph direction L (most conservative)
+    if (length < 0) { // length == -1: s is NUL terminated
+        while ((c = *s++) != 0) {
+            // get next UChar32 c; an unpaired surrogate is kept as a single code point
+            if (U16_IS_LEAD(c)) {
+                if (U16_IS_TRAIL(c2 = *s)) {
+                    s++;
+                    c = U16_GET_SUPPLEMENTARY(c,c2);
+                }
+            }
+            // check current c against the running state
+            if (!isWellFormed(c, cLast, &nonStarterCount, &inTagSeq, dirStatus, &dirStatusIndex, &validIsolateCount)) {
+                return FALSE;
+            }
+            // setup next iteration
+            cLast = c;
+        }
+    } else {
+        // use length: scan exactly length code units, embedded NULs included
+        const UChar *sLimit = s + length;
+        while (s < sLimit) {
+            // get next UChar32 c; never read a trail unit at or past sLimit
+            c = *s++;
+            if (U16_IS_LEAD(c)) {
+                if (s < sLimit && U16_IS_TRAIL(c2 = *s)) {
+                    s++;
+                    c = U16_GET_SUPPLEMENTARY(c,c2);
+                }
+            }
+            // check current c against the running state
+            if (!isWellFormed(c, cLast, &nonStarterCount, &inTagSeq, dirStatus, &dirStatusIndex, &validIsolateCount)) {
+                return FALSE;
+            }
+            // setup next iteration
+            cLast = c;
+        }
+    }
+    // A tag sequence still open at end of input never received its tag_term (U+E007F), so it is ill-formed.
+    return inTagSeq ? FALSE : TRUE;
+}
+
+/* ----- U_mem functions --- */
+
 U_CAPI UChar * U_EXPORT2
 u_memcpy(UChar *dest, const UChar *src, int32_t count) {
-    return (UChar *)uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR);
+    if(count > 0) { // no copy for count <= 0; count is cast to size_t before being scaled to bytes
+        uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
+    }
+    return dest;
 }
 
 U_CAPI UChar * U_EXPORT2
 u_memmove(UChar *dest, const UChar *src, int32_t count) {
-    return (UChar *)uprv_memmove(dest, src, count*U_SIZEOF_UCHAR);
+    if(count > 0) { // no move for count <= 0; count is cast to size_t before being scaled to bytes
+        uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
+    }
+    return dest;
 }
 
 U_CAPI UChar * U_EXPORT2
@@ -1173,7 +1340,7 @@ static const UChar UNESCAPE_MAP[] = {
     /*t*/ 0x74, 0x09,
     /*v*/ 0x76, 0x0b
 };
-enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) };
+enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) }; // number of UChar entries in UNESCAPE_MAP
 
 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
 static int8_t _digit8(UChar c) {
@@ -1480,7 +1647,7 @@ u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCod
 */
 
 #define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
-    int32_t hash = 0;                         \
+    uint32_t hash = 0;                        \
     const TYPE *p = (const TYPE*) STR;        \
     if (p != NULL) {                          \
         int32_t len = (int32_t)(STRLEN);      \
@@ -1491,7 +1658,7 @@ u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCod
             p += inc;                         \
         }                                     \
     }                                         \
-    return hash
+    return static_cast<int32_t>(hash)
 
 /* Used by UnicodeString to compute its hashcode - Not public API. */
 U_CAPI int32_t U_EXPORT2