]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/ustrtrns.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / ustrtrns.cpp
index 57cabd58e2a3b8d95f1ba8a892cb004f1da06484..5dc032c02fb807413495d7daf4db668cfd16518b 100644 (file)
@@ -1,7 +1,9 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2001-2012, International Business Machines
+*   Copyright (C) 2001-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
@@ -254,148 +256,6 @@ u_strToUTF32(UChar32 *dest,
             pErrorCode);
 }
 
-/* for utf8_nextCharSafeBodyTerminated() */
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
-/*
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - checks for NUL termination instead of length
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- */
-static UChar32
-utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
-    const uint8_t *s=*ps;
-    uint8_t trail, illegal=0;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    U_ASSERT(count<6);
-    U8_MASK_LEAD_BYTE((c), count);
-    /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-    switch(count) {
-    /* each branch falls through to the next one */
-    case 5:
-    case 4:
-        /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-        illegal=1;
-        break;
-    case 3:
-        trail=(uint8_t)(*s++ - 0x80);
-        c=(c<<6)|trail;
-        if(trail>0x3f || c>=0x110) {
-            /* not a trail byte, or code point>0x10ffff (outside Unicode) */
-            illegal=1;
-            break;
-        }
-    case 2: /*fall through*/
-        trail=(uint8_t)(*s++ - 0x80);
-        if(trail>0x3f) {
-            /* not a trail byte */
-            illegal=1;
-            break;
-        }
-        c=(c<<6)|trail;
-    case 1: /*fall through*/
-        trail=(uint8_t)(*s++ - 0x80);
-        if(trail>0x3f) {
-            /* not a trail byte */
-            illegal=1;
-        }
-        c=(c<<6)|trail;
-        break;
-    case 0:
-        return U_SENTINEL;
-    /* no default branch to optimize switch()  - all values are covered */
-    }
-
-    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
-    /* illegal is also set if count>=4 */
-    if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
-        /* error handling */
-        /* don't go beyond this sequence */
-        s=*ps;
-        while(count>0 && U8_IS_TRAIL(*s)) {
-            ++s;
-            --count;
-        }
-        c=U_SENTINEL;
-    }
-    *ps=s;
-    return c;
-}
-
-/*
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- */
-static UChar32
-utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
-    const uint8_t *s=*ps;
-    uint8_t trail, illegal=0;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    if((limit-s)>=count) {
-        U8_MASK_LEAD_BYTE((c), count);
-        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-        switch(count) {
-        /* each branch falls through to the next one */
-        case 5:
-        case 4:
-            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-            illegal=1;
-            break;
-        case 3:
-            trail=*s++;
-            c=(c<<6)|(trail&0x3f);
-            if(c<0x110) {
-                illegal|=(trail&0xc0)^0x80;
-            } else {
-                /* code point>0x10ffff, outside Unicode */
-                illegal=1;
-                break;
-            }
-        case 2: /*fall through*/
-            trail=*s++;
-            c=(c<<6)|(trail&0x3f);
-            illegal|=(trail&0xc0)^0x80;
-        case 1: /*fall through*/
-            trail=*s++;
-            c=(c<<6)|(trail&0x3f);
-            illegal|=(trail&0xc0)^0x80;
-            break;
-        case 0:
-            return U_SENTINEL;
-        /* no default branch to optimize switch()  - all values are covered */
-        }
-    } else {
-        illegal=1; /* too few bytes left */
-    }
-
-    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
-    /* illegal is also set if count>=4 */
-    U_ASSERT(count<sizeof(utf8_minLegal)/sizeof(utf8_minLegal[0]));
-    if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
-        /* error handling */
-        /* don't go beyond this sequence */
-        s=*ps;
-        while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
-            ++s;
-            --count;
-        }
-        c=U_SENTINEL;
-    }
-    *ps=s;
-    return c;
-}
-
 U_CAPI UChar* U_EXPORT2
 u_strFromUTF8WithSub(UChar *dest,
               int32_t destCapacity,
@@ -404,19 +264,10 @@ u_strFromUTF8WithSub(UChar *dest,
               int32_t srcLength,
               UChar32 subchar, int32_t *pNumSubstitutions,
               UErrorCode *pErrorCode){
-    UChar *pDest = dest;
-    UChar *pDestLimit = dest+destCapacity;
-    UChar32 ch;
-    int32_t reqLength = 0;
-    const uint8_t* pSrc = (const uint8_t*) src;
-    uint8_t t1, t2; /* trail bytes */
-    int32_t numSubstitutions;
-
     /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)) {
         return NULL;
     }
-        
     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
@@ -428,7 +279,10 @@ u_strFromUTF8WithSub(UChar *dest,
     if(pNumSubstitutions!=NULL) {
         *pNumSubstitutions=0;
     }
-    numSubstitutions=0;
+    UChar *pDest = dest;
+    UChar *pDestLimit = dest+destCapacity;
+    int32_t reqLength = 0;
+    int32_t numSubstitutions=0;
 
     /*
      * Inline processing of UTF-8 byte sequences:
@@ -449,95 +303,81 @@ u_strFromUTF8WithSub(UChar *dest,
          * The code explicitly checks for NULs only in the lead byte position.
          * A NUL byte in the trail byte position fails the trail byte range check anyway.
          */
-        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
-            if(ch <= 0x7f){
-                *pDest++=(UChar)ch;
-                ++pSrc;
+        int32_t i;
+        UChar32 c;
+        for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
+            // modified copy of U8_NEXT()
+            ++i;
+            if(U8_IS_SINGLE(c)) {
+                *pDest++=(UChar)c;
             } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
-                    ) {
-                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
-                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
-                    ) {
-                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                        pSrc += 2;
-                        continue;
-                    }
-                }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                } else if(ch<=0xFFFF) {
-                    *(pDest++)=(UChar)ch;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0x1f)<<6)|__t1;
+                    ++(i);
                 } else {
-                    *(pDest++)=U16_LEAD(ch);
-                    if(pDest<pDestLimit) {
-                        *(pDest++)=U16_TRAIL(ch);
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
+                    } else if(c<=0xFFFF) {
+                        *(pDest++)=(UChar)c;
                     } else {
-                        reqLength++;
-                        break;
+                        *(pDest++)=U16_LEAD(c);
+                        if(pDest<pDestLimit) {
+                            *(pDest++)=U16_TRAIL(c);
+                        } else {
+                            reqLength++;
+                            break;
+                        }
                     }
                 }
             }
         }
 
         /* Pre-flight the rest of the string. */
-        while((ch = *pSrc) != 0) {
-            if(ch <= 0x7f){
+        while((c = (uint8_t)src[i]) != 0) {
+            // modified copy of U8_NEXT()
+            ++i;
+            if(U8_IS_SINGLE(c)) {
                 ++reqLength;
-                ++pSrc;
             } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
-                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
-                    ) {
-                        ++reqLength;
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
-                    ) {
-                        ++reqLength;
-                        pSrc += 2;
-                        continue;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    ++reqLength;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    ++reqLength;
+                    ++(i);
+                } else {
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
                     }
+                    reqLength += U16_LENGTH(c);
                 }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                }
-                reqLength += U16_LENGTH(ch);
             }
         }
     } else /* srcLength >= 0 */ {
-        const uint8_t *pSrcLimit = pSrc + srcLength;
-        int32_t count;
-
-        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
+        /* Faster loop without ongoing checking for srcLength and pDestLimit. */
+        int32_t i = 0;
+        UChar32 c;
         for(;;) {
             /*
              * Each iteration of the inner loop progresses by at most 3 UTF-8
@@ -545,10 +385,10 @@ u_strFromUTF8WithSub(UChar *dest,
              * For supplementary code points (4 & 2), which are rare,
              * there is an additional adjustment.
              */
-            count = (int32_t)(pDestLimit - pDest);
-            srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
-            if(count > srcLength) {
-                count = srcLength; /* min(remaining dest, remaining src/3) */
+            int32_t count = (int32_t)(pDestLimit - pDest);
+            int32_t count2 = (srcLength - i) / 3;
+            if(count > count2) {
+                count = count2; /* min(remaining dest, remaining src/3) */
             }
             if(count < 3) {
                 /*
@@ -559,147 +399,123 @@ u_strFromUTF8WithSub(UChar *dest,
             }
 
             do {
-                ch = *pSrc;
-                if(ch <= 0x7f){
-                    *pDest++=(UChar)ch;
-                    ++pSrc;
+                // modified copy of U8_NEXT()
+                c = (uint8_t)src[i++];
+                if(U8_IS_SINGLE(c)) {
+                    *pDest++=(UChar)c;
                 } else {
-                    if(ch > 0xe0) {
-                        if( /* handle U+1000..U+CFFF inline */
-                            ch <= 0xec &&
-                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                            (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
-                        ) {
-                            /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
-                            *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                            pSrc += 3;
-                            continue;
-                        }
-                    } else if(ch < 0xe0) {
-                        if( /* handle U+0080..U+07FF inline */
-                            ch >= 0xc2 &&
-                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
-                        ) {
-                            *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                            pSrc += 2;
-                            continue;
+                    uint8_t __t1, __t2;
+                    if( /* handle U+0800..U+FFFF inline */
+                            (0xe0<=(c) && (c)<0xf0) &&
+                            ((i)+1)<srcLength &&
+                            U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                            (__t2=src[(i)+1]-0x80)<=0x3f) {
+                        *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+                        i+=2;
+                    } else if( /* handle U+0080..U+07FF inline */
+                            ((c)<0xe0 && (c)>=0xc2) &&
+                            ((i)!=srcLength) &&
+                            (__t1=src[i]-0x80)<=0x3f) {
+                        *pDest++ = (((c)&0x1f)<<6)|__t1;
+                        ++(i);
+                    } else {
+                        if(c >= 0xf0 || subchar > 0xffff) {
+                            // We may read up to four bytes and write up to two UChars,
+                            // which we didn't account for with computing count,
+                            // so we adjust it here.
+                            if(--count == 0) {
+                                --i;  // back out byte c
+                                break;
+                            }
                         }
-                    }
 
-                    if(ch >= 0xf0 || subchar > 0xffff) {
-                        /*
-                         * We may read up to six bytes and write up to two UChars,
-                         * which we didn't account for with computing count,
-                         * so we adjust it here.
-                         */
-                        if(--count == 0) {
-                            break;
+                        /* function call for "complicated" and error cases */
+                        (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+                        if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                            *pErrorCode = U_INVALID_CHAR_FOUND;
+                            return NULL;
+                        } else if(c<=0xFFFF) {
+                            *(pDest++)=(UChar)c;
+                        } else {
+                            *(pDest++)=U16_LEAD(c);
+                            *(pDest++)=U16_TRAIL(c);
                         }
                     }
-
-                    /* function call for "complicated" and error cases */
-                    ++pSrc; /* continue after the lead byte */
-                    ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
-                    if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
-                        *pErrorCode = U_INVALID_CHAR_FOUND;
-                        return NULL;
-                    }else if(ch<=0xFFFF){
-                        *(pDest++)=(UChar)ch;
-                    }else{
-                        *(pDest++)=U16_LEAD(ch);
-                        *(pDest++)=U16_TRAIL(ch);
-                    }
                 }
             } while(--count > 0);
         }
 
-        while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
-            ch = *pSrc;
-            if(ch <= 0x7f){
-                *pDest++=(UChar)ch;
-                ++pSrc;
+        while(i < srcLength && (pDest < pDestLimit)) {
+            // modified copy of U8_NEXT()
+            c = (uint8_t)src[i++];
+            if(U8_IS_SINGLE(c)) {
+                *pDest++=(UChar)c;
             } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        ((pSrcLimit - pSrc) >= 3) &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
-                    ) {
-                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
-                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        ((pSrcLimit - pSrc) >= 2) &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
-                    ) {
-                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                        pSrc += 2;
-                        continue;
-                    }
-                }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                }else if(ch<=0xFFFF){
-                    *(pDest++)=(UChar)ch;
-                }else{
-                    *(pDest++)=U16_LEAD(ch);
-                    if(pDest<pDestLimit){
-                        *(pDest++)=U16_TRAIL(ch);
-                    }else{
-                        reqLength++;
-                        break;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        ((i)+1)<srcLength &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        ((i)!=srcLength) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    *pDest++ = (((c)&0x1f)<<6)|__t1;
+                    ++(i);
+                } else {
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
+                    } else if(c<=0xFFFF) {
+                        *(pDest++)=(UChar)c;
+                    } else {
+                        *(pDest++)=U16_LEAD(c);
+                        if(pDest<pDestLimit) {
+                            *(pDest++)=U16_TRAIL(c);
+                        } else {
+                            reqLength++;
+                            break;
+                        }
                     }
                 }
             }
         }
-        /* do not fill the dest buffer just count the UChars needed */
-        while(pSrc < pSrcLimit){
-            ch = *pSrc;
-            if(ch <= 0x7f){
-                reqLength++;
-                ++pSrc;
+
+        /* Pre-flight the rest of the string. */
+        while(i < srcLength) {
+            // modified copy of U8_NEXT()
+            c = (uint8_t)src[i++];
+            if(U8_IS_SINGLE(c)) {
+                ++reqLength;
             } else {
-                if(ch > 0xe0) {
-                    if( /* handle U+1000..U+CFFF inline */
-                        ch <= 0xec &&
-                        ((pSrcLimit - pSrc) >= 3) &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
-                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
-                    ) {
-                        reqLength++;
-                        pSrc += 3;
-                        continue;
-                    }
-                } else if(ch < 0xe0) {
-                    if( /* handle U+0080..U+07FF inline */
-                        ch >= 0xc2 &&
-                        ((pSrcLimit - pSrc) >= 2) &&
-                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
-                    ) {
-                        reqLength++;
-                        pSrc += 2;
-                        continue;
+                uint8_t __t1, __t2;
+                if( /* handle U+0800..U+FFFF inline */
+                        (0xe0<=(c) && (c)<0xf0) &&
+                        ((i)+1)<srcLength &&
+                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+                        (__t2=src[(i)+1]-0x80)<=0x3f) {
+                    ++reqLength;
+                    i+=2;
+                } else if( /* handle U+0080..U+07FF inline */
+                        ((c)<0xe0 && (c)>=0xc2) &&
+                        ((i)!=srcLength) &&
+                        (__t1=src[i]-0x80)<=0x3f) {
+                    ++reqLength;
+                    ++(i);
+                } else {
+                    /* function call for "complicated" and error cases */
+                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+                        *pErrorCode = U_INVALID_CHAR_FOUND;
+                        return NULL;
                     }
+                    reqLength += U16_LENGTH(c);
                 }
-
-                /* function call for "complicated" and error cases */
-                ++pSrc; /* continue after the lead byte */
-                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
-                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
-                    *pErrorCode = U_INVALID_CHAR_FOUND;
-                    return NULL;
-                }
-                reqLength+=U16_LENGTH(ch);
             }
         }
     }
@@ -747,7 +563,7 @@ u_strFromUTF8Lenient(UChar *dest,
     uint8_t* pSrc = (uint8_t*) src;
 
     /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)){
         return NULL;
     }
         
@@ -988,7 +804,7 @@ u_strToUTF8WithSub(char *dest,
     int32_t numSubstitutions;
 
     /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)){
         return NULL;
     }
         
@@ -1260,18 +1076,8 @@ u_strFromJavaModifiedUTF8WithSub(
         int32_t srcLength,
         UChar32 subchar, int32_t *pNumSubstitutions,
         UErrorCode *pErrorCode) {
-    UChar *pDest = dest;
-    UChar *pDestLimit = dest+destCapacity;
-    UChar32 ch;
-    int32_t reqLength = 0;
-    const uint8_t* pSrc = (const uint8_t*) src;
-    const uint8_t *pSrcLimit;
-    int32_t count;
-    uint8_t t1, t2; /* trail bytes */
-    int32_t numSubstitutions;
-
     /* args check */
-    if(U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)) {
         return NULL;
     }
     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
@@ -1285,18 +1091,22 @@ u_strFromJavaModifiedUTF8WithSub(
     if(pNumSubstitutions!=NULL) {
         *pNumSubstitutions=0;
     }
-    numSubstitutions=0;
+    UChar *pDest = dest;
+    UChar *pDestLimit = dest+destCapacity;
+    int32_t reqLength = 0;
+    int32_t numSubstitutions=0;
 
     if(srcLength < 0) {
         /*
          * Transform a NUL-terminated ASCII string.
          * Handle non-ASCII strings with slower code.
          */
-        while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
-            *pDest++=(UChar)ch;
-            ++pSrc;
+        UChar32 c;
+        while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
+            *pDest++=(UChar)c;
+            ++src;
         }
-        if(ch == 0) {
+        if(c == 0) {
             reqLength=(int32_t)(pDest - dest);
             if(pDestLength) {
                 *pDestLength = reqLength;
@@ -1306,33 +1116,38 @@ u_strFromJavaModifiedUTF8WithSub(
             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
             return dest;
         }
-        srcLength = uprv_strlen((const char *)pSrc);
+        srcLength = static_cast<int32_t>(uprv_strlen(src));
     }
 
-    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
-    pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
+    /* Faster loop without ongoing checking for srcLength and pDestLimit. */
+    UChar32 ch;
+    uint8_t t1, t2;
+    int32_t i = 0;
     for(;;) {
-        count = (int32_t)(pDestLimit - pDest);
-        srcLength = (int32_t)(pSrcLimit - pSrc);
-        if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
+        int32_t count = (int32_t)(pDestLimit - pDest);
+        int32_t count2 = srcLength - i;
+        if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
             /* fast ASCII loop */
-            const uint8_t *prevSrc = pSrc;
-            int32_t delta;
-            while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
-                *pDest++=(UChar)ch;
-                ++pSrc;
+            int32_t start = i;
+            uint8_t b;
+            while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
+                *pDest++=b;
+                ++i;
             }
-            delta = (int32_t)(pSrc - prevSrc);
+            int32_t delta = i - start;
             count -= delta;
-            srcLength -= delta;
+            count2 -= delta;
         }
         /*
          * Each iteration of the inner loop progresses by at most 3 UTF-8
          * bytes and one UChar.
          */
-        srcLength /= 3;
-        if(count > srcLength) {
-            count = srcLength; /* min(remaining dest, remaining src/3) */
+        if(subchar > 0xFFFF) {
+            break;
+        }
+        count2 /= 3;
+        if(count > count2) {
+            count = count2; /* min(remaining dest, remaining src/3) */
         }
         if(count < 3) {
             /*
@@ -1342,29 +1157,28 @@ u_strFromJavaModifiedUTF8WithSub(
             break;
         }
         do {
-            ch = *pSrc;
-            if(ch <= 0x7f){
+            ch = (uint8_t)src[i++];
+            if(U8_IS_SINGLE(ch)) {
                 *pDest++=(UChar)ch;
-                ++pSrc;
             } else {
                 if(ch >= 0xe0) {
                     if( /* handle U+0000..U+FFFF inline */
                         ch <= 0xef &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                        (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
+                        (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
                     ) {
                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                        pSrc += 3;
+                        i += 2;
                         continue;
                     }
                 } else {
                     if( /* handle U+0000..U+07FF inline */
                         ch >= 0xc0 &&
-                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                        (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
                     ) {
                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                        pSrc += 2;
+                        ++i;
                         continue;
                     }
                 }
@@ -1377,49 +1191,43 @@ u_strFromJavaModifiedUTF8WithSub(
                      * We need to write two UChars, adjusted count for that,
                      * and ran out of space.
                      */
+                    --i;  // back out byte ch
                     break;
                 } else {
                     /* function call for error cases */
-                    ++pSrc; /* continue after the lead byte */
-                    utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                    utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                     ++numSubstitutions;
-                    if(subchar<=0xFFFF) {
-                        *(pDest++)=(UChar)subchar;
-                    } else {
-                        *(pDest++)=U16_LEAD(subchar);
-                        *(pDest++)=U16_TRAIL(subchar);
-                    }
+                    *(pDest++)=(UChar)subchar;
                 }
             }
         } while(--count > 0);
     }
 
-    while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
-        ch = *pSrc;
-        if(ch <= 0x7f){
+    while(i < srcLength && (pDest < pDestLimit)) {
+        ch = (uint8_t)src[i++];
+        if(U8_IS_SINGLE(ch)){
             *pDest++=(UChar)ch;
-            ++pSrc;
         } else {
             if(ch >= 0xe0) {
                 if( /* handle U+0000..U+FFFF inline */
                     ch <= 0xef &&
-                    ((pSrcLimit - pSrc) >= 3) &&
-                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
-                    (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                    (i+1) < srcLength &&
+                    (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
+                    (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
                 ) {
                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
-                    pSrc += 3;
+                    i += 2;
                     continue;
                 }
             } else {
                 if( /* handle U+0000..U+07FF inline */
                     ch >= 0xc0 &&
-                    ((pSrcLimit - pSrc) >= 2) &&
-                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                    i < srcLength &&
+                    (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
                 ) {
                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
-                    pSrc += 2;
+                    ++i;
                     continue;
                 }
             }
@@ -1429,8 +1237,7 @@ u_strFromJavaModifiedUTF8WithSub(
                 return NULL;
             } else {
                 /* function call for error cases */
-                ++pSrc; /* continue after the lead byte */
-                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                 ++numSubstitutions;
                 if(subchar<=0xFFFF) {
                     *(pDest++)=(UChar)subchar;
@@ -1447,32 +1254,31 @@ u_strFromJavaModifiedUTF8WithSub(
         }
     }
 
-    /* do not fill the dest buffer just count the UChars needed */
-    while(pSrc < pSrcLimit){
-        ch = *pSrc;
-        if(ch <= 0x7f) {
+    /* Pre-flight the rest of the string. */
+    while(i < srcLength) {
+        ch = (uint8_t)src[i++];
+        if(U8_IS_SINGLE(ch)) {
             reqLength++;
-            ++pSrc;
         } else {
             if(ch >= 0xe0) {
                 if( /* handle U+0000..U+FFFF inline */
                     ch <= 0xef &&
-                    ((pSrcLimit - pSrc) >= 3) &&
-                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
-                    (uint8_t)(pSrc[2] - 0x80) <= 0x3f
+                    (i+1) < srcLength &&
+                    (uint8_t)(src[i] - 0x80) <= 0x3f &&
+                    (uint8_t)(src[i+1] - 0x80) <= 0x3f
                 ) {
                     reqLength++;
-                    pSrc += 3;
+                    i += 2;
                     continue;
                 }
             } else {
                 if( /* handle U+0000..U+07FF inline */
                     ch >= 0xc0 &&
-                    ((pSrcLimit - pSrc) >= 2) &&
-                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f
+                    i < srcLength &&
+                    (uint8_t)(src[i] - 0x80) <= 0x3f
                 ) {
                     reqLength++;
-                    pSrc += 2;
+                    ++i;
                     continue;
                 }
             }
@@ -1482,8 +1288,7 @@ u_strFromJavaModifiedUTF8WithSub(
                 return NULL;
             } else {
                 /* function call for error cases */
-                ++pSrc; /* continue after the lead byte */
-                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                 ++numSubstitutions;
                 reqLength+=U16_LENGTH(ch);
             }