X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/57a6839dcb3bba09e8228b822b290604668416fe..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/common/ustrtrns.cpp?ds=sidebyside diff --git a/icuSources/common/ustrtrns.cpp b/icuSources/common/ustrtrns.cpp index fbb7b46f..5dc032c0 100644 --- a/icuSources/common/ustrtrns.cpp +++ b/icuSources/common/ustrtrns.cpp @@ -1,7 +1,9 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * -* Copyright (C) 2001-2013, International Business Machines +* Copyright (C) 2001-2016, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -34,8 +36,6 @@ #include "ustr_imp.h" #include "uassert.h" -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - U_CAPI UChar* U_EXPORT2 u_strFromUTF32WithSub(UChar *dest, int32_t destCapacity, @@ -256,148 +256,6 @@ u_strToUTF32(UChar32 *dest, pErrorCode); } -/* for utf8_nextCharSafeBodyTerminated() */ -static const UChar32 -utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; - -/* - * Version of utf8_nextCharSafeBody() with the following differences: - * - checks for NUL termination instead of length - * - works with pointers instead of indexes - * - always strict (strict==-1) - * - * *ps points to after the lead byte and will be moved to after the last trail byte. - * c is the lead byte. - * @return the code point, or U_SENTINEL - */ -static UChar32 -utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { - const uint8_t *s=*ps; - uint8_t trail, illegal=0; - uint8_t count=U8_COUNT_TRAIL_BYTES(c); - U_ASSERT(count<6); - U8_MASK_LEAD_BYTE((c), count); - /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ - switch(count) { - /* each branch falls through to the next one */ - case 5: - case 4: - /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ - illegal=1; - break; - case 3: - trail=(uint8_t)(*s++ - 0x80); - c=(c<<6)|trail; - if(trail>0x3f || c>=0x110) { - /* not a trail byte, or code point>0x10ffff (outside Unicode) */ - illegal=1; - break; - } - case 2: /*fall through*/ - trail=(uint8_t)(*s++ - 0x80); - if(trail>0x3f) { - /* not a trail byte */ - illegal=1; - break; - } - c=(c<<6)|trail; - case 1: /*fall through*/ - trail=(uint8_t)(*s++ - 0x80); - if(trail>0x3f) { - /* not a trail byte */ - illegal=1; - } - c=(c<<6)|trail; - break; - case 0: - return U_SENTINEL; - /* no default branch to optimize switch() - all values are covered */ - } - - /* correct sequence - all trail bytes have (b7..b6)==(10)? */ - /* illegal is also set if count>=4 */ - if(illegal || c0 && U8_IS_TRAIL(*s)) { - ++s; - --count; - } - c=U_SENTINEL; - } - *ps=s; - return c; -} - -/* - * Version of utf8_nextCharSafeBody() with the following differences: - * - works with pointers instead of indexes - * - always strict (strict==-1) - * - * *ps points to after the lead byte and will be moved to after the last trail byte. - * c is the lead byte. - * @return the code point, or U_SENTINEL - */ -static UChar32 -utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { - const uint8_t *s=*ps; - uint8_t trail, illegal=0; - uint8_t count=U8_COUNT_TRAIL_BYTES(c); - if((limit-s)>=count) { - U8_MASK_LEAD_BYTE((c), count); - /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ - switch(count) { - /* each branch falls through to the next one */ - case 5: - case 4: - /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ - illegal=1; - break; - case 3: - trail=*s++; - c=(c<<6)|(trail&0x3f); - if(c<0x110) { - illegal|=(trail&0xc0)^0x80; - } else { - /* code point>0x10ffff, outside Unicode */ - illegal=1; - break; - } - case 2: /*fall through*/ - trail=*s++; - c=(c<<6)|(trail&0x3f); - illegal|=(trail&0xc0)^0x80; - case 1: /*fall through*/ - trail=*s++; - c=(c<<6)|(trail&0x3f); - illegal|=(trail&0xc0)^0x80; - break; - case 0: - return U_SENTINEL; - /* no default branch to optimize switch() - all values are covered */ - } - } else { - illegal=1; /* too few bytes left */ - } - - /* correct sequence - all trail bytes have (b7..b6)==(10)? */ - /* illegal is also set if count>=4 */ - U_ASSERT(illegal || count0 && s 0) || subchar > 0x10ffff || U_IS_SURROGATE(subchar) @@ -430,7 +279,10 @@ u_strFromUTF8WithSub(UChar *dest, if(pNumSubstitutions!=NULL) { *pNumSubstitutions=0; } - numSubstitutions=0; + UChar *pDest = dest; + UChar *pDestLimit = dest+destCapacity; + int32_t reqLength = 0; + int32_t numSubstitutions=0; /* * Inline processing of UTF-8 byte sequences: @@ -451,95 +303,81 @@ u_strFromUTF8WithSub(UChar *dest, * The code explicitly checks for NULs only in the lead byte position. * A NUL byte in the trail byte position fails the trail byte range check anyway. */ - while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { - if(ch <= 0x7f){ - *pDest++=(UChar)ch; - ++pSrc; + int32_t i; + UChar32 c; + for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) { + // modified copy of U8_NEXT() + ++i; + if(U8_IS_SINGLE(c)) { + *pDest++=(UChar)c; } else { - if(ch > 0xe0) { - if( /* handle U+1000..U+CFFF inline */ - ch <= 0xec && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f - ) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - pSrc += 3; - continue; - } - } else if(ch < 0xe0) { - if( /* handle U+0080..U+07FF inline */ - ch >= 0xc2 && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f - ) { - *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - pSrc += 2; - continue; - } - } - - /* function call for "complicated" and error cases */ - ++pSrc; /* continue after the lead byte */ - ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); - if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } else if(ch<=0xFFFF) { - *(pDest++)=(UChar)ch; + uint8_t __t1, __t2; + if( /* handle U+0800..U+FFFF inline */ + (0xe0<=(c) && (c)<0xf0) && + U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && + (__t2=src[(i)+1]-0x80)<=0x3f) { + *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; + i+=2; + } else if( /* handle U+0080..U+07FF inline */ + ((c)<0xe0 && (c)>=0xc2) && + (__t1=src[i]-0x80)<=0x3f) { + *pDest++ = (((c)&0x1f)<<6)|__t1; + ++(i); } else { - *(pDest++)=U16_LEAD(ch); - if(pDest 0xe0) { - if( /* handle U+1000..U+CFFF inline */ - ch <= 0xec && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f && - (uint8_t)(pSrc[2] - 0x80) <= 0x3f - ) { - ++reqLength; - pSrc += 3; - continue; - } - } else if(ch < 0xe0) { - if( /* handle U+0080..U+07FF inline */ - ch >= 0xc2 && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f - ) { - ++reqLength; - pSrc += 2; - continue; + uint8_t __t1, __t2; + if( /* handle U+0800..U+FFFF inline */ + (0xe0<=(c) && (c)<0xf0) && + U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && + (__t2=src[(i)+1]-0x80)<=0x3f) { + ++reqLength; + i+=2; + } else if( /* handle U+0080..U+07FF inline */ + ((c)<0xe0 && (c)>=0xc2) && + (__t1=src[i]-0x80)<=0x3f) { + ++reqLength; + ++(i); + } else { + /* function call for "complicated" and error cases */ + (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1); + if(c<0 && (++numSubstitutions, c = subchar) < 0) { + *pErrorCode = U_INVALID_CHAR_FOUND; + return NULL; } + reqLength += U16_LENGTH(c); } - - /* function call for "complicated" and error cases */ - ++pSrc; /* continue after the lead byte */ - ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); - if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - reqLength += U16_LENGTH(ch); } } } else /* srcLength >= 0 */ { - const uint8_t *pSrcLimit = pSrc + srcLength; - int32_t count; - - /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ + /* Faster loop without ongoing checking for srcLength and pDestLimit. */ + int32_t i = 0; + UChar32 c; for(;;) { /* * Each iteration of the inner loop progresses by at most 3 UTF-8 @@ -547,10 +385,10 @@ u_strFromUTF8WithSub(UChar *dest, * For supplementary code points (4 & 2), which are rare, * there is an additional adjustment. */ - count = (int32_t)(pDestLimit - pDest); - srcLength = (int32_t)((pSrcLimit - pSrc) / 3); - if(count > srcLength) { - count = srcLength; /* min(remaining dest, remaining src/3) */ + int32_t count = (int32_t)(pDestLimit - pDest); + int32_t count2 = (srcLength - i) / 3; + if(count > count2) { + count = count2; /* min(remaining dest, remaining src/3) */ } if(count < 3) { /* @@ -561,147 +399,123 @@ u_strFromUTF8WithSub(UChar *dest, } do { - ch = *pSrc; - if(ch <= 0x7f){ - *pDest++=(UChar)ch; - ++pSrc; + // modified copy of U8_NEXT() + c = (uint8_t)src[i++]; + if(U8_IS_SINGLE(c)) { + *pDest++=(UChar)c; } else { - if(ch > 0xe0) { - if( /* handle U+1000..U+CFFF inline */ - ch <= 0xec && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f - ) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - pSrc += 3; - continue; - } - } else if(ch < 0xe0) { - if( /* handle U+0080..U+07FF inline */ - ch >= 0xc2 && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f - ) { - *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - pSrc += 2; - continue; + uint8_t __t1, __t2; + if( /* handle U+0800..U+FFFF inline */ + (0xe0<=(c) && (c)<0xf0) && + ((i)+1)=0xc2) && + ((i)!=srcLength) && + (__t1=src[i]-0x80)<=0x3f) { + *pDest++ = (((c)&0x1f)<<6)|__t1; + ++(i); + } else { + if(c >= 0xf0 || subchar > 0xffff) { + // We may read up to four bytes and write up to two UChars, + // which we didn't account for with computing count, + // so we adjust it here. + if(--count == 0) { + --i; // back out byte c + break; + } } - } - if(ch >= 0xf0 || subchar > 0xffff) { - /* - * We may read up to six bytes and write up to two UChars, - * which we didn't account for with computing count, - * so we adjust it here. - */ - if(--count == 0) { - break; + /* function call for "complicated" and error cases */ + (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); + if(c<0 && (++numSubstitutions, c = subchar) < 0) { + *pErrorCode = U_INVALID_CHAR_FOUND; + return NULL; + } else if(c<=0xFFFF) { + *(pDest++)=(UChar)c; + } else { + *(pDest++)=U16_LEAD(c); + *(pDest++)=U16_TRAIL(c); } } - - /* function call for "complicated" and error cases */ - ++pSrc; /* continue after the lead byte */ - ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); - if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - }else if(ch<=0xFFFF){ - *(pDest++)=(UChar)ch; - }else{ - *(pDest++)=U16_LEAD(ch); - *(pDest++)=U16_TRAIL(ch); - } } } while(--count > 0); } - while((pSrc 0xe0) { - if( /* handle U+1000..U+CFFF inline */ - ch <= 0xec && - ((pSrcLimit - pSrc) >= 3) && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f - ) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - pSrc += 3; - continue; - } - } else if(ch < 0xe0) { - if( /* handle U+0080..U+07FF inline */ - ch >= 0xc2 && - ((pSrcLimit - pSrc) >= 2) && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f - ) { - *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - pSrc += 2; - continue; - } - } - - /* function call for "complicated" and error cases */ - ++pSrc; /* continue after the lead byte */ - ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); - if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - }else if(ch<=0xFFFF){ - *(pDest++)=(UChar)ch; - }else{ - *(pDest++)=U16_LEAD(ch); - if(pDest=0xc2) && + ((i)!=srcLength) && + (__t1=src[i]-0x80)<=0x3f) { + *pDest++ = (((c)&0x1f)<<6)|__t1; + ++(i); + } else { + /* function call for "complicated" and error cases */ + (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); + if(c<0 && (++numSubstitutions, c = subchar) < 0) { + *pErrorCode = U_INVALID_CHAR_FOUND; + return NULL; + } else if(c<=0xFFFF) { + *(pDest++)=(UChar)c; + } else { + *(pDest++)=U16_LEAD(c); + if(pDest 0xe0) { - if( /* handle U+1000..U+CFFF inline */ - ch <= 0xec && - ((pSrcLimit - pSrc) >= 3) && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f && - (uint8_t)(pSrc[2] - 0x80) <= 0x3f - ) { - reqLength++; - pSrc += 3; - continue; - } - } else if(ch < 0xe0) { - if( /* handle U+0080..U+07FF inline */ - ch >= 0xc2 && - ((pSrcLimit - pSrc) >= 2) && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f - ) { - reqLength++; - pSrc += 2; - continue; + uint8_t __t1, __t2; + if( /* handle U+0800..U+FFFF inline */ + (0xe0<=(c) && (c)<0xf0) && + ((i)+1)=0xc2) && + ((i)!=srcLength) && + (__t1=src[i]-0x80)<=0x3f) { + ++reqLength; + ++(i); + } else { + /* function call for "complicated" and error cases */ + (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); + if(c<0 && (++numSubstitutions, c = subchar) < 0) { + *pErrorCode = U_INVALID_CHAR_FOUND; + return NULL; } + reqLength += U16_LENGTH(c); } - - /* function call for "complicated" and error cases */ - ++pSrc; /* continue after the lead byte */ - ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); - if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - reqLength+=U16_LENGTH(ch); } } } @@ -749,7 +563,7 @@ u_strFromUTF8Lenient(UChar *dest, uint8_t* pSrc = (uint8_t*) src; /* args check */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ + if(U_FAILURE(*pErrorCode)){ return NULL; } @@ -990,7 +804,7 @@ u_strToUTF8WithSub(char *dest, int32_t numSubstitutions; /* args check */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ + if(U_FAILURE(*pErrorCode)){ return NULL; } @@ -1262,18 +1076,8 @@ u_strFromJavaModifiedUTF8WithSub( int32_t srcLength, UChar32 subchar, int32_t *pNumSubstitutions, UErrorCode *pErrorCode) { - UChar *pDest = dest; - UChar *pDestLimit = dest+destCapacity; - UChar32 ch; - int32_t reqLength = 0; - const uint8_t* pSrc = (const uint8_t*) src; - const uint8_t *pSrcLimit; - int32_t count; - uint8_t t1, t2; /* trail bytes */ - int32_t numSubstitutions; - /* args check */ - if(U_FAILURE(*pErrorCode)){ + if(U_FAILURE(*pErrorCode)) { return NULL; } if( (src==NULL && srcLength!=0) || srcLength < -1 || @@ -1287,18 +1091,22 @@ u_strFromJavaModifiedUTF8WithSub( if(pNumSubstitutions!=NULL) { *pNumSubstitutions=0; } - numSubstitutions=0; + UChar *pDest = dest; + UChar *pDestLimit = dest+destCapacity; + int32_t reqLength = 0; + int32_t numSubstitutions=0; if(srcLength < 0) { /* * Transform a NUL-terminated ASCII string. * Handle non-ASCII strings with slower code. */ - while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { - *pDest++=(UChar)ch; - ++pSrc; + UChar32 c; + while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) { + *pDest++=(UChar)c; + ++src; } - if(ch == 0) { + if(c == 0) { reqLength=(int32_t)(pDest - dest); if(pDestLength) { *pDestLength = reqLength; @@ -1308,33 +1116,38 @@ u_strFromJavaModifiedUTF8WithSub( u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); return dest; } - srcLength = uprv_strlen((const char *)pSrc); + srcLength = static_cast(uprv_strlen(src)); } - /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ - pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength; + /* Faster loop without ongoing checking for srcLength and pDestLimit. */ + UChar32 ch; + uint8_t t1, t2; + int32_t i = 0; for(;;) { - count = (int32_t)(pDestLimit - pDest); - srcLength = (int32_t)(pSrcLimit - pSrc); - if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { + int32_t count = (int32_t)(pDestLimit - pDest); + int32_t count2 = srcLength - i; + if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) { /* fast ASCII loop */ - const uint8_t *prevSrc = pSrc; - int32_t delta; - while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { - *pDest++=(UChar)ch; - ++pSrc; + int32_t start = i; + uint8_t b; + while(i < srcLength && U8_IS_SINGLE(b = src[i])) { + *pDest++=b; + ++i; } - delta = (int32_t)(pSrc - prevSrc); + int32_t delta = i - start; count -= delta; - srcLength -= delta; + count2 -= delta; } /* * Each iteration of the inner loop progresses by at most 3 UTF-8 * bytes and one UChar. */ - srcLength /= 3; - if(count > srcLength) { - count = srcLength; /* min(remaining dest, remaining src/3) */ + if(subchar > 0xFFFF) { + break; + } + count2 /= 3; + if(count > count2) { + count = count2; /* min(remaining dest, remaining src/3) */ } if(count < 3) { /* @@ -1344,29 +1157,28 @@ u_strFromJavaModifiedUTF8WithSub( break; } do { - ch = *pSrc; - if(ch <= 0x7f){ + ch = (uint8_t)src[i++]; + if(U8_IS_SINGLE(ch)) { *pDest++=(UChar)ch; - ++pSrc; } else { if(ch >= 0xe0) { if( /* handle U+0000..U+FFFF inline */ ch <= 0xef && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f + (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f && + (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f ) { /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - pSrc += 3; + i += 2; continue; } } else { if( /* handle U+0000..U+07FF inline */ ch >= 0xc0 && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f + (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f ) { *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - pSrc += 2; + ++i; continue; } } @@ -1379,49 +1191,43 @@ u_strFromJavaModifiedUTF8WithSub( * We need to write two UChars, adjusted count for that, * and ran out of space. */ + --i; // back out byte ch break; } else { /* function call for error cases */ - ++pSrc; /* continue after the lead byte */ - utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); + utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); ++numSubstitutions; - if(subchar<=0xFFFF) { - *(pDest++)=(UChar)subchar; - } else { - *(pDest++)=U16_LEAD(subchar); - *(pDest++)=U16_TRAIL(subchar); - } + *(pDest++)=(UChar)subchar; } } } while(--count > 0); } - while((pSrc= 0xe0) { if( /* handle U+0000..U+FFFF inline */ ch <= 0xef && - ((pSrcLimit - pSrc) >= 3) && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f + (i+1) < srcLength && + (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f && + (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f ) { /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - pSrc += 3; + i += 2; continue; } } else { if( /* handle U+0000..U+07FF inline */ ch >= 0xc0 && - ((pSrcLimit - pSrc) >= 2) && - (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f + i < srcLength && + (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f ) { *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - pSrc += 2; + ++i; continue; } } @@ -1431,8 +1237,7 @@ u_strFromJavaModifiedUTF8WithSub( return NULL; } else { /* function call for error cases */ - ++pSrc; /* continue after the lead byte */ - utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); + utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); ++numSubstitutions; if(subchar<=0xFFFF) { *(pDest++)=(UChar)subchar; @@ -1449,32 +1254,31 @@ u_strFromJavaModifiedUTF8WithSub( } } - /* do not fill the dest buffer just count the UChars needed */ - while(pSrc < pSrcLimit){ - ch = *pSrc; - if(ch <= 0x7f) { + /* Pre-flight the rest of the string. */ + while(i < srcLength) { + ch = (uint8_t)src[i++]; + if(U8_IS_SINGLE(ch)) { reqLength++; - ++pSrc; } else { if(ch >= 0xe0) { if( /* handle U+0000..U+FFFF inline */ ch <= 0xef && - ((pSrcLimit - pSrc) >= 3) && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f && - (uint8_t)(pSrc[2] - 0x80) <= 0x3f + (i+1) < srcLength && + (uint8_t)(src[i] - 0x80) <= 0x3f && + (uint8_t)(src[i+1] - 0x80) <= 0x3f ) { reqLength++; - pSrc += 3; + i += 2; continue; } } else { if( /* handle U+0000..U+07FF inline */ ch >= 0xc0 && - ((pSrcLimit - pSrc) >= 2) && - (uint8_t)(pSrc[1] - 0x80) <= 0x3f + i < srcLength && + (uint8_t)(src[i] - 0x80) <= 0x3f ) { reqLength++; - pSrc += 2; + ++i; continue; } } @@ -1484,8 +1288,7 @@ u_strFromJavaModifiedUTF8WithSub( return NULL; } else { /* function call for error cases */ - ++pSrc; /* continue after the lead byte */ - utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); + utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); ++numSubstitutions; reqLength+=U16_LENGTH(ch); }