+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
-* Copyright (C) 2001-2013, International Business Machines
+* Copyright (C) 2001-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
#include "ustr_imp.h"
#include "uassert.h"
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
-
U_CAPI UChar* U_EXPORT2
u_strFromUTF32WithSub(UChar *dest,
int32_t destCapacity,
pErrorCode);
}
-/* for utf8_nextCharSafeBodyTerminated() */
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
-/*
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - checks for NUL termination instead of length
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- */
-static UChar32
-utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
- const uint8_t *s=*ps;
- uint8_t trail, illegal=0;
- uint8_t count=U8_COUNT_TRAIL_BYTES(c);
- U_ASSERT(count<6);
- U8_MASK_LEAD_BYTE((c), count);
- /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
- switch(count) {
- /* each branch falls through to the next one */
- case 5:
- case 4:
- /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
- illegal=1;
- break;
- case 3:
- trail=(uint8_t)(*s++ - 0x80);
- c=(c<<6)|trail;
- if(trail>0x3f || c>=0x110) {
- /* not a trail byte, or code point>0x10ffff (outside Unicode) */
- illegal=1;
- break;
- }
- case 2: /*fall through*/
- trail=(uint8_t)(*s++ - 0x80);
- if(trail>0x3f) {
- /* not a trail byte */
- illegal=1;
- break;
- }
- c=(c<<6)|trail;
- case 1: /*fall through*/
- trail=(uint8_t)(*s++ - 0x80);
- if(trail>0x3f) {
- /* not a trail byte */
- illegal=1;
- }
- c=(c<<6)|trail;
- break;
- case 0:
- return U_SENTINEL;
- /* no default branch to optimize switch() - all values are covered */
- }
-
- /* correct sequence - all trail bytes have (b7..b6)==(10)? */
- /* illegal is also set if count>=4 */
- if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
- /* error handling */
- /* don't go beyond this sequence */
- s=*ps;
- while(count>0 && U8_IS_TRAIL(*s)) {
- ++s;
- --count;
- }
- c=U_SENTINEL;
- }
- *ps=s;
- return c;
-}
-
-/*
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- */
-static UChar32
-utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
- const uint8_t *s=*ps;
- uint8_t trail, illegal=0;
- uint8_t count=U8_COUNT_TRAIL_BYTES(c);
- if((limit-s)>=count) {
- U8_MASK_LEAD_BYTE((c), count);
- /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
- switch(count) {
- /* each branch falls through to the next one */
- case 5:
- case 4:
- /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
- illegal=1;
- break;
- case 3:
- trail=*s++;
- c=(c<<6)|(trail&0x3f);
- if(c<0x110) {
- illegal|=(trail&0xc0)^0x80;
- } else {
- /* code point>0x10ffff, outside Unicode */
- illegal=1;
- break;
- }
- case 2: /*fall through*/
- trail=*s++;
- c=(c<<6)|(trail&0x3f);
- illegal|=(trail&0xc0)^0x80;
- case 1: /*fall through*/
- trail=*s++;
- c=(c<<6)|(trail&0x3f);
- illegal|=(trail&0xc0)^0x80;
- break;
- case 0:
- return U_SENTINEL;
- /* no default branch to optimize switch() - all values are covered */
- }
- } else {
- illegal=1; /* too few bytes left */
- }
-
- /* correct sequence - all trail bytes have (b7..b6)==(10)? */
- /* illegal is also set if count>=4 */
- U_ASSERT(illegal || count<LENGTHOF(utf8_minLegal));
- if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
- /* error handling */
- /* don't go beyond this sequence */
- s=*ps;
- while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
- ++s;
- --count;
- }
- c=U_SENTINEL;
- }
- *ps=s;
- return c;
-}
-
U_CAPI UChar* U_EXPORT2
u_strFromUTF8WithSub(UChar *dest,
int32_t destCapacity,
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode){
- UChar *pDest = dest;
- UChar *pDestLimit = dest+destCapacity;
- UChar32 ch;
- int32_t reqLength = 0;
- const uint8_t* pSrc = (const uint8_t*) src;
- uint8_t t1, t2; /* trail bytes */
- int32_t numSubstitutions;
-
/* args check */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+ if(U_FAILURE(*pErrorCode)) {
return NULL;
}
-
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
(destCapacity<0) || (dest == NULL && destCapacity > 0) ||
subchar > 0x10ffff || U_IS_SURROGATE(subchar)
if(pNumSubstitutions!=NULL) {
*pNumSubstitutions=0;
}
- numSubstitutions=0;
+ UChar *pDest = dest;
+ UChar *pDestLimit = dest+destCapacity;
+ int32_t reqLength = 0;
+ int32_t numSubstitutions=0;
/*
* Inline processing of UTF-8 byte sequences:
* The code explicitly checks for NULs only in the lead byte position.
* A NUL byte in the trail byte position fails the trail byte range check anyway.
*/
- while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
- if(ch <= 0x7f){
- *pDest++=(UChar)ch;
- ++pSrc;
+ int32_t i;
+ UChar32 c;
+ for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
+ // modified copy of U8_NEXT()
+ ++i;
+ if(U8_IS_SINGLE(c)) {
+ *pDest++=(UChar)c;
} else {
- if(ch > 0xe0) {
- if( /* handle U+1000..U+CFFF inline */
- ch <= 0xec &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
- ) {
- /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
- *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
- pSrc += 3;
- continue;
- }
- } else if(ch < 0xe0) {
- if( /* handle U+0080..U+07FF inline */
- ch >= 0xc2 &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
- ) {
- *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
- pSrc += 2;
- continue;
- }
- }
-
- /* function call for "complicated" and error cases */
- ++pSrc; /* continue after the lead byte */
- ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
- if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return NULL;
- } else if(ch<=0xFFFF) {
- *(pDest++)=(UChar)ch;
+ uint8_t __t1, __t2;
+ if( /* handle U+0800..U+FFFF inline */
+ (0xe0<=(c) && (c)<0xf0) &&
+ U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+ (__t2=src[(i)+1]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+ i+=2;
+ } else if( /* handle U+0080..U+07FF inline */
+ ((c)<0xe0 && (c)>=0xc2) &&
+ (__t1=src[i]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0x1f)<<6)|__t1;
+ ++(i);
} else {
- *(pDest++)=U16_LEAD(ch);
- if(pDest<pDestLimit) {
- *(pDest++)=U16_TRAIL(ch);
+ /* function call for "complicated" and error cases */
+ (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
+ if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+ *pErrorCode = U_INVALID_CHAR_FOUND;
+ return NULL;
+ } else if(c<=0xFFFF) {
+ *(pDest++)=(UChar)c;
} else {
- reqLength++;
- break;
+ *(pDest++)=U16_LEAD(c);
+ if(pDest<pDestLimit) {
+ *(pDest++)=U16_TRAIL(c);
+ } else {
+ reqLength++;
+ break;
+ }
}
}
}
}
/* Pre-flight the rest of the string. */
- while((ch = *pSrc) != 0) {
- if(ch <= 0x7f){
+ while((c = (uint8_t)src[i]) != 0) {
+ // modified copy of U8_NEXT()
+ ++i;
+ if(U8_IS_SINGLE(c)) {
++reqLength;
- ++pSrc;
} else {
- if(ch > 0xe0) {
- if( /* handle U+1000..U+CFFF inline */
- ch <= 0xec &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
- (uint8_t)(pSrc[2] - 0x80) <= 0x3f
- ) {
- ++reqLength;
- pSrc += 3;
- continue;
- }
- } else if(ch < 0xe0) {
- if( /* handle U+0080..U+07FF inline */
- ch >= 0xc2 &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f
- ) {
- ++reqLength;
- pSrc += 2;
- continue;
+ uint8_t __t1, __t2;
+ if( /* handle U+0800..U+FFFF inline */
+ (0xe0<=(c) && (c)<0xf0) &&
+ U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+ (__t2=src[(i)+1]-0x80)<=0x3f) {
+ ++reqLength;
+ i+=2;
+ } else if( /* handle U+0080..U+07FF inline */
+ ((c)<0xe0 && (c)>=0xc2) &&
+ (__t1=src[i]-0x80)<=0x3f) {
+ ++reqLength;
+ ++(i);
+ } else {
+ /* function call for "complicated" and error cases */
+ (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
+ if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+ *pErrorCode = U_INVALID_CHAR_FOUND;
+ return NULL;
}
+ reqLength += U16_LENGTH(c);
}
-
- /* function call for "complicated" and error cases */
- ++pSrc; /* continue after the lead byte */
- ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
- if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return NULL;
- }
- reqLength += U16_LENGTH(ch);
}
}
} else /* srcLength >= 0 */ {
- const uint8_t *pSrcLimit = pSrc + srcLength;
- int32_t count;
-
- /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
+ /* Faster loop without ongoing checking for srcLength and pDestLimit. */
+ int32_t i = 0;
+ UChar32 c;
for(;;) {
/*
* Each iteration of the inner loop progresses by at most 3 UTF-8
* For supplementary code points (4 & 2), which are rare,
* there is an additional adjustment.
*/
- count = (int32_t)(pDestLimit - pDest);
- srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
- if(count > srcLength) {
- count = srcLength; /* min(remaining dest, remaining src/3) */
+ int32_t count = (int32_t)(pDestLimit - pDest);
+ int32_t count2 = (srcLength - i) / 3;
+ if(count > count2) {
+ count = count2; /* min(remaining dest, remaining src/3) */
}
if(count < 3) {
/*
}
do {
- ch = *pSrc;
- if(ch <= 0x7f){
- *pDest++=(UChar)ch;
- ++pSrc;
+ // modified copy of U8_NEXT()
+ c = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(c)) {
+ *pDest++=(UChar)c;
} else {
- if(ch > 0xe0) {
- if( /* handle U+1000..U+CFFF inline */
- ch <= 0xec &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
- ) {
- /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
- *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
- pSrc += 3;
- continue;
- }
- } else if(ch < 0xe0) {
- if( /* handle U+0080..U+07FF inline */
- ch >= 0xc2 &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
- ) {
- *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
- pSrc += 2;
- continue;
+ uint8_t __t1, __t2;
+ if( /* handle U+0800..U+FFFF inline */
+ (0xe0<=(c) && (c)<0xf0) &&
+ ((i)+1)<srcLength &&
+ U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+ (__t2=src[(i)+1]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+ i+=2;
+ } else if( /* handle U+0080..U+07FF inline */
+ ((c)<0xe0 && (c)>=0xc2) &&
+ ((i)!=srcLength) &&
+ (__t1=src[i]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0x1f)<<6)|__t1;
+ ++(i);
+ } else {
+ if(c >= 0xf0 || subchar > 0xffff) {
+ // We may read up to four bytes and write up to two UChars,
+ // which we didn't account for when computing count,
+ // so we adjust it here.
+ if(--count == 0) {
+ --i; // back out byte c
+ break;
+ }
}
- }
- if(ch >= 0xf0 || subchar > 0xffff) {
- /*
- * We may read up to six bytes and write up to two UChars,
- * which we didn't account for with computing count,
- * so we adjust it here.
- */
- if(--count == 0) {
- break;
+ /* function call for "complicated" and error cases */
+ (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+ if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+ *pErrorCode = U_INVALID_CHAR_FOUND;
+ return NULL;
+ } else if(c<=0xFFFF) {
+ *(pDest++)=(UChar)c;
+ } else {
+ *(pDest++)=U16_LEAD(c);
+ *(pDest++)=U16_TRAIL(c);
}
}
-
- /* function call for "complicated" and error cases */
- ++pSrc; /* continue after the lead byte */
- ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
- if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return NULL;
- }else if(ch<=0xFFFF){
- *(pDest++)=(UChar)ch;
- }else{
- *(pDest++)=U16_LEAD(ch);
- *(pDest++)=U16_TRAIL(ch);
- }
}
} while(--count > 0);
}
- while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
- ch = *pSrc;
- if(ch <= 0x7f){
- *pDest++=(UChar)ch;
- ++pSrc;
+ while(i < srcLength && (pDest < pDestLimit)) {
+ // modified copy of U8_NEXT()
+ c = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(c)) {
+ *pDest++=(UChar)c;
} else {
- if(ch > 0xe0) {
- if( /* handle U+1000..U+CFFF inline */
- ch <= 0xec &&
- ((pSrcLimit - pSrc) >= 3) &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
- ) {
- /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
- *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
- pSrc += 3;
- continue;
- }
- } else if(ch < 0xe0) {
- if( /* handle U+0080..U+07FF inline */
- ch >= 0xc2 &&
- ((pSrcLimit - pSrc) >= 2) &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
- ) {
- *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
- pSrc += 2;
- continue;
- }
- }
-
- /* function call for "complicated" and error cases */
- ++pSrc; /* continue after the lead byte */
- ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
- if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return NULL;
- }else if(ch<=0xFFFF){
- *(pDest++)=(UChar)ch;
- }else{
- *(pDest++)=U16_LEAD(ch);
- if(pDest<pDestLimit){
- *(pDest++)=U16_TRAIL(ch);
- }else{
- reqLength++;
- break;
+ uint8_t __t1, __t2;
+ if( /* handle U+0800..U+FFFF inline */
+ (0xe0<=(c) && (c)<0xf0) &&
+ ((i)+1)<srcLength &&
+ U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+ (__t2=src[(i)+1]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
+ i+=2;
+ } else if( /* handle U+0080..U+07FF inline */
+ ((c)<0xe0 && (c)>=0xc2) &&
+ ((i)!=srcLength) &&
+ (__t1=src[i]-0x80)<=0x3f) {
+ *pDest++ = (((c)&0x1f)<<6)|__t1;
+ ++(i);
+ } else {
+ /* function call for "complicated" and error cases */
+ (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+ if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+ *pErrorCode = U_INVALID_CHAR_FOUND;
+ return NULL;
+ } else if(c<=0xFFFF) {
+ *(pDest++)=(UChar)c;
+ } else {
+ *(pDest++)=U16_LEAD(c);
+ if(pDest<pDestLimit) {
+ *(pDest++)=U16_TRAIL(c);
+ } else {
+ reqLength++;
+ break;
+ }
}
}
}
}
- /* do not fill the dest buffer just count the UChars needed */
- while(pSrc < pSrcLimit){
- ch = *pSrc;
- if(ch <= 0x7f){
- reqLength++;
- ++pSrc;
+
+ /* Pre-flight the rest of the string. */
+ while(i < srcLength) {
+ // modified copy of U8_NEXT()
+ c = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(c)) {
+ ++reqLength;
} else {
- if(ch > 0xe0) {
- if( /* handle U+1000..U+CFFF inline */
- ch <= 0xec &&
- ((pSrcLimit - pSrc) >= 3) &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
- (uint8_t)(pSrc[2] - 0x80) <= 0x3f
- ) {
- reqLength++;
- pSrc += 3;
- continue;
- }
- } else if(ch < 0xe0) {
- if( /* handle U+0080..U+07FF inline */
- ch >= 0xc2 &&
- ((pSrcLimit - pSrc) >= 2) &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f
- ) {
- reqLength++;
- pSrc += 2;
- continue;
+ uint8_t __t1, __t2;
+ if( /* handle U+0800..U+FFFF inline */
+ (0xe0<=(c) && (c)<0xf0) &&
+ ((i)+1)<srcLength &&
+ U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
+ (__t2=src[(i)+1]-0x80)<=0x3f) {
+ ++reqLength;
+ i+=2;
+ } else if( /* handle U+0080..U+07FF inline */
+ ((c)<0xe0 && (c)>=0xc2) &&
+ ((i)!=srcLength) &&
+ (__t1=src[i]-0x80)<=0x3f) {
+ ++reqLength;
+ ++(i);
+ } else {
+ /* function call for "complicated" and error cases */
+ (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
+ if(c<0 && (++numSubstitutions, c = subchar) < 0) {
+ *pErrorCode = U_INVALID_CHAR_FOUND;
+ return NULL;
}
+ reqLength += U16_LENGTH(c);
}
-
- /* function call for "complicated" and error cases */
- ++pSrc; /* continue after the lead byte */
- ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
- if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
- *pErrorCode = U_INVALID_CHAR_FOUND;
- return NULL;
- }
- reqLength+=U16_LENGTH(ch);
}
}
}
uint8_t* pSrc = (uint8_t*) src;
/* args check */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+ if(U_FAILURE(*pErrorCode)){
return NULL;
}
int32_t numSubstitutions;
/* args check */
- if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+ if(U_FAILURE(*pErrorCode)){
return NULL;
}
int32_t srcLength,
UChar32 subchar, int32_t *pNumSubstitutions,
UErrorCode *pErrorCode) {
- UChar *pDest = dest;
- UChar *pDestLimit = dest+destCapacity;
- UChar32 ch;
- int32_t reqLength = 0;
- const uint8_t* pSrc = (const uint8_t*) src;
- const uint8_t *pSrcLimit;
- int32_t count;
- uint8_t t1, t2; /* trail bytes */
- int32_t numSubstitutions;
-
/* args check */
- if(U_FAILURE(*pErrorCode)){
+ if(U_FAILURE(*pErrorCode)) {
return NULL;
}
if( (src==NULL && srcLength!=0) || srcLength < -1 ||
if(pNumSubstitutions!=NULL) {
*pNumSubstitutions=0;
}
- numSubstitutions=0;
+ UChar *pDest = dest;
+ UChar *pDestLimit = dest+destCapacity;
+ int32_t reqLength = 0;
+ int32_t numSubstitutions=0;
if(srcLength < 0) {
/*
* Transform a NUL-terminated ASCII string.
* Handle non-ASCII strings with slower code.
*/
- while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
- *pDest++=(UChar)ch;
- ++pSrc;
+ UChar32 c;
+ while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
+ *pDest++=(UChar)c;
+ ++src;
}
- if(ch == 0) {
+ if(c == 0) {
reqLength=(int32_t)(pDest - dest);
if(pDestLength) {
*pDestLength = reqLength;
u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
return dest;
}
- srcLength = uprv_strlen((const char *)pSrc);
+ srcLength = static_cast<int32_t>(uprv_strlen(src));
}
- /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
- pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
+ /* Faster loop without ongoing checking for srcLength and pDestLimit. */
+ UChar32 ch;
+ uint8_t t1, t2;
+ int32_t i = 0;
for(;;) {
- count = (int32_t)(pDestLimit - pDest);
- srcLength = (int32_t)(pSrcLimit - pSrc);
- if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
+ int32_t count = (int32_t)(pDestLimit - pDest);
+ int32_t count2 = srcLength - i;
+ if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
/* fast ASCII loop */
- const uint8_t *prevSrc = pSrc;
- int32_t delta;
- while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
- *pDest++=(UChar)ch;
- ++pSrc;
+ int32_t start = i;
+ uint8_t b;
+ while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
+ *pDest++=b;
+ ++i;
}
- delta = (int32_t)(pSrc - prevSrc);
+ int32_t delta = i - start;
count -= delta;
- srcLength -= delta;
+ count2 -= delta;
}
/*
* Each iteration of the inner loop progresses by at most 3 UTF-8
* bytes and one UChar.
*/
- srcLength /= 3;
- if(count > srcLength) {
- count = srcLength; /* min(remaining dest, remaining src/3) */
+ if(subchar > 0xFFFF) {
+ break;
+ }
+ count2 /= 3;
+ if(count > count2) {
+ count = count2; /* min(remaining dest, remaining src/3) */
}
if(count < 3) {
/*
break;
}
do {
- ch = *pSrc;
- if(ch <= 0x7f){
+ ch = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(ch)) {
*pDest++=(UChar)ch;
- ++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+ (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
+ (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
- pSrc += 3;
+ i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+ (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
- pSrc += 2;
+ ++i;
continue;
}
}
* We need to write two UChars, adjusted count for that,
* and ran out of space.
*/
+ --i; // back out byte ch
break;
} else {
/* function call for error cases */
- ++pSrc; /* continue after the lead byte */
- utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+ utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
- if(subchar<=0xFFFF) {
- *(pDest++)=(UChar)subchar;
- } else {
- *(pDest++)=U16_LEAD(subchar);
- *(pDest++)=U16_TRAIL(subchar);
- }
+ *(pDest++)=(UChar)subchar;
}
}
} while(--count > 0);
}
- while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
- ch = *pSrc;
- if(ch <= 0x7f){
+ while(i < srcLength && (pDest < pDestLimit)) {
+ ch = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(ch)){
*pDest++=(UChar)ch;
- ++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
- ((pSrcLimit - pSrc) >= 3) &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
- (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+ (i+1) < srcLength &&
+ (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
+ (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
) {
/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
*pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
- pSrc += 3;
+ i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
- ((pSrcLimit - pSrc) >= 2) &&
- (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+ i < srcLength &&
+ (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
) {
*pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
- pSrc += 2;
+ ++i;
continue;
}
}
return NULL;
} else {
/* function call for error cases */
- ++pSrc; /* continue after the lead byte */
- utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+ utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
if(subchar<=0xFFFF) {
*(pDest++)=(UChar)subchar;
}
}
- /* do not fill the dest buffer just count the UChars needed */
- while(pSrc < pSrcLimit){
- ch = *pSrc;
- if(ch <= 0x7f) {
+ /* Pre-flight the rest of the string. */
+ while(i < srcLength) {
+ ch = (uint8_t)src[i++];
+ if(U8_IS_SINGLE(ch)) {
reqLength++;
- ++pSrc;
} else {
if(ch >= 0xe0) {
if( /* handle U+0000..U+FFFF inline */
ch <= 0xef &&
- ((pSrcLimit - pSrc) >= 3) &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
- (uint8_t)(pSrc[2] - 0x80) <= 0x3f
+ (i+1) < srcLength &&
+ (uint8_t)(src[i] - 0x80) <= 0x3f &&
+ (uint8_t)(src[i+1] - 0x80) <= 0x3f
) {
reqLength++;
- pSrc += 3;
+ i += 2;
continue;
}
} else {
if( /* handle U+0000..U+07FF inline */
ch >= 0xc0 &&
- ((pSrcLimit - pSrc) >= 2) &&
- (uint8_t)(pSrc[1] - 0x80) <= 0x3f
+ i < srcLength &&
+ (uint8_t)(src[i] - 0x80) <= 0x3f
) {
reqLength++;
- pSrc += 2;
+ ++i;
continue;
}
}
return NULL;
} else {
/* function call for error cases */
- ++pSrc; /* continue after the lead byte */
- utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+ utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
++numSubstitutions;
reqLength+=U16_LENGTH(ch);
}