/*
******************************************************************************
*
-* Copyright (C) 1999-2011, International Business Machines
+* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
0x3ffffff, 0x7fffffff
};
+static UChar32
+errorValue(int32_t count, int8_t strict) {
+ if(strict>=0) {
+ return utf8_errorValue[count];
+ } else if(strict==-3) {
+ return 0xfffd;
+ } else {
+ return U_SENTINEL;
+ }
+}
+
/*
- * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling
- * UTF8_NEXT_CHAR_SAFE().
+ * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
+ * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
+ *
+ * U8_NEXT() supports NUL-terminated strings indicated via length<0.
*
* The "strict" parameter controls the error behavior:
- * <0 "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative
- * code point result.
+ * <0 "Safe" behavior of U8_NEXT():
+ * -1: All illegal byte sequences yield U_SENTINEL=-1.
+ * -2: Same as -1, except for lenient treatment of surrogate code points as legal.
+ * Some implementations use this for roundtripping of
+ * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
+ * contain unpaired surrogates.
+ * -3: All illegal byte sequences yield U+FFFD.
* 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
* All illegal byte sequences yield a positive code point such that this
* result code point would be encoded with the same number of bytes as
* Same as the obsolete "safe" behavior, but non-characters are also treated
* like illegal sequences.
*
- * The special negative (<0) value -2 is used for lenient treatment of surrogate
- * code points as legal. Some implementations use this for roundtripping of
- * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
- * contain unpaired surrogates.
- *
* Note that a UBool is the same as an int8_t.
*/
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
int32_t i=*pi;
uint8_t count=U8_COUNT_TRAIL_BYTES(c);
- U_ASSERT(count >= 0 && count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
- if((i)+count<=(length)) {
- uint8_t trail, illegal=0;
+ U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
+ if(i+count<=length || length<0) {
+ uint8_t trail;
- U8_MASK_LEAD_BYTE((c), count);
- /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
+ U8_MASK_LEAD_BYTE(c, count);
+ /* support NUL-terminated strings: do not read beyond the first non-trail byte */
switch(count) {
/* each branch falls through to the next one */
+ case 0:
+ /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
case 5:
case 4:
/* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
- illegal=1;
break;
case 3:
- trail=s[(i)++];
- (c)=((c)<<6)|(trail&0x3f);
- if(c<0x110) {
- illegal|=(trail&0xc0)^0x80;
- } else {
- /* code point>0x10ffff, outside Unicode */
- illegal=1;
- break;
- }
+ trail=s[i++]-0x80;
+ c=(c<<6)|trail;
+ /* c>=0x110 would result in code point>0x10ffff, outside Unicode */
+ if(c>=0x110 || trail>0x3f) { break; }
case 2:
- trail=s[(i)++];
- (c)=((c)<<6)|(trail&0x3f);
- illegal|=(trail&0xc0)^0x80;
+ trail=s[i++]-0x80;
+ c=(c<<6)|trail;
+ /*
+ * test for a surrogate d800..dfff unless we are lenient:
+ * before the last (c<<6), a surrogate is c=360..37f
+ */
+ if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
case 1:
- trail=s[(i)++];
- (c)=((c)<<6)|(trail&0x3f);
- illegal|=(trail&0xc0)^0x80;
- break;
- case 0:
- if(strict>=0) {
- return UTF8_ERROR_VALUE_1;
- } else {
- return U_SENTINEL;
+ trail=s[i++]-0x80;
+ c=(c<<6)|trail;
+ if(trail>0x3f) { break; }
+ /* correct sequence - all trail bytes have (b7..b6)==(10) */
+ if(c>=utf8_minLegal[count] &&
+ /* strict: forbid non-characters like U+fffe */
+ (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
+ *pi=i;
+ return c;
}
/* no default branch to optimize switch() - all values are covered */
}
+ } else {
+ /* too few bytes left */
+ count=length-i;
+ }
- /*
- * All the error handling should return a value
- * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
- *
- * Starting with Unicode 3.0.1, non-shortest forms are illegal.
- * Starting with Unicode 3.2, surrogate code points must not be
- * encoded in UTF-8, and there are no irregular sequences any more.
- *
- * U8_ macros (new in ICU 2.4) return negative values for error conditions.
- */
-
- /* correct sequence - all trail bytes have (b7..b6)==(10)? */
- /* illegal is also set if count>=4 */
- if(illegal || (c)<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2)) {
- /* error handling */
- uint8_t errorCount=count;
- /* don't go beyond this sequence */
- i=*pi;
- while(count>0 && U8_IS_TRAIL(s[i])) {
- ++(i);
- --count;
- }
- if(strict>=0) {
- c=utf8_errorValue[errorCount-count];
- } else {
- c=U_SENTINEL;
- }
- } else if((strict)>0 && U_IS_UNICODE_NONCHAR(c)) {
- /* strict: forbid non-characters like U+fffe */
- c=utf8_errorValue[count];
- }
- } else /* too few bytes left */ {
- /* error handling */
- int32_t i0=i;
- /* don't just set (i)=(length) in case there is an illegal sequence */
- while((i)<(length) && U8_IS_TRAIL(s[i])) {
- ++(i);
- }
- if(strict>=0) {
- c=utf8_errorValue[i-i0];
- } else {
- c=U_SENTINEL;
- }
+ /* error handling */
+ i=*pi;
+ while(count>0 && U8_IS_TRAIL(s[i])) {
+ ++i;
+ --count;
}
+ c=errorValue(i-*pi, strict);
*pi=i;
return c;
}
int32_t i=*pi;
uint8_t b, count=1, shift=6;
+ if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
+
/* extract value bits from the last trail byte */
c&=0x3f;
for(;;) {
if(i<=start) {
/* no lead byte at all */
- if(strict>=0) {
- return UTF8_ERROR_VALUE_1;
- } else {
- return U_SENTINEL;
- }
- /*break;*/
+ return errorValue(0, strict);
}
/* read another previous byte */
if(count>=4) {
count=3;
}
- if(strict>=0) {
- c=utf8_errorValue[count];
- } else {
- c=U_SENTINEL;
- }
+ c=errorValue(count, strict);
} else {
/* exit with correct c */
}
include the trail byte that we started with */
if(count<shouldCount) {
*pi=i;
- if(strict>=0) {
- c=utf8_errorValue[count];
- } else {
- c=U_SENTINEL;
- }
+ c=errorValue(count, strict);
} else {
- if(strict>=0) {
- c=UTF8_ERROR_VALUE_1;
- } else {
- c=U_SENTINEL;
- }
+ c=errorValue(0, strict);
}
}
break;
shift+=6;
} else {
/* more than 5 trail bytes is illegal */
- if(strict>=0) {
- c=UTF8_ERROR_VALUE_1;
- } else {
- c=U_SENTINEL;
- }
+ c=errorValue(0, strict);
break;
}
} else {
/* single-byte character precedes trailing bytes */
- if(strict>=0) {
- c=UTF8_ERROR_VALUE_1;
- } else {
- c=U_SENTINEL;
- }
+ c=errorValue(0, strict);
break;
}
}