X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/f3c0d7a59d99c2a94c6b8822291f0e42be3773c9..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/common/utf_impl.cpp?ds=sidebyside diff --git a/icuSources/common/utf_impl.cpp b/icuSources/common/utf_impl.cpp index 293e6f18..9dd241a1 100644 --- a/icuSources/common/utf_impl.cpp +++ b/icuSources/common/utf_impl.cpp @@ -7,7 +7,7 @@ * Corporation and others. All Rights Reserved. * ****************************************************************************** -* file name: utf_impl.c +* file name: utf_impl.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 @@ -27,7 +27,6 @@ #include "unicode/utypes.h" #include "unicode/utf.h" #include "unicode/utf8.h" -#include "unicode/utf_old.h" #include "uassert.h" /* @@ -55,10 +54,6 @@ * - SUB AX, BX (result) * -finish: * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB) - * - * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal; - * lead bytes above 0xf4 are illegal. - * We keep them in this table for skipping long ISO 10646-UTF-8 sequences. */ extern "C" U_EXPORT const uint8_t utf8_countTrailBytes[256]={ @@ -77,24 +72,24 @@ utf8_countTrailBytes[256]={ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // illegal C0 & C1 + // 2-byte lead bytes C2..DF + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // 3-byte lead bytes E0..EF 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, - 3, 3, 3, /* illegal in Unicode */ - 4, 4, 4, 4, /* illegal in Unicode */ - 5, 5, /* illegal in Unicode */ - 0, 0 /* illegal bytes 0xfe and 0xff */ + // 4-byte lead bytes F0..F4 + // illegal F5..FF + 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -static const UChar32 -utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; - static const UChar32 utf8_errorValue[6]={ - UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff, - 0x3ffffff, 0x7fffffff + // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, + // but without relying on the obsolete unicode/utf_old.h. + 0x15, 0x9f, 0xffff, + 0x10ffff }; static UChar32 @@ -134,61 +129,59 @@ errorValue(int32_t count, int8_t strict) { */ U_CAPI UChar32 U_EXPORT2 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) { + // *pi is one after byte c. int32_t i=*pi; - uint8_t count=U8_COUNT_TRAIL_BYTES(c); - U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */ - if(i+count<=length || length<0) { - uint8_t trail; - - U8_MASK_LEAD_BYTE(c, count); - /* support NUL-terminated strings: do not read beyond the first non-trail byte */ - switch(count) { - /* each branch falls through to the next one */ - case 0: - /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ - case 5: - case 4: - /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ - break; - case 3: - trail=s[i++]-0x80; - c=(c<<6)|trail; - /* c>=0x110 would result in code point>0x10ffff, outside Unicode */ - if(c>=0x110 || trail>0x3f) { break; } - U_FALLTHROUGH; - case 2: - trail=s[i++]-0x80; - c=(c<<6)|trail; - /* - * test for a surrogate d800..dfff unless we are lenient: - * before the last (c<<6), a surrogate is c=360..37f - */ - if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; } - U_FALLTHROUGH; - case 1: - trail=s[i++]-0x80; - c=(c<<6)|trail; - if(trail>0x3f) { break; } - /* correct sequence - all trail bytes have (b7..b6)==(10) */ - if(c>=utf8_minLegal[count] && - /* strict: forbid non-characters like U+fffe */ - (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) { + // length can be negative for NUL-terminated strings: Read and validate one byte at a time. + if(i==length || c>0xf4) { + // end of string, or not a lead byte + } else if(c>=0xf0) { + // Test for 4-byte sequences first because + // U8_NEXT() handles shorter valid sequences inline. + uint8_t t1=s[i], t2, t3; + c&=7; + if(U8_IS_VALID_LEAD4_AND_T1(c, t1) && + ++i!=length && (t2=s[i]-0x80)<=0x3f && + ++i!=length && (t3=s[i]-0x80)<=0x3f) { + ++i; + c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3; + // strict: forbid non-characters like U+fffe + if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { *pi=i; return c; } - /* no default branch to optimize switch() - all values are covered */ } - } else { - /* too few bytes left */ - count=length-i; - } + } else if(c>=0xe0) { + c&=0xf; + if(strict!=-2) { + uint8_t t1=s[i], t2; + if(U8_IS_VALID_LEAD3_AND_T1(c, t1) && + ++i!=length && (t2=s[i]-0x80)<=0x3f) { + ++i; + c=(c<<12)|((t1&0x3f)<<6)|t2; + // strict: forbid non-characters like U+fffe + if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { + *pi=i; + return c; + } + } + } else { + // strict=-2 -> lenient: allow surrogates + uint8_t t1=s[i]-0x80, t2; + if(t1<=0x3f && (c>0 || t1>=0x20) && + ++i!=length && (t2=s[i]-0x80)<=0x3f) { + *pi=i+1; + return (c<<12)|(t1<<6)|t2; + } + } + } else if(c>=0xc2) { + uint8_t t1=s[i]-0x80; + if(t1<=0x3f) { + *pi=i+1; + return ((c-0xc0)<<6)|t1; + } + } // else 0x80<=c<0xc2 is not a lead byte /* error handling */ - i=*pi; - while(count>0 && U8_IS_TRAIL(s[i])) { - ++i; - --count; - } c=errorValue(i-*pi, strict); *pi=i; return c; @@ -232,7 +225,7 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool s+=i; offset=0; c=utf8_errorValue[length-1]; - UTF8_APPEND_CHAR_UNSAFE(s, offset, c); + U8_APPEND_UNSAFE(s, offset, c); i=i+offset; } } @@ -241,99 +234,96 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool U_CAPI UChar32 U_EXPORT2 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) { + // *pi is the index of byte c. int32_t i=*pi; - uint8_t b, count=1, shift=6; - - if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); } - - /* extract value bits from the last trail byte */ - c&=0x3f; - - for(;;) { - if(i<=start) { - /* no lead byte at all */ - return errorValue(0, strict); - } - - /* read another previous byte */ - b=s[--i]; - if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */ - if(b&0x40) { - /* lead byte, this will always end the loop */ - uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b); - - if(count==shouldCount) { - /* set the new position */ - *pi=i; - U8_MASK_LEAD_BYTE(b, count); - c|=(UChar32)b<=4 || c>0x10ffff || c0 && U_IS_UNICODE_NONCHAR(c))) { - /* illegal sequence or (strict and non-character) */ - if(count>=4) { - count=3; + if(U8_IS_TRAIL(c) && i>start) { + uint8_t b1=s[--i]; + if(U8_IS_LEAD(b1)) { + if(b1<0xe0) { + *pi=i; + return ((b1-0xc0)<<6)|(c&0x3f); + } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) { + // Truncated 3- or 4-byte sequence. + *pi=i; + return errorValue(1, strict); + } + } else if(U8_IS_TRAIL(b1) && i>start) { + // Extract the value bits from the last trail byte. + c&=0x3f; + uint8_t b2=s[--i]; + if(0xe0<=b2 && b2<=0xf4) { + if(b2<0xf0) { + b2&=0xf; + if(strict!=-2) { + if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { + *pi=i; + c=(b2<<12)|((b1&0x3f)<<6)|c; + if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { + return c; + } else { + // strict: forbid non-characters like U+fffe + return errorValue(2, strict); + } } - c=errorValue(count, strict); } else { - /* exit with correct c */ + // strict=-2 -> lenient: allow surrogates + b1-=0x80; + if((b2>0 || b1>=0x20)) { + *pi=i; + return (b2<<12)|(b1<<6)|c; + } } - } else { - /* the lead byte does not match the number of trail bytes */ - /* only set the position to the lead byte if it would - include the trail byte that we started with */ - if(countstart) { + uint8_t b3=s[--i]; + if(0xf0<=b3 && b3<=0xf4) { + b3&=7; + if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { *pi=i; - c=errorValue(count, strict); - } else { - c=errorValue(0, strict); + c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c; + if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { + return c; + } else { + // strict: forbid non-characters like U+fffe + return errorValue(3, strict); + } } } - break; - } else if(count<5) { - /* trail byte */ - c|=(UChar32)(b&0x3f)<start) { - Z=I-5; - } else { - Z=start; - } - - /* return I if the sequence starting there is long enough to include i */ - do { - b=s[I]; - if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */ - break; - } else if(b>=0xc0) { - if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) { - return I; - } else { - break; + // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points. + int32_t orig_i=i; + uint8_t c=s[i]; + if(U8_IS_TRAIL(c) && i>start) { + uint8_t b1=s[--i]; + if(U8_IS_LEAD(b1)) { + if(b1<0xe0 || + (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) { + return i; + } + } else if(U8_IS_TRAIL(b1) && i>start) { + uint8_t b2=s[--i]; + if(0xe0<=b2 && b2<=0xf4) { + if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { + return i; + } + } else if(U8_IS_TRAIL(b2) && i>start) { + uint8_t b3=s[--i]; + if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { + return i; + } } } - } while(Z<=--I); - - /* return i itself to be consistent with the FWD_1 macro */ - return i; + } + return orig_i; }