* Corporation and others. All Rights Reserved.
*
******************************************************************************
-* file name: utf_impl.c
+* file name: utf_impl.cpp
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
#include "unicode/utypes.h"
#include "unicode/utf.h"
#include "unicode/utf8.h"
-#include "unicode/utf_old.h"
#include "uassert.h"
/*
* - SUB AX, BX (result)
* -finish:
* (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
- *
- * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
- * lead bytes above 0xf4 are illegal.
- * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
*/
extern "C" U_EXPORT const uint8_t
utf8_countTrailBytes[256]={
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ // illegal C0 & C1
+ // 2-byte lead bytes C2..DF
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ // 3-byte lead bytes E0..EF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3,
- 3, 3, 3, /* illegal in Unicode */
- 4, 4, 4, 4, /* illegal in Unicode */
- 5, 5, /* illegal in Unicode */
- 0, 0 /* illegal bytes 0xfe and 0xff */
+ // 4-byte lead bytes F0..F4
+ // illegal F5..FF
+ 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
static const UChar32
utf8_errorValue[6]={
- UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
- 0x3ffffff, 0x7fffffff
+ // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
+ // but without relying on the obsolete unicode/utf_old.h.
+ 0x15, 0x9f, 0xffff, // entries for index 0, 1 and 2
+ 0x10ffff // index 3; the array is still declared [6], so indices 4 and 5 are now implicitly zero. NOTE(review): confirm no caller (e.g. utf8_errorValue[length-1]) still reaches them.
};
static UChar32
*/
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
+ // *pi is one after byte c.
int32_t i=*pi;
- uint8_t count=U8_COUNT_TRAIL_BYTES(c);
- U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
- if(i+count<=length || length<0) {
- uint8_t trail;
-
- U8_MASK_LEAD_BYTE(c, count);
- /* support NUL-terminated strings: do not read beyond the first non-trail byte */
- switch(count) {
- /* each branch falls through to the next one */
- case 0:
- /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
- case 5:
- case 4:
- /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
- break;
- case 3:
- trail=s[i++]-0x80;
- c=(c<<6)|trail;
- /* c>=0x110 would result in code point>0x10ffff, outside Unicode */
- if(c>=0x110 || trail>0x3f) { break; }
- U_FALLTHROUGH;
- case 2:
- trail=s[i++]-0x80;
- c=(c<<6)|trail;
- /*
- * test for a surrogate d800..dfff unless we are lenient:
- * before the last (c<<6), a surrogate is c=360..37f
- */
- if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
- U_FALLTHROUGH;
- case 1:
- trail=s[i++]-0x80;
- c=(c<<6)|trail;
- if(trail>0x3f) { break; }
- /* correct sequence - all trail bytes have (b7..b6)==(10) */
- if(c>=utf8_minLegal[count] &&
- /* strict: forbid non-characters like U+fffe */
- (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
+ // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
+ if(i==length || c>0xf4) {
+ // end of string, or not a lead byte
+ } else if(c>=0xf0) {
+ // Test for 4-byte sequences first because
+ // U8_NEXT() handles shorter valid sequences inline.
+ uint8_t t1=s[i], t2, t3;
+ c&=7;
+ if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
+ ++i!=length && (t2=s[i]-0x80)<=0x3f &&
+ ++i!=length && (t3=s[i]-0x80)<=0x3f) {
+ ++i;
+ c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
+ // strict: forbid non-characters like U+fffe
+ if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
*pi=i;
return c;
}
- /* no default branch to optimize switch() - all values are covered */
}
- } else {
- /* too few bytes left */
- count=length-i;
- }
+ } else if(c>=0xe0) {
+ c&=0xf;
+ if(strict!=-2) {
+ uint8_t t1=s[i], t2;
+ if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
+ ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+ ++i;
+ c=(c<<12)|((t1&0x3f)<<6)|t2;
+ // strict: forbid non-characters like U+fffe
+ if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+ *pi=i;
+ return c;
+ }
+ }
+ } else {
+ // strict=-2 -> lenient: allow surrogates
+ uint8_t t1=s[i]-0x80, t2;
+ if(t1<=0x3f && (c>0 || t1>=0x20) &&
+ ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+ *pi=i+1;
+ return (c<<12)|(t1<<6)|t2;
+ }
+ }
+ } else if(c>=0xc2) {
+ uint8_t t1=s[i]-0x80;
+ if(t1<=0x3f) {
+ *pi=i+1;
+ return ((c-0xc0)<<6)|t1;
+ }
+ } // else 0x80<=c<0xc2 is not a lead byte
/* error handling */
- i=*pi;
- while(count>0 && U8_IS_TRAIL(s[i])) {
- ++i;
- --count;
- }
c=errorValue(i-*pi, strict);
*pi=i;
return c;
s+=i;
offset=0;
c=utf8_errorValue[length-1];
- UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
+ U8_APPEND_UNSAFE(s, offset, c);
i=i+offset;
}
}
U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
+ // *pi is the index of byte c. Scans backward from there; no index below start is read, and *pi is only updated when a lead byte is accepted.
int32_t i=*pi;
- uint8_t b, count=1, shift=6;
-
- if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
-
- /* extract value bits from the last trail byte */
- c&=0x3f;
-
- for(;;) {
- if(i<=start) {
- /* no lead byte at all */
- return errorValue(0, strict);
- }
-
- /* read another previous byte */
- b=s[--i];
- if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
- if(b&0x40) {
- /* lead byte, this will always end the loop */
- uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
-
- if(count==shouldCount) {
- /* set the new position */
- *pi=i;
- U8_MASK_LEAD_BYTE(b, count);
- c|=(UChar32)b<<shift;
- if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
- /* illegal sequence or (strict and non-character) */
- if(count>=4) {
- count=3;
+ if(U8_IS_TRAIL(c) && i>start) { // c must be a trail byte with at least one byte before it; otherwise fall to the final errorValue(0, strict)
+ uint8_t b1=s[--i]; // b1: the byte immediately before c
+ if(U8_IS_LEAD(b1)) {
+ if(b1<0xe0) { // lead below 0xe0: two-byte sequence b1 c
+ *pi=i;
+ return ((b1-0xc0)<<6)|(c&0x3f); // lead payload (b1-0xc0) above the low 6 bits of c
+ } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) { // longer lead: c is only accepted as its first trail byte
+ // Truncated 3- or 4-byte sequence.
+ *pi=i;
+ return errorValue(1, strict); // position moves to the lead byte, but an error value is returned
+ }
+ } else if(U8_IS_TRAIL(b1) && i>start) { // two trail bytes so far and room to look one more byte back
+ // Extract the value bits from the last trail byte.
+ c&=0x3f;
+ uint8_t b2=s[--i]; // b2: candidate lead of a 3-byte sequence, or second byte of a 4-byte one
+ if(0xe0<=b2 && b2<=0xf4) { // b2 is in the 3-byte (E0..EF) or 4-byte (F0..F4) lead range
+ if(b2<0xf0) { // 3-byte lead: candidate complete sequence b2 b1 c
+ b2&=0xf; // keep only the low 4 bits of the lead byte
+ if(strict!=-2) { // every mode except lenient (-2) validates the lead/first-trail pair
+ if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+ *pi=i;
+ c=(b2<<12)|((b1&0x3f)<<6)|c; // assemble the code point; c was masked to 6 bits above
+ if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+ return c;
+ } else {
+ // strict: forbid non-characters like U+fffe
+ return errorValue(2, strict); // *pi already points at the lead byte here
+ }
}
- c=errorValue(count, strict);
} else {
- /* exit with correct c */
+ // strict=-2 -> lenient: allow surrogates
+ b1-=0x80; // reduce the trail byte to its value above 0x80
+ if((b2>0 || b1>=0x20)) { // with lead bits 0 the first trail value must be >=0x20, i.e. result >=0x800 (no overlong form)
+ *pi=i;
+ return (b2<<12)|(b1<<6)|c;
+ }
}
- } else {
- /* the lead byte does not match the number of trail bytes */
- /* only set the position to the lead byte if it would
- include the trail byte that we started with */
- if(count<shouldCount) {
+ } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { // 4-byte lead followed by only two trail bytes (b1, c)
+ // Truncated 4-byte sequence.
+ *pi=i;
+ return errorValue(2, strict);
+ }
+ } else if(U8_IS_TRAIL(b2) && i>start) { // three trail bytes: look one more byte back for a 4-byte lead
+ uint8_t b3=s[--i]; // b3: candidate 4-byte lead
+ if(0xf0<=b3 && b3<=0xf4) {
+ b3&=7; // keep only the low 3 bits of the lead byte
+ if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
*pi=i;
- c=errorValue(count, strict);
- } else {
- c=errorValue(0, strict);
+ c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c; // assemble the code point from lead bits and three 6-bit trail values
+ if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+ return c;
+ } else {
+ // strict: forbid non-characters like U+fffe
+ return errorValue(3, strict); // *pi already points at the lead byte here
+ }
}
}
- break;
- } else if(count<5) {
- /* trail byte */
- c|=(UChar32)(b&0x3f)<<shift;
- ++count;
- shift+=6;
- } else {
- /* more than 5 trail bytes is illegal */
- c=errorValue(0, strict);
- break;
}
- } else {
- /* single-byte character precedes trailing bytes */
- c=errorValue(0, strict);
- break;
}
}
- return c;
+ return errorValue(0, strict); // no accepted sequence ends at c; *pi is left unchanged on this path
}
U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
- /* i had been decremented once before the function call */
- int32_t I=i, Z;
- uint8_t b;
-
- /* read at most the 6 bytes s[Z] to s[i], inclusively */
- if(I-5>start) {
- Z=I-5;
- } else {
- Z=start;
- }
-
- /* return I if the sequence starting there is long enough to include i */
- do {
- b=s[I];
- if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
- break;
- } else if(b>=0xc0) {
- if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
- return I;
- } else {
- break;
+ // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points. Returns the index of an accepted lead byte, or the original i when none is found.
+ int32_t orig_i=i; // remembered so the input index can be returned unchanged on failure
+ uint8_t c=s[i]; // the byte at the starting index
+ if(U8_IS_TRAIL(c) && i>start) { // only a trail byte with at least one byte before it can be backed over
+ uint8_t b1=s[--i]; // b1: the byte immediately before c
+ if(U8_IS_LEAD(b1)) {
+ if(b1<0xe0 || // lead below 0xe0 is accepted without a lead/trail pair check
+ (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) { // longer lead: c must be a valid first trail byte for it
+ return i; // index of the lead byte b1
+ }
+ } else if(U8_IS_TRAIL(b1) && i>start) { // two trail bytes so far and room to look one more byte back
+ uint8_t b2=s[--i]; // b2: candidate 3- or 4-byte lead
+ if(0xe0<=b2 && b2<=0xf4) {
+ if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { // b1 must be a valid first trail byte for lead b2
+ return i; // index of the lead byte b2
+ }
+ } else if(U8_IS_TRAIL(b2) && i>start) { // three trail bytes: look one more byte back for a 4-byte lead
+ uint8_t b3=s[--i]; // b3: candidate 4-byte lead
+ if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
+ return i; // index of the lead byte b3
+ }
}
}
- } while(Z<=--I);
-
- /* return i itself to be consistent with the FWD_1 macro */
- return i;
+ }
+ return orig_i; // no accepted lead byte found: return the index that was passed in
}