- uint8_t count=U8_COUNT_TRAIL_BYTES(c);
- U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
- if(i+count<=length || length<0) {
- uint8_t trail;
-
- U8_MASK_LEAD_BYTE(c, count);
- /* support NUL-terminated strings: do not read beyond the first non-trail byte */
- switch(count) {
- /* each branch falls through to the next one */
- case 0:
- /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
- case 5:
- case 4:
- /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
- break;
- case 3:
- trail=s[i++]-0x80;
- c=(c<<6)|trail;
- /* c>=0x110 would result in code point>0x10ffff, outside Unicode */
- if(c>=0x110 || trail>0x3f) { break; }
- U_FALLTHROUGH;
- case 2:
- trail=s[i++]-0x80;
- c=(c<<6)|trail;
- /*
- * test for a surrogate d800..dfff unless we are lenient:
- * before the last (c<<6), a surrogate is c=360..37f
- */
- if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
- U_FALLTHROUGH;
- case 1:
- trail=s[i++]-0x80;
- c=(c<<6)|trail;
- if(trail>0x3f) { break; }
- /* correct sequence - all trail bytes have (b7..b6)==(10) */
- if(c>=utf8_minLegal[count] &&
- /* strict: forbid non-characters like U+fffe */
- (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
+ // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
+ if(i==length || c>0xf4) {
+ // end of string, or not a lead byte
+ } else if(c>=0xf0) {
+ // Test for 4-byte sequences first because
+ // U8_NEXT() handles shorter valid sequences inline.
+ uint8_t t1=s[i], t2, t3;
+ c&=7;
+ if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
+ ++i!=length && (t2=s[i]-0x80)<=0x3f &&
+ ++i!=length && (t3=s[i]-0x80)<=0x3f) {
+ ++i;
+ c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
+ // strict: forbid non-characters like U+fffe
+ if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {