-/*
- * Smallest code point that is accepted for a UTF-8 sequence, indexed by the
- * number of trail bytes in the sequence (0..3).
- * Used by utf8_nextCharSafeBodyTerminated() and utf8_nextCharSafeBodyPointer():
- * both treat a decoded value below utf8_minLegal[count] as an error, which
- * rejects values that were encoded with more bytes than necessary.
- */
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
-/*
- * Decodes the trail bytes of one UTF-8 sequence whose lead byte has already
- * been read, for input that ends with a NUL byte rather than a known length.
- *
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - checks for NUL termination instead of length
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * There is no explicit comparison with NUL: each byte is validated as a trail
- * byte right after it is read, and no further byte is read once one fails.
- * A NUL byte fails that check (0 - 0x80 is 0x80 as uint8_t, which is >0x3f).
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * On error, *ps is instead moved from its original position past at most
- * `count` bytes that U8_IS_TRAIL() accepts, where count is the value of
- * U8_COUNT_TRAIL_BYTES() for the lead byte.
- * If that count is 0, the function returns without modifying *ps.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- *         U_SENTINEL is returned when count is 0 or >=4, when a byte is not a
- *         trail byte, when the value would be >0x10ffff, when the value is
- *         below utf8_minLegal[count], or when U_IS_SURROGATE() is true for it.
- */
-static UChar32
-utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
- const uint8_t *s=*ps; /* working read pointer; *ps is only written at the end */
- uint8_t trail, illegal=0; /* illegal!=0 selects the error path below */
- uint8_t count=U8_COUNT_TRAIL_BYTES(c); /* number of trail bytes announced by the lead byte */
- U_ASSERT(count<6);
- U8_MASK_LEAD_BYTE((c), count); /* modifies c; presumably keeps only the payload bits of the lead byte - confirm against the macro */
- /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
- switch(count) {
- /* cases 3, 2 and 1 each fall through to the next one unless an error breaks out;
-    cases 5/4 break immediately and case 0 returns */
- case 5:
- case 4:
- /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
- illegal=1;
- break;
- case 3:
- trail=(uint8_t)(*s++ - 0x80); /* a valid trail byte 0x80..0xbf becomes 0..0x3f */
- c=(c<<6)|trail; /* merged before validation; the error path discards c if trail is bad */
- if(trail>0x3f || c>=0x110) {
- /* not a trail byte, or code point>0x10ffff (outside Unicode):
-    two more 6-bit shifts follow, so c>=0x110 here means a final value >=0x110000 */
- illegal=1;
- break;
- }
- case 2: /*fall through*/
- trail=(uint8_t)(*s++ - 0x80);
- if(trail>0x3f) {
- /* not a trail byte */
- illegal=1;
- break; /* do not read any byte after an invalid one */
- }
- c=(c<<6)|trail;
- case 1: /*fall through*/
- trail=(uint8_t)(*s++ - 0x80);
- if(trail>0x3f) {
- /* not a trail byte; no break needed because this is the last byte read */
- illegal=1;
- }
- c=(c<<6)|trail; /* if illegal was just set, this value is replaced by U_SENTINEL below */
- break;
- case 0:
- return U_SENTINEL; /* early return: *ps is not updated */
- /* no default branch to optimize switch() - all values are covered
-    (0..5, the range allowed by the U_ASSERT above) */
- }
-
- /* correct sequence - all trail bytes have (b7..b6)==(10)? */
- /* illegal is also set if count>=4, so utf8_minLegal[] is only indexed with
-    count 1..3 here (|| short-circuits, and count==0 has already returned) */
- if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
- /* error handling */
- /* don't go beyond this sequence: restart after the lead byte and skip at most
-    count bytes, stopping at the first one that U8_IS_TRAIL() rejects */
- s=*ps;
- while(count>0 && U8_IS_TRAIL(*s)) {
- ++s;
- --count;
- }
- c=U_SENTINEL;
- }
- *ps=s; /* publish the new position (success and error paths) */
- return c;
-}
-
-/*
- * Decodes the trail bytes of one UTF-8 sequence whose lead byte has already
- * been read, for input that is bounded by a limit pointer.
- *
- * Version of utf8_nextCharSafeBody() with the following differences:
- * - works with pointers instead of indexes
- * - always strict (strict==-1)
- *
- * The number of bytes between *ps and limit is compared with the trail byte
- * count once, before any byte is read. The trail bytes are then read without
- * stopping at an invalid one; their validity is accumulated in a flag and
- * checked afterwards.
- *
- * *ps points to after the lead byte and will be moved to after the last trail byte.
- * On error, *ps is instead moved from its original position past at most
- * `count` bytes that lie before limit and that U8_IS_TRAIL() accepts, where
- * count is the value of U8_COUNT_TRAIL_BYTES() for the lead byte.
- * If count is 0 and limit is not before *ps, the function returns without
- * modifying *ps.
- * limit is the end of the input; bytes at or after limit are not read.
- * c is the lead byte.
- * @return the code point, or U_SENTINEL
- *         U_SENTINEL is returned when count is 0 or >=4, when fewer than count
- *         bytes remain before limit, when a byte is not a trail byte, when the
- *         value would be >0x10ffff, when the value is below
- *         utf8_minLegal[count], or when U_IS_SURROGATE() is true for it.
- */
-static UChar32
-utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
- const uint8_t *s=*ps; /* working read pointer; *ps is only written at the end */
- uint8_t trail, illegal=0; /* illegal!=0 selects the error path below */
- uint8_t count=U8_COUNT_TRAIL_BYTES(c); /* number of trail bytes announced by the lead byte */
- if((limit-s)>=count) { /* enough bytes remain, so the reads in the switch stay before limit */
- U8_MASK_LEAD_BYTE((c), count); /* modifies c; presumably keeps only the payload bits of the lead byte - confirm against the macro */
- /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
- switch(count) {
- /* cases 3, 2 and 1 each fall through to the next one (case 3 breaks out if the
-    value is out of range); cases 5/4 break immediately and case 0 returns */
- case 5:
- case 4:
- /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
- illegal=1;
- break;
- case 3:
- trail=*s++;
- c=(c<<6)|(trail&0x3f); /* append the low 6 bits of the byte */
- if(c<0x110) {
- illegal|=(trail&0xc0)^0x80; /* nonzero unless the top two bits of trail are 10 */
- } else {
- /* code point>0x10ffff, outside Unicode:
-    two more 6-bit shifts follow, so c>=0x110 here means a final value >=0x110000 */
- illegal=1;
- break;
- }
- case 2: /*fall through*/
- trail=*s++;
- c=(c<<6)|(trail&0x3f);
- illegal|=(trail&0xc0)^0x80; /* record a bad trail byte but keep reading; length was checked above */
- case 1: /*fall through*/
- trail=*s++;
- c=(c<<6)|(trail&0x3f);
- illegal|=(trail&0xc0)^0x80;
- break;
- case 0:
- return U_SENTINEL; /* early return: *ps is not updated */
- /* no default branch to optimize switch() - all values are covered
-    (assumes U8_COUNT_TRAIL_BYTES() only yields 0..5 - confirm against the macro) */
- }
- } else {
- illegal=1; /* too few bytes left */
- }
-
- /* correct sequence - all trail bytes have (b7..b6)==(10)? */
- /* illegal is also set if count>=4, so utf8_minLegal[] is only indexed when
-    count is within the table (|| short-circuits); the assertion checks this */
- U_ASSERT(illegal || count<LENGTHOF(utf8_minLegal));
- if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
- /* error handling */
- /* don't go beyond this sequence: restart after the lead byte and skip at most
-    count bytes, stopping at limit or at the first one that U8_IS_TRAIL() rejects */
- s=*ps;
- while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
- ++s;
- --count;
- }
- c=U_SENTINEL;
- }
- *ps=s; /* publish the new position (success and error paths) */
- return c;
-}
-