icuSources/common/utf_impl.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 *
   6 *   Copyright (C) 1999-2012, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 ******************************************************************************
  10 *   file name:  utf_impl.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 1999sep13
  16 *   created by: Markus W. Scherer
  17 *
  18 *   This file provides implementation functions for macros in the utfXX.h
  19 *   that would otherwise be too long as macros.
  20 */
  21
  22 /* set import/export definitions */
  23 #ifndef U_UTF8_IMPL
  24 #   define U_UTF8_IMPL
  25 #endif
  26
  27 #include "unicode/utypes.h"
  28 #include "unicode/utf.h"
  29 #include "unicode/utf8.h"
  30 #include "uassert.h"
  31
  32 /*
  33  * Table of the number of utf8 trail bytes, indexed by the lead byte.
  34  * Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h
  35  *
  36  * The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.
  37  *
  38  * Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were
  39  * changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES
  40  * may exist in old client code that must continue to run with newer icu library versions.
  41  *
  42  * This table could be replaced on many machines by
  43  * a few lines of assembler code using an
  44  * "index of first 0-bit from msb" instruction and
  45  * one or two more integer instructions.
  46  *
  47  * For example, on an i386, do something like
  48  * - MOV AL, leadByte
  49  * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
  50  * - MOV AH, 0
  51  * - BSR BX, AX     (16-bit)
  52  * - MOV AX, 6      (result)
  53  * - JZ finish      (ZF==1 if leadByte==0xff)
  54  * - SUB AX, BX (result)
  55  * -finish:
  56  * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
  57  */
  58 extern "C" U_EXPORT const uint8_t
  59 utf8_countTrailBytes[256]={
  60     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  61     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  62     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  63     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  64
  65     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  66     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  67     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  68     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  69
  70     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  71     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  72     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  73     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  74
  75     // illegal C0 & C1
  76     // 2-byte lead bytes C2..DF
  77     0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  78     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  79
  80     // 3-byte lead bytes E0..EF
  81     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  82     // 4-byte lead bytes F0..F4
  83     // illegal F5..FF
  84     3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  85 };
  86
  87 static const UChar32
  88 utf8_errorValue[6]={
  89     // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
  90     // but without relying on the obsolete unicode/utf_old.h.
  91     0x15, 0x9f, 0xffff,
  92     0x10ffff
  93 };
  94
  95 static UChar32
  96 errorValue(int32_t count, int8_t strict) {
  97     if(strict>=0) {
  98         return utf8_errorValue[count];
  99     } else if(strict==-3) {
 100         return 0xfffd;
 101     } else {
 102         return U_SENTINEL;
 103     }
 104 }
 105
 106 /*
 107  * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
 108  * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
 109  *
 110  * U8_NEXT() supports NUL-terminated strings indicated via length<0.
 111  *
 112  * The "strict" parameter controls the error behavior:
 113  * <0  "Safe" behavior of U8_NEXT():
 114  *     -1: All illegal byte sequences yield U_SENTINEL=-1.
 115  *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
 116  *         Some implementations use this for roundtripping of
 117  *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
 118  *         contain unpaired surrogates.
 119  *     -3: All illegal byte sequences yield U+FFFD.
 120  *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
 121  *     All illegal byte sequences yield a positive code point such that this
 122  *     result code point would be encoded with the same number of bytes as
 123  *     the illegal sequence.
 124  * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
 125  *     Same as the obsolete "safe" behavior, but non-characters are also treated
 126  *     like illegal sequences.
 127  *
 128  * Note that a UBool is the same as an int8_t.
 129  */
 130 U_CAPI UChar32 U_EXPORT2
 131 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
 132     // *pi is one after byte c.
 133     int32_t i=*pi;
 134     // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
 135     if(i==length || c>0xf4) {
 136         // end of string, or not a lead byte
 137     } else if(c>=0xf0) {
 138         // Test for 4-byte sequences first because
 139         // U8_NEXT() handles shorter valid sequences inline.
 140         uint8_t t1=s[i], t2, t3;
 141         c&=7;
 142         if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
 143                 ++i!=length && (t2=s[i]-0x80)<=0x3f &&
 144                 ++i!=length && (t3=s[i]-0x80)<=0x3f) {
 145             ++i;
 146             c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
 147             // strict: forbid non-characters like U+fffe
 148             if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
 149                 *pi=i;
 150                 return c;
 151             }
 152         }
 153     } else if(c>=0xe0) {
 154         c&=0xf;
 155         if(strict!=-2) {
 156             uint8_t t1=s[i], t2;
 157             if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
 158                     ++i!=length && (t2=s[i]-0x80)<=0x3f) {
 159                 ++i;
 160                 c=(c<<12)|((t1&0x3f)<<6)|t2;
 161                 // strict: forbid non-characters like U+fffe
 162                 if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
 163                     *pi=i;
 164                     return c;
 165                 }
 166             }
 167         } else {
 168             // strict=-2 -> lenient: allow surrogates
 169             uint8_t t1=s[i]-0x80, t2;
 170             if(t1<=0x3f && (c>0 || t1>=0x20) &&
 171                     ++i!=length && (t2=s[i]-0x80)<=0x3f) {
 172                 *pi=i+1;
 173                 return (c<<12)|(t1<<6)|t2;
 174             }
 175         }
 176     } else if(c>=0xc2) {
 177         uint8_t t1=s[i]-0x80;
 178         if(t1<=0x3f) {
 179             *pi=i+1;
 180             return ((c-0xc0)<<6)|t1;
 181         }
 182     }  // else 0x80<=c<0xc2 is not a lead byte
 183
 184     /* error handling */
 185     c=errorValue(i-*pi, strict);
 186     *pi=i;
 187     return c;
 188 }
 189
 190 U_CAPI int32_t U_EXPORT2
 191 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
 192     if((uint32_t)(c)<=0x7ff) {
 193         if((i)+1<(length)) {
 194             (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
 195             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
 196             return i;
 197         }
 198     } else if((uint32_t)(c)<=0xffff) {
 199         /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
 200         if((i)+2<(length) && !U_IS_SURROGATE(c)) {
 201             (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
 202             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
 203             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
 204             return i;
 205         }
 206     } else if((uint32_t)(c)<=0x10ffff) {
 207         if((i)+3<(length)) {
 208             (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
 209             (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
 210             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
 211             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
 212             return i;
 213         }
 214     }
 215     /* c>0x10ffff or not enough space, write an error value */
 216     if(pIsError!=NULL) {
 217         *pIsError=TRUE;
 218     } else {
 219         length-=i;
 220         if(length>0) {
 221             int32_t offset;
 222             if(length>3) {
 223                 length=3;
 224             }
 225             s+=i;
 226             offset=0;
 227             c=utf8_errorValue[length-1];
 228             U8_APPEND_UNSAFE(s, offset, c);
 229             i=i+offset;
 230         }
 231     }
 232     return i;
 233 }
 234
 235 U_CAPI UChar32 U_EXPORT2
 236 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
 237     // *pi is the index of byte c.
 238     int32_t i=*pi;
 239     if(U8_IS_TRAIL(c) && i>start) {
 240         uint8_t b1=s[--i];
 241         if(U8_IS_LEAD(b1)) {
 242             if(b1<0xe0) {
 243                 *pi=i;
 244                 return ((b1-0xc0)<<6)|(c&0x3f);
 245             } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
 246                 // Truncated 3- or 4-byte sequence.
 247                 *pi=i;
 248                 return errorValue(1, strict);
 249             }
 250         } else if(U8_IS_TRAIL(b1) && i>start) {
 251             // Extract the value bits from the last trail byte.
 252             c&=0x3f;
 253             uint8_t b2=s[--i];
 254             if(0xe0<=b2 && b2<=0xf4) {
 255                 if(b2<0xf0) {
 256                     b2&=0xf;
 257                     if(strict!=-2) {
 258                         if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
 259                             *pi=i;
 260                             c=(b2<<12)|((b1&0x3f)<<6)|c;
 261                             if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
 262                                 return c;
 263                             } else {
 264                                 // strict: forbid non-characters like U+fffe
 265                                 return errorValue(2, strict);
 266                             }
 267                         }
 268                     } else {
 269                         // strict=-2 -> lenient: allow surrogates
 270                         b1-=0x80;
 271                         if((b2>0 || b1>=0x20)) {
 272                             *pi=i;
 273                             return (b2<<12)|(b1<<6)|c;
 274                         }
 275                     }
 276                 } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
 277                     // Truncated 4-byte sequence.
 278                     *pi=i;
 279                     return errorValue(2, strict);
 280                 }
 281             } else if(U8_IS_TRAIL(b2) && i>start) {
 282                 uint8_t b3=s[--i];
 283                 if(0xf0<=b3 && b3<=0xf4) {
 284                     b3&=7;
 285                     if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
 286                         *pi=i;
 287                         c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
 288                         if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
 289                             return c;
 290                         } else {
 291                             // strict: forbid non-characters like U+fffe
 292                             return errorValue(3, strict);
 293                         }
 294                     }
 295                 }
 296             }
 297         }
 298     }
 299     return errorValue(0, strict);
 300 }
 301
 302 U_CAPI int32_t U_EXPORT2
 303 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
 304     // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
 305     int32_t orig_i=i;
 306     uint8_t c=s[i];
 307     if(U8_IS_TRAIL(c) && i>start) {
 308         uint8_t b1=s[--i];
 309         if(U8_IS_LEAD(b1)) {
 310             if(b1<0xe0 ||
 311                     (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
 312                 return i;
 313             }
 314         } else if(U8_IS_TRAIL(b1) && i>start) {
 315             uint8_t b2=s[--i];
 316             if(0xe0<=b2 && b2<=0xf4) {
 317                 if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
 318                     return i;
 319                 }
 320             } else if(U8_IS_TRAIL(b2) && i>start) {
 321                 uint8_t b3=s[--i];
 322                 if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
 323                     return i;
 324                 }
 325             }
 326         }
 327     }
 328     return orig_i;
 329 }