ICU-62107.0.1.tar.gz

[apple/icu.git] / icuSources / common / utf_impl.cpp
diff --git a/icuSources/common/utf_impl.cpp b/icuSources/common/utf_impl.cpp

index 293e6f181f3a3f9d9932392503afbd47604f6f93..9dd241a12bfa16788e4a6aeb06488c4df9a12a13 100644 (file)
--- a/icuSources/common/utf_impl.cpp
+++ b/icuSources/common/utf_impl.cpp
@@ -7,7 +7,7 @@
  *   Corporation and others.  All Rights Reserved.
  *
  ******************************************************************************
-*   file name:  utf_impl.c
+*   file name:  utf_impl.cpp
  *   encoding:   UTF-8
  *   tab size:   8 (not used)
  *   indentation:4
@@ -27,7 +27,6 @@
  #include "unicode/utypes.h"
  #include "unicode/utf.h"
  #include "unicode/utf8.h"
-#include "unicode/utf_old.h"
  #include "uassert.h"
  
  /*
@@ -55,10 +54,6 @@
   * - SUB AX, BX (result)
   * -finish:
   * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
- *
- * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
- * lead bytes above 0xf4 are illegal.
- * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
   */
  extern "C" U_EXPORT const uint8_t
  utf8_countTrailBytes[256]={
@@ -77,24 +72,24 @@ utf8_countTrailBytes[256]={
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    // illegal C0 & C1
+    // 2-byte lead bytes C2..DF
+    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  
+    // 3-byte lead bytes E0..EF
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3,
-    3, 3, 3,    /* illegal in Unicode */
-    4, 4, 4, 4, /* illegal in Unicode */
-    5, 5,       /* illegal in Unicode */
-    0, 0        /* illegal bytes 0xfe and 0xff */
+    // 4-byte lead bytes F0..F4
+    // illegal F5..FF
+    3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };
  
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
  static const UChar32
  utf8_errorValue[6]={
-    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
-    0x3ffffff, 0x7fffffff
+    // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
+    // but without relying on the obsolete unicode/utf_old.h.
+    0x15, 0x9f, 0xffff,
+    0x10ffff
  };
  
  static UChar32
@@ -134,61 +129,59 @@ errorValue(int32_t count, int8_t strict) {
   */
  U_CAPI UChar32 U_EXPORT2
  utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
+    // *pi is one after byte c.
      int32_t i=*pi;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
-    if(i+count<=length || length<0) {
-        uint8_t trail;
-
-        U8_MASK_LEAD_BYTE(c, count);
-        /* support NUL-terminated strings: do not read beyond the first non-trail byte */
-        switch(count) {
-        /* each branch falls through to the next one */
-        case 0:
-            /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-        case 5:
-        case 4:
-            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-            break;
-        case 3:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            /* c>=0x110 would result in code point>0x10ffff, outside Unicode */
-            if(c>=0x110 || trail>0x3f) { break; }
-            U_FALLTHROUGH;
-        case 2:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            /*
-             * test for a surrogate d800..dfff unless we are lenient:
-             * before the last (c<<6), a surrogate is c=360..37f
-             */
-            if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
-            U_FALLTHROUGH;
-        case 1:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            if(trail>0x3f) { break; }
-            /* correct sequence - all trail bytes have (b7..b6)==(10) */
-            if(c>=utf8_minLegal[count] &&
-                    /* strict: forbid non-characters like U+fffe */
-                    (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
+    // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
+    if(i==length || c>0xf4) {
+        // end of string, or not a lead byte
+    } else if(c>=0xf0) {
+        // Test for 4-byte sequences first because
+        // U8_NEXT() handles shorter valid sequences inline.
+        uint8_t t1=s[i], t2, t3;
+        c&=7;
+        if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
+                ++i!=length && (t2=s[i]-0x80)<=0x3f &&
+                ++i!=length && (t3=s[i]-0x80)<=0x3f) {
+            ++i;
+            c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
+            // strict: forbid non-characters like U+fffe
+            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
                  *pi=i;
                  return c;
              }
-        /* no default branch to optimize switch()  - all values are covered */
          }
-    } else {
-        /* too few bytes left */
-        count=length-i;
-    }
+    } else if(c>=0xe0) {
+        c&=0xf;
+        if(strict!=-2) {
+            uint8_t t1=s[i], t2;
+            if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
+                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+                ++i;
+                c=(c<<12)|((t1&0x3f)<<6)|t2;
+                // strict: forbid non-characters like U+fffe
+                if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                    *pi=i;
+                    return c;
+                }
+            }
+        } else {
+            // strict=-2 -> lenient: allow surrogates
+            uint8_t t1=s[i]-0x80, t2;
+            if(t1<=0x3f && (c>0 || t1>=0x20) &&
+                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+                *pi=i+1;
+                return (c<<12)|(t1<<6)|t2;
+            }
+        }
+    } else if(c>=0xc2) {
+        uint8_t t1=s[i]-0x80;
+        if(t1<=0x3f) {
+            *pi=i+1;
+            return ((c-0xc0)<<6)|t1;
+        }
+    }  // else 0x80<=c<0xc2 is not a lead byte
  
      /* error handling */
-    i=*pi;
-    while(count>0 && U8_IS_TRAIL(s[i])) {
-        ++i;
-        --count;
-    }
      c=errorValue(i-*pi, strict);
      *pi=i;
      return c;
@@ -232,7 +225,7 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
              s+=i;
              offset=0;
              c=utf8_errorValue[length-1];
-            UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
+            U8_APPEND_UNSAFE(s, offset, c);
              i=i+offset;
          }
      }
@@ -241,99 +234,96 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
  
  U_CAPI UChar32 U_EXPORT2
  utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
+    // *pi is the index of byte c.
      int32_t i=*pi;
-    uint8_t b, count=1, shift=6;
-
-    if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
-
-    /* extract value bits from the last trail byte */
-    c&=0x3f;
-
-    for(;;) {
-        if(i<=start) {
-            /* no lead byte at all */
-            return errorValue(0, strict);
-        }
-
-        /* read another previous byte */
-        b=s[--i];
-        if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
-            if(b&0x40) {
-                /* lead byte, this will always end the loop */
-                uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
-
-                if(count==shouldCount) {
-                    /* set the new position */
-                    *pi=i;
-                    U8_MASK_LEAD_BYTE(b, count);
-                    c|=(UChar32)b<<shift;
-                    if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
-                        /* illegal sequence or (strict and non-character) */
-                        if(count>=4) {
-                            count=3;
+    if(U8_IS_TRAIL(c) && i>start) {
+        uint8_t b1=s[--i];
+        if(U8_IS_LEAD(b1)) {
+            if(b1<0xe0) {
+                *pi=i;
+                return ((b1-0xc0)<<6)|(c&0x3f);
+            } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
+                // Truncated 3- or 4-byte sequence.
+                *pi=i;
+                return errorValue(1, strict);
+            }
+        } else if(U8_IS_TRAIL(b1) && i>start) {
+            // Extract the value bits from the last trail byte.
+            c&=0x3f;
+            uint8_t b2=s[--i];
+            if(0xe0<=b2 && b2<=0xf4) {
+                if(b2<0xf0) {
+                    b2&=0xf;
+                    if(strict!=-2) {
+                        if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                            *pi=i;
+                            c=(b2<<12)|((b1&0x3f)<<6)|c;
+                            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                                return c;
+                            } else {
+                                // strict: forbid non-characters like U+fffe
+                                return errorValue(2, strict);
+                            }
                          }
-                        c=errorValue(count, strict);
                      } else {
-                        /* exit with correct c */
+                        // strict=-2 -> lenient: allow surrogates
+                        b1-=0x80;
+                        if((b2>0 || b1>=0x20)) {
+                            *pi=i;
+                            return (b2<<12)|(b1<<6)|c;
+                        }
                      }
-                } else {
-                    /* the lead byte does not match the number of trail bytes */
-                    /* only set the position to the lead byte if it would
-                       include the trail byte that we started with */
-                    if(count<shouldCount) {
+                } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+                    // Truncated 4-byte sequence.
+                    *pi=i;
+                    return errorValue(2, strict);
+                }
+            } else if(U8_IS_TRAIL(b2) && i>start) {
+                uint8_t b3=s[--i];
+                if(0xf0<=b3 && b3<=0xf4) {
+                    b3&=7;
+                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
                          *pi=i;
-                        c=errorValue(count, strict);
-                    } else {
-                        c=errorValue(0, strict);
+                        c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
+                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                            return c;
+                        } else {
+                            // strict: forbid non-characters like U+fffe
+                            return errorValue(3, strict);
+                        }
                      }
                  }
-                break;
-            } else if(count<5) {
-                /* trail byte */
-                c|=(UChar32)(b&0x3f)<<shift;
-                ++count;
-                shift+=6;
-            } else {
-                /* more than 5 trail bytes is illegal */
-                c=errorValue(0, strict);
-                break;
              }
-        } else {
-            /* single-byte character precedes trailing bytes */
-            c=errorValue(0, strict);
-            break;
          }
      }
-    return c;
+    return errorValue(0, strict);
  }
  
  U_CAPI int32_t U_EXPORT2
  utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
-    /* i had been decremented once before the function call */
-    int32_t I=i, Z;
-    uint8_t b;
-
-    /* read at most the 6 bytes s[Z] to s[i], inclusively */
-    if(I-5>start) {
-        Z=I-5;
-    } else {
-        Z=start;
-    }
-
-    /* return I if the sequence starting there is long enough to include i */
-    do {
-        b=s[I];
-        if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
-            break;
-        } else if(b>=0xc0) {
-            if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
-                return I;
-            } else {
-                break;
+    // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
+    int32_t orig_i=i;
+    uint8_t c=s[i];
+    if(U8_IS_TRAIL(c) && i>start) {
+        uint8_t b1=s[--i];
+        if(U8_IS_LEAD(b1)) {
+            if(b1<0xe0 ||
+                    (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+                return i;
+            }
+        } else if(U8_IS_TRAIL(b1) && i>start) {
+            uint8_t b2=s[--i];
+            if(0xe0<=b2 && b2<=0xf4) {
+                if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+                    return i;
+                }
+            } else if(U8_IS_TRAIL(b2) && i>start) {
+                uint8_t b3=s[--i];
+                if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
+                    return i;
+                }
              }
          }
-    } while(Z<=--I);
-
-    /* return i itself to be consistent with the FWD_1 macro */
-    return i;
+    }
+    return orig_i;
  }