ICU-64260.0.1.tar.gz

[apple/icu.git] / icuSources / common / utf_impl.cpp
diff --git a/icuSources/common/utf_impl.cpp b/icuSources/common/utf_impl.cpp

index 293e6f181f3a3f9d9932392503afbd47604f6f93..9dd241a12bfa16788e4a6aeb06488c4df9a12a13 100644 (file)
--- a/icuSources/common/utf_impl.cpp
+++ b/icuSources/common/utf_impl.cpp
@@ -7,7 +7,7 @@
  *   Corporation and others.  All Rights Reserved.
  *
  ******************************************************************************
  *   Corporation and others.  All Rights Reserved.
  *
  ******************************************************************************
-*   file name:  utf_impl.c
+*   file name:  utf_impl.cpp
  *   encoding:   UTF-8
  *   tab size:   8 (not used)
  *   indentation:4
  *   encoding:   UTF-8
  *   tab size:   8 (not used)
  *   indentation:4
@@ -27,7 +27,6 @@
  #include "unicode/utypes.h"
  #include "unicode/utf.h"
  #include "unicode/utf8.h"
  #include "unicode/utypes.h"
  #include "unicode/utf.h"
  #include "unicode/utf8.h"
-#include "unicode/utf_old.h"
  #include "uassert.h"
  
  /*
  #include "uassert.h"
  
  /*
@@ -55,10 +54,6 @@
   * - SUB AX, BX (result)
   * -finish:
   * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
   * - SUB AX, BX (result)
   * -finish:
   * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
- *
- * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
- * lead bytes above 0xf4 are illegal.
- * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
   */
  extern "C" U_EXPORT const uint8_t
  utf8_countTrailBytes[256]={
   */
  extern "C" U_EXPORT const uint8_t
  utf8_countTrailBytes[256]={
@@ -77,24 +72,24 @@ utf8_countTrailBytes[256]={
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    // illegal C0 & C1
+    // 2-byte lead bytes C2..DF
+    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  
+    // 3-byte lead bytes E0..EF
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3,
-    3, 3, 3,    /* illegal in Unicode */
-    4, 4, 4, 4, /* illegal in Unicode */
-    5, 5,       /* illegal in Unicode */
-    0, 0        /* illegal bytes 0xfe and 0xff */
+    // 4-byte lead bytes F0..F4
+    // illegal F5..FF
+    3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };
  
  };
  
-static const UChar32
-utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
-
  static const UChar32
  utf8_errorValue[6]={
  static const UChar32
  utf8_errorValue[6]={
-    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
-    0x3ffffff, 0x7fffffff
+    // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
+    // but without relying on the obsolete unicode/utf_old.h.
+    0x15, 0x9f, 0xffff,
+    0x10ffff
  };
  
  static UChar32
  };
  
  static UChar32
@@ -134,61 +129,59 @@ errorValue(int32_t count, int8_t strict) {
   */
  U_CAPI UChar32 U_EXPORT2
  utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
   */
  U_CAPI UChar32 U_EXPORT2
  utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
+    // *pi is one after byte c.
      int32_t i=*pi;
      int32_t i=*pi;
-    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
-    U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
-    if(i+count<=length || length<0) {
-        uint8_t trail;
-
-        U8_MASK_LEAD_BYTE(c, count);
-        /* support NUL-terminated strings: do not read beyond the first non-trail byte */
-        switch(count) {
-        /* each branch falls through to the next one */
-        case 0:
-            /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
-        case 5:
-        case 4:
-            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
-            break;
-        case 3:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            /* c>=0x110 would result in code point>0x10ffff, outside Unicode */
-            if(c>=0x110 || trail>0x3f) { break; }
-            U_FALLTHROUGH;
-        case 2:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            /*
-             * test for a surrogate d800..dfff unless we are lenient:
-             * before the last (c<<6), a surrogate is c=360..37f
-             */
-            if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
-            U_FALLTHROUGH;
-        case 1:
-            trail=s[i++]-0x80;
-            c=(c<<6)|trail;
-            if(trail>0x3f) { break; }
-            /* correct sequence - all trail bytes have (b7..b6)==(10) */
-            if(c>=utf8_minLegal[count] &&
-                    /* strict: forbid non-characters like U+fffe */
-                    (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
+    // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
+    if(i==length || c>0xf4) {
+        // end of string, or not a lead byte
+    } else if(c>=0xf0) {
+        // Test for 4-byte sequences first because
+        // U8_NEXT() handles shorter valid sequences inline.
+        uint8_t t1=s[i], t2, t3;
+        c&=7;
+        if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
+                ++i!=length && (t2=s[i]-0x80)<=0x3f &&
+                ++i!=length && (t3=s[i]-0x80)<=0x3f) {
+            ++i;
+            c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
+            // strict: forbid non-characters like U+fffe
+            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
                  *pi=i;
                  return c;
              }
                  *pi=i;
                  return c;
              }
-        /* no default branch to optimize switch()  - all values are covered */
          }
          }
-    } else {
-        /* too few bytes left */
-        count=length-i;
-    }
+    } else if(c>=0xe0) {
+        c&=0xf;
+        if(strict!=-2) {
+            uint8_t t1=s[i], t2;
+            if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
+                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+                ++i;
+                c=(c<<12)|((t1&0x3f)<<6)|t2;
+                // strict: forbid non-characters like U+fffe
+                if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                    *pi=i;
+                    return c;
+                }
+            }
+        } else {
+            // strict=-2 -> lenient: allow surrogates
+            uint8_t t1=s[i]-0x80, t2;
+            if(t1<=0x3f && (c>0 || t1>=0x20) &&
+                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
+                *pi=i+1;
+                return (c<<12)|(t1<<6)|t2;
+            }
+        }
+    } else if(c>=0xc2) {
+        uint8_t t1=s[i]-0x80;
+        if(t1<=0x3f) {
+            *pi=i+1;
+            return ((c-0xc0)<<6)|t1;
+        }
+    }  // else 0x80<=c<0xc2 is not a lead byte
  
      /* error handling */
  
      /* error handling */
-    i=*pi;
-    while(count>0 && U8_IS_TRAIL(s[i])) {
-        ++i;
-        --count;
-    }
      c=errorValue(i-*pi, strict);
      *pi=i;
      return c;
      c=errorValue(i-*pi, strict);
      *pi=i;
      return c;
@@ -232,7 +225,7 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
              s+=i;
              offset=0;
              c=utf8_errorValue[length-1];
              s+=i;
              offset=0;
              c=utf8_errorValue[length-1];
-            UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
+            U8_APPEND_UNSAFE(s, offset, c);
              i=i+offset;
          }
      }
              i=i+offset;
          }
      }
@@ -241,99 +234,96 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
  
  U_CAPI UChar32 U_EXPORT2
  utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
  
  U_CAPI UChar32 U_EXPORT2
  utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
+    // *pi is the index of byte c.
      int32_t i=*pi;
      int32_t i=*pi;
-    uint8_t b, count=1, shift=6;
-
-    if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); }
-
-    /* extract value bits from the last trail byte */
-    c&=0x3f;
-
-    for(;;) {
-        if(i<=start) {
-            /* no lead byte at all */
-            return errorValue(0, strict);
-        }
-
-        /* read another previous byte */
-        b=s[--i];
-        if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */
-            if(b&0x40) {
-                /* lead byte, this will always end the loop */
-                uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);
-
-                if(count==shouldCount) {
-                    /* set the new position */
-                    *pi=i;
-                    U8_MASK_LEAD_BYTE(b, count);
-                    c|=(UChar32)b<<shift;
-                    if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) {
-                        /* illegal sequence or (strict and non-character) */
-                        if(count>=4) {
-                            count=3;
+    if(U8_IS_TRAIL(c) && i>start) {
+        uint8_t b1=s[--i];
+        if(U8_IS_LEAD(b1)) {
+            if(b1<0xe0) {
+                *pi=i;
+                return ((b1-0xc0)<<6)|(c&0x3f);
+            } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
+                // Truncated 3- or 4-byte sequence.
+                *pi=i;
+                return errorValue(1, strict);
+            }
+        } else if(U8_IS_TRAIL(b1) && i>start) {
+            // Extract the value bits from the last trail byte.
+            c&=0x3f;
+            uint8_t b2=s[--i];
+            if(0xe0<=b2 && b2<=0xf4) {
+                if(b2<0xf0) {
+                    b2&=0xf;
+                    if(strict!=-2) {
+                        if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
+                            *pi=i;
+                            c=(b2<<12)|((b1&0x3f)<<6)|c;
+                            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                                return c;
+                            } else {
+                                // strict: forbid non-characters like U+fffe
+                                return errorValue(2, strict);
+                            }
                          }
                          }
-                        c=errorValue(count, strict);
                      } else {
                      } else {
-                        /* exit with correct c */
+                        // strict=-2 -> lenient: allow surrogates
+                        b1-=0x80;
+                        if((b2>0 || b1>=0x20)) {
+                            *pi=i;
+                            return (b2<<12)|(b1<<6)|c;
+                        }
                      }
                      }
-                } else {
-                    /* the lead byte does not match the number of trail bytes */
-                    /* only set the position to the lead byte if it would
-                       include the trail byte that we started with */
-                    if(count<shouldCount) {
+                } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+                    // Truncated 4-byte sequence.
+                    *pi=i;
+                    return errorValue(2, strict);
+                }
+            } else if(U8_IS_TRAIL(b2) && i>start) {
+                uint8_t b3=s[--i];
+                if(0xf0<=b3 && b3<=0xf4) {
+                    b3&=7;
+                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
                          *pi=i;
                          *pi=i;
-                        c=errorValue(count, strict);
-                    } else {
-                        c=errorValue(0, strict);
+                        c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
+                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
+                            return c;
+                        } else {
+                            // strict: forbid non-characters like U+fffe
+                            return errorValue(3, strict);
+                        }
                      }
                  }
                      }
                  }
-                break;
-            } else if(count<5) {
-                /* trail byte */
-                c|=(UChar32)(b&0x3f)<<shift;
-                ++count;
-                shift+=6;
-            } else {
-                /* more than 5 trail bytes is illegal */
-                c=errorValue(0, strict);
-                break;
              }
              }
-        } else {
-            /* single-byte character precedes trailing bytes */
-            c=errorValue(0, strict);
-            break;
          }
      }
          }
      }
-    return c;
+    return errorValue(0, strict);
  }
  
  U_CAPI int32_t U_EXPORT2
  utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
  }
  
  U_CAPI int32_t U_EXPORT2
  utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
-    /* i had been decremented once before the function call */
-    int32_t I=i, Z;
-    uint8_t b;
-
-    /* read at most the 6 bytes s[Z] to s[i], inclusively */
-    if(I-5>start) {
-        Z=I-5;
-    } else {
-        Z=start;
-    }
-
-    /* return I if the sequence starting there is long enough to include i */
-    do {
-        b=s[I];
-        if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */
-            break;
-        } else if(b>=0xc0) {
-            if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) {
-                return I;
-            } else {
-                break;
+    // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
+    int32_t orig_i=i;
+    uint8_t c=s[i];
+    if(U8_IS_TRAIL(c) && i>start) {
+        uint8_t b1=s[--i];
+        if(U8_IS_LEAD(b1)) {
+            if(b1<0xe0 ||
+                    (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
+                return i;
+            }
+        } else if(U8_IS_TRAIL(b1) && i>start) {
+            uint8_t b2=s[--i];
+            if(0xe0<=b2 && b2<=0xf4) {
+                if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
+                    return i;
+                }
+            } else if(U8_IS_TRAIL(b2) && i>start) {
+                uint8_t b3=s[--i];
+                if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
+                    return i;
+                }
              }
          }
              }
          }
-    } while(Z<=--I);
-
-    /* return i itself to be consistent with the FWD_1 macro */
-    return i;
+    }
+    return orig_i;
  }
  }