git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/bmpset.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / bmpset.cpp
index 83cc9064f7e62da4484c51868264046202f3e027..bc79f5e5a63be1cacf4b142acde01bd39bba633e 100644 (file)
@@ -1,12 +1,14 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2007-2011, International Business Machines
+*   Copyright (C) 2007-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 *   file name:  bmpset.cpp
-*   encoding:   US-ASCII
+*   encoding:   UTF-8
 *   tab size:   8 (not used)
 *   indentation:4
 *
@@ -26,7 +28,7 @@ U_NAMESPACE_BEGIN
 
 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
         list(parentList), listLength(parentListLength) {
-    uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
+    uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
     uprv_memset(table7FF, 0, sizeof(table7FF));
     uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
 
@@ -43,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
         list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
     }
     list4kStarts[0x11]=listLength-1;
+    containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
 
     initBits();
     overrideIllegal();
 }
 
 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
+        containsFFFD(otherBMPSet.containsFFFD),
         list(newParentList), listLength(newParentListLength) {
-    uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
+    uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
     uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
     uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
     uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
@@ -96,9 +100,9 @@ static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
             ++lead;
         }
         if(lead<limitLead) {
-            bits=~((1<<lead)-1);
+            bits=~(((unsigned)1<<lead)-1);
             if(limitLead<0x20) {
-                bits&=(1<<limitLead)-1;
+                bits&=((unsigned)1<<limitLead)-1;
             }
             for(trail=0; trail<64; ++trail) {
                 table[trail]|=bits;
@@ -107,7 +111,7 @@ static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
         // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
         // In that case, bits=1<<limitLead is undefined but the bits value
         // is not used because trail<limitTrail is already false.
-        bits=1<<limitLead;
+        bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
         for(trail=0; trail<limitTrail; ++trail) {
             table[trail]|=bits;
         }
@@ -118,7 +122,7 @@ void BMPSet::initBits() {
     UChar32 start, limit;
     int32_t listIndex=0;
 
-    // Set asciiBytes[].
+    // Set latin1Contains[].
     do {
         start=list[listIndex++];
         if(listIndex<listLength) {
@@ -126,13 +130,30 @@ void BMPSet::initBits() {
         } else {
             limit=0x110000;
         }
-        if(start>=0x80) {
+        if(start>=0x100) {
             break;
         }
         do {
-            asciiBytes[start++]=1;
-        } while(start<limit && start<0x80);
-    } while(limit<=0x80);
+            latin1Contains[start++]=1;
+        } while(start<limit && start<0x100);
+    } while(limit<=0x100);
+
+    // Find the first range overlapping with (or after) 80..FF again,
+    // to include them in table7FF as well.
+    for(listIndex=0;;) {
+        start=list[listIndex++];
+        if(listIndex<listLength) {
+            limit=list[listIndex++];
+        } else {
+            limit=0x110000;
+        }
+        if(limit>0x80) {
+            if(start<0x80) {
+                start=0x80;
+            }
+            break;
+        }
+    }
 
     // Set table7FF[].
     while(start<0x800) {
@@ -202,19 +223,14 @@ void BMPSet::initBits() {
  * for faster validity checking at runtime.
  * No need to set 0 values where they were reset to 0 in the constructor
  * and not modified by initBits().
- * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
+ * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
  * Need to set 0 values for surrogates D800..DFFF.
  */
 void BMPSet::overrideIllegal() {
     uint32_t bits, mask;
     int32_t i;
 
-    if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
-        // contains(FFFD)==TRUE
-        for(i=0x80; i<0xc0; ++i) {
-            asciiBytes[i]=1;
-        }
-
+    if(containsFFFD) {
         bits=3;                 // Lead bytes 0xC0 and 0xC1.
         for(i=0; i<64; ++i) {
             table7FF[i]|=bits;
@@ -225,14 +241,13 @@ void BMPSet::overrideIllegal() {
             bmpBlockBits[i]|=bits;
         }
 
-        mask=~(0x10001<<0xd);   // Lead byte 0xED.
+        mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.
         bits=1<<0xd;
         for(i=32; i<64; ++i) {  // Second half of 4k block.
             bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
         }
     } else {
-        // contains(FFFD)==FALSE
-        mask=~(0x10001<<0xd);   // Lead byte 0xED.
+        mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.
         for(i=32; i<64; ++i) {  // Second half of 4k block.
             bmpBlockBits[i]&=mask;
         }
@@ -275,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
 
 UBool
 BMPSet::contains(UChar32 c) const {
-    if((uint32_t)c<=0x7f) {
-        return (UBool)asciiBytes[c];
+    if((uint32_t)c<=0xff) {
+        return (UBool)latin1Contains[c];
     } else if((uint32_t)c<=0x7ff) {
         return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
     } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
@@ -312,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
         // span
         do {
             c=*s;
-            if(c<=0x7f) {
-                if(!asciiBytes[c]) {
+            if(c<=0xff) {
+                if(!latin1Contains[c]) {
                     break;
                 }
             } else if(c<=0x7ff) {
@@ -352,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
         // span not
         do {
             c=*s;
-            if(c<=0x7f) {
-                if(asciiBytes[c]) {
+            if(c<=0xff) {
+                if(latin1Contains[c]) {
                     break;
                 }
             } else if(c<=0x7ff) {
@@ -401,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
         // span
         for(;;) {
             c=*(--limit);
-            if(c<=0x7f) {
-                if(!asciiBytes[c]) {
+            if(c<=0xff) {
+                if(!latin1Contains[c]) {
                     break;
                 }
             } else if(c<=0x7ff) {
@@ -444,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
         // span not
         for(;;) {
             c=*(--limit);
-            if(c<=0x7f) {
-                if(asciiBytes[c]) {
+            if(c<=0xff) {
+                if(latin1Contains[c]) {
                     break;
                 }
             } else if(c<=0x7ff) {
@@ -495,22 +510,22 @@ const uint8_t *
 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
     const uint8_t *limit=s+length;
     uint8_t b=*s;
-    if((int8_t)b>=0) {
+    if(U8_IS_SINGLE(b)) {
         // Initial all-ASCII span.
         if(spanCondition) {
             do {
-                if(!asciiBytes[b] || ++s==limit) {
+                if(!latin1Contains[b] || ++s==limit) {
                     return s;
                 }
                 b=*s;
-            } while((int8_t)b>=0);
+            } while(U8_IS_SINGLE(b));
         } else {
             do {
-                if(asciiBytes[b] || ++s==limit) {
+                if(latin1Contains[b] || ++s==limit) {
                     return s;
                 }
                 b=*s;
-            } while((int8_t)b>=0);
+            } while(U8_IS_SINGLE(b));
         }
         length=(int32_t)(limit-s);
     }
@@ -538,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
             // single trail byte, check for preceding 3- or 4-byte lead byte
             if(length>=2 && (b=*(limit-2))>=0xe0) {
                 limit-=2;
-                if(asciiBytes[0x80]!=spanCondition) {
+                if(containsFFFD!=spanCondition) {
                     limit0=limit;
                 }
             } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
                 // 4-byte lead byte with only two trail bytes
                 limit-=3;
-                if(asciiBytes[0x80]!=spanCondition) {
+                if(containsFFFD!=spanCondition) {
                     limit0=limit;
                 }
             }
         } else {
             // lead byte with no trail bytes
             --limit;
-            if(asciiBytes[0x80]!=spanCondition) {
+            if(containsFFFD!=spanCondition) {
                 limit0=limit;
             }
         }
@@ -561,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
 
     while(s<limit) {
         b=*s;
-        if(b<0xc0) {
-            // ASCII; or trail bytes with the result of contains(FFFD).
+        if(U8_IS_SINGLE(b)) {
+            // ASCII
             if(spanCondition) {
                 do {
-                    if(!asciiBytes[b]) {
+                    if(!latin1Contains[b]) {
                         return s;
                     } else if(++s==limit) {
                         return limit0;
                     }
                     b=*s;
-                } while(b<0xc0);
+                } while(U8_IS_SINGLE(b));
             } else {
                 do {
-                    if(asciiBytes[b]) {
+                    if(latin1Contains[b]) {
                         return s;
                     } else if(++s==limit) {
                         return limit0;
                     }
                     b=*s;
-                } while(b<0xc0);
+                } while(U8_IS_SINGLE(b));
             }
         }
         ++s;  // Advance past the lead byte.
@@ -617,7 +632,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
                 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
                 if( (   (0x10000<=c && c<=0x10ffff) ?
                             containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
-                            asciiBytes[0x80]
+                            containsFFFD
                     ) != spanCondition
                 ) {
                     return s-1;
@@ -625,8 +640,9 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
                 s+=3;
                 continue;
             }
-        } else /* 0xc0<=b<0xe0 */ {
+        } else {
             if( /* handle U+0000..U+07FF inline */
+                b>=0xc0 &&
                 (t1=(uint8_t)(*s-0x80)) <= 0x3f
             ) {
                 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
@@ -640,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
         // Give an illegal sequence the same value as the result of contains(FFFD).
         // Handle each byte of an illegal sequence separately to simplify the code;
         // no need to optimize error handling.
-        if(asciiBytes[0x80]!=spanCondition) {
+        if(containsFFFD!=spanCondition) {
             return s-1;
         }
     }
@@ -665,41 +681,34 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
 
     do {
         b=s[--length];
-        if((int8_t)b>=0) {
+        if(U8_IS_SINGLE(b)) {
             // ASCII sub-span
             if(spanCondition) {
                 do {
-                    if(!asciiBytes[b]) {
+                    if(!latin1Contains[b]) {
                         return length+1;
                     } else if(length==0) {
                         return 0;
                     }
                     b=s[--length];
-                } while((int8_t)b>=0);
+                } while(U8_IS_SINGLE(b));
             } else {
                 do {
-                    if(asciiBytes[b]) {
+                    if(latin1Contains[b]) {
                         return length+1;
                     } else if(length==0) {
                         return 0;
                     }
                     b=s[--length];
-                } while((int8_t)b>=0);
+                } while(U8_IS_SINGLE(b));
             }
         }
 
         int32_t prev=length;
         UChar32 c;
-        if(b<0xc0) {
-            // trail byte: collect a multi-byte character
-            c=utf8_prevCharSafeBody(s, 0, &length, b, -1);
-            if(c<0) {
-                c=0xfffd;
-            }
-        } else {
-            // lead byte in last-trail position
-            c=0xfffd;
-        }
+        // trail byte: collect a multi-byte character
+        // (or  lead byte in last-trail position)
+        c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
         // c is a valid code point, not ASCII, not a surrogate
         if(c<=0x7ff) {
             if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {