X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/4388f060552cc537e71e957d32f35e9d75a61233..HEAD:/icuSources/common/bmpset.cpp?ds=sidebyside diff --git a/icuSources/common/bmpset.cpp b/icuSources/common/bmpset.cpp index 83cc9064..bc79f5e5 100644 --- a/icuSources/common/bmpset.cpp +++ b/icuSources/common/bmpset.cpp @@ -1,12 +1,14 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * -* Copyright (C) 2007-2011, International Business Machines +* Copyright (C) 2007-2012, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: bmpset.cpp -* encoding: US-ASCII +* encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * @@ -26,7 +28,7 @@ U_NAMESPACE_BEGIN BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : list(parentList), listLength(parentListLength) { - uprv_memset(asciiBytes, 0, sizeof(asciiBytes)); + uprv_memset(latin1Contains, 0, sizeof(latin1Contains)); uprv_memset(table7FF, 0, sizeof(table7FF)); uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); @@ -43,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); } list4kStarts[0x11]=listLength-1; + containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]); initBits(); overrideIllegal(); } BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : + containsFFFD(otherBMPSet.containsFFFD), list(newParentList), listLength(newParentListLength) { - uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes)); + uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains)); uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); @@ -96,9 +100,9 @@ static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) { ++lead; } if(lead=0x80) { + if(start>=0x100) { break; } do { - asciiBytes[start++]=1; - } while(start0x80) { + if(start<0x80) { + start=0x80; + } + break; + } + } // Set table7FF[]. while(start<0x800) { @@ -202,19 +223,14 @@ void BMPSet::initBits() { * for faster validity checking at runtime. * No need to set 0 values where they were reset to 0 in the constructor * and not modified by initBits(). - * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF) + * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF) * Need to set 0 values for surrogates D800..DFFF. */ void BMPSet::overrideIllegal() { uint32_t bits, mask; int32_t i; - if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) { - // contains(FFFD)==TRUE - for(i=0x80; i<0xc0; ++i) { - asciiBytes[i]=1; - } - + if(containsFFFD) { bits=3; // Lead bytes 0xC0 and 0xC1. for(i=0; i<64; ++i) { table7FF[i]|=bits; @@ -225,14 +241,13 @@ void BMPSet::overrideIllegal() { bmpBlockBits[i]|=bits; } - mask=~(0x10001<<0xd); // Lead byte 0xED. + mask= static_cast(~(0x10001<<0xd)); // Lead byte 0xED. bits=1<<0xd; for(i=32; i<64; ++i) { // Second half of 4k block. bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits; } } else { - // contains(FFFD)==FALSE - mask=~(0x10001<<0xd); // Lead byte 0xED. + mask= static_cast(~(0x10001<<0xd)); // Lead byte 0xED. for(i=32; i<64; ++i) { // Second half of 4k block. bmpBlockBits[i]&=mask; } @@ -275,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { UBool BMPSet::contains(UChar32 c) const { - if((uint32_t)c<=0x7f) { - return (UBool)asciiBytes[c]; + if((uint32_t)c<=0xff) { + return (UBool)latin1Contains[c]; } else if((uint32_t)c<=0x7ff) { return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0); } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { @@ -312,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition // span do { c=*s; - if(c<=0x7f) { - if(!asciiBytes[c]) { + if(c<=0xff) { + if(!latin1Contains[c]) { break; } } else if(c<=0x7ff) { @@ -352,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition // span not do { c=*s; - if(c<=0x7f) { - if(asciiBytes[c]) { + if(c<=0xff) { + if(latin1Contains[c]) { break; } } else if(c<=0x7ff) { @@ -401,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi // span for(;;) { c=*(--limit); - if(c<=0x7f) { - if(!asciiBytes[c]) { + if(c<=0xff) { + if(!latin1Contains[c]) { break; } } else if(c<=0x7ff) { @@ -444,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi // span not for(;;) { c=*(--limit); - if(c<=0x7f) { - if(asciiBytes[c]) { + if(c<=0xff) { + if(latin1Contains[c]) { break; } } else if(c<=0x7ff) { @@ -495,22 +510,22 @@ const uint8_t * BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { const uint8_t *limit=s+length; uint8_t b=*s; - if((int8_t)b>=0) { + if(U8_IS_SINGLE(b)) { // Initial all-ASCII span. if(spanCondition) { do { - if(!asciiBytes[b] || ++s==limit) { + if(!latin1Contains[b] || ++s==limit) { return s; } b=*s; - } while((int8_t)b>=0); + } while(U8_IS_SINGLE(b)); } else { do { - if(asciiBytes[b] || ++s==limit) { + if(latin1Contains[b] || ++s==limit) { return s; } b=*s; - } while((int8_t)b>=0); + } while(U8_IS_SINGLE(b)); } length=(int32_t)(limit-s); } @@ -538,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi // single trail byte, check for preceding 3- or 4-byte lead byte if(length>=2 && (b=*(limit-2))>=0xe0) { limit-=2; - if(asciiBytes[0x80]!=spanCondition) { + if(containsFFFD!=spanCondition) { limit0=limit; } } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { // 4-byte lead byte with only two trail bytes limit-=3; - if(asciiBytes[0x80]!=spanCondition) { + if(containsFFFD!=spanCondition) { limit0=limit; } } } else { // lead byte with no trail bytes --limit; - if(asciiBytes[0x80]!=spanCondition) { + if(containsFFFD!=spanCondition) { limit0=limit; } } @@ -561,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi while(s=0xc0 && (t1=(uint8_t)(*s-0x80)) <= 0x3f ) { if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { @@ -640,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi // Give an illegal sequence the same value as the result of contains(FFFD). // Handle each byte of an illegal sequence separately to simplify the code; // no need to optimize error handling. - if(asciiBytes[0x80]!=spanCondition) { + if(containsFFFD!=spanCondition) { return s-1; } } @@ -665,41 +681,34 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon do { b=s[--length]; - if((int8_t)b>=0) { + if(U8_IS_SINGLE(b)) { // ASCII sub-span if(spanCondition) { do { - if(!asciiBytes[b]) { + if(!latin1Contains[b]) { return length+1; } else if(length==0) { return 0; } b=s[--length]; - } while((int8_t)b>=0); + } while(U8_IS_SINGLE(b)); } else { do { - if(asciiBytes[b]) { + if(latin1Contains[b]) { return length+1; } else if(length==0) { return 0; } b=s[--length]; - } while((int8_t)b>=0); + } while(U8_IS_SINGLE(b)); } } int32_t prev=length; UChar32 c; - if(b<0xc0) { - // trail byte: collect a multi-byte character - c=utf8_prevCharSafeBody(s, 0, &length, b, -1); - if(c<0) { - c=0xfffd; - } - } else { - // lead byte in last-trail position - c=0xfffd; - } + // trail byte: collect a multi-byte character + // (or lead byte in last-trail position) + c=utf8_prevCharSafeBody(s, 0, &length, b, -3); // c is a valid code point, not ASCII, not a surrogate if(c<=0x7ff) { if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {