+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
-* Copyright (C) 2007-2011, International Business Machines
+* Copyright (C) 2007-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: bmpset.cpp
-* encoding: US-ASCII
+* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
list(parentList), listLength(parentListLength) {
- uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
+ uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
uprv_memset(table7FF, 0, sizeof(table7FF));
uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
}
list4kStarts[0x11]=listLength-1;
+ containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
initBits();
overrideIllegal();
}
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
+ containsFFFD(otherBMPSet.containsFFFD),
list(newParentList), listLength(newParentListLength) {
- uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
+ uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
++lead;
}
if(lead<limitLead) {
- bits=~((1<<lead)-1);
+ bits=~(((unsigned)1<<lead)-1);
if(limitLead<0x20) {
- bits&=(1<<limitLead)-1;
+ bits&=((unsigned)1<<limitLead)-1;
}
for(trail=0; trail<64; ++trail) {
table[trail]|=bits;
// limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
// In that case, a shift by limitLead (32) would be undefined, so the shift
// count is clamped to 31; the resulting bits value is not used anyway
// because trail<limitTrail is already false.
- bits=1<<limitLead;
+ bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
for(trail=0; trail<limitTrail; ++trail) {
table[trail]|=bits;
}
UChar32 start, limit;
int32_t listIndex=0;
- // Set asciiBytes[].
+ // Set latin1Contains[].
do {
start=list[listIndex++];
if(listIndex<listLength) {
} else {
limit=0x110000;
}
- if(start>=0x80) {
+ if(start>=0x100) {
break;
}
do {
- asciiBytes[start++]=1;
- } while(start<limit && start<0x80);
- } while(limit<=0x80);
+ latin1Contains[start++]=1;
+ } while(start<limit && start<0x100);
+ } while(limit<=0x100);
+
+ // Find the first range overlapping with (or after) 80..FF again,
+ // to include them in table7FF as well.
+ for(listIndex=0;;) {
+ start=list[listIndex++];
+ if(listIndex<listLength) {
+ limit=list[listIndex++];
+ } else {
+ limit=0x110000;
+ }
+ if(limit>0x80) {
+ if(start<0x80) {
+ start=0x80;
+ }
+ break;
+ }
+ }
// Set table7FF[].
while(start<0x800) {
* for faster validity checking at runtime.
* No need to set 0 values where they were reset to 0 in the constructor
* and not modified by initBits().
- * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
+ * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* Need to set 0 values for surrogates D800..DFFF.
*/
void BMPSet::overrideIllegal() {
uint32_t bits, mask;
int32_t i;
- if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
- // contains(FFFD)==TRUE
- for(i=0x80; i<0xc0; ++i) {
- asciiBytes[i]=1;
- }
-
+ if(containsFFFD) {
bits=3; // Lead bytes 0xC0 and 0xC1.
for(i=0; i<64; ++i) {
table7FF[i]|=bits;
bmpBlockBits[i]|=bits;
}
- mask=~(0x10001<<0xd); // Lead byte 0xED.
+ mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
bits=1<<0xd;
for(i=32; i<64; ++i) { // Second half of 4k block.
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
}
} else {
- // contains(FFFD)==FALSE
- mask=~(0x10001<<0xd); // Lead byte 0xED.
+ mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
for(i=32; i<64; ++i) { // Second half of 4k block.
bmpBlockBits[i]&=mask;
}
UBool
BMPSet::contains(UChar32 c) const {
- if((uint32_t)c<=0x7f) {
- return (UBool)asciiBytes[c];
+ if((uint32_t)c<=0xff) {
+ return (UBool)latin1Contains[c];
} else if((uint32_t)c<=0x7ff) {
return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
} else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
// span
do {
c=*s;
- if(c<=0x7f) {
- if(!asciiBytes[c]) {
+ if(c<=0xff) {
+ if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
// span not
do {
c=*s;
- if(c<=0x7f) {
- if(asciiBytes[c]) {
+ if(c<=0xff) {
+ if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
// span
for(;;) {
c=*(--limit);
- if(c<=0x7f) {
- if(!asciiBytes[c]) {
+ if(c<=0xff) {
+ if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
// span not
for(;;) {
c=*(--limit);
- if(c<=0x7f) {
- if(asciiBytes[c]) {
+ if(c<=0xff) {
+ if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
const uint8_t *limit=s+length;
uint8_t b=*s;
- if((int8_t)b>=0) {
+ if(U8_IS_SINGLE(b)) {
// Initial all-ASCII span.
if(spanCondition) {
do {
- if(!asciiBytes[b] || ++s==limit) {
+ if(!latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
- } while((int8_t)b>=0);
+ } while(U8_IS_SINGLE(b));
} else {
do {
- if(asciiBytes[b] || ++s==limit) {
+ if(latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
- } while((int8_t)b>=0);
+ } while(U8_IS_SINGLE(b));
}
length=(int32_t)(limit-s);
}
// single trail byte, check for preceding 3- or 4-byte lead byte
if(length>=2 && (b=*(limit-2))>=0xe0) {
limit-=2;
- if(asciiBytes[0x80]!=spanCondition) {
+ if(containsFFFD!=spanCondition) {
limit0=limit;
}
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
// 4-byte lead byte with only two trail bytes
limit-=3;
- if(asciiBytes[0x80]!=spanCondition) {
+ if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
} else {
// lead byte with no trail bytes
--limit;
- if(asciiBytes[0x80]!=spanCondition) {
+ if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
while(s<limit) {
b=*s;
- if(b<0xc0) {
- // ASCII; or trail bytes with the result of contains(FFFD).
+ if(U8_IS_SINGLE(b)) {
+ // ASCII
if(spanCondition) {
do {
- if(!asciiBytes[b]) {
+ if(!latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
- } while(b<0xc0);
+ } while(U8_IS_SINGLE(b));
} else {
do {
- if(asciiBytes[b]) {
+ if(latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
- } while(b<0xc0);
+ } while(U8_IS_SINGLE(b));
}
}
++s; // Advance past the lead byte.
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
if( ( (0x10000<=c && c<=0x10ffff) ?
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
- asciiBytes[0x80]
+ containsFFFD
) != spanCondition
) {
return s-1;
s+=3;
continue;
}
- } else /* 0xc0<=b<0xe0 */ {
+ } else {
if( /* handle U+0000..U+07FF inline */
+ b>=0xc0 &&
(t1=(uint8_t)(*s-0x80)) <= 0x3f
) {
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
// Give an illegal sequence the same value as the result of contains(FFFD).
// Handle each byte of an illegal sequence separately to simplify the code;
// no need to optimize error handling.
- if(asciiBytes[0x80]!=spanCondition) {
+ if(containsFFFD!=spanCondition) {
return s-1;
}
}
do {
b=s[--length];
- if((int8_t)b>=0) {
+ if(U8_IS_SINGLE(b)) {
// ASCII sub-span
if(spanCondition) {
do {
- if(!asciiBytes[b]) {
+ if(!latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
- } while((int8_t)b>=0);
+ } while(U8_IS_SINGLE(b));
} else {
do {
- if(asciiBytes[b]) {
+ if(latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
- } while((int8_t)b>=0);
+ } while(U8_IS_SINGLE(b));
}
}
int32_t prev=length;
UChar32 c;
- if(b<0xc0) {
- // trail byte: collect a multi-byte character
- c=utf8_prevCharSafeBody(s, 0, &length, b, -1);
- if(c<0) {
- c=0xfffd;
- }
- } else {
- // lead byte in last-trail position
- c=0xfffd;
- }
+ // trail byte: collect a multi-byte character
+ // (or lead byte in last-trail position)
+ c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
// c is a valid code point, not ASCII, not a surrogate
if(c<=0x7ff) {
if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {