1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2007-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
10 * file name: bmpset.cpp
12 * tab size: 8 (not used)
15 * created on: 2007jan29
16 * created by: Markus W. Scherer
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/utf8.h"
22 #include "unicode/utf16.h"
// Primary constructor: initializes the members list and listLength from the
// arguments, zero-fills latin1Contains, table7FF and bmpBlockBits, fills
// list4kStarts[] with findCodePoint() results, and caches the result of
// containsSlow(0xfffd, ...) in containsFFFD.
// NOTE(review): the embedded original line numbers are not consecutive
// (for example 42, 44, 45, 47), so some lines of the body, including its
// end, are absent from this listing.
29 BMPSet::BMPSet(const int32_t *parentList
, int32_t parentListLength
) :
30 list(parentList
), listLength(parentListLength
) {
// Clear the three bit tables before anything is set in them.
31 uprv_memset(latin1Contains
, 0, sizeof(latin1Contains
));
32 uprv_memset(table7FF
, 0, sizeof(table7FF
));
33 uprv_memset(bmpBlockBits
, 0, sizeof(bmpBlockBits
));
36 * Set the list indexes for binary searches for
37 * U+0800, U+1000, U+2000, .., U+F000, U+10000.
38 * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
39 * looked up in the bit tables.
40 * The last pair of indexes is for finding supplementary code points.
// Entry 0: list index found for U+0800, searching list indexes 0..listLength-1.
42 list4kStarts
[0]=findCodePoint(0x800, 0, listLength
-1);
// Entries 1..0x10: list index found for i<<12; each search starts at the
// index stored for the previous entry.
44 for(i
=1; i
<=0x10; ++i
) {
45 list4kStarts
[i
]=findCodePoint(i
<<12, list4kStarts
[i
-1], listLength
-1);
// Entry 0x11 is set directly to the last list index.
47 list4kStarts
[0x11]=listLength
-1;
// U+FFFD is looked up in the list slice bounded by entries 0xf and 0x10.
48 containsFFFD
=containsSlow(0xfffd, list4kStarts
[0xf], list4kStarts
[0x10]);
// Constructs a BMPSet from another BMPSet plus a new parent list.
// containsFFFD and the four tables (latin1Contains, table7FF, bmpBlockBits,
// list4kStarts) are copied from otherBMPSet; list and listLength are taken
// from newParentList and newParentListLength. Nothing is recomputed in the
// visible code.
// NOTE(review): the closing brace of the body is not part of this listing.
54 BMPSet::BMPSet(const BMPSet
&otherBMPSet
, const int32_t *newParentList
, int32_t newParentListLength
) :
55 containsFFFD(otherBMPSet
.containsFFFD
),
56 list(newParentList
), listLength(newParentListLength
) {
// Each copy is sized by the destination member array.
57 uprv_memcpy(latin1Contains
, otherBMPSet
.latin1Contains
, sizeof(latin1Contains
));
58 uprv_memcpy(table7FF
, otherBMPSet
.table7FF
, sizeof(table7FF
));
59 uprv_memcpy(bmpBlockBits
, otherBMPSet
.bmpBlockBits
, sizeof(bmpBlockBits
));
60 uprv_memcpy(list4kStarts
, otherBMPSet
.list4kStarts
, sizeof(list4kStarts
));
67 * Set bits in a bit rectangle in "vertical" bit organization.
// Sets the bits for the range [start, limit) in a 64-entry table of 32-bit
// words. limit is exclusive: start+1==limit is treated as a single
// character. Each value is split into lead=value>>6, used as a bit number,
// and trail=value&0x3f, used as the loop index over the 64 entries.
// Asserted preconditions: start<limit and limit<=0x800.
// NOTE(review): the statements that store into table[] are not part of this
// listing (the embedded line numbers skip them); presumably they are of the
// form table[trail]|=bits - confirm against the complete file.
70 static void set32x64Bits(uint32_t table
[64], int32_t start
, int32_t limit
) {
71 U_ASSERT(start
<limit
);
72 U_ASSERT(limit
<=0x800);
74 int32_t lead
=start
>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
75 int32_t trail
=start
&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
77 // Set one bit indicating an all-one block.
78 uint32_t bits
=(uint32_t)1<<lead
;
79 if((start
+1)==limit
) { // Single-character shortcut.
// Split the exclusive end the same way as start.
84 int32_t limitLead
=limit
>>6;
85 int32_t limitTrail
=limit
&0x3f;
88 // Partial vertical bit column.
89 while(trail
<limitTrail
) {
93 // Partial vertical bit column,
94 // followed by a bit rectangle,
95 // followed by another partial vertical bit column.
// Mask with every bit from position lead upward set.
103 bits
=~(((unsigned)1<<lead
)-1);
// Remove the bits at position limitLead and above.
105 bits
&=((unsigned)1<<limitLead
)-1;
107 for(trail
=0; trail
<64; ++trail
) {
111 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
112 // In that case, bits=1<<limitLead is undefined but the bits value
113 // is not used because trail<limitTrail is already false.
// The shift count is clamped to 31 when limitLead is 0x20.
114 bits
=(uint32_t)1<<((limitLead
== 0x20) ? (limitLead
- 1) : limitLead
);
115 for(trail
=0; trail
<limitTrail
; ++trail
) {
// Fills the lookup tables from list[], reading start and limit values in
// pairs via list[listIndex++]; limit is only read from the list while
// listIndex<listLength. Three phases are visible:
//   1. latin1Contains[] gets 1 for each code point of a range below 0x100;
//   2. table7FF gets ranges through set32x64Bits(), with limit capped at 0x800;
//   3. bmpBlockBits[] is filled while start<0x10000.
// NOTE(review): the declaration of listIndex, several loop headers, the
// branches taken when the list is exhausted and other statements are missing
// from this listing (the embedded line numbers skip them).
121 void BMPSet::initBits() {
122 UChar32 start
, limit
;
125 // Set latin1Contains[].
// Read the next start value, and the matching limit if the list has one.
127 start
=list
[listIndex
++];
128 if(listIndex
<listLength
) {
129 limit
=list
[listIndex
++];
// Mark code points one at a time, stopping at limit or at 0x100.
137 latin1Contains
[start
++]=1;
138 } while(start
<limit
&& start
<0x100);
139 } while(limit
<=0x100);
141 // Find the first range overlapping with (or after) 80..FF again,
142 // to include them in table7FF as well.
144 start
=list
[listIndex
++];
145 if(listIndex
<listLength
) {
146 limit
=list
[listIndex
++];
// The limit passed on is capped at 0x800, the maximum that set32x64Bits() asserts.
160 set32x64Bits(table7FF
, start
, limit
<=0x800 ? limit
: 0x800);
166 start
=list
[listIndex
++];
167 if(listIndex
<listLength
) {
168 limit
=list
[listIndex
++];
174 // Set bmpBlockBits[].
175 int32_t minStart
=0x800;
176 while(start
<0x10000) {
184 if(start
<limit
) { // Else: Another range entirely in a known mixed-value block.
186 // Mixed-value block of 64 code points.
// 0x10001 shifted left by k sets bit k and bit k+16 of the word.
188 bmpBlockBits
[start
&0x3f]|=0x10001<<(start
>>6);
189 start
=(start
+1)<<6; // Round up to the next block boundary.
190 minStart
=start
; // Ignore further ranges in this block.
193 if(start
<(limit
&~0x3f)) {
194 // Multiple all-ones blocks of 64 code points each.
// The bounds are passed in units of 64-code-point blocks (shifted right by 6).
195 set32x64Bits(bmpBlockBits
, start
>>6, limit
>>6);
199 // Mixed-value block of 64 code points.
201 bmpBlockBits
[limit
&0x3f]|=0x10001<<(limit
>>6);
202 limit
=(limit
+1)<<6; // Round up to the next block boundary.
203 minStart
=limit
; // Ignore further ranges in this block.
// Advance to the next pair from the list.
212 start
=list
[listIndex
++];
213 if(listIndex
<listLength
) {
214 limit
=list
[listIndex
++];
222 * Override some bits and bytes to the result of contains(FFFD)
223 * for faster validity checking at runtime.
224 * No need to set 0 values where they were reset to 0 in the constructor
225 * and not modified by initBits().
226 * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
227 * Need to set 0 values for surrogates D800..DFFF.
// Overrides table bits at the lead-byte positions named in the inline
// comments (0xC0 and 0xC1, 0xE0, 0xED).
// Two variants of the 0xED handling are visible for bmpBlockBits[32..63]:
// one clears the two mask bits and then ORs in bits (original lines 246-247),
// the other only clears them (original lines 251-252).
// NOTE(review): the declarations of bits, mask and i, the body of the first
// loop, and the condition that selects between the two variants are missing
// from this listing.
229 void BMPSet::overrideIllegal() {
234 bits
=3; // Lead bytes 0xC0 and 0xC1.
235 for(i
=0; i
<64; ++i
) {
239 bits
=1; // Lead byte 0xE0.
240 for(i
=0; i
<32; ++i
) { // First half of 4k block.
241 bmpBlockBits
[i
]|=bits
;
// The mask clears bit 0xd and bit 0xd+16 of a word.
244 mask
=~(0x10001<<0xd); // Lead byte 0xED.
246 for(i
=32; i
<64; ++i
) { // Second half of 4k block.
247 bmpBlockBits
[i
]=(bmpBlockBits
[i
]&mask
)|bits
;
250 mask
=~(0x10001<<0xd); // Lead byte 0xED.
251 for(i
=32; i
<64; ++i
) { // Second half of 4k block.
252 bmpBlockBits
[i
]&=mask
;
// Searches list[] between the indexes lo and hi for code point c. Per the
// comment inside the body, the result is the smallest i such that c<list[i].
// The visible code first tests lo>=hi and c>=list[hi-1], then probes the
// midpoint (lo+hi)>>1 and compares c with list[i].
// NOTE(review): the return statements, the loop header and the first branch
// of the comparison are missing from this listing.
257 int32_t BMPSet::findCodePoint(UChar32 c
, int32_t lo
, int32_t hi
) const {
260 set list[] c=0 1 3 4 7 8
261 === ============== ===========
262 [] [110000] 0 0 0 0 0 0
263 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
264 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
265 [:Any:] [0, 110000] 1 1 1 1 1 1
268 // Return the smallest i such that c < list[i]. Assume
269 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
272 // High runner test. c is often after the last range, so an
273 // initial check for this condition pays off.
274 if (lo
>= hi
|| c
>= list
[hi
-1])
276 // invariant: c >= list[lo]
277 // invariant: c < list[hi]
// Probe the midpoint of the current lo..hi interval.
279 int32_t i
= (lo
+ hi
) >> 1;
282 } else if (c
< list
[i
]) {
// Tests whether code point c is in the set, choosing the lookup by range:
//   c<=0xff   -> latin1Contains[c]
//   c<=0x7ff  -> bit (c>>6) of table7FF[c&0x3f]
//   other values up to 0xffff outside d800..dfff -> two bits taken from
//                bmpBlockBits[(c>>6)&0x3f]; the visible branches either
//                return them directly or fall back to containsSlow() on the
//                list slice list4kStarts[lead]..list4kStarts[lead+1]
//   remaining c<=0x10ffff (surrogate or supplementary) -> containsSlow()
//                between list4kStarts[0xd] and list4kStarts[0x11]
// Per the trailing comment, out-of-range values yield FALSE.
// The casts to uint32_t make negative c compare as a large value.
// NOTE(review): the return-type line, the definition of lead, the condition
// tested on twoBits and the final return are missing from this listing.
292 BMPSet::contains(UChar32 c
) const {
293 if((uint32_t)c
<=0xff) {
294 return (UBool
)latin1Contains
[c
];
295 } else if((uint32_t)c
<=0x7ff) {
296 return (UBool
)((table7FF
[c
&0x3f]&((uint32_t)1<<(c
>>6)))!=0);
297 } else if((uint32_t)c
<0xd800 || (c
>=0xe000 && c
<=0xffff)) {
// Extract bit lead and bit lead+16 of the word for this 64-code-point block.
299 uint32_t twoBits
=(bmpBlockBits
[(c
>>6)&0x3f]>>lead
)&0x10001;
301 // All 64 code points with the same bits 15..6
302 // are either in the set or not.
303 return (UBool
)twoBits
;
305 // Look up the code point in its 4k block of code points.
306 return containsSlow(c
, list4kStarts
[lead
], list4kStarts
[lead
+1]);
308 } else if((uint32_t)c
<=0x10ffff) {
309 // surrogate or supplementary code point
310 return containsSlow(c
, list4kStarts
[0xd], list4kStarts
[0x11]);
312 // Out-of-range code points get FALSE, consistent with long-standing
313 // behavior of UnicodeSet::contains(c).
319 * Check for sufficient length for trail unit for each surrogate pair.
320 * Handle single surrogates as surrogate code points as usual in ICU.
// Examines UTF-16 code units between s and limit, classifying each unit c
// with these lookups: latin1Contains[c], a bit of table7FF for c<=0x7ff,
// two bits of bmpBlockBits (with a containsSlow() fallback on the list slice
// list4kStarts[lead]..list4kStarts[lead+1]) for c outside d800..dfff, and a
// surrogate tier.
// Surrogate tier: c is handled as a single surrogate code point when
// c>=0xdc00, when s+1==limit, or when the next unit c2=s[1] is outside
// dc00..dfff; it is then looked up between list4kStarts[0xd] and
// list4kStarts[0xe]. Otherwise c and c2 are combined with
// U16_GET_SUPPLEMENTARY(c, c2) and looked up between list4kStarts[0x10] and
// list4kStarts[0x11].
// Two parallel copies of this chain are visible: the first reacts when a
// unit is NOT in the set (original lines 331-360), the second when it IS in
// the set (original lines 371-400).
// NOTE(review): the return type, the test that picks one of the two copies
// (presumably on spanCondition), the loop headers, the definitions of c, c2
// and lead, the actions taken in each branch and the return statement are
// missing from this listing.
323 BMPSet::span(const UChar
*s
, const UChar
*limit
, USetSpanCondition spanCondition
) const {
331 if(!latin1Contains
[c
]) {
334 } else if(c
<=0x7ff) {
335 if((table7FF
[c
&0x3f]&((uint32_t)1<<(c
>>6)))==0) {
338 } else if(c
<0xd800 || c
>=0xe000) {
340 uint32_t twoBits
=(bmpBlockBits
[(c
>>6)&0x3f]>>lead
)&0x10001;
342 // All 64 code points with the same bits 15..6
343 // are either in the set or not.
348 // Look up the code point in its 4k block of code points.
349 if(!containsSlow(c
, list4kStarts
[lead
], list4kStarts
[lead
+1])) {
// The s+1==limit test comes before s[1] is read.
353 } else if(c
>=0xdc00 || (s
+1)==limit
|| (c2
=s
[1])<0xdc00 || c2
>=0xe000) {
354 // surrogate code point
355 if(!containsSlow(c
, list4kStarts
[0xd], list4kStarts
[0xe])) {
360 if(!containsSlow(U16_GET_SUPPLEMENTARY(c
, c2
), list4kStarts
[0x10], list4kStarts
[0x11])) {
// Second copy of the chain: same lookups with the tests inverted.
371 if(latin1Contains
[c
]) {
374 } else if(c
<=0x7ff) {
375 if((table7FF
[c
&0x3f]&((uint32_t)1<<(c
>>6)))!=0) {
378 } else if(c
<0xd800 || c
>=0xe000) {
380 uint32_t twoBits
=(bmpBlockBits
[(c
>>6)&0x3f]>>lead
)&0x10001;
382 // All 64 code points with the same bits 15..6
383 // are either in the set or not.
388 // Look up the code point in its 4k block of code points.
389 if(containsSlow(c
, list4kStarts
[lead
], list4kStarts
[lead
+1])) {
393 } else if(c
>=0xdc00 || (s
+1)==limit
|| (c2
=s
[1])<0xdc00 || c2
>=0xe000) {
394 // surrogate code point
395 if(containsSlow(c
, list4kStarts
[0xd], list4kStarts
[0xe])) {
400 if(containsSlow(U16_GET_SUPPLEMENTARY(c
, c2
), list4kStarts
[0x10], list4kStarts
[0x11])) {
410 /* Symmetrical with span(). */
// Backward counterpart of span(): classifies UTF-16 code units between s and
// limit with the same lookups (latin1Contains[c], a bit of table7FF for
// c<=0x7ff, two bits of bmpBlockBits with a containsSlow() fallback for c
// outside d800..dfff, and a surrogate tier).
// Surrogate tier: c is handled as a single surrogate code point when
// c<0xdc00, when s==limit, or when the preceding unit c2=*(limit-1) is
// outside d800..dbff; it is then looked up between list4kStarts[0xd] and
// list4kStarts[0xe]. Otherwise the pair is combined with
// U16_GET_SUPPLEMENTARY(c2, c), with c2 as the first argument, and looked up
// between list4kStarts[0x10] and list4kStarts[0x11].
// Two parallel copies of this chain are visible: the first reacts when a
// unit is NOT in the set (original lines 420-449), the second when it IS in
// the set (original lines 463-492).
// NOTE(review): the return type, the test that picks one of the two copies
// (presumably on spanCondition), the loop headers, the definitions of c, c2
// and lead, the actions taken in each branch and the return statement are
// missing from this listing.
412 BMPSet::spanBack(const UChar
*s
, const UChar
*limit
, USetSpanCondition spanCondition
) const {
420 if(!latin1Contains
[c
]) {
423 } else if(c
<=0x7ff) {
424 if((table7FF
[c
&0x3f]&((uint32_t)1<<(c
>>6)))==0) {
427 } else if(c
<0xd800 || c
>=0xe000) {
429 uint32_t twoBits
=(bmpBlockBits
[(c
>>6)&0x3f]>>lead
)&0x10001;
431 // All 64 code points with the same bits 15..6
432 // are either in the set or not.
437 // Look up the code point in its 4k block of code points.
438 if(!containsSlow(c
, list4kStarts
[lead
], list4kStarts
[lead
+1])) {
// The s==limit test comes before *(limit-1) is read.
442 } else if(c
<0xdc00 || s
==limit
|| (c2
=*(limit
-1))<0xd800 || c2
>=0xdc00) {
443 // surrogate code point
444 if(!containsSlow(c
, list4kStarts
[0xd], list4kStarts
[0xe])) {
449 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2
, c
), list4kStarts
[0x10], list4kStarts
[0x11])) {
// Second copy of the chain: same lookups with the tests inverted.
463 if(latin1Contains
[c
]) {
466 } else if(c
<=0x7ff) {
467 if((table7FF
[c
&0x3f]&((uint32_t)1<<(c
>>6)))!=0) {
470 } else if(c
<0xd800 || c
>=0xe000) {
472 uint32_t twoBits
=(bmpBlockBits
[(c
>>6)&0x3f]>>lead
)&0x10001;
474 // All 64 code points with the same bits 15..6
475 // are either in the set or not.
480 // Look up the code point in its 4k block of code points.
481 if(containsSlow(c
, list4kStarts
[lead
], list4kStarts
[lead
+1])) {
485 } else if(c
<0xdc00 || s
==limit
|| (c2
=*(limit
-1))<0xd800 || c2
>=0xdc00) {
486 // surrogate code point
487 if(containsSlow(c
, list4kStarts
[0xd], list4kStarts
[0xe])) {
492 if(containsSlow(U16_GET_SUPPLEMENTARY(c2
, c
), list4kStarts
[0x10], list4kStarts
[0x11])) {
506 * Precheck for sufficient trail bytes at end of string only once per span.
// UTF-8 counterpart of span(), working on the length bytes starting at s.
// Visible stages:
//   1. An initial run of single-byte units (U8_IS_SINGLE) is checked against
//      latin1Contains[], in two variants (test negated / test not negated);
//      afterwards length is recomputed as limit-s.
//   2. spanCondition is set to USET_SPAN_CONTAINED unless it equals
//      USET_SPAN_NOT_CONTAINED, so that it can be compared with 0/1 results.
//   3. The end of the input is pre-checked for a truncated multi-byte
//      sequence; containsFFFD is compared with spanCondition for it, and
//      limit0 keeps the original limit.
//   4. In the main part, single-byte runs use latin1Contains[]; sequences
//      with two valid trail bytes use bmpBlockBits[t1] or containsSlow() on
//      the list slice list4kStarts[b]..list4kStarts[b+1]; sequences with three valid
//      trail bytes use containsSlow() between list4kStarts[0x10] and
//      list4kStarts[0x11] when the decoded value is in 0x10000..0x10ffff;
//      sequences with one valid trail byte use table7FF[t1]; any other
//      byte is judged by comparing containsFFFD with spanCondition.
// A trail byte is accepted when (uint8_t)(byte-0x80)<=0x3f.
// NOTE(review): the return type, the declarations of b, t1, t2 and t3, the
// lead-byte range tests, the adjustment of b before it is used as an index,
// loop headers, branch bodies and return statements are missing from this
// listing.
510 BMPSet::spanUTF8(const uint8_t *s
, int32_t length
, USetSpanCondition spanCondition
) const {
511 const uint8_t *limit
=s
+length
;
513 if(U8_IS_SINGLE(b
)) {
514 // Initial all-ASCII span.
517 if(!latin1Contains
[b
] || ++s
==limit
) {
521 } while(U8_IS_SINGLE(b
));
524 if(latin1Contains
[b
] || ++s
==limit
) {
528 } while(U8_IS_SINGLE(b
));
// Remaining byte count after the initial single-byte run.
530 length
=(int32_t)(limit
-s
);
533 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
534 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
// Keep the original end; the pre-check that follows works with limit and limit0.
537 const uint8_t *limit0
=limit
;
540 * Make sure that the last 1/2/3/4-byte sequence before limit is complete
541 * or runs into a lead byte.
542 * In the span loop compare s with limit only once
543 * per multi-byte character.
545 * Give a trailing illegal sequence the same value as the result of contains(FFFD),
546 * including it if that is part of the span, otherwise set limit0 to before
547 * the truncated sequence.
551 // b>=0x80: lead or trail byte
553 // single trail byte, check for preceding 3- or 4-byte lead byte
554 if(length
>=2 && (b
=*(limit
-2))>=0xe0) {
556 if(containsFFFD
!=spanCondition
) {
559 } else if(b
<0xc0 && b
>=0x80 && length
>=3 && (b
=*(limit
-3))>=0xf0) {
560 // 4-byte lead byte with only two trail bytes
562 if(containsFFFD
!=spanCondition
) {
567 // lead byte with no trail bytes
569 if(containsFFFD
!=spanCondition
) {
579 if(U8_IS_SINGLE(b
)) {
583 if(!latin1Contains
[b
]) {
585 } else if(++s
==limit
) {
589 } while(U8_IS_SINGLE(b
));
592 if(latin1Contains
[b
]) {
594 } else if(++s
==limit
) {
598 } while(U8_IS_SINGLE(b
));
601 ++s
; // Advance past the lead byte.
604 if( /* handle U+0000..U+FFFF inline */
605 (t1
=(uint8_t)(s
[0]-0x80)) <= 0x3f &&
606 (t2
=(uint8_t)(s
[1]-0x80)) <= 0x3f
// Extract bit b and bit b+16 of the word selected by the first trail byte.
609 uint32_t twoBits
=(bmpBlockBits
[t1
]>>b
)&0x10001;
611 // All 64 code points with this lead byte and middle trail byte
612 // are either in the set or not.
613 if(twoBits
!=(uint32_t)spanCondition
) {
617 // Look up the code point in its 4k block of code points.
618 UChar32 c
=(b
<<12)|(t1
<<6)|t2
;
619 if(containsSlow(c
, list4kStarts
[b
], list4kStarts
[b
+1]) != spanCondition
) {
626 } else if( /* handle U+10000..U+10FFFF inline */
627 (t1
=(uint8_t)(s
[0]-0x80)) <= 0x3f &&
628 (t2
=(uint8_t)(s
[1]-0x80)) <= 0x3f &&
629 (t3
=(uint8_t)(s
[2]-0x80)) <= 0x3f
631 // Give an illegal sequence the same value as the result of contains(FFFD).
632 UChar32 c
=((UChar32
)(b
-0xf0)<<18)|((UChar32
)t1
<<12)|(t2
<<6)|t3
;
633 if( ( (0x10000<=c
&& c
<=0x10ffff) ?
634 containsSlow(c
, list4kStarts
[0x10], list4kStarts
[0x11]) :
644 if( /* handle U+0000..U+07FF inline */
646 (t1
=(uint8_t)(*s
-0x80)) <= 0x3f
// The bit number is the low 5 bits of the lead byte.
648 if((USetSpanCondition
)((table7FF
[t1
]&((uint32_t)1<<(b
&0x1f)))!=0) != spanCondition
) {
656 // Give an illegal sequence the same value as the result of contains(FFFD).
657 // Handle each byte of an illegal sequence separately to simplify the code;
658 // no need to optimize error handling.
659 if(containsFFFD
!=spanCondition
) {
668 * While going backwards through UTF-8 optimize only for ASCII.
669 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
670 * possible to tell from the last byte in a multi-byte sequence how many
671 * preceding bytes there should be. Therefore, going backwards through UTF-8
672 * is much harder than going forward.
// UTF-8 counterpart of spanBack(), working on the length bytes starting at s.
// Visible stages:
//   1. spanCondition is set to USET_SPAN_CONTAINED unless it equals
//      USET_SPAN_NOT_CONTAINED, so that it can be compared with 0/1 results.
//   2. Single-byte units (U8_IS_SINGLE) are checked against latin1Contains[],
//      in two variants (test negated / test not negated), with a length==0
//      test in each.
//   3. Any other byte is decoded through
//      utf8_prevCharSafeBody(s, 0, &length, b, -3); the resulting c is then
//      looked up in table7FF, in bmpBlockBits (with a containsSlow()
//      fallback on the list slice list4kStarts[lead]..list4kStarts[lead+1]) when
//      c<=0xffff, or with containsSlow() between list4kStarts[0x10] and
//      list4kStarts[0x11].
// Each lookup result is compared with spanCondition.
// NOTE(review): the return type, the declarations of b, c and lead, the
// range test that guards the table7FF lookup, loop headers, branch bodies
// and return statements are missing from this listing.
675 BMPSet::spanBackUTF8(const uint8_t *s
, int32_t length
, USetSpanCondition spanCondition
) const {
676 if(spanCondition
!=USET_SPAN_NOT_CONTAINED
) {
677 spanCondition
=USET_SPAN_CONTAINED
; // Pin to 0/1 values.
684 if(U8_IS_SINGLE(b
)) {
688 if(!latin1Contains
[b
]) {
690 } else if(length
==0) {
694 } while(U8_IS_SINGLE(b
));
697 if(latin1Contains
[b
]) {
699 } else if(length
==0) {
703 } while(U8_IS_SINGLE(b
));
709 // trail byte: collect a multi-byte character
710 // (or lead byte in last-trail position)
// length is passed by address to the decoder.
711 c
=utf8_prevCharSafeBody(s
, 0, &length
, b
, -3);
712 // c is a valid code point, not ASCII, not a surrogate
714 if((USetSpanCondition
)((table7FF
[c
&0x3f]&((uint32_t)1<<(c
>>6)))!=0) != spanCondition
) {
717 } else if(c
<=0xffff) {
// Extract bit lead and bit lead+16 of the word for this 64-code-point block.
719 uint32_t twoBits
=(bmpBlockBits
[(c
>>6)&0x3f]>>lead
)&0x10001;
721 // All 64 code points with the same bits 15..6
722 // are either in the set or not.
723 if(twoBits
!=(uint32_t)spanCondition
) {
727 // Look up the code point in its 4k block of code points.
728 if(containsSlow(c
, list4kStarts
[lead
], list4kStarts
[lead
+1]) != spanCondition
) {
733 if(containsSlow(c
, list4kStarts
[0x10], list4kStarts
[0x11]) != spanCondition
) {