2 *************************************************************************
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
5 *************************************************************************
6 *************************************************************************
7 * Copyright (C) 2014, International Business Machines
8 * Corporation and others. All Rights Reserved.
9 *************************************************************************
10 * file name: bitset.cpp
12 * tab size: 8 (not used)
15 * created on: 2007jan15
16 * created by: Markus Scherer
18 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
19 * using a folded bit set consisting of a 1k-entry index table and a
20 * compacted array of 64-bit words.
21 * Uses a simple hash table for compaction.
22 * Uses the original set for supplementary code points.
25 #include "unicode/utypes.h"
27 #include "cmemory.h" // for UPRV_LENGTHOF
30 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
31 * Hashes 64-bit words and maps them to 16-bit integers which are
32 * assigned in order of new incoming words for subsequent storage
33 * in a contiguous array.
35 struct BMPBitHash
: public UObject
{
36 int64_t keys
[0x800]; // 2k
37 uint16_t values
[0x800];
38 uint16_t reverse
[0x400];
40 const int32_t prime
=1301; // Less than 2k.
42 BMPBitHash() : count(0) {
43 // Fill values[] with 0xffff.
44 uprv_memset(values
, 0xff, sizeof(values
));
48 * Map a key to an integer count.
49 * Map at most 1k=0x400 different keys with this data structure.
51 uint16_t map(int64_t key
) {
52 int32_t hash
=(int32_t)(key
>>55)&0x1ff;
53 hash
^=(int32_t)(key
>>44)&0x7ff;
54 hash
^=(int32_t)(key
>>33)&0x7ff;
55 hash
^=(int32_t)(key
>>22)&0x7ff;
56 hash
^=(int32_t)(key
>>11)&0x7ff;
57 hash
^=(int32_t)key
&0x7ff;
59 if(values
[hash
]==0xffff) {
63 return values
[hash
]=count
++;
64 } else if(keys
[hash
]==key
) {
65 // Found a slot with this key.
68 // Used slot with a different key, move to another slot.
69 hash
=(hash
+prime
)&0x7ff;
74 uint16_t countKeys() const { return count
; }
77 * Invert the hash map: Fill an array of length countKeys() with the keys
78 * indexed by their mapped values.
80 void invert(int64_t *k
) const {
83 for(i
=0; i
<count
; ++i
) {
84 k
[i
]=keys
[reverse
[i
]];
89 class BitSet
: public UObject
, public UnicodeContainable
{
91 BitSet(const UnicodeSet
&set
, UErrorCode
&errorCode
) : bits(shortBits
), restSet(set
.clone()) {
92 if(U_FAILURE(errorCode
)) {
95 BMPBitHash
*bitHash
=new BMPBitHash
;
96 if(bitHash
==NULL
|| restSet
==NULL
) {
97 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
101 UnicodeSetIterator
iter(set
);
104 int32_t prevIndex
, i
, j
;
106 b
=0; // Not necessary but makes compilers happy.
109 if(iter
.nextRange() && !iter
.isString()) {
110 start
=iter
.getCodepoint();
111 end
=iter
.getCodepointEnd();
117 // Finish the end of the previous range.
121 index
[prevIndex
++]=bitHash
->map(b
);
123 // Fill all-zero entries between ranges.
125 uint16_t zero
=bitHash
->map(0);
127 index
[prevIndex
++]=zero
;
128 } while(prevIndex
<i
);
135 b
|=~((INT64_C(1)<<(start
&0x3f))-1);
138 // Set bits for the start of the range.
139 index
[i
++]=bitHash
->map(b
);
140 // Fill all-one entries inside the range.
142 uint16_t all
=bitHash
->map(INT64_C(0xffffffffffffffff));
147 b
=INT64_C(0xffffffffffffffff);
150 b
&=(INT64_C(1)<<(end
&0x3f))-1;
154 if(bitHash
->countKeys()>UPRV_LENGTHOF(shortBits
)) {
155 bits
=(int64_t *)uprv_malloc(bitHash
->countKeys()*8);
158 bitHash
->invert(bits
);
161 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
165 latin1Set
[0]=(uint32_t)bits
[0];
166 latin1Set
[1]=(uint32_t)(bits
[0]>>32);
167 latin1Set
[2]=(uint32_t)bits
[1];
168 latin1Set
[3]=(uint32_t)(bits
[1]>>32);
169 latin1Set
[4]=(uint32_t)bits
[2];
170 latin1Set
[5]=(uint32_t)(bits
[2]>>32);
171 latin1Set
[6]=(uint32_t)bits
[3];
172 latin1Set
[7]=(uint32_t)(bits
[3]>>32);
174 restSet
.remove(0, 0xffff);
178 if(bits
!=shortBits
) {
184 UBool
contains(UChar32 c
) const {
185 if((uint32_t)c
<=0xff) {
186 return (UBool
)((latin1Set
[c
>>5]&((uint32_t)1<<(c
&0x1f)))!=0);
187 } else if((uint32_t)c
<0xffff) {
188 return (UBool
)((bits
[c
>>6]&(INT64_C(1)<<(c
&0x3f)))!=0);
190 return restSet
->contains(c
);
195 uint16_t index
[0x400];
196 int64_t shortBits
[32];
199 uint32_t latin1Bits
[8];