]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/perf/unisetperf/draft/bitset.cpp
ICU-59117.0.1.tar.gz
[apple/icu.git] / icuSources / test / perf / unisetperf / draft / bitset.cpp
1 /*
2 *************************************************************************
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
5 *************************************************************************
6 *************************************************************************
7 * Copyright (C) 2014, International Business Machines
8 * Corporation and others. All Rights Reserved.
9 *************************************************************************
10 * file name: bitset.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2007jan15
16 * created by: Markus Scherer
17 *
18 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
19 * using a folded bit set consisting of a 1k-entry index table and a
20 * compacted array of 64-bit words.
21 * Uses a simple hash table for compaction.
22 * Uses the original set for supplementary code points.
23 */
24
25 #include "unicode/utypes.h"
26 #include "unicont.h"
27 #include "cmemory.h" // for UPRV_LENGTHOF
28
29 /*
30 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
31 * Hashes 64-bit words and maps them to 16-bit integers which are
32 * assigned in order of new incoming words for subsequent storage
33 * in a contiguous array.
34 */
35 struct BMPBitHash : public UObject {
36 int64_t keys[0x800]; // 2k
37 uint16_t values[0x800];
38 uint16_t reverse[0x400];
39 uint16_t count;
40 const int32_t prime=1301; // Less than 2k.
41
42 BMPBitHash() : count(0) {
43 // Fill values[] with 0xffff.
44 uprv_memset(values, 0xff, sizeof(values));
45 }
46
47 /*
48 * Map a key to an integer count.
49 * Map at most 1k=0x400 different keys with this data structure.
50 */
51 uint16_t map(int64_t key) {
52 int32_t hash=(int32_t)(key>>55)&0x1ff;
53 hash^=(int32_t)(key>>44)&0x7ff;
54 hash^=(int32_t)(key>>33)&0x7ff;
55 hash^=(int32_t)(key>>22)&0x7ff;
56 hash^=(int32_t)(key>>11)&0x7ff;
57 hash^=(int32_t)key&0x7ff;
58 for(;;) {
59 if(values[hash]==0xffff) {
60 // Unused slot.
61 keys[hash]=key;
62 reverse[count]=hash;
63 return values[hash]=count++;
64 } else if(keys[hash]==key) {
65 // Found a slot with this key.
66 return values[hash];
67 } else {
68 // Used slot with a different key, move to another slot.
69 hash=(hash+prime)&0x7ff;
70 }
71 }
72 }
73
74 uint16_t countKeys() const { return count; }
75
76 /*
77 * Invert the hash map: Fill an array of length countKeys() with the keys
78 * indexed by their mapped values.
79 */
80 void invert(int64_t *k) const {
81 uint16_t i;
82
83 for(i=0; i<count; ++i) {
84 k[i]=keys[reverse[i]];
85 }
86 }
87 };
88
89 class BitSet : public UObject, public UnicodeContainable {
90 public:
91 BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
92 if(U_FAILURE(errorCode)) {
93 return;
94 }
95 BMPBitHash *bitHash=new BMPBitHash;
96 if(bitHash==NULL || restSet==NULL) {
97 errorCode=U_MEMORY_ALLOCATION_ERROR;
98 return;
99 }
100
101 UnicodeSetIterator iter(set);
102 int64_t b;
103 UChar32 start, end;
104 int32_t prevIndex, i, j;
105
106 b=0; // Not necessary but makes compilers happy.
107 prevIndex=-1;
108 for(;;) {
109 if(iter.nextRange() && !iter.isString()) {
110 start=iter.getCodepoint();
111 end=iter.getCodepointEnd();
112 } else {
113 start=0x10000;
114 }
115 i=start>>6;
116 if(prevIndex!=i) {
117 // Finish the end of the previous range.
118 if(prevIndex<0) {
119 prevIndex=0;
120 } else {
121 index[prevIndex++]=bitHash->map(b);
122 }
123 // Fill all-zero entries between ranges.
124 if(prevIndex<i) {
125 uint16_t zero=bitHash->map(0);
126 do {
127 index[prevIndex++]=zero;
128 } while(prevIndex<i);
129 }
130 b=0;
131 }
132 if(start>0xffff) {
133 break;
134 }
135 b|=~((INT64_C(1)<<(start&0x3f))-1);
136 j=end>>6;
137 if(i<j) {
138 // Set bits for the start of the range.
139 index[i++]=bitHash->map(b);
140 // Fill all-one entries inside the range.
141 if(i<j) {
142 uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
143 do {
144 index[i++]=all;
145 } while(i<j);
146 }
147 b=INT64_C(0xffffffffffffffff);
148 }
149 /* i==j */
150 b&=(INT64_C(1)<<(end&0x3f))-1;
151 prevIndex=j;
152 }
153
154 if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
155 bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
156 }
157 if(bits!=NULL) {
158 bitHash->invert(bits);
159 } else {
160 bits=shortBits;
161 errorCode=U_MEMORY_ALLOCATION_ERROR;
162 return;
163 }
164
165 latin1Set[0]=(uint32_t)bits[0];
166 latin1Set[1]=(uint32_t)(bits[0]>>32);
167 latin1Set[2]=(uint32_t)bits[1];
168 latin1Set[3]=(uint32_t)(bits[1]>>32);
169 latin1Set[4]=(uint32_t)bits[2];
170 latin1Set[5]=(uint32_t)(bits[2]>>32);
171 latin1Set[6]=(uint32_t)bits[3];
172 latin1Set[7]=(uint32_t)(bits[3]>>32);
173
174 restSet.remove(0, 0xffff);
175 }
176
177 ~BitSet() {
178 if(bits!=shortBits) {
179 uprv_free(bits);
180 }
181 delete restSet;
182 }
183
184 UBool contains(UChar32 c) const {
185 if((uint32_t)c<=0xff) {
186 return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
187 } else if((uint32_t)c<0xffff) {
188 return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
189 } else {
190 return restSet->contains(c);
191 }
192 }
193
194 private:
195 uint16_t index[0x400];
196 int64_t shortBits[32];
197 int64_t *bits;
198
199 uint32_t latin1Bits[8];
200
201 UnicodeSet *restSet;
202 };