]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/perf/unisetperf/draft/bitset.cpp
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / test / perf / unisetperf / draft / bitset.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: bitset.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2007jan15
12 * created by: Markus Scherer
13 *
14 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
15 * using a folded bit set consisting of a 1k-entry index table and a
16 * compacted array of 64-bit words.
17 * Uses a simple hash table for compaction.
18 * Uses the original set for supplementary code points.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicont.h"
23 #include "cmemory.h" // for UPRV_LENGTHOF
24
25 /*
26 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
27 * Hashes 64-bit words and maps them to 16-bit integers which are
28 * assigned in order of new incoming words for subsequent storage
29 * in a contiguous array.
30 */
31 struct BMPBitHash : public UObject {
32 int64_t keys[0x800]; // 2k
33 uint16_t values[0x800];
34 uint16_t reverse[0x400];
35 uint16_t count;
36 const int32_t prime=1301; // Less than 2k.
37
38 BMPBitHash() : count(0) {
39 // Fill values[] with 0xffff.
40 uprv_memset(values, 0xff, sizeof(values));
41 }
42
43 /*
44 * Map a key to an integer count.
45 * Map at most 1k=0x400 different keys with this data structure.
46 */
47 uint16_t map(int64_t key) {
48 int32_t hash=(int32_t)(key>>55)&0x1ff;
49 hash^=(int32_t)(key>>44)&0x7ff;
50 hash^=(int32_t)(key>>33)&0x7ff;
51 hash^=(int32_t)(key>>22)&0x7ff;
52 hash^=(int32_t)(key>>11)&0x7ff;
53 hash^=(int32_t)key&0x7ff;
54 for(;;) {
55 if(values[hash]==0xffff) {
56 // Unused slot.
57 keys[hash]=key;
58 reverse[count]=hash;
59 return values[hash]=count++;
60 } else if(keys[hash]==key) {
61 // Found a slot with this key.
62 return values[hash];
63 } else {
64 // Used slot with a different key, move to another slot.
65 hash=(hash+prime)&0x7ff;
66 }
67 }
68 }
69
70 uint16_t countKeys() const { return count; }
71
72 /*
73 * Invert the hash map: Fill an array of length countKeys() with the keys
74 * indexed by their mapped values.
75 */
76 void invert(int64_t *k) const {
77 uint16_t i;
78
79 for(i=0; i<count; ++i) {
80 k[i]=keys[reverse[i]];
81 }
82 }
83 };
84
85 class BitSet : public UObject, public UnicodeContainable {
86 public:
87 BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
88 if(U_FAILURE(errorCode)) {
89 return;
90 }
91 BMPBitHash *bitHash=new BMPBitHash;
92 if(bitHash==NULL || restSet==NULL) {
93 errorCode=U_MEMORY_ALLOCATION_ERROR;
94 return;
95 }
96
97 UnicodeSetIterator iter(set);
98 int64_t b;
99 UChar32 start, end;
100 int32_t prevIndex, i, j;
101
102 b=0; // Not necessary but makes compilers happy.
103 prevIndex=-1;
104 for(;;) {
105 if(iter.nextRange() && !iter.isString()) {
106 start=iter.getCodepoint();
107 end=iter.getCodepointEnd();
108 } else {
109 start=0x10000;
110 }
111 i=start>>6;
112 if(prevIndex!=i) {
113 // Finish the end of the previous range.
114 if(prevIndex<0) {
115 prevIndex=0;
116 } else {
117 index[prevIndex++]=bitHash->map(b);
118 }
119 // Fill all-zero entries between ranges.
120 if(prevIndex<i) {
121 uint16_t zero=bitHash->map(0);
122 do {
123 index[prevIndex++]=zero;
124 } while(prevIndex<i);
125 }
126 b=0;
127 }
128 if(start>0xffff) {
129 break;
130 }
131 b|=~((INT64_C(1)<<(start&0x3f))-1);
132 j=end>>6;
133 if(i<j) {
134 // Set bits for the start of the range.
135 index[i++]=bitHash->map(b);
136 // Fill all-one entries inside the range.
137 if(i<j) {
138 uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
139 do {
140 index[i++]=all;
141 } while(i<j);
142 }
143 b=INT64_C(0xffffffffffffffff);
144 }
145 /* i==j */
146 b&=(INT64_C(1)<<(end&0x3f))-1;
147 prevIndex=j;
148 }
149
150 if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
151 bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
152 }
153 if(bits!=NULL) {
154 bitHash->invert(bits);
155 } else {
156 bits=shortBits;
157 errorCode=U_MEMORY_ALLOCATION_ERROR;
158 return;
159 }
160
161 latin1Set[0]=(uint32_t)bits[0];
162 latin1Set[1]=(uint32_t)(bits[0]>>32);
163 latin1Set[2]=(uint32_t)bits[1];
164 latin1Set[3]=(uint32_t)(bits[1]>>32);
165 latin1Set[4]=(uint32_t)bits[2];
166 latin1Set[5]=(uint32_t)(bits[2]>>32);
167 latin1Set[6]=(uint32_t)bits[3];
168 latin1Set[7]=(uint32_t)(bits[3]>>32);
169
170 restSet.remove(0, 0xffff);
171 }
172
173 ~BitSet() {
174 if(bits!=shortBits) {
175 uprv_free(bits);
176 }
177 delete restSet;
178 }
179
180 UBool contains(UChar32 c) const {
181 if((uint32_t)c<=0xff) {
182 return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
183 } else if((uint32_t)c<0xffff) {
184 return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
185 } else {
186 return restSet->contains(c);
187 }
188 }
189
190 private:
191 uint16_t index[0x400];
192 int64_t shortBits[32];
193 int64_t *bits;
194
195 uint32_t latin1Bits[8];
196
197 UnicodeSet *restSet;
198 };