X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/fd0068a84e9996f225edba706498f6ed413d0673..46f4442e9a5a4f3b98b7c1083586332f6a8a99a4:/icuSources/test/perf/unisetperf/draft/bitset.cpp diff --git a/icuSources/test/perf/unisetperf/draft/bitset.cpp b/icuSources/test/perf/unisetperf/draft/bitset.cpp new file mode 100644 index 00000000..487d0c09 --- /dev/null +++ b/icuSources/test/perf/unisetperf/draft/bitset.cpp @@ -0,0 +1,197 @@ +/* +********************************************************************** +* Copyright (C) 2007, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* file name: bitset.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2007jan15 +* created by: Markus Scherer +* +* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet +* using a folded bit set consisting of a 1k-entry index table and a +* compacted array of 64-bit words. +* Uses a simple hash table for compaction. +* Uses the original set for supplementary code points. +*/ + +#include "unicode/utypes.h" +#include "unicont.h" + +/* + * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. + * Hashes 64-bit words and maps them to 16-bit integers which are + * assigned in order of new incoming words for subsequent storage + * in a contiguous array. + */ +struct BMPBitHash : public UObject { + int64_t keys[0x800]; // 2k + uint16_t values[0x800]; + uint16_t reverse[0x400]; + uint16_t count; + const int32_t prime=1301; // Less than 2k. + + BMPBitHash() : count(0) { + // Fill values[] with 0xffff. + uprv_memset(values, 0xff, sizeof(values)); + } + + /* + * Map a key to an integer count. + * Map at most 1k=0x400 different keys with this data structure. + */ + uint16_t map(int64_t key) { + int32_t hash=(int32_t)(key>>55)&0x1ff; + hash^=(int32_t)(key>>44)&0x7ff; + hash^=(int32_t)(key>>33)&0x7ff; + hash^=(int32_t)(key>>22)&0x7ff; + hash^=(int32_t)(key>>11)&0x7ff; + hash^=(int32_t)key&0x7ff; + for(;;) { + if(values[hash]==0xffff) { + // Unused slot. + keys[hash]=key; + reverse[count]=hash; + return values[hash]=count++; + } else if(keys[hash]==key) { + // Found a slot with this key. + return values[hash]; + } else { + // Used slot with a different key, move to another slot. + hash=(hash+prime)&0x7ff; + } + } + } + + uint16_t countKeys() const { return count; } + + /* + * Invert the hash map: Fill an array of length countKeys() with the keys + * indexed by their mapped values. + */ + void invert(int64_t *k) const { + uint16_t i; + + for(i=0; i>6; + if(prevIndex!=i) { + // Finish the end of the previous range. + if(prevIndex<0) { + prevIndex=0; + } else { + index[prevIndex++]=bitHash->map(b); + } + // Fill all-zero entries between ranges. + if(prevIndexmap(0); + do { + index[prevIndex++]=zero; + } while(prevIndex0xffff) { + break; + } + b|=~((INT64_C(1)<<(start&0x3f))-1); + j=end>>6; + if(imap(b); + // Fill all-one entries inside the range. + if(imap(INT64_C(0xffffffffffffffff)); + do { + index[i++]=all; + } while(icountKeys()>LENGTHOF(shortBits)) { + bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); + } + if(bits!=NULL) { + bitHash->invert(bits); + } else { + bits=shortBits; + errorCode=U_MEMORY_ALLOCATION_ERROR; + return; + } + + latin1Set[0]=(uint32_t)bits[0]; + latin1Set[1]=(uint32_t)(bits[0]>>32); + latin1Set[2]=(uint32_t)bits[1]; + latin1Set[3]=(uint32_t)(bits[1]>>32); + latin1Set[4]=(uint32_t)bits[2]; + latin1Set[5]=(uint32_t)(bits[2]>>32); + latin1Set[6]=(uint32_t)bits[3]; + latin1Set[7]=(uint32_t)(bits[3]>>32); + + restSet.remove(0, 0xffff); + } + + ~BitSet() { + if(bits!=shortBits) { + uprv_free(bits); + } + delete restSet; + } + + UBool contains(UChar32 c) const { + if((uint32_t)c<=0xff) { + return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); + } else if((uint32_t)c<0xffff) { + return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); + } else { + return restSet->contains(c); + } + } + +private: + uint16_t index[0x400]; + int64_t shortBits[32]; + int64_t *bits; + + uint32_t latin1Bits[8]; + + UnicodeSet *restSet; +};