ICU-400.37.tar.gz

[apple/icu.git] / icuSources / test / perf / unisetperf / draft / bitset.cpp
diff --git a/icuSources/test/perf/unisetperf/draft/bitset.cpp b/icuSources/test/perf/unisetperf/draft/bitset.cpp

new file mode 100644 (file)

index 0000000..487d0c0
--- /dev/null
+++ b/icuSources/test/perf/unisetperf/draft/bitset.cpp
@@ -0,0 +1,197 @@
+/*  
+**********************************************************************
+*   Copyright (C) 2007, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   file name:  bitset.cpp
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2007jan15
+*   created by: Markus Scherer
+*
+*   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
+*   using a folded bit set consisting of a 1k-entry index table and a
+*   compacted array of 64-bit words.
+*   Uses a simple hash table for compaction.
+*   Uses the original set for supplementary code points.
+*/
+
+#include "unicode/utypes.h"
+#include "unicont.h"
+
+/*
+ * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
+ * Hashes 64-bit words and maps them to 16-bit integers which are
+ * assigned in order of new incoming words for subsequent storage
+ * in a contiguous array.
+ */
+struct BMPBitHash : public UObject {
+    int64_t keys[0x800];  // 2k
+    uint16_t values[0x800];
+    uint16_t reverse[0x400];
+    uint16_t count;
+    const int32_t prime=1301;  // Less than 2k.
+
+    BMPBitHash() : count(0) {
+        // Fill values[] with 0xffff.
+        uprv_memset(values, 0xff, sizeof(values));
+    }
+
+    /*
+     * Map a key to an integer count.
+     * Map at most 1k=0x400 different keys with this data structure.
+     */
+    uint16_t map(int64_t key) {
+        int32_t hash=(int32_t)(key>>55)&0x1ff;
+        hash^=(int32_t)(key>>44)&0x7ff;
+        hash^=(int32_t)(key>>33)&0x7ff;
+        hash^=(int32_t)(key>>22)&0x7ff;
+        hash^=(int32_t)(key>>11)&0x7ff;
+        hash^=(int32_t)key&0x7ff;
+        for(;;) {
+            if(values[hash]==0xffff) {
+                // Unused slot.
+                keys[hash]=key;
+                reverse[count]=hash;
+                return values[hash]=count++;
+            } else if(keys[hash]==key) {
+                // Found a slot with this key.
+                return values[hash];
+            } else {
+                // Used slot with a different key, move to another slot.
+                hash=(hash+prime)&0x7ff;
+            }
+        }
+    }
+
+    uint16_t countKeys() const { return count; }
+
+    /*
+     * Invert the hash map: Fill an array of length countKeys() with the keys
+     * indexed by their mapped values.
+     */
+    void invert(int64_t *k) const {
+        uint16_t i;
+
+        for(i=0; i<count; ++i) {
+            k[i]=keys[reverse[i]];
+        }
+    }
+};
+
+class BitSet : public UObject, public UnicodeContainable {
+public:
+    BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
+        if(U_FAILURE(errorCode)) {
+            return;
+        }
+        BMPBitHash *bitHash=new BMPBitHash;
+        if(bitHash==NULL || restSet==NULL) {
+            errorCode=U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+
+        UnicodeSetIterator iter(set);
+        int64_t b;
+        UChar32 start, end;
+        int32_t prevIndex, i, j;
+
+        b=0;  // Not necessary but makes compilers happy.
+        prevIndex=-1;
+        for(;;) {
+            if(iter.nextRange() && !iter.isString()) {
+                start=iter.getCodepoint();
+                end=iter.getCodepointEnd();
+            } else {
+                start=0x10000;
+            }
+            i=start>>6;
+            if(prevIndex!=i) {
+                // Finish the end of the previous range.
+                if(prevIndex<0) {
+                    prevIndex=0;
+                } else {
+                    index[prevIndex++]=bitHash->map(b);
+                }
+                // Fill all-zero entries between ranges.
+                if(prevIndex<i) {
+                    uint16_t zero=bitHash->map(0);
+                    do {
+                        index[prevIndex++]=zero;
+                    } while(prevIndex<i);
+                }
+                b=0;
+            }
+            if(start>0xffff) {
+                break;
+            }
+            b|=~((INT64_C(1)<<(start&0x3f))-1);
+            j=end>>6;
+            if(i<j) {
+                // Set bits for the start of the range.
+                index[i++]=bitHash->map(b);
+                // Fill all-one entries inside the range.
+                if(i<j) {
+                    uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
+                    do {
+                        index[i++]=all;
+                    } while(i<j);
+                }
+                b=INT64_C(0xffffffffffffffff);
+            }
+            /* i==j */
+            b&=(INT64_C(1)<<(end&0x3f))-1;
+            prevIndex=j;
+        }
+
+        if(bitHash->countKeys()>LENGTHOF(shortBits)) {
+            bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
+        }
+        if(bits!=NULL) {
+            bitHash->invert(bits);
+        } else {
+            bits=shortBits;
+            errorCode=U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+
+        latin1Set[0]=(uint32_t)bits[0];
+        latin1Set[1]=(uint32_t)(bits[0]>>32);
+        latin1Set[2]=(uint32_t)bits[1];
+        latin1Set[3]=(uint32_t)(bits[1]>>32);
+        latin1Set[4]=(uint32_t)bits[2];
+        latin1Set[5]=(uint32_t)(bits[2]>>32);
+        latin1Set[6]=(uint32_t)bits[3];
+        latin1Set[7]=(uint32_t)(bits[3]>>32);
+
+        restSet.remove(0, 0xffff);
+    }
+
+    ~BitSet() {
+        if(bits!=shortBits) {
+            uprv_free(bits);
+        }
+        delete restSet;
+    }
+
+    UBool contains(UChar32 c) const {
+        if((uint32_t)c<=0xff) {
+            return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
+        } else if((uint32_t)c<0xffff) {
+            return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
+        } else {
+            return restSet->contains(c);
+        }
+    }
+
+private:
+    uint16_t index[0x400];
+    int64_t shortBits[32];
+    int64_t *bits;
+
+    uint32_t latin1Bits[8];
+
+    UnicodeSet *restSet;
+};