icuSources/test/perf/unisetperf/draft/bitset.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 2014, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   file name:  bitset.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 *   created on: 2007jan15
  12 *   created by: Markus Scherer
  13 *
  14 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
  15 *   using a folded bit set consisting of a 1k-entry index table and a
  16 *   compacted array of 64-bit words.
  17 *   Uses a simple hash table for compaction.
  18 *   Uses the original set for supplementary code points.
  19 */
  20
  21 #include "unicode/utypes.h"
  22 #include "unicont.h"
  23 #include "cmemory.h" // for UPRV_LENGTHOF
  24
  25 /*
  26  * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
  27  * Hashes 64-bit words and maps them to 16-bit integers which are
  28  * assigned in order of new incoming words for subsequent storage
  29  * in a contiguous array.
  30  */
  31 struct BMPBitHash : public UObject {
  32     int64_t keys[0x800];  // 2k
  33     uint16_t values[0x800];
  34     uint16_t reverse[0x400];
  35     uint16_t count;
  36     const int32_t prime=1301;  // Less than 2k.
  37
  38     BMPBitHash() : count(0) {
  39         // Fill values[] with 0xffff.
  40         uprv_memset(values, 0xff, sizeof(values));
  41     }
  42
  43     /*
  44      * Map a key to an integer count.
  45      * Map at most 1k=0x400 different keys with this data structure.
  46      */
  47     uint16_t map(int64_t key) {
  48         int32_t hash=(int32_t)(key>>55)&0x1ff;
  49         hash^=(int32_t)(key>>44)&0x7ff;
  50         hash^=(int32_t)(key>>33)&0x7ff;
  51         hash^=(int32_t)(key>>22)&0x7ff;
  52         hash^=(int32_t)(key>>11)&0x7ff;
  53         hash^=(int32_t)key&0x7ff;
  54         for(;;) {
  55             if(values[hash]==0xffff) {
  56                 // Unused slot.
  57                 keys[hash]=key;
  58                 reverse[count]=hash;
  59                 return values[hash]=count++;
  60             } else if(keys[hash]==key) {
  61                 // Found a slot with this key.
  62                 return values[hash];
  63             } else {
  64                 // Used slot with a different key, move to another slot.
  65                 hash=(hash+prime)&0x7ff;
  66             }
  67         }
  68     }
  69
  70     uint16_t countKeys() const { return count; }
  71
  72     /*
  73      * Invert the hash map: Fill an array of length countKeys() with the keys
  74      * indexed by their mapped values.
  75      */
  76     void invert(int64_t *k) const {
  77         uint16_t i;
  78
  79         for(i=0; i<count; ++i) {
  80             k[i]=keys[reverse[i]];
  81         }
  82     }
  83 };
  84
  85 class BitSet : public UObject, public UnicodeContainable {
  86 public:
  87     BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
  88         if(U_FAILURE(errorCode)) {
  89             return;
  90         }
  91         BMPBitHash *bitHash=new BMPBitHash;
  92         if(bitHash==NULL || restSet==NULL) {
  93             errorCode=U_MEMORY_ALLOCATION_ERROR;
  94             return;
  95         }
  96
  97         UnicodeSetIterator iter(set);
  98         int64_t b;
  99         UChar32 start, end;
 100         int32_t prevIndex, i, j;
 101
 102         b=0;  // Not necessary but makes compilers happy.
 103         prevIndex=-1;
 104         for(;;) {
 105             if(iter.nextRange() && !iter.isString()) {
 106                 start=iter.getCodepoint();
 107                 end=iter.getCodepointEnd();
 108             } else {
 109                 start=0x10000;
 110             }
 111             i=start>>6;
 112             if(prevIndex!=i) {
 113                 // Finish the end of the previous range.
 114                 if(prevIndex<0) {
 115                     prevIndex=0;
 116                 } else {
 117                     index[prevIndex++]=bitHash->map(b);
 118                 }
 119                 // Fill all-zero entries between ranges.
 120                 if(prevIndex<i) {
 121                     uint16_t zero=bitHash->map(0);
 122                     do {
 123                         index[prevIndex++]=zero;
 124                     } while(prevIndex<i);
 125                 }
 126                 b=0;
 127             }
 128             if(start>0xffff) {
 129                 break;
 130             }
 131             b|=~((INT64_C(1)<<(start&0x3f))-1);
 132             j=end>>6;
 133             if(i<j) {
 134                 // Set bits for the start of the range.
 135                 index[i++]=bitHash->map(b);
 136                 // Fill all-one entries inside the range.
 137                 if(i<j) {
 138                     uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
 139                     do {
 140                         index[i++]=all;
 141                     } while(i<j);
 142                 }
 143                 b=INT64_C(0xffffffffffffffff);
 144             }
 145             /* i==j */
 146             b&=(INT64_C(1)<<(end&0x3f))-1;
 147             prevIndex=j;
 148         }
 149
 150         if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
 151             bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
 152         }
 153         if(bits!=NULL) {
 154             bitHash->invert(bits);
 155         } else {
 156             bits=shortBits;
 157             errorCode=U_MEMORY_ALLOCATION_ERROR;
 158             return;
 159         }
 160
 161         latin1Set[0]=(uint32_t)bits[0];
 162         latin1Set[1]=(uint32_t)(bits[0]>>32);
 163         latin1Set[2]=(uint32_t)bits[1];
 164         latin1Set[3]=(uint32_t)(bits[1]>>32);
 165         latin1Set[4]=(uint32_t)bits[2];
 166         latin1Set[5]=(uint32_t)(bits[2]>>32);
 167         latin1Set[6]=(uint32_t)bits[3];
 168         latin1Set[7]=(uint32_t)(bits[3]>>32);
 169
 170         restSet.remove(0, 0xffff);
 171     }
 172
 173     ~BitSet() {
 174         if(bits!=shortBits) {
 175             uprv_free(bits);
 176         }
 177         delete restSet;
 178     }
 179
 180     UBool contains(UChar32 c) const {
 181         if((uint32_t)c<=0xff) {
 182             return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
 183         } else if((uint32_t)c<0xffff) {
 184             return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
 185         } else {
 186             return restSet->contains(c);
 187         }
 188     }
 189
 190 private:
 191     uint16_t index[0x400];
 192     int64_t shortBits[32];
 193     int64_t *bits;
 194
 195     uint32_t latin1Bits[8];
 196
 197     UnicodeSet *restSet;
 198 };