icuSources/test/perf/unisetperf/draft/bitset.cpp

   1 /*
   2 *************************************************************************
   3 *   © 2016 and later: Unicode, Inc. and others.
   4 *   License & terms of use: http://www.unicode.org/copyright.html#License
   5 *************************************************************************
   6 *************************************************************************
   7 *   Copyright (C) 2014, International Business Machines
   8 *   Corporation and others.  All Rights Reserved.
   9 *************************************************************************
  10 *   file name:  bitset.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2007jan15
  16 *   created by: Markus Scherer
  17 *
  18 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
  19 *   using a folded bit set consisting of a 1k-entry index table and a
  20 *   compacted array of 64-bit words.
  21 *   Uses a simple hash table for compaction.
  22 *   Uses the original set for supplementary code points.
  23 */
  24
  25 #include "unicode/utypes.h"
  26 #include "unicont.h"
  27 #include "cmemory.h" // for UPRV_LENGTHOF
  28
  29 /*
  30  * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
  31  * Hashes 64-bit words and maps them to 16-bit integers which are
  32  * assigned in order of new incoming words for subsequent storage
  33  * in a contiguous array.
  34  */
  35 struct BMPBitHash : public UObject {
  36     int64_t keys[0x800];  // 2k
  37     uint16_t values[0x800];
  38     uint16_t reverse[0x400];
  39     uint16_t count;
  40     const int32_t prime=1301;  // Less than 2k.
  41
  42     BMPBitHash() : count(0) {
  43         // Fill values[] with 0xffff.
  44         uprv_memset(values, 0xff, sizeof(values));
  45     }
  46
  47     /*
  48      * Map a key to an integer count.
  49      * Map at most 1k=0x400 different keys with this data structure.
  50      */
  51     uint16_t map(int64_t key) {
  52         int32_t hash=(int32_t)(key>>55)&0x1ff;
  53         hash^=(int32_t)(key>>44)&0x7ff;
  54         hash^=(int32_t)(key>>33)&0x7ff;
  55         hash^=(int32_t)(key>>22)&0x7ff;
  56         hash^=(int32_t)(key>>11)&0x7ff;
  57         hash^=(int32_t)key&0x7ff;
  58         for(;;) {
  59             if(values[hash]==0xffff) {
  60                 // Unused slot.
  61                 keys[hash]=key;
  62                 reverse[count]=hash;
  63                 return values[hash]=count++;
  64             } else if(keys[hash]==key) {
  65                 // Found a slot with this key.
  66                 return values[hash];
  67             } else {
  68                 // Used slot with a different key, move to another slot.
  69                 hash=(hash+prime)&0x7ff;
  70             }
  71         }
  72     }
  73
  74     uint16_t countKeys() const { return count; }
  75
  76     /*
  77      * Invert the hash map: Fill an array of length countKeys() with the keys
  78      * indexed by their mapped values.
  79      */
  80     void invert(int64_t *k) const {
  81         uint16_t i;
  82
  83         for(i=0; i<count; ++i) {
  84             k[i]=keys[reverse[i]];
  85         }
  86     }
  87 };
  88
  89 class BitSet : public UObject, public UnicodeContainable {
  90 public:
  91     BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
  92         if(U_FAILURE(errorCode)) {
  93             return;
  94         }
  95         BMPBitHash *bitHash=new BMPBitHash;
  96         if(bitHash==NULL || restSet==NULL) {
  97             errorCode=U_MEMORY_ALLOCATION_ERROR;
  98             return;
  99         }
 100
 101         UnicodeSetIterator iter(set);
 102         int64_t b;
 103         UChar32 start, end;
 104         int32_t prevIndex, i, j;
 105
 106         b=0;  // Not necessary but makes compilers happy.
 107         prevIndex=-1;
 108         for(;;) {
 109             if(iter.nextRange() && !iter.isString()) {
 110                 start=iter.getCodepoint();
 111                 end=iter.getCodepointEnd();
 112             } else {
 113                 start=0x10000;
 114             }
 115             i=start>>6;
 116             if(prevIndex!=i) {
 117                 // Finish the end of the previous range.
 118                 if(prevIndex<0) {
 119                     prevIndex=0;
 120                 } else {
 121                     index[prevIndex++]=bitHash->map(b);
 122                 }
 123                 // Fill all-zero entries between ranges.
 124                 if(prevIndex<i) {
 125                     uint16_t zero=bitHash->map(0);
 126                     do {
 127                         index[prevIndex++]=zero;
 128                     } while(prevIndex<i);
 129                 }
 130                 b=0;
 131             }
 132             if(start>0xffff) {
 133                 break;
 134             }
 135             b|=~((INT64_C(1)<<(start&0x3f))-1);
 136             j=end>>6;
 137             if(i<j) {
 138                 // Set bits for the start of the range.
 139                 index[i++]=bitHash->map(b);
 140                 // Fill all-one entries inside the range.
 141                 if(i<j) {
 142                     uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
 143                     do {
 144                         index[i++]=all;
 145                     } while(i<j);
 146                 }
 147                 b=INT64_C(0xffffffffffffffff);
 148             }
 149             /* i==j */
 150             b&=(INT64_C(1)<<(end&0x3f))-1;
 151             prevIndex=j;
 152         }
 153
 154         if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
 155             bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
 156         }
 157         if(bits!=NULL) {
 158             bitHash->invert(bits);
 159         } else {
 160             bits=shortBits;
 161             errorCode=U_MEMORY_ALLOCATION_ERROR;
 162             return;
 163         }
 164
 165         latin1Set[0]=(uint32_t)bits[0];
 166         latin1Set[1]=(uint32_t)(bits[0]>>32);
 167         latin1Set[2]=(uint32_t)bits[1];
 168         latin1Set[3]=(uint32_t)(bits[1]>>32);
 169         latin1Set[4]=(uint32_t)bits[2];
 170         latin1Set[5]=(uint32_t)(bits[2]>>32);
 171         latin1Set[6]=(uint32_t)bits[3];
 172         latin1Set[7]=(uint32_t)(bits[3]>>32);
 173
 174         restSet.remove(0, 0xffff);
 175     }
 176
 177     ~BitSet() {
 178         if(bits!=shortBits) {
 179             uprv_free(bits);
 180         }
 181         delete restSet;
 182     }
 183
 184     UBool contains(UChar32 c) const {
 185         if((uint32_t)c<=0xff) {
 186             return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
 187         } else if((uint32_t)c<0xffff) {
 188             return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
 189         } else {
 190             return restSet->contains(c);
 191         }
 192     }
 193
 194 private:
 195     uint16_t index[0x400];
 196     int64_t shortBits[32];
 197     int64_t *bits;
 198
 199     uint32_t latin1Bits[8];
 200
 201     UnicodeSet *restSet;
 202 };