ICU-400.37.tar.gz

[apple/icu.git] / icuSources / test / perf / unisetperf / draft / trieset.cpp
diff --git a/icuSources/test/perf/unisetperf/draft/trieset.cpp b/icuSources/test/perf/unisetperf/draft/trieset.cpp

new file mode 100644 (file)

index 0000000..6c47208
--- /dev/null
+++ b/icuSources/test/perf/unisetperf/draft/trieset.cpp
@@ -0,0 +1,111 @@
+/*  
+**********************************************************************
+*   Copyright (C) 2007, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   file name:  trieset.cpp
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2007jan15
+*   created by: Markus Scherer
+*
+*   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
+*   using a UTrie with 8-bit (byte) results per code point.
+*   Modifies the trie index to make the BMP linear, and uses the original set
+*   for supplementary code points.
+*/
+
+#include "unicode/utypes.h"
+#include "unicont.h"
+
+/*
+ * Yields a const uint8_t pointer that is UTRIE_DATA_BLOCK_LENGTH bytes past
+ * the start of the trie's data32 array (the cast applies before the addition,
+ * so the offset is counted in bytes).
+ * TrieSet::contains() indexes the result directly with code points 0..0xff;
+ * presumably this is where the linear Latin-1 block of a byte-wide trie
+ * starts -- TODO confirm once the 8-bit UTrie layout is implemented.
+ */
+#define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
+
+/*
+ * Reads the byte result for the 16-bit unit c16:
+ * the index entry for (c16>>UTRIE_SHIFT) is shifted left by UTRIE_INDEX_SHIFT
+ * to form a data offset, the low bits (c16&UTRIE_MASK) are added, and the
+ * byte at that offset in data32 (viewed as uint8_t) is the result.
+ * Note: c16 is evaluated twice; pass an expression without side effects.
+ */
+#define UTRIE_GET8_FROM_LEAD(trie, c16) \
+    ((const uint8_t *)(trie)->data32)[ \
+        ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
+        ((c16)&UTRIE_MASK) \
+    ]
+
+class TrieSet : public UObject, public UnicodeContainable {
+public:
+    TrieSet(const UnicodeSet &set, UErrorCode &errorCode)
+            : trieData(NULL), latin1(NULL), restSet(set.clone()) {
+        if(U_FAILURE(errorCode)) {
+            return;
+        }
+        if(restSet==NULL) {
+            errorCode=U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+
+        UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE);
+        UChar32 start, end;
+
+        UnicodeSetIterator iter(set);
+
+        while(iter.nextRange() && !iter.isString()) {
+            start=iter.getCodepoint();
+            end=iter.getCodepointEnd();
+            if(start>0xffff) {
+                break;
+            }
+            if(end>0xffff) {
+                end=0xffff;
+            }
+            if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) {
+                errorCode=U_INTERNAL_PROGRAM_ERROR;
+                return;
+            }
+        }
+
+        // Preflight the trie length.
+        int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode);
+        if(errorCode!=U_BUFFER_OVERFLOW_ERROR) {
+            return;
+        }
+
+        trieData=(uint32_t *)uprv_malloc(length);
+        if(trieData==NULL) {
+            errorCode=U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+
+        errorCode=U_ZERO_ERROR;
+        utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode);
+        utrie_unserialize(&trie, trieData, length, &errorCode);  // TODO: Implement for 8-bit UTrie!
+
+        if(U_SUCCESS(errorCode)) {
+            // Copy the indexes for surrogate code points into the BMP range
+            // for simple access across the entire BMP.
+            uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT),
+                        trie.index+UTRIE_BMP_INDEX_LENGTH,
+                        (0x800>>UTRIE_SHIFT)*2);
+            latin1=UTRIE_GET8_LATIN1(&trie);
+        }
+
+        restSet.remove(0, 0xffff);
+    }
+
+    ~TrieSet() {
+        uprv_free(trieData);
+        delete restSet;
+    }
+
+    UBool contains(UChar32 c) const {
+        if((uint32_t)c<=0xff) {
+            return (UBool)latin1[c];
+        } else if((uint32_t)c<0xffff) {
+            return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c);
+        } else {
+            return restSet->contains(c);
+        }
+    }
+
+private:
+    uint32_t *trieData;
+    const uint8_t *latin1;
+    UTrie trie;
+    UnicodeSet *restSet;
+};