]>
Commit | Line | Data |
---|---|---|
46f4442e A |
1 | /* |
2 | ********************************************************************** | |
3 | * Copyright (C) 2007, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ********************************************************************** | |
6 | * file name: trieset.cpp | |
7 | * encoding: US-ASCII | |
8 | * tab size: 8 (not used) | |
9 | * indentation:4 | |
10 | * | |
11 | * created on: 2007jan15 | |
12 | * created by: Markus Scherer | |
13 | * | |
14 | * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet | |
15 | * using a UTrie with 8-bit (byte) results per code point. | |
16 | * Modifies the trie index to make the BMP linear, and uses the original set | |
17 | * for supplementary code points. | |
18 | */ | |
19 | ||
20 | #include "unicode/utypes.h" | |
21 | #include "unicont.h" | |
22 | ||
/*
 * Yields a byte pointer into the trie's data array, which is reinterpreted
 * as uint8_t and advanced by UTRIE_DATA_BLOCK_LENGTH bytes.
 * TrieSet stores the result as its "latin1" table and indexes it directly
 * with code points <= 0xff.
 */
#define UTRIE_GET8_LATIN1(trie) \
    ((const uint8_t *)(trie)->data32 + UTRIE_DATA_BLOCK_LENGTH)
24 | ||
/*
 * Two-stage lookup of an 8-bit trie value for a 16-bit unit c16:
 *   stage 1: index[c16 >> UTRIE_SHIFT] selects a data block; the entry is
 *            scaled by << UTRIE_INDEX_SHIFT to get the block's data offset;
 *   stage 2: the low bits (c16 & UTRIE_MASK) select the byte in that block.
 * The data array (data32) is read as bytes.
 *
 * Note: c16 is evaluated twice, so do not pass an expression with
 * side effects.
 */
#define UTRIE_GET8_FROM_LEAD(trie, c16) \
    (((const uint8_t *)(trie)->data32)[ \
        ((int32_t)(trie)->index[(c16) >> UTRIE_SHIFT] << UTRIE_INDEX_SHIFT) + \
        ((c16) & UTRIE_MASK) \
    ])
30 | ||
31 | class TrieSet : public UObject, public UnicodeContainable { | |
32 | public: | |
33 | TrieSet(const UnicodeSet &set, UErrorCode &errorCode) | |
34 | : trieData(NULL), latin1(NULL), restSet(set.clone()) { | |
35 | if(U_FAILURE(errorCode)) { | |
36 | return; | |
37 | } | |
38 | if(restSet==NULL) { | |
39 | errorCode=U_MEMORY_ALLOCATION_ERROR; | |
40 | return; | |
41 | } | |
42 | ||
43 | UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE); | |
44 | UChar32 start, end; | |
45 | ||
46 | UnicodeSetIterator iter(set); | |
47 | ||
48 | while(iter.nextRange() && !iter.isString()) { | |
49 | start=iter.getCodepoint(); | |
50 | end=iter.getCodepointEnd(); | |
51 | if(start>0xffff) { | |
52 | break; | |
53 | } | |
54 | if(end>0xffff) { | |
55 | end=0xffff; | |
56 | } | |
57 | if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) { | |
58 | errorCode=U_INTERNAL_PROGRAM_ERROR; | |
59 | return; | |
60 | } | |
61 | } | |
62 | ||
63 | // Preflight the trie length. | |
64 | int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode); | |
65 | if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { | |
66 | return; | |
67 | } | |
68 | ||
69 | trieData=(uint32_t *)uprv_malloc(length); | |
70 | if(trieData==NULL) { | |
71 | errorCode=U_MEMORY_ALLOCATION_ERROR; | |
72 | return; | |
73 | } | |
74 | ||
75 | errorCode=U_ZERO_ERROR; | |
76 | utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode); | |
77 | utrie_unserialize(&trie, trieData, length, &errorCode); // TODO: Implement for 8-bit UTrie! | |
78 | ||
79 | if(U_SUCCESS(errorCode)) { | |
80 | // Copy the indexes for surrogate code points into the BMP range | |
81 | // for simple access across the entire BMP. | |
82 | uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT), | |
83 | trie.index+UTRIE_BMP_INDEX_LENGTH, | |
84 | (0x800>>UTRIE_SHIFT)*2); | |
85 | latin1=UTRIE_GET8_LATIN1(&trie); | |
86 | } | |
87 | ||
88 | restSet.remove(0, 0xffff); | |
89 | } | |
90 | ||
91 | ~TrieSet() { | |
92 | uprv_free(trieData); | |
93 | delete restSet; | |
94 | } | |
95 | ||
96 | UBool contains(UChar32 c) const { | |
97 | if((uint32_t)c<=0xff) { | |
98 | return (UBool)latin1[c]; | |
99 | } else if((uint32_t)c<0xffff) { | |
100 | return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c); | |
101 | } else { | |
102 | return restSet->contains(c); | |
103 | } | |
104 | } | |
105 | ||
106 | private: | |
107 | uint32_t *trieData; | |
108 | const uint8_t *latin1; | |
109 | UTrie trie; | |
110 | UnicodeSet *restSet; | |
111 | }; |