]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/stringtriebuilder.h
ICU-491.11.2.tar.gz
[apple/icu.git] / icuSources / common / unicode / stringtriebuilder.h
1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: stringtriebuilder.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010dec24
12 * created by: Markus W. Scherer
13 */
14
15 #ifndef __STRINGTRIEBUILDER_H__
16 #define __STRINGTRIEBUILDER_H__
17
18 #include "unicode/utypes.h"
19 #include "unicode/uobject.h"
20
21 // Forward declaration.
22 struct UHashtable;
23 typedef struct UHashtable UHashtable;
24
25 /**
26 * Build options for BytesTrieBuilder and CharsTrieBuilder.
27 * @stable ICU 4.8
28 */
29 enum UStringTrieBuildOption {
30 /**
31 * Builds a trie quickly.
32 * @stable ICU 4.8
33 */
34 USTRINGTRIE_BUILD_FAST,
35 /**
36 * Builds a trie more slowly, attempting to generate
37 * a shorter but equivalent serialization.
38 * This build option also uses more memory.
39 *
40 * This option can be effective when many integer values are the same
41 * and string/byte sequence suffixes can be shared.
42 * Runtime speed is not expected to improve.
43 * @stable ICU 4.8
44 */
45 USTRINGTRIE_BUILD_SMALL
46 };
47
48 U_NAMESPACE_BEGIN
49
50 /**
51 * Base class for string trie builder classes.
52 *
53 * This class is not intended for public subclassing.
54 * @stable ICU 4.8
55 */
56 class U_COMMON_API StringTrieBuilder : public UObject {
57 public:
58 #ifndef U_HIDE_INTERNAL_API
59 /** @internal */
60 static UBool hashNode(const void *node);
61 /** @internal */
62 static UBool equalNodes(const void *left, const void *right);
63 #endif /* U_HIDE_INTERNAL_API */
64
65 protected:
66 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
67 // or else the compiler will create a public default constructor.
68 /** @internal */
69 StringTrieBuilder();
70 /** @internal */
71 virtual ~StringTrieBuilder();
72
73 #ifndef U_HIDE_INTERNAL_API
74 /** @internal */
75 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
76 /** @internal */
77 void deleteCompactBuilder();
78
79 /** @internal */
80 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
81
82 /** @internal */
83 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
84 /** @internal */
85 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
86 #endif /* U_HIDE_INTERNAL_API */
87
88 class Node;
89
90 #ifndef U_HIDE_INTERNAL_API
91 /** @internal */
92 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
93 /** @internal */
94 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
95 int32_t length, UErrorCode &errorCode);
96 #endif /* U_HIDE_INTERNAL_API */
97
98 /** @internal */
99 virtual int32_t getElementStringLength(int32_t i) const = 0;
100 /** @internal */
101 virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
102 /** @internal */
103 virtual int32_t getElementValue(int32_t i) const = 0;
104
105 // Finds the first unit index after this one where
106 // the first and last element have different units again.
107 /** @internal */
108 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
109
110 // Number of different units at unitIndex.
111 /** @internal */
112 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
113 /** @internal */
114 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
115 /** @internal */
116 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
117
118 /** @internal */
119 virtual UBool matchNodesCanHaveValues() const = 0;
120
121 /** @internal */
122 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
123 /** @internal */
124 virtual int32_t getMinLinearMatch() const = 0;
125 /** @internal */
126 virtual int32_t getMaxLinearMatchLength() const = 0;
127
128 #ifndef U_HIDE_INTERNAL_API
129 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
130 /** @internal */
131 static const int32_t kMaxBranchLinearSubNodeLength=5;
132
133 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
134 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
135 /** @internal */
136 static const int32_t kMaxSplitBranchLevels=14;
137
138 /**
139 * Makes sure that there is only one unique node registered that is
140 * equivalent to newNode.
141 * @param newNode Input node. The builder takes ownership.
142 * @param errorCode ICU in/out UErrorCode.
143 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
144 * @return newNode if it is the first of its kind, or
145 * an equivalent node if newNode is a duplicate.
146 * @internal
147 */
148 Node *registerNode(Node *newNode, UErrorCode &errorCode);
149 /**
150 * Makes sure that there is only one unique FinalValueNode registered
151 * with this value.
152 * Avoids creating a node if the value is a duplicate.
153 * @param value A final value.
154 * @param errorCode ICU in/out UErrorCode.
155 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
156 * @return A FinalValueNode with the given value.
157 * @internal
158 */
159 Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
160
161 /*
162 * C++ note:
163 * registerNode() and registerFinalValue() take ownership of their input nodes,
164 * and only return owned nodes.
165 * If they see a failure UErrorCode, they will delete the input node.
166 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
167 * If there is a failure, they return NULL.
168 *
169 * NULL Node pointers can be safely passed into other Nodes because
170 * they call the static Node::hashCode() which checks for a NULL pointer first.
171 *
172 * Therefore, as long as builder functions register a new node,
173 * they need to check for failures only before explicitly dereferencing
174 * a Node pointer, or before setting a new UErrorCode.
175 */
176
177 // Hash set of nodes, maps from nodes to integer 1.
178 /** @internal */
179 UHashtable *nodes;
180
181 /** @internal */
182 class Node : public UObject {
183 public:
184 Node(int32_t initialHash) : hash(initialHash), offset(0) {}
185 inline int32_t hashCode() const { return hash; }
186 // Handles node==NULL.
187 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
188 // Base class operator==() compares the actual class types.
189 virtual UBool operator==(const Node &other) const;
190 inline UBool operator!=(const Node &other) const { return !operator==(other); }
191 /**
192 * Traverses the Node graph and numbers branch edges, with rightmost edges first.
193 * This is to avoid writing a duplicate node twice.
194 *
195 * Branch nodes in this trie data structure are not symmetric.
196 * Most branch edges "jump" to other nodes but the rightmost branch edges
197 * just continue without a jump.
198 * Therefore, write() must write the rightmost branch edge last
199 * (trie units are written backwards), and must write it at that point even if
200 * it is a duplicate of a node previously written elsewhere.
201 *
202 * This function visits and marks right branch edges first.
203 * Edges are numbered with increasingly negative values because we share the
204 * offset field which gets positive values when nodes are written.
205 * A branch edge also remembers the first number for any of its edges.
206 *
207 * When a further-left branch edge has a number in the range of the rightmost
208 * edge's numbers, then it will be written as part of the required right edge
209 * and we can avoid writing it first.
210 *
211 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
212 * edge numbers.
213 *
214 * @param edgeNumber The first edge number for this node and its sub-nodes.
215 * @return An edge number that is at least the maximum-negative
216 * of the input edge number and the numbers of this node and all of its sub-nodes.
217 */
218 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
219 // write() must set the offset to a positive value.
220 virtual void write(StringTrieBuilder &builder) = 0;
221 // See markRightEdgesFirst.
222 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
223 StringTrieBuilder &builder) {
224 // Note: Edge numbers are negative, lastRight<=firstRight.
225 // If offset>0 then this node and its sub-nodes have been written already
226 // and we need not write them again.
227 // If this node is part of the unwritten right branch edge,
228 // then we wait until that is written.
229 if(offset<0 && (offset<lastRight || firstRight<offset)) {
230 write(builder);
231 }
232 }
233 inline int32_t getOffset() const { return offset; }
234 protected:
235 int32_t hash;
236 int32_t offset;
237 private:
238 // No ICU "poor man's RTTI" for this class nor its subclasses.
239 virtual UClassID getDynamicClassID() const;
240 };
241
242 // This class should not be overridden because
243 // registerFinalValue() compares a stack-allocated FinalValueNode
244 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
245 // with the input node, and the
246 // !Node::operator==(other) used inside FinalValueNode::operator==(other)
247 // will be false if the typeid's are different.
248 /** @internal */
249 class FinalValueNode : public Node {
250 public:
251 FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
252 virtual UBool operator==(const Node &other) const;
253 virtual void write(StringTrieBuilder &builder);
254 protected:
255 int32_t value;
256 };
257
258 /** @internal */
259 class ValueNode : public Node {
260 public:
261 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
262 virtual UBool operator==(const Node &other) const;
263 void setValue(int32_t v) {
264 hasValue=TRUE;
265 value=v;
266 hash=hash*37+v;
267 }
268 protected:
269 UBool hasValue;
270 int32_t value;
271 };
272
273 /** @internal */
274 class IntermediateValueNode : public ValueNode {
275 public:
276 IntermediateValueNode(int32_t v, Node *nextNode)
277 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
278 virtual UBool operator==(const Node &other) const;
279 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
280 virtual void write(StringTrieBuilder &builder);
281 protected:
282 Node *next;
283 };
284
285 /** @internal */
286 class LinearMatchNode : public ValueNode {
287 public:
288 LinearMatchNode(int32_t len, Node *nextNode)
289 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
290 length(len), next(nextNode) {}
291 virtual UBool operator==(const Node &other) const;
292 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
293 protected:
294 int32_t length;
295 Node *next;
296 };
297
298 /** @internal */
299 class BranchNode : public Node {
300 public:
301 BranchNode(int32_t initialHash) : Node(initialHash) {}
302 protected:
303 int32_t firstEdgeNumber;
304 };
305
306 /** @internal */
307 class ListBranchNode : public BranchNode {
308 public:
309 ListBranchNode() : BranchNode(0x444444), length(0) {}
310 virtual UBool operator==(const Node &other) const;
311 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
312 virtual void write(StringTrieBuilder &builder);
313 // Adds a unit with a final value.
314 void add(int32_t c, int32_t value) {
315 units[length]=(UChar)c;
316 equal[length]=NULL;
317 values[length]=value;
318 ++length;
319 hash=(hash*37+c)*37+value;
320 }
321 // Adds a unit which leads to another match node.
322 void add(int32_t c, Node *node) {
323 units[length]=(UChar)c;
324 equal[length]=node;
325 values[length]=0;
326 ++length;
327 hash=(hash*37+c)*37+hashCode(node);
328 }
329 protected:
330 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
331 int32_t length;
332 int32_t values[kMaxBranchLinearSubNodeLength];
333 UChar units[kMaxBranchLinearSubNodeLength];
334 };
335
336 /** @internal */
337 class SplitBranchNode : public BranchNode {
338 public:
339 SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
340 : BranchNode(((0x555555*37+middleUnit)*37+
341 hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
342 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
343 virtual UBool operator==(const Node &other) const;
344 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
345 virtual void write(StringTrieBuilder &builder);
346 protected:
347 UChar unit;
348 Node *lessThan;
349 Node *greaterOrEqual;
350 };
351
352 // Branch head node, for writing the actual node lead unit.
353 /** @internal */
354 class BranchHeadNode : public ValueNode {
355 public:
356 BranchHeadNode(int32_t len, Node *subNode)
357 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
358 length(len), next(subNode) {}
359 virtual UBool operator==(const Node &other) const;
360 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
361 virtual void write(StringTrieBuilder &builder);
362 protected:
363 int32_t length;
364 Node *next; // A branch sub-node.
365 };
366 #endif /* U_HIDE_INTERNAL_API */
367
368 /** @internal */
369 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
370 Node *nextNode) const = 0;
371
372 /** @internal */
373 virtual int32_t write(int32_t unit) = 0;
374 /** @internal */
375 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
376 /** @internal */
377 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
378 /** @internal */
379 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
380 /** @internal */
381 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
382
383 private:
384 // No ICU "poor man's RTTI" for this class nor its subclasses.
385 virtual UClassID getDynamicClassID() const;
386 };
387
388 U_NAMESPACE_END
389
390 #endif // __STRINGTRIEBUILDER_H__