]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/stringtriebuilder.h
ICU-511.25.tar.gz
[apple/icu.git] / icuSources / common / unicode / stringtriebuilder.h
1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: stringtriebuilder.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010dec24
12 * created by: Markus W. Scherer
13 */
14
15 #ifndef __STRINGTRIEBUILDER_H__
16 #define __STRINGTRIEBUILDER_H__
17
18 #include "unicode/utypes.h"
19 #include "unicode/uobject.h"
20
21 /**
22 * \file
23 * \brief C++ API: Builder API for trie builders
24 */
25
26 // Forward declaration.
27 struct UHashtable;
28 typedef struct UHashtable UHashtable;
29
30 /**
31 * Build options for BytesTrieBuilder and CharsTrieBuilder.
32 * @stable ICU 4.8
33 */
34 enum UStringTrieBuildOption {
35 /**
36 * Builds a trie quickly.
37 * @stable ICU 4.8
38 */
39 USTRINGTRIE_BUILD_FAST,
40 /**
41 * Builds a trie more slowly, attempting to generate
42 * a shorter but equivalent serialization.
43 * This build option also uses more memory.
44 *
45 * This option can be effective when many integer values are the same
46 * and string/byte sequence suffixes can be shared.
47 * Runtime speed is not expected to improve.
48 * @stable ICU 4.8
49 */
50 USTRINGTRIE_BUILD_SMALL
51 };
52
53 U_NAMESPACE_BEGIN
54
55 /**
56 * Base class for string trie builder classes.
57 *
58 * This class is not intended for public subclassing.
59 * @stable ICU 4.8
60 */
61 class U_COMMON_API StringTrieBuilder : public UObject {
62 public:
63 #ifndef U_HIDE_INTERNAL_API
64 /** @internal */
65 static UBool hashNode(const void *node);
66 /** @internal */
67 static UBool equalNodes(const void *left, const void *right);
68 #endif /* U_HIDE_INTERNAL_API */
69
70 protected:
71 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
72 // or else the compiler will create a public default constructor.
73 /** @internal */
74 StringTrieBuilder();
75 /** @internal */
76 virtual ~StringTrieBuilder();
77
78 #ifndef U_HIDE_INTERNAL_API
79 /** @internal */
80 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
81 /** @internal */
82 void deleteCompactBuilder();
83
84 /** @internal */
85 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
86
87 /** @internal */
88 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
89 /** @internal */
90 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
91 #endif /* U_HIDE_INTERNAL_API */
92
93 class Node;
94
95 #ifndef U_HIDE_INTERNAL_API
96 /** @internal */
97 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
98 /** @internal */
99 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
100 int32_t length, UErrorCode &errorCode);
101 #endif /* U_HIDE_INTERNAL_API */
102
103 /** @internal */
104 virtual int32_t getElementStringLength(int32_t i) const = 0;
105 /** @internal */
106 virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
107 /** @internal */
108 virtual int32_t getElementValue(int32_t i) const = 0;
109
110 // Finds the first unit index after this one where
111 // the first and last element have different units again.
112 /** @internal */
113 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
114
115 // Number of different units at unitIndex.
116 /** @internal */
117 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
118 /** @internal */
119 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
120 /** @internal */
121 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
122
123 /** @internal */
124 virtual UBool matchNodesCanHaveValues() const = 0;
125
126 /** @internal */
127 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
128 /** @internal */
129 virtual int32_t getMinLinearMatch() const = 0;
130 /** @internal */
131 virtual int32_t getMaxLinearMatchLength() const = 0;
132
133 #ifndef U_HIDE_INTERNAL_API
134 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
135 /** @internal */
136 static const int32_t kMaxBranchLinearSubNodeLength=5;
137
138 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
139 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
140 /** @internal */
141 static const int32_t kMaxSplitBranchLevels=14;
142
143 /**
144 * Makes sure that there is only one unique node registered that is
145 * equivalent to newNode.
146 * @param newNode Input node. The builder takes ownership.
147 * @param errorCode ICU in/out UErrorCode.
148 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
149 * @return newNode if it is the first of its kind, or
150 * an equivalent node if newNode is a duplicate.
151 * @internal
152 */
153 Node *registerNode(Node *newNode, UErrorCode &errorCode);
154 /**
155 * Makes sure that there is only one unique FinalValueNode registered
156 * with this value.
157 * Avoids creating a node if the value is a duplicate.
158 * @param value A final value.
159 * @param errorCode ICU in/out UErrorCode.
160 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
161 * @return A FinalValueNode with the given value.
162 * @internal
163 */
164 Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
165
166 /*
167 * C++ note:
168 * registerNode() and registerFinalValue() take ownership of their input nodes,
169 * and only return owned nodes.
170 * If they see a failure UErrorCode, they will delete the input node.
171 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
172 * If there is a failure, they return NULL.
173 *
174 * NULL Node pointers can be safely passed into other Nodes because
175 * they call the static Node::hashCode() which checks for a NULL pointer first.
176 *
177 * Therefore, as long as builder functions register a new node,
178 * they need to check for failures only before explicitly dereferencing
179 * a Node pointer, or before setting a new UErrorCode.
180 */
181
182 // Hash set of nodes, maps from nodes to integer 1.
183 /** @internal */
184 UHashtable *nodes;
185
186 /** @internal */
187 class Node : public UObject {
188 public:
189 Node(int32_t initialHash) : hash(initialHash), offset(0) {}
190 inline int32_t hashCode() const { return hash; }
191 // Handles node==NULL.
192 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
193 // Base class operator==() compares the actual class types.
194 virtual UBool operator==(const Node &other) const;
195 inline UBool operator!=(const Node &other) const { return !operator==(other); }
196 /**
197 * Traverses the Node graph and numbers branch edges, with rightmost edges first.
198 * This is to avoid writing a duplicate node twice.
199 *
200 * Branch nodes in this trie data structure are not symmetric.
201 * Most branch edges "jump" to other nodes but the rightmost branch edges
202 * just continue without a jump.
203 * Therefore, write() must write the rightmost branch edge last
204 * (trie units are written backwards), and must write it at that point even if
205 * it is a duplicate of a node previously written elsewhere.
206 *
207 * This function visits and marks right branch edges first.
208 * Edges are numbered with increasingly negative values because we share the
209 * offset field which gets positive values when nodes are written.
210 * A branch edge also remembers the first number for any of its edges.
211 *
212 * When a further-left branch edge has a number in the range of the rightmost
213 * edge's numbers, then it will be written as part of the required right edge
214 * and we can avoid writing it first.
215 *
216 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
217 * edge numbers.
218 *
219 * @param edgeNumber The first edge number for this node and its sub-nodes.
220 * @return An edge number that is at least the maximum-negative
221 * of the input edge number and the numbers of this node and all of its sub-nodes.
222 */
223 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
224 // write() must set the offset to a positive value.
225 virtual void write(StringTrieBuilder &builder) = 0;
226 // See markRightEdgesFirst.
227 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
228 StringTrieBuilder &builder) {
229 // Note: Edge numbers are negative, lastRight<=firstRight.
230 // If offset>0 then this node and its sub-nodes have been written already
231 // and we need not write them again.
232 // If this node is part of the unwritten right branch edge,
233 // then we wait until that is written.
234 if(offset<0 && (offset<lastRight || firstRight<offset)) {
235 write(builder);
236 }
237 }
238 inline int32_t getOffset() const { return offset; }
239 protected:
240 int32_t hash;
241 int32_t offset;
242 };
243
244 // This class should not be overridden because
245 // registerFinalValue() compares a stack-allocated FinalValueNode
246 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
247 // with the input node, and the
248 // !Node::operator==(other) used inside FinalValueNode::operator==(other)
249 // will be false if the typeid's are different.
250 /** @internal */
251 class FinalValueNode : public Node {
252 public:
253 FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
254 virtual UBool operator==(const Node &other) const;
255 virtual void write(StringTrieBuilder &builder);
256 protected:
257 int32_t value;
258 };
259
260 /**
261 * @internal
262 */
263 class ValueNode : public Node {
264 public:
265 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
266 virtual UBool operator==(const Node &other) const;
267 void setValue(int32_t v) {
268 hasValue=TRUE;
269 value=v;
270 hash=hash*37+v;
271 }
272 protected:
273 UBool hasValue;
274 int32_t value;
275 };
276
277 /**
278 * @internal
279 */
280 class IntermediateValueNode : public ValueNode {
281 public:
282 IntermediateValueNode(int32_t v, Node *nextNode)
283 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
284 virtual UBool operator==(const Node &other) const;
285 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
286 virtual void write(StringTrieBuilder &builder);
287 protected:
288 Node *next;
289 };
290
291 /**
292 * @internal
293 */
294 class LinearMatchNode : public ValueNode {
295 public:
296 LinearMatchNode(int32_t len, Node *nextNode)
297 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
298 length(len), next(nextNode) {}
299 virtual UBool operator==(const Node &other) const;
300 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
301 protected:
302 int32_t length;
303 Node *next;
304 };
305
306 /**
307 * @internal
308 */
309 class BranchNode : public Node {
310 public:
311 BranchNode(int32_t initialHash) : Node(initialHash) {}
312 protected:
313 int32_t firstEdgeNumber;
314 };
315
316 /**
317 * @internal
318 */
319 class ListBranchNode : public BranchNode {
320 public:
321 ListBranchNode() : BranchNode(0x444444), length(0) {}
322 virtual UBool operator==(const Node &other) const;
323 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
324 virtual void write(StringTrieBuilder &builder);
325 // Adds a unit with a final value.
326 void add(int32_t c, int32_t value) {
327 units[length]=(UChar)c;
328 equal[length]=NULL;
329 values[length]=value;
330 ++length;
331 hash=(hash*37+c)*37+value;
332 }
333 // Adds a unit which leads to another match node.
334 void add(int32_t c, Node *node) {
335 units[length]=(UChar)c;
336 equal[length]=node;
337 values[length]=0;
338 ++length;
339 hash=(hash*37+c)*37+hashCode(node);
340 }
341 protected:
342 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
343 int32_t length;
344 int32_t values[kMaxBranchLinearSubNodeLength];
345 UChar units[kMaxBranchLinearSubNodeLength];
346 };
347
348 /**
349 * @internal
350 */
351 class SplitBranchNode : public BranchNode {
352 public:
353 SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
354 : BranchNode(((0x555555*37+middleUnit)*37+
355 hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
356 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
357 virtual UBool operator==(const Node &other) const;
358 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
359 virtual void write(StringTrieBuilder &builder);
360 protected:
361 UChar unit;
362 Node *lessThan;
363 Node *greaterOrEqual;
364 };
365
366 // Branch head node, for writing the actual node lead unit.
367 /** @internal */
368 class BranchHeadNode : public ValueNode {
369 public:
370 BranchHeadNode(int32_t len, Node *subNode)
371 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
372 length(len), next(subNode) {}
373 virtual UBool operator==(const Node &other) const;
374 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
375 virtual void write(StringTrieBuilder &builder);
376 protected:
377 int32_t length;
378 Node *next; // A branch sub-node.
379 };
380 #endif /* U_HIDE_INTERNAL_API */
381
382 /** @internal */
383 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
384 Node *nextNode) const = 0;
385
386 /** @internal */
387 virtual int32_t write(int32_t unit) = 0;
388 /** @internal */
389 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
390 /** @internal */
391 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
392 /** @internal */
393 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
394 /** @internal */
395 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
396 };
397
398 U_NAMESPACE_END
399
400 #endif // __STRINGTRIEBUILDER_H__