]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/stringtriebuilder.h
fc02dba6293ba3bb124c86214c71467cbcb91e91
[apple/icu.git] / icuSources / common / unicode / stringtriebuilder.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2012,2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: stringtriebuilder.h
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010dec24
14 * created by: Markus W. Scherer
15 */
16
17 #ifndef __STRINGTRIEBUILDER_H__
18 #define __STRINGTRIEBUILDER_H__
19
20 #include "unicode/utypes.h"
21 #include "unicode/uobject.h"
22
23 /**
24 * \file
25 * \brief C++ API: Builder API for trie builders
26 */
27
28 // Forward declaration.
29 /// \cond
30 struct UHashtable;
31 typedef struct UHashtable UHashtable;
32 /// \endcond
33
/**
 * Build options for BytesTrieBuilder and CharsTrieBuilder.
 *
 * Selects the trade-off between build time/memory and the size of the
 * serialized trie.
 * @stable ICU 4.8
 */
enum UStringTrieBuildOption {
    /**
     * Builds a trie quickly.
     * @stable ICU 4.8
     */
    USTRINGTRIE_BUILD_FAST = 0,
    /**
     * Builds a trie more slowly, attempting to generate
     * a shorter but equivalent serialization.
     * This build option also uses more memory.
     *
     * This option can be effective when many integer values are the same
     * and string/byte sequence suffixes can be shared.
     * Runtime speed is not expected to improve.
     * @stable ICU 4.8
     */
    USTRINGTRIE_BUILD_SMALL = 1
};
56
57 #if U_SHOW_CPLUSPLUS_API
58 U_NAMESPACE_BEGIN
59
60 /**
61 * Base class for string trie builder classes.
62 *
63 * This class is not intended for public subclassing.
64 * @stable ICU 4.8
65 */
66 class U_COMMON_API StringTrieBuilder : public UObject {
67 public:
68 #ifndef U_HIDE_INTERNAL_API
69 /** @internal */
70 static int32_t hashNode(const void *node);
71 /** @internal */
72 static UBool equalNodes(const void *left, const void *right);
73 #endif /* U_HIDE_INTERNAL_API */
74
75 protected:
76 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
77 // or else the compiler will create a public default constructor.
78 /** @internal */
79 StringTrieBuilder();
80 /** @internal */
81 virtual ~StringTrieBuilder();
82
83 #ifndef U_HIDE_INTERNAL_API
84 /** @internal */
85 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
86 /** @internal */
87 void deleteCompactBuilder();
88
89 /** @internal */
90 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
91
92 /** @internal */
93 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
94 /** @internal */
95 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
96 #endif /* U_HIDE_INTERNAL_API */
97
98 class Node;
99
100 #ifndef U_HIDE_INTERNAL_API
101 /** @internal */
102 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
103 /** @internal */
104 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
105 int32_t length, UErrorCode &errorCode);
106 #endif /* U_HIDE_INTERNAL_API */
107
108 /** @internal */
109 virtual int32_t getElementStringLength(int32_t i) const = 0;
110 /** @internal */
111 virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
112 /** @internal */
113 virtual int32_t getElementValue(int32_t i) const = 0;
114
115 // Finds the first unit index after this one where
116 // the first and last element have different units again.
117 /** @internal */
118 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
119
120 // Number of different units at unitIndex.
121 /** @internal */
122 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
123 /** @internal */
124 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
125 /** @internal */
126 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
127
128 /** @internal */
129 virtual UBool matchNodesCanHaveValues() const = 0;
130
131 /** @internal */
132 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
133 /** @internal */
134 virtual int32_t getMinLinearMatch() const = 0;
135 /** @internal */
136 virtual int32_t getMaxLinearMatchLength() const = 0;
137
138 #ifndef U_HIDE_INTERNAL_API
139 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
140 /** @internal */
141 static const int32_t kMaxBranchLinearSubNodeLength=5;
142
143 // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
144 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
145 /** @internal */
146 static const int32_t kMaxSplitBranchLevels=14;
147
148 /**
149 * Makes sure that there is only one unique node registered that is
150 * equivalent to newNode.
151 * @param newNode Input node. The builder takes ownership.
152 * @param errorCode ICU in/out UErrorCode.
153 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
154 * @return newNode if it is the first of its kind, or
155 * an equivalent node if newNode is a duplicate.
156 * @internal
157 */
158 Node *registerNode(Node *newNode, UErrorCode &errorCode);
159 /**
160 * Makes sure that there is only one unique FinalValueNode registered
161 * with this value.
162 * Avoids creating a node if the value is a duplicate.
163 * @param value A final value.
164 * @param errorCode ICU in/out UErrorCode.
165 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
166 * @return A FinalValueNode with the given value.
167 * @internal
168 */
169 Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
170 #endif /* U_HIDE_INTERNAL_API */
171
172 /*
173 * C++ note:
174 * registerNode() and registerFinalValue() take ownership of their input nodes,
175 * and only return owned nodes.
176 * If they see a failure UErrorCode, they will delete the input node.
177 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
178 * If there is a failure, they return NULL.
179 *
180 * NULL Node pointers can be safely passed into other Nodes because
181 * they call the static Node::hashCode() which checks for a NULL pointer first.
182 *
183 * Therefore, as long as builder functions register a new node,
184 * they need to check for failures only before explicitly dereferencing
185 * a Node pointer, or before setting a new UErrorCode.
186 */
187
188 // Hash set of nodes, maps from nodes to integer 1.
189 /** @internal */
190 UHashtable *nodes;
191
192 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
193 // it is needed for layout of other objects.
194 /**
195 * @internal
196 * \cond
197 */
198 class Node : public UObject {
199 public:
200 Node(int32_t initialHash) : hash(initialHash), offset(0) {}
201 inline int32_t hashCode() const { return hash; }
202 // Handles node==NULL.
203 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
204 // Base class operator==() compares the actual class types.
205 virtual UBool operator==(const Node &other) const;
206 inline UBool operator!=(const Node &other) const { return !operator==(other); }
207 /**
208 * Traverses the Node graph and numbers branch edges, with rightmost edges first.
209 * This is to avoid writing a duplicate node twice.
210 *
211 * Branch nodes in this trie data structure are not symmetric.
212 * Most branch edges "jump" to other nodes but the rightmost branch edges
213 * just continue without a jump.
214 * Therefore, write() must write the rightmost branch edge last
215 * (trie units are written backwards), and must write it at that point even if
216 * it is a duplicate of a node previously written elsewhere.
217 *
218 * This function visits and marks right branch edges first.
219 * Edges are numbered with increasingly negative values because we share the
220 * offset field which gets positive values when nodes are written.
221 * A branch edge also remembers the first number for any of its edges.
222 *
223 * When a further-left branch edge has a number in the range of the rightmost
224 * edge's numbers, then it will be written as part of the required right edge
225 * and we can avoid writing it first.
226 *
227 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
228 * edge numbers.
229 *
230 * @param edgeNumber The first edge number for this node and its sub-nodes.
231 * @return An edge number that is at least the maximum-negative
232 * of the input edge number and the numbers of this node and all of its sub-nodes.
233 */
234 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
235 // write() must set the offset to a positive value.
236 virtual void write(StringTrieBuilder &builder) = 0;
237 // See markRightEdgesFirst.
238 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
239 StringTrieBuilder &builder) {
240 // Note: Edge numbers are negative, lastRight<=firstRight.
241 // If offset>0 then this node and its sub-nodes have been written already
242 // and we need not write them again.
243 // If this node is part of the unwritten right branch edge,
244 // then we wait until that is written.
245 if(offset<0 && (offset<lastRight || firstRight<offset)) {
246 write(builder);
247 }
248 }
249 inline int32_t getOffset() const { return offset; }
250 protected:
251 int32_t hash;
252 int32_t offset;
253 };
254
255 #ifndef U_HIDE_INTERNAL_API
256 // This class should not be overridden because
257 // registerFinalValue() compares a stack-allocated FinalValueNode
258 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
259 // with the input node, and the
260 // !Node::operator==(other) used inside FinalValueNode::operator==(other)
261 // will be false if the typeid's are different.
262 /** @internal */
263 class FinalValueNode : public Node {
264 public:
265 FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
266 virtual UBool operator==(const Node &other) const;
267 virtual void write(StringTrieBuilder &builder);
268 protected:
269 int32_t value;
270 };
271 #endif /* U_HIDE_INTERNAL_API */
272
273 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
274 // it is needed for layout of other objects.
275 /**
276 * @internal
277 */
278 class ValueNode : public Node {
279 public:
280 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
281 virtual UBool operator==(const Node &other) const;
282 void setValue(int32_t v) {
283 hasValue=TRUE;
284 value=v;
285 hash=hash*37u+v;
286 }
287 protected:
288 UBool hasValue;
289 int32_t value;
290 };
291
292 #ifndef U_HIDE_INTERNAL_API
293 /**
294 * @internal
295 */
296 class IntermediateValueNode : public ValueNode {
297 public:
298 IntermediateValueNode(int32_t v, Node *nextNode)
299 : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
300 virtual UBool operator==(const Node &other) const;
301 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
302 virtual void write(StringTrieBuilder &builder);
303 protected:
304 Node *next;
305 };
306 #endif /* U_HIDE_INTERNAL_API */
307
308 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
309 // it is needed for layout of other objects.
310 /**
311 * @internal
312 */
313 class LinearMatchNode : public ValueNode {
314 public:
315 LinearMatchNode(int32_t len, Node *nextNode)
316 : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
317 length(len), next(nextNode) {}
318 virtual UBool operator==(const Node &other) const;
319 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
320 protected:
321 int32_t length;
322 Node *next;
323 };
324
325 #ifndef U_HIDE_INTERNAL_API
326 /**
327 * @internal
328 */
329 class BranchNode : public Node {
330 public:
331 BranchNode(int32_t initialHash) : Node(initialHash) {}
332 protected:
333 int32_t firstEdgeNumber;
334 };
335
336 /**
337 * @internal
338 */
339 class ListBranchNode : public BranchNode {
340 public:
341 ListBranchNode() : BranchNode(0x444444), length(0) {}
342 virtual UBool operator==(const Node &other) const;
343 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
344 virtual void write(StringTrieBuilder &builder);
345 // Adds a unit with a final value.
346 void add(int32_t c, int32_t value) {
347 units[length]=(char16_t)c;
348 equal[length]=NULL;
349 values[length]=value;
350 ++length;
351 hash=(hash*37u+c)*37u+value;
352 }
353 // Adds a unit which leads to another match node.
354 void add(int32_t c, Node *node) {
355 units[length]=(char16_t)c;
356 equal[length]=node;
357 values[length]=0;
358 ++length;
359 hash=(hash*37u+c)*37u+hashCode(node);
360 }
361 protected:
362 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
363 int32_t length;
364 int32_t values[kMaxBranchLinearSubNodeLength];
365 char16_t units[kMaxBranchLinearSubNodeLength];
366 };
367
368 /**
369 * @internal
370 */
371 class SplitBranchNode : public BranchNode {
372 public:
373 SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
374 : BranchNode(((0x555555u*37u+middleUnit)*37u+
375 hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
376 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
377 virtual UBool operator==(const Node &other) const;
378 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
379 virtual void write(StringTrieBuilder &builder);
380 protected:
381 char16_t unit;
382 Node *lessThan;
383 Node *greaterOrEqual;
384 };
385
386 // Branch head node, for writing the actual node lead unit.
387 /** @internal */
388 class BranchHeadNode : public ValueNode {
389 public:
390 BranchHeadNode(int32_t len, Node *subNode)
391 : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
392 length(len), next(subNode) {}
393 virtual UBool operator==(const Node &other) const;
394 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
395 virtual void write(StringTrieBuilder &builder);
396 protected:
397 int32_t length;
398 Node *next; // A branch sub-node.
399 };
400
401 #endif /* U_HIDE_INTERNAL_API */
402 /// \endcond
403
404 /** @internal */
405 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
406 Node *nextNode) const = 0;
407
408 /** @internal */
409 virtual int32_t write(int32_t unit) = 0;
410 /** @internal */
411 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
412 /** @internal */
413 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
414 /** @internal */
415 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
416 /** @internal */
417 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
418 };
419
420 U_NAMESPACE_END
421 #endif // U_SHOW_CPLUSPLUS_API
422
423 #endif // __STRINGTRIEBUILDER_H__