2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: stringtriebuilder.h
8 * tab size: 8 (not used)
11 * created on: 2010dec24
12 * created by: Markus W. Scherer
15 #ifndef __STRINGTRIEBUILDER_H__
16 #define __STRINGTRIEBUILDER_H__
18 #include "unicode/utypes.h"
19 #include "unicode/uobject.h"
21 // Forward declaration.
23 typedef struct UHashtable UHashtable
;
26 * Build options for BytesTrieBuilder and CharsTrieBuilder.
29 enum UStringTrieBuildOption
{
31 * Builds a trie quickly.
34 USTRINGTRIE_BUILD_FAST
,
36 * Builds a trie more slowly, attempting to generate
37 * a shorter but equivalent serialization.
38 * This build option also uses more memory.
40 * This option can be effective when many integer values are the same
41 * and string/byte sequence suffixes can be shared.
42 * Runtime speed is not expected to improve.
45 USTRINGTRIE_BUILD_SMALL
51 * Base class for string trie builder classes.
53 * This class is not intended for public subclassing.
56 class U_COMMON_API StringTrieBuilder
: public UObject
{
58 #ifndef U_HIDE_INTERNAL_API
60 static UBool
hashNode(const void *node
);
62 static UBool
equalNodes(const void *left
, const void *right
);
63 #endif /* U_HIDE_INTERNAL_API */
66 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
67 // or else the compiler will create a public default constructor.
71 virtual ~StringTrieBuilder();
73 #ifndef U_HIDE_INTERNAL_API
75 void createCompactBuilder(int32_t sizeGuess
, UErrorCode
&errorCode
);
77 void deleteCompactBuilder();
80 void build(UStringTrieBuildOption buildOption
, int32_t elementsLength
, UErrorCode
&errorCode
);
83 int32_t writeNode(int32_t start
, int32_t limit
, int32_t unitIndex
);
85 int32_t writeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
, int32_t length
);
86 #endif /* U_HIDE_INTERNAL_API */
90 #ifndef U_HIDE_INTERNAL_API
92 Node
*makeNode(int32_t start
, int32_t limit
, int32_t unitIndex
, UErrorCode
&errorCode
);
94 Node
*makeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
,
95 int32_t length
, UErrorCode
&errorCode
);
96 #endif /* U_HIDE_INTERNAL_API */
99 virtual int32_t getElementStringLength(int32_t i
) const = 0;
101 virtual UChar
getElementUnit(int32_t i
, int32_t unitIndex
) const = 0;
103 virtual int32_t getElementValue(int32_t i
) const = 0;
105 // Finds the first unit index after this one where
106 // the first and last element have different units again.
108 virtual int32_t getLimitOfLinearMatch(int32_t first
, int32_t last
, int32_t unitIndex
) const = 0;
110 // Number of different units at unitIndex.
112 virtual int32_t countElementUnits(int32_t start
, int32_t limit
, int32_t unitIndex
) const = 0;
114 virtual int32_t skipElementsBySomeUnits(int32_t i
, int32_t unitIndex
, int32_t count
) const = 0;
116 virtual int32_t indexOfElementWithNextUnit(int32_t i
, int32_t unitIndex
, UChar unit
) const = 0;
119 virtual UBool
matchNodesCanHaveValues() const = 0;
122 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
124 virtual int32_t getMinLinearMatch() const = 0;
126 virtual int32_t getMaxLinearMatchLength() const = 0;
128 #ifndef U_HIDE_INTERNAL_API
129 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
131 static const int32_t kMaxBranchLinearSubNodeLength
=5;
133 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
134 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
136 static const int32_t kMaxSplitBranchLevels
=14;
139 * Makes sure that there is only one unique node registered that is
140 * equivalent to newNode.
141 * @param newNode Input node. The builder takes ownership.
142 * @param errorCode ICU in/out UErrorCode.
143 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
144 * @return newNode if it is the first of its kind, or
145 * an equivalent node if newNode is a duplicate.
148 Node
*registerNode(Node
*newNode
, UErrorCode
&errorCode
);
150 * Makes sure that there is only one unique FinalValueNode registered
152 * Avoids creating a node if the value is a duplicate.
153 * @param value A final value.
154 * @param errorCode ICU in/out UErrorCode.
155 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
156 * @return A FinalValueNode with the given value.
159 Node
*registerFinalValue(int32_t value
, UErrorCode
&errorCode
);
163 * registerNode() and registerFinalValue() take ownership of their input nodes,
164 * and only return owned nodes.
165 * If they see a failure UErrorCode, they will delete the input node.
166 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
167 * If there is a failure, they return NULL.
169 * NULL Node pointers can be safely passed into other Nodes because
170 * they call the static Node::hashCode() which checks for a NULL pointer first.
172 * Therefore, as long as builder functions register a new node,
173 * they need to check for failures only before explicitly dereferencing
174 * a Node pointer, or before setting a new UErrorCode.
177 // Hash set of nodes, maps from nodes to integer 1.
182 class Node
: public UObject
{
184 Node(int32_t initialHash
) : hash(initialHash
), offset(0) {}
185 inline int32_t hashCode() const { return hash
; }
186 // Handles node==NULL.
187 static inline int32_t hashCode(const Node
*node
) { return node
==NULL
? 0 : node
->hashCode(); }
188 // Base class operator==() compares the actual class types.
189 virtual UBool
operator==(const Node
&other
) const;
190 inline UBool
operator!=(const Node
&other
) const { return !operator==(other
); }
192 * Traverses the Node graph and numbers branch edges, with rightmost edges first.
193 * This is to avoid writing a duplicate node twice.
195 * Branch nodes in this trie data structure are not symmetric.
196 * Most branch edges "jump" to other nodes but the rightmost branch edges
197 * just continue without a jump.
198 * Therefore, write() must write the rightmost branch edge last
199 * (trie units are written backwards), and must write it at that point even if
200 * it is a duplicate of a node previously written elsewhere.
202 * This function visits and marks right branch edges first.
203 * Edges are numbered with increasingly negative values because we share the
204 * offset field which gets positive values when nodes are written.
205 * A branch edge also remembers the first number for any of its edges.
207 * When a further-left branch edge has a number in the range of the rightmost
208 * edge's numbers, then it will be written as part of the required right edge
209 * and we can avoid writing it first.
211 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
214 * @param edgeNumber The first edge number for this node and its sub-nodes.
215 * @return An edge number that is at least the maximum-negative
216 * of the input edge number and the numbers of this node and all of its sub-nodes.
218 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
219 // write() must set the offset to a positive value.
220 virtual void write(StringTrieBuilder
&builder
) = 0;
221 // See markRightEdgesFirst.
222 inline void writeUnlessInsideRightEdge(int32_t firstRight
, int32_t lastRight
,
223 StringTrieBuilder
&builder
) {
224 // Note: Edge numbers are negative, lastRight<=firstRight.
225 // If offset>0 then this node and its sub-nodes have been written already
226 // and we need not write them again.
227 // If this node is part of the unwritten right branch edge,
228 // then we wait until that is written.
229 if(offset
<0 && (offset
<lastRight
|| firstRight
<offset
)) {
233 inline int32_t getOffset() const { return offset
; }
238 // No ICU "poor man's RTTI" for this class nor its subclasses.
239 virtual UClassID
getDynamicClassID() const;
242 // This class should not be overridden because
243 // registerFinalValue() compares a stack-allocated FinalValueNode
244 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
245 // with the input node, and the
246 // !Node::operator==(other) used inside FinalValueNode::operator==(other)
247 // will be false if the typeid's are different.
249 class FinalValueNode
: public Node
{
251 FinalValueNode(int32_t v
) : Node(0x111111*37+v
), value(v
) {}
252 virtual UBool
operator==(const Node
&other
) const;
253 virtual void write(StringTrieBuilder
&builder
);
259 class ValueNode
: public Node
{
261 ValueNode(int32_t initialHash
) : Node(initialHash
), hasValue(FALSE
), value(0) {}
262 virtual UBool
operator==(const Node
&other
) const;
263 void setValue(int32_t v
) {
274 class IntermediateValueNode
: public ValueNode
{
276 IntermediateValueNode(int32_t v
, Node
*nextNode
)
277 : ValueNode(0x222222*37+hashCode(nextNode
)), next(nextNode
) { setValue(v
); }
278 virtual UBool
operator==(const Node
&other
) const;
279 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
280 virtual void write(StringTrieBuilder
&builder
);
286 class LinearMatchNode
: public ValueNode
{
288 LinearMatchNode(int32_t len
, Node
*nextNode
)
289 : ValueNode((0x333333*37+len
)*37+hashCode(nextNode
)),
290 length(len
), next(nextNode
) {}
291 virtual UBool
operator==(const Node
&other
) const;
292 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
299 class BranchNode
: public Node
{
301 BranchNode(int32_t initialHash
) : Node(initialHash
) {}
303 int32_t firstEdgeNumber
;
307 class ListBranchNode
: public BranchNode
{
309 ListBranchNode() : BranchNode(0x444444), length(0) {}
310 virtual UBool
operator==(const Node
&other
) const;
311 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
312 virtual void write(StringTrieBuilder
&builder
);
313 // Adds a unit with a final value.
314 void add(int32_t c
, int32_t value
) {
315 units
[length
]=(UChar
)c
;
317 values
[length
]=value
;
319 hash
=(hash
*37+c
)*37+value
;
321 // Adds a unit which leads to another match node.
322 void add(int32_t c
, Node
*node
) {
323 units
[length
]=(UChar
)c
;
327 hash
=(hash
*37+c
)*37+hashCode(node
);
330 Node
*equal
[kMaxBranchLinearSubNodeLength
]; // NULL means "has final value".
332 int32_t values
[kMaxBranchLinearSubNodeLength
];
333 UChar units
[kMaxBranchLinearSubNodeLength
];
337 class SplitBranchNode
: public BranchNode
{
339 SplitBranchNode(UChar middleUnit
, Node
*lessThanNode
, Node
*greaterOrEqualNode
)
340 : BranchNode(((0x555555*37+middleUnit
)*37+
341 hashCode(lessThanNode
))*37+hashCode(greaterOrEqualNode
)),
342 unit(middleUnit
), lessThan(lessThanNode
), greaterOrEqual(greaterOrEqualNode
) {}
343 virtual UBool
operator==(const Node
&other
) const;
344 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
345 virtual void write(StringTrieBuilder
&builder
);
349 Node
*greaterOrEqual
;
352 // Branch head node, for writing the actual node lead unit.
354 class BranchHeadNode
: public ValueNode
{
356 BranchHeadNode(int32_t len
, Node
*subNode
)
357 : ValueNode((0x666666*37+len
)*37+hashCode(subNode
)),
358 length(len
), next(subNode
) {}
359 virtual UBool
operator==(const Node
&other
) const;
360 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
361 virtual void write(StringTrieBuilder
&builder
);
364 Node
*next
; // A branch sub-node.
366 #endif /* U_HIDE_INTERNAL_API */
369 virtual Node
*createLinearMatchNode(int32_t i
, int32_t unitIndex
, int32_t length
,
370 Node
*nextNode
) const = 0;
373 virtual int32_t write(int32_t unit
) = 0;
375 virtual int32_t writeElementUnits(int32_t i
, int32_t unitIndex
, int32_t length
) = 0;
377 virtual int32_t writeValueAndFinal(int32_t i
, UBool isFinal
) = 0;
379 virtual int32_t writeValueAndType(UBool hasValue
, int32_t value
, int32_t node
) = 0;
381 virtual int32_t writeDeltaTo(int32_t jumpTarget
) = 0;
384 // No ICU "poor man's RTTI" for this class nor its subclasses.
385 virtual UClassID
getDynamicClassID() const;
390 #endif // __STRINGTRIEBUILDER_H__