2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: stringtriebuilder.h
8 * tab size: 8 (not used)
11 * created on: 2010dec24
12 * created by: Markus W. Scherer
15 #ifndef __STRINGTRIEBUILDER_H__
16 #define __STRINGTRIEBUILDER_H__
18 #include "unicode/utypes.h"
19 #include "unicode/uobject.h"
23 * \brief C++ API: Builder API for trie builders
26 // Forward declaration.
28 typedef struct UHashtable UHashtable
;
31 * Build options for BytesTrieBuilder and CharsTrieBuilder.
34 enum UStringTrieBuildOption
{
36 * Builds a trie quickly.
39 USTRINGTRIE_BUILD_FAST
,
41 * Builds a trie more slowly, attempting to generate
42 * a shorter but equivalent serialization.
43 * This build option also uses more memory.
45 * This option can be effective when many integer values are the same
46 * and string/byte sequence suffixes can be shared.
47 * Runtime speed is not expected to improve.
50 USTRINGTRIE_BUILD_SMALL
56 * Base class for string trie builder classes.
58 * This class is not intended for public subclassing.
61 class U_COMMON_API StringTrieBuilder
: public UObject
{
63 #ifndef U_HIDE_INTERNAL_API
65 static UBool
hashNode(const void *node
);
67 static UBool
equalNodes(const void *left
, const void *right
);
68 #endif /* U_HIDE_INTERNAL_API */
71 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
72 // or else the compiler will create a public default constructor.
76 virtual ~StringTrieBuilder();
78 #ifndef U_HIDE_INTERNAL_API
80 void createCompactBuilder(int32_t sizeGuess
, UErrorCode
&errorCode
);
82 void deleteCompactBuilder();
85 void build(UStringTrieBuildOption buildOption
, int32_t elementsLength
, UErrorCode
&errorCode
);
88 int32_t writeNode(int32_t start
, int32_t limit
, int32_t unitIndex
);
90 int32_t writeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
, int32_t length
);
91 #endif /* U_HIDE_INTERNAL_API */
95 #ifndef U_HIDE_INTERNAL_API
97 Node
*makeNode(int32_t start
, int32_t limit
, int32_t unitIndex
, UErrorCode
&errorCode
);
99 Node
*makeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
,
100 int32_t length
, UErrorCode
&errorCode
);
101 #endif /* U_HIDE_INTERNAL_API */
104 virtual int32_t getElementStringLength(int32_t i
) const = 0;
106 virtual UChar
getElementUnit(int32_t i
, int32_t unitIndex
) const = 0;
108 virtual int32_t getElementValue(int32_t i
) const = 0;
110 // Finds the first unit index after this one where
111 // the first and last element have different units again.
113 virtual int32_t getLimitOfLinearMatch(int32_t first
, int32_t last
, int32_t unitIndex
) const = 0;
115 // Number of different units at unitIndex.
117 virtual int32_t countElementUnits(int32_t start
, int32_t limit
, int32_t unitIndex
) const = 0;
119 virtual int32_t skipElementsBySomeUnits(int32_t i
, int32_t unitIndex
, int32_t count
) const = 0;
121 virtual int32_t indexOfElementWithNextUnit(int32_t i
, int32_t unitIndex
, UChar unit
) const = 0;
124 virtual UBool
matchNodesCanHaveValues() const = 0;
127 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
129 virtual int32_t getMinLinearMatch() const = 0;
131 virtual int32_t getMaxLinearMatchLength() const = 0;
133 #ifndef U_HIDE_INTERNAL_API
134 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
136 static const int32_t kMaxBranchLinearSubNodeLength
=5;
138 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
139 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
141 static const int32_t kMaxSplitBranchLevels
=14;
144 * Makes sure that there is only one unique node registered that is
145 * equivalent to newNode.
146 * @param newNode Input node. The builder takes ownership.
147 * @param errorCode ICU in/out UErrorCode.
148 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
149 * @return newNode if it is the first of its kind, or
150 * an equivalent node if newNode is a duplicate.
153 Node
*registerNode(Node
*newNode
, UErrorCode
&errorCode
);
155 * Makes sure that there is only one unique FinalValueNode registered
157 * Avoids creating a node if the value is a duplicate.
158 * @param value A final value.
159 * @param errorCode ICU in/out UErrorCode.
160 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
161 * @return A FinalValueNode with the given value.
164 Node
*registerFinalValue(int32_t value
, UErrorCode
&errorCode
);
168 * registerNode() and registerFinalValue() take ownership of their input nodes,
169 * and only return owned nodes.
170 * If they see a failure UErrorCode, they will delete the input node.
171 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
172 * If there is a failure, they return NULL.
174 * NULL Node pointers can be safely passed into other Nodes because
175 * they call the static Node::hashCode() which checks for a NULL pointer first.
177 * Therefore, as long as builder functions register a new node,
178 * they need to check for failures only before explicitly dereferencing
179 * a Node pointer, or before setting a new UErrorCode.
182 // Hash set of nodes, maps from nodes to integer 1.
187 class Node
: public UObject
{
189 Node(int32_t initialHash
) : hash(initialHash
), offset(0) {}
190 inline int32_t hashCode() const { return hash
; }
191 // Handles node==NULL.
192 static inline int32_t hashCode(const Node
*node
) { return node
==NULL
? 0 : node
->hashCode(); }
193 // Base class operator==() compares the actual class types.
194 virtual UBool
operator==(const Node
&other
) const;
195 inline UBool
operator!=(const Node
&other
) const { return !operator==(other
); }
197 * Traverses the Node graph and numbers branch edges, with rightmost edges first.
198 * This is to avoid writing a duplicate node twice.
200 * Branch nodes in this trie data structure are not symmetric.
201 * Most branch edges "jump" to other nodes but the rightmost branch edges
202 * just continue without a jump.
203 * Therefore, write() must write the rightmost branch edge last
204 * (trie units are written backwards), and must write it at that point even if
205 * it is a duplicate of a node previously written elsewhere.
207 * This function visits and marks right branch edges first.
208 * Edges are numbered with increasingly negative values because we share the
209 * offset field which gets positive values when nodes are written.
210 * A branch edge also remembers the first number for any of its edges.
212 * When a further-left branch edge has a number in the range of the rightmost
213 * edge's numbers, then it will be written as part of the required right edge
214 * and we can avoid writing it first.
216 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
219 * @param edgeNumber The first edge number for this node and its sub-nodes.
220 * @return An edge number that is at least the maximum-negative
221 * of the input edge number and the numbers of this node and all of its sub-nodes.
223 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
224 // write() must set the offset to a positive value.
225 virtual void write(StringTrieBuilder
&builder
) = 0;
226 // See markRightEdgesFirst.
227 inline void writeUnlessInsideRightEdge(int32_t firstRight
, int32_t lastRight
,
228 StringTrieBuilder
&builder
) {
229 // Note: Edge numbers are negative, lastRight<=firstRight.
230 // If offset>0 then this node and its sub-nodes have been written already
231 // and we need not write them again.
232 // If this node is part of the unwritten right branch edge,
233 // then we wait until that is written.
234 if(offset
<0 && (offset
<lastRight
|| firstRight
<offset
)) {
238 inline int32_t getOffset() const { return offset
; }
244 // This class should not be overridden because
245 // registerFinalValue() compares a stack-allocated FinalValueNode
246 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
247 // with the input node, and the
248 // !Node::operator==(other) used inside FinalValueNode::operator==(other)
249 // will be false if the typeid's are different.
251 class FinalValueNode
: public Node
{
253 FinalValueNode(int32_t v
) : Node(0x111111*37+v
), value(v
) {}
254 virtual UBool
operator==(const Node
&other
) const;
255 virtual void write(StringTrieBuilder
&builder
);
263 class ValueNode
: public Node
{
265 ValueNode(int32_t initialHash
) : Node(initialHash
), hasValue(FALSE
), value(0) {}
266 virtual UBool
operator==(const Node
&other
) const;
267 void setValue(int32_t v
) {
280 class IntermediateValueNode
: public ValueNode
{
282 IntermediateValueNode(int32_t v
, Node
*nextNode
)
283 : ValueNode(0x222222*37+hashCode(nextNode
)), next(nextNode
) { setValue(v
); }
284 virtual UBool
operator==(const Node
&other
) const;
285 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
286 virtual void write(StringTrieBuilder
&builder
);
294 class LinearMatchNode
: public ValueNode
{
296 LinearMatchNode(int32_t len
, Node
*nextNode
)
297 : ValueNode((0x333333*37+len
)*37+hashCode(nextNode
)),
298 length(len
), next(nextNode
) {}
299 virtual UBool
operator==(const Node
&other
) const;
300 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
309 class BranchNode
: public Node
{
311 BranchNode(int32_t initialHash
) : Node(initialHash
) {}
313 int32_t firstEdgeNumber
;
319 class ListBranchNode
: public BranchNode
{
321 ListBranchNode() : BranchNode(0x444444), length(0) {}
322 virtual UBool
operator==(const Node
&other
) const;
323 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
324 virtual void write(StringTrieBuilder
&builder
);
325 // Adds a unit with a final value.
326 void add(int32_t c
, int32_t value
) {
327 units
[length
]=(UChar
)c
;
329 values
[length
]=value
;
331 hash
=(hash
*37+c
)*37+value
;
333 // Adds a unit which leads to another match node.
334 void add(int32_t c
, Node
*node
) {
335 units
[length
]=(UChar
)c
;
339 hash
=(hash
*37+c
)*37+hashCode(node
);
342 Node
*equal
[kMaxBranchLinearSubNodeLength
]; // NULL means "has final value".
344 int32_t values
[kMaxBranchLinearSubNodeLength
];
345 UChar units
[kMaxBranchLinearSubNodeLength
];
351 class SplitBranchNode
: public BranchNode
{
353 SplitBranchNode(UChar middleUnit
, Node
*lessThanNode
, Node
*greaterOrEqualNode
)
354 : BranchNode(((0x555555*37+middleUnit
)*37+
355 hashCode(lessThanNode
))*37+hashCode(greaterOrEqualNode
)),
356 unit(middleUnit
), lessThan(lessThanNode
), greaterOrEqual(greaterOrEqualNode
) {}
357 virtual UBool
operator==(const Node
&other
) const;
358 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
359 virtual void write(StringTrieBuilder
&builder
);
363 Node
*greaterOrEqual
;
366 // Branch head node, for writing the actual node lead unit.
368 class BranchHeadNode
: public ValueNode
{
370 BranchHeadNode(int32_t len
, Node
*subNode
)
371 : ValueNode((0x666666*37+len
)*37+hashCode(subNode
)),
372 length(len
), next(subNode
) {}
373 virtual UBool
operator==(const Node
&other
) const;
374 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
375 virtual void write(StringTrieBuilder
&builder
);
378 Node
*next
; // A branch sub-node.
380 #endif /* U_HIDE_INTERNAL_API */
383 virtual Node
*createLinearMatchNode(int32_t i
, int32_t unitIndex
, int32_t length
,
384 Node
*nextNode
) const = 0;
387 virtual int32_t write(int32_t unit
) = 0;
389 virtual int32_t writeElementUnits(int32_t i
, int32_t unitIndex
, int32_t length
) = 0;
391 virtual int32_t writeValueAndFinal(int32_t i
, UBool isFinal
) = 0;
393 virtual int32_t writeValueAndType(UBool hasValue
, int32_t value
, int32_t node
) = 0;
395 virtual int32_t writeDeltaTo(int32_t jumpTarget
) = 0;
400 #endif // __STRINGTRIEBUILDER_H__