2 *******************************************************************************
3 * Copyright (C) 2010-2012,2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: stringtriebuilder.h
8 * tab size: 8 (not used)
11 * created on: 2010dec24
12 * created by: Markus W. Scherer
15 #ifndef __STRINGTRIEBUILDER_H__
16 #define __STRINGTRIEBUILDER_H__
18 #include "unicode/utypes.h"
19 #include "unicode/uobject.h"
23 * \brief C++ API: Builder API for trie builders
26 // Forward declaration.
28 typedef struct UHashtable UHashtable
;
31 * Build options for BytesTrieBuilder and CharsTrieBuilder.
34 enum UStringTrieBuildOption
{
36 * Builds a trie quickly.
39 USTRINGTRIE_BUILD_FAST
,
41 * Builds a trie more slowly, attempting to generate
42 * a shorter but equivalent serialization.
43 * This build option also uses more memory.
45 * This option can be effective when many integer values are the same
46 * and string/byte sequence suffixes can be shared.
47 * Runtime speed is not expected to improve.
50 USTRINGTRIE_BUILD_SMALL
56 * Base class for string trie builder classes.
58 * This class is not intended for public subclassing.
61 class U_COMMON_API StringTrieBuilder
: public UObject
{
63 #ifndef U_HIDE_INTERNAL_API
65 static UBool
hashNode(const void *node
);
67 static UBool
equalNodes(const void *left
, const void *right
);
68 #endif /* U_HIDE_INTERNAL_API */
71 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
72 // or else the compiler will create a public default constructor.
76 virtual ~StringTrieBuilder();
78 #ifndef U_HIDE_INTERNAL_API
80 void createCompactBuilder(int32_t sizeGuess
, UErrorCode
&errorCode
);
82 void deleteCompactBuilder();
85 void build(UStringTrieBuildOption buildOption
, int32_t elementsLength
, UErrorCode
&errorCode
);
88 int32_t writeNode(int32_t start
, int32_t limit
, int32_t unitIndex
);
90 int32_t writeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
, int32_t length
);
91 #endif /* U_HIDE_INTERNAL_API */
95 #ifndef U_HIDE_INTERNAL_API
97 Node
*makeNode(int32_t start
, int32_t limit
, int32_t unitIndex
, UErrorCode
&errorCode
);
99 Node
*makeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
,
100 int32_t length
, UErrorCode
&errorCode
);
101 #endif /* U_HIDE_INTERNAL_API */
104 virtual int32_t getElementStringLength(int32_t i
) const = 0;
106 virtual UChar
getElementUnit(int32_t i
, int32_t unitIndex
) const = 0;
108 virtual int32_t getElementValue(int32_t i
) const = 0;
110 // Finds the first unit index after this one where
111 // the first and last element have different units again.
113 virtual int32_t getLimitOfLinearMatch(int32_t first
, int32_t last
, int32_t unitIndex
) const = 0;
115 // Number of different units at unitIndex.
117 virtual int32_t countElementUnits(int32_t start
, int32_t limit
, int32_t unitIndex
) const = 0;
119 virtual int32_t skipElementsBySomeUnits(int32_t i
, int32_t unitIndex
, int32_t count
) const = 0;
121 virtual int32_t indexOfElementWithNextUnit(int32_t i
, int32_t unitIndex
, UChar unit
) const = 0;
124 virtual UBool
matchNodesCanHaveValues() const = 0;
127 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
129 virtual int32_t getMinLinearMatch() const = 0;
131 virtual int32_t getMaxLinearMatchLength() const = 0;
133 #ifndef U_HIDE_INTERNAL_API
134 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
136 static const int32_t kMaxBranchLinearSubNodeLength
=5;
138 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
139 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
141 static const int32_t kMaxSplitBranchLevels
=14;
144 * Makes sure that there is only one unique node registered that is
145 * equivalent to newNode.
146 * @param newNode Input node. The builder takes ownership.
147 * @param errorCode ICU in/out UErrorCode.
148 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
149 * @return newNode if it is the first of its kind, or
150 * an equivalent node if newNode is a duplicate.
153 Node
*registerNode(Node
*newNode
, UErrorCode
&errorCode
);
155 * Makes sure that there is only one unique FinalValueNode registered
157 * Avoids creating a node if the value is a duplicate.
158 * @param value A final value.
159 * @param errorCode ICU in/out UErrorCode.
160 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
161 * @return A FinalValueNode with the given value.
164 Node
*registerFinalValue(int32_t value
, UErrorCode
&errorCode
);
165 #endif /* U_HIDE_INTERNAL_API */
169 * registerNode() and registerFinalValue() take ownership of their input nodes,
170 * and only return owned nodes.
171 * If they see a failure UErrorCode, they will delete the input node.
172 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
173 * If there is a failure, they return NULL.
175 * NULL Node pointers can be safely passed into other Nodes because
176 * they call the static Node::hashCode() which checks for a NULL pointer first.
178 * Therefore, as long as builder functions register a new node,
179 * they need to check for failures only before explicitly dereferencing
180 * a Node pointer, or before setting a new UErrorCode.
183 // Hash set of nodes, maps from nodes to integer 1.
187 #ifndef U_HIDE_INTERNAL_API
189 class Node
: public UObject
{
191 Node(int32_t initialHash
) : hash(initialHash
), offset(0) {}
192 inline int32_t hashCode() const { return hash
; }
193 // Handles node==NULL.
194 static inline int32_t hashCode(const Node
*node
) { return node
==NULL
? 0 : node
->hashCode(); }
195 // Base class operator==() compares the actual class types.
196 virtual UBool
operator==(const Node
&other
) const;
197 inline UBool
operator!=(const Node
&other
) const { return !operator==(other
); }
199 * Traverses the Node graph and numbers branch edges, with rightmost edges first.
200 * This is to avoid writing a duplicate node twice.
202 * Branch nodes in this trie data structure are not symmetric.
203 * Most branch edges "jump" to other nodes but the rightmost branch edges
204 * just continue without a jump.
205 * Therefore, write() must write the rightmost branch edge last
206 * (trie units are written backwards), and must write it at that point even if
207 * it is a duplicate of a node previously written elsewhere.
209 * This function visits and marks right branch edges first.
210 * Edges are numbered with increasingly negative values because we share the
211 * offset field which gets positive values when nodes are written.
212 * A branch edge also remembers the first number for any of its edges.
214 * When a further-left branch edge has a number in the range of the rightmost
215 * edge's numbers, then it will be written as part of the required right edge
216 * and we can avoid writing it first.
218 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
221 * @param edgeNumber The first edge number for this node and its sub-nodes.
222 * @return An edge number that is at least the maximum-negative
223 * of the input edge number and the numbers of this node and all of its sub-nodes.
225 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
226 // write() must set the offset to a positive value.
227 virtual void write(StringTrieBuilder
&builder
) = 0;
228 // See markRightEdgesFirst.
229 inline void writeUnlessInsideRightEdge(int32_t firstRight
, int32_t lastRight
,
230 StringTrieBuilder
&builder
) {
231 // Note: Edge numbers are negative, lastRight<=firstRight.
232 // If offset>0 then this node and its sub-nodes have been written already
233 // and we need not write them again.
234 // If this node is part of the unwritten right branch edge,
235 // then we wait until that is written.
236 if(offset
<0 && (offset
<lastRight
|| firstRight
<offset
)) {
240 inline int32_t getOffset() const { return offset
; }
246 // This class should not be overridden because
247 // registerFinalValue() compares a stack-allocated FinalValueNode
248 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
249 // with the input node, and the
250 // !Node::operator==(other) used inside FinalValueNode::operator==(other)
251 // will be false if the typeid's are different.
253 class FinalValueNode
: public Node
{
255 FinalValueNode(int32_t v
) : Node(0x111111*37+v
), value(v
) {}
256 virtual UBool
operator==(const Node
&other
) const;
257 virtual void write(StringTrieBuilder
&builder
);
265 class ValueNode
: public Node
{
267 ValueNode(int32_t initialHash
) : Node(initialHash
), hasValue(FALSE
), value(0) {}
268 virtual UBool
operator==(const Node
&other
) const;
269 void setValue(int32_t v
) {
282 class IntermediateValueNode
: public ValueNode
{
284 IntermediateValueNode(int32_t v
, Node
*nextNode
)
285 : ValueNode(0x222222*37+hashCode(nextNode
)), next(nextNode
) { setValue(v
); }
286 virtual UBool
operator==(const Node
&other
) const;
287 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
288 virtual void write(StringTrieBuilder
&builder
);
296 class LinearMatchNode
: public ValueNode
{
298 LinearMatchNode(int32_t len
, Node
*nextNode
)
299 : ValueNode((0x333333*37+len
)*37+hashCode(nextNode
)),
300 length(len
), next(nextNode
) {}
301 virtual UBool
operator==(const Node
&other
) const;
302 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
311 class BranchNode
: public Node
{
313 BranchNode(int32_t initialHash
) : Node(initialHash
) {}
315 int32_t firstEdgeNumber
;
321 class ListBranchNode
: public BranchNode
{
323 ListBranchNode() : BranchNode(0x444444), length(0) {}
324 virtual UBool
operator==(const Node
&other
) const;
325 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
326 virtual void write(StringTrieBuilder
&builder
);
327 // Adds a unit with a final value.
328 void add(int32_t c
, int32_t value
) {
329 units
[length
]=(UChar
)c
;
331 values
[length
]=value
;
333 hash
=(hash
*37+c
)*37+value
;
335 // Adds a unit which leads to another match node.
336 void add(int32_t c
, Node
*node
) {
337 units
[length
]=(UChar
)c
;
341 hash
=(hash
*37+c
)*37+hashCode(node
);
344 Node
*equal
[kMaxBranchLinearSubNodeLength
]; // NULL means "has final value".
346 int32_t values
[kMaxBranchLinearSubNodeLength
];
347 UChar units
[kMaxBranchLinearSubNodeLength
];
353 class SplitBranchNode
: public BranchNode
{
355 SplitBranchNode(UChar middleUnit
, Node
*lessThanNode
, Node
*greaterOrEqualNode
)
356 : BranchNode(((0x555555*37+middleUnit
)*37+
357 hashCode(lessThanNode
))*37+hashCode(greaterOrEqualNode
)),
358 unit(middleUnit
), lessThan(lessThanNode
), greaterOrEqual(greaterOrEqualNode
) {}
359 virtual UBool
operator==(const Node
&other
) const;
360 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
361 virtual void write(StringTrieBuilder
&builder
);
365 Node
*greaterOrEqual
;
368 // Branch head node, for writing the actual node lead unit.
370 class BranchHeadNode
: public ValueNode
{
372 BranchHeadNode(int32_t len
, Node
*subNode
)
373 : ValueNode((0x666666*37+len
)*37+hashCode(subNode
)),
374 length(len
), next(subNode
) {}
375 virtual UBool
operator==(const Node
&other
) const;
376 virtual int32_t markRightEdgesFirst(int32_t edgeNumber
);
377 virtual void write(StringTrieBuilder
&builder
);
380 Node
*next
; // A branch sub-node.
382 #endif /* U_HIDE_INTERNAL_API */
385 virtual Node
*createLinearMatchNode(int32_t i
, int32_t unitIndex
, int32_t length
,
386 Node
*nextNode
) const = 0;
389 virtual int32_t write(int32_t unit
) = 0;
391 virtual int32_t writeElementUnits(int32_t i
, int32_t unitIndex
, int32_t length
) = 0;
393 virtual int32_t writeValueAndFinal(int32_t i
, UBool isFinal
) = 0;
395 virtual int32_t writeValueAndType(UBool hasValue
, int32_t value
, int32_t node
) = 0;
397 virtual int32_t writeDeltaTo(int32_t jumpTarget
) = 0;
402 #endif // __STRINGTRIEBUILDER_H__