2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: stringtriebuilder.cpp
8 * tab size: 8 (not used)
11 * created on: 2010dec24
12 * created by: Markus W. Scherer
15 #include "utypeinfo.h" // for 'typeid' to work
16 #include "unicode/utypes.h"
17 #include "unicode/stringtriebuilder.h"
23 static int32_t U_CALLCONV
24 hashStringTrieNode(const UHashTok key
) {
25 return icu::StringTrieBuilder::hashNode(key
.pointer
);
28 static UBool U_CALLCONV
29 equalStringTrieNodes(const UHashTok key1
, const UHashTok key2
) {
30 return icu::StringTrieBuilder::equalNodes(key1
.pointer
, key2
.pointer
);
37 StringTrieBuilder::StringTrieBuilder() : nodes(NULL
) {}
39 StringTrieBuilder::~StringTrieBuilder() {
40 deleteCompactBuilder();
44 StringTrieBuilder::createCompactBuilder(int32_t sizeGuess
, UErrorCode
&errorCode
) {
45 if(U_FAILURE(errorCode
)) {
48 nodes
=uhash_openSize(hashStringTrieNode
, equalStringTrieNodes
, NULL
,
49 sizeGuess
, &errorCode
);
50 if(U_SUCCESS(errorCode
)) {
52 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
54 uhash_setKeyDeleter(nodes
, uprv_deleteUObject
);
60 StringTrieBuilder::deleteCompactBuilder() {
66 StringTrieBuilder::build(UStringTrieBuildOption buildOption
, int32_t elementsLength
,
67 UErrorCode
&errorCode
) {
68 if(buildOption
==USTRINGTRIE_BUILD_FAST
) {
69 writeNode(0, elementsLength
, 0);
70 } else /* USTRINGTRIE_BUILD_SMALL */ {
71 createCompactBuilder(2*elementsLength
, errorCode
);
72 Node
*root
=makeNode(0, elementsLength
, 0, errorCode
);
73 if(U_SUCCESS(errorCode
)) {
74 root
->markRightEdgesFirst(-1);
77 deleteCompactBuilder();
81 // Requires start<limit,
82 // and all strings of the [start..limit[ elements must be sorted and
83 // have a common prefix of length unitIndex.
85 StringTrieBuilder::writeNode(int32_t start
, int32_t limit
, int32_t unitIndex
) {
89 if(unitIndex
==getElementStringLength(start
)) {
90 // An intermediate or final value.
91 value
=getElementValue(start
++);
93 return writeValueAndFinal(value
, TRUE
); // final-value node
97 // Now all [start..limit[ strings are longer than unitIndex.
98 int32_t minUnit
=getElementUnit(start
, unitIndex
);
99 int32_t maxUnit
=getElementUnit(limit
-1, unitIndex
);
100 if(minUnit
==maxUnit
) {
101 // Linear-match node: All strings have the same character at unitIndex.
102 int32_t lastUnitIndex
=getLimitOfLinearMatch(start
, limit
-1, unitIndex
);
103 writeNode(start
, limit
, lastUnitIndex
);
104 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
105 int32_t length
=lastUnitIndex
-unitIndex
;
106 int32_t maxLinearMatchLength
=getMaxLinearMatchLength();
107 while(length
>maxLinearMatchLength
) {
108 lastUnitIndex
-=maxLinearMatchLength
;
109 length
-=maxLinearMatchLength
;
110 writeElementUnits(start
, lastUnitIndex
, maxLinearMatchLength
);
111 write(getMinLinearMatch()+maxLinearMatchLength
-1);
113 writeElementUnits(start
, unitIndex
, length
);
114 type
=getMinLinearMatch()+length
-1;
117 int32_t length
=countElementUnits(start
, limit
, unitIndex
);
118 // length>=2 because minUnit!=maxUnit.
119 writeBranchSubNode(start
, limit
, unitIndex
, length
);
120 if(--length
<getMinLinearMatch()) {
127 return writeValueAndType(hasValue
, value
, type
);
130 // start<limit && all strings longer than unitIndex &&
131 // length different units at unitIndex
133 StringTrieBuilder::writeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
, int32_t length
) {
134 UChar middleUnits
[kMaxSplitBranchLevels
];
135 int32_t lessThan
[kMaxSplitBranchLevels
];
137 while(length
>getMaxBranchLinearSubNodeLength()) {
138 // Branch on the middle unit.
139 // First, find the middle unit.
140 int32_t i
=skipElementsBySomeUnits(start
, unitIndex
, length
/2);
141 // Encode the less-than branch first.
142 middleUnits
[ltLength
]=getElementUnit(i
, unitIndex
); // middle unit
143 lessThan
[ltLength
]=writeBranchSubNode(start
, i
, unitIndex
, length
/2);
145 // Continue for the greater-or-equal branch.
147 length
=length
-length
/2;
149 // For each unit, find its elements array start and whether it has a final value.
150 int32_t starts
[kMaxBranchLinearSubNodeLength
];
151 UBool isFinal
[kMaxBranchLinearSubNodeLength
-1];
152 int32_t unitNumber
=0;
154 int32_t i
=starts
[unitNumber
]=start
;
155 UChar unit
=getElementUnit(i
++, unitIndex
);
156 i
=indexOfElementWithNextUnit(i
, unitIndex
, unit
);
157 isFinal
[unitNumber
]= start
==i
-1 && unitIndex
+1==getElementStringLength(start
);
159 } while(++unitNumber
<length
-1);
160 // unitNumber==length-1, and the maxUnit elements range is [start..limit[
161 starts
[unitNumber
]=start
;
163 // Write the sub-nodes in reverse order: The jump lengths are deltas from
164 // after their own positions, so if we wrote the minUnit sub-node first,
165 // then its jump delta would be larger.
166 // Instead we write the minUnit sub-node last, for a shorter delta.
167 int32_t jumpTargets
[kMaxBranchLinearSubNodeLength
-1];
170 if(!isFinal
[unitNumber
]) {
171 jumpTargets
[unitNumber
]=writeNode(starts
[unitNumber
], starts
[unitNumber
+1], unitIndex
+1);
173 } while(unitNumber
>0);
174 // The maxUnit sub-node is written as the very last one because we do
175 // not jump for it at all.
177 writeNode(start
, limit
, unitIndex
+1);
178 int32_t offset
=write(getElementUnit(start
, unitIndex
));
179 // Write the rest of this node's unit-value pairs.
180 while(--unitNumber
>=0) {
181 start
=starts
[unitNumber
];
183 if(isFinal
[unitNumber
]) {
184 // Write the final value for the one string ending with this unit.
185 value
=getElementValue(start
);
187 // Write the delta to the start position of the sub-node.
188 value
=offset
-jumpTargets
[unitNumber
];
190 writeValueAndFinal(value
, isFinal
[unitNumber
]);
191 offset
=write(getElementUnit(start
, unitIndex
));
193 // Write the split-branch nodes.
196 writeDeltaTo(lessThan
[ltLength
]);
197 offset
=write(middleUnits
[ltLength
]);
202 // Requires start<limit,
203 // and all strings of the [start..limit[ elements must be sorted and
204 // have a common prefix of length unitIndex.
205 StringTrieBuilder::Node
*
206 StringTrieBuilder::makeNode(int32_t start
, int32_t limit
, int32_t unitIndex
, UErrorCode
&errorCode
) {
207 if(U_FAILURE(errorCode
)) {
210 UBool hasValue
=FALSE
;
212 if(unitIndex
==getElementStringLength(start
)) {
213 // An intermediate or final value.
214 value
=getElementValue(start
++);
216 return registerFinalValue(value
, errorCode
);
221 // Now all [start..limit[ strings are longer than unitIndex.
222 int32_t minUnit
=getElementUnit(start
, unitIndex
);
223 int32_t maxUnit
=getElementUnit(limit
-1, unitIndex
);
224 if(minUnit
==maxUnit
) {
225 // Linear-match node: All strings have the same character at unitIndex.
226 int32_t lastUnitIndex
=getLimitOfLinearMatch(start
, limit
-1, unitIndex
);
227 Node
*nextNode
=makeNode(start
, limit
, lastUnitIndex
, errorCode
);
228 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
229 int32_t length
=lastUnitIndex
-unitIndex
;
230 int32_t maxLinearMatchLength
=getMaxLinearMatchLength();
231 while(length
>maxLinearMatchLength
) {
232 lastUnitIndex
-=maxLinearMatchLength
;
233 length
-=maxLinearMatchLength
;
234 node
=createLinearMatchNode(start
, lastUnitIndex
, maxLinearMatchLength
, nextNode
);
235 nextNode
=registerNode(node
, errorCode
);
237 node
=createLinearMatchNode(start
, unitIndex
, length
, nextNode
);
240 int32_t length
=countElementUnits(start
, limit
, unitIndex
);
241 // length>=2 because minUnit!=maxUnit.
242 Node
*subNode
=makeBranchSubNode(start
, limit
, unitIndex
, length
, errorCode
);
243 node
=new BranchHeadNode(length
, subNode
);
245 if(hasValue
&& node
!=NULL
) {
246 if(matchNodesCanHaveValues()) {
247 ((ValueNode
*)node
)->setValue(value
);
249 node
=new IntermediateValueNode(value
, registerNode(node
, errorCode
));
252 return registerNode(node
, errorCode
);
255 // start<limit && all strings longer than unitIndex &&
256 // length different units at unitIndex
257 StringTrieBuilder::Node
*
258 StringTrieBuilder::makeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
,
259 int32_t length
, UErrorCode
&errorCode
) {
260 if(U_FAILURE(errorCode
)) {
263 UChar middleUnits
[kMaxSplitBranchLevels
];
264 Node
*lessThan
[kMaxSplitBranchLevels
];
266 while(length
>getMaxBranchLinearSubNodeLength()) {
267 // Branch on the middle unit.
268 // First, find the middle unit.
269 int32_t i
=skipElementsBySomeUnits(start
, unitIndex
, length
/2);
270 // Create the less-than branch.
271 middleUnits
[ltLength
]=getElementUnit(i
, unitIndex
); // middle unit
272 lessThan
[ltLength
]=makeBranchSubNode(start
, i
, unitIndex
, length
/2, errorCode
);
274 // Continue for the greater-or-equal branch.
276 length
=length
-length
/2;
278 if(U_FAILURE(errorCode
)) {
281 ListBranchNode
*listNode
=new ListBranchNode();
283 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
286 // For each unit, find its elements array start and whether it has a final value.
287 int32_t unitNumber
=0;
290 UChar unit
=getElementUnit(i
++, unitIndex
);
291 i
=indexOfElementWithNextUnit(i
, unitIndex
, unit
);
292 if(start
==i
-1 && unitIndex
+1==getElementStringLength(start
)) {
293 listNode
->add(unit
, getElementValue(start
));
295 listNode
->add(unit
, makeNode(start
, i
, unitIndex
+1, errorCode
));
298 } while(++unitNumber
<length
-1);
299 // unitNumber==length-1, and the maxUnit elements range is [start..limit[
300 UChar unit
=getElementUnit(start
, unitIndex
);
301 if(start
==limit
-1 && unitIndex
+1==getElementStringLength(start
)) {
302 listNode
->add(unit
, getElementValue(start
));
304 listNode
->add(unit
, makeNode(start
, limit
, unitIndex
+1, errorCode
));
306 Node
*node
=registerNode(listNode
, errorCode
);
307 // Create the split-branch nodes.
311 new SplitBranchNode(middleUnits
[ltLength
], lessThan
[ltLength
], node
), errorCode
);
316 StringTrieBuilder::Node
*
317 StringTrieBuilder::registerNode(Node
*newNode
, UErrorCode
&errorCode
) {
318 if(U_FAILURE(errorCode
)) {
323 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
326 const UHashElement
*old
=uhash_find(nodes
, newNode
);
329 return (Node
*)old
->key
.pointer
;
331 // If uhash_puti() returns a non-zero value from an equivalent, previously
332 // registered node, then uhash_find() failed to find that and we will leak newNode.
334 int32_t oldValue
= // Only in debug mode to avoid a compiler warning about unused oldValue.
336 uhash_puti(nodes
, newNode
, 1, &errorCode
);
337 U_ASSERT(oldValue
==0);
338 if(U_FAILURE(errorCode
)) {
345 StringTrieBuilder::Node
*
346 StringTrieBuilder::registerFinalValue(int32_t value
, UErrorCode
&errorCode
) {
347 if(U_FAILURE(errorCode
)) {
350 FinalValueNode
key(value
);
351 const UHashElement
*old
=uhash_find(nodes
, &key
);
353 return (Node
*)old
->key
.pointer
;
355 Node
*newNode
=new FinalValueNode(value
);
357 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
360 // If uhash_puti() returns a non-zero value from an equivalent, previously
361 // registered node, then uhash_find() failed to find that and we will leak newNode.
363 int32_t oldValue
= // Only in debug mode to avoid a compiler warning about unused oldValue.
365 uhash_puti(nodes
, newNode
, 1, &errorCode
);
366 U_ASSERT(oldValue
==0);
367 if(U_FAILURE(errorCode
)) {
375 StringTrieBuilder::hashNode(const void *node
) {
376 return ((const Node
*)node
)->hashCode();
380 StringTrieBuilder::equalNodes(const void *left
, const void *right
) {
381 return *(const Node
*)left
==*(const Node
*)right
;
385 StringTrieBuilder::Node::operator==(const Node
&other
) const {
386 return this==&other
|| (typeid(*this)==typeid(other
) && hash
==other
.hash
);
390 StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber
) {
398 StringTrieBuilder::FinalValueNode::operator==(const Node
&other
) const {
402 if(!Node::operator==(other
)) {
405 const FinalValueNode
&o
=(const FinalValueNode
&)other
;
406 return value
==o
.value
;
410 StringTrieBuilder::FinalValueNode::write(StringTrieBuilder
&builder
) {
411 offset
=builder
.writeValueAndFinal(value
, TRUE
);
415 StringTrieBuilder::ValueNode::operator==(const Node
&other
) const {
419 if(!Node::operator==(other
)) {
422 const ValueNode
&o
=(const ValueNode
&)other
;
423 return hasValue
==o
.hasValue
&& (!hasValue
|| value
==o
.value
);
427 StringTrieBuilder::IntermediateValueNode::operator==(const Node
&other
) const {
431 if(!ValueNode::operator==(other
)) {
434 const IntermediateValueNode
&o
=(const IntermediateValueNode
&)other
;
439 StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber
) {
441 offset
=edgeNumber
=next
->markRightEdgesFirst(edgeNumber
);
447 StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder
&builder
) {
448 next
->write(builder
);
449 offset
=builder
.writeValueAndFinal(value
, FALSE
);
453 StringTrieBuilder::LinearMatchNode::operator==(const Node
&other
) const {
457 if(!ValueNode::operator==(other
)) {
460 const LinearMatchNode
&o
=(const LinearMatchNode
&)other
;
461 return length
==o
.length
&& next
==o
.next
;
465 StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber
) {
467 offset
=edgeNumber
=next
->markRightEdgesFirst(edgeNumber
);
473 StringTrieBuilder::ListBranchNode::operator==(const Node
&other
) const {
477 if(!Node::operator==(other
)) {
480 const ListBranchNode
&o
=(const ListBranchNode
&)other
;
481 for(int32_t i
=0; i
<length
; ++i
) {
482 if(units
[i
]!=o
.units
[i
] || values
[i
]!=o
.values
[i
] || equal
[i
]!=o
.equal
[i
]) {
490 StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber
) {
492 firstEdgeNumber
=edgeNumber
;
496 Node
*edge
=equal
[--i
];
498 edgeNumber
=edge
->markRightEdgesFirst(edgeNumber
-step
);
500 // For all but the rightmost edge, decrement the edge number.
509 StringTrieBuilder::ListBranchNode::write(StringTrieBuilder
&builder
) {
510 // Write the sub-nodes in reverse order: The jump lengths are deltas from
511 // after their own positions, so if we wrote the minUnit sub-node first,
512 // then its jump delta would be larger.
513 // Instead we write the minUnit sub-node last, for a shorter delta.
514 int32_t unitNumber
=length
-1;
515 Node
*rightEdge
=equal
[unitNumber
];
516 int32_t rightEdgeNumber
= rightEdge
==NULL
? firstEdgeNumber
: rightEdge
->getOffset();
519 if(equal
[unitNumber
]!=NULL
) {
520 equal
[unitNumber
]->writeUnlessInsideRightEdge(firstEdgeNumber
, rightEdgeNumber
, builder
);
522 } while(unitNumber
>0);
523 // The maxUnit sub-node is written as the very last one because we do
524 // not jump for it at all.
526 if(rightEdge
==NULL
) {
527 builder
.writeValueAndFinal(values
[unitNumber
], TRUE
);
529 rightEdge
->write(builder
);
531 offset
=builder
.write(units
[unitNumber
]);
532 // Write the rest of this node's unit-value pairs.
533 while(--unitNumber
>=0) {
536 if(equal
[unitNumber
]==NULL
) {
537 // Write the final value for the one string ending with this unit.
538 value
=values
[unitNumber
];
541 // Write the delta to the start position of the sub-node.
542 U_ASSERT(equal
[unitNumber
]->getOffset()>0);
543 value
=offset
-equal
[unitNumber
]->getOffset();
546 builder
.writeValueAndFinal(value
, isFinal
);
547 offset
=builder
.write(units
[unitNumber
]);
552 StringTrieBuilder::SplitBranchNode::operator==(const Node
&other
) const {
556 if(!Node::operator==(other
)) {
559 const SplitBranchNode
&o
=(const SplitBranchNode
&)other
;
560 return unit
==o
.unit
&& lessThan
==o
.lessThan
&& greaterOrEqual
==o
.greaterOrEqual
;
564 StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber
) {
566 firstEdgeNumber
=edgeNumber
;
567 edgeNumber
=greaterOrEqual
->markRightEdgesFirst(edgeNumber
);
568 offset
=edgeNumber
=lessThan
->markRightEdgesFirst(edgeNumber
-1);
574 StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder
&builder
) {
575 // Encode the less-than branch first.
576 lessThan
->writeUnlessInsideRightEdge(firstEdgeNumber
, greaterOrEqual
->getOffset(), builder
);
577 // Encode the greater-or-equal branch last because we do not jump for it at all.
578 greaterOrEqual
->write(builder
);
580 U_ASSERT(lessThan
->getOffset()>0);
581 builder
.writeDeltaTo(lessThan
->getOffset()); // less-than
582 offset
=builder
.write(unit
);
586 StringTrieBuilder::BranchHeadNode::operator==(const Node
&other
) const {
590 if(!ValueNode::operator==(other
)) {
593 const BranchHeadNode
&o
=(const BranchHeadNode
&)other
;
594 return length
==o
.length
&& next
==o
.next
;
598 StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber
) {
600 offset
=edgeNumber
=next
->markRightEdgesFirst(edgeNumber
);
606 StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder
&builder
) {
607 next
->write(builder
);
608 if(length
<=builder
.getMinLinearMatch()) {
609 offset
=builder
.writeValueAndType(hasValue
, value
, length
-1);
611 builder
.write(length
-1);
612 offset
=builder
.writeValueAndType(hasValue
, value
, 0);