1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: stringtriebuilder.cpp
10 * tab size: 8 (not used)
13 * created on: 2010dec24
14 * created by: Markus W. Scherer
17 #include "utypeinfo.h" // for 'typeid' to work
18 #include "unicode/utypes.h"
19 #include "unicode/stringtriebuilder.h"
25 static int32_t U_CALLCONV
26 hashStringTrieNode(const UHashTok key
) {
27 return icu::StringTrieBuilder::hashNode(key
.pointer
);
30 static UBool U_CALLCONV
31 equalStringTrieNodes(const UHashTok key1
, const UHashTok key2
) {
32 return icu::StringTrieBuilder::equalNodes(key1
.pointer
, key2
.pointer
);
39 StringTrieBuilder::StringTrieBuilder() : nodes(NULL
) {}
41 StringTrieBuilder::~StringTrieBuilder() {
42 deleteCompactBuilder();
46 StringTrieBuilder::createCompactBuilder(int32_t sizeGuess
, UErrorCode
&errorCode
) {
47 if(U_FAILURE(errorCode
)) {
50 nodes
=uhash_openSize(hashStringTrieNode
, equalStringTrieNodes
, NULL
,
51 sizeGuess
, &errorCode
);
52 if(U_SUCCESS(errorCode
)) {
54 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
56 uhash_setKeyDeleter(nodes
, uprv_deleteUObject
);
62 StringTrieBuilder::deleteCompactBuilder() {
68 StringTrieBuilder::build(UStringTrieBuildOption buildOption
, int32_t elementsLength
,
69 UErrorCode
&errorCode
) {
70 if(buildOption
==USTRINGTRIE_BUILD_FAST
) {
71 writeNode(0, elementsLength
, 0);
72 } else /* USTRINGTRIE_BUILD_SMALL */ {
73 createCompactBuilder(2*elementsLength
, errorCode
);
74 Node
*root
=makeNode(0, elementsLength
, 0, errorCode
);
75 if(U_SUCCESS(errorCode
)) {
76 root
->markRightEdgesFirst(-1);
79 deleteCompactBuilder();
83 // Requires start<limit,
84 // and all strings of the [start..limit[ elements must be sorted and
85 // have a common prefix of length unitIndex.
87 StringTrieBuilder::writeNode(int32_t start
, int32_t limit
, int32_t unitIndex
) {
91 if(unitIndex
==getElementStringLength(start
)) {
92 // An intermediate or final value.
93 value
=getElementValue(start
++);
95 return writeValueAndFinal(value
, TRUE
); // final-value node
99 // Now all [start..limit[ strings are longer than unitIndex.
100 int32_t minUnit
=getElementUnit(start
, unitIndex
);
101 int32_t maxUnit
=getElementUnit(limit
-1, unitIndex
);
102 if(minUnit
==maxUnit
) {
103 // Linear-match node: All strings have the same character at unitIndex.
104 int32_t lastUnitIndex
=getLimitOfLinearMatch(start
, limit
-1, unitIndex
);
105 writeNode(start
, limit
, lastUnitIndex
);
106 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
107 int32_t length
=lastUnitIndex
-unitIndex
;
108 int32_t maxLinearMatchLength
=getMaxLinearMatchLength();
109 while(length
>maxLinearMatchLength
) {
110 lastUnitIndex
-=maxLinearMatchLength
;
111 length
-=maxLinearMatchLength
;
112 writeElementUnits(start
, lastUnitIndex
, maxLinearMatchLength
);
113 write(getMinLinearMatch()+maxLinearMatchLength
-1);
115 writeElementUnits(start
, unitIndex
, length
);
116 type
=getMinLinearMatch()+length
-1;
119 int32_t length
=countElementUnits(start
, limit
, unitIndex
);
120 // length>=2 because minUnit!=maxUnit.
121 writeBranchSubNode(start
, limit
, unitIndex
, length
);
122 if(--length
<getMinLinearMatch()) {
129 return writeValueAndType(hasValue
, value
, type
);
132 // start<limit && all strings longer than unitIndex &&
133 // length different units at unitIndex
135 StringTrieBuilder::writeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
, int32_t length
) {
136 UChar middleUnits
[kMaxSplitBranchLevels
];
137 int32_t lessThan
[kMaxSplitBranchLevels
];
139 while(length
>getMaxBranchLinearSubNodeLength()) {
140 // Branch on the middle unit.
141 // First, find the middle unit.
142 int32_t i
=skipElementsBySomeUnits(start
, unitIndex
, length
/2);
143 // Encode the less-than branch first.
144 middleUnits
[ltLength
]=getElementUnit(i
, unitIndex
); // middle unit
145 lessThan
[ltLength
]=writeBranchSubNode(start
, i
, unitIndex
, length
/2);
147 // Continue for the greater-or-equal branch.
149 length
=length
-length
/2;
151 // For each unit, find its elements array start and whether it has a final value.
152 int32_t starts
[kMaxBranchLinearSubNodeLength
];
153 UBool isFinal
[kMaxBranchLinearSubNodeLength
-1];
154 int32_t unitNumber
=0;
156 int32_t i
=starts
[unitNumber
]=start
;
157 UChar unit
=getElementUnit(i
++, unitIndex
);
158 i
=indexOfElementWithNextUnit(i
, unitIndex
, unit
);
159 isFinal
[unitNumber
]= start
==i
-1 && unitIndex
+1==getElementStringLength(start
);
161 } while(++unitNumber
<length
-1);
162 // unitNumber==length-1, and the maxUnit elements range is [start..limit[
163 starts
[unitNumber
]=start
;
165 // Write the sub-nodes in reverse order: The jump lengths are deltas from
166 // after their own positions, so if we wrote the minUnit sub-node first,
167 // then its jump delta would be larger.
168 // Instead we write the minUnit sub-node last, for a shorter delta.
169 int32_t jumpTargets
[kMaxBranchLinearSubNodeLength
-1];
172 if(!isFinal
[unitNumber
]) {
173 jumpTargets
[unitNumber
]=writeNode(starts
[unitNumber
], starts
[unitNumber
+1], unitIndex
+1);
175 } while(unitNumber
>0);
176 // The maxUnit sub-node is written as the very last one because we do
177 // not jump for it at all.
179 writeNode(start
, limit
, unitIndex
+1);
180 int32_t offset
=write(getElementUnit(start
, unitIndex
));
181 // Write the rest of this node's unit-value pairs.
182 while(--unitNumber
>=0) {
183 start
=starts
[unitNumber
];
185 if(isFinal
[unitNumber
]) {
186 // Write the final value for the one string ending with this unit.
187 value
=getElementValue(start
);
189 // Write the delta to the start position of the sub-node.
190 value
=offset
-jumpTargets
[unitNumber
];
192 writeValueAndFinal(value
, isFinal
[unitNumber
]);
193 offset
=write(getElementUnit(start
, unitIndex
));
195 // Write the split-branch nodes.
198 writeDeltaTo(lessThan
[ltLength
]);
199 offset
=write(middleUnits
[ltLength
]);
204 // Requires start<limit,
205 // and all strings of the [start..limit[ elements must be sorted and
206 // have a common prefix of length unitIndex.
207 StringTrieBuilder::Node
*
208 StringTrieBuilder::makeNode(int32_t start
, int32_t limit
, int32_t unitIndex
, UErrorCode
&errorCode
) {
209 if(U_FAILURE(errorCode
)) {
212 UBool hasValue
=FALSE
;
214 if(unitIndex
==getElementStringLength(start
)) {
215 // An intermediate or final value.
216 value
=getElementValue(start
++);
218 return registerFinalValue(value
, errorCode
);
223 // Now all [start..limit[ strings are longer than unitIndex.
224 int32_t minUnit
=getElementUnit(start
, unitIndex
);
225 int32_t maxUnit
=getElementUnit(limit
-1, unitIndex
);
226 if(minUnit
==maxUnit
) {
227 // Linear-match node: All strings have the same character at unitIndex.
228 int32_t lastUnitIndex
=getLimitOfLinearMatch(start
, limit
-1, unitIndex
);
229 Node
*nextNode
=makeNode(start
, limit
, lastUnitIndex
, errorCode
);
230 // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
231 int32_t length
=lastUnitIndex
-unitIndex
;
232 int32_t maxLinearMatchLength
=getMaxLinearMatchLength();
233 while(length
>maxLinearMatchLength
) {
234 lastUnitIndex
-=maxLinearMatchLength
;
235 length
-=maxLinearMatchLength
;
236 node
=createLinearMatchNode(start
, lastUnitIndex
, maxLinearMatchLength
, nextNode
);
237 nextNode
=registerNode(node
, errorCode
);
239 node
=createLinearMatchNode(start
, unitIndex
, length
, nextNode
);
242 int32_t length
=countElementUnits(start
, limit
, unitIndex
);
243 // length>=2 because minUnit!=maxUnit.
244 Node
*subNode
=makeBranchSubNode(start
, limit
, unitIndex
, length
, errorCode
);
245 node
=new BranchHeadNode(length
, subNode
);
247 if(hasValue
&& node
!=NULL
) {
248 if(matchNodesCanHaveValues()) {
249 ((ValueNode
*)node
)->setValue(value
);
251 node
=new IntermediateValueNode(value
, registerNode(node
, errorCode
));
254 return registerNode(node
, errorCode
);
257 // start<limit && all strings longer than unitIndex &&
258 // length different units at unitIndex
259 StringTrieBuilder::Node
*
260 StringTrieBuilder::makeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
,
261 int32_t length
, UErrorCode
&errorCode
) {
262 if(U_FAILURE(errorCode
)) {
265 UChar middleUnits
[kMaxSplitBranchLevels
];
266 Node
*lessThan
[kMaxSplitBranchLevels
];
268 while(length
>getMaxBranchLinearSubNodeLength()) {
269 // Branch on the middle unit.
270 // First, find the middle unit.
271 int32_t i
=skipElementsBySomeUnits(start
, unitIndex
, length
/2);
272 // Create the less-than branch.
273 middleUnits
[ltLength
]=getElementUnit(i
, unitIndex
); // middle unit
274 lessThan
[ltLength
]=makeBranchSubNode(start
, i
, unitIndex
, length
/2, errorCode
);
276 // Continue for the greater-or-equal branch.
278 length
=length
-length
/2;
280 if(U_FAILURE(errorCode
)) {
283 ListBranchNode
*listNode
=new ListBranchNode();
285 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
288 // For each unit, find its elements array start and whether it has a final value.
289 int32_t unitNumber
=0;
292 UChar unit
=getElementUnit(i
++, unitIndex
);
293 i
=indexOfElementWithNextUnit(i
, unitIndex
, unit
);
294 if(start
==i
-1 && unitIndex
+1==getElementStringLength(start
)) {
295 listNode
->add(unit
, getElementValue(start
));
297 listNode
->add(unit
, makeNode(start
, i
, unitIndex
+1, errorCode
));
300 } while(++unitNumber
<length
-1);
301 // unitNumber==length-1, and the maxUnit elements range is [start..limit[
302 UChar unit
=getElementUnit(start
, unitIndex
);
303 if(start
==limit
-1 && unitIndex
+1==getElementStringLength(start
)) {
304 listNode
->add(unit
, getElementValue(start
));
306 listNode
->add(unit
, makeNode(start
, limit
, unitIndex
+1, errorCode
));
308 Node
*node
=registerNode(listNode
, errorCode
);
309 // Create the split-branch nodes.
313 new SplitBranchNode(middleUnits
[ltLength
], lessThan
[ltLength
], node
), errorCode
);
318 StringTrieBuilder::Node
*
319 StringTrieBuilder::registerNode(Node
*newNode
, UErrorCode
&errorCode
) {
320 if(U_FAILURE(errorCode
)) {
325 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
328 const UHashElement
*old
=uhash_find(nodes
, newNode
);
331 return (Node
*)old
->key
.pointer
;
333 // If uhash_puti() returns a non-zero value from an equivalent, previously
334 // registered node, then uhash_find() failed to find that and we will leak newNode.
336 int32_t oldValue
= // Only in debug mode to avoid a compiler warning about unused oldValue.
338 uhash_puti(nodes
, newNode
, 1, &errorCode
);
339 U_ASSERT(oldValue
==0);
340 if(U_FAILURE(errorCode
)) {
347 StringTrieBuilder::Node
*
348 StringTrieBuilder::registerFinalValue(int32_t value
, UErrorCode
&errorCode
) {
349 if(U_FAILURE(errorCode
)) {
352 FinalValueNode
key(value
);
353 const UHashElement
*old
=uhash_find(nodes
, &key
);
355 return (Node
*)old
->key
.pointer
;
357 Node
*newNode
=new FinalValueNode(value
);
359 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
362 // If uhash_puti() returns a non-zero value from an equivalent, previously
363 // registered node, then uhash_find() failed to find that and we will leak newNode.
365 int32_t oldValue
= // Only in debug mode to avoid a compiler warning about unused oldValue.
367 uhash_puti(nodes
, newNode
, 1, &errorCode
);
368 U_ASSERT(oldValue
==0);
369 if(U_FAILURE(errorCode
)) {
377 StringTrieBuilder::hashNode(const void *node
) {
378 return ((const Node
*)node
)->hashCode();
382 StringTrieBuilder::equalNodes(const void *left
, const void *right
) {
383 return *(const Node
*)left
==*(const Node
*)right
;
387 StringTrieBuilder::Node::operator==(const Node
&other
) const {
388 return this==&other
|| (typeid(*this)==typeid(other
) && hash
==other
.hash
);
392 StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber
) {
400 StringTrieBuilder::FinalValueNode::operator==(const Node
&other
) const {
404 if(!Node::operator==(other
)) {
407 const FinalValueNode
&o
=(const FinalValueNode
&)other
;
408 return value
==o
.value
;
412 StringTrieBuilder::FinalValueNode::write(StringTrieBuilder
&builder
) {
413 offset
=builder
.writeValueAndFinal(value
, TRUE
);
417 StringTrieBuilder::ValueNode::operator==(const Node
&other
) const {
421 if(!Node::operator==(other
)) {
424 const ValueNode
&o
=(const ValueNode
&)other
;
425 return hasValue
==o
.hasValue
&& (!hasValue
|| value
==o
.value
);
429 StringTrieBuilder::IntermediateValueNode::operator==(const Node
&other
) const {
433 if(!ValueNode::operator==(other
)) {
436 const IntermediateValueNode
&o
=(const IntermediateValueNode
&)other
;
441 StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber
) {
443 offset
=edgeNumber
=next
->markRightEdgesFirst(edgeNumber
);
449 StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder
&builder
) {
450 next
->write(builder
);
451 offset
=builder
.writeValueAndFinal(value
, FALSE
);
455 StringTrieBuilder::LinearMatchNode::operator==(const Node
&other
) const {
459 if(!ValueNode::operator==(other
)) {
462 const LinearMatchNode
&o
=(const LinearMatchNode
&)other
;
463 return length
==o
.length
&& next
==o
.next
;
467 StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber
) {
469 offset
=edgeNumber
=next
->markRightEdgesFirst(edgeNumber
);
475 StringTrieBuilder::ListBranchNode::operator==(const Node
&other
) const {
479 if(!Node::operator==(other
)) {
482 const ListBranchNode
&o
=(const ListBranchNode
&)other
;
483 for(int32_t i
=0; i
<length
; ++i
) {
484 if(units
[i
]!=o
.units
[i
] || values
[i
]!=o
.values
[i
] || equal
[i
]!=o
.equal
[i
]) {
492 StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber
) {
494 firstEdgeNumber
=edgeNumber
;
498 Node
*edge
=equal
[--i
];
500 edgeNumber
=edge
->markRightEdgesFirst(edgeNumber
-step
);
502 // For all but the rightmost edge, decrement the edge number.
511 StringTrieBuilder::ListBranchNode::write(StringTrieBuilder
&builder
) {
512 // Write the sub-nodes in reverse order: The jump lengths are deltas from
513 // after their own positions, so if we wrote the minUnit sub-node first,
514 // then its jump delta would be larger.
515 // Instead we write the minUnit sub-node last, for a shorter delta.
516 int32_t unitNumber
=length
-1;
517 Node
*rightEdge
=equal
[unitNumber
];
518 int32_t rightEdgeNumber
= rightEdge
==NULL
? firstEdgeNumber
: rightEdge
->getOffset();
521 if(equal
[unitNumber
]!=NULL
) {
522 equal
[unitNumber
]->writeUnlessInsideRightEdge(firstEdgeNumber
, rightEdgeNumber
, builder
);
524 } while(unitNumber
>0);
525 // The maxUnit sub-node is written as the very last one because we do
526 // not jump for it at all.
528 if(rightEdge
==NULL
) {
529 builder
.writeValueAndFinal(values
[unitNumber
], TRUE
);
531 rightEdge
->write(builder
);
533 offset
=builder
.write(units
[unitNumber
]);
534 // Write the rest of this node's unit-value pairs.
535 while(--unitNumber
>=0) {
538 if(equal
[unitNumber
]==NULL
) {
539 // Write the final value for the one string ending with this unit.
540 value
=values
[unitNumber
];
543 // Write the delta to the start position of the sub-node.
544 U_ASSERT(equal
[unitNumber
]->getOffset()>0);
545 value
=offset
-equal
[unitNumber
]->getOffset();
548 builder
.writeValueAndFinal(value
, isFinal
);
549 offset
=builder
.write(units
[unitNumber
]);
554 StringTrieBuilder::SplitBranchNode::operator==(const Node
&other
) const {
558 if(!Node::operator==(other
)) {
561 const SplitBranchNode
&o
=(const SplitBranchNode
&)other
;
562 return unit
==o
.unit
&& lessThan
==o
.lessThan
&& greaterOrEqual
==o
.greaterOrEqual
;
566 StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber
) {
568 firstEdgeNumber
=edgeNumber
;
569 edgeNumber
=greaterOrEqual
->markRightEdgesFirst(edgeNumber
);
570 offset
=edgeNumber
=lessThan
->markRightEdgesFirst(edgeNumber
-1);
576 StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder
&builder
) {
577 // Encode the less-than branch first.
578 lessThan
->writeUnlessInsideRightEdge(firstEdgeNumber
, greaterOrEqual
->getOffset(), builder
);
579 // Encode the greater-or-equal branch last because we do not jump for it at all.
580 greaterOrEqual
->write(builder
);
582 U_ASSERT(lessThan
->getOffset()>0);
583 builder
.writeDeltaTo(lessThan
->getOffset()); // less-than
584 offset
=builder
.write(unit
);
588 StringTrieBuilder::BranchHeadNode::operator==(const Node
&other
) const {
592 if(!ValueNode::operator==(other
)) {
595 const BranchHeadNode
&o
=(const BranchHeadNode
&)other
;
596 return length
==o
.length
&& next
==o
.next
;
600 StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber
) {
602 offset
=edgeNumber
=next
->markRightEdgesFirst(edgeNumber
);
608 StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder
&builder
) {
609 next
->write(builder
);
610 if(length
<=builder
.getMinLinearMatch()) {
611 offset
=builder
.writeValueAndType(hasValue
, value
, length
-1);
613 builder
.write(length
-1);
614 offset
=builder
.writeValueAndType(hasValue
, value
, 0);