1 // © 2016 and later: Unicode, Inc. and others. 
   2 // License & terms of use: http://www.unicode.org/copyright.html 
   4 ******************************************************************************* 
   5 *   Copyright (C) 2010-2012, International Business Machines 
   6 *   Corporation and others.  All Rights Reserved. 
   7 ******************************************************************************* 
   8 *   file name:  stringtriebuilder.cpp 
  10 *   tab size:   8 (not used) 
  13 *   created on: 2010dec24 
  14 *   created by: Markus W. Scherer 
  17 #include "utypeinfo.h"  // for 'typeid' to work 
  18 #include "unicode/utypes.h" 
  19 #include "unicode/stringtriebuilder.h" 
  25 static int32_t U_CALLCONV
 
  26 hashStringTrieNode(const UHashTok key
) { 
  27     return icu::StringTrieBuilder::hashNode(key
.pointer
); 
  30 static UBool U_CALLCONV
 
  31 equalStringTrieNodes(const UHashTok key1
, const UHashTok key2
) { 
  32     return icu::StringTrieBuilder::equalNodes(key1
.pointer
, key2
.pointer
); 
  39 StringTrieBuilder::StringTrieBuilder() : nodes(NULL
) {} 
  41 StringTrieBuilder::~StringTrieBuilder() { 
  42     deleteCompactBuilder(); 
  46 StringTrieBuilder::createCompactBuilder(int32_t sizeGuess
, UErrorCode 
&errorCode
) { 
  47     if(U_FAILURE(errorCode
)) { 
  50     nodes
=uhash_openSize(hashStringTrieNode
, equalStringTrieNodes
, NULL
, 
  51                          sizeGuess
, &errorCode
); 
  52     if(U_SUCCESS(errorCode
)) { 
  54           errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
  56           uhash_setKeyDeleter(nodes
, uprv_deleteUObject
); 
  62 StringTrieBuilder::deleteCompactBuilder() { 
  68 StringTrieBuilder::build(UStringTrieBuildOption buildOption
, int32_t elementsLength
, 
  69                        UErrorCode 
&errorCode
) { 
  70     if(buildOption
==USTRINGTRIE_BUILD_FAST
) { 
  71         writeNode(0, elementsLength
, 0); 
  72     } else /* USTRINGTRIE_BUILD_SMALL */ { 
  73         createCompactBuilder(2*elementsLength
, errorCode
); 
  74         Node 
*root
=makeNode(0, elementsLength
, 0, errorCode
); 
  75         if(U_SUCCESS(errorCode
)) { 
  76             root
->markRightEdgesFirst(-1); 
  79         deleteCompactBuilder(); 
  83 // Requires start<limit, 
  84 // and all strings of the [start..limit[ elements must be sorted and 
  85 // have a common prefix of length unitIndex. 
  87 StringTrieBuilder::writeNode(int32_t start
, int32_t limit
, int32_t unitIndex
) { 
  91     if(unitIndex
==getElementStringLength(start
)) { 
  92         // An intermediate or final value. 
  93         value
=getElementValue(start
++); 
  95             return writeValueAndFinal(value
, TRUE
);  // final-value node 
  99     // Now all [start..limit[ strings are longer than unitIndex. 
 100     int32_t minUnit
=getElementUnit(start
, unitIndex
); 
 101     int32_t maxUnit
=getElementUnit(limit
-1, unitIndex
); 
 102     if(minUnit
==maxUnit
) { 
 103         // Linear-match node: All strings have the same character at unitIndex. 
 104         int32_t lastUnitIndex
=getLimitOfLinearMatch(start
, limit
-1, unitIndex
); 
 105         writeNode(start
, limit
, lastUnitIndex
); 
 106         // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. 
 107         int32_t length
=lastUnitIndex
-unitIndex
; 
 108         int32_t maxLinearMatchLength
=getMaxLinearMatchLength(); 
 109         while(length
>maxLinearMatchLength
) { 
 110             lastUnitIndex
-=maxLinearMatchLength
; 
 111             length
-=maxLinearMatchLength
; 
 112             writeElementUnits(start
, lastUnitIndex
, maxLinearMatchLength
); 
 113             write(getMinLinearMatch()+maxLinearMatchLength
-1); 
 115         writeElementUnits(start
, unitIndex
, length
); 
 116         type
=getMinLinearMatch()+length
-1; 
 119         int32_t length
=countElementUnits(start
, limit
, unitIndex
); 
 120         // length>=2 because minUnit!=maxUnit. 
 121         writeBranchSubNode(start
, limit
, unitIndex
, length
); 
 122         if(--length
<getMinLinearMatch()) { 
 129     return writeValueAndType(hasValue
, value
, type
); 
 132 // start<limit && all strings longer than unitIndex && 
 133 // length different units at unitIndex 
 135 StringTrieBuilder::writeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
, int32_t length
) { 
 136     UChar middleUnits
[kMaxSplitBranchLevels
]; 
 137     int32_t lessThan
[kMaxSplitBranchLevels
]; 
 139     while(length
>getMaxBranchLinearSubNodeLength()) { 
 140         // Branch on the middle unit. 
 141         // First, find the middle unit. 
 142         int32_t i
=skipElementsBySomeUnits(start
, unitIndex
, length
/2); 
 143         // Encode the less-than branch first. 
 144         middleUnits
[ltLength
]=getElementUnit(i
, unitIndex
);  // middle unit 
 145         lessThan
[ltLength
]=writeBranchSubNode(start
, i
, unitIndex
, length
/2); 
 147         // Continue for the greater-or-equal branch. 
 149         length
=length
-length
/2; 
 151     // For each unit, find its elements array start and whether it has a final value. 
 152     int32_t starts
[kMaxBranchLinearSubNodeLength
]; 
 153     UBool isFinal
[kMaxBranchLinearSubNodeLength
-1]; 
 154     int32_t unitNumber
=0; 
 156         int32_t i
=starts
[unitNumber
]=start
; 
 157         UChar unit
=getElementUnit(i
++, unitIndex
); 
 158         i
=indexOfElementWithNextUnit(i
, unitIndex
, unit
); 
 159         isFinal
[unitNumber
]= start
==i
-1 && unitIndex
+1==getElementStringLength(start
); 
 161     } while(++unitNumber
<length
-1); 
 162     // unitNumber==length-1, and the maxUnit elements range is [start..limit[ 
 163     starts
[unitNumber
]=start
; 
 165     // Write the sub-nodes in reverse order: The jump lengths are deltas from 
 166     // after their own positions, so if we wrote the minUnit sub-node first, 
 167     // then its jump delta would be larger. 
 168     // Instead we write the minUnit sub-node last, for a shorter delta. 
 169     int32_t jumpTargets
[kMaxBranchLinearSubNodeLength
-1]; 
 172         if(!isFinal
[unitNumber
]) { 
 173             jumpTargets
[unitNumber
]=writeNode(starts
[unitNumber
], starts
[unitNumber
+1], unitIndex
+1); 
 175     } while(unitNumber
>0); 
 176     // The maxUnit sub-node is written as the very last one because we do 
 177     // not jump for it at all. 
 179     writeNode(start
, limit
, unitIndex
+1); 
 180     int32_t offset
=write(getElementUnit(start
, unitIndex
)); 
 181     // Write the rest of this node's unit-value pairs. 
 182     while(--unitNumber
>=0) { 
 183         start
=starts
[unitNumber
]; 
 185         if(isFinal
[unitNumber
]) { 
 186             // Write the final value for the one string ending with this unit. 
 187             value
=getElementValue(start
); 
 189             // Write the delta to the start position of the sub-node. 
 190             value
=offset
-jumpTargets
[unitNumber
]; 
 192         writeValueAndFinal(value
, isFinal
[unitNumber
]); 
 193         offset
=write(getElementUnit(start
, unitIndex
)); 
 195     // Write the split-branch nodes. 
 198         writeDeltaTo(lessThan
[ltLength
]); 
 199         offset
=write(middleUnits
[ltLength
]); 
 204 // Requires start<limit, 
 205 // and all strings of the [start..limit[ elements must be sorted and 
 206 // have a common prefix of length unitIndex. 
 207 StringTrieBuilder::Node 
* 
 208 StringTrieBuilder::makeNode(int32_t start
, int32_t limit
, int32_t unitIndex
, UErrorCode 
&errorCode
) { 
 209     if(U_FAILURE(errorCode
)) { 
 212     UBool hasValue
=FALSE
; 
 214     if(unitIndex
==getElementStringLength(start
)) { 
 215         // An intermediate or final value. 
 216         value
=getElementValue(start
++); 
 218             return registerFinalValue(value
, errorCode
); 
 223     // Now all [start..limit[ strings are longer than unitIndex. 
 224     int32_t minUnit
=getElementUnit(start
, unitIndex
); 
 225     int32_t maxUnit
=getElementUnit(limit
-1, unitIndex
); 
 226     if(minUnit
==maxUnit
) { 
 227         // Linear-match node: All strings have the same character at unitIndex. 
 228         int32_t lastUnitIndex
=getLimitOfLinearMatch(start
, limit
-1, unitIndex
); 
 229         Node 
*nextNode
=makeNode(start
, limit
, lastUnitIndex
, errorCode
); 
 230         // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength. 
 231         int32_t length
=lastUnitIndex
-unitIndex
; 
 232         int32_t maxLinearMatchLength
=getMaxLinearMatchLength(); 
 233         while(length
>maxLinearMatchLength
) { 
 234             lastUnitIndex
-=maxLinearMatchLength
; 
 235             length
-=maxLinearMatchLength
; 
 236             node
=createLinearMatchNode(start
, lastUnitIndex
, maxLinearMatchLength
, nextNode
); 
 237             nextNode
=registerNode(node
, errorCode
); 
 239         node
=createLinearMatchNode(start
, unitIndex
, length
, nextNode
); 
 242         int32_t length
=countElementUnits(start
, limit
, unitIndex
); 
 243         // length>=2 because minUnit!=maxUnit. 
 244         Node 
*subNode
=makeBranchSubNode(start
, limit
, unitIndex
, length
, errorCode
); 
 245         node
=new BranchHeadNode(length
, subNode
); 
 247     if(hasValue 
&& node
!=NULL
) { 
 248         if(matchNodesCanHaveValues()) { 
 249             ((ValueNode 
*)node
)->setValue(value
); 
 251             node
=new IntermediateValueNode(value
, registerNode(node
, errorCode
)); 
 254     return registerNode(node
, errorCode
); 
 257 // start<limit && all strings longer than unitIndex && 
 258 // length different units at unitIndex 
 259 StringTrieBuilder::Node 
* 
 260 StringTrieBuilder::makeBranchSubNode(int32_t start
, int32_t limit
, int32_t unitIndex
, 
 261                                    int32_t length
, UErrorCode 
&errorCode
) { 
 262     if(U_FAILURE(errorCode
)) { 
 265     UChar middleUnits
[kMaxSplitBranchLevels
]; 
 266     Node 
*lessThan
[kMaxSplitBranchLevels
]; 
 268     while(length
>getMaxBranchLinearSubNodeLength()) { 
 269         // Branch on the middle unit. 
 270         // First, find the middle unit. 
 271         int32_t i
=skipElementsBySomeUnits(start
, unitIndex
, length
/2); 
 272         // Create the less-than branch. 
 273         middleUnits
[ltLength
]=getElementUnit(i
, unitIndex
);  // middle unit 
 274         lessThan
[ltLength
]=makeBranchSubNode(start
, i
, unitIndex
, length
/2, errorCode
); 
 276         // Continue for the greater-or-equal branch. 
 278         length
=length
-length
/2; 
 280     if(U_FAILURE(errorCode
)) { 
 283     ListBranchNode 
*listNode
=new ListBranchNode(); 
 285         errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 288     // For each unit, find its elements array start and whether it has a final value. 
 289     int32_t unitNumber
=0; 
 292         UChar unit
=getElementUnit(i
++, unitIndex
); 
 293         i
=indexOfElementWithNextUnit(i
, unitIndex
, unit
); 
 294         if(start
==i
-1 && unitIndex
+1==getElementStringLength(start
)) { 
 295             listNode
->add(unit
, getElementValue(start
)); 
 297             listNode
->add(unit
, makeNode(start
, i
, unitIndex
+1, errorCode
)); 
 300     } while(++unitNumber
<length
-1); 
 301     // unitNumber==length-1, and the maxUnit elements range is [start..limit[ 
 302     UChar unit
=getElementUnit(start
, unitIndex
); 
 303     if(start
==limit
-1 && unitIndex
+1==getElementStringLength(start
)) { 
 304         listNode
->add(unit
, getElementValue(start
)); 
 306         listNode
->add(unit
, makeNode(start
, limit
, unitIndex
+1, errorCode
)); 
 308     Node 
*node
=registerNode(listNode
, errorCode
); 
 309     // Create the split-branch nodes. 
 313             new SplitBranchNode(middleUnits
[ltLength
], lessThan
[ltLength
], node
), errorCode
); 
 318 StringTrieBuilder::Node 
* 
 319 StringTrieBuilder::registerNode(Node 
*newNode
, UErrorCode 
&errorCode
) { 
 320     if(U_FAILURE(errorCode
)) { 
 325         errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 328     const UHashElement 
*old
=uhash_find(nodes
, newNode
); 
 331         return (Node 
*)old
->key
.pointer
; 
 333     // If uhash_puti() returns a non-zero value from an equivalent, previously 
 334     // registered node, then uhash_find() failed to find that and we will leak newNode. 
 336     int32_t oldValue
=  // Only in debug mode to avoid a compiler warning about unused oldValue. 
 338     uhash_puti(nodes
, newNode
, 1, &errorCode
); 
 339     U_ASSERT(oldValue
==0); 
 340     if(U_FAILURE(errorCode
)) { 
 347 StringTrieBuilder::Node 
* 
 348 StringTrieBuilder::registerFinalValue(int32_t value
, UErrorCode 
&errorCode
) { 
 349     if(U_FAILURE(errorCode
)) { 
 352     FinalValueNode 
key(value
); 
 353     const UHashElement 
*old
=uhash_find(nodes
, &key
); 
 355         return (Node 
*)old
->key
.pointer
; 
 357     Node 
*newNode
=new FinalValueNode(value
); 
 359         errorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 362     // If uhash_puti() returns a non-zero value from an equivalent, previously 
 363     // registered node, then uhash_find() failed to find that and we will leak newNode. 
 365     int32_t oldValue
=  // Only in debug mode to avoid a compiler warning about unused oldValue. 
 367     uhash_puti(nodes
, newNode
, 1, &errorCode
); 
 368     U_ASSERT(oldValue
==0); 
 369     if(U_FAILURE(errorCode
)) { 
 377 StringTrieBuilder::hashNode(const void *node
) { 
 378     return ((const Node 
*)node
)->hashCode(); 
 382 StringTrieBuilder::equalNodes(const void *left
, const void *right
) { 
 383     return *(const Node 
*)left
==*(const Node 
*)right
; 
 387 StringTrieBuilder::Node::operator==(const Node 
&other
) const { 
 388     return this==&other 
|| (typeid(*this)==typeid(other
) && hash
==other
.hash
); 
 392 StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber
) { 
 400 StringTrieBuilder::FinalValueNode::operator==(const Node 
&other
) const { 
 404     if(!Node::operator==(other
)) { 
 407     const FinalValueNode 
&o
=(const FinalValueNode 
&)other
; 
 408     return value
==o
.value
; 
 412 StringTrieBuilder::FinalValueNode::write(StringTrieBuilder 
&builder
) { 
 413     offset
=builder
.writeValueAndFinal(value
, TRUE
); 
 417 StringTrieBuilder::ValueNode::operator==(const Node 
&other
) const { 
 421     if(!Node::operator==(other
)) { 
 424     const ValueNode 
&o
=(const ValueNode 
&)other
; 
 425     return hasValue
==o
.hasValue 
&& (!hasValue 
|| value
==o
.value
); 
 429 StringTrieBuilder::IntermediateValueNode::operator==(const Node 
&other
) const { 
 433     if(!ValueNode::operator==(other
)) { 
 436     const IntermediateValueNode 
&o
=(const IntermediateValueNode 
&)other
; 
 441 StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber
) { 
 443         offset
=edgeNumber
=next
->markRightEdgesFirst(edgeNumber
); 
 449 StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder 
&builder
) { 
 450     next
->write(builder
); 
 451     offset
=builder
.writeValueAndFinal(value
, FALSE
); 
 455 StringTrieBuilder::LinearMatchNode::operator==(const Node 
&other
) const { 
 459     if(!ValueNode::operator==(other
)) { 
 462     const LinearMatchNode 
&o
=(const LinearMatchNode 
&)other
; 
 463     return length
==o
.length 
&& next
==o
.next
; 
 467 StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber
) { 
 469         offset
=edgeNumber
=next
->markRightEdgesFirst(edgeNumber
); 
 475 StringTrieBuilder::ListBranchNode::operator==(const Node 
&other
) const { 
 479     if(!Node::operator==(other
)) { 
 482     const ListBranchNode 
&o
=(const ListBranchNode 
&)other
; 
 483     for(int32_t i
=0; i
<length
; ++i
) { 
 484         if(units
[i
]!=o
.units
[i
] || values
[i
]!=o
.values
[i
] || equal
[i
]!=o
.equal
[i
]) { 
 492 StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber
) { 
 494         firstEdgeNumber
=edgeNumber
; 
 498             Node 
*edge
=equal
[--i
]; 
 500                 edgeNumber
=edge
->markRightEdgesFirst(edgeNumber
-step
); 
 502             // For all but the rightmost edge, decrement the edge number. 
 511 StringTrieBuilder::ListBranchNode::write(StringTrieBuilder 
&builder
) { 
 512     // Write the sub-nodes in reverse order: The jump lengths are deltas from 
 513     // after their own positions, so if we wrote the minUnit sub-node first, 
 514     // then its jump delta would be larger. 
 515     // Instead we write the minUnit sub-node last, for a shorter delta. 
 516     int32_t unitNumber
=length
-1; 
 517     Node 
*rightEdge
=equal
[unitNumber
]; 
 518     int32_t rightEdgeNumber
= rightEdge
==NULL 
? firstEdgeNumber 
: rightEdge
->getOffset(); 
 521         if(equal
[unitNumber
]!=NULL
) { 
 522             equal
[unitNumber
]->writeUnlessInsideRightEdge(firstEdgeNumber
, rightEdgeNumber
, builder
); 
 524     } while(unitNumber
>0); 
 525     // The maxUnit sub-node is written as the very last one because we do 
 526     // not jump for it at all. 
 528     if(rightEdge
==NULL
) { 
 529         builder
.writeValueAndFinal(values
[unitNumber
], TRUE
); 
 531         rightEdge
->write(builder
); 
 533     offset
=builder
.write(units
[unitNumber
]); 
 534     // Write the rest of this node's unit-value pairs. 
 535     while(--unitNumber
>=0) { 
 538         if(equal
[unitNumber
]==NULL
) { 
 539             // Write the final value for the one string ending with this unit. 
 540             value
=values
[unitNumber
]; 
 543             // Write the delta to the start position of the sub-node. 
 544             U_ASSERT(equal
[unitNumber
]->getOffset()>0); 
 545             value
=offset
-equal
[unitNumber
]->getOffset(); 
 548         builder
.writeValueAndFinal(value
, isFinal
); 
 549         offset
=builder
.write(units
[unitNumber
]); 
 554 StringTrieBuilder::SplitBranchNode::operator==(const Node 
&other
) const { 
 558     if(!Node::operator==(other
)) { 
 561     const SplitBranchNode 
&o
=(const SplitBranchNode 
&)other
; 
 562     return unit
==o
.unit 
&& lessThan
==o
.lessThan 
&& greaterOrEqual
==o
.greaterOrEqual
; 
 566 StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber
) { 
 568         firstEdgeNumber
=edgeNumber
; 
 569         edgeNumber
=greaterOrEqual
->markRightEdgesFirst(edgeNumber
); 
 570         offset
=edgeNumber
=lessThan
->markRightEdgesFirst(edgeNumber
-1); 
 576 StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder 
&builder
) { 
 577     // Encode the less-than branch first. 
 578     lessThan
->writeUnlessInsideRightEdge(firstEdgeNumber
, greaterOrEqual
->getOffset(), builder
); 
 579     // Encode the greater-or-equal branch last because we do not jump for it at all. 
 580     greaterOrEqual
->write(builder
); 
 582     U_ASSERT(lessThan
->getOffset()>0); 
 583     builder
.writeDeltaTo(lessThan
->getOffset());  // less-than 
 584     offset
=builder
.write(unit
); 
 588 StringTrieBuilder::BranchHeadNode::operator==(const Node 
&other
) const { 
 592     if(!ValueNode::operator==(other
)) { 
 595     const BranchHeadNode 
&o
=(const BranchHeadNode 
&)other
; 
 596     return length
==o
.length 
&& next
==o
.next
; 
 600 StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber
) { 
 602         offset
=edgeNumber
=next
->markRightEdgesFirst(edgeNumber
); 
 608 StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder 
&builder
) { 
 609     next
->write(builder
); 
 610     if(length
<=builder
.getMinLinearMatch()) { 
 611         offset
=builder
.writeValueAndType(hasValue
, value
, length
-1); 
 613         builder
.write(length
-1); 
 614         offset
=builder
.writeValueAndType(hasValue
, value
, 0);