2 *******************************************************************************
3 * Copyright (C) 2006-2011, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_BREAK_ITERATION
13 #include "unicode/chariter.h"
14 #include "unicode/uchriter.h"
15 #include "unicode/strenum.h"
16 #include "unicode/uenum.h"
17 #include "unicode/udata.h"
24 //#define DEBUG_TRIE_DICT 1
26 #ifdef DEBUG_TRIE_DICT
27 #include <sys/times.h>
34 /*******************************************************************
38 TrieWordDictionary::TrieWordDictionary() {
41 TrieWordDictionary::~TrieWordDictionary() {
44 /*******************************************************************
45 * MutableTrieDictionary
48 // Node structure for the ternary, uncompressed trie
49 struct TernaryNode
: public UMemory
{
50 UChar ch
; // UTF-16 code unit
51 uint16_t flags
; // Flag word
52 TernaryNode
*low
; // Less-than link
53 TernaryNode
*equal
; // Equal link
54 TernaryNode
*high
; // Greater-than link
56 TernaryNode(UChar uc
);
60 enum MutableTrieNodeFlags
{
61 kEndsWord
= 0x0001 // This node marks the end of a valid word
65 TernaryNode::TernaryNode(UChar uc
) {
73 // Not inline since it's recursive
74 TernaryNode::~TernaryNode() {
80 MutableTrieDictionary::MutableTrieDictionary( UChar median
, UErrorCode
&status
) {
81 // Start the trie off with something. Having the root node already present
82 // cuts a special case out of the search/insertion functions.
83 // Making it a median character cuts the worse case for searches from
84 // 4x a balanced trie to 2x a balanced trie. It's best to choose something
85 // that starts a word that is midway in the list.
86 fTrie
= new TernaryNode(median
);
88 status
= U_MEMORY_ALLOCATION_ERROR
;
90 fIter
= utext_openUChars(NULL
, NULL
, 0, &status
);
91 if (U_SUCCESS(status
) && fIter
== NULL
) {
92 status
= U_MEMORY_ALLOCATION_ERROR
;
96 MutableTrieDictionary::MutableTrieDictionary( UErrorCode
&status
) {
98 fIter
= utext_openUChars(NULL
, NULL
, 0, &status
);
99 if (U_SUCCESS(status
) && fIter
== NULL
) {
100 status
= U_MEMORY_ALLOCATION_ERROR
;
104 MutableTrieDictionary::~MutableTrieDictionary() {
110 MutableTrieDictionary::search( UText
*text
,
115 TernaryNode
*&parent
,
116 UBool
&pMatched
) const {
117 // TODO: current implementation works in UTF-16 space
118 const TernaryNode
*up
= NULL
;
119 const TernaryNode
*p
= fTrie
;
124 UChar uc
= utext_current32(text
);
125 for (i
= 0; i
< maxLength
&& p
!= NULL
; ++i
) {
131 else if (uc
== p
->ch
) {
143 // Must be equal to get here
144 if (limit
> 0 && (p
->flags
& kEndsWord
)) {
145 lengths
[mycount
++] = i
+1;
150 uc
= utext_next32(text
);
151 uc
= utext_current32(text
);
154 // Note that there is no way to reach here with up == 0 unless
155 // maxLength is 0 coming in.
156 parent
= (TernaryNode
*)up
;
162 MutableTrieDictionary::addWord( const UChar
*word
,
164 UErrorCode
&status
) {
167 status
= U_ILLEGAL_ARGUMENT_ERROR
;
174 fIter
= utext_openUChars(fIter
, word
, length
, &status
);
177 matched
= search(fIter
, length
, NULL
, count
, 0, parent
, pMatched
);
179 while (matched
++ < length
) {
180 UChar32 uc
= utext_next32(fIter
); // TODO: supplemetary support?
181 U_ASSERT(uc
!= U_SENTINEL
);
182 TernaryNode
*newNode
= new TernaryNode(uc
);
183 if (newNode
== NULL
) {
184 status
= U_MEMORY_ALLOCATION_ERROR
;
188 parent
->equal
= newNode
;
192 if (uc
< parent
->ch
) {
193 parent
->low
= newNode
;
196 parent
->high
= newNode
;
202 parent
->flags
|= kEndsWord
;
207 MutableTrieDictionary::addWords( UEnumeration
*words
,
208 UErrorCode
&status
) {
211 while ((word
= uenum_unext(words
, &length
, &status
)) && U_SUCCESS(status
)) {
212 addWord(word
, length
, status
);
218 MutableTrieDictionary::matches( UText
*text
,
225 return search(text
, maxLength
, lengths
, count
, limit
, parent
, pMatched
);
228 // Implementation of iteration for MutableTrieDictionary
229 class MutableTrieEnumeration
: public StringEnumeration
{
231 UStack fNodeStack
; // Stack of nodes to process
232 UVector32 fBranchStack
; // Stack of which branch we are working on
233 TernaryNode
*fRoot
; // Root node
242 static UClassID U_EXPORT2
getStaticClassID(void);
243 virtual UClassID
getDynamicClassID(void) const;
245 MutableTrieEnumeration(TernaryNode
*root
, UErrorCode
&status
)
246 : fNodeStack(status
), fBranchStack(status
) {
248 fNodeStack
.push(root
, status
);
249 fBranchStack
.push(kLessThan
, status
);
253 virtual ~MutableTrieEnumeration();
255 virtual StringEnumeration
*clone() const {
256 UErrorCode status
= U_ZERO_ERROR
;
257 return new MutableTrieEnumeration(fRoot
, status
);
260 virtual const UnicodeString
*snext(UErrorCode
&status
) {
261 if (fNodeStack
.empty() || U_FAILURE(status
)) {
264 TernaryNode
*node
= (TernaryNode
*) fNodeStack
.peek();
265 StackBranch where
= (StackBranch
) fBranchStack
.peeki();
266 while (!fNodeStack
.empty() && U_SUCCESS(status
)) {
272 if (node
->low
!= NULL
) {
273 fBranchStack
.setElementAt(kEqual
, fBranchStack
.size()-1);
274 node
= (TernaryNode
*) fNodeStack
.push(node
->low
, status
);
275 where
= (StackBranch
) fBranchStack
.push(kLessThan
, status
);
278 case kEqual
: /*fall through*/
279 emit
= (node
->flags
& kEndsWord
) != 0;
280 equal
= (node
->equal
!= NULL
);
281 // If this node should be part of the next emitted string, append
282 // the UChar to the string, and make sure we pop it when we come
283 // back to this node. The character should only be in the string
284 // for as long as we're traversing the equal subtree of this node
286 unistr
.append(node
->ch
);
287 fBranchStack
.setElementAt(kGreaterThan
, fBranchStack
.size()-1);
290 node
= (TernaryNode
*) fNodeStack
.push(node
->equal
, status
);
291 where
= (StackBranch
) fBranchStack
.push(kLessThan
, status
);
299 case kGreaterThan
: /*fall through*/
300 // If this node's character is in the string, remove it.
301 if (node
->equal
!= NULL
|| (node
->flags
& kEndsWord
)) {
302 unistr
.truncate(unistr
.length()-1);
304 if (node
->high
!= NULL
) {
305 fBranchStack
.setElementAt(kDone
, fBranchStack
.size()-1);
306 node
= (TernaryNode
*) fNodeStack
.push(node
->high
, status
);
307 where
= (StackBranch
) fBranchStack
.push(kLessThan
, status
);
310 case kDone
: /*fall through*/
313 node
= (TernaryNode
*) fNodeStack
.peek();
314 where
= (StackBranch
) fBranchStack
.peeki();
323 // Very expensive, but this should never be used.
324 virtual int32_t count(UErrorCode
&status
) const {
325 MutableTrieEnumeration
counter(fRoot
, status
);
327 while (counter
.snext(status
) != NULL
&& U_SUCCESS(status
)) {
333 virtual void reset(UErrorCode
&status
) {
334 fNodeStack
.removeAllElements();
335 fBranchStack
.removeAllElements();
336 fNodeStack
.push(fRoot
, status
);
337 fBranchStack
.push(kLessThan
, status
);
342 MutableTrieEnumeration::~MutableTrieEnumeration() {}
344 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration
)
347 MutableTrieDictionary::openWords( UErrorCode
&status
) const {
348 if (U_FAILURE(status
)) {
351 return new MutableTrieEnumeration(fTrie
, status
);
354 /*******************************************************************
355 * CompactTrieDictionary
358 struct CompactTrieHeader
{
359 uint32_t size
; // Size of the data in bytes
360 uint32_t magic
; // Magic number (including version)
361 uint16_t nodeCount
; // Number of entries in offsets[]
362 uint16_t root
; // Node number of the root node
363 uint32_t offsets
[1]; // Offsets to nodes from start of data
366 // Note that to avoid platform-specific alignment issues, all members of the node
367 // structures should be the same size, or should contain explicit padding to
368 // natural alignment boundaries.
370 // We can't use a bitfield for the flags+count field, because the layout of those
371 // is not portable. 12 bits of count allows for up to 4096 entries in a node.
372 struct CompactTrieNode
{
373 uint16_t flagscount
; // Count of sub-entries, plus flags
376 enum CompactTrieNodeFlags
{
377 kVerticalNode
= 0x1000, // This is a vertical node
378 kParentEndsWord
= 0x2000, // The node whose equal link points to this ends a word
379 kReservedFlag1
= 0x4000,
380 kReservedFlag2
= 0x8000,
381 kCountMask
= 0x0FFF, // The count portion of flagscount
382 kFlagMask
= 0xF000 // The flags portion of flagscount
385 // The two node types are distinguished by the kVerticalNode flag.
387 struct CompactTrieHorizontalEntry
{
388 uint16_t ch
; // UChar
389 uint16_t equal
; // Equal link node index
392 // We don't use inheritance here because C++ does not guarantee that the
393 // base class comes first in memory!!
395 struct CompactTrieHorizontalNode
{
396 uint16_t flagscount
; // Count of sub-entries, plus flags
397 CompactTrieHorizontalEntry entries
[1];
400 struct CompactTrieVerticalNode
{
401 uint16_t flagscount
; // Count of sub-entries, plus flags
402 uint16_t equal
; // Equal link node index
403 uint16_t chars
[1]; // Code units
406 // {'Dic', 1}, version 1
407 #define COMPACT_TRIE_MAGIC_1 0x44696301
409 CompactTrieDictionary::CompactTrieDictionary(UDataMemory
*dataObj
,
413 fData
= (const CompactTrieHeader
*) udata_getMemory(dataObj
);
415 if (fData
->magic
!= COMPACT_TRIE_MAGIC_1
) {
416 status
= U_ILLEGAL_ARGUMENT_ERROR
;
420 CompactTrieDictionary::CompactTrieDictionary( const void *data
,
424 fData
= (const CompactTrieHeader
*) data
;
426 if (fData
->magic
!= COMPACT_TRIE_MAGIC_1
) {
427 status
= U_ILLEGAL_ARGUMENT_ERROR
;
432 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary
&dict
,
436 fData
= compactMutableTrieDictionary(dict
, status
);
437 fOwnData
= !U_FAILURE(status
);
440 CompactTrieDictionary::~CompactTrieDictionary() {
442 uprv_free((void *)fData
);
450 CompactTrieDictionary::dataSize() const {
455 CompactTrieDictionary::data() const {
459 // This function finds the address of a node for us, given its node ID
460 static inline const CompactTrieNode
*
461 getCompactNode(const CompactTrieHeader
*header
, uint16_t node
) {
462 return (const CompactTrieNode
*)((const uint8_t *)header
+ header
->offsets
[node
]);
466 CompactTrieDictionary::matches( UText
*text
,
471 // TODO: current implementation works in UTF-16 space
472 const CompactTrieNode
*node
= getCompactNode(fData
, fData
->root
);
475 UChar uc
= utext_current32(text
);
478 while (node
!= NULL
) {
479 // Check if the node we just exited ends a word
480 if (limit
> 0 && (node
->flagscount
& kParentEndsWord
)) {
481 lengths
[mycount
++] = i
;
484 // Check that we haven't exceeded the maximum number of input characters.
485 // We have to do that here rather than in the while condition so that
486 // we can check for ending a word, above.
487 if (i
>= maxLength
) {
491 int nodeCount
= (node
->flagscount
& kCountMask
);
492 if (nodeCount
== 0) {
493 // Special terminal node; return now
496 if (node
->flagscount
& kVerticalNode
) {
497 // Vertical node; check all the characters in it
498 const CompactTrieVerticalNode
*vnode
= (const CompactTrieVerticalNode
*)node
;
499 for (int j
= 0; j
< nodeCount
&& i
< maxLength
; ++j
) {
500 if (uc
!= vnode
->chars
[j
]) {
501 // We hit a non-equal character; return
505 uc
= utext_current32(text
);
508 // To get here we must have come through the whole list successfully;
509 // go on to the next node. Note that a word cannot end in the middle
510 // of a vertical node.
511 node
= getCompactNode(fData
, vnode
->equal
);
514 // Horizontal node; do binary search
515 const CompactTrieHorizontalNode
*hnode
= (const CompactTrieHorizontalNode
*)node
;
517 int high
= nodeCount
-1;
519 node
= NULL
; // If we don't find a match, we'll fall out of the loop
520 while (high
>= low
) {
521 middle
= (high
+low
)/2;
522 if (uc
== hnode
->entries
[middle
].ch
) {
523 // We hit a match; get the next node and next character
524 node
= getCompactNode(fData
, hnode
->entries
[middle
].equal
);
526 uc
= utext_current32(text
);
530 else if (uc
< hnode
->entries
[middle
].ch
) {
544 // Implementation of iteration for CompactTrieDictionary
545 class CompactTrieEnumeration
: public StringEnumeration
{
547 UVector32 fNodeStack
; // Stack of nodes to process
548 UVector32 fIndexStack
; // Stack of where in node we are
549 const CompactTrieHeader
*fHeader
; // Trie data
552 static UClassID U_EXPORT2
getStaticClassID(void);
553 virtual UClassID
getDynamicClassID(void) const;
555 CompactTrieEnumeration(const CompactTrieHeader
*header
, UErrorCode
&status
)
556 : fNodeStack(status
), fIndexStack(status
) {
558 fNodeStack
.push(header
->root
, status
);
559 fIndexStack
.push(0, status
);
563 virtual ~CompactTrieEnumeration();
565 virtual StringEnumeration
*clone() const {
566 UErrorCode status
= U_ZERO_ERROR
;
567 return new CompactTrieEnumeration(fHeader
, status
);
570 virtual const UnicodeString
* snext(UErrorCode
&status
);
572 // Very expensive, but this should never be used.
573 virtual int32_t count(UErrorCode
&status
) const {
574 CompactTrieEnumeration
counter(fHeader
, status
);
576 while (counter
.snext(status
) != NULL
&& U_SUCCESS(status
)) {
582 virtual void reset(UErrorCode
&status
) {
583 fNodeStack
.removeAllElements();
584 fIndexStack
.removeAllElements();
585 fNodeStack
.push(fHeader
->root
, status
);
586 fIndexStack
.push(0, status
);
591 CompactTrieEnumeration::~CompactTrieEnumeration() {}
593 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration
)
595 const UnicodeString
*
596 CompactTrieEnumeration::snext(UErrorCode
&status
) {
597 if (fNodeStack
.empty() || U_FAILURE(status
)) {
600 const CompactTrieNode
*node
= getCompactNode(fHeader
, fNodeStack
.peeki());
601 int where
= fIndexStack
.peeki();
602 while (!fNodeStack
.empty() && U_SUCCESS(status
)) {
603 int nodeCount
= (node
->flagscount
& kCountMask
);
604 UBool goingDown
= FALSE
;
605 if (nodeCount
== 0) {
606 // Terminal node; go up immediately
609 node
= getCompactNode(fHeader
, fNodeStack
.peeki());
610 where
= fIndexStack
.peeki();
612 else if (node
->flagscount
& kVerticalNode
) {
614 const CompactTrieVerticalNode
*vnode
= (const CompactTrieVerticalNode
*)node
;
617 unistr
.append((const UChar
*)vnode
->chars
, (int32_t) nodeCount
);
618 fIndexStack
.setElementAt(1, fIndexStack
.size()-1);
619 node
= getCompactNode(fHeader
, fNodeStack
.push(vnode
->equal
, status
));
620 where
= fIndexStack
.push(0, status
);
625 unistr
.truncate(unistr
.length()-nodeCount
);
628 node
= getCompactNode(fHeader
, fNodeStack
.peeki());
629 where
= fIndexStack
.peeki();
634 const CompactTrieHorizontalNode
*hnode
= (const CompactTrieHorizontalNode
*)node
;
637 unistr
.truncate(unistr
.length()-1);
639 if (where
< nodeCount
) {
641 unistr
.append((UChar
)hnode
->entries
[where
].ch
);
642 fIndexStack
.setElementAt(where
+1, fIndexStack
.size()-1);
643 node
= getCompactNode(fHeader
, fNodeStack
.push(hnode
->entries
[where
].equal
, status
));
644 where
= fIndexStack
.push(0, status
);
651 node
= getCompactNode(fHeader
, fNodeStack
.peeki());
652 where
= fIndexStack
.peeki();
655 // Check if the parent of the node we've just gone down to ends a
656 // word. If so, return it.
657 if (goingDown
&& (node
->flagscount
& kParentEndsWord
)) {
665 CompactTrieDictionary::openWords( UErrorCode
&status
) const {
666 if (U_FAILURE(status
)) {
669 return new CompactTrieEnumeration(fData
, status
);
673 // Below here is all code related to converting a ternary trie to a compact trie
677 // Helper classes to construct the compact trie
678 class BuildCompactTrieNode
: public UMemory
{
680 UBool fParentEndsWord
;
684 UnicodeString fChars
;
687 BuildCompactTrieNode(UBool parentEndsWord
, UBool vertical
, UStack
&nodes
, UErrorCode
&status
) {
688 fParentEndsWord
= parentEndsWord
;
689 fHasDuplicate
= FALSE
;
690 fVertical
= vertical
;
691 fNodeID
= nodes
.size();
692 nodes
.push(this, status
);
695 virtual ~BuildCompactTrieNode();
697 virtual uint32_t size() {
698 return sizeof(uint16_t);
701 virtual void write(uint8_t *bytes
, uint32_t &offset
, const UVector32
&/*translate*/) {
703 *((uint16_t *)(bytes
+offset
)) = (fChars
.length() & kCountMask
)
704 | (fVertical
? kVerticalNode
: 0) | (fParentEndsWord
? kParentEndsWord
: 0 );
705 offset
+= sizeof(uint16_t);
709 BuildCompactTrieNode::~BuildCompactTrieNode() {}
711 class BuildCompactTrieHorizontalNode
: public BuildCompactTrieNode
{
716 BuildCompactTrieHorizontalNode(UBool parentEndsWord
, UStack
&nodes
, UErrorCode
&status
)
717 : BuildCompactTrieNode(parentEndsWord
, FALSE
, nodes
, status
), fLinks(status
) {
720 virtual ~BuildCompactTrieHorizontalNode();
722 virtual uint32_t size() {
723 return offsetof(CompactTrieHorizontalNode
,entries
) +
724 (fChars
.length()*sizeof(CompactTrieHorizontalEntry
));
727 virtual void write(uint8_t *bytes
, uint32_t &offset
, const UVector32
&translate
) {
728 BuildCompactTrieNode::write(bytes
, offset
, translate
);
729 int32_t count
= fChars
.length();
730 for (int32_t i
= 0; i
< count
; ++i
) {
731 CompactTrieHorizontalEntry
*entry
= (CompactTrieHorizontalEntry
*)(bytes
+offset
);
732 entry
->ch
= fChars
[i
];
733 entry
->equal
= translate
.elementAti(((BuildCompactTrieNode
*)fLinks
[i
])->fNodeID
);
734 #ifdef DEBUG_TRIE_DICT
735 if (entry
->equal
== 0) {
736 fprintf(stderr
, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
737 i
, ((BuildCompactTrieNode
*)fLinks
[i
])->fNodeID
);
740 offset
+= sizeof(CompactTrieHorizontalEntry
);
744 void addNode(UChar ch
, BuildCompactTrieNode
*link
, UErrorCode
&status
) {
746 fLinks
.push(link
, status
);
750 BuildCompactTrieHorizontalNode::~BuildCompactTrieHorizontalNode() {}
752 class BuildCompactTrieVerticalNode
: public BuildCompactTrieNode
{
754 BuildCompactTrieNode
*fEqual
;
757 BuildCompactTrieVerticalNode(UBool parentEndsWord
, UStack
&nodes
, UErrorCode
&status
)
758 : BuildCompactTrieNode(parentEndsWord
, TRUE
, nodes
, status
) {
762 virtual ~BuildCompactTrieVerticalNode();
764 virtual uint32_t size() {
765 return offsetof(CompactTrieVerticalNode
,chars
) + (fChars
.length()*sizeof(uint16_t));
768 virtual void write(uint8_t *bytes
, uint32_t &offset
, const UVector32
&translate
) {
769 CompactTrieVerticalNode
*node
= (CompactTrieVerticalNode
*)(bytes
+offset
);
770 BuildCompactTrieNode::write(bytes
, offset
, translate
);
771 node
->equal
= translate
.elementAti(fEqual
->fNodeID
);
772 offset
+= sizeof(node
->equal
);
773 #ifdef DEBUG_TRIE_DICT
774 if (node
->equal
== 0) {
775 fprintf(stderr
, "ERROR: vertical link, logical node %d maps to physical node zero\n",
779 fChars
.extract(0, fChars
.length(), (UChar
*)node
->chars
);
780 offset
+= sizeof(uint16_t)*fChars
.length();
783 void addChar(UChar ch
) {
787 void setLink(BuildCompactTrieNode
*node
) {
792 BuildCompactTrieVerticalNode::~BuildCompactTrieVerticalNode() {}
794 // Forward declaration
795 static void walkHorizontal(const TernaryNode
*node
,
796 BuildCompactTrieHorizontalNode
*building
,
800 // Convert one node. Uses recursion.
802 static BuildCompactTrieNode
*
803 compactOneNode(const TernaryNode
*node
, UBool parentEndsWord
, UStack
&nodes
, UErrorCode
&status
) {
804 if (U_FAILURE(status
)) {
807 BuildCompactTrieNode
*result
= NULL
;
808 UBool horizontal
= (node
->low
!= NULL
|| node
->high
!= NULL
);
810 BuildCompactTrieHorizontalNode
*hResult
=
811 new BuildCompactTrieHorizontalNode(parentEndsWord
, nodes
, status
);
812 if (hResult
== NULL
) {
813 status
= U_MEMORY_ALLOCATION_ERROR
;
816 if (U_SUCCESS(status
)) {
817 walkHorizontal(node
, hResult
, nodes
, status
);
822 BuildCompactTrieVerticalNode
*vResult
=
823 new BuildCompactTrieVerticalNode(parentEndsWord
, nodes
, status
);
824 if (vResult
== NULL
) {
825 status
= U_MEMORY_ALLOCATION_ERROR
;
827 else if (U_SUCCESS(status
)) {
828 UBool endsWord
= FALSE
;
829 // Take up nodes until we end a word, or hit a node with < or > links
831 vResult
->addChar(node
->ch
);
832 endsWord
= (node
->flags
& kEndsWord
) != 0;
835 while(node
!= NULL
&& !endsWord
&& node
->low
== NULL
&& node
->high
== NULL
);
838 status
= U_ILLEGAL_ARGUMENT_ERROR
; // Corrupt input trie
841 vResult
->setLink((BuildCompactTrieNode
*)nodes
[1]);
845 vResult
->setLink(compactOneNode(node
, endsWord
, nodes
, status
));
853 // Walk the set of peers at the same level, to build a horizontal node.
856 static void walkHorizontal(const TernaryNode
*node
,
857 BuildCompactTrieHorizontalNode
*building
,
859 UErrorCode
&status
) {
860 while (U_SUCCESS(status
) && node
!= NULL
) {
861 if (node
->low
!= NULL
) {
862 walkHorizontal(node
->low
, building
, nodes
, status
);
864 BuildCompactTrieNode
*link
= NULL
;
865 if (node
->equal
!= NULL
) {
866 link
= compactOneNode(node
->equal
, (node
->flags
& kEndsWord
) != 0, nodes
, status
);
868 else if (node
->flags
& kEndsWord
) {
869 link
= (BuildCompactTrieNode
*)nodes
[1];
871 if (U_SUCCESS(status
) && link
!= NULL
) {
872 building
->addNode(node
->ch
, link
, status
);
874 // Tail recurse manually instead of leaving it to the compiler.
875 //if (node->high != NULL) {
876 // walkHorizontal(node->high, building, nodes, status);
885 static int32_t U_CALLCONV
886 _sortBuildNodes(const void * /*context*/, const void *voidl
, const void *voidr
) {
887 BuildCompactTrieNode
*left
= *(BuildCompactTrieNode
**)voidl
;
888 BuildCompactTrieNode
*right
= *(BuildCompactTrieNode
**)voidr
;
889 // Check for comparing a node to itself, to avoid spurious duplicates
893 // Most significant is type of node. Can never coalesce.
894 if (left
->fVertical
!= right
->fVertical
) {
895 return left
->fVertical
- right
->fVertical
;
897 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
898 if (left
->fParentEndsWord
!= right
->fParentEndsWord
) {
899 return left
->fParentEndsWord
- right
->fParentEndsWord
;
901 // Next, the string. If that differs, we can never coalesce.
902 int32_t result
= left
->fChars
.compare(right
->fChars
);
906 // We know they're both the same node type, so branch for the two cases.
907 if (left
->fVertical
) {
908 result
= ((BuildCompactTrieVerticalNode
*)left
)->fEqual
->fNodeID
909 - ((BuildCompactTrieVerticalNode
*)right
)->fEqual
->fNodeID
;
912 // We need to compare the links vectors. They should be the
913 // same size because the strings were equal.
914 // We compare the node IDs instead of the pointers, to handle
916 BuildCompactTrieHorizontalNode
*hleft
, *hright
;
917 hleft
= (BuildCompactTrieHorizontalNode
*)left
;
918 hright
= (BuildCompactTrieHorizontalNode
*)right
;
919 int32_t count
= hleft
->fLinks
.size();
920 for (int32_t i
= 0; i
< count
&& result
== 0; ++i
) {
921 result
= ((BuildCompactTrieNode
*)(hleft
->fLinks
[i
]))->fNodeID
-
922 ((BuildCompactTrieNode
*)(hright
->fLinks
[i
]))->fNodeID
;
925 // If they are equal to each other, mark them (speeds coalescing)
927 left
->fHasDuplicate
= TRUE
;
928 right
->fHasDuplicate
= TRUE
;
935 static void coalesceDuplicates(UStack
&nodes
, UErrorCode
&status
) {
936 // We sort the array of nodes to place duplicates next to each other
937 if (U_FAILURE(status
)) {
940 int32_t size
= nodes
.size();
941 void **array
= (void **)uprv_malloc(sizeof(void *)*size
);
943 status
= U_MEMORY_ALLOCATION_ERROR
;
946 (void) nodes
.toArray(array
);
948 // Now repeatedly identify duplicates until there are no more
951 #ifdef DEBUG_TRIE_DICT
955 BuildCompactTrieNode
*node
;
956 BuildCompactTrieNode
*first
= NULL
;
957 BuildCompactTrieNode
**p
;
958 BuildCompactTrieNode
**pFirst
= NULL
;
959 int32_t counter
= size
- 2;
960 // Sort the array, skipping nodes 0 and 1. Use quicksort for the first
961 // pass for speed. For the second and subsequent passes, we use stable
962 // (insertion) sort for two reasons:
963 // 1. The array is already mostly ordered, so we get better performance.
964 // 2. The way we find one and only one instance of a set of duplicates is to
965 // check that the node ID equals the array index. If we used an unstable
966 // sort for the second or later passes, it's possible that none of the
967 // duplicates would wind up with a node ID equal to its array index.
968 // The sort stability guarantees that, because as we coalesce more and
969 // more groups, the first element of the resultant group will be one of
970 // the first elements of the groups being coalesced.
971 // To use quicksort for the second and subsequent passes, we would have to
972 // find the minimum of the node numbers in a group, and set all the nodes
973 // in the group to that node number.
974 uprv_sortArray(array
+2, counter
, sizeof(void *), _sortBuildNodes
, NULL
, (passCount
> 0), &status
);
976 for (p
= (BuildCompactTrieNode
**)array
+ 2; counter
> 0; --counter
, ++p
) {
978 if (node
->fHasDuplicate
) {
983 else if (_sortBuildNodes(NULL
, pFirst
, p
) != 0) {
984 // Starting a new run of dupes
988 else if (node
->fNodeID
!= first
->fNodeID
) {
989 // Slave one to the other, note duplicate
990 node
->fNodeID
= first
->fNodeID
;
995 // This node has no dupes
1001 #ifdef DEBUG_TRIE_DICT
1002 totalDupes
+= dupes
;
1003 fprintf(stderr
, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount
, dupes
);
1007 #ifdef DEBUG_TRIE_DICT
1008 fprintf(stderr
, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes
, passCount
);
1011 // We no longer need the temporary array, as the nodes have all been marked appropriately.
1017 static void U_CALLCONV
_deleteBuildNode(void *obj
) {
1018 delete (BuildCompactTrieNode
*) obj
;
1024 CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary
&dict
,
1025 UErrorCode
&status
) {
1026 if (U_FAILURE(status
)) {
1029 #ifdef DEBUG_TRIE_DICT
1031 struct tms previous
;
1032 (void) ::times(&previous
);
1034 UStack
nodes(_deleteBuildNode
, NULL
, status
); // Index of nodes
1036 // Add node 0, used as the NULL pointer/sentinel.
1037 nodes
.addElement((int32_t)0, status
);
1039 // Start by creating the special empty node we use to indicate that the parent
1040 // terminates a word. This must be node 1, because the builder assumes
1042 if (U_FAILURE(status
)) {
1045 BuildCompactTrieNode
*terminal
= new BuildCompactTrieNode(TRUE
, FALSE
, nodes
, status
);
1046 if (terminal
== NULL
) {
1047 status
= U_MEMORY_ALLOCATION_ERROR
;
1050 // This call does all the work of building the new trie structure. The root
1052 BuildCompactTrieNode
*root
= compactOneNode(dict
.fTrie
, FALSE
, nodes
, status
);
1053 #ifdef DEBUG_TRIE_DICT
1054 (void) ::times(&timing
);
1055 fprintf(stderr
, "Compact trie built, %d nodes, time user %f system %f\n",
1056 nodes
.size(), (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1057 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1061 // Now coalesce all duplicate nodes.
1062 coalesceDuplicates(nodes
, status
);
1063 #ifdef DEBUG_TRIE_DICT
1064 (void) ::times(&timing
);
1065 fprintf(stderr
, "Duplicates coalesced, time user %f system %f\n",
1066 (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1067 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1071 // Next, build the output trie.
1072 // First we compute all the sizes and build the node ID translation table.
1073 uint32_t totalSize
= offsetof(CompactTrieHeader
,offsets
);
1074 int32_t count
= nodes
.size();
1075 int32_t nodeCount
= 1; // The sentinel node we already have
1076 BuildCompactTrieNode
*node
;
1078 UVector32
translate(count
, status
); // Should be no growth needed after this
1079 translate
.push(0, status
); // The sentinel node
1081 if (U_FAILURE(status
)) {
1085 for (i
= 1; i
< count
; ++i
) {
1086 node
= (BuildCompactTrieNode
*)nodes
[i
];
1087 if (node
->fNodeID
== i
) {
1088 // Only one node out of each duplicate set is used
1089 if (i
>= translate
.size()) {
1090 // Logically extend the mapping table
1091 translate
.setSize(i
+1);
1093 translate
.setElementAt(nodeCount
++, i
);
1094 totalSize
+= node
->size();
1098 // Check for overflowing 16 bits worth of nodes.
1099 if (nodeCount
> 0x10000) {
1100 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1104 // Add enough room for the offsets.
1105 totalSize
+= nodeCount
*sizeof(uint32_t);
1106 #ifdef DEBUG_TRIE_DICT
1107 (void) ::times(&timing
);
1108 fprintf(stderr
, "Sizes/mapping done, time user %f system %f\n",
1109 (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1110 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1112 fprintf(stderr
, "%d nodes, %d unique, %d bytes\n", nodes
.size(), nodeCount
, totalSize
);
1114 uint8_t *bytes
= (uint8_t *)uprv_malloc(totalSize
);
1115 if (bytes
== NULL
) {
1116 status
= U_MEMORY_ALLOCATION_ERROR
;
1120 CompactTrieHeader
*header
= (CompactTrieHeader
*)bytes
;
1121 header
->size
= totalSize
;
1122 header
->nodeCount
= nodeCount
;
1123 header
->offsets
[0] = 0; // Sentinel
1124 header
->root
= translate
.elementAti(root
->fNodeID
);
1125 #ifdef DEBUG_TRIE_DICT
1126 if (header
->root
== 0) {
1127 fprintf(stderr
, "ERROR: root node %d translate to physical zero\n", root
->fNodeID
);
1130 uint32_t offset
= offsetof(CompactTrieHeader
,offsets
)+(nodeCount
*sizeof(uint32_t));
1132 // Now write the data
1133 for (i
= 1; i
< count
; ++i
) {
1134 node
= (BuildCompactTrieNode
*)nodes
[i
];
1135 if (node
->fNodeID
== i
) {
1136 header
->offsets
[nodeCount
++] = offset
;
1137 node
->write(bytes
, offset
, translate
);
1140 #ifdef DEBUG_TRIE_DICT
1141 (void) ::times(&timing
);
1142 fprintf(stderr
, "Trie built, time user %f system %f\n",
1143 (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1144 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1146 fprintf(stderr
, "Final offset is %d\n", offset
);
1148 // Collect statistics on node types and sizes
1153 size_t hItemCount
= 0;
1154 size_t vItemCount
= 0;
1155 uint32_t previousOff
= offset
;
1156 for (uint16_t nodeIdx
= nodeCount
-1; nodeIdx
>= 2; --nodeIdx
) {
1157 const CompactTrieNode
*node
= getCompactNode(header
, nodeIdx
);
1158 if (node
->flagscount
& kVerticalNode
) {
1160 vItemCount
+= (node
->flagscount
& kCountMask
);
1161 vSize
+= previousOff
-header
->offsets
[nodeIdx
];
1165 hItemCount
+= (node
->flagscount
& kCountMask
);
1166 hSize
+= previousOff
-header
->offsets
[nodeIdx
];
1168 previousOff
= header
->offsets
[nodeIdx
];
1170 fprintf(stderr
, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount
,
1171 (double)hSize
/hCount
, (double)hItemCount
/hCount
);
1172 fprintf(stderr
, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount
,
1173 (double)vSize
/vCount
, (double)vItemCount
/vCount
);
1176 if (U_FAILURE(status
)) {
1181 header
->magic
= COMPACT_TRIE_MAGIC_1
;
1186 // Forward declaration
1187 static TernaryNode
*
1188 unpackOneNode( const CompactTrieHeader
*header
, const CompactTrieNode
*node
, UErrorCode
&status
);
1191 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
1192 static TernaryNode
*
1193 unpackHorizontalArray( const CompactTrieHeader
*header
, const CompactTrieHorizontalEntry
*array
,
1194 int low
, int high
, UErrorCode
&status
) {
1195 if (U_FAILURE(status
) || low
> high
) {
1198 int middle
= (low
+high
)/2;
1199 TernaryNode
*result
= new TernaryNode(array
[middle
].ch
);
1200 if (result
== NULL
) {
1201 status
= U_MEMORY_ALLOCATION_ERROR
;
1204 const CompactTrieNode
*equal
= getCompactNode(header
, array
[middle
].equal
);
1205 if (equal
->flagscount
& kParentEndsWord
) {
1206 result
->flags
|= kEndsWord
;
1208 result
->low
= unpackHorizontalArray(header
, array
, low
, middle
-1, status
);
1209 result
->high
= unpackHorizontalArray(header
, array
, middle
+1, high
, status
);
1210 result
->equal
= unpackOneNode(header
, equal
, status
);
1214 // Convert one compact trie node into a ternary subtrie
1215 static TernaryNode
*
1216 unpackOneNode( const CompactTrieHeader
*header
, const CompactTrieNode
*node
, UErrorCode
&status
) {
1217 int nodeCount
= (node
->flagscount
& kCountMask
);
1218 if (nodeCount
== 0 || U_FAILURE(status
)) {
1219 // Failure, or terminal node
1222 if (node
->flagscount
& kVerticalNode
) {
1223 const CompactTrieVerticalNode
*vnode
= (const CompactTrieVerticalNode
*)node
;
1224 TernaryNode
*head
= NULL
;
1225 TernaryNode
*previous
= NULL
;
1226 TernaryNode
*latest
= NULL
;
1227 for (int i
= 0; i
< nodeCount
; ++i
) {
1228 latest
= new TernaryNode(vnode
->chars
[i
]);
1229 if (latest
== NULL
) {
1230 status
= U_MEMORY_ALLOCATION_ERROR
;
1236 if (previous
!= NULL
) {
1237 previous
->equal
= latest
;
1241 if (latest
!= NULL
) {
1242 const CompactTrieNode
*equal
= getCompactNode(header
, vnode
->equal
);
1243 if (equal
->flagscount
& kParentEndsWord
) {
1244 latest
->flags
|= kEndsWord
;
1246 latest
->equal
= unpackOneNode(header
, equal
, status
);
1252 const CompactTrieHorizontalNode
*hnode
= (const CompactTrieHorizontalNode
*)node
;
1253 return unpackHorizontalArray(header
, &hnode
->entries
[0], 0, nodeCount
-1, status
);
1257 MutableTrieDictionary
*
1258 CompactTrieDictionary::cloneMutable( UErrorCode
&status
) const {
1259 MutableTrieDictionary
*result
= new MutableTrieDictionary( status
);
1260 if (result
== NULL
) {
1261 status
= U_MEMORY_ALLOCATION_ERROR
;
1264 TernaryNode
*root
= unpackOneNode(fData
, getCompactNode(fData
, fData
->root
), status
);
1265 if (U_FAILURE(status
)) {
1266 delete root
; // Clean up
1270 result
->fTrie
= root
;
1276 U_CAPI
int32_t U_EXPORT2
1277 triedict_swap(const UDataSwapper
*ds
, const void *inData
, int32_t length
, void *outData
,
1278 UErrorCode
*status
) {
1280 if (status
== NULL
|| U_FAILURE(*status
)) {
1283 if(ds
==NULL
|| inData
==NULL
|| length
<-1 || (length
>0 && outData
==NULL
)) {
1284 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
1289 // Check that the data header is for for dictionary data.
1290 // (Header contents are defined in genxxx.cpp)
1292 const UDataInfo
*pInfo
= (const UDataInfo
*)((const uint8_t *)inData
+4);
1293 if(!( pInfo
->dataFormat
[0]==0x54 && /* dataFormat="TrDc" */
1294 pInfo
->dataFormat
[1]==0x72 &&
1295 pInfo
->dataFormat
[2]==0x44 &&
1296 pInfo
->dataFormat
[3]==0x63 &&
1297 pInfo
->formatVersion
[0]==1 )) {
1298 udata_printError(ds
, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
1299 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
1300 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
1301 pInfo
->formatVersion
[0]);
1302 *status
=U_UNSUPPORTED_ERROR
;
1307 // Swap the data header. (This is the generic ICU Data Header, not the
1308 // CompactTrieHeader). This swap also conveniently gets us
1309 // the size of the ICU d.h., which lets us locate the start
1310 // of the RBBI specific data.
1312 int32_t headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, status
);
1315 // Get the CompactTrieHeader, and check that it appears to be OK.
1317 const uint8_t *inBytes
=(const uint8_t *)inData
+headerSize
;
1318 const CompactTrieHeader
*header
= (const CompactTrieHeader
*)inBytes
;
1319 if (ds
->readUInt32(header
->magic
) != COMPACT_TRIE_MAGIC_1
1320 || ds
->readUInt32(header
->size
) < sizeof(CompactTrieHeader
))
1322 udata_printError(ds
, "triedict_swap(): CompactTrieHeader is invalid.\n");
1323 *status
=U_UNSUPPORTED_ERROR
;
1328 // Prefight operation? Just return the size
1330 uint32_t totalSize
= ds
->readUInt32(header
->size
);
1331 int32_t sizeWithUData
= (int32_t)totalSize
+ headerSize
;
1333 return sizeWithUData
;
1337 // Check that length passed in is consistent with length from RBBI data header.
1339 if (length
< sizeWithUData
) {
1340 udata_printError(ds
, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
1342 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
1347 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because
1348 // we need to reference the header to locate the data, and an
1349 // inplace swap of the header leaves it unusable.
1351 uint8_t *outBytes
= (uint8_t *)outData
+ headerSize
;
1352 CompactTrieHeader
*outputHeader
= (CompactTrieHeader
*)outBytes
;
1356 // If not swapping in place, zero out the output buffer before starting.
1358 if (inBytes
!= outBytes
) {
1359 uprv_memset(outBytes
, 0, totalSize
);
1362 // We need to loop through all the nodes in the offset table, and swap each one.
1363 uint16_t nodeCount
= ds
->readUInt16(header
->nodeCount
);
1364 // Skip node 0, which should always be 0.
1365 for (int i
= 1; i
< nodeCount
; ++i
) {
1366 uint32_t nodeOff
= ds
->readUInt32(header
->offsets
[i
]);
1367 const CompactTrieNode
*inNode
= (const CompactTrieNode
*)(inBytes
+ nodeOff
);
1368 CompactTrieNode
*outNode
= (CompactTrieNode
*)(outBytes
+ nodeOff
);
1369 uint16_t flagscount
= ds
->readUInt16(inNode
->flagscount
);
1370 uint16_t itemCount
= flagscount
& kCountMask
;
1371 ds
->writeUInt16(&outNode
->flagscount
, flagscount
);
1372 if (itemCount
> 0) {
1373 if (flagscount
& kVerticalNode
) {
1374 ds
->swapArray16(ds
, inBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,chars
),
1375 itemCount
*sizeof(uint16_t),
1376 outBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,chars
), status
);
1377 uint16_t equal
= ds
->readUInt16(inBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,equal
);
1378 ds
->writeUInt16(outBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,equal
));
1381 const CompactTrieHorizontalNode
*inHNode
= (const CompactTrieHorizontalNode
*)inNode
;
1382 CompactTrieHorizontalNode
*outHNode
= (CompactTrieHorizontalNode
*)outNode
;
1383 for (int j
= 0; j
< itemCount
; ++j
) {
1384 uint16_t word
= ds
->readUInt16(inHNode
->entries
[j
].ch
);
1385 ds
->writeUInt16(&outHNode
->entries
[j
].ch
, word
);
1386 word
= ds
->readUInt16(inHNode
->entries
[j
].equal
);
1387 ds
->writeUInt16(&outHNode
->entries
[j
].equal
, word
);
1394 // All the data in all the nodes consist of 16 bit items. Swap them all at once.
1395 uint16_t nodeCount
= ds
->readUInt16(header
->nodeCount
);
1396 uint32_t nodesOff
= offsetof(CompactTrieHeader
,offsets
)+((uint32_t)nodeCount
*sizeof(uint32_t));
1397 ds
->swapArray16(ds
, inBytes
+nodesOff
, totalSize
-nodesOff
, outBytes
+nodesOff
, status
);
1400 ds
->writeUInt32(&outputHeader
->size
, totalSize
);
1401 uint32_t magic
= ds
->readUInt32(header
->magic
);
1402 ds
->writeUInt32(&outputHeader
->magic
, magic
);
1403 ds
->writeUInt16(&outputHeader
->nodeCount
, nodeCount
);
1404 uint16_t root
= ds
->readUInt16(header
->root
);
1405 ds
->writeUInt16(&outputHeader
->root
, root
);
1406 ds
->swapArray32(ds
, inBytes
+offsetof(CompactTrieHeader
,offsets
),
1407 sizeof(uint32_t)*(int32_t)nodeCount
,
1408 outBytes
+offsetof(CompactTrieHeader
,offsets
), status
);
1410 return sizeWithUData
;
1413 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */