2 *******************************************************************************
3 * Copyright (C) 2006, International Business Machines Corporation and others. *
4 * All Rights Reserved. *
5 *******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_BREAK_ITERATION
13 #include "unicode/chariter.h"
14 #include "unicode/uchriter.h"
15 #include "unicode/strenum.h"
16 #include "unicode/uenum.h"
17 #include "unicode/udata.h"
24 //#define DEBUG_TRIE_DICT 1
26 #ifdef DEBUG_TRIE_DICT
27 #include <sys/times.h>
34 /*******************************************************************
38 TrieWordDictionary::TrieWordDictionary() {
41 TrieWordDictionary::~TrieWordDictionary() {
44 /*******************************************************************
45 * MutableTrieDictionary
48 // Node structure for the ternary, uncompressed trie
49 struct TernaryNode
: public UMemory
{
50 UChar ch
; // UTF-16 code unit
51 uint16_t flags
; // Flag word
52 TernaryNode
*low
; // Less-than link
53 TernaryNode
*equal
; // Equal link
54 TernaryNode
*high
; // Greater-than link
56 TernaryNode(UChar uc
);
// Bit values stored in TernaryNode::flags.
enum MutableTrieNodeFlags {
    kEndsWord = 0x0001      // This node marks the end of a valid word
};
65 TernaryNode::TernaryNode(UChar uc
) {
73 // Not inline since it's recursive
74 TernaryNode::~TernaryNode() {
80 MutableTrieDictionary::MutableTrieDictionary( UChar median
, UErrorCode
&status
) {
81 // Start the trie off with something. Having the root node already present
82 // cuts a special case out of the search/insertion functions.
83 // Making it a median character cuts the worse case for searches from
84 // 4x a balanced trie to 2x a balanced trie. It's best to choose something
85 // that starts a word that is midway in the list.
86 fTrie
= new TernaryNode(median
);
88 status
= U_MEMORY_ALLOCATION_ERROR
;
90 fIter
= utext_openUChars(NULL
, NULL
, 0, &status
);
91 if (U_SUCCESS(status
) && fIter
== NULL
) {
92 status
= U_MEMORY_ALLOCATION_ERROR
;
96 MutableTrieDictionary::MutableTrieDictionary( UErrorCode
&status
) {
98 fIter
= utext_openUChars(NULL
, NULL
, 0, &status
);
99 if (U_SUCCESS(status
) && fIter
== NULL
) {
100 status
= U_MEMORY_ALLOCATION_ERROR
;
104 MutableTrieDictionary::~MutableTrieDictionary() {
110 MutableTrieDictionary::search( UText
*text
,
115 TernaryNode
*&parent
,
116 UBool
&pMatched
) const {
117 // TODO: current implementation works in UTF-16 space
118 const TernaryNode
*up
= NULL
;
119 const TernaryNode
*p
= fTrie
;
124 UChar uc
= utext_current32(text
);
125 for (i
= 0; i
< maxLength
&& p
!= NULL
; ++i
) {
131 else if (uc
== p
->ch
) {
143 // Must be equal to get here
144 if (limit
> 0 && (p
->flags
& kEndsWord
)) {
145 lengths
[mycount
++] = i
+1;
150 uc
= utext_next32(text
);
151 uc
= utext_current32(text
);
154 // Note that there is no way to reach here with up == 0 unless
155 // maxLength is 0 coming in.
156 parent
= (TernaryNode
*)up
;
162 MutableTrieDictionary::addWord( const UChar
*word
,
164 UErrorCode
&status
) {
167 status
= U_ILLEGAL_ARGUMENT_ERROR
;
174 fIter
= utext_openUChars(fIter
, word
, length
, &status
);
177 matched
= search(fIter
, length
, NULL
, count
, 0, parent
, pMatched
);
179 while (matched
++ < length
) {
180 UChar32 uc
= utext_next32(fIter
); // TODO: supplemetary support?
181 U_ASSERT(uc
!= U_SENTINEL
);
182 TernaryNode
*newNode
= new TernaryNode(uc
);
183 if (newNode
== NULL
) {
184 status
= U_MEMORY_ALLOCATION_ERROR
;
188 parent
->equal
= newNode
;
192 if (uc
< parent
->ch
) {
193 parent
->low
= newNode
;
196 parent
->high
= newNode
;
202 parent
->flags
|= kEndsWord
;
207 MutableTrieDictionary::addWords( UEnumeration
*words
,
208 UErrorCode
&status
) {
211 while ((word
= uenum_unext(words
, &length
, &status
)) && U_SUCCESS(status
)) {
212 addWord(word
, length
, status
);
218 MutableTrieDictionary::matches( UText
*text
,
225 return search(text
, maxLength
, lengths
, count
, limit
, parent
, pMatched
);
228 // Implementation of iteration for MutableTrieDictionary
229 class MutableTrieEnumeration
: public StringEnumeration
{
231 UStack fNodeStack
; // Stack of nodes to process
232 UVector32 fBranchStack
; // Stack of which branch we are working on
233 TernaryNode
*fRoot
; // Root node
234 static const char fgClassID
;
243 static UClassID U_EXPORT2
getStaticClassID(void) { return (UClassID
)&fgClassID
; }
244 virtual UClassID
getDynamicClassID(void) const { return getStaticClassID(); }
246 MutableTrieEnumeration(TernaryNode
*root
, UErrorCode
&status
)
247 : fNodeStack(status
), fBranchStack(status
) {
249 fNodeStack
.push(root
, status
);
250 fBranchStack
.push(kLessThan
, status
);
254 virtual ~MutableTrieEnumeration() {
257 virtual StringEnumeration
*clone() const {
258 UErrorCode status
= U_ZERO_ERROR
;
259 return new MutableTrieEnumeration(fRoot
, status
);
262 virtual const UnicodeString
*snext(UErrorCode
&status
) {
263 if (fNodeStack
.empty() || U_FAILURE(status
)) {
266 TernaryNode
*node
= (TernaryNode
*) fNodeStack
.peek();
267 StackBranch where
= (StackBranch
) fBranchStack
.peeki();
268 while (!fNodeStack
.empty() && U_SUCCESS(status
)) {
274 if (node
->low
!= NULL
) {
275 fBranchStack
.setElementAt(kEqual
, fBranchStack
.size()-1);
276 node
= (TernaryNode
*) fNodeStack
.push(node
->low
, status
);
277 where
= (StackBranch
) fBranchStack
.push(kLessThan
, status
);
281 emit
= (node
->flags
& kEndsWord
) != 0;
282 equal
= (node
->equal
!= NULL
);
283 // If this node should be part of the next emitted string, append
284 // the UChar to the string, and make sure we pop it when we come
285 // back to this node. The character should only be in the string
286 // for as long as we're traversing the equal subtree of this node
288 unistr
.append(node
->ch
);
289 fBranchStack
.setElementAt(kGreaterThan
, fBranchStack
.size()-1);
292 node
= (TernaryNode
*) fNodeStack
.push(node
->equal
, status
);
293 where
= (StackBranch
) fBranchStack
.push(kLessThan
, status
);
302 // If this node's character is in the string, remove it.
303 if (node
->equal
!= NULL
|| (node
->flags
& kEndsWord
)) {
304 unistr
.truncate(unistr
.length()-1);
306 if (node
->high
!= NULL
) {
307 fBranchStack
.setElementAt(kDone
, fBranchStack
.size()-1);
308 node
= (TernaryNode
*) fNodeStack
.push(node
->high
, status
);
309 where
= (StackBranch
) fBranchStack
.push(kLessThan
, status
);
315 node
= (TernaryNode
*) fNodeStack
.peek();
316 where
= (StackBranch
) fBranchStack
.peeki();
325 // Very expensive, but this should never be used.
326 virtual int32_t count(UErrorCode
&status
) const {
327 MutableTrieEnumeration
counter(fRoot
, status
);
329 while (counter
.snext(status
) != NULL
&& U_SUCCESS(status
)) {
335 virtual void reset(UErrorCode
&status
) {
336 fNodeStack
.removeAllElements();
337 fBranchStack
.removeAllElements();
338 fNodeStack
.push(fRoot
, status
);
339 fBranchStack
.push(kLessThan
, status
);
344 const char MutableTrieEnumeration::fgClassID
= '\0';
347 MutableTrieDictionary::openWords( UErrorCode
&status
) const {
348 if (U_FAILURE(status
)) {
351 return new MutableTrieEnumeration(fTrie
, status
);
354 /*******************************************************************
355 * CompactTrieDictionary
// Header found at the start of the compact trie data.
// `offsets` is declared with one element but holds `nodeCount` entries; each
// entry is the byte offset of a node measured from the start of the header.
struct CompactTrieHeader {
    uint32_t    size;           // Size of the data in bytes
    uint32_t    magic;          // Magic number (including version)
    uint16_t    nodeCount;      // Number of entries in offsets[]
    uint16_t    root;           // Node number of the root node
    uint32_t    offsets[1];     // Offsets to nodes from start of data
};
// Note that to avoid platform-specific alignment issues, all members of the node
// structures should be the same size, or should contain explicit padding to
// natural alignment boundaries.

// We can't use a bitfield for the flags+count field, because the layout of those
// is not portable. 12 bits of count allows for up to 4096 entries in a node.
// This struct is the common leading part shared by both node layouts.
struct CompactTrieNode {
    uint16_t    flagscount;     // Count of sub-entries, plus flags
};
// Bit layout of the 16-bit `flagscount` word found at the start of every
// compact node: the top four bits are flags, the low twelve bits are a count.
enum CompactTrieNodeFlags {
    kVerticalNode   = 0x1000,       // This is a vertical node
    kParentEndsWord = 0x2000,       // The node whose equal link points to this ends a word
    kReservedFlag1  = 0x4000,
    kReservedFlag2  = 0x8000,
    kCountMask      = 0x0FFF,       // The count portion of flagscount
    kFlagMask       = 0xF000        // The flags portion of flagscount
};
// The two node types are distinguished by the kVerticalNode flag.

// One (code unit, link) pair inside a horizontal node.
struct CompactTrieHorizontalEntry {
    uint16_t    ch;             // UChar
    uint16_t    equal;          // Equal link node index
};

// We don't use inheritance here because C++ does not guarantee that the
// base class comes first in memory!!

// Horizontal node: the flags/count word followed by the entries. `entries` is
// declared with one element; the count portion of flagscount gives the real
// number of entries.
struct CompactTrieHorizontalNode {
    uint16_t    flagscount;     // Count of sub-entries, plus flags
    CompactTrieHorizontalEntry      entries[1];
};
// Vertical node: the flags/count word, a single equal link, then the code
// units. `chars` is declared with one element; the count portion of flagscount
// gives the real number of code units.
struct CompactTrieVerticalNode {
    uint16_t    flagscount;     // Count of sub-entries, plus flags
    uint16_t    equal;          // Equal link node index
    uint16_t    chars[1];       // Code units
};
// {'Dic', 1}, version 1
// The three high bytes spell "Dic" in ASCII; the low byte is the version.
#define COMPACT_TRIE_MAGIC_1 0x44696301
409 CompactTrieDictionary::CompactTrieDictionary(UDataMemory
*dataObj
,
413 fData
= (const CompactTrieHeader
*) udata_getMemory(dataObj
);
415 if (fData
->magic
!= COMPACT_TRIE_MAGIC_1
) {
416 status
= U_ILLEGAL_ARGUMENT_ERROR
;
420 CompactTrieDictionary::CompactTrieDictionary( const void *data
,
424 fData
= (const CompactTrieHeader
*) data
;
426 if (fData
->magic
!= COMPACT_TRIE_MAGIC_1
) {
427 status
= U_ILLEGAL_ARGUMENT_ERROR
;
432 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary
&dict
,
436 fData
= compactMutableTrieDictionary(dict
, status
);
437 fOwnData
= !U_FAILURE(status
);
440 CompactTrieDictionary::~CompactTrieDictionary() {
442 uprv_free((void *)fData
);
450 CompactTrieDictionary::dataSize() const {
455 CompactTrieDictionary::data() const {
459 // This function finds the address of a node for us, given its node ID
460 static inline const CompactTrieNode
*
461 getCompactNode(const CompactTrieHeader
*header
, uint16_t node
) {
462 return (const CompactTrieNode
*)((const uint8_t *)header
+ header
->offsets
[node
]);
466 CompactTrieDictionary::matches( UText
*text
,
471 // TODO: current implementation works in UTF-16 space
472 const CompactTrieNode
*node
= getCompactNode(fData
, fData
->root
);
475 UChar uc
= utext_current32(text
);
478 while (node
!= NULL
) {
479 // Check if the node we just exited ends a word
480 if (limit
> 0 && (node
->flagscount
& kParentEndsWord
)) {
481 lengths
[mycount
++] = i
;
484 // Check that we haven't exceeded the maximum number of input characters.
485 // We have to do that here rather than in the while condition so that
486 // we can check for ending a word, above.
487 if (i
>= maxLength
) {
491 int nodeCount
= (node
->flagscount
& kCountMask
);
492 if (nodeCount
== 0) {
493 // Special terminal node; return now
496 if (node
->flagscount
& kVerticalNode
) {
497 // Vertical node; check all the characters in it
498 const CompactTrieVerticalNode
*vnode
= (const CompactTrieVerticalNode
*)node
;
499 for (int j
= 0; j
< nodeCount
&& i
< maxLength
; ++j
) {
500 if (uc
!= vnode
->chars
[j
]) {
501 // We hit a non-equal character; return
505 uc
= utext_current32(text
);
508 // To get here we must have come through the whole list successfully;
509 // go on to the next node. Note that a word cannot end in the middle
510 // of a vertical node.
511 node
= getCompactNode(fData
, vnode
->equal
);
514 // Horizontal node; do binary search
515 const CompactTrieHorizontalNode
*hnode
= (const CompactTrieHorizontalNode
*)node
;
517 int high
= nodeCount
-1;
519 node
= NULL
; // If we don't find a match, we'll fall out of the loop
520 while (high
>= low
) {
521 middle
= (high
+low
)/2;
522 if (uc
== hnode
->entries
[middle
].ch
) {
523 // We hit a match; get the next node and next character
524 node
= getCompactNode(fData
, hnode
->entries
[middle
].equal
);
526 uc
= utext_current32(text
);
530 else if (uc
< hnode
->entries
[middle
].ch
) {
544 // Implementation of iteration for CompactTrieDictionary
545 class CompactTrieEnumeration
: public StringEnumeration
{
547 UVector32 fNodeStack
; // Stack of nodes to process
548 UVector32 fIndexStack
; // Stack of where in node we are
549 const CompactTrieHeader
*fHeader
; // Trie data
550 static const char fgClassID
;
553 static UClassID U_EXPORT2
getStaticClassID(void) { return (UClassID
)&fgClassID
; }
554 virtual UClassID
getDynamicClassID(void) const { return getStaticClassID(); }
556 CompactTrieEnumeration(const CompactTrieHeader
*header
, UErrorCode
&status
)
557 : fNodeStack(status
), fIndexStack(status
) {
559 fNodeStack
.push(header
->root
, status
);
560 fIndexStack
.push(0, status
);
564 virtual ~CompactTrieEnumeration() {
567 virtual StringEnumeration
*clone() const {
568 UErrorCode status
= U_ZERO_ERROR
;
569 return new CompactTrieEnumeration(fHeader
, status
);
572 virtual const UnicodeString
* snext(UErrorCode
&status
);
574 // Very expensive, but this should never be used.
575 virtual int32_t count(UErrorCode
&status
) const {
576 CompactTrieEnumeration
counter(fHeader
, status
);
578 while (counter
.snext(status
) != NULL
&& U_SUCCESS(status
)) {
584 virtual void reset(UErrorCode
&status
) {
585 fNodeStack
.removeAllElements();
586 fIndexStack
.removeAllElements();
587 fNodeStack
.push(fHeader
->root
, status
);
588 fIndexStack
.push(0, status
);
593 const char CompactTrieEnumeration::fgClassID
= '\0';
595 const UnicodeString
*
596 CompactTrieEnumeration::snext(UErrorCode
&status
) {
597 if (fNodeStack
.empty() || U_FAILURE(status
)) {
600 const CompactTrieNode
*node
= getCompactNode(fHeader
, fNodeStack
.peeki());
601 int where
= fIndexStack
.peeki();
602 while (!fNodeStack
.empty() && U_SUCCESS(status
)) {
603 int nodeCount
= (node
->flagscount
& kCountMask
);
604 UBool goingDown
= FALSE
;
605 if (nodeCount
== 0) {
606 // Terminal node; go up immediately
609 node
= getCompactNode(fHeader
, fNodeStack
.peeki());
610 where
= fIndexStack
.peeki();
612 else if (node
->flagscount
& kVerticalNode
) {
614 const CompactTrieVerticalNode
*vnode
= (const CompactTrieVerticalNode
*)node
;
617 unistr
.append((const UChar
*)vnode
->chars
, (int32_t) nodeCount
);
618 fIndexStack
.setElementAt(1, fIndexStack
.size()-1);
619 node
= getCompactNode(fHeader
, fNodeStack
.push(vnode
->equal
, status
));
620 where
= fIndexStack
.push(0, status
);
625 unistr
.truncate(unistr
.length()-nodeCount
);
628 node
= getCompactNode(fHeader
, fNodeStack
.peeki());
629 where
= fIndexStack
.peeki();
634 const CompactTrieHorizontalNode
*hnode
= (const CompactTrieHorizontalNode
*)node
;
637 unistr
.truncate(unistr
.length()-1);
639 if (where
< nodeCount
) {
641 unistr
.append((UChar
)hnode
->entries
[where
].ch
);
642 fIndexStack
.setElementAt(where
+1, fIndexStack
.size()-1);
643 node
= getCompactNode(fHeader
, fNodeStack
.push(hnode
->entries
[where
].equal
, status
));
644 where
= fIndexStack
.push(0, status
);
651 node
= getCompactNode(fHeader
, fNodeStack
.peeki());
652 where
= fIndexStack
.peeki();
655 // Check if the parent of the node we've just gone down to ends a
656 // word. If so, return it.
657 if (goingDown
&& (node
->flagscount
& kParentEndsWord
)) {
665 CompactTrieDictionary::openWords( UErrorCode
&status
) const {
666 if (U_FAILURE(status
)) {
669 return new CompactTrieEnumeration(fData
, status
);
673 // Below here is all code related to converting a ternary trie to a compact trie
677 // Helper classes to construct the compact trie
678 class BuildCompactTrieNode
: public UMemory
{
680 UBool fParentEndsWord
;
684 UnicodeString fChars
;
687 BuildCompactTrieNode(UBool parentEndsWord
, UBool vertical
, UStack
&nodes
, UErrorCode
&status
) {
688 fParentEndsWord
= parentEndsWord
;
689 fHasDuplicate
= FALSE
;
690 fVertical
= vertical
;
691 fNodeID
= nodes
.size();
692 nodes
.push(this, status
);
695 virtual ~BuildCompactTrieNode() {
698 virtual uint32_t size() {
699 return sizeof(uint16_t);
702 virtual void write(uint8_t *bytes
, uint32_t &offset
, const UVector32
&/*translate*/) {
704 *((uint16_t *)(bytes
+offset
)) = (fChars
.length() & kCountMask
)
705 | (fVertical
? kVerticalNode
: 0) | (fParentEndsWord
? kParentEndsWord
: 0 );
706 offset
+= sizeof(uint16_t);
710 class BuildCompactTrieHorizontalNode
: public BuildCompactTrieNode
{
715 BuildCompactTrieHorizontalNode(UBool parentEndsWord
, UStack
&nodes
, UErrorCode
&status
)
716 : BuildCompactTrieNode(parentEndsWord
, FALSE
, nodes
, status
), fLinks(status
) {
719 virtual ~BuildCompactTrieHorizontalNode() {
722 virtual uint32_t size() {
723 return offsetof(CompactTrieHorizontalNode
,entries
) +
724 (fChars
.length()*sizeof(CompactTrieHorizontalEntry
));
727 virtual void write(uint8_t *bytes
, uint32_t &offset
, const UVector32
&translate
) {
728 BuildCompactTrieNode::write(bytes
, offset
, translate
);
729 int32_t count
= fChars
.length();
730 for (int32_t i
= 0; i
< count
; ++i
) {
731 CompactTrieHorizontalEntry
*entry
= (CompactTrieHorizontalEntry
*)(bytes
+offset
);
732 entry
->ch
= fChars
[i
];
733 entry
->equal
= translate
.elementAti(((BuildCompactTrieNode
*)fLinks
[i
])->fNodeID
);
734 #ifdef DEBUG_TRIE_DICT
735 if (entry
->equal
== 0) {
736 fprintf(stderr
, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
737 i
, ((BuildCompactTrieNode
*)fLinks
[i
])->fNodeID
);
740 offset
+= sizeof(CompactTrieHorizontalEntry
);
744 void addNode(UChar ch
, BuildCompactTrieNode
*link
, UErrorCode
&status
) {
746 fLinks
.push(link
, status
);
750 class BuildCompactTrieVerticalNode
: public BuildCompactTrieNode
{
752 BuildCompactTrieNode
*fEqual
;
755 BuildCompactTrieVerticalNode(UBool parentEndsWord
, UStack
&nodes
, UErrorCode
&status
)
756 : BuildCompactTrieNode(parentEndsWord
, TRUE
, nodes
, status
) {
760 virtual ~BuildCompactTrieVerticalNode() {
763 virtual uint32_t size() {
764 return offsetof(CompactTrieVerticalNode
,chars
) + (fChars
.length()*sizeof(uint16_t));
767 virtual void write(uint8_t *bytes
, uint32_t &offset
, const UVector32
&translate
) {
768 CompactTrieVerticalNode
*node
= (CompactTrieVerticalNode
*)(bytes
+offset
);
769 BuildCompactTrieNode::write(bytes
, offset
, translate
);
770 node
->equal
= translate
.elementAti(fEqual
->fNodeID
);
771 offset
+= sizeof(node
->equal
);
772 #ifdef DEBUG_TRIE_DICT
773 if (node
->equal
== 0) {
774 fprintf(stderr
, "ERROR: vertical link, logical node %d maps to physical node zero\n",
778 fChars
.extract(0, fChars
.length(), (UChar
*)node
->chars
);
779 offset
+= sizeof(uint16_t)*fChars
.length();
782 void addChar(UChar ch
) {
786 void setLink(BuildCompactTrieNode
*node
) {
791 // Forward declaration
792 static void walkHorizontal(const TernaryNode
*node
,
793 BuildCompactTrieHorizontalNode
*building
,
797 // Convert one node. Uses recursion.
799 static BuildCompactTrieNode
*
800 compactOneNode(const TernaryNode
*node
, UBool parentEndsWord
, UStack
&nodes
, UErrorCode
&status
) {
801 if (U_FAILURE(status
)) {
804 BuildCompactTrieNode
*result
= NULL
;
805 UBool horizontal
= (node
->low
!= NULL
|| node
->high
!= NULL
);
807 BuildCompactTrieHorizontalNode
*hResult
=
808 new BuildCompactTrieHorizontalNode(parentEndsWord
, nodes
, status
);
809 if (hResult
== NULL
) {
810 status
= U_MEMORY_ALLOCATION_ERROR
;
812 if (U_SUCCESS(status
)) {
813 walkHorizontal(node
, hResult
, nodes
, status
);
818 BuildCompactTrieVerticalNode
*vResult
=
819 new BuildCompactTrieVerticalNode(parentEndsWord
, nodes
, status
);
820 if (vResult
== NULL
) {
821 status
= U_MEMORY_ALLOCATION_ERROR
;
823 if (U_SUCCESS(status
)) {
824 UBool endsWord
= FALSE
;
825 // Take up nodes until we end a word, or hit a node with < or > links
827 vResult
->addChar(node
->ch
);
828 endsWord
= (node
->flags
& kEndsWord
) != 0;
831 while(node
!= NULL
&& !endsWord
&& node
->low
== NULL
&& node
->high
== NULL
);
834 status
= U_ILLEGAL_ARGUMENT_ERROR
; // Corrupt input trie
837 vResult
->setLink((BuildCompactTrieNode
*)nodes
[1]);
841 vResult
->setLink(compactOneNode(node
, endsWord
, nodes
, status
));
849 // Walk the set of peers at the same level, to build a horizontal node.
852 static void walkHorizontal(const TernaryNode
*node
,
853 BuildCompactTrieHorizontalNode
*building
,
855 UErrorCode
&status
) {
856 while (U_SUCCESS(status
) && node
!= NULL
) {
857 if (node
->low
!= NULL
) {
858 walkHorizontal(node
->low
, building
, nodes
, status
);
860 BuildCompactTrieNode
*link
= NULL
;
861 if (node
->equal
!= NULL
) {
862 link
= compactOneNode(node
->equal
, (node
->flags
& kEndsWord
) != 0, nodes
, status
);
864 else if (node
->flags
& kEndsWord
) {
865 link
= (BuildCompactTrieNode
*)nodes
[1];
867 if (U_SUCCESS(status
) && link
!= NULL
) {
868 building
->addNode(node
->ch
, link
, status
);
870 // Tail recurse manually instead of leaving it to the compiler.
871 //if (node->high != NULL) {
872 // walkHorizontal(node->high, building, nodes, status);
880 static int32_t U_CALLCONV
881 _sortBuildNodes(const void * /*context*/, const void *voidl
, const void *voidr
) {
882 BuildCompactTrieNode
*left
= *(BuildCompactTrieNode
**)voidl
;
883 BuildCompactTrieNode
*right
= *(BuildCompactTrieNode
**)voidr
;
884 // Check for comparing a node to itself, to avoid spurious duplicates
888 // Most significant is type of node. Can never coalesce.
889 if (left
->fVertical
!= right
->fVertical
) {
890 return left
->fVertical
- right
->fVertical
;
892 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
893 if (left
->fParentEndsWord
!= right
->fParentEndsWord
) {
894 return left
->fParentEndsWord
- right
->fParentEndsWord
;
896 // Next, the string. If that differs, we can never coalesce.
897 int32_t result
= left
->fChars
.compare(right
->fChars
);
901 // We know they're both the same node type, so branch for the two cases.
902 if (left
->fVertical
) {
903 result
= ((BuildCompactTrieVerticalNode
*)left
)->fEqual
->fNodeID
904 - ((BuildCompactTrieVerticalNode
*)right
)->fEqual
->fNodeID
;
907 // We need to compare the links vectors. They should be the
908 // same size because the strings were equal.
909 // We compare the node IDs instead of the pointers, to handle
911 BuildCompactTrieHorizontalNode
*hleft
, *hright
;
912 hleft
= (BuildCompactTrieHorizontalNode
*)left
;
913 hright
= (BuildCompactTrieHorizontalNode
*)right
;
914 int32_t count
= hleft
->fLinks
.size();
915 for (int32_t i
= 0; i
< count
&& result
== 0; ++i
) {
916 result
= ((BuildCompactTrieNode
*)(hleft
->fLinks
[i
]))->fNodeID
-
917 ((BuildCompactTrieNode
*)(hright
->fLinks
[i
]))->fNodeID
;
920 // If they are equal to each other, mark them (speeds coalescing)
922 left
->fHasDuplicate
= TRUE
;
923 right
->fHasDuplicate
= TRUE
;
930 static void coalesceDuplicates(UStack
&nodes
, UErrorCode
&status
) {
931 // We sort the array of nodes to place duplicates next to each other
932 if (U_FAILURE(status
)) {
935 int32_t size
= nodes
.size();
936 void **array
= (void **)uprv_malloc(sizeof(void *)*size
);
938 status
= U_MEMORY_ALLOCATION_ERROR
;
941 (void) nodes
.toArray(array
);
943 // Now repeatedly identify duplicates until there are no more
946 #ifdef DEBUG_TRIE_DICT
950 BuildCompactTrieNode
*node
;
951 BuildCompactTrieNode
*first
= NULL
;
952 BuildCompactTrieNode
**p
;
953 BuildCompactTrieNode
**pFirst
= NULL
;
954 int32_t counter
= size
- 2;
955 // Sort the array, skipping nodes 0 and 1. Use quicksort for the first
956 // pass for speed. For the second and subsequent passes, we use stable
957 // (insertion) sort for two reasons:
958 // 1. The array is already mostly ordered, so we get better performance.
959 // 2. The way we find one and only one instance of a set of duplicates is to
960 // check that the node ID equals the array index. If we used an unstable
961 // sort for the second or later passes, it's possible that none of the
962 // duplicates would wind up with a node ID equal to its array index.
963 // The sort stability guarantees that, because as we coalesce more and
964 // more groups, the first element of the resultant group will be one of
965 // the first elements of the groups being coalesced.
966 // To use quicksort for the second and subsequent passes, we would have to
967 // find the minimum of the node numbers in a group, and set all the nodes
968 // in the group to that node number.
969 uprv_sortArray(array
+2, counter
, sizeof(void *), _sortBuildNodes
, NULL
, (passCount
> 0), &status
);
971 for (p
= (BuildCompactTrieNode
**)array
+ 2; counter
> 0; --counter
, ++p
) {
973 if (node
->fHasDuplicate
) {
978 else if (_sortBuildNodes(NULL
, pFirst
, p
) != 0) {
979 // Starting a new run of dupes
983 else if (node
->fNodeID
!= first
->fNodeID
) {
984 // Slave one to the other, note duplicate
985 node
->fNodeID
= first
->fNodeID
;
990 // This node has no dupes
996 #ifdef DEBUG_TRIE_DICT
998 fprintf(stderr
, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount
, dupes
);
1002 #ifdef DEBUG_TRIE_DICT
1003 fprintf(stderr
, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes
, passCount
);
1006 // We no longer need the temporary array, as the nodes have all been marked appropriately.
1012 static void U_CALLCONV
_deleteBuildNode(void *obj
) {
1013 delete (BuildCompactTrieNode
*) obj
;
1019 CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary
&dict
,
1020 UErrorCode
&status
) {
1021 if (U_FAILURE(status
)) {
1024 #ifdef DEBUG_TRIE_DICT
1026 struct tms previous
;
1027 (void) ::times(&previous
);
1029 UStack
nodes(_deleteBuildNode
, NULL
, status
); // Index of nodes
1031 // Add node 0, used as the NULL pointer/sentinel.
1032 nodes
.addElement((int32_t)0, status
);
1034 // Start by creating the special empty node we use to indicate that the parent
1035 // terminates a word. This must be node 1, because the builder assumes
1037 if (U_FAILURE(status
)) {
1040 BuildCompactTrieNode
*terminal
= new BuildCompactTrieNode(TRUE
, FALSE
, nodes
, status
);
1041 if (terminal
== NULL
) {
1042 status
= U_MEMORY_ALLOCATION_ERROR
;
1045 // This call does all the work of building the new trie structure. The root
1047 BuildCompactTrieNode
*root
= compactOneNode(dict
.fTrie
, FALSE
, nodes
, status
);
1048 #ifdef DEBUG_TRIE_DICT
1049 (void) ::times(&timing
);
1050 fprintf(stderr
, "Compact trie built, %d nodes, time user %f system %f\n",
1051 nodes
.size(), (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1052 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1056 // Now coalesce all duplicate nodes.
1057 coalesceDuplicates(nodes
, status
);
1058 #ifdef DEBUG_TRIE_DICT
1059 (void) ::times(&timing
);
1060 fprintf(stderr
, "Duplicates coalesced, time user %f system %f\n",
1061 (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1062 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1066 // Next, build the output trie.
1067 // First we compute all the sizes and build the node ID translation table.
1068 uint32_t totalSize
= offsetof(CompactTrieHeader
,offsets
);
1069 int32_t count
= nodes
.size();
1070 int32_t nodeCount
= 1; // The sentinel node we already have
1071 BuildCompactTrieNode
*node
;
1072 UVector32
translate(count
, status
); // Should be no growth needed after this
1073 translate
.push(0, status
); // The sentinel node
1075 if (U_FAILURE(status
)) {
1079 for (int32_t i
= 1; i
< count
; ++i
) {
1080 node
= (BuildCompactTrieNode
*)nodes
[i
];
1081 if (node
->fNodeID
== i
) {
1082 // Only one node out of each duplicate set is used
1083 if (i
>= translate
.size()) {
1084 // Logically extend the mapping table
1085 translate
.setSize(i
+1);
1087 translate
.setElementAt(nodeCount
++, i
);
1088 totalSize
+= node
->size();
1092 // Check for overflowing 16 bits worth of nodes.
1093 if (nodeCount
> 0x10000) {
1094 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1098 // Add enough room for the offsets.
1099 totalSize
+= nodeCount
*sizeof(uint32_t);
1100 #ifdef DEBUG_TRIE_DICT
1101 (void) ::times(&timing
);
1102 fprintf(stderr
, "Sizes/mapping done, time user %f system %f\n",
1103 (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1104 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1106 fprintf(stderr
, "%d nodes, %d unique, %d bytes\n", nodes
.size(), nodeCount
, totalSize
);
1108 uint8_t *bytes
= (uint8_t *)uprv_malloc(totalSize
);
1109 if (bytes
== NULL
) {
1110 status
= U_MEMORY_ALLOCATION_ERROR
;
1114 CompactTrieHeader
*header
= (CompactTrieHeader
*)bytes
;
1115 header
->size
= totalSize
;
1116 header
->nodeCount
= nodeCount
;
1117 header
->offsets
[0] = 0; // Sentinel
1118 header
->root
= translate
.elementAti(root
->fNodeID
);
1119 #ifdef DEBUG_TRIE_DICT
1120 if (header
->root
== 0) {
1121 fprintf(stderr
, "ERROR: root node %d translate to physical zero\n", root
->fNodeID
);
1124 uint32_t offset
= offsetof(CompactTrieHeader
,offsets
)+(nodeCount
*sizeof(uint32_t));
1126 // Now write the data
1127 for (int32_t i
= 1; i
< count
; ++i
) {
1128 node
= (BuildCompactTrieNode
*)nodes
[i
];
1129 if (node
->fNodeID
== i
) {
1130 header
->offsets
[nodeCount
++] = offset
;
1131 node
->write(bytes
, offset
, translate
);
1134 #ifdef DEBUG_TRIE_DICT
1135 (void) ::times(&timing
);
1136 fprintf(stderr
, "Trie built, time user %f system %f\n",
1137 (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1138 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1140 fprintf(stderr
, "Final offset is %d\n", offset
);
1142 // Collect statistics on node types and sizes
1147 size_t hItemCount
= 0;
1148 size_t vItemCount
= 0;
1149 uint32_t previousOff
= offset
;
1150 for (uint16_t i
= nodeCount
-1; i
>= 2; --i
) {
1151 const CompactTrieNode
*node
= getCompactNode(header
, i
);
1152 if (node
->flagscount
& kVerticalNode
) {
1154 vItemCount
+= (node
->flagscount
& kCountMask
);
1155 vSize
+= previousOff
-header
->offsets
[i
];
1159 hItemCount
+= (node
->flagscount
& kCountMask
);
1160 hSize
+= previousOff
-header
->offsets
[i
];
1162 previousOff
= header
->offsets
[i
];
1164 fprintf(stderr
, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount
,
1165 (double)hSize
/hCount
, (double)hItemCount
/hCount
);
1166 fprintf(stderr
, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount
,
1167 (double)vSize
/vCount
, (double)vItemCount
/vCount
);
1170 if (U_FAILURE(status
)) {
1175 header
->magic
= COMPACT_TRIE_MAGIC_1
;
1180 // Forward declaration
1181 static TernaryNode
*
1182 unpackOneNode( const CompactTrieHeader
*header
, const CompactTrieNode
*node
, UErrorCode
&status
);
1185 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
1186 static TernaryNode
*
1187 unpackHorizontalArray( const CompactTrieHeader
*header
, const CompactTrieHorizontalEntry
*array
,
1188 int low
, int high
, UErrorCode
&status
) {
1189 if (U_FAILURE(status
) || low
> high
) {
1192 int middle
= (low
+high
)/2;
1193 TernaryNode
*result
= new TernaryNode(array
[middle
].ch
);
1194 if (result
== NULL
) {
1195 status
= U_MEMORY_ALLOCATION_ERROR
;
1198 const CompactTrieNode
*equal
= getCompactNode(header
, array
[middle
].equal
);
1199 if (equal
->flagscount
& kParentEndsWord
) {
1200 result
->flags
|= kEndsWord
;
1202 result
->low
= unpackHorizontalArray(header
, array
, low
, middle
-1, status
);
1203 result
->high
= unpackHorizontalArray(header
, array
, middle
+1, high
, status
);
1204 result
->equal
= unpackOneNode(header
, equal
, status
);
1208 // Convert one compact trie node into a ternary subtrie
1209 static TernaryNode
*
1210 unpackOneNode( const CompactTrieHeader
*header
, const CompactTrieNode
*node
, UErrorCode
&status
) {
1211 int nodeCount
= (node
->flagscount
& kCountMask
);
1212 if (nodeCount
== 0 || U_FAILURE(status
)) {
1213 // Failure, or terminal node
1216 if (node
->flagscount
& kVerticalNode
) {
1217 const CompactTrieVerticalNode
*vnode
= (const CompactTrieVerticalNode
*)node
;
1218 TernaryNode
*head
= NULL
;
1219 TernaryNode
*previous
= NULL
;
1220 TernaryNode
*latest
= NULL
;
1221 for (int i
= 0; i
< nodeCount
; ++i
) {
1222 latest
= new TernaryNode(vnode
->chars
[i
]);
1223 if (latest
== NULL
) {
1224 status
= U_MEMORY_ALLOCATION_ERROR
;
1230 if (previous
!= NULL
) {
1231 previous
->equal
= latest
;
1235 if (latest
!= NULL
) {
1236 const CompactTrieNode
*equal
= getCompactNode(header
, vnode
->equal
);
1237 if (equal
->flagscount
& kParentEndsWord
) {
1238 latest
->flags
|= kEndsWord
;
1240 latest
->equal
= unpackOneNode(header
, equal
, status
);
1246 const CompactTrieHorizontalNode
*hnode
= (const CompactTrieHorizontalNode
*)node
;
1247 return unpackHorizontalArray(header
, &hnode
->entries
[0], 0, nodeCount
-1, status
);
1251 MutableTrieDictionary
*
1252 CompactTrieDictionary::cloneMutable( UErrorCode
&status
) const {
1253 MutableTrieDictionary
*result
= new MutableTrieDictionary( status
);
1254 if (result
== NULL
) {
1255 status
= U_MEMORY_ALLOCATION_ERROR
;
1258 TernaryNode
*root
= unpackOneNode(fData
, getCompactNode(fData
, fData
->root
), status
);
1259 if (U_FAILURE(status
)) {
1260 delete root
; // Clean up
1264 result
->fTrie
= root
;
1270 U_CAPI
int32_t U_EXPORT2
1271 triedict_swap(const UDataSwapper
*ds
, const void *inData
, int32_t length
, void *outData
,
1272 UErrorCode
*status
) {
1274 if (status
== NULL
|| U_FAILURE(*status
)) {
1277 if(ds
==NULL
|| inData
==NULL
|| length
<-1 || (length
>0 && outData
==NULL
)) {
1278 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
1283 // Check that the data header is for for dictionary data.
1284 // (Header contents are defined in genxxx.cpp)
1286 const UDataInfo
*pInfo
= (const UDataInfo
*)((const uint8_t *)inData
+4);
1287 if(!( pInfo
->dataFormat
[0]==0x54 && /* dataFormat="TrDc" */
1288 pInfo
->dataFormat
[1]==0x72 &&
1289 pInfo
->dataFormat
[2]==0x44 &&
1290 pInfo
->dataFormat
[3]==0x63 &&
1291 pInfo
->formatVersion
[0]==1 )) {
1292 udata_printError(ds
, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
1293 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
1294 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
1295 pInfo
->formatVersion
[0]);
1296 *status
=U_UNSUPPORTED_ERROR
;
1301 // Swap the data header. (This is the generic ICU Data Header, not the
1302 // CompactTrieHeader). This swap also conveniently gets us
1303 // the size of the ICU d.h., which lets us locate the start
1304 // of the RBBI specific data.
1306 int32_t headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, status
);
1309 // Get the CompactTrieHeader, and check that it appears to be OK.
1311 const uint8_t *inBytes
=(const uint8_t *)inData
+headerSize
;
1312 const CompactTrieHeader
*header
= (const CompactTrieHeader
*)inBytes
;
1313 if (ds
->readUInt32(header
->magic
) != COMPACT_TRIE_MAGIC_1
1314 || ds
->readUInt32(header
->size
) < sizeof(CompactTrieHeader
))
1316 udata_printError(ds
, "triedict_swap(): CompactTrieHeader is invalid.\n");
1317 *status
=U_UNSUPPORTED_ERROR
;
1322 // Prefight operation? Just return the size
1324 uint32_t totalSize
= ds
->readUInt32(header
->size
);
1325 int32_t sizeWithUData
= (int32_t)totalSize
+ headerSize
;
1327 return sizeWithUData
;
1331 // Check that length passed in is consistent with length from RBBI data header.
1333 if (length
< sizeWithUData
) {
1334 udata_printError(ds
, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
1336 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
1341 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because
1342 // we need to reference the header to locate the data, and an
1343 // inplace swap of the header leaves it unusable.
1345 uint8_t *outBytes
= (uint8_t *)outData
+ headerSize
;
1346 CompactTrieHeader
*outputHeader
= (CompactTrieHeader
*)outBytes
;
1350 // If not swapping in place, zero out the output buffer before starting.
1352 if (inBytes
!= outBytes
) {
1353 uprv_memset(outBytes
, 0, totalSize
);
1356 // We need to loop through all the nodes in the offset table, and swap each one.
1357 uint16_t nodeCount
= ds
->readUInt16(header
->nodeCount
);
1358 // Skip node 0, which should always be 0.
1359 for (int i
= 1; i
< nodeCount
; ++i
) {
1360 uint32_t nodeOff
= ds
->readUInt32(header
->offsets
[i
]);
1361 const CompactTrieNode
*inNode
= (const CompactTrieNode
*)(inBytes
+ nodeOff
);
1362 CompactTrieNode
*outNode
= (CompactTrieNode
*)(outBytes
+ nodeOff
);
1363 uint16_t flagscount
= ds
->readUInt16(inNode
->flagscount
);
1364 uint16_t itemCount
= flagscount
& kCountMask
;
1365 ds
->writeUInt16(&outNode
->flagscount
, flagscount
);
1366 if (itemCount
> 0) {
1367 if (flagscount
& kVerticalNode
) {
1368 ds
->swapArray16(ds
, inBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,chars
),
1369 itemCount
*sizeof(uint16_t),
1370 outBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,chars
), status
);
1371 uint16_t equal
= ds
->readUInt16(inBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,equal
);
1372 ds
->writeUInt16(outBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,equal
));
1375 const CompactTrieHorizontalNode
*inHNode
= (const CompactTrieHorizontalNode
*)inNode
;
1376 CompactTrieHorizontalNode
*outHNode
= (CompactTrieHorizontalNode
*)outNode
;
1377 for (int j
= 0; j
< itemCount
; ++j
) {
1378 uint16_t word
= ds
->readUInt16(inHNode
->entries
[j
].ch
);
1379 ds
->writeUInt16(&outHNode
->entries
[j
].ch
, word
);
1380 word
= ds
->readUInt16(inHNode
->entries
[j
].equal
);
1381 ds
->writeUInt16(&outHNode
->entries
[j
].equal
, word
);
1388 // All the data in all the nodes consist of 16 bit items. Swap them all at once.
1389 uint16_t nodeCount
= ds
->readUInt16(header
->nodeCount
);
1390 uint32_t nodesOff
= offsetof(CompactTrieHeader
,offsets
)+((uint32_t)nodeCount
*sizeof(uint32_t));
1391 ds
->swapArray16(ds
, inBytes
+nodesOff
, totalSize
-nodesOff
, outBytes
+nodesOff
, status
);
1394 ds
->writeUInt32(&outputHeader
->size
, totalSize
);
1395 uint32_t magic
= ds
->readUInt32(header
->magic
);
1396 ds
->writeUInt32(&outputHeader
->magic
, magic
);
1397 ds
->writeUInt16(&outputHeader
->nodeCount
, nodeCount
);
1398 uint16_t root
= ds
->readUInt16(header
->root
);
1399 ds
->writeUInt16(&outputHeader
->root
, root
);
1400 ds
->swapArray32(ds
, inBytes
+offsetof(CompactTrieHeader
,offsets
),
1401 sizeof(uint32_t)*(int32_t)nodeCount
,
1402 outBytes
+offsetof(CompactTrieHeader
,offsets
), status
);
1404 return sizeWithUData
;
1407 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */