2 *******************************************************************************
3 * Copyright (C) 2006-2008, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_BREAK_ITERATION
13 #include "unicode/chariter.h"
14 #include "unicode/uchriter.h"
15 #include "unicode/strenum.h"
16 #include "unicode/uenum.h"
17 #include "unicode/udata.h"
24 //#define DEBUG_TRIE_DICT 1
26 #ifdef DEBUG_TRIE_DICT
27 #include <sys/times.h>
34 /*******************************************************************
38 TrieWordDictionary::TrieWordDictionary() {
41 TrieWordDictionary::~TrieWordDictionary() {
44 /*******************************************************************
45 * MutableTrieDictionary
48 // Node structure for the ternary, uncompressed trie
49 struct TernaryNode
: public UMemory
{
50 UChar ch
; // UTF-16 code unit
51 uint16_t flags
; // Flag word
52 TernaryNode
*low
; // Less-than link
53 TernaryNode
*equal
; // Equal link
54 TernaryNode
*high
; // Greater-than link
56 TernaryNode(UChar uc
);
60 enum MutableTrieNodeFlags
{
61 kEndsWord
= 0x0001 // This node marks the end of a valid word
65 TernaryNode::TernaryNode(UChar uc
) {
73 // Not inline since it's recursive
74 TernaryNode::~TernaryNode() {
80 MutableTrieDictionary::MutableTrieDictionary( UChar median
, UErrorCode
&status
) {
81 // Start the trie off with something. Having the root node already present
82 // cuts a special case out of the search/insertion functions.
83 // Making it a median character cuts the worse case for searches from
84 // 4x a balanced trie to 2x a balanced trie. It's best to choose something
85 // that starts a word that is midway in the list.
86 fTrie
= new TernaryNode(median
);
88 status
= U_MEMORY_ALLOCATION_ERROR
;
90 fIter
= utext_openUChars(NULL
, NULL
, 0, &status
);
91 if (U_SUCCESS(status
) && fIter
== NULL
) {
92 status
= U_MEMORY_ALLOCATION_ERROR
;
96 MutableTrieDictionary::MutableTrieDictionary( UErrorCode
&status
) {
98 fIter
= utext_openUChars(NULL
, NULL
, 0, &status
);
99 if (U_SUCCESS(status
) && fIter
== NULL
) {
100 status
= U_MEMORY_ALLOCATION_ERROR
;
104 MutableTrieDictionary::~MutableTrieDictionary() {
110 MutableTrieDictionary::search( UText
*text
,
115 TernaryNode
*&parent
,
116 UBool
&pMatched
) const {
117 // TODO: current implementation works in UTF-16 space
118 const TernaryNode
*up
= NULL
;
119 const TernaryNode
*p
= fTrie
;
124 UChar uc
= utext_current32(text
);
125 for (i
= 0; i
< maxLength
&& p
!= NULL
; ++i
) {
131 else if (uc
== p
->ch
) {
143 // Must be equal to get here
144 if (limit
> 0 && (p
->flags
& kEndsWord
)) {
145 lengths
[mycount
++] = i
+1;
150 uc
= utext_next32(text
);
151 uc
= utext_current32(text
);
154 // Note that there is no way to reach here with up == 0 unless
155 // maxLength is 0 coming in.
156 parent
= (TernaryNode
*)up
;
162 MutableTrieDictionary::addWord( const UChar
*word
,
164 UErrorCode
&status
) {
167 status
= U_ILLEGAL_ARGUMENT_ERROR
;
174 fIter
= utext_openUChars(fIter
, word
, length
, &status
);
177 matched
= search(fIter
, length
, NULL
, count
, 0, parent
, pMatched
);
179 while (matched
++ < length
) {
180 UChar32 uc
= utext_next32(fIter
); // TODO: supplemetary support?
181 U_ASSERT(uc
!= U_SENTINEL
);
182 TernaryNode
*newNode
= new TernaryNode(uc
);
183 if (newNode
== NULL
) {
184 status
= U_MEMORY_ALLOCATION_ERROR
;
188 parent
->equal
= newNode
;
192 if (uc
< parent
->ch
) {
193 parent
->low
= newNode
;
196 parent
->high
= newNode
;
202 parent
->flags
|= kEndsWord
;
207 MutableTrieDictionary::addWords( UEnumeration
*words
,
208 UErrorCode
&status
) {
211 while ((word
= uenum_unext(words
, &length
, &status
)) && U_SUCCESS(status
)) {
212 addWord(word
, length
, status
);
218 MutableTrieDictionary::matches( UText
*text
,
225 return search(text
, maxLength
, lengths
, count
, limit
, parent
, pMatched
);
228 // Implementation of iteration for MutableTrieDictionary
229 class MutableTrieEnumeration
: public StringEnumeration
{
231 UStack fNodeStack
; // Stack of nodes to process
232 UVector32 fBranchStack
; // Stack of which branch we are working on
233 TernaryNode
*fRoot
; // Root node
242 static UClassID U_EXPORT2
getStaticClassID(void);
243 virtual UClassID
getDynamicClassID(void) const;
245 MutableTrieEnumeration(TernaryNode
*root
, UErrorCode
&status
)
246 : fNodeStack(status
), fBranchStack(status
) {
248 fNodeStack
.push(root
, status
);
249 fBranchStack
.push(kLessThan
, status
);
253 virtual ~MutableTrieEnumeration() {
256 virtual StringEnumeration
*clone() const {
257 UErrorCode status
= U_ZERO_ERROR
;
258 return new MutableTrieEnumeration(fRoot
, status
);
261 virtual const UnicodeString
*snext(UErrorCode
&status
) {
262 if (fNodeStack
.empty() || U_FAILURE(status
)) {
265 TernaryNode
*node
= (TernaryNode
*) fNodeStack
.peek();
266 StackBranch where
= (StackBranch
) fBranchStack
.peeki();
267 while (!fNodeStack
.empty() && U_SUCCESS(status
)) {
273 if (node
->low
!= NULL
) {
274 fBranchStack
.setElementAt(kEqual
, fBranchStack
.size()-1);
275 node
= (TernaryNode
*) fNodeStack
.push(node
->low
, status
);
276 where
= (StackBranch
) fBranchStack
.push(kLessThan
, status
);
280 emit
= (node
->flags
& kEndsWord
) != 0;
281 equal
= (node
->equal
!= NULL
);
282 // If this node should be part of the next emitted string, append
283 // the UChar to the string, and make sure we pop it when we come
284 // back to this node. The character should only be in the string
285 // for as long as we're traversing the equal subtree of this node
287 unistr
.append(node
->ch
);
288 fBranchStack
.setElementAt(kGreaterThan
, fBranchStack
.size()-1);
291 node
= (TernaryNode
*) fNodeStack
.push(node
->equal
, status
);
292 where
= (StackBranch
) fBranchStack
.push(kLessThan
, status
);
301 // If this node's character is in the string, remove it.
302 if (node
->equal
!= NULL
|| (node
->flags
& kEndsWord
)) {
303 unistr
.truncate(unistr
.length()-1);
305 if (node
->high
!= NULL
) {
306 fBranchStack
.setElementAt(kDone
, fBranchStack
.size()-1);
307 node
= (TernaryNode
*) fNodeStack
.push(node
->high
, status
);
308 where
= (StackBranch
) fBranchStack
.push(kLessThan
, status
);
314 node
= (TernaryNode
*) fNodeStack
.peek();
315 where
= (StackBranch
) fBranchStack
.peeki();
324 // Very expensive, but this should never be used.
325 virtual int32_t count(UErrorCode
&status
) const {
326 MutableTrieEnumeration
counter(fRoot
, status
);
328 while (counter
.snext(status
) != NULL
&& U_SUCCESS(status
)) {
334 virtual void reset(UErrorCode
&status
) {
335 fNodeStack
.removeAllElements();
336 fBranchStack
.removeAllElements();
337 fNodeStack
.push(fRoot
, status
);
338 fBranchStack
.push(kLessThan
, status
);
343 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration
)
346 MutableTrieDictionary::openWords( UErrorCode
&status
) const {
347 if (U_FAILURE(status
)) {
350 return new MutableTrieEnumeration(fTrie
, status
);
353 /*******************************************************************
354 * CompactTrieDictionary
357 struct CompactTrieHeader
{
358 uint32_t size
; // Size of the data in bytes
359 uint32_t magic
; // Magic number (including version)
360 uint16_t nodeCount
; // Number of entries in offsets[]
361 uint16_t root
; // Node number of the root node
362 uint32_t offsets
[1]; // Offsets to nodes from start of data
365 // Note that to avoid platform-specific alignment issues, all members of the node
366 // structures should be the same size, or should contain explicit padding to
367 // natural alignment boundaries.
369 // We can't use a bitfield for the flags+count field, because the layout of those
370 // is not portable. 12 bits of count allows for up to 4096 entries in a node.
371 struct CompactTrieNode
{
372 uint16_t flagscount
; // Count of sub-entries, plus flags
375 enum CompactTrieNodeFlags
{
376 kVerticalNode
= 0x1000, // This is a vertical node
377 kParentEndsWord
= 0x2000, // The node whose equal link points to this ends a word
378 kReservedFlag1
= 0x4000,
379 kReservedFlag2
= 0x8000,
380 kCountMask
= 0x0FFF, // The count portion of flagscount
381 kFlagMask
= 0xF000 // The flags portion of flagscount
384 // The two node types are distinguished by the kVerticalNode flag.
386 struct CompactTrieHorizontalEntry
{
387 uint16_t ch
; // UChar
388 uint16_t equal
; // Equal link node index
391 // We don't use inheritance here because C++ does not guarantee that the
392 // base class comes first in memory!!
394 struct CompactTrieHorizontalNode
{
395 uint16_t flagscount
; // Count of sub-entries, plus flags
396 CompactTrieHorizontalEntry entries
[1];
399 struct CompactTrieVerticalNode
{
400 uint16_t flagscount
; // Count of sub-entries, plus flags
401 uint16_t equal
; // Equal link node index
402 uint16_t chars
[1]; // Code units
405 // {'Dic', 1}, version 1
406 #define COMPACT_TRIE_MAGIC_1 0x44696301
408 CompactTrieDictionary::CompactTrieDictionary(UDataMemory
*dataObj
,
412 fData
= (const CompactTrieHeader
*) udata_getMemory(dataObj
);
414 if (fData
->magic
!= COMPACT_TRIE_MAGIC_1
) {
415 status
= U_ILLEGAL_ARGUMENT_ERROR
;
419 CompactTrieDictionary::CompactTrieDictionary( const void *data
,
423 fData
= (const CompactTrieHeader
*) data
;
425 if (fData
->magic
!= COMPACT_TRIE_MAGIC_1
) {
426 status
= U_ILLEGAL_ARGUMENT_ERROR
;
431 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary
&dict
,
435 fData
= compactMutableTrieDictionary(dict
, status
);
436 fOwnData
= !U_FAILURE(status
);
439 CompactTrieDictionary::~CompactTrieDictionary() {
441 uprv_free((void *)fData
);
449 CompactTrieDictionary::dataSize() const {
454 CompactTrieDictionary::data() const {
458 // This function finds the address of a node for us, given its node ID
459 static inline const CompactTrieNode
*
460 getCompactNode(const CompactTrieHeader
*header
, uint16_t node
) {
461 return (const CompactTrieNode
*)((const uint8_t *)header
+ header
->offsets
[node
]);
465 CompactTrieDictionary::matches( UText
*text
,
470 // TODO: current implementation works in UTF-16 space
471 const CompactTrieNode
*node
= getCompactNode(fData
, fData
->root
);
474 UChar uc
= utext_current32(text
);
477 while (node
!= NULL
) {
478 // Check if the node we just exited ends a word
479 if (limit
> 0 && (node
->flagscount
& kParentEndsWord
)) {
480 lengths
[mycount
++] = i
;
483 // Check that we haven't exceeded the maximum number of input characters.
484 // We have to do that here rather than in the while condition so that
485 // we can check for ending a word, above.
486 if (i
>= maxLength
) {
490 int nodeCount
= (node
->flagscount
& kCountMask
);
491 if (nodeCount
== 0) {
492 // Special terminal node; return now
495 if (node
->flagscount
& kVerticalNode
) {
496 // Vertical node; check all the characters in it
497 const CompactTrieVerticalNode
*vnode
= (const CompactTrieVerticalNode
*)node
;
498 for (int j
= 0; j
< nodeCount
&& i
< maxLength
; ++j
) {
499 if (uc
!= vnode
->chars
[j
]) {
500 // We hit a non-equal character; return
504 uc
= utext_current32(text
);
507 // To get here we must have come through the whole list successfully;
508 // go on to the next node. Note that a word cannot end in the middle
509 // of a vertical node.
510 node
= getCompactNode(fData
, vnode
->equal
);
513 // Horizontal node; do binary search
514 const CompactTrieHorizontalNode
*hnode
= (const CompactTrieHorizontalNode
*)node
;
516 int high
= nodeCount
-1;
518 node
= NULL
; // If we don't find a match, we'll fall out of the loop
519 while (high
>= low
) {
520 middle
= (high
+low
)/2;
521 if (uc
== hnode
->entries
[middle
].ch
) {
522 // We hit a match; get the next node and next character
523 node
= getCompactNode(fData
, hnode
->entries
[middle
].equal
);
525 uc
= utext_current32(text
);
529 else if (uc
< hnode
->entries
[middle
].ch
) {
543 // Implementation of iteration for CompactTrieDictionary
544 class CompactTrieEnumeration
: public StringEnumeration
{
546 UVector32 fNodeStack
; // Stack of nodes to process
547 UVector32 fIndexStack
; // Stack of where in node we are
548 const CompactTrieHeader
*fHeader
; // Trie data
551 static UClassID U_EXPORT2
getStaticClassID(void);
552 virtual UClassID
getDynamicClassID(void) const;
554 CompactTrieEnumeration(const CompactTrieHeader
*header
, UErrorCode
&status
)
555 : fNodeStack(status
), fIndexStack(status
) {
557 fNodeStack
.push(header
->root
, status
);
558 fIndexStack
.push(0, status
);
562 virtual ~CompactTrieEnumeration() {
565 virtual StringEnumeration
*clone() const {
566 UErrorCode status
= U_ZERO_ERROR
;
567 return new CompactTrieEnumeration(fHeader
, status
);
570 virtual const UnicodeString
* snext(UErrorCode
&status
);
572 // Very expensive, but this should never be used.
573 virtual int32_t count(UErrorCode
&status
) const {
574 CompactTrieEnumeration
counter(fHeader
, status
);
576 while (counter
.snext(status
) != NULL
&& U_SUCCESS(status
)) {
582 virtual void reset(UErrorCode
&status
) {
583 fNodeStack
.removeAllElements();
584 fIndexStack
.removeAllElements();
585 fNodeStack
.push(fHeader
->root
, status
);
586 fIndexStack
.push(0, status
);
591 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration
)
593 const UnicodeString
*
594 CompactTrieEnumeration::snext(UErrorCode
&status
) {
595 if (fNodeStack
.empty() || U_FAILURE(status
)) {
598 const CompactTrieNode
*node
= getCompactNode(fHeader
, fNodeStack
.peeki());
599 int where
= fIndexStack
.peeki();
600 while (!fNodeStack
.empty() && U_SUCCESS(status
)) {
601 int nodeCount
= (node
->flagscount
& kCountMask
);
602 UBool goingDown
= FALSE
;
603 if (nodeCount
== 0) {
604 // Terminal node; go up immediately
607 node
= getCompactNode(fHeader
, fNodeStack
.peeki());
608 where
= fIndexStack
.peeki();
610 else if (node
->flagscount
& kVerticalNode
) {
612 const CompactTrieVerticalNode
*vnode
= (const CompactTrieVerticalNode
*)node
;
615 unistr
.append((const UChar
*)vnode
->chars
, (int32_t) nodeCount
);
616 fIndexStack
.setElementAt(1, fIndexStack
.size()-1);
617 node
= getCompactNode(fHeader
, fNodeStack
.push(vnode
->equal
, status
));
618 where
= fIndexStack
.push(0, status
);
623 unistr
.truncate(unistr
.length()-nodeCount
);
626 node
= getCompactNode(fHeader
, fNodeStack
.peeki());
627 where
= fIndexStack
.peeki();
632 const CompactTrieHorizontalNode
*hnode
= (const CompactTrieHorizontalNode
*)node
;
635 unistr
.truncate(unistr
.length()-1);
637 if (where
< nodeCount
) {
639 unistr
.append((UChar
)hnode
->entries
[where
].ch
);
640 fIndexStack
.setElementAt(where
+1, fIndexStack
.size()-1);
641 node
= getCompactNode(fHeader
, fNodeStack
.push(hnode
->entries
[where
].equal
, status
));
642 where
= fIndexStack
.push(0, status
);
649 node
= getCompactNode(fHeader
, fNodeStack
.peeki());
650 where
= fIndexStack
.peeki();
653 // Check if the parent of the node we've just gone down to ends a
654 // word. If so, return it.
655 if (goingDown
&& (node
->flagscount
& kParentEndsWord
)) {
663 CompactTrieDictionary::openWords( UErrorCode
&status
) const {
664 if (U_FAILURE(status
)) {
667 return new CompactTrieEnumeration(fData
, status
);
671 // Below here is all code related to converting a ternary trie to a compact trie
675 // Helper classes to construct the compact trie
676 class BuildCompactTrieNode
: public UMemory
{
678 UBool fParentEndsWord
;
682 UnicodeString fChars
;
685 BuildCompactTrieNode(UBool parentEndsWord
, UBool vertical
, UStack
&nodes
, UErrorCode
&status
) {
686 fParentEndsWord
= parentEndsWord
;
687 fHasDuplicate
= FALSE
;
688 fVertical
= vertical
;
689 fNodeID
= nodes
.size();
690 nodes
.push(this, status
);
693 virtual ~BuildCompactTrieNode() {
696 virtual uint32_t size() {
697 return sizeof(uint16_t);
700 virtual void write(uint8_t *bytes
, uint32_t &offset
, const UVector32
&/*translate*/) {
702 *((uint16_t *)(bytes
+offset
)) = (fChars
.length() & kCountMask
)
703 | (fVertical
? kVerticalNode
: 0) | (fParentEndsWord
? kParentEndsWord
: 0 );
704 offset
+= sizeof(uint16_t);
708 class BuildCompactTrieHorizontalNode
: public BuildCompactTrieNode
{
713 BuildCompactTrieHorizontalNode(UBool parentEndsWord
, UStack
&nodes
, UErrorCode
&status
)
714 : BuildCompactTrieNode(parentEndsWord
, FALSE
, nodes
, status
), fLinks(status
) {
717 virtual ~BuildCompactTrieHorizontalNode() {
720 virtual uint32_t size() {
721 return offsetof(CompactTrieHorizontalNode
,entries
) +
722 (fChars
.length()*sizeof(CompactTrieHorizontalEntry
));
725 virtual void write(uint8_t *bytes
, uint32_t &offset
, const UVector32
&translate
) {
726 BuildCompactTrieNode::write(bytes
, offset
, translate
);
727 int32_t count
= fChars
.length();
728 for (int32_t i
= 0; i
< count
; ++i
) {
729 CompactTrieHorizontalEntry
*entry
= (CompactTrieHorizontalEntry
*)(bytes
+offset
);
730 entry
->ch
= fChars
[i
];
731 entry
->equal
= translate
.elementAti(((BuildCompactTrieNode
*)fLinks
[i
])->fNodeID
);
732 #ifdef DEBUG_TRIE_DICT
733 if (entry
->equal
== 0) {
734 fprintf(stderr
, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
735 i
, ((BuildCompactTrieNode
*)fLinks
[i
])->fNodeID
);
738 offset
+= sizeof(CompactTrieHorizontalEntry
);
742 void addNode(UChar ch
, BuildCompactTrieNode
*link
, UErrorCode
&status
) {
744 fLinks
.push(link
, status
);
748 class BuildCompactTrieVerticalNode
: public BuildCompactTrieNode
{
750 BuildCompactTrieNode
*fEqual
;
753 BuildCompactTrieVerticalNode(UBool parentEndsWord
, UStack
&nodes
, UErrorCode
&status
)
754 : BuildCompactTrieNode(parentEndsWord
, TRUE
, nodes
, status
) {
758 virtual ~BuildCompactTrieVerticalNode() {
761 virtual uint32_t size() {
762 return offsetof(CompactTrieVerticalNode
,chars
) + (fChars
.length()*sizeof(uint16_t));
765 virtual void write(uint8_t *bytes
, uint32_t &offset
, const UVector32
&translate
) {
766 CompactTrieVerticalNode
*node
= (CompactTrieVerticalNode
*)(bytes
+offset
);
767 BuildCompactTrieNode::write(bytes
, offset
, translate
);
768 node
->equal
= translate
.elementAti(fEqual
->fNodeID
);
769 offset
+= sizeof(node
->equal
);
770 #ifdef DEBUG_TRIE_DICT
771 if (node
->equal
== 0) {
772 fprintf(stderr
, "ERROR: vertical link, logical node %d maps to physical node zero\n",
776 fChars
.extract(0, fChars
.length(), (UChar
*)node
->chars
);
777 offset
+= sizeof(uint16_t)*fChars
.length();
780 void addChar(UChar ch
) {
784 void setLink(BuildCompactTrieNode
*node
) {
789 // Forward declaration
790 static void walkHorizontal(const TernaryNode
*node
,
791 BuildCompactTrieHorizontalNode
*building
,
795 // Convert one node. Uses recursion.
797 static BuildCompactTrieNode
*
798 compactOneNode(const TernaryNode
*node
, UBool parentEndsWord
, UStack
&nodes
, UErrorCode
&status
) {
799 if (U_FAILURE(status
)) {
802 BuildCompactTrieNode
*result
= NULL
;
803 UBool horizontal
= (node
->low
!= NULL
|| node
->high
!= NULL
);
805 BuildCompactTrieHorizontalNode
*hResult
=
806 new BuildCompactTrieHorizontalNode(parentEndsWord
, nodes
, status
);
807 if (hResult
== NULL
) {
808 status
= U_MEMORY_ALLOCATION_ERROR
;
811 if (U_SUCCESS(status
)) {
812 walkHorizontal(node
, hResult
, nodes
, status
);
817 BuildCompactTrieVerticalNode
*vResult
=
818 new BuildCompactTrieVerticalNode(parentEndsWord
, nodes
, status
);
819 if (vResult
== NULL
) {
820 status
= U_MEMORY_ALLOCATION_ERROR
;
822 else if (U_SUCCESS(status
)) {
823 UBool endsWord
= FALSE
;
824 // Take up nodes until we end a word, or hit a node with < or > links
826 vResult
->addChar(node
->ch
);
827 endsWord
= (node
->flags
& kEndsWord
) != 0;
830 while(node
!= NULL
&& !endsWord
&& node
->low
== NULL
&& node
->high
== NULL
);
833 status
= U_ILLEGAL_ARGUMENT_ERROR
; // Corrupt input trie
836 vResult
->setLink((BuildCompactTrieNode
*)nodes
[1]);
840 vResult
->setLink(compactOneNode(node
, endsWord
, nodes
, status
));
848 // Walk the set of peers at the same level, to build a horizontal node.
851 static void walkHorizontal(const TernaryNode
*node
,
852 BuildCompactTrieHorizontalNode
*building
,
854 UErrorCode
&status
) {
855 while (U_SUCCESS(status
) && node
!= NULL
) {
856 if (node
->low
!= NULL
) {
857 walkHorizontal(node
->low
, building
, nodes
, status
);
859 BuildCompactTrieNode
*link
= NULL
;
860 if (node
->equal
!= NULL
) {
861 link
= compactOneNode(node
->equal
, (node
->flags
& kEndsWord
) != 0, nodes
, status
);
863 else if (node
->flags
& kEndsWord
) {
864 link
= (BuildCompactTrieNode
*)nodes
[1];
866 if (U_SUCCESS(status
) && link
!= NULL
) {
867 building
->addNode(node
->ch
, link
, status
);
869 // Tail recurse manually instead of leaving it to the compiler.
870 //if (node->high != NULL) {
871 // walkHorizontal(node->high, building, nodes, status);
880 static int32_t U_CALLCONV
881 _sortBuildNodes(const void * /*context*/, const void *voidl
, const void *voidr
) {
882 BuildCompactTrieNode
*left
= *(BuildCompactTrieNode
**)voidl
;
883 BuildCompactTrieNode
*right
= *(BuildCompactTrieNode
**)voidr
;
884 // Check for comparing a node to itself, to avoid spurious duplicates
888 // Most significant is type of node. Can never coalesce.
889 if (left
->fVertical
!= right
->fVertical
) {
890 return left
->fVertical
- right
->fVertical
;
892 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
893 if (left
->fParentEndsWord
!= right
->fParentEndsWord
) {
894 return left
->fParentEndsWord
- right
->fParentEndsWord
;
896 // Next, the string. If that differs, we can never coalesce.
897 int32_t result
= left
->fChars
.compare(right
->fChars
);
901 // We know they're both the same node type, so branch for the two cases.
902 if (left
->fVertical
) {
903 result
= ((BuildCompactTrieVerticalNode
*)left
)->fEqual
->fNodeID
904 - ((BuildCompactTrieVerticalNode
*)right
)->fEqual
->fNodeID
;
907 // We need to compare the links vectors. They should be the
908 // same size because the strings were equal.
909 // We compare the node IDs instead of the pointers, to handle
911 BuildCompactTrieHorizontalNode
*hleft
, *hright
;
912 hleft
= (BuildCompactTrieHorizontalNode
*)left
;
913 hright
= (BuildCompactTrieHorizontalNode
*)right
;
914 int32_t count
= hleft
->fLinks
.size();
915 for (int32_t i
= 0; i
< count
&& result
== 0; ++i
) {
916 result
= ((BuildCompactTrieNode
*)(hleft
->fLinks
[i
]))->fNodeID
-
917 ((BuildCompactTrieNode
*)(hright
->fLinks
[i
]))->fNodeID
;
920 // If they are equal to each other, mark them (speeds coalescing)
922 left
->fHasDuplicate
= TRUE
;
923 right
->fHasDuplicate
= TRUE
;
930 static void coalesceDuplicates(UStack
&nodes
, UErrorCode
&status
) {
931 // We sort the array of nodes to place duplicates next to each other
932 if (U_FAILURE(status
)) {
935 int32_t size
= nodes
.size();
936 void **array
= (void **)uprv_malloc(sizeof(void *)*size
);
938 status
= U_MEMORY_ALLOCATION_ERROR
;
941 (void) nodes
.toArray(array
);
943 // Now repeatedly identify duplicates until there are no more
946 #ifdef DEBUG_TRIE_DICT
950 BuildCompactTrieNode
*node
;
951 BuildCompactTrieNode
*first
= NULL
;
952 BuildCompactTrieNode
**p
;
953 BuildCompactTrieNode
**pFirst
= NULL
;
954 int32_t counter
= size
- 2;
955 // Sort the array, skipping nodes 0 and 1. Use quicksort for the first
956 // pass for speed. For the second and subsequent passes, we use stable
957 // (insertion) sort for two reasons:
958 // 1. The array is already mostly ordered, so we get better performance.
959 // 2. The way we find one and only one instance of a set of duplicates is to
960 // check that the node ID equals the array index. If we used an unstable
961 // sort for the second or later passes, it's possible that none of the
962 // duplicates would wind up with a node ID equal to its array index.
963 // The sort stability guarantees that, because as we coalesce more and
964 // more groups, the first element of the resultant group will be one of
965 // the first elements of the groups being coalesced.
966 // To use quicksort for the second and subsequent passes, we would have to
967 // find the minimum of the node numbers in a group, and set all the nodes
968 // in the group to that node number.
969 uprv_sortArray(array
+2, counter
, sizeof(void *), _sortBuildNodes
, NULL
, (passCount
> 0), &status
);
971 for (p
= (BuildCompactTrieNode
**)array
+ 2; counter
> 0; --counter
, ++p
) {
973 if (node
->fHasDuplicate
) {
978 else if (_sortBuildNodes(NULL
, pFirst
, p
) != 0) {
979 // Starting a new run of dupes
983 else if (node
->fNodeID
!= first
->fNodeID
) {
984 // Slave one to the other, note duplicate
985 node
->fNodeID
= first
->fNodeID
;
990 // This node has no dupes
996 #ifdef DEBUG_TRIE_DICT
998 fprintf(stderr
, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount
, dupes
);
1002 #ifdef DEBUG_TRIE_DICT
1003 fprintf(stderr
, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes
, passCount
);
1006 // We no longer need the temporary array, as the nodes have all been marked appropriately.
1012 static void U_CALLCONV
_deleteBuildNode(void *obj
) {
1013 delete (BuildCompactTrieNode
*) obj
;
1019 CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary
&dict
,
1020 UErrorCode
&status
) {
1021 if (U_FAILURE(status
)) {
1024 #ifdef DEBUG_TRIE_DICT
1026 struct tms previous
;
1027 (void) ::times(&previous
);
1029 UStack
nodes(_deleteBuildNode
, NULL
, status
); // Index of nodes
1031 // Add node 0, used as the NULL pointer/sentinel.
1032 nodes
.addElement((int32_t)0, status
);
1034 // Start by creating the special empty node we use to indicate that the parent
1035 // terminates a word. This must be node 1, because the builder assumes
1037 if (U_FAILURE(status
)) {
1040 BuildCompactTrieNode
*terminal
= new BuildCompactTrieNode(TRUE
, FALSE
, nodes
, status
);
1041 if (terminal
== NULL
) {
1042 status
= U_MEMORY_ALLOCATION_ERROR
;
1045 // This call does all the work of building the new trie structure. The root
1047 BuildCompactTrieNode
*root
= compactOneNode(dict
.fTrie
, FALSE
, nodes
, status
);
1048 #ifdef DEBUG_TRIE_DICT
1049 (void) ::times(&timing
);
1050 fprintf(stderr
, "Compact trie built, %d nodes, time user %f system %f\n",
1051 nodes
.size(), (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1052 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1056 // Now coalesce all duplicate nodes.
1057 coalesceDuplicates(nodes
, status
);
1058 #ifdef DEBUG_TRIE_DICT
1059 (void) ::times(&timing
);
1060 fprintf(stderr
, "Duplicates coalesced, time user %f system %f\n",
1061 (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1062 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1066 // Next, build the output trie.
1067 // First we compute all the sizes and build the node ID translation table.
1068 uint32_t totalSize
= offsetof(CompactTrieHeader
,offsets
);
1069 int32_t count
= nodes
.size();
1070 int32_t nodeCount
= 1; // The sentinel node we already have
1071 BuildCompactTrieNode
*node
;
1073 UVector32
translate(count
, status
); // Should be no growth needed after this
1074 translate
.push(0, status
); // The sentinel node
1076 if (U_FAILURE(status
)) {
1080 for (i
= 1; i
< count
; ++i
) {
1081 node
= (BuildCompactTrieNode
*)nodes
[i
];
1082 if (node
->fNodeID
== i
) {
1083 // Only one node out of each duplicate set is used
1084 if (i
>= translate
.size()) {
1085 // Logically extend the mapping table
1086 translate
.setSize(i
+1);
1088 translate
.setElementAt(nodeCount
++, i
);
1089 totalSize
+= node
->size();
1093 // Check for overflowing 16 bits worth of nodes.
1094 if (nodeCount
> 0x10000) {
1095 status
= U_ILLEGAL_ARGUMENT_ERROR
;
1099 // Add enough room for the offsets.
1100 totalSize
+= nodeCount
*sizeof(uint32_t);
1101 #ifdef DEBUG_TRIE_DICT
1102 (void) ::times(&timing
);
1103 fprintf(stderr
, "Sizes/mapping done, time user %f system %f\n",
1104 (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1105 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1107 fprintf(stderr
, "%d nodes, %d unique, %d bytes\n", nodes
.size(), nodeCount
, totalSize
);
1109 uint8_t *bytes
= (uint8_t *)uprv_malloc(totalSize
);
1110 if (bytes
== NULL
) {
1111 status
= U_MEMORY_ALLOCATION_ERROR
;
1115 CompactTrieHeader
*header
= (CompactTrieHeader
*)bytes
;
1116 header
->size
= totalSize
;
1117 header
->nodeCount
= nodeCount
;
1118 header
->offsets
[0] = 0; // Sentinel
1119 header
->root
= translate
.elementAti(root
->fNodeID
);
1120 #ifdef DEBUG_TRIE_DICT
1121 if (header
->root
== 0) {
1122 fprintf(stderr
, "ERROR: root node %d translate to physical zero\n", root
->fNodeID
);
1125 uint32_t offset
= offsetof(CompactTrieHeader
,offsets
)+(nodeCount
*sizeof(uint32_t));
1127 // Now write the data
1128 for (i
= 1; i
< count
; ++i
) {
1129 node
= (BuildCompactTrieNode
*)nodes
[i
];
1130 if (node
->fNodeID
== i
) {
1131 header
->offsets
[nodeCount
++] = offset
;
1132 node
->write(bytes
, offset
, translate
);
1135 #ifdef DEBUG_TRIE_DICT
1136 (void) ::times(&timing
);
1137 fprintf(stderr
, "Trie built, time user %f system %f\n",
1138 (double)(timing
.tms_utime
-previous
.tms_utime
)/CLK_TCK
,
1139 (double)(timing
.tms_stime
-previous
.tms_stime
)/CLK_TCK
);
1141 fprintf(stderr
, "Final offset is %d\n", offset
);
1143 // Collect statistics on node types and sizes
1148 size_t hItemCount
= 0;
1149 size_t vItemCount
= 0;
1150 uint32_t previousOff
= offset
;
1151 for (uint16_t nodeIdx
= nodeCount
-1; nodeIdx
>= 2; --nodeIdx
) {
1152 const CompactTrieNode
*node
= getCompactNode(header
, nodeIdx
);
1153 if (node
->flagscount
& kVerticalNode
) {
1155 vItemCount
+= (node
->flagscount
& kCountMask
);
1156 vSize
+= previousOff
-header
->offsets
[nodeIdx
];
1160 hItemCount
+= (node
->flagscount
& kCountMask
);
1161 hSize
+= previousOff
-header
->offsets
[nodeIdx
];
1163 previousOff
= header
->offsets
[nodeIdx
];
1165 fprintf(stderr
, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount
,
1166 (double)hSize
/hCount
, (double)hItemCount
/hCount
);
1167 fprintf(stderr
, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount
,
1168 (double)vSize
/vCount
, (double)vItemCount
/vCount
);
1171 if (U_FAILURE(status
)) {
1176 header
->magic
= COMPACT_TRIE_MAGIC_1
;
1181 // Forward declaration
1182 static TernaryNode
*
1183 unpackOneNode( const CompactTrieHeader
*header
, const CompactTrieNode
*node
, UErrorCode
&status
);
1186 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
1187 static TernaryNode
*
1188 unpackHorizontalArray( const CompactTrieHeader
*header
, const CompactTrieHorizontalEntry
*array
,
1189 int low
, int high
, UErrorCode
&status
) {
1190 if (U_FAILURE(status
) || low
> high
) {
1193 int middle
= (low
+high
)/2;
1194 TernaryNode
*result
= new TernaryNode(array
[middle
].ch
);
1195 if (result
== NULL
) {
1196 status
= U_MEMORY_ALLOCATION_ERROR
;
1199 const CompactTrieNode
*equal
= getCompactNode(header
, array
[middle
].equal
);
1200 if (equal
->flagscount
& kParentEndsWord
) {
1201 result
->flags
|= kEndsWord
;
1203 result
->low
= unpackHorizontalArray(header
, array
, low
, middle
-1, status
);
1204 result
->high
= unpackHorizontalArray(header
, array
, middle
+1, high
, status
);
1205 result
->equal
= unpackOneNode(header
, equal
, status
);
1209 // Convert one compact trie node into a ternary subtrie
1210 static TernaryNode
*
1211 unpackOneNode( const CompactTrieHeader
*header
, const CompactTrieNode
*node
, UErrorCode
&status
) {
1212 int nodeCount
= (node
->flagscount
& kCountMask
);
1213 if (nodeCount
== 0 || U_FAILURE(status
)) {
1214 // Failure, or terminal node
1217 if (node
->flagscount
& kVerticalNode
) {
1218 const CompactTrieVerticalNode
*vnode
= (const CompactTrieVerticalNode
*)node
;
1219 TernaryNode
*head
= NULL
;
1220 TernaryNode
*previous
= NULL
;
1221 TernaryNode
*latest
= NULL
;
1222 for (int i
= 0; i
< nodeCount
; ++i
) {
1223 latest
= new TernaryNode(vnode
->chars
[i
]);
1224 if (latest
== NULL
) {
1225 status
= U_MEMORY_ALLOCATION_ERROR
;
1231 if (previous
!= NULL
) {
1232 previous
->equal
= latest
;
1236 if (latest
!= NULL
) {
1237 const CompactTrieNode
*equal
= getCompactNode(header
, vnode
->equal
);
1238 if (equal
->flagscount
& kParentEndsWord
) {
1239 latest
->flags
|= kEndsWord
;
1241 latest
->equal
= unpackOneNode(header
, equal
, status
);
1247 const CompactTrieHorizontalNode
*hnode
= (const CompactTrieHorizontalNode
*)node
;
1248 return unpackHorizontalArray(header
, &hnode
->entries
[0], 0, nodeCount
-1, status
);
1252 MutableTrieDictionary
*
1253 CompactTrieDictionary::cloneMutable( UErrorCode
&status
) const {
1254 MutableTrieDictionary
*result
= new MutableTrieDictionary( status
);
1255 if (result
== NULL
) {
1256 status
= U_MEMORY_ALLOCATION_ERROR
;
1259 TernaryNode
*root
= unpackOneNode(fData
, getCompactNode(fData
, fData
->root
), status
);
1260 if (U_FAILURE(status
)) {
1261 delete root
; // Clean up
1265 result
->fTrie
= root
;
1271 U_CAPI
int32_t U_EXPORT2
1272 triedict_swap(const UDataSwapper
*ds
, const void *inData
, int32_t length
, void *outData
,
1273 UErrorCode
*status
) {
1275 if (status
== NULL
|| U_FAILURE(*status
)) {
1278 if(ds
==NULL
|| inData
==NULL
|| length
<-1 || (length
>0 && outData
==NULL
)) {
1279 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
1284 // Check that the data header is for for dictionary data.
1285 // (Header contents are defined in genxxx.cpp)
1287 const UDataInfo
*pInfo
= (const UDataInfo
*)((const uint8_t *)inData
+4);
1288 if(!( pInfo
->dataFormat
[0]==0x54 && /* dataFormat="TrDc" */
1289 pInfo
->dataFormat
[1]==0x72 &&
1290 pInfo
->dataFormat
[2]==0x44 &&
1291 pInfo
->dataFormat
[3]==0x63 &&
1292 pInfo
->formatVersion
[0]==1 )) {
1293 udata_printError(ds
, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
1294 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
1295 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
1296 pInfo
->formatVersion
[0]);
1297 *status
=U_UNSUPPORTED_ERROR
;
1302 // Swap the data header. (This is the generic ICU Data Header, not the
1303 // CompactTrieHeader). This swap also conveniently gets us
1304 // the size of the ICU d.h., which lets us locate the start
1305 // of the RBBI specific data.
1307 int32_t headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, status
);
1310 // Get the CompactTrieHeader, and check that it appears to be OK.
1312 const uint8_t *inBytes
=(const uint8_t *)inData
+headerSize
;
1313 const CompactTrieHeader
*header
= (const CompactTrieHeader
*)inBytes
;
1314 if (ds
->readUInt32(header
->magic
) != COMPACT_TRIE_MAGIC_1
1315 || ds
->readUInt32(header
->size
) < sizeof(CompactTrieHeader
))
1317 udata_printError(ds
, "triedict_swap(): CompactTrieHeader is invalid.\n");
1318 *status
=U_UNSUPPORTED_ERROR
;
1323 // Prefight operation? Just return the size
1325 uint32_t totalSize
= ds
->readUInt32(header
->size
);
1326 int32_t sizeWithUData
= (int32_t)totalSize
+ headerSize
;
1328 return sizeWithUData
;
1332 // Check that length passed in is consistent with length from RBBI data header.
1334 if (length
< sizeWithUData
) {
1335 udata_printError(ds
, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
1337 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
1342 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because
1343 // we need to reference the header to locate the data, and an
1344 // inplace swap of the header leaves it unusable.
1346 uint8_t *outBytes
= (uint8_t *)outData
+ headerSize
;
1347 CompactTrieHeader
*outputHeader
= (CompactTrieHeader
*)outBytes
;
1351 // If not swapping in place, zero out the output buffer before starting.
1353 if (inBytes
!= outBytes
) {
1354 uprv_memset(outBytes
, 0, totalSize
);
1357 // We need to loop through all the nodes in the offset table, and swap each one.
1358 uint16_t nodeCount
= ds
->readUInt16(header
->nodeCount
);
1359 // Skip node 0, which should always be 0.
1360 for (int i
= 1; i
< nodeCount
; ++i
) {
1361 uint32_t nodeOff
= ds
->readUInt32(header
->offsets
[i
]);
1362 const CompactTrieNode
*inNode
= (const CompactTrieNode
*)(inBytes
+ nodeOff
);
1363 CompactTrieNode
*outNode
= (CompactTrieNode
*)(outBytes
+ nodeOff
);
1364 uint16_t flagscount
= ds
->readUInt16(inNode
->flagscount
);
1365 uint16_t itemCount
= flagscount
& kCountMask
;
1366 ds
->writeUInt16(&outNode
->flagscount
, flagscount
);
1367 if (itemCount
> 0) {
1368 if (flagscount
& kVerticalNode
) {
1369 ds
->swapArray16(ds
, inBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,chars
),
1370 itemCount
*sizeof(uint16_t),
1371 outBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,chars
), status
);
1372 uint16_t equal
= ds
->readUInt16(inBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,equal
);
1373 ds
->writeUInt16(outBytes
+nodeOff
+offsetof(CompactTrieVerticalNode
,equal
));
1376 const CompactTrieHorizontalNode
*inHNode
= (const CompactTrieHorizontalNode
*)inNode
;
1377 CompactTrieHorizontalNode
*outHNode
= (CompactTrieHorizontalNode
*)outNode
;
1378 for (int j
= 0; j
< itemCount
; ++j
) {
1379 uint16_t word
= ds
->readUInt16(inHNode
->entries
[j
].ch
);
1380 ds
->writeUInt16(&outHNode
->entries
[j
].ch
, word
);
1381 word
= ds
->readUInt16(inHNode
->entries
[j
].equal
);
1382 ds
->writeUInt16(&outHNode
->entries
[j
].equal
, word
);
1389 // All the data in all the nodes consist of 16 bit items. Swap them all at once.
1390 uint16_t nodeCount
= ds
->readUInt16(header
->nodeCount
);
1391 uint32_t nodesOff
= offsetof(CompactTrieHeader
,offsets
)+((uint32_t)nodeCount
*sizeof(uint32_t));
1392 ds
->swapArray16(ds
, inBytes
+nodesOff
, totalSize
-nodesOff
, outBytes
+nodesOff
, status
);
1395 ds
->writeUInt32(&outputHeader
->size
, totalSize
);
1396 uint32_t magic
= ds
->readUInt32(header
->magic
);
1397 ds
->writeUInt32(&outputHeader
->magic
, magic
);
1398 ds
->writeUInt16(&outputHeader
->nodeCount
, nodeCount
);
1399 uint16_t root
= ds
->readUInt16(header
->root
);
1400 ds
->writeUInt16(&outputHeader
->root
, root
);
1401 ds
->swapArray32(ds
, inBytes
+offsetof(CompactTrieHeader
,offsets
),
1402 sizeof(uint32_t)*(int32_t)nodeCount
,
1403 outBytes
+offsetof(CompactTrieHeader
,offsets
), status
);
1405 return sizeWithUData
;
1408 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */