2 **********************************************************************
3 * Copyright (c) 2002-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_BREAK_ITERATION
16 #include "unicode/unistr.h"
27 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder
*rb
, RBBINode
**rootNode
) :
30 fStatus
= fRB
->fStatus
;
31 UErrorCode status
= U_ZERO_ERROR
;
32 fDStates
= new UVector(status
);
33 if (U_FAILURE(*fStatus
)) {
36 if (U_FAILURE(status
)) {
40 if (fDStates
== NULL
) {
41 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;;
47 RBBITableBuilder::~RBBITableBuilder() {
49 for (i
=0; i
<fDStates
->size(); i
++) {
50 delete (RBBIStateDescriptor
*)fDStates
->elementAt(i
);
56 //-----------------------------------------------------------------------------
58 // RBBITableBuilder::build - This is the main function for building the DFA state transtion
59 // table from the RBBI rules parse tree.
61 //-----------------------------------------------------------------------------
62 void RBBITableBuilder::build() {
64 if (U_FAILURE(*fStatus
)) {
68 // If there were no rules, just return. This situation can easily arise
69 // for the reverse rules.
75 // Walk through the tree, replacing any references to $variables with a copy of the
76 // parse tree for the substition expression.
78 fTree
= fTree
->flattenVariables();
80 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "ftree")) {
81 RBBIDebugPuts("Parse tree after flattening variable references.");
82 fTree
->printTree(TRUE
);
87 // If the rules contained any references to {bof}
88 // add a {bof} <cat> <former root of tree> to the
89 // tree. Means that all matches must start out with the
90 // {bof} fake character.
92 if (fRB
->fSetBuilder
->sawBOF()) {
93 RBBINode
*bofTop
= new RBBINode(RBBINode::opCat
);
94 RBBINode
*bofLeaf
= new RBBINode(RBBINode::leafChar
);
95 // Delete and exit if memory allocation failed.
96 if (bofTop
== NULL
|| bofLeaf
== NULL
) {
97 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
102 bofTop
->fLeftChild
= bofLeaf
;
103 bofTop
->fRightChild
= fTree
;
104 bofLeaf
->fParent
= bofTop
;
105 bofLeaf
->fVal
= 2; // Reserved value for {bof}.
110 // Add a unique right-end marker to the expression.
111 // Appears as a cat-node, left child being the original tree,
112 // right child being the end marker.
114 RBBINode
*cn
= new RBBINode(RBBINode::opCat
);
115 // Exit if memory allocation failed.
117 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
120 cn
->fLeftChild
= fTree
;
122 cn
->fRightChild
= new RBBINode(RBBINode::endMark
);
123 // Delete and exit if memory allocation failed.
124 if (cn
->fRightChild
== NULL
) {
125 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
129 cn
->fRightChild
->fParent
= cn
;
133 // Replace all references to UnicodeSets with the tree for the equivalent
136 fTree
->flattenSets();
138 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "stree")) {
139 RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
140 fTree
->printTree(TRUE
);
146 // calculate the functions nullable, firstpos, lastpos and followpos on
147 // nodes in the parse tree.
148 // See the alogrithm description in Aho.
149 // Understanding how this works by looking at the code alone will be
150 // nearly impossible.
155 calcFollowPos(fTree
);
156 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "pos")) {
162 // For "chained" rules, modify the followPos sets
164 if (fRB
->fChainRules
) {
165 calcChainedFollowPos(fTree
);
169 // BOF (start of input) test fixup.
171 if (fRB
->fSetBuilder
->sawBOF()) {
176 // Build the DFA state transition tables.
179 flagAcceptingStates();
180 flagLookAheadStates();
184 // Update the global table of rule status {tag} values
185 // The rule builder has a global vector of status values that are common
186 // for all tables. Merge the ones from this table into the global set.
188 mergeRuleStatusVals();
190 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "states")) {printStates();};
195 //-----------------------------------------------------------------------------
197 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
199 //-----------------------------------------------------------------------------
200 void RBBITableBuilder::calcNullable(RBBINode
*n
) {
204 if (n
->fType
== RBBINode::setRef
||
205 n
->fType
== RBBINode::endMark
) {
206 // These are non-empty leaf node types.
207 n
->fNullable
= FALSE
;
211 if (n
->fType
== RBBINode::lookAhead
|| n
->fType
== RBBINode::tag
) {
212 // Lookahead marker node. It's a leaf, so no recursion on children.
213 // It's nullable because it does not match any literal text from the input stream.
219 // The node is not a leaf.
220 // Calculate nullable on its children.
221 calcNullable(n
->fLeftChild
);
222 calcNullable(n
->fRightChild
);
224 // Apply functions from table 3.40 in Aho
225 if (n
->fType
== RBBINode::opOr
) {
226 n
->fNullable
= n
->fLeftChild
->fNullable
|| n
->fRightChild
->fNullable
;
228 else if (n
->fType
== RBBINode::opCat
) {
229 n
->fNullable
= n
->fLeftChild
->fNullable
&& n
->fRightChild
->fNullable
;
231 else if (n
->fType
== RBBINode::opStar
|| n
->fType
== RBBINode::opQuestion
) {
235 n
->fNullable
= FALSE
;
242 //-----------------------------------------------------------------------------
244 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
246 //-----------------------------------------------------------------------------
247 void RBBITableBuilder::calcFirstPos(RBBINode
*n
) {
251 if (n
->fType
== RBBINode::leafChar
||
252 n
->fType
== RBBINode::endMark
||
253 n
->fType
== RBBINode::lookAhead
||
254 n
->fType
== RBBINode::tag
) {
255 // These are non-empty leaf node types.
256 // Note: In order to maintain the sort invariant on the set,
257 // this function should only be called on a node whose set is
258 // empty to start with.
259 n
->fFirstPosSet
->addElement(n
, *fStatus
);
263 // The node is not a leaf.
264 // Calculate firstPos on its children.
265 calcFirstPos(n
->fLeftChild
);
266 calcFirstPos(n
->fRightChild
);
268 // Apply functions from table 3.40 in Aho
269 if (n
->fType
== RBBINode::opOr
) {
270 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
271 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
273 else if (n
->fType
== RBBINode::opCat
) {
274 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
275 if (n
->fLeftChild
->fNullable
) {
276 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
279 else if (n
->fType
== RBBINode::opStar
||
280 n
->fType
== RBBINode::opQuestion
||
281 n
->fType
== RBBINode::opPlus
) {
282 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
288 //-----------------------------------------------------------------------------
290 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
292 //-----------------------------------------------------------------------------
293 void RBBITableBuilder::calcLastPos(RBBINode
*n
) {
297 if (n
->fType
== RBBINode::leafChar
||
298 n
->fType
== RBBINode::endMark
||
299 n
->fType
== RBBINode::lookAhead
||
300 n
->fType
== RBBINode::tag
) {
301 // These are non-empty leaf node types.
302 // Note: In order to maintain the sort invariant on the set,
303 // this function should only be called on a node whose set is
304 // empty to start with.
305 n
->fLastPosSet
->addElement(n
, *fStatus
);
309 // The node is not a leaf.
310 // Calculate lastPos on its children.
311 calcLastPos(n
->fLeftChild
);
312 calcLastPos(n
->fRightChild
);
314 // Apply functions from table 3.40 in Aho
315 if (n
->fType
== RBBINode::opOr
) {
316 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
317 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
319 else if (n
->fType
== RBBINode::opCat
) {
320 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
321 if (n
->fRightChild
->fNullable
) {
322 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
325 else if (n
->fType
== RBBINode::opStar
||
326 n
->fType
== RBBINode::opQuestion
||
327 n
->fType
== RBBINode::opPlus
) {
328 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
334 //-----------------------------------------------------------------------------
336 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
338 //-----------------------------------------------------------------------------
339 void RBBITableBuilder::calcFollowPos(RBBINode
*n
) {
341 n
->fType
== RBBINode::leafChar
||
342 n
->fType
== RBBINode::endMark
) {
346 calcFollowPos(n
->fLeftChild
);
347 calcFollowPos(n
->fRightChild
);
350 if (n
->fType
== RBBINode::opCat
) {
351 RBBINode
*i
; // is 'i' in Aho's description
354 UVector
*LastPosOfLeftChild
= n
->fLeftChild
->fLastPosSet
;
356 for (ix
=0; ix
<(uint32_t)LastPosOfLeftChild
->size(); ix
++) {
357 i
= (RBBINode
*)LastPosOfLeftChild
->elementAt(ix
);
358 setAdd(i
->fFollowPos
, n
->fRightChild
->fFirstPosSet
);
363 if (n
->fType
== RBBINode::opStar
||
364 n
->fType
== RBBINode::opPlus
) {
365 RBBINode
*i
; // again, n and i are the names from Aho's description.
368 for (ix
=0; ix
<(uint32_t)n
->fLastPosSet
->size(); ix
++) {
369 i
= (RBBINode
*)n
->fLastPosSet
->elementAt(ix
);
370 setAdd(i
->fFollowPos
, n
->fFirstPosSet
);
379 //-----------------------------------------------------------------------------
381 // calcChainedFollowPos. Modify the previously calculated followPos sets
382 // to implement rule chaining. NOT described by Aho
384 //-----------------------------------------------------------------------------
385 void RBBITableBuilder::calcChainedFollowPos(RBBINode
*tree
) {
387 UVector
endMarkerNodes(*fStatus
);
388 UVector
leafNodes(*fStatus
);
391 if (U_FAILURE(*fStatus
)) {
395 // get a list of all endmarker nodes.
396 tree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
398 // get a list all leaf nodes
399 tree
->findNodes(&leafNodes
, RBBINode::leafChar
, *fStatus
);
400 if (U_FAILURE(*fStatus
)) {
404 // Get all nodes that can be the start a match, which is FirstPosition()
405 // of the portion of the tree corresponding to user-written rules.
406 // See the tree description in bofFixup().
407 RBBINode
*userRuleRoot
= tree
;
408 if (fRB
->fSetBuilder
->sawBOF()) {
409 userRuleRoot
= tree
->fLeftChild
->fRightChild
;
411 U_ASSERT(userRuleRoot
!= NULL
);
412 UVector
*matchStartNodes
= userRuleRoot
->fFirstPosSet
;
415 // Iteratate over all leaf nodes,
420 for (endNodeIx
=0; endNodeIx
<leafNodes
.size(); endNodeIx
++) {
421 RBBINode
*tNode
= (RBBINode
*)leafNodes
.elementAt(endNodeIx
);
422 RBBINode
*endNode
= NULL
;
424 // Identify leaf nodes that correspond to overall rule match positions.
425 // These include an endMarkerNode in their followPos sets.
426 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
427 if (tNode
->fFollowPos
->contains(endMarkerNodes
.elementAt(i
))) {
432 if (endNode
== NULL
) {
433 // node wasn't an end node. Try again with the next.
437 // We've got a node that can end a match.
439 // Line Break Specific hack: If this node's val correspond to the $CM char class,
440 // don't chain from it.
441 // TODO: Add rule syntax for this behavior, get specifics out of here and
442 // into the rule file.
443 if (fRB
->fLBCMNoChain
) {
444 UChar32 c
= this->fRB
->fSetBuilder
->getFirstChar(endNode
->fVal
);
446 // c == -1 occurs with sets containing only the {eof} marker string.
447 ULineBreak cLBProp
= (ULineBreak
)u_getIntPropertyValue(c
, UCHAR_LINE_BREAK
);
448 if (cLBProp
== U_LB_COMBINING_MARK
) {
455 // Now iterate over the nodes that can start a match, looking for ones
456 // with the same char class as our ending node.
458 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
->size(); startNodeIx
++) {
459 startNode
= (RBBINode
*)matchStartNodes
->elementAt(startNodeIx
);
460 if (startNode
->fType
!= RBBINode::leafChar
) {
464 if (endNode
->fVal
== startNode
->fVal
) {
465 // The end val (character class) of one possible match is the
466 // same as the start of another.
468 // Add all nodes from the followPos of the start node to the
469 // followPos set of the end node, which will have the effect of
470 // letting matches transition from a match state at endNode
471 // to the second char of a match starting with startNode.
472 setAdd(endNode
->fFollowPos
, startNode
->fFollowPos
);
479 //-----------------------------------------------------------------------------
481 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
482 // Do an swizzle similar to chaining, modifying the followPos set of
483 // the bofNode to include the followPos nodes from other {bot} nodes
484 // scattered through the tree.
486 // This function has much in common with calcChainedFollowPos().
488 //-----------------------------------------------------------------------------
489 void RBBITableBuilder::bofFixup() {
491 if (U_FAILURE(*fStatus
)) {
495 // The parse tree looks like this ...
496 // fTree root ---> <cat>
503 // We will be adding things to the followPos set of the <bofNode>
505 RBBINode
*bofNode
= fTree
->fLeftChild
->fLeftChild
;
506 U_ASSERT(bofNode
->fType
== RBBINode::leafChar
);
507 U_ASSERT(bofNode
->fVal
== 2);
509 // Get all nodes that can be the start a match of the user-written rules
510 // (excluding the fake bofNode)
511 // We want the nodes that can start a match in the
512 // part labeled "rest of tree"
514 UVector
*matchStartNodes
= fTree
->fLeftChild
->fRightChild
->fFirstPosSet
;
518 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
->size(); startNodeIx
++) {
519 startNode
= (RBBINode
*)matchStartNodes
->elementAt(startNodeIx
);
520 if (startNode
->fType
!= RBBINode::leafChar
) {
524 if (startNode
->fVal
== bofNode
->fVal
) {
525 // We found a leaf node corresponding to a {bof} that was
526 // explicitly written into a rule.
527 // Add everything from the followPos set of this node to the
528 // followPos set of the fake bofNode at the start of the tree.
530 setAdd(bofNode
->fFollowPos
, startNode
->fFollowPos
);
535 //-----------------------------------------------------------------------------
537 // buildStateTable() Determine the set of runtime DFA states and the
538 // transition tables for these states, by the algorithm
539 // of fig. 3.44 in Aho.
541 // Most of the comments are quotes of Aho's psuedo-code.
543 //-----------------------------------------------------------------------------
544 void RBBITableBuilder::buildStateTable() {
545 if (U_FAILURE(*fStatus
)) {
548 RBBIStateDescriptor
*failState
;
549 // Set it to NULL to avoid uninitialized warning
550 RBBIStateDescriptor
*initialState
= NULL
;
552 // Add a dummy state 0 - the stop state. Not from Aho.
553 int lastInputSymbol
= fRB
->fSetBuilder
->getNumCharCategories() - 1;
554 failState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
555 if (failState
== NULL
) {
556 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
557 goto ExitBuildSTdeleteall
;
559 failState
->fPositions
= new UVector(*fStatus
);
560 if (failState
->fPositions
== NULL
) {
561 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
563 if (failState
->fPositions
== NULL
|| U_FAILURE(*fStatus
)) {
564 goto ExitBuildSTdeleteall
;
566 fDStates
->addElement(failState
, *fStatus
);
567 if (U_FAILURE(*fStatus
)) {
568 goto ExitBuildSTdeleteall
;
571 // initially, the only unmarked state in Dstates is firstpos(root),
572 // where toot is the root of the syntax tree for (r)#;
573 initialState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
574 if (initialState
== NULL
) {
575 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
577 if (U_FAILURE(*fStatus
)) {
578 goto ExitBuildSTdeleteall
;
580 initialState
->fPositions
= new UVector(*fStatus
);
581 if (initialState
->fPositions
== NULL
) {
582 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
584 if (U_FAILURE(*fStatus
)) {
585 goto ExitBuildSTdeleteall
;
587 setAdd(initialState
->fPositions
, fTree
->fFirstPosSet
);
588 fDStates
->addElement(initialState
, *fStatus
);
589 if (U_FAILURE(*fStatus
)) {
590 goto ExitBuildSTdeleteall
;
593 // while there is an unmarked state T in Dstates do begin
595 RBBIStateDescriptor
*T
= NULL
;
597 for (tx
=1; tx
<fDStates
->size(); tx
++) {
598 RBBIStateDescriptor
*temp
;
599 temp
= (RBBIStateDescriptor
*)fDStates
->elementAt(tx
);
600 if (temp
->fMarked
== FALSE
) {
612 // for each input symbol a do begin
614 for (a
= 1; a
<=lastInputSymbol
; a
++) {
615 // let U be the set of positions that are in followpos(p)
616 // for some position p in T
617 // such that the symbol at position p is a;
621 for (px
=0; px
<T
->fPositions
->size(); px
++) {
622 p
= (RBBINode
*)T
->fPositions
->elementAt(px
);
623 if ((p
->fType
== RBBINode::leafChar
) && (p
->fVal
== a
)) {
625 U
= new UVector(*fStatus
);
627 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
628 goto ExitBuildSTdeleteall
;
631 setAdd(U
, p
->fFollowPos
);
635 // if U is not empty and not in DStates then
637 UBool UinDstates
= FALSE
;
639 U_ASSERT(U
->size() > 0);
641 for (ix
=0; ix
<fDStates
->size(); ix
++) {
642 RBBIStateDescriptor
*temp2
;
643 temp2
= (RBBIStateDescriptor
*)fDStates
->elementAt(ix
);
644 if (setEquals(U
, temp2
->fPositions
)) {
646 U
= temp2
->fPositions
;
653 // Add U as an unmarked state to Dstates
656 RBBIStateDescriptor
*newState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
657 if (newState
== NULL
) {
658 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
660 if (U_FAILURE(*fStatus
)) {
661 goto ExitBuildSTdeleteall
;
663 newState
->fPositions
= U
;
664 fDStates
->addElement(newState
, *fStatus
);
665 if (U_FAILURE(*fStatus
)) {
668 ux
= fDStates
->size()-1;
672 T
->fDtran
->setElementAt(ux
, a
);
677 // delete local pointers only if error occured.
678 ExitBuildSTdeleteall
:
685 //-----------------------------------------------------------------------------
687 // flagAcceptingStates Identify accepting states.
688 // First get a list of all of the end marker nodes.
689 // Then, for each state s,
690 // if s contains one of the end marker nodes in its list of tree positions then
691 // s is an accepting state.
693 //-----------------------------------------------------------------------------
694 void RBBITableBuilder::flagAcceptingStates() {
695 if (U_FAILURE(*fStatus
)) {
698 UVector
endMarkerNodes(*fStatus
);
703 if (U_FAILURE(*fStatus
)) {
707 fTree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
708 if (U_FAILURE(*fStatus
)) {
712 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
713 endMarker
= (RBBINode
*)endMarkerNodes
.elementAt(i
);
714 for (n
=0; n
<fDStates
->size(); n
++) {
715 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
716 if (sd
->fPositions
->indexOf(endMarker
) >= 0) {
717 // Any non-zero value for fAccepting means this is an accepting node.
718 // The value is what will be returned to the user as the break status.
719 // If no other value was specified, force it to -1.
721 if (sd
->fAccepting
==0) {
722 // State hasn't been marked as accepting yet. Do it now.
723 sd
->fAccepting
= endMarker
->fVal
;
724 if (sd
->fAccepting
== 0) {
728 if (sd
->fAccepting
==-1 && endMarker
->fVal
!= 0) {
729 // Both lookahead and non-lookahead accepting for this state.
730 // Favor the look-ahead. Expedient for line break.
731 // TODO: need a more elegant resolution for conflicting rules.
732 sd
->fAccepting
= endMarker
->fVal
;
735 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
737 // If the end marker node is from a look-ahead rule, set
738 // the fLookAhead field or this state also.
739 if (endMarker
->fLookAheadEnd
) {
740 // TODO: don't change value if already set?
741 // TODO: allow for more than one active look-ahead rule in engine.
742 // Make value here an index to a side array in engine?
743 sd
->fLookAhead
= sd
->fAccepting
;
751 //-----------------------------------------------------------------------------
753 // flagLookAheadStates Very similar to flagAcceptingStates, above.
755 //-----------------------------------------------------------------------------
756 void RBBITableBuilder::flagLookAheadStates() {
757 if (U_FAILURE(*fStatus
)) {
760 UVector
lookAheadNodes(*fStatus
);
761 RBBINode
*lookAheadNode
;
765 fTree
->findNodes(&lookAheadNodes
, RBBINode::lookAhead
, *fStatus
);
766 if (U_FAILURE(*fStatus
)) {
769 for (i
=0; i
<lookAheadNodes
.size(); i
++) {
770 lookAheadNode
= (RBBINode
*)lookAheadNodes
.elementAt(i
);
772 for (n
=0; n
<fDStates
->size(); n
++) {
773 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
774 if (sd
->fPositions
->indexOf(lookAheadNode
) >= 0) {
775 sd
->fLookAhead
= lookAheadNode
->fVal
;
784 //-----------------------------------------------------------------------------
788 //-----------------------------------------------------------------------------
789 void RBBITableBuilder::flagTaggedStates() {
790 if (U_FAILURE(*fStatus
)) {
793 UVector
tagNodes(*fStatus
);
798 if (U_FAILURE(*fStatus
)) {
801 fTree
->findNodes(&tagNodes
, RBBINode::tag
, *fStatus
);
802 if (U_FAILURE(*fStatus
)) {
805 for (i
=0; i
<tagNodes
.size(); i
++) { // For each tag node t (all of 'em)
806 tagNode
= (RBBINode
*)tagNodes
.elementAt(i
);
808 for (n
=0; n
<fDStates
->size(); n
++) { // For each state s (row in the state table)
809 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
810 if (sd
->fPositions
->indexOf(tagNode
) >= 0) { // if s include the tag node t
811 sortedAdd(&sd
->fTagVals
, tagNode
->fVal
);
820 //-----------------------------------------------------------------------------
822 // mergeRuleStatusVals
824 // Update the global table of rule status {tag} values
825 // The rule builder has a global vector of status values that are common
826 // for all tables. Merge the ones from this table into the global set.
828 //-----------------------------------------------------------------------------
829 void RBBITableBuilder::mergeRuleStatusVals() {
831 // The basic outline of what happens here is this...
833 // for each state in this state table
834 // if the status tag list for this state is in the global statuses list
836 // continue with the next state
838 // add the tag list for this state to the global list.
843 // Pre-set a single tag of {0} into the table.
844 // We will need this as a default, for rule sets with no explicit tagging.
845 if (fRB
->fRuleStatusVals
->size() == 0) {
846 fRB
->fRuleStatusVals
->addElement(1, *fStatus
); // Num of statuses in group
847 fRB
->fRuleStatusVals
->addElement((int32_t)0, *fStatus
); // and our single status of zero
851 for (n
=0; n
<fDStates
->size(); n
++) {
852 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
853 UVector
*thisStatesTagValues
= sd
->fTagVals
;
854 if (thisStatesTagValues
== NULL
) {
855 // No tag values are explicitly associated with this state.
856 // Set the default tag value.
861 // There are tag(s) associated with this state.
862 // fTagsIdx will be the index into the global tag list for this state's tag values.
863 // Initial value of -1 flags that we haven't got it set yet.
865 int32_t thisTagGroupStart
= 0; // indexes into the global rule status vals list
866 int32_t nextTagGroupStart
= 0;
868 // Loop runs once per group of tags in the global list
869 while (nextTagGroupStart
< fRB
->fRuleStatusVals
->size()) {
870 thisTagGroupStart
= nextTagGroupStart
;
871 nextTagGroupStart
+= fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
) + 1;
872 if (thisStatesTagValues
->size() != fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
)) {
873 // The number of tags for this state is different from
874 // the number of tags in this group from the global list.
875 // Continue with the next group from the global list.
878 // The lengths match, go ahead and compare the actual tag values
879 // between this state and the group from the global list.
880 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
881 if (thisStatesTagValues
->elementAti(i
) !=
882 fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
+ 1 + i
) ) {
888 if (i
== thisStatesTagValues
->size()) {
889 // We found a set of tag values in the global list that match
890 // those for this state. Use them.
891 sd
->fTagsIdx
= thisTagGroupStart
;
896 if (sd
->fTagsIdx
== -1) {
897 // No suitable entry in the global tag list already. Add one
898 sd
->fTagsIdx
= fRB
->fRuleStatusVals
->size();
899 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->size(), *fStatus
);
900 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
901 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->elementAti(i
), *fStatus
);
913 //-----------------------------------------------------------------------------
915 // sortedAdd Add a value to a vector of sorted values (ints).
916 // Do not replicate entries; if the value is already there, do not
918 // Lazily create the vector if it does not already exist.
920 //-----------------------------------------------------------------------------
921 void RBBITableBuilder::sortedAdd(UVector
**vector
, int32_t val
) {
924 if (*vector
== NULL
) {
925 *vector
= new UVector(*fStatus
);
927 if (*vector
== NULL
|| U_FAILURE(*fStatus
)) {
930 UVector
*vec
= *vector
;
931 int32_t vSize
= vec
->size();
932 for (i
=0; i
<vSize
; i
++) {
933 int32_t valAtI
= vec
->elementAti(i
);
935 // The value is already in the vector. Don't add it again.
942 vec
->insertElementAt(val
, i
, *fStatus
);
947 //-----------------------------------------------------------------------------
949 // setAdd Set operation on UVector
950 // dest = dest union source
951 // Elements may only appear once and must be sorted.
953 //-----------------------------------------------------------------------------
954 void RBBITableBuilder::setAdd(UVector
*dest
, UVector
*source
) {
955 int32_t destOriginalSize
= dest
->size();
956 int32_t sourceSize
= source
->size();
958 MaybeStackArray
<void *, 16> destArray
, sourceArray
; // Handle small cases without malloc
959 void **destPtr
, **sourcePtr
;
960 void **destLim
, **sourceLim
;
962 if (destOriginalSize
> destArray
.getCapacity()) {
963 if (destArray
.resize(destOriginalSize
) == NULL
) {
967 destPtr
= destArray
.getAlias();
968 destLim
= destPtr
+ destOriginalSize
; // destArray.getArrayLimit()?
970 if (sourceSize
> sourceArray
.getCapacity()) {
971 if (sourceArray
.resize(sourceSize
) == NULL
) {
975 sourcePtr
= sourceArray
.getAlias();
976 sourceLim
= sourcePtr
+ sourceSize
; // sourceArray.getArrayLimit()?
978 // Avoid multiple "get element" calls by getting the contents into arrays
979 (void) dest
->toArray(destPtr
);
980 (void) source
->toArray(sourcePtr
);
982 dest
->setSize(sourceSize
+destOriginalSize
, *fStatus
);
984 while (sourcePtr
< sourceLim
&& destPtr
< destLim
) {
985 if (*destPtr
== *sourcePtr
) {
986 dest
->setElementAt(*sourcePtr
++, di
++);
989 // This check is required for machines with segmented memory, like i5/OS.
990 // Direct pointer comparison is not recommended.
991 else if (uprv_memcmp(destPtr
, sourcePtr
, sizeof(void *)) < 0) {
992 dest
->setElementAt(*destPtr
++, di
++);
994 else { /* *sourcePtr < *destPtr */
995 dest
->setElementAt(*sourcePtr
++, di
++);
999 // At most one of these two cleanup loops will execute
1000 while (destPtr
< destLim
) {
1001 dest
->setElementAt(*destPtr
++, di
++);
1003 while (sourcePtr
< sourceLim
) {
1004 dest
->setElementAt(*sourcePtr
++, di
++);
1007 dest
->setSize(di
, *fStatus
);
1012 //-----------------------------------------------------------------------------
1014 // setEqual Set operation on UVector.
1015 // Compare for equality.
1016 // Elements must be sorted.
1018 //-----------------------------------------------------------------------------
1019 UBool
RBBITableBuilder::setEquals(UVector
*a
, UVector
*b
) {
1020 return a
->equals(*b
);
1024 //-----------------------------------------------------------------------------
1026 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1027 // for each node in the tree.
1029 //-----------------------------------------------------------------------------
1031 void RBBITableBuilder::printPosSets(RBBINode
*n
) {
1036 RBBIDebugPrintf(" Nullable: %s\n", n
->fNullable
?"TRUE":"FALSE");
1038 RBBIDebugPrintf(" firstpos: ");
1039 printSet(n
->fFirstPosSet
);
1041 RBBIDebugPrintf(" lastpos: ");
1042 printSet(n
->fLastPosSet
);
1044 RBBIDebugPrintf(" followpos: ");
1045 printSet(n
->fFollowPos
);
1047 printPosSets(n
->fLeftChild
);
1048 printPosSets(n
->fRightChild
);
1054 //-----------------------------------------------------------------------------
1056 // getTableSize() Calculate the size of the runtime form of this
1057 // state transition table.
1059 //-----------------------------------------------------------------------------
1060 int32_t RBBITableBuilder::getTableSize() const {
1066 if (fTree
== NULL
) {
1070 size
= sizeof(RBBIStateTable
) - 4; // The header, with no rows to the table.
1072 numRows
= fDStates
->size();
1073 numCols
= fRB
->fSetBuilder
->getNumCharCategories();
1075 // Note The declaration of RBBIStateTableRow is for a table of two columns.
1076 // Therefore we subtract two from numCols when determining
1077 // how much storage to add to a row for the total columns.
1078 rowSize
= sizeof(RBBIStateTableRow
) + sizeof(uint16_t)*(numCols
-2);
1079 size
+= numRows
* rowSize
;
1085 //-----------------------------------------------------------------------------
1087 // exportTable() export the state transition table in the format required
1088 // by the runtime engine. getTableSize() bytes of memory
1089 // must be available at the output address "where".
1091 //-----------------------------------------------------------------------------
1092 void RBBITableBuilder::exportTable(void *where
) {
1093 RBBIStateTable
*table
= (RBBIStateTable
*)where
;
1097 if (U_FAILURE(*fStatus
) || fTree
== NULL
) {
1101 if (fRB
->fSetBuilder
->getNumCharCategories() > 0x7fff ||
1102 fDStates
->size() > 0x7fff) {
1103 *fStatus
= U_BRK_INTERNAL_ERROR
;
1107 table
->fRowLen
= sizeof(RBBIStateTableRow
) +
1108 sizeof(uint16_t) * (fRB
->fSetBuilder
->getNumCharCategories() - 2);
1109 table
->fNumStates
= fDStates
->size();
1111 if (fRB
->fLookAheadHardBreak
) {
1112 table
->fFlags
|= RBBI_LOOKAHEAD_HARD_BREAK
;
1114 if (fRB
->fSetBuilder
->sawBOF()) {
1115 table
->fFlags
|= RBBI_BOF_REQUIRED
;
1117 table
->fReserved
= 0;
1119 for (state
=0; state
<table
->fNumStates
; state
++) {
1120 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(state
);
1121 RBBIStateTableRow
*row
= (RBBIStateTableRow
*)(table
->fTableData
+ state
*table
->fRowLen
);
1122 U_ASSERT (-32768 < sd
->fAccepting
&& sd
->fAccepting
<= 32767);
1123 U_ASSERT (-32768 < sd
->fLookAhead
&& sd
->fLookAhead
<= 32767);
1124 row
->fAccepting
= (int16_t)sd
->fAccepting
;
1125 row
->fLookAhead
= (int16_t)sd
->fLookAhead
;
1126 row
->fTagIdx
= (int16_t)sd
->fTagsIdx
;
1127 for (col
=0; col
<fRB
->fSetBuilder
->getNumCharCategories(); col
++) {
1128 row
->fNextState
[col
] = (uint16_t)sd
->fDtran
->elementAti(col
);
1135 //-----------------------------------------------------------------------------
1137 // printSet Debug function. Print the contents of a UVector
1139 //-----------------------------------------------------------------------------
1141 void RBBITableBuilder::printSet(UVector
*s
) {
1143 for (i
=0; i
<s
->size(); i
++) {
1144 void *v
= s
->elementAt(i
);
1145 RBBIDebugPrintf("%10p", v
);
1147 RBBIDebugPrintf("\n");
1152 //-----------------------------------------------------------------------------
1154 // printStates Debug Function. Dump the fully constructed state transition table.
1156 //-----------------------------------------------------------------------------
1158 void RBBITableBuilder::printStates() {
1159 int c
; // input "character"
1160 int n
; // state number
1162 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1163 RBBIDebugPrintf(" | Acc LA Tag");
1164 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1165 RBBIDebugPrintf(" %2d", c
);
1167 RBBIDebugPrintf("\n");
1168 RBBIDebugPrintf(" |---------------");
1169 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1170 RBBIDebugPrintf("---");
1172 RBBIDebugPrintf("\n");
1174 for (n
=0; n
<fDStates
->size(); n
++) {
1175 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
1176 RBBIDebugPrintf(" %3d | " , n
);
1177 RBBIDebugPrintf("%3d %3d %5d ", sd
->fAccepting
, sd
->fLookAhead
, sd
->fTagsIdx
);
1178 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1179 RBBIDebugPrintf(" %2d", sd
->fDtran
->elementAti(c
));
1181 RBBIDebugPrintf("\n");
1183 RBBIDebugPrintf("\n\n");
1189 //-----------------------------------------------------------------------------
1191 // printRuleStatusTable Debug Function. Dump the common rule status table
1193 //-----------------------------------------------------------------------------
1195 void RBBITableBuilder::printRuleStatusTable() {
1196 int32_t thisRecord
= 0;
1197 int32_t nextRecord
= 0;
1199 UVector
*tbl
= fRB
->fRuleStatusVals
;
1201 RBBIDebugPrintf("index | tags \n");
1202 RBBIDebugPrintf("-------------------\n");
1204 while (nextRecord
< tbl
->size()) {
1205 thisRecord
= nextRecord
;
1206 nextRecord
= thisRecord
+ tbl
->elementAti(thisRecord
) + 1;
1207 RBBIDebugPrintf("%4d ", thisRecord
);
1208 for (i
=thisRecord
+1; i
<nextRecord
; i
++) {
1209 RBBIDebugPrintf(" %5d", tbl
->elementAti(i
));
1211 RBBIDebugPrintf("\n");
1213 RBBIDebugPrintf("\n\n");
1218 //-----------------------------------------------------------------------------
1220 // RBBIStateDescriptor Methods. This is a very struct-like class
1221 // Most access is directly to the fields.
1223 //-----------------------------------------------------------------------------
1225 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol
, UErrorCode
*fStatus
) {
1234 fDtran
= new UVector(lastInputSymbol
+1, *fStatus
);
1235 if (U_FAILURE(*fStatus
)) {
1238 if (fDtran
== NULL
) {
1239 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
1242 fDtran
->setSize(lastInputSymbol
+1, *fStatus
); // fDtran needs to be pre-sized.
1243 // It is indexed by input symbols, and will
1244 // hold the next state number for each
1249 RBBIStateDescriptor::~RBBIStateDescriptor() {
1260 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */