2 **********************************************************************
3 * Copyright (c) 2002-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_BREAK_ITERATION
16 #include "unicode/unistr.h"
27 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder
*rb
, RBBINode
**rootNode
) :
30 fStatus
= fRB
->fStatus
;
31 UErrorCode status
= U_ZERO_ERROR
;
32 fDStates
= new UVector(status
);
33 if (U_FAILURE(*fStatus
)) {
36 if (U_FAILURE(status
)) {
40 if (fDStates
== NULL
) {
41 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;;
47 RBBITableBuilder::~RBBITableBuilder() {
49 for (i
=0; i
<fDStates
->size(); i
++) {
50 delete (RBBIStateDescriptor
*)fDStates
->elementAt(i
);
56 //-----------------------------------------------------------------------------
58 // RBBITableBuilder::build - This is the main function for building the DFA state transtion
59 // table from the RBBI rules parse tree.
61 //-----------------------------------------------------------------------------
62 void RBBITableBuilder::build() {
64 if (U_FAILURE(*fStatus
)) {
68 // If there were no rules, just return. This situation can easily arise
69 // for the reverse rules.
75 // Walk through the tree, replacing any references to $variables with a copy of the
76 // parse tree for the substition expression.
78 fTree
= fTree
->flattenVariables();
80 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "ftree")) {
81 RBBIDebugPuts("\nParse tree after flattening variable references.");
82 fTree
->printTree(TRUE
);
87 // If the rules contained any references to {bof}
88 // add a {bof} <cat> <former root of tree> to the
89 // tree. Means that all matches must start out with the
90 // {bof} fake character.
92 if (fRB
->fSetBuilder
->sawBOF()) {
93 RBBINode
*bofTop
= new RBBINode(RBBINode::opCat
);
94 RBBINode
*bofLeaf
= new RBBINode(RBBINode::leafChar
);
95 // Delete and exit if memory allocation failed.
96 if (bofTop
== NULL
|| bofLeaf
== NULL
) {
97 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
102 bofTop
->fLeftChild
= bofLeaf
;
103 bofTop
->fRightChild
= fTree
;
104 bofLeaf
->fParent
= bofTop
;
105 bofLeaf
->fVal
= 2; // Reserved value for {bof}.
110 // Add a unique right-end marker to the expression.
111 // Appears as a cat-node, left child being the original tree,
112 // right child being the end marker.
114 RBBINode
*cn
= new RBBINode(RBBINode::opCat
);
115 // Exit if memory allocation failed.
117 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
120 cn
->fLeftChild
= fTree
;
122 cn
->fRightChild
= new RBBINode(RBBINode::endMark
);
123 // Delete and exit if memory allocation failed.
124 if (cn
->fRightChild
== NULL
) {
125 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
129 cn
->fRightChild
->fParent
= cn
;
133 // Replace all references to UnicodeSets with the tree for the equivalent
136 fTree
->flattenSets();
138 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "stree")) {
139 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
140 fTree
->printTree(TRUE
);
146 // calculate the functions nullable, firstpos, lastpos and followpos on
147 // nodes in the parse tree.
148 // See the alogrithm description in Aho.
149 // Understanding how this works by looking at the code alone will be
150 // nearly impossible.
155 calcFollowPos(fTree
);
156 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "pos")) {
162 // For "chained" rules, modify the followPos sets
164 if (fRB
->fChainRules
) {
165 calcChainedFollowPos(fTree
);
169 // BOF (start of input) test fixup.
171 if (fRB
->fSetBuilder
->sawBOF()) {
176 // Build the DFA state transition tables.
179 flagAcceptingStates();
180 flagLookAheadStates();
184 // Update the global table of rule status {tag} values
185 // The rule builder has a global vector of status values that are common
186 // for all tables. Merge the ones from this table into the global set.
188 mergeRuleStatusVals();
190 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "states")) {printStates();};
195 //-----------------------------------------------------------------------------
197 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
199 //-----------------------------------------------------------------------------
200 void RBBITableBuilder::calcNullable(RBBINode
*n
) {
204 if (n
->fType
== RBBINode::setRef
||
205 n
->fType
== RBBINode::endMark
) {
206 // These are non-empty leaf node types.
207 n
->fNullable
= FALSE
;
211 if (n
->fType
== RBBINode::lookAhead
|| n
->fType
== RBBINode::tag
) {
212 // Lookahead marker node. It's a leaf, so no recursion on children.
213 // It's nullable because it does not match any literal text from the input stream.
219 // The node is not a leaf.
220 // Calculate nullable on its children.
221 calcNullable(n
->fLeftChild
);
222 calcNullable(n
->fRightChild
);
224 // Apply functions from table 3.40 in Aho
225 if (n
->fType
== RBBINode::opOr
) {
226 n
->fNullable
= n
->fLeftChild
->fNullable
|| n
->fRightChild
->fNullable
;
228 else if (n
->fType
== RBBINode::opCat
) {
229 n
->fNullable
= n
->fLeftChild
->fNullable
&& n
->fRightChild
->fNullable
;
231 else if (n
->fType
== RBBINode::opStar
|| n
->fType
== RBBINode::opQuestion
) {
235 n
->fNullable
= FALSE
;
242 //-----------------------------------------------------------------------------
244 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
246 //-----------------------------------------------------------------------------
247 void RBBITableBuilder::calcFirstPos(RBBINode
*n
) {
251 if (n
->fType
== RBBINode::leafChar
||
252 n
->fType
== RBBINode::endMark
||
253 n
->fType
== RBBINode::lookAhead
||
254 n
->fType
== RBBINode::tag
) {
255 // These are non-empty leaf node types.
256 // Note: In order to maintain the sort invariant on the set,
257 // this function should only be called on a node whose set is
258 // empty to start with.
259 n
->fFirstPosSet
->addElement(n
, *fStatus
);
263 // The node is not a leaf.
264 // Calculate firstPos on its children.
265 calcFirstPos(n
->fLeftChild
);
266 calcFirstPos(n
->fRightChild
);
268 // Apply functions from table 3.40 in Aho
269 if (n
->fType
== RBBINode::opOr
) {
270 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
271 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
273 else if (n
->fType
== RBBINode::opCat
) {
274 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
275 if (n
->fLeftChild
->fNullable
) {
276 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
279 else if (n
->fType
== RBBINode::opStar
||
280 n
->fType
== RBBINode::opQuestion
||
281 n
->fType
== RBBINode::opPlus
) {
282 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
288 //-----------------------------------------------------------------------------
290 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
292 //-----------------------------------------------------------------------------
293 void RBBITableBuilder::calcLastPos(RBBINode
*n
) {
297 if (n
->fType
== RBBINode::leafChar
||
298 n
->fType
== RBBINode::endMark
||
299 n
->fType
== RBBINode::lookAhead
||
300 n
->fType
== RBBINode::tag
) {
301 // These are non-empty leaf node types.
302 // Note: In order to maintain the sort invariant on the set,
303 // this function should only be called on a node whose set is
304 // empty to start with.
305 n
->fLastPosSet
->addElement(n
, *fStatus
);
309 // The node is not a leaf.
310 // Calculate lastPos on its children.
311 calcLastPos(n
->fLeftChild
);
312 calcLastPos(n
->fRightChild
);
314 // Apply functions from table 3.40 in Aho
315 if (n
->fType
== RBBINode::opOr
) {
316 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
317 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
319 else if (n
->fType
== RBBINode::opCat
) {
320 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
321 if (n
->fRightChild
->fNullable
) {
322 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
325 else if (n
->fType
== RBBINode::opStar
||
326 n
->fType
== RBBINode::opQuestion
||
327 n
->fType
== RBBINode::opPlus
) {
328 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
334 //-----------------------------------------------------------------------------
336 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
338 //-----------------------------------------------------------------------------
339 void RBBITableBuilder::calcFollowPos(RBBINode
*n
) {
341 n
->fType
== RBBINode::leafChar
||
342 n
->fType
== RBBINode::endMark
) {
346 calcFollowPos(n
->fLeftChild
);
347 calcFollowPos(n
->fRightChild
);
350 if (n
->fType
== RBBINode::opCat
) {
351 RBBINode
*i
; // is 'i' in Aho's description
354 UVector
*LastPosOfLeftChild
= n
->fLeftChild
->fLastPosSet
;
356 for (ix
=0; ix
<(uint32_t)LastPosOfLeftChild
->size(); ix
++) {
357 i
= (RBBINode
*)LastPosOfLeftChild
->elementAt(ix
);
358 setAdd(i
->fFollowPos
, n
->fRightChild
->fFirstPosSet
);
363 if (n
->fType
== RBBINode::opStar
||
364 n
->fType
== RBBINode::opPlus
) {
365 RBBINode
*i
; // again, n and i are the names from Aho's description.
368 for (ix
=0; ix
<(uint32_t)n
->fLastPosSet
->size(); ix
++) {
369 i
= (RBBINode
*)n
->fLastPosSet
->elementAt(ix
);
370 setAdd(i
->fFollowPos
, n
->fFirstPosSet
);
378 //-----------------------------------------------------------------------------
380 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
381 // as roots of a rule to a destination vector.
383 //-----------------------------------------------------------------------------
384 void RBBITableBuilder::addRuleRootNodes(UVector
*dest
, RBBINode
*node
) {
385 if (node
== NULL
|| U_FAILURE(*fStatus
)) {
388 if (node
->fRuleRoot
) {
389 dest
->addElement(node
, *fStatus
);
390 // Note: rules cannot nest. If we found a rule start node,
391 // no child node can also be a start node.
394 addRuleRootNodes(dest
, node
->fLeftChild
);
395 addRuleRootNodes(dest
, node
->fRightChild
);
398 //-----------------------------------------------------------------------------
400 // calcChainedFollowPos. Modify the previously calculated followPos sets
401 // to implement rule chaining. NOT described by Aho
403 //-----------------------------------------------------------------------------
404 void RBBITableBuilder::calcChainedFollowPos(RBBINode
*tree
) {
406 UVector
endMarkerNodes(*fStatus
);
407 UVector
leafNodes(*fStatus
);
410 if (U_FAILURE(*fStatus
)) {
414 // get a list of all endmarker nodes.
415 tree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
417 // get a list all leaf nodes
418 tree
->findNodes(&leafNodes
, RBBINode::leafChar
, *fStatus
);
419 if (U_FAILURE(*fStatus
)) {
423 // Collect all leaf nodes that can start matches for rules
424 // with inbound chaining enabled, which is the union of the
425 // firstPosition sets from each of the rule root nodes.
427 UVector
ruleRootNodes(*fStatus
);
428 addRuleRootNodes(&ruleRootNodes
, tree
);
430 UVector
matchStartNodes(*fStatus
);
431 for (int i
=0; i
<ruleRootNodes
.size(); ++i
) {
432 RBBINode
*node
= static_cast<RBBINode
*>(ruleRootNodes
.elementAt(i
));
433 if (node
->fChainIn
) {
434 setAdd(&matchStartNodes
, node
->fFirstPosSet
);
437 if (U_FAILURE(*fStatus
)) {
444 for (endNodeIx
=0; endNodeIx
<leafNodes
.size(); endNodeIx
++) {
445 RBBINode
*tNode
= (RBBINode
*)leafNodes
.elementAt(endNodeIx
);
446 RBBINode
*endNode
= NULL
;
448 // Identify leaf nodes that correspond to overall rule match positions.
449 // These include an endMarkerNode in their followPos sets.
450 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
451 if (tNode
->fFollowPos
->contains(endMarkerNodes
.elementAt(i
))) {
456 if (endNode
== NULL
) {
457 // node wasn't an end node. Try again with the next.
461 // We've got a node that can end a match.
463 // Line Break Specific hack: If this node's val correspond to the $CM char class,
464 // don't chain from it.
465 // TODO: Add rule syntax for this behavior, get specifics out of here and
466 // into the rule file.
467 if (fRB
->fLBCMNoChain
|| fRB
->fRINoChain
) {
468 UChar32 c
= this->fRB
->fSetBuilder
->getFirstChar(endNode
->fVal
);
470 // c == -1 occurs with sets containing only the {eof} marker string.
471 if (fRB
->fLBCMNoChain
) {
472 ULineBreak cLBProp
= (ULineBreak
)u_getIntPropertyValue(c
, UCHAR_LINE_BREAK
);
473 if (cLBProp
== U_LB_COMBINING_MARK
) {
477 if (fRB
->fRINoChain
) {
478 UGraphemeClusterBreak cGBProp
= (UGraphemeClusterBreak
)u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
479 if (cGBProp
== U_GCB_REGIONAL_INDICATOR
) {
487 // Now iterate over the nodes that can start a match, looking for ones
488 // with the same char class as our ending node.
490 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
.size(); startNodeIx
++) {
491 startNode
= (RBBINode
*)matchStartNodes
.elementAt(startNodeIx
);
492 if (startNode
->fType
!= RBBINode::leafChar
) {
496 if (endNode
->fVal
== startNode
->fVal
) {
497 // The end val (character class) of one possible match is the
498 // same as the start of another.
500 // Add all nodes from the followPos of the start node to the
501 // followPos set of the end node, which will have the effect of
502 // letting matches transition from a match state at endNode
503 // to the second char of a match starting with startNode.
504 setAdd(endNode
->fFollowPos
, startNode
->fFollowPos
);
511 //-----------------------------------------------------------------------------
513 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
514 // Do an swizzle similar to chaining, modifying the followPos set of
515 // the bofNode to include the followPos nodes from other {bot} nodes
516 // scattered through the tree.
518 // This function has much in common with calcChainedFollowPos().
520 //-----------------------------------------------------------------------------
521 void RBBITableBuilder::bofFixup() {
523 if (U_FAILURE(*fStatus
)) {
527 // The parse tree looks like this ...
528 // fTree root ---> <cat>
535 // We will be adding things to the followPos set of the <bofNode>
537 RBBINode
*bofNode
= fTree
->fLeftChild
->fLeftChild
;
538 U_ASSERT(bofNode
->fType
== RBBINode::leafChar
);
539 U_ASSERT(bofNode
->fVal
== 2);
541 // Get all nodes that can be the start a match of the user-written rules
542 // (excluding the fake bofNode)
543 // We want the nodes that can start a match in the
544 // part labeled "rest of tree"
546 UVector
*matchStartNodes
= fTree
->fLeftChild
->fRightChild
->fFirstPosSet
;
550 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
->size(); startNodeIx
++) {
551 startNode
= (RBBINode
*)matchStartNodes
->elementAt(startNodeIx
);
552 if (startNode
->fType
!= RBBINode::leafChar
) {
556 if (startNode
->fVal
== bofNode
->fVal
) {
557 // We found a leaf node corresponding to a {bof} that was
558 // explicitly written into a rule.
559 // Add everything from the followPos set of this node to the
560 // followPos set of the fake bofNode at the start of the tree.
562 setAdd(bofNode
->fFollowPos
, startNode
->fFollowPos
);
567 //-----------------------------------------------------------------------------
569 // buildStateTable() Determine the set of runtime DFA states and the
570 // transition tables for these states, by the algorithm
571 // of fig. 3.44 in Aho.
573 // Most of the comments are quotes of Aho's psuedo-code.
575 //-----------------------------------------------------------------------------
576 void RBBITableBuilder::buildStateTable() {
577 if (U_FAILURE(*fStatus
)) {
580 RBBIStateDescriptor
*failState
;
581 // Set it to NULL to avoid uninitialized warning
582 RBBIStateDescriptor
*initialState
= NULL
;
584 // Add a dummy state 0 - the stop state. Not from Aho.
585 int lastInputSymbol
= fRB
->fSetBuilder
->getNumCharCategories() - 1;
586 failState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
587 if (failState
== NULL
) {
588 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
589 goto ExitBuildSTdeleteall
;
591 failState
->fPositions
= new UVector(*fStatus
);
592 if (failState
->fPositions
== NULL
) {
593 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
595 if (failState
->fPositions
== NULL
|| U_FAILURE(*fStatus
)) {
596 goto ExitBuildSTdeleteall
;
598 fDStates
->addElement(failState
, *fStatus
);
599 if (U_FAILURE(*fStatus
)) {
600 goto ExitBuildSTdeleteall
;
603 // initially, the only unmarked state in Dstates is firstpos(root),
604 // where toot is the root of the syntax tree for (r)#;
605 initialState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
606 if (initialState
== NULL
) {
607 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
609 if (U_FAILURE(*fStatus
)) {
610 goto ExitBuildSTdeleteall
;
612 initialState
->fPositions
= new UVector(*fStatus
);
613 if (initialState
->fPositions
== NULL
) {
614 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
616 if (U_FAILURE(*fStatus
)) {
617 goto ExitBuildSTdeleteall
;
619 setAdd(initialState
->fPositions
, fTree
->fFirstPosSet
);
620 fDStates
->addElement(initialState
, *fStatus
);
621 if (U_FAILURE(*fStatus
)) {
622 goto ExitBuildSTdeleteall
;
625 // while there is an unmarked state T in Dstates do begin
627 RBBIStateDescriptor
*T
= NULL
;
629 for (tx
=1; tx
<fDStates
->size(); tx
++) {
630 RBBIStateDescriptor
*temp
;
631 temp
= (RBBIStateDescriptor
*)fDStates
->elementAt(tx
);
632 if (temp
->fMarked
== FALSE
) {
644 // for each input symbol a do begin
646 for (a
= 1; a
<=lastInputSymbol
; a
++) {
647 // let U be the set of positions that are in followpos(p)
648 // for some position p in T
649 // such that the symbol at position p is a;
653 for (px
=0; px
<T
->fPositions
->size(); px
++) {
654 p
= (RBBINode
*)T
->fPositions
->elementAt(px
);
655 if ((p
->fType
== RBBINode::leafChar
) && (p
->fVal
== a
)) {
657 U
= new UVector(*fStatus
);
659 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
660 goto ExitBuildSTdeleteall
;
663 setAdd(U
, p
->fFollowPos
);
667 // if U is not empty and not in DStates then
669 UBool UinDstates
= FALSE
;
671 U_ASSERT(U
->size() > 0);
673 for (ix
=0; ix
<fDStates
->size(); ix
++) {
674 RBBIStateDescriptor
*temp2
;
675 temp2
= (RBBIStateDescriptor
*)fDStates
->elementAt(ix
);
676 if (setEquals(U
, temp2
->fPositions
)) {
678 U
= temp2
->fPositions
;
685 // Add U as an unmarked state to Dstates
688 RBBIStateDescriptor
*newState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
689 if (newState
== NULL
) {
690 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
692 if (U_FAILURE(*fStatus
)) {
693 goto ExitBuildSTdeleteall
;
695 newState
->fPositions
= U
;
696 fDStates
->addElement(newState
, *fStatus
);
697 if (U_FAILURE(*fStatus
)) {
700 ux
= fDStates
->size()-1;
704 T
->fDtran
->setElementAt(ux
, a
);
709 // delete local pointers only if error occured.
710 ExitBuildSTdeleteall
:
717 //-----------------------------------------------------------------------------
719 // flagAcceptingStates Identify accepting states.
720 // First get a list of all of the end marker nodes.
721 // Then, for each state s,
722 // if s contains one of the end marker nodes in its list of tree positions then
723 // s is an accepting state.
725 //-----------------------------------------------------------------------------
726 void RBBITableBuilder::flagAcceptingStates() {
727 if (U_FAILURE(*fStatus
)) {
730 UVector
endMarkerNodes(*fStatus
);
735 if (U_FAILURE(*fStatus
)) {
739 fTree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
740 if (U_FAILURE(*fStatus
)) {
744 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
745 endMarker
= (RBBINode
*)endMarkerNodes
.elementAt(i
);
746 for (n
=0; n
<fDStates
->size(); n
++) {
747 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
748 if (sd
->fPositions
->indexOf(endMarker
) >= 0) {
749 // Any non-zero value for fAccepting means this is an accepting node.
750 // The value is what will be returned to the user as the break status.
751 // If no other value was specified, force it to -1.
753 if (sd
->fAccepting
==0) {
754 // State hasn't been marked as accepting yet. Do it now.
755 sd
->fAccepting
= endMarker
->fVal
;
756 if (sd
->fAccepting
== 0) {
760 if (sd
->fAccepting
==-1 && endMarker
->fVal
!= 0) {
761 // Both lookahead and non-lookahead accepting for this state.
762 // Favor the look-ahead. Expedient for line break.
763 // TODO: need a more elegant resolution for conflicting rules.
764 sd
->fAccepting
= endMarker
->fVal
;
767 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
769 // If the end marker node is from a look-ahead rule, set
770 // the fLookAhead field or this state also.
771 if (endMarker
->fLookAheadEnd
) {
772 // TODO: don't change value if already set?
773 // TODO: allow for more than one active look-ahead rule in engine.
774 // Make value here an index to a side array in engine?
775 sd
->fLookAhead
= sd
->fAccepting
;
783 //-----------------------------------------------------------------------------
785 // flagLookAheadStates Very similar to flagAcceptingStates, above.
787 //-----------------------------------------------------------------------------
788 void RBBITableBuilder::flagLookAheadStates() {
789 if (U_FAILURE(*fStatus
)) {
792 UVector
lookAheadNodes(*fStatus
);
793 RBBINode
*lookAheadNode
;
797 fTree
->findNodes(&lookAheadNodes
, RBBINode::lookAhead
, *fStatus
);
798 if (U_FAILURE(*fStatus
)) {
801 for (i
=0; i
<lookAheadNodes
.size(); i
++) {
802 lookAheadNode
= (RBBINode
*)lookAheadNodes
.elementAt(i
);
804 for (n
=0; n
<fDStates
->size(); n
++) {
805 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
806 if (sd
->fPositions
->indexOf(lookAheadNode
) >= 0) {
807 sd
->fLookAhead
= lookAheadNode
->fVal
;
816 //-----------------------------------------------------------------------------
820 //-----------------------------------------------------------------------------
821 void RBBITableBuilder::flagTaggedStates() {
822 if (U_FAILURE(*fStatus
)) {
825 UVector
tagNodes(*fStatus
);
830 if (U_FAILURE(*fStatus
)) {
833 fTree
->findNodes(&tagNodes
, RBBINode::tag
, *fStatus
);
834 if (U_FAILURE(*fStatus
)) {
837 for (i
=0; i
<tagNodes
.size(); i
++) { // For each tag node t (all of 'em)
838 tagNode
= (RBBINode
*)tagNodes
.elementAt(i
);
840 for (n
=0; n
<fDStates
->size(); n
++) { // For each state s (row in the state table)
841 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
842 if (sd
->fPositions
->indexOf(tagNode
) >= 0) { // if s include the tag node t
843 sortedAdd(&sd
->fTagVals
, tagNode
->fVal
);
852 //-----------------------------------------------------------------------------
854 // mergeRuleStatusVals
856 // Update the global table of rule status {tag} values
857 // The rule builder has a global vector of status values that are common
858 // for all tables. Merge the ones from this table into the global set.
860 //-----------------------------------------------------------------------------
861 void RBBITableBuilder::mergeRuleStatusVals() {
863 // The basic outline of what happens here is this...
865 // for each state in this state table
866 // if the status tag list for this state is in the global statuses list
868 // continue with the next state
870 // add the tag list for this state to the global list.
875 // Pre-set a single tag of {0} into the table.
876 // We will need this as a default, for rule sets with no explicit tagging.
877 if (fRB
->fRuleStatusVals
->size() == 0) {
878 fRB
->fRuleStatusVals
->addElement(1, *fStatus
); // Num of statuses in group
879 fRB
->fRuleStatusVals
->addElement((int32_t)0, *fStatus
); // and our single status of zero
883 for (n
=0; n
<fDStates
->size(); n
++) {
884 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
885 UVector
*thisStatesTagValues
= sd
->fTagVals
;
886 if (thisStatesTagValues
== NULL
) {
887 // No tag values are explicitly associated with this state.
888 // Set the default tag value.
893 // There are tag(s) associated with this state.
894 // fTagsIdx will be the index into the global tag list for this state's tag values.
895 // Initial value of -1 flags that we haven't got it set yet.
897 int32_t thisTagGroupStart
= 0; // indexes into the global rule status vals list
898 int32_t nextTagGroupStart
= 0;
900 // Loop runs once per group of tags in the global list
901 while (nextTagGroupStart
< fRB
->fRuleStatusVals
->size()) {
902 thisTagGroupStart
= nextTagGroupStart
;
903 nextTagGroupStart
+= fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
) + 1;
904 if (thisStatesTagValues
->size() != fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
)) {
905 // The number of tags for this state is different from
906 // the number of tags in this group from the global list.
907 // Continue with the next group from the global list.
910 // The lengths match, go ahead and compare the actual tag values
911 // between this state and the group from the global list.
912 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
913 if (thisStatesTagValues
->elementAti(i
) !=
914 fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
+ 1 + i
) ) {
920 if (i
== thisStatesTagValues
->size()) {
921 // We found a set of tag values in the global list that match
922 // those for this state. Use them.
923 sd
->fTagsIdx
= thisTagGroupStart
;
928 if (sd
->fTagsIdx
== -1) {
929 // No suitable entry in the global tag list already. Add one
930 sd
->fTagsIdx
= fRB
->fRuleStatusVals
->size();
931 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->size(), *fStatus
);
932 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
933 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->elementAti(i
), *fStatus
);
945 //-----------------------------------------------------------------------------
947 // sortedAdd Add a value to a vector of sorted values (ints).
948 // Do not replicate entries; if the value is already there, do not
950 // Lazily create the vector if it does not already exist.
952 //-----------------------------------------------------------------------------
953 void RBBITableBuilder::sortedAdd(UVector
**vector
, int32_t val
) {
956 if (*vector
== NULL
) {
957 *vector
= new UVector(*fStatus
);
959 if (*vector
== NULL
|| U_FAILURE(*fStatus
)) {
962 UVector
*vec
= *vector
;
963 int32_t vSize
= vec
->size();
964 for (i
=0; i
<vSize
; i
++) {
965 int32_t valAtI
= vec
->elementAti(i
);
967 // The value is already in the vector. Don't add it again.
974 vec
->insertElementAt(val
, i
, *fStatus
);
979 //-----------------------------------------------------------------------------
981 // setAdd Set operation on UVector
982 // dest = dest union source
983 // Elements may only appear once and must be sorted.
985 //-----------------------------------------------------------------------------
986 void RBBITableBuilder::setAdd(UVector
*dest
, UVector
*source
) {
987 int32_t destOriginalSize
= dest
->size();
988 int32_t sourceSize
= source
->size();
990 MaybeStackArray
<void *, 16> destArray
, sourceArray
; // Handle small cases without malloc
991 void **destPtr
, **sourcePtr
;
992 void **destLim
, **sourceLim
;
994 if (destOriginalSize
> destArray
.getCapacity()) {
995 if (destArray
.resize(destOriginalSize
) == NULL
) {
999 destPtr
= destArray
.getAlias();
1000 destLim
= destPtr
+ destOriginalSize
; // destArray.getArrayLimit()?
1002 if (sourceSize
> sourceArray
.getCapacity()) {
1003 if (sourceArray
.resize(sourceSize
) == NULL
) {
1007 sourcePtr
= sourceArray
.getAlias();
1008 sourceLim
= sourcePtr
+ sourceSize
; // sourceArray.getArrayLimit()?
1010 // Avoid multiple "get element" calls by getting the contents into arrays
1011 (void) dest
->toArray(destPtr
);
1012 (void) source
->toArray(sourcePtr
);
1014 dest
->setSize(sourceSize
+destOriginalSize
, *fStatus
);
1016 while (sourcePtr
< sourceLim
&& destPtr
< destLim
) {
1017 if (*destPtr
== *sourcePtr
) {
1018 dest
->setElementAt(*sourcePtr
++, di
++);
1021 // This check is required for machines with segmented memory, like i5/OS.
1022 // Direct pointer comparison is not recommended.
1023 else if (uprv_memcmp(destPtr
, sourcePtr
, sizeof(void *)) < 0) {
1024 dest
->setElementAt(*destPtr
++, di
++);
1026 else { /* *sourcePtr < *destPtr */
1027 dest
->setElementAt(*sourcePtr
++, di
++);
1031 // At most one of these two cleanup loops will execute
1032 while (destPtr
< destLim
) {
1033 dest
->setElementAt(*destPtr
++, di
++);
1035 while (sourcePtr
< sourceLim
) {
1036 dest
->setElementAt(*sourcePtr
++, di
++);
1039 dest
->setSize(di
, *fStatus
);
1044 //-----------------------------------------------------------------------------
1046 // setEqual Set operation on UVector.
1047 // Compare for equality.
1048 // Elements must be sorted.
1050 //-----------------------------------------------------------------------------
1051 UBool
RBBITableBuilder::setEquals(UVector
*a
, UVector
*b
) {
1052 return a
->equals(*b
);
1056 //-----------------------------------------------------------------------------
1058 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1059 // for each node in the tree.
1061 //-----------------------------------------------------------------------------
1063 void RBBITableBuilder::printPosSets(RBBINode
*n
) {
1068 RBBINode::printNodeHeader();
1070 RBBIDebugPrintf(" Nullable: %s\n", n
->fNullable
?"TRUE":"FALSE");
1072 RBBIDebugPrintf(" firstpos: ");
1073 printSet(n
->fFirstPosSet
);
1075 RBBIDebugPrintf(" lastpos: ");
1076 printSet(n
->fLastPosSet
);
1078 RBBIDebugPrintf(" followpos: ");
1079 printSet(n
->fFollowPos
);
1081 printPosSets(n
->fLeftChild
);
1082 printPosSets(n
->fRightChild
);
1088 //-----------------------------------------------------------------------------
1090 // getTableSize() Calculate the size of the runtime form of this
1091 // state transition table.
1093 //-----------------------------------------------------------------------------
1094 int32_t RBBITableBuilder::getTableSize() const {
1100 if (fTree
== NULL
) {
1104 size
= sizeof(RBBIStateTable
) - 4; // The header, with no rows to the table.
1106 numRows
= fDStates
->size();
1107 numCols
= fRB
->fSetBuilder
->getNumCharCategories();
1109 // Note The declaration of RBBIStateTableRow is for a table of two columns.
1110 // Therefore we subtract two from numCols when determining
1111 // how much storage to add to a row for the total columns.
1112 rowSize
= sizeof(RBBIStateTableRow
) + sizeof(uint16_t)*(numCols
-2);
1113 size
+= numRows
* rowSize
;
1119 //-----------------------------------------------------------------------------
1121 // exportTable() export the state transition table in the format required
1122 // by the runtime engine. getTableSize() bytes of memory
1123 // must be available at the output address "where".
1125 //-----------------------------------------------------------------------------
1126 void RBBITableBuilder::exportTable(void *where
) {
1127 RBBIStateTable
*table
= (RBBIStateTable
*)where
;
1131 if (U_FAILURE(*fStatus
) || fTree
== NULL
) {
1135 if (fRB
->fSetBuilder
->getNumCharCategories() > 0x7fff ||
1136 fDStates
->size() > 0x7fff) {
1137 *fStatus
= U_BRK_INTERNAL_ERROR
;
1141 table
->fRowLen
= sizeof(RBBIStateTableRow
) +
1142 sizeof(uint16_t) * (fRB
->fSetBuilder
->getNumCharCategories() - 2);
1143 table
->fNumStates
= fDStates
->size();
1145 if (fRB
->fLookAheadHardBreak
) {
1146 table
->fFlags
|= RBBI_LOOKAHEAD_HARD_BREAK
;
1148 if (fRB
->fSetBuilder
->sawBOF()) {
1149 table
->fFlags
|= RBBI_BOF_REQUIRED
;
1151 table
->fReserved
= 0;
1153 for (state
=0; state
<table
->fNumStates
; state
++) {
1154 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(state
);
1155 RBBIStateTableRow
*row
= (RBBIStateTableRow
*)(table
->fTableData
+ state
*table
->fRowLen
);
1156 U_ASSERT (-32768 < sd
->fAccepting
&& sd
->fAccepting
<= 32767);
1157 U_ASSERT (-32768 < sd
->fLookAhead
&& sd
->fLookAhead
<= 32767);
1158 row
->fAccepting
= (int16_t)sd
->fAccepting
;
1159 row
->fLookAhead
= (int16_t)sd
->fLookAhead
;
1160 row
->fTagIdx
= (int16_t)sd
->fTagsIdx
;
1161 for (col
=0; col
<fRB
->fSetBuilder
->getNumCharCategories(); col
++) {
1162 row
->fNextState
[col
] = (uint16_t)sd
->fDtran
->elementAti(col
);
1169 //-----------------------------------------------------------------------------
1171 // printSet Debug function. Print the contents of a UVector
1173 //-----------------------------------------------------------------------------
1175 void RBBITableBuilder::printSet(UVector
*s
) {
1177 for (i
=0; i
<s
->size(); i
++) {
1178 const RBBINode
*v
= static_cast<const RBBINode
*>(s
->elementAt(i
));
1179 RBBIDebugPrintf("%5d", v
==NULL
? -1 : v
->fSerialNum
);
1181 RBBIDebugPrintf("\n");
1186 //-----------------------------------------------------------------------------
1188 // printStates Debug Function. Dump the fully constructed state transition table.
1190 //-----------------------------------------------------------------------------
1192 void RBBITableBuilder::printStates() {
1193 int c
; // input "character"
1194 int n
; // state number
1196 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1197 RBBIDebugPrintf(" | Acc LA Tag");
1198 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1199 RBBIDebugPrintf(" %2d", c
);
1201 RBBIDebugPrintf("\n");
1202 RBBIDebugPrintf(" |---------------");
1203 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1204 RBBIDebugPrintf("---");
1206 RBBIDebugPrintf("\n");
1208 for (n
=0; n
<fDStates
->size(); n
++) {
1209 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
1210 RBBIDebugPrintf(" %3d | " , n
);
1211 RBBIDebugPrintf("%3d %3d %5d ", sd
->fAccepting
, sd
->fLookAhead
, sd
->fTagsIdx
);
1212 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1213 RBBIDebugPrintf(" %2d", sd
->fDtran
->elementAti(c
));
1215 RBBIDebugPrintf("\n");
1217 RBBIDebugPrintf("\n\n");
1223 //-----------------------------------------------------------------------------
1225 // printRuleStatusTable Debug Function. Dump the common rule status table
1227 //-----------------------------------------------------------------------------
1229 void RBBITableBuilder::printRuleStatusTable() {
1230 int32_t thisRecord
= 0;
1231 int32_t nextRecord
= 0;
1233 UVector
*tbl
= fRB
->fRuleStatusVals
;
1235 RBBIDebugPrintf("index | tags \n");
1236 RBBIDebugPrintf("-------------------\n");
1238 while (nextRecord
< tbl
->size()) {
1239 thisRecord
= nextRecord
;
1240 nextRecord
= thisRecord
+ tbl
->elementAti(thisRecord
) + 1;
1241 RBBIDebugPrintf("%4d ", thisRecord
);
1242 for (i
=thisRecord
+1; i
<nextRecord
; i
++) {
1243 RBBIDebugPrintf(" %5d", tbl
->elementAti(i
));
1245 RBBIDebugPrintf("\n");
1247 RBBIDebugPrintf("\n\n");
1252 //-----------------------------------------------------------------------------
1254 // RBBIStateDescriptor Methods. This is a very struct-like class
1255 // Most access is directly to the fields.
1257 //-----------------------------------------------------------------------------
1259 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol
, UErrorCode
*fStatus
) {
1268 fDtran
= new UVector(lastInputSymbol
+1, *fStatus
);
1269 if (U_FAILURE(*fStatus
)) {
1272 if (fDtran
== NULL
) {
1273 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
1276 fDtran
->setSize(lastInputSymbol
+1, *fStatus
); // fDtran needs to be pre-sized.
1277 // It is indexed by input symbols, and will
1278 // hold the next state number for each
1283 RBBIStateDescriptor::~RBBIStateDescriptor() {
1294 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */