1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_BREAK_ITERATION
15 #include "unicode/unistr.h"
16 #include "rbbitblb57.h"
18 #include "rbbisetb57.h"
19 #include "rbbidata57.h"
26 RBBITableBuilder57::RBBITableBuilder57(RBBIRuleBuilder57
*rb
, RBBINode
**rootNode
) :
29 fStatus
= fRB
->fStatus
;
30 UErrorCode status
= U_ZERO_ERROR
;
31 fDStates
= new UVector(status
);
32 if (U_FAILURE(*fStatus
)) {
35 if (U_FAILURE(status
)) {
39 if (fDStates
== NULL
) {
40 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;;
46 RBBITableBuilder57::~RBBITableBuilder57() {
48 for (i
=0; i
<fDStates
->size(); i
++) {
49 delete (RBBIStateDescriptor
*)fDStates
->elementAt(i
);
55 //-----------------------------------------------------------------------------
57 // RBBITableBuilder57::build - This is the main function for building the DFA state transtion
58 // table from the RBBI rules parse tree.
60 //-----------------------------------------------------------------------------
61 void RBBITableBuilder57::build() {
63 if (U_FAILURE(*fStatus
)) {
67 // If there were no rules, just return. This situation can easily arise
68 // for the reverse rules.
74 // Walk through the tree, replacing any references to $variables with a copy of the
75 // parse tree for the substition expression.
77 fTree
= fTree
->flattenVariables();
79 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "ftree")) {
80 RBBIDebugPuts("\nParse tree after flattening variable references.");
81 fTree
->printTree(TRUE
);
86 // If the rules contained any references to {bof}
87 // add a {bof} <cat> <former root of tree> to the
88 // tree. Means that all matches must start out with the
89 // {bof} fake character.
91 if (fRB
->fSetBuilder
->sawBOF()) {
92 RBBINode
*bofTop
= new RBBINode(RBBINode::opCat
);
93 RBBINode
*bofLeaf
= new RBBINode(RBBINode::leafChar
);
94 // Delete and exit if memory allocation failed.
95 if (bofTop
== NULL
|| bofLeaf
== NULL
) {
96 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
101 bofTop
->fLeftChild
= bofLeaf
;
102 bofTop
->fRightChild
= fTree
;
103 bofLeaf
->fParent
= bofTop
;
104 bofLeaf
->fVal
= 2; // Reserved value for {bof}.
109 // Add a unique right-end marker to the expression.
110 // Appears as a cat-node, left child being the original tree,
111 // right child being the end marker.
113 RBBINode
*cn
= new RBBINode(RBBINode::opCat
);
114 // Exit if memory allocation failed.
116 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
119 cn
->fLeftChild
= fTree
;
121 cn
->fRightChild
= new RBBINode(RBBINode::endMark
);
122 // Delete and exit if memory allocation failed.
123 if (cn
->fRightChild
== NULL
) {
124 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
128 cn
->fRightChild
->fParent
= cn
;
132 // Replace all references to UnicodeSets with the tree for the equivalent
135 fTree
->flattenSets();
137 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "stree")) {
138 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
139 fTree
->printTree(TRUE
);
145 // calculate the functions nullable, firstpos, lastpos and followpos on
146 // nodes in the parse tree.
147 // See the alogrithm description in Aho.
148 // Understanding how this works by looking at the code alone will be
149 // nearly impossible.
154 calcFollowPos(fTree
);
155 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "pos")) {
161 // For "chained" rules, modify the followPos sets
163 if (fRB
->fChainRules
) {
164 calcChainedFollowPos(fTree
);
168 // BOF (start of input) test fixup.
170 if (fRB
->fSetBuilder
->sawBOF()) {
175 // Build the DFA state transition tables.
178 flagAcceptingStates();
179 flagLookAheadStates();
183 // Update the global table of rule status {tag} values
184 // The rule builder has a global vector of status values that are common
185 // for all tables. Merge the ones from this table into the global set.
187 mergeRuleStatusVals();
189 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "states")) {printStates();};
194 //-----------------------------------------------------------------------------
196 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
198 //-----------------------------------------------------------------------------
199 void RBBITableBuilder57::calcNullable(RBBINode
*n
) {
203 if (n
->fType
== RBBINode::setRef
||
204 n
->fType
== RBBINode::endMark
) {
205 // These are non-empty leaf node types.
206 n
->fNullable
= FALSE
;
210 if (n
->fType
== RBBINode::lookAhead
|| n
->fType
== RBBINode::tag
) {
211 // Lookahead marker node. It's a leaf, so no recursion on children.
212 // It's nullable because it does not match any literal text from the input stream.
218 // The node is not a leaf.
219 // Calculate nullable on its children.
220 calcNullable(n
->fLeftChild
);
221 calcNullable(n
->fRightChild
);
223 // Apply functions from table 3.40 in Aho
224 if (n
->fType
== RBBINode::opOr
) {
225 n
->fNullable
= n
->fLeftChild
->fNullable
|| n
->fRightChild
->fNullable
;
227 else if (n
->fType
== RBBINode::opCat
) {
228 n
->fNullable
= n
->fLeftChild
->fNullable
&& n
->fRightChild
->fNullable
;
230 else if (n
->fType
== RBBINode::opStar
|| n
->fType
== RBBINode::opQuestion
) {
234 n
->fNullable
= FALSE
;
241 //-----------------------------------------------------------------------------
243 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
245 //-----------------------------------------------------------------------------
246 void RBBITableBuilder57::calcFirstPos(RBBINode
*n
) {
250 if (n
->fType
== RBBINode::leafChar
||
251 n
->fType
== RBBINode::endMark
||
252 n
->fType
== RBBINode::lookAhead
||
253 n
->fType
== RBBINode::tag
) {
254 // These are non-empty leaf node types.
255 // Note: In order to maintain the sort invariant on the set,
256 // this function should only be called on a node whose set is
257 // empty to start with.
258 n
->fFirstPosSet
->addElement(n
, *fStatus
);
262 // The node is not a leaf.
263 // Calculate firstPos on its children.
264 calcFirstPos(n
->fLeftChild
);
265 calcFirstPos(n
->fRightChild
);
267 // Apply functions from table 3.40 in Aho
268 if (n
->fType
== RBBINode::opOr
) {
269 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
270 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
272 else if (n
->fType
== RBBINode::opCat
) {
273 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
274 if (n
->fLeftChild
->fNullable
) {
275 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
278 else if (n
->fType
== RBBINode::opStar
||
279 n
->fType
== RBBINode::opQuestion
||
280 n
->fType
== RBBINode::opPlus
) {
281 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
287 //-----------------------------------------------------------------------------
289 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
291 //-----------------------------------------------------------------------------
292 void RBBITableBuilder57::calcLastPos(RBBINode
*n
) {
296 if (n
->fType
== RBBINode::leafChar
||
297 n
->fType
== RBBINode::endMark
||
298 n
->fType
== RBBINode::lookAhead
||
299 n
->fType
== RBBINode::tag
) {
300 // These are non-empty leaf node types.
301 // Note: In order to maintain the sort invariant on the set,
302 // this function should only be called on a node whose set is
303 // empty to start with.
304 n
->fLastPosSet
->addElement(n
, *fStatus
);
308 // The node is not a leaf.
309 // Calculate lastPos on its children.
310 calcLastPos(n
->fLeftChild
);
311 calcLastPos(n
->fRightChild
);
313 // Apply functions from table 3.40 in Aho
314 if (n
->fType
== RBBINode::opOr
) {
315 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
316 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
318 else if (n
->fType
== RBBINode::opCat
) {
319 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
320 if (n
->fRightChild
->fNullable
) {
321 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
324 else if (n
->fType
== RBBINode::opStar
||
325 n
->fType
== RBBINode::opQuestion
||
326 n
->fType
== RBBINode::opPlus
) {
327 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
333 //-----------------------------------------------------------------------------
335 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
337 //-----------------------------------------------------------------------------
338 void RBBITableBuilder57::calcFollowPos(RBBINode
*n
) {
340 n
->fType
== RBBINode::leafChar
||
341 n
->fType
== RBBINode::endMark
) {
345 calcFollowPos(n
->fLeftChild
);
346 calcFollowPos(n
->fRightChild
);
349 if (n
->fType
== RBBINode::opCat
) {
350 RBBINode
*i
; // is 'i' in Aho's description
353 UVector
*LastPosOfLeftChild
= n
->fLeftChild
->fLastPosSet
;
355 for (ix
=0; ix
<(uint32_t)LastPosOfLeftChild
->size(); ix
++) {
356 i
= (RBBINode
*)LastPosOfLeftChild
->elementAt(ix
);
357 setAdd(i
->fFollowPos
, n
->fRightChild
->fFirstPosSet
);
362 if (n
->fType
== RBBINode::opStar
||
363 n
->fType
== RBBINode::opPlus
) {
364 RBBINode
*i
; // again, n and i are the names from Aho's description.
367 for (ix
=0; ix
<(uint32_t)n
->fLastPosSet
->size(); ix
++) {
368 i
= (RBBINode
*)n
->fLastPosSet
->elementAt(ix
);
369 setAdd(i
->fFollowPos
, n
->fFirstPosSet
);
377 //-----------------------------------------------------------------------------
379 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
380 // as roots of a rule to a destination vector.
382 //-----------------------------------------------------------------------------
383 void RBBITableBuilder57::addRuleRootNodes(UVector
*dest
, RBBINode
*node
) {
384 if (node
== NULL
|| U_FAILURE(*fStatus
)) {
387 if (node
->fRuleRoot
) {
388 dest
->addElement(node
, *fStatus
);
389 // Note: rules cannot nest. If we found a rule start node,
390 // no child node can also be a start node.
393 addRuleRootNodes(dest
, node
->fLeftChild
);
394 addRuleRootNodes(dest
, node
->fRightChild
);
397 //-----------------------------------------------------------------------------
399 // calcChainedFollowPos. Modify the previously calculated followPos sets
400 // to implement rule chaining. NOT described by Aho
402 //-----------------------------------------------------------------------------
403 void RBBITableBuilder57::calcChainedFollowPos(RBBINode
*tree
) {
405 UVector
endMarkerNodes(*fStatus
);
406 UVector
leafNodes(*fStatus
);
409 if (U_FAILURE(*fStatus
)) {
413 // get a list of all endmarker nodes.
414 tree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
416 // get a list all leaf nodes
417 tree
->findNodes(&leafNodes
, RBBINode::leafChar
, *fStatus
);
418 if (U_FAILURE(*fStatus
)) {
422 // Collect all leaf nodes that can start matches for rules
423 // with inbound chaining enabled, which is the union of the
424 // firstPosition sets from each of the rule root nodes.
426 UVector
ruleRootNodes(*fStatus
);
427 addRuleRootNodes(&ruleRootNodes
, tree
);
429 UVector
matchStartNodes(*fStatus
);
430 for (int i
=0; i
<ruleRootNodes
.size(); ++i
) {
431 RBBINode
*node
= static_cast<RBBINode
*>(ruleRootNodes
.elementAt(i
));
432 if (node
->fChainIn
) {
433 setAdd(&matchStartNodes
, node
->fFirstPosSet
);
436 if (U_FAILURE(*fStatus
)) {
443 for (endNodeIx
=0; endNodeIx
<leafNodes
.size(); endNodeIx
++) {
444 RBBINode
*tNode
= (RBBINode
*)leafNodes
.elementAt(endNodeIx
);
445 RBBINode
*endNode
= NULL
;
447 // Identify leaf nodes that correspond to overall rule match positions.
448 // These include an endMarkerNode in their followPos sets.
449 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
450 if (tNode
->fFollowPos
->contains(endMarkerNodes
.elementAt(i
))) {
455 if (endNode
== NULL
) {
456 // node wasn't an end node. Try again with the next.
460 // We've got a node that can end a match.
462 // Line Break Specific hack: If this node's val correspond to the $CM char class,
463 // don't chain from it.
464 // TODO: Add rule syntax for this behavior, get specifics out of here and
465 // into the rule file.
466 if (fRB
->fLBCMNoChain
|| fRB
->fRINoChain
) {
467 UChar32 c
= this->fRB
->fSetBuilder
->getFirstChar(endNode
->fVal
);
469 // c == -1 occurs with sets containing only the {eof} marker string.
470 if (fRB
->fLBCMNoChain
) {
471 ULineBreak cLBProp
= (ULineBreak
)u_getIntPropertyValue(c
, UCHAR_LINE_BREAK
);
472 if (cLBProp
== U_LB_COMBINING_MARK
) {
476 if (fRB
->fRINoChain
) {
477 UGraphemeClusterBreak cGBProp
= (UGraphemeClusterBreak
)u_getIntPropertyValue(c
, UCHAR_GRAPHEME_CLUSTER_BREAK
);
478 if (cGBProp
== U_GCB_REGIONAL_INDICATOR
) {
486 // Now iterate over the nodes that can start a match, looking for ones
487 // with the same char class as our ending node.
489 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
.size(); startNodeIx
++) {
490 startNode
= (RBBINode
*)matchStartNodes
.elementAt(startNodeIx
);
491 if (startNode
->fType
!= RBBINode::leafChar
) {
495 if (endNode
->fVal
== startNode
->fVal
) {
496 // The end val (character class) of one possible match is the
497 // same as the start of another.
499 // Add all nodes from the followPos of the start node to the
500 // followPos set of the end node, which will have the effect of
501 // letting matches transition from a match state at endNode
502 // to the second char of a match starting with startNode.
503 setAdd(endNode
->fFollowPos
, startNode
->fFollowPos
);
510 //-----------------------------------------------------------------------------
512 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
513 // Do an swizzle similar to chaining, modifying the followPos set of
514 // the bofNode to include the followPos nodes from other {bot} nodes
515 // scattered through the tree.
517 // This function has much in common with calcChainedFollowPos().
519 //-----------------------------------------------------------------------------
520 void RBBITableBuilder57::bofFixup() {
522 if (U_FAILURE(*fStatus
)) {
526 // The parse tree looks like this ...
527 // fTree root ---> <cat>
534 // We will be adding things to the followPos set of the <bofNode>
536 RBBINode
*bofNode
= fTree
->fLeftChild
->fLeftChild
;
537 U_ASSERT(bofNode
->fType
== RBBINode::leafChar
);
538 U_ASSERT(bofNode
->fVal
== 2);
540 // Get all nodes that can be the start a match of the user-written rules
541 // (excluding the fake bofNode)
542 // We want the nodes that can start a match in the
543 // part labeled "rest of tree"
545 UVector
*matchStartNodes
= fTree
->fLeftChild
->fRightChild
->fFirstPosSet
;
549 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
->size(); startNodeIx
++) {
550 startNode
= (RBBINode
*)matchStartNodes
->elementAt(startNodeIx
);
551 if (startNode
->fType
!= RBBINode::leafChar
) {
555 if (startNode
->fVal
== bofNode
->fVal
) {
556 // We found a leaf node corresponding to a {bof} that was
557 // explicitly written into a rule.
558 // Add everything from the followPos set of this node to the
559 // followPos set of the fake bofNode at the start of the tree.
561 setAdd(bofNode
->fFollowPos
, startNode
->fFollowPos
);
566 //-----------------------------------------------------------------------------
568 // buildStateTable() Determine the set of runtime DFA states and the
569 // transition tables for these states, by the algorithm
570 // of fig. 3.44 in Aho.
572 // Most of the comments are quotes of Aho's psuedo-code.
574 //-----------------------------------------------------------------------------
575 void RBBITableBuilder57::buildStateTable() {
576 if (U_FAILURE(*fStatus
)) {
579 RBBIStateDescriptor
*failState
;
580 // Set it to NULL to avoid uninitialized warning
581 RBBIStateDescriptor
*initialState
= NULL
;
583 // Add a dummy state 0 - the stop state. Not from Aho.
584 int lastInputSymbol
= fRB
->fSetBuilder
->getNumCharCategories() - 1;
585 failState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
586 if (failState
== NULL
) {
587 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
588 goto ExitBuildSTdeleteall
;
590 failState
->fPositions
= new UVector(*fStatus
);
591 if (failState
->fPositions
== NULL
) {
592 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
594 if (failState
->fPositions
== NULL
|| U_FAILURE(*fStatus
)) {
595 goto ExitBuildSTdeleteall
;
597 fDStates
->addElement(failState
, *fStatus
);
598 if (U_FAILURE(*fStatus
)) {
599 goto ExitBuildSTdeleteall
;
602 // initially, the only unmarked state in Dstates is firstpos(root),
603 // where toot is the root of the syntax tree for (r)#;
604 initialState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
605 if (initialState
== NULL
) {
606 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
608 if (U_FAILURE(*fStatus
)) {
609 goto ExitBuildSTdeleteall
;
611 initialState
->fPositions
= new UVector(*fStatus
);
612 if (initialState
->fPositions
== NULL
) {
613 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
615 if (U_FAILURE(*fStatus
)) {
616 goto ExitBuildSTdeleteall
;
618 setAdd(initialState
->fPositions
, fTree
->fFirstPosSet
);
619 fDStates
->addElement(initialState
, *fStatus
);
620 if (U_FAILURE(*fStatus
)) {
621 goto ExitBuildSTdeleteall
;
624 // while there is an unmarked state T in Dstates do begin
626 RBBIStateDescriptor
*T
= NULL
;
628 for (tx
=1; tx
<fDStates
->size(); tx
++) {
629 RBBIStateDescriptor
*temp
;
630 temp
= (RBBIStateDescriptor
*)fDStates
->elementAt(tx
);
631 if (temp
->fMarked
== FALSE
) {
643 // for each input symbol a do begin
645 for (a
= 1; a
<=lastInputSymbol
; a
++) {
646 // let U be the set of positions that are in followpos(p)
647 // for some position p in T
648 // such that the symbol at position p is a;
652 for (px
=0; px
<T
->fPositions
->size(); px
++) {
653 p
= (RBBINode
*)T
->fPositions
->elementAt(px
);
654 if ((p
->fType
== RBBINode::leafChar
) && (p
->fVal
== a
)) {
656 U
= new UVector(*fStatus
);
658 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
659 goto ExitBuildSTdeleteall
;
662 setAdd(U
, p
->fFollowPos
);
666 // if U is not empty and not in DStates then
668 UBool UinDstates
= FALSE
;
670 U_ASSERT(U
->size() > 0);
672 for (ix
=0; ix
<fDStates
->size(); ix
++) {
673 RBBIStateDescriptor
*temp2
;
674 temp2
= (RBBIStateDescriptor
*)fDStates
->elementAt(ix
);
675 if (setEquals(U
, temp2
->fPositions
)) {
677 U
= temp2
->fPositions
;
684 // Add U as an unmarked state to Dstates
687 RBBIStateDescriptor
*newState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
688 if (newState
== NULL
) {
689 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
691 if (U_FAILURE(*fStatus
)) {
692 goto ExitBuildSTdeleteall
;
694 newState
->fPositions
= U
;
695 fDStates
->addElement(newState
, *fStatus
);
696 if (U_FAILURE(*fStatus
)) {
699 ux
= fDStates
->size()-1;
703 T
->fDtran
->setElementAt(ux
, a
);
708 // delete local pointers only if error occured.
709 ExitBuildSTdeleteall
:
716 //-----------------------------------------------------------------------------
718 // flagAcceptingStates Identify accepting states.
719 // First get a list of all of the end marker nodes.
720 // Then, for each state s,
721 // if s contains one of the end marker nodes in its list of tree positions then
722 // s is an accepting state.
724 //-----------------------------------------------------------------------------
725 void RBBITableBuilder57::flagAcceptingStates() {
726 if (U_FAILURE(*fStatus
)) {
729 UVector
endMarkerNodes(*fStatus
);
734 if (U_FAILURE(*fStatus
)) {
738 fTree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
739 if (U_FAILURE(*fStatus
)) {
743 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
744 endMarker
= (RBBINode
*)endMarkerNodes
.elementAt(i
);
745 for (n
=0; n
<fDStates
->size(); n
++) {
746 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
747 if (sd
->fPositions
->indexOf(endMarker
) >= 0) {
748 // Any non-zero value for fAccepting means this is an accepting node.
749 // The value is what will be returned to the user as the break status.
750 // If no other value was specified, force it to -1.
752 if (sd
->fAccepting
==0) {
753 // State hasn't been marked as accepting yet. Do it now.
754 sd
->fAccepting
= endMarker
->fVal
;
755 if (sd
->fAccepting
== 0) {
759 if (sd
->fAccepting
==-1 && endMarker
->fVal
!= 0) {
760 // Both lookahead and non-lookahead accepting for this state.
761 // Favor the look-ahead. Expedient for line break.
762 // TODO: need a more elegant resolution for conflicting rules.
763 sd
->fAccepting
= endMarker
->fVal
;
766 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
768 // If the end marker node is from a look-ahead rule, set
769 // the fLookAhead field or this state also.
770 if (endMarker
->fLookAheadEnd
) {
771 // TODO: don't change value if already set?
772 // TODO: allow for more than one active look-ahead rule in engine.
773 // Make value here an index to a side array in engine?
774 sd
->fLookAhead
= sd
->fAccepting
;
782 //-----------------------------------------------------------------------------
784 // flagLookAheadStates Very similar to flagAcceptingStates, above.
786 //-----------------------------------------------------------------------------
787 void RBBITableBuilder57::flagLookAheadStates() {
788 if (U_FAILURE(*fStatus
)) {
791 UVector
lookAheadNodes(*fStatus
);
792 RBBINode
*lookAheadNode
;
796 fTree
->findNodes(&lookAheadNodes
, RBBINode::lookAhead
, *fStatus
);
797 if (U_FAILURE(*fStatus
)) {
800 for (i
=0; i
<lookAheadNodes
.size(); i
++) {
801 lookAheadNode
= (RBBINode
*)lookAheadNodes
.elementAt(i
);
803 for (n
=0; n
<fDStates
->size(); n
++) {
804 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
805 if (sd
->fPositions
->indexOf(lookAheadNode
) >= 0) {
806 sd
->fLookAhead
= lookAheadNode
->fVal
;
815 //-----------------------------------------------------------------------------
819 //-----------------------------------------------------------------------------
820 void RBBITableBuilder57::flagTaggedStates() {
821 if (U_FAILURE(*fStatus
)) {
824 UVector
tagNodes(*fStatus
);
829 if (U_FAILURE(*fStatus
)) {
832 fTree
->findNodes(&tagNodes
, RBBINode::tag
, *fStatus
);
833 if (U_FAILURE(*fStatus
)) {
836 for (i
=0; i
<tagNodes
.size(); i
++) { // For each tag node t (all of 'em)
837 tagNode
= (RBBINode
*)tagNodes
.elementAt(i
);
839 for (n
=0; n
<fDStates
->size(); n
++) { // For each state s (row in the state table)
840 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
841 if (sd
->fPositions
->indexOf(tagNode
) >= 0) { // if s include the tag node t
842 sortedAdd(&sd
->fTagVals
, tagNode
->fVal
);
851 //-----------------------------------------------------------------------------
853 // mergeRuleStatusVals
855 // Update the global table of rule status {tag} values
856 // The rule builder has a global vector of status values that are common
857 // for all tables. Merge the ones from this table into the global set.
859 //-----------------------------------------------------------------------------
860 void RBBITableBuilder57::mergeRuleStatusVals() {
862 // The basic outline of what happens here is this...
864 // for each state in this state table
865 // if the status tag list for this state is in the global statuses list
867 // continue with the next state
869 // add the tag list for this state to the global list.
874 // Pre-set a single tag of {0} into the table.
875 // We will need this as a default, for rule sets with no explicit tagging.
876 if (fRB
->fRuleStatusVals
->size() == 0) {
877 fRB
->fRuleStatusVals
->addElement(1, *fStatus
); // Num of statuses in group
878 fRB
->fRuleStatusVals
->addElement((int32_t)0, *fStatus
); // and our single status of zero
882 for (n
=0; n
<fDStates
->size(); n
++) {
883 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
884 UVector
*thisStatesTagValues
= sd
->fTagVals
;
885 if (thisStatesTagValues
== NULL
) {
886 // No tag values are explicitly associated with this state.
887 // Set the default tag value.
892 // There are tag(s) associated with this state.
893 // fTagsIdx will be the index into the global tag list for this state's tag values.
894 // Initial value of -1 flags that we haven't got it set yet.
896 int32_t thisTagGroupStart
= 0; // indexes into the global rule status vals list
897 int32_t nextTagGroupStart
= 0;
899 // Loop runs once per group of tags in the global list
900 while (nextTagGroupStart
< fRB
->fRuleStatusVals
->size()) {
901 thisTagGroupStart
= nextTagGroupStart
;
902 nextTagGroupStart
+= fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
) + 1;
903 if (thisStatesTagValues
->size() != fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
)) {
904 // The number of tags for this state is different from
905 // the number of tags in this group from the global list.
906 // Continue with the next group from the global list.
909 // The lengths match, go ahead and compare the actual tag values
910 // between this state and the group from the global list.
911 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
912 if (thisStatesTagValues
->elementAti(i
) !=
913 fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
+ 1 + i
) ) {
919 if (i
== thisStatesTagValues
->size()) {
920 // We found a set of tag values in the global list that match
921 // those for this state. Use them.
922 sd
->fTagsIdx
= thisTagGroupStart
;
927 if (sd
->fTagsIdx
== -1) {
928 // No suitable entry in the global tag list already. Add one
929 sd
->fTagsIdx
= fRB
->fRuleStatusVals
->size();
930 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->size(), *fStatus
);
931 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
932 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->elementAti(i
), *fStatus
);
944 //-----------------------------------------------------------------------------
946 // sortedAdd Add a value to a vector of sorted values (ints).
947 // Do not replicate entries; if the value is already there, do not
949 // Lazily create the vector if it does not already exist.
951 //-----------------------------------------------------------------------------
952 void RBBITableBuilder57::sortedAdd(UVector
**vector
, int32_t val
) {
955 if (*vector
== NULL
) {
956 *vector
= new UVector(*fStatus
);
958 if (*vector
== NULL
|| U_FAILURE(*fStatus
)) {
961 UVector
*vec
= *vector
;
962 int32_t vSize
= vec
->size();
963 for (i
=0; i
<vSize
; i
++) {
964 int32_t valAtI
= vec
->elementAti(i
);
966 // The value is already in the vector. Don't add it again.
973 vec
->insertElementAt(val
, i
, *fStatus
);
978 //-----------------------------------------------------------------------------
980 // setAdd Set operation on UVector
981 // dest = dest union source
982 // Elements may only appear once and must be sorted.
984 //-----------------------------------------------------------------------------
985 void RBBITableBuilder57::setAdd(UVector
*dest
, UVector
*source
) {
986 int32_t destOriginalSize
= dest
->size();
987 int32_t sourceSize
= source
->size();
989 MaybeStackArray
<void *, 16> destArray
, sourceArray
; // Handle small cases without malloc
990 void **destPtr
, **sourcePtr
;
991 void **destLim
, **sourceLim
;
993 if (destOriginalSize
> destArray
.getCapacity()) {
994 if (destArray
.resize(destOriginalSize
) == NULL
) {
998 destPtr
= destArray
.getAlias();
999 destLim
= destPtr
+ destOriginalSize
; // destArray.getArrayLimit()?
1001 if (sourceSize
> sourceArray
.getCapacity()) {
1002 if (sourceArray
.resize(sourceSize
) == NULL
) {
1006 sourcePtr
= sourceArray
.getAlias();
1007 sourceLim
= sourcePtr
+ sourceSize
; // sourceArray.getArrayLimit()?
1009 // Avoid multiple "get element" calls by getting the contents into arrays
1010 (void) dest
->toArray(destPtr
);
1011 (void) source
->toArray(sourcePtr
);
1013 dest
->setSize(sourceSize
+destOriginalSize
, *fStatus
);
1015 while (sourcePtr
< sourceLim
&& destPtr
< destLim
) {
1016 if (*destPtr
== *sourcePtr
) {
1017 dest
->setElementAt(*sourcePtr
++, di
++);
1020 // This check is required for machines with segmented memory, like i5/OS.
1021 // Direct pointer comparison is not recommended.
1022 else if (uprv_memcmp(destPtr
, sourcePtr
, sizeof(void *)) < 0) {
1023 dest
->setElementAt(*destPtr
++, di
++);
1025 else { /* *sourcePtr < *destPtr */
1026 dest
->setElementAt(*sourcePtr
++, di
++);
1030 // At most one of these two cleanup loops will execute
1031 while (destPtr
< destLim
) {
1032 dest
->setElementAt(*destPtr
++, di
++);
1034 while (sourcePtr
< sourceLim
) {
1035 dest
->setElementAt(*sourcePtr
++, di
++);
1038 dest
->setSize(di
, *fStatus
);
1043 //-----------------------------------------------------------------------------
1045 // setEqual Set operation on UVector.
1046 // Compare for equality.
1047 // Elements must be sorted.
1049 //-----------------------------------------------------------------------------
1050 UBool
RBBITableBuilder57::setEquals(UVector
*a
, UVector
*b
) {
1051 return a
->equals(*b
);
1055 //-----------------------------------------------------------------------------
1057 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1058 // for each node in the tree.
1060 //-----------------------------------------------------------------------------
1062 void RBBITableBuilder57::printPosSets(RBBINode
*n
) {
1067 RBBINode::printNodeHeader();
1069 RBBIDebugPrintf(" Nullable: %s\n", n
->fNullable
?"TRUE":"FALSE");
1071 RBBIDebugPrintf(" firstpos: ");
1072 printSet(n
->fFirstPosSet
);
1074 RBBIDebugPrintf(" lastpos: ");
1075 printSet(n
->fLastPosSet
);
1077 RBBIDebugPrintf(" followpos: ");
1078 printSet(n
->fFollowPos
);
1080 printPosSets(n
->fLeftChild
);
1081 printPosSets(n
->fRightChild
);
1087 //-----------------------------------------------------------------------------
1089 // getTableSize() Calculate the size of the runtime form of this
1090 // state transition table.
1092 //-----------------------------------------------------------------------------
1093 int32_t RBBITableBuilder57::getTableSize() const {
1099 if (fTree
== NULL
) {
1103 size
= sizeof(RBBIStateTable
) - 4; // The header, with no rows to the table.
1105 numRows
= fDStates
->size();
1106 numCols
= fRB
->fSetBuilder
->getNumCharCategories();
1108 // Note The declaration of RBBIStateTableRow is for a table of two columns.
1109 // Therefore we subtract two from numCols when determining
1110 // how much storage to add to a row for the total columns.
1111 rowSize
= sizeof(RBBIStateTableRow
) + sizeof(uint16_t)*(numCols
-2);
1112 size
+= numRows
* rowSize
;
1118 //-----------------------------------------------------------------------------
1120 // exportTable() export the state transition table in the format required
1121 // by the runtime engine. getTableSize() bytes of memory
1122 // must be available at the output address "where".
1124 //-----------------------------------------------------------------------------
1125 void RBBITableBuilder57::exportTable(void *where
) {
1126 RBBIStateTable
*table
= (RBBIStateTable
*)where
;
1130 if (U_FAILURE(*fStatus
) || fTree
== NULL
) {
1134 if (fRB
->fSetBuilder
->getNumCharCategories() > 0x7fff ||
1135 fDStates
->size() > 0x7fff) {
1136 *fStatus
= U_BRK_INTERNAL_ERROR
;
1140 table
->fRowLen
= sizeof(RBBIStateTableRow
) +
1141 sizeof(uint16_t) * (fRB
->fSetBuilder
->getNumCharCategories() - 2);
1142 table
->fNumStates
= fDStates
->size();
1144 if (fRB
->fLookAheadHardBreak
) {
1145 table
->fFlags
|= RBBI_LOOKAHEAD_HARD_BREAK
;
1147 if (fRB
->fSetBuilder
->sawBOF()) {
1148 table
->fFlags
|= RBBI_BOF_REQUIRED
;
1150 table
->fReserved
= 0;
1152 for (state
=0; state
<table
->fNumStates
; state
++) {
1153 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(state
);
1154 RBBIStateTableRow
*row
= (RBBIStateTableRow
*)(table
->fTableData
+ state
*table
->fRowLen
);
1155 U_ASSERT (-32768 < sd
->fAccepting
&& sd
->fAccepting
<= 32767);
1156 U_ASSERT (-32768 < sd
->fLookAhead
&& sd
->fLookAhead
<= 32767);
1157 row
->fAccepting
= (int16_t)sd
->fAccepting
;
1158 row
->fLookAhead
= (int16_t)sd
->fLookAhead
;
1159 row
->fTagIdx
= (int16_t)sd
->fTagsIdx
;
1160 for (col
=0; col
<fRB
->fSetBuilder
->getNumCharCategories(); col
++) {
1161 row
->fNextState
[col
] = (uint16_t)sd
->fDtran
->elementAti(col
);
1168 //-----------------------------------------------------------------------------
1170 // printSet Debug function. Print the contents of a UVector
1172 //-----------------------------------------------------------------------------
1174 void RBBITableBuilder57::printSet(UVector
*s
) {
1176 for (i
=0; i
<s
->size(); i
++) {
1177 const RBBINode
*v
= static_cast<const RBBINode
*>(s
->elementAt(i
));
1178 RBBIDebugPrintf("%5d", v
==NULL
? -1 : v
->fSerialNum
);
1180 RBBIDebugPrintf("\n");
1185 //-----------------------------------------------------------------------------
1187 // printStates Debug Function. Dump the fully constructed state transition table.
1189 //-----------------------------------------------------------------------------
1191 void RBBITableBuilder57::printStates() {
1192 int c
; // input "character"
1193 int n
; // state number
1195 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1196 RBBIDebugPrintf(" | Acc LA Tag");
1197 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1198 RBBIDebugPrintf(" %2d", c
);
1200 RBBIDebugPrintf("\n");
1201 RBBIDebugPrintf(" |---------------");
1202 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1203 RBBIDebugPrintf("---");
1205 RBBIDebugPrintf("\n");
1207 for (n
=0; n
<fDStates
->size(); n
++) {
1208 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
1209 RBBIDebugPrintf(" %3d | " , n
);
1210 RBBIDebugPrintf("%3d %3d %5d ", sd
->fAccepting
, sd
->fLookAhead
, sd
->fTagsIdx
);
1211 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1212 RBBIDebugPrintf(" %2d", sd
->fDtran
->elementAti(c
));
1214 RBBIDebugPrintf("\n");
1216 RBBIDebugPrintf("\n\n");
1222 //-----------------------------------------------------------------------------
1224 // printRuleStatusTable Debug Function. Dump the common rule status table
1226 //-----------------------------------------------------------------------------
1228 void RBBITableBuilder57::printRuleStatusTable() {
1229 int32_t thisRecord
= 0;
1230 int32_t nextRecord
= 0;
1232 UVector
*tbl
= fRB
->fRuleStatusVals
;
1234 RBBIDebugPrintf("index | tags \n");
1235 RBBIDebugPrintf("-------------------\n");
1237 while (nextRecord
< tbl
->size()) {
1238 thisRecord
= nextRecord
;
1239 nextRecord
= thisRecord
+ tbl
->elementAti(thisRecord
) + 1;
1240 RBBIDebugPrintf("%4d ", thisRecord
);
1241 for (i
=thisRecord
+1; i
<nextRecord
; i
++) {
1242 RBBIDebugPrintf(" %5d", tbl
->elementAti(i
));
1244 RBBIDebugPrintf("\n");
1246 RBBIDebugPrintf("\n\n");
1251 //-----------------------------------------------------------------------------
1253 // RBBIStateDescriptor - in standard rbbitblb.cpp
1255 //-----------------------------------------------------------------------------
1259 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */