1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_BREAK_ITERATION
18 #include "unicode/unistr.h"
29 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder
*rb
, RBBINode
**rootNode
) :
32 fStatus
= fRB
->fStatus
;
33 UErrorCode status
= U_ZERO_ERROR
;
34 fDStates
= new UVector(status
);
35 if (U_FAILURE(*fStatus
)) {
38 if (U_FAILURE(status
)) {
42 if (fDStates
== NULL
) {
43 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;;
49 RBBITableBuilder::~RBBITableBuilder() {
51 for (i
=0; i
<fDStates
->size(); i
++) {
52 delete (RBBIStateDescriptor
*)fDStates
->elementAt(i
);
58 //-----------------------------------------------------------------------------
60 // RBBITableBuilder::build - This is the main function for building the DFA state transtion
61 // table from the RBBI rules parse tree.
63 //-----------------------------------------------------------------------------
64 void RBBITableBuilder::build() {
66 if (U_FAILURE(*fStatus
)) {
70 // If there were no rules, just return. This situation can easily arise
71 // for the reverse rules.
77 // Walk through the tree, replacing any references to $variables with a copy of the
78 // parse tree for the substition expression.
80 fTree
= fTree
->flattenVariables();
82 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "ftree")) {
83 RBBIDebugPuts("\nParse tree after flattening variable references.");
84 RBBINode::printTree(fTree
, TRUE
);
89 // If the rules contained any references to {bof}
90 // add a {bof} <cat> <former root of tree> to the
91 // tree. Means that all matches must start out with the
92 // {bof} fake character.
94 if (fRB
->fSetBuilder
->sawBOF()) {
95 RBBINode
*bofTop
= new RBBINode(RBBINode::opCat
);
96 RBBINode
*bofLeaf
= new RBBINode(RBBINode::leafChar
);
97 // Delete and exit if memory allocation failed.
98 if (bofTop
== NULL
|| bofLeaf
== NULL
) {
99 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
104 bofTop
->fLeftChild
= bofLeaf
;
105 bofTop
->fRightChild
= fTree
;
106 bofLeaf
->fParent
= bofTop
;
107 bofLeaf
->fVal
= 2; // Reserved value for {bof}.
112 // Add a unique right-end marker to the expression.
113 // Appears as a cat-node, left child being the original tree,
114 // right child being the end marker.
116 RBBINode
*cn
= new RBBINode(RBBINode::opCat
);
117 // Exit if memory allocation failed.
119 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
122 cn
->fLeftChild
= fTree
;
124 cn
->fRightChild
= new RBBINode(RBBINode::endMark
);
125 // Delete and exit if memory allocation failed.
126 if (cn
->fRightChild
== NULL
) {
127 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
131 cn
->fRightChild
->fParent
= cn
;
135 // Replace all references to UnicodeSets with the tree for the equivalent
138 fTree
->flattenSets();
140 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "stree")) {
141 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
142 RBBINode::printTree(fTree
, TRUE
);
148 // calculate the functions nullable, firstpos, lastpos and followpos on
149 // nodes in the parse tree.
150 // See the alogrithm description in Aho.
151 // Understanding how this works by looking at the code alone will be
152 // nearly impossible.
157 calcFollowPos(fTree
);
158 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "pos")) {
164 // For "chained" rules, modify the followPos sets
166 if (fRB
->fChainRules
) {
167 calcChainedFollowPos(fTree
);
171 // BOF (start of input) test fixup.
173 if (fRB
->fSetBuilder
->sawBOF()) {
178 // Build the DFA state transition tables.
181 flagAcceptingStates();
182 flagLookAheadStates();
186 // Update the global table of rule status {tag} values
187 // The rule builder has a global vector of status values that are common
188 // for all tables. Merge the ones from this table into the global set.
190 mergeRuleStatusVals();
192 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "states")) {printStates();};
197 //-----------------------------------------------------------------------------
199 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
201 //-----------------------------------------------------------------------------
202 void RBBITableBuilder::calcNullable(RBBINode
*n
) {
206 if (n
->fType
== RBBINode::setRef
||
207 n
->fType
== RBBINode::endMark
) {
208 // These are non-empty leaf node types.
209 n
->fNullable
= FALSE
;
213 if (n
->fType
== RBBINode::lookAhead
|| n
->fType
== RBBINode::tag
) {
214 // Lookahead marker node. It's a leaf, so no recursion on children.
215 // It's nullable because it does not match any literal text from the input stream.
221 // The node is not a leaf.
222 // Calculate nullable on its children.
223 calcNullable(n
->fLeftChild
);
224 calcNullable(n
->fRightChild
);
226 // Apply functions from table 3.40 in Aho
227 if (n
->fType
== RBBINode::opOr
) {
228 n
->fNullable
= n
->fLeftChild
->fNullable
|| n
->fRightChild
->fNullable
;
230 else if (n
->fType
== RBBINode::opCat
) {
231 n
->fNullable
= n
->fLeftChild
->fNullable
&& n
->fRightChild
->fNullable
;
233 else if (n
->fType
== RBBINode::opStar
|| n
->fType
== RBBINode::opQuestion
) {
237 n
->fNullable
= FALSE
;
244 //-----------------------------------------------------------------------------
246 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
248 //-----------------------------------------------------------------------------
249 void RBBITableBuilder::calcFirstPos(RBBINode
*n
) {
253 if (n
->fType
== RBBINode::leafChar
||
254 n
->fType
== RBBINode::endMark
||
255 n
->fType
== RBBINode::lookAhead
||
256 n
->fType
== RBBINode::tag
) {
257 // These are non-empty leaf node types.
258 // Note: In order to maintain the sort invariant on the set,
259 // this function should only be called on a node whose set is
260 // empty to start with.
261 n
->fFirstPosSet
->addElement(n
, *fStatus
);
265 // The node is not a leaf.
266 // Calculate firstPos on its children.
267 calcFirstPos(n
->fLeftChild
);
268 calcFirstPos(n
->fRightChild
);
270 // Apply functions from table 3.40 in Aho
271 if (n
->fType
== RBBINode::opOr
) {
272 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
273 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
275 else if (n
->fType
== RBBINode::opCat
) {
276 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
277 if (n
->fLeftChild
->fNullable
) {
278 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
281 else if (n
->fType
== RBBINode::opStar
||
282 n
->fType
== RBBINode::opQuestion
||
283 n
->fType
== RBBINode::opPlus
) {
284 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
290 //-----------------------------------------------------------------------------
292 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
294 //-----------------------------------------------------------------------------
295 void RBBITableBuilder::calcLastPos(RBBINode
*n
) {
299 if (n
->fType
== RBBINode::leafChar
||
300 n
->fType
== RBBINode::endMark
||
301 n
->fType
== RBBINode::lookAhead
||
302 n
->fType
== RBBINode::tag
) {
303 // These are non-empty leaf node types.
304 // Note: In order to maintain the sort invariant on the set,
305 // this function should only be called on a node whose set is
306 // empty to start with.
307 n
->fLastPosSet
->addElement(n
, *fStatus
);
311 // The node is not a leaf.
312 // Calculate lastPos on its children.
313 calcLastPos(n
->fLeftChild
);
314 calcLastPos(n
->fRightChild
);
316 // Apply functions from table 3.40 in Aho
317 if (n
->fType
== RBBINode::opOr
) {
318 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
319 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
321 else if (n
->fType
== RBBINode::opCat
) {
322 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
323 if (n
->fRightChild
->fNullable
) {
324 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
327 else if (n
->fType
== RBBINode::opStar
||
328 n
->fType
== RBBINode::opQuestion
||
329 n
->fType
== RBBINode::opPlus
) {
330 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
336 //-----------------------------------------------------------------------------
338 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
340 //-----------------------------------------------------------------------------
341 void RBBITableBuilder::calcFollowPos(RBBINode
*n
) {
343 n
->fType
== RBBINode::leafChar
||
344 n
->fType
== RBBINode::endMark
) {
348 calcFollowPos(n
->fLeftChild
);
349 calcFollowPos(n
->fRightChild
);
352 if (n
->fType
== RBBINode::opCat
) {
353 RBBINode
*i
; // is 'i' in Aho's description
356 UVector
*LastPosOfLeftChild
= n
->fLeftChild
->fLastPosSet
;
358 for (ix
=0; ix
<(uint32_t)LastPosOfLeftChild
->size(); ix
++) {
359 i
= (RBBINode
*)LastPosOfLeftChild
->elementAt(ix
);
360 setAdd(i
->fFollowPos
, n
->fRightChild
->fFirstPosSet
);
365 if (n
->fType
== RBBINode::opStar
||
366 n
->fType
== RBBINode::opPlus
) {
367 RBBINode
*i
; // again, n and i are the names from Aho's description.
370 for (ix
=0; ix
<(uint32_t)n
->fLastPosSet
->size(); ix
++) {
371 i
= (RBBINode
*)n
->fLastPosSet
->elementAt(ix
);
372 setAdd(i
->fFollowPos
, n
->fFirstPosSet
);
380 //-----------------------------------------------------------------------------
382 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
383 // as roots of a rule to a destination vector.
385 //-----------------------------------------------------------------------------
386 void RBBITableBuilder::addRuleRootNodes(UVector
*dest
, RBBINode
*node
) {
387 if (node
== NULL
|| U_FAILURE(*fStatus
)) {
390 if (node
->fRuleRoot
) {
391 dest
->addElement(node
, *fStatus
);
392 // Note: rules cannot nest. If we found a rule start node,
393 // no child node can also be a start node.
396 addRuleRootNodes(dest
, node
->fLeftChild
);
397 addRuleRootNodes(dest
, node
->fRightChild
);
400 //-----------------------------------------------------------------------------
402 // calcChainedFollowPos. Modify the previously calculated followPos sets
403 // to implement rule chaining. NOT described by Aho
405 //-----------------------------------------------------------------------------
406 void RBBITableBuilder::calcChainedFollowPos(RBBINode
*tree
) {
408 UVector
endMarkerNodes(*fStatus
);
409 UVector
leafNodes(*fStatus
);
412 if (U_FAILURE(*fStatus
)) {
416 // get a list of all endmarker nodes.
417 tree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
419 // get a list all leaf nodes
420 tree
->findNodes(&leafNodes
, RBBINode::leafChar
, *fStatus
);
421 if (U_FAILURE(*fStatus
)) {
425 // Collect all leaf nodes that can start matches for rules
426 // with inbound chaining enabled, which is the union of the
427 // firstPosition sets from each of the rule root nodes.
429 UVector
ruleRootNodes(*fStatus
);
430 addRuleRootNodes(&ruleRootNodes
, tree
);
432 UVector
matchStartNodes(*fStatus
);
433 for (int i
=0; i
<ruleRootNodes
.size(); ++i
) {
434 RBBINode
*node
= static_cast<RBBINode
*>(ruleRootNodes
.elementAt(i
));
435 if (node
->fChainIn
) {
436 setAdd(&matchStartNodes
, node
->fFirstPosSet
);
439 if (U_FAILURE(*fStatus
)) {
446 for (endNodeIx
=0; endNodeIx
<leafNodes
.size(); endNodeIx
++) {
447 RBBINode
*tNode
= (RBBINode
*)leafNodes
.elementAt(endNodeIx
);
448 RBBINode
*endNode
= NULL
;
450 // Identify leaf nodes that correspond to overall rule match positions.
451 // These include an endMarkerNode in their followPos sets.
452 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
453 if (tNode
->fFollowPos
->contains(endMarkerNodes
.elementAt(i
))) {
458 if (endNode
== NULL
) {
459 // node wasn't an end node. Try again with the next.
463 // We've got a node that can end a match.
465 // Line Break Specific hack: If this node's val correspond to the $CM char class,
466 // don't chain from it.
467 // TODO: Add rule syntax for this behavior, get specifics out of here and
468 // into the rule file.
469 if (fRB
->fLBCMNoChain
) {
470 UChar32 c
= this->fRB
->fSetBuilder
->getFirstChar(endNode
->fVal
);
472 // c == -1 occurs with sets containing only the {eof} marker string.
473 ULineBreak cLBProp
= (ULineBreak
)u_getIntPropertyValue(c
, UCHAR_LINE_BREAK
);
474 if (cLBProp
== U_LB_COMBINING_MARK
) {
481 // Now iterate over the nodes that can start a match, looking for ones
482 // with the same char class as our ending node.
484 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
.size(); startNodeIx
++) {
485 startNode
= (RBBINode
*)matchStartNodes
.elementAt(startNodeIx
);
486 if (startNode
->fType
!= RBBINode::leafChar
) {
490 if (endNode
->fVal
== startNode
->fVal
) {
491 // The end val (character class) of one possible match is the
492 // same as the start of another.
494 // Add all nodes from the followPos of the start node to the
495 // followPos set of the end node, which will have the effect of
496 // letting matches transition from a match state at endNode
497 // to the second char of a match starting with startNode.
498 setAdd(endNode
->fFollowPos
, startNode
->fFollowPos
);
505 //-----------------------------------------------------------------------------
507 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
508 // Do an swizzle similar to chaining, modifying the followPos set of
509 // the bofNode to include the followPos nodes from other {bot} nodes
510 // scattered through the tree.
512 // This function has much in common with calcChainedFollowPos().
514 //-----------------------------------------------------------------------------
515 void RBBITableBuilder::bofFixup() {
517 if (U_FAILURE(*fStatus
)) {
521 // The parse tree looks like this ...
522 // fTree root ---> <cat>
529 // We will be adding things to the followPos set of the <bofNode>
531 RBBINode
*bofNode
= fTree
->fLeftChild
->fLeftChild
;
532 U_ASSERT(bofNode
->fType
== RBBINode::leafChar
);
533 U_ASSERT(bofNode
->fVal
== 2);
535 // Get all nodes that can be the start a match of the user-written rules
536 // (excluding the fake bofNode)
537 // We want the nodes that can start a match in the
538 // part labeled "rest of tree"
540 UVector
*matchStartNodes
= fTree
->fLeftChild
->fRightChild
->fFirstPosSet
;
544 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
->size(); startNodeIx
++) {
545 startNode
= (RBBINode
*)matchStartNodes
->elementAt(startNodeIx
);
546 if (startNode
->fType
!= RBBINode::leafChar
) {
550 if (startNode
->fVal
== bofNode
->fVal
) {
551 // We found a leaf node corresponding to a {bof} that was
552 // explicitly written into a rule.
553 // Add everything from the followPos set of this node to the
554 // followPos set of the fake bofNode at the start of the tree.
556 setAdd(bofNode
->fFollowPos
, startNode
->fFollowPos
);
561 //-----------------------------------------------------------------------------
563 // buildStateTable() Determine the set of runtime DFA states and the
564 // transition tables for these states, by the algorithm
565 // of fig. 3.44 in Aho.
567 // Most of the comments are quotes of Aho's psuedo-code.
569 //-----------------------------------------------------------------------------
570 void RBBITableBuilder::buildStateTable() {
571 if (U_FAILURE(*fStatus
)) {
574 RBBIStateDescriptor
*failState
;
575 // Set it to NULL to avoid uninitialized warning
576 RBBIStateDescriptor
*initialState
= NULL
;
578 // Add a dummy state 0 - the stop state. Not from Aho.
579 int lastInputSymbol
= fRB
->fSetBuilder
->getNumCharCategories() - 1;
580 failState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
581 if (failState
== NULL
) {
582 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
583 goto ExitBuildSTdeleteall
;
585 failState
->fPositions
= new UVector(*fStatus
);
586 if (failState
->fPositions
== NULL
) {
587 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
589 if (failState
->fPositions
== NULL
|| U_FAILURE(*fStatus
)) {
590 goto ExitBuildSTdeleteall
;
592 fDStates
->addElement(failState
, *fStatus
);
593 if (U_FAILURE(*fStatus
)) {
594 goto ExitBuildSTdeleteall
;
597 // initially, the only unmarked state in Dstates is firstpos(root),
598 // where toot is the root of the syntax tree for (r)#;
599 initialState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
600 if (initialState
== NULL
) {
601 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
603 if (U_FAILURE(*fStatus
)) {
604 goto ExitBuildSTdeleteall
;
606 initialState
->fPositions
= new UVector(*fStatus
);
607 if (initialState
->fPositions
== NULL
) {
608 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
610 if (U_FAILURE(*fStatus
)) {
611 goto ExitBuildSTdeleteall
;
613 setAdd(initialState
->fPositions
, fTree
->fFirstPosSet
);
614 fDStates
->addElement(initialState
, *fStatus
);
615 if (U_FAILURE(*fStatus
)) {
616 goto ExitBuildSTdeleteall
;
619 // while there is an unmarked state T in Dstates do begin
621 RBBIStateDescriptor
*T
= NULL
;
623 for (tx
=1; tx
<fDStates
->size(); tx
++) {
624 RBBIStateDescriptor
*temp
;
625 temp
= (RBBIStateDescriptor
*)fDStates
->elementAt(tx
);
626 if (temp
->fMarked
== FALSE
) {
638 // for each input symbol a do begin
640 for (a
= 1; a
<=lastInputSymbol
; a
++) {
641 // let U be the set of positions that are in followpos(p)
642 // for some position p in T
643 // such that the symbol at position p is a;
647 for (px
=0; px
<T
->fPositions
->size(); px
++) {
648 p
= (RBBINode
*)T
->fPositions
->elementAt(px
);
649 if ((p
->fType
== RBBINode::leafChar
) && (p
->fVal
== a
)) {
651 U
= new UVector(*fStatus
);
653 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
654 goto ExitBuildSTdeleteall
;
657 setAdd(U
, p
->fFollowPos
);
661 // if U is not empty and not in DStates then
663 UBool UinDstates
= FALSE
;
665 U_ASSERT(U
->size() > 0);
667 for (ix
=0; ix
<fDStates
->size(); ix
++) {
668 RBBIStateDescriptor
*temp2
;
669 temp2
= (RBBIStateDescriptor
*)fDStates
->elementAt(ix
);
670 if (setEquals(U
, temp2
->fPositions
)) {
672 U
= temp2
->fPositions
;
679 // Add U as an unmarked state to Dstates
682 RBBIStateDescriptor
*newState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
683 if (newState
== NULL
) {
684 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
686 if (U_FAILURE(*fStatus
)) {
687 goto ExitBuildSTdeleteall
;
689 newState
->fPositions
= U
;
690 fDStates
->addElement(newState
, *fStatus
);
691 if (U_FAILURE(*fStatus
)) {
694 ux
= fDStates
->size()-1;
698 T
->fDtran
->setElementAt(ux
, a
);
703 // delete local pointers only if error occured.
704 ExitBuildSTdeleteall
:
711 //-----------------------------------------------------------------------------
713 // flagAcceptingStates Identify accepting states.
714 // First get a list of all of the end marker nodes.
715 // Then, for each state s,
716 // if s contains one of the end marker nodes in its list of tree positions then
717 // s is an accepting state.
719 //-----------------------------------------------------------------------------
720 void RBBITableBuilder::flagAcceptingStates() {
721 if (U_FAILURE(*fStatus
)) {
724 UVector
endMarkerNodes(*fStatus
);
729 if (U_FAILURE(*fStatus
)) {
733 fTree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
734 if (U_FAILURE(*fStatus
)) {
738 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
739 endMarker
= (RBBINode
*)endMarkerNodes
.elementAt(i
);
740 for (n
=0; n
<fDStates
->size(); n
++) {
741 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
742 if (sd
->fPositions
->indexOf(endMarker
) >= 0) {
743 // Any non-zero value for fAccepting means this is an accepting node.
744 // The value is what will be returned to the user as the break status.
745 // If no other value was specified, force it to -1.
747 if (sd
->fAccepting
==0) {
748 // State hasn't been marked as accepting yet. Do it now.
749 sd
->fAccepting
= endMarker
->fVal
;
750 if (sd
->fAccepting
== 0) {
754 if (sd
->fAccepting
==-1 && endMarker
->fVal
!= 0) {
755 // Both lookahead and non-lookahead accepting for this state.
756 // Favor the look-ahead. Expedient for line break.
757 // TODO: need a more elegant resolution for conflicting rules.
758 sd
->fAccepting
= endMarker
->fVal
;
761 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
763 // If the end marker node is from a look-ahead rule, set
764 // the fLookAhead field or this state also.
765 if (endMarker
->fLookAheadEnd
) {
766 // TODO: don't change value if already set?
767 // TODO: allow for more than one active look-ahead rule in engine.
768 // Make value here an index to a side array in engine?
769 sd
->fLookAhead
= sd
->fAccepting
;
777 //-----------------------------------------------------------------------------
779 // flagLookAheadStates Very similar to flagAcceptingStates, above.
781 //-----------------------------------------------------------------------------
782 void RBBITableBuilder::flagLookAheadStates() {
783 if (U_FAILURE(*fStatus
)) {
786 UVector
lookAheadNodes(*fStatus
);
787 RBBINode
*lookAheadNode
;
791 fTree
->findNodes(&lookAheadNodes
, RBBINode::lookAhead
, *fStatus
);
792 if (U_FAILURE(*fStatus
)) {
795 for (i
=0; i
<lookAheadNodes
.size(); i
++) {
796 lookAheadNode
= (RBBINode
*)lookAheadNodes
.elementAt(i
);
798 for (n
=0; n
<fDStates
->size(); n
++) {
799 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
800 if (sd
->fPositions
->indexOf(lookAheadNode
) >= 0) {
801 sd
->fLookAhead
= lookAheadNode
->fVal
;
810 //-----------------------------------------------------------------------------
814 //-----------------------------------------------------------------------------
815 void RBBITableBuilder::flagTaggedStates() {
816 if (U_FAILURE(*fStatus
)) {
819 UVector
tagNodes(*fStatus
);
824 if (U_FAILURE(*fStatus
)) {
827 fTree
->findNodes(&tagNodes
, RBBINode::tag
, *fStatus
);
828 if (U_FAILURE(*fStatus
)) {
831 for (i
=0; i
<tagNodes
.size(); i
++) { // For each tag node t (all of 'em)
832 tagNode
= (RBBINode
*)tagNodes
.elementAt(i
);
834 for (n
=0; n
<fDStates
->size(); n
++) { // For each state s (row in the state table)
835 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
836 if (sd
->fPositions
->indexOf(tagNode
) >= 0) { // if s include the tag node t
837 sortedAdd(&sd
->fTagVals
, tagNode
->fVal
);
846 //-----------------------------------------------------------------------------
848 // mergeRuleStatusVals
850 // Update the global table of rule status {tag} values
851 // The rule builder has a global vector of status values that are common
852 // for all tables. Merge the ones from this table into the global set.
854 //-----------------------------------------------------------------------------
855 void RBBITableBuilder::mergeRuleStatusVals() {
857 // The basic outline of what happens here is this...
859 // for each state in this state table
860 // if the status tag list for this state is in the global statuses list
862 // continue with the next state
864 // add the tag list for this state to the global list.
869 // Pre-set a single tag of {0} into the table.
870 // We will need this as a default, for rule sets with no explicit tagging.
871 if (fRB
->fRuleStatusVals
->size() == 0) {
872 fRB
->fRuleStatusVals
->addElement(1, *fStatus
); // Num of statuses in group
873 fRB
->fRuleStatusVals
->addElement((int32_t)0, *fStatus
); // and our single status of zero
877 for (n
=0; n
<fDStates
->size(); n
++) {
878 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
879 UVector
*thisStatesTagValues
= sd
->fTagVals
;
880 if (thisStatesTagValues
== NULL
) {
881 // No tag values are explicitly associated with this state.
882 // Set the default tag value.
887 // There are tag(s) associated with this state.
888 // fTagsIdx will be the index into the global tag list for this state's tag values.
889 // Initial value of -1 flags that we haven't got it set yet.
891 int32_t thisTagGroupStart
= 0; // indexes into the global rule status vals list
892 int32_t nextTagGroupStart
= 0;
894 // Loop runs once per group of tags in the global list
895 while (nextTagGroupStart
< fRB
->fRuleStatusVals
->size()) {
896 thisTagGroupStart
= nextTagGroupStart
;
897 nextTagGroupStart
+= fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
) + 1;
898 if (thisStatesTagValues
->size() != fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
)) {
899 // The number of tags for this state is different from
900 // the number of tags in this group from the global list.
901 // Continue with the next group from the global list.
904 // The lengths match, go ahead and compare the actual tag values
905 // between this state and the group from the global list.
906 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
907 if (thisStatesTagValues
->elementAti(i
) !=
908 fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
+ 1 + i
) ) {
914 if (i
== thisStatesTagValues
->size()) {
915 // We found a set of tag values in the global list that match
916 // those for this state. Use them.
917 sd
->fTagsIdx
= thisTagGroupStart
;
922 if (sd
->fTagsIdx
== -1) {
923 // No suitable entry in the global tag list already. Add one
924 sd
->fTagsIdx
= fRB
->fRuleStatusVals
->size();
925 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->size(), *fStatus
);
926 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
927 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->elementAti(i
), *fStatus
);
939 //-----------------------------------------------------------------------------
941 // sortedAdd Add a value to a vector of sorted values (ints).
942 // Do not replicate entries; if the value is already there, do not
944 // Lazily create the vector if it does not already exist.
946 //-----------------------------------------------------------------------------
947 void RBBITableBuilder::sortedAdd(UVector
**vector
, int32_t val
) {
950 if (*vector
== NULL
) {
951 *vector
= new UVector(*fStatus
);
953 if (*vector
== NULL
|| U_FAILURE(*fStatus
)) {
956 UVector
*vec
= *vector
;
957 int32_t vSize
= vec
->size();
958 for (i
=0; i
<vSize
; i
++) {
959 int32_t valAtI
= vec
->elementAti(i
);
961 // The value is already in the vector. Don't add it again.
968 vec
->insertElementAt(val
, i
, *fStatus
);
973 //-----------------------------------------------------------------------------
975 // setAdd Set operation on UVector
976 // dest = dest union source
977 // Elements may only appear once and must be sorted.
979 //-----------------------------------------------------------------------------
980 void RBBITableBuilder::setAdd(UVector
*dest
, UVector
*source
) {
981 int32_t destOriginalSize
= dest
->size();
982 int32_t sourceSize
= source
->size();
984 MaybeStackArray
<void *, 16> destArray
, sourceArray
; // Handle small cases without malloc
985 void **destPtr
, **sourcePtr
;
986 void **destLim
, **sourceLim
;
988 if (destOriginalSize
> destArray
.getCapacity()) {
989 if (destArray
.resize(destOriginalSize
) == NULL
) {
993 destPtr
= destArray
.getAlias();
994 destLim
= destPtr
+ destOriginalSize
; // destArray.getArrayLimit()?
996 if (sourceSize
> sourceArray
.getCapacity()) {
997 if (sourceArray
.resize(sourceSize
) == NULL
) {
1001 sourcePtr
= sourceArray
.getAlias();
1002 sourceLim
= sourcePtr
+ sourceSize
; // sourceArray.getArrayLimit()?
1004 // Avoid multiple "get element" calls by getting the contents into arrays
1005 (void) dest
->toArray(destPtr
);
1006 (void) source
->toArray(sourcePtr
);
1008 dest
->setSize(sourceSize
+destOriginalSize
, *fStatus
);
1010 while (sourcePtr
< sourceLim
&& destPtr
< destLim
) {
1011 if (*destPtr
== *sourcePtr
) {
1012 dest
->setElementAt(*sourcePtr
++, di
++);
1015 // This check is required for machines with segmented memory, like i5/OS.
1016 // Direct pointer comparison is not recommended.
1017 else if (uprv_memcmp(destPtr
, sourcePtr
, sizeof(void *)) < 0) {
1018 dest
->setElementAt(*destPtr
++, di
++);
1020 else { /* *sourcePtr < *destPtr */
1021 dest
->setElementAt(*sourcePtr
++, di
++);
1025 // At most one of these two cleanup loops will execute
1026 while (destPtr
< destLim
) {
1027 dest
->setElementAt(*destPtr
++, di
++);
1029 while (sourcePtr
< sourceLim
) {
1030 dest
->setElementAt(*sourcePtr
++, di
++);
1033 dest
->setSize(di
, *fStatus
);
1038 //-----------------------------------------------------------------------------
1040 // setEqual Set operation on UVector.
1041 // Compare for equality.
1042 // Elements must be sorted.
1044 //-----------------------------------------------------------------------------
1045 UBool
RBBITableBuilder::setEquals(UVector
*a
, UVector
*b
) {
1046 return a
->equals(*b
);
1050 //-----------------------------------------------------------------------------
1052 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1053 // for each node in the tree.
1055 //-----------------------------------------------------------------------------
1057 void RBBITableBuilder::printPosSets(RBBINode
*n
) {
1062 RBBINode::printNodeHeader();
1063 RBBINode::printNode(n
);
1064 RBBIDebugPrintf(" Nullable: %s\n", n
->fNullable
?"TRUE":"FALSE");
1066 RBBIDebugPrintf(" firstpos: ");
1067 printSet(n
->fFirstPosSet
);
1069 RBBIDebugPrintf(" lastpos: ");
1070 printSet(n
->fLastPosSet
);
1072 RBBIDebugPrintf(" followpos: ");
1073 printSet(n
->fFollowPos
);
1075 printPosSets(n
->fLeftChild
);
1076 printPosSets(n
->fRightChild
);
1082 //-----------------------------------------------------------------------------
1084 // getTableSize() Calculate the size of the runtime form of this
1085 // state transition table.
1087 //-----------------------------------------------------------------------------
1088 int32_t RBBITableBuilder::getTableSize() const {
1094 if (fTree
== NULL
) {
1098 size
= sizeof(RBBIStateTable
) - 4; // The header, with no rows to the table.
1100 numRows
= fDStates
->size();
1101 numCols
= fRB
->fSetBuilder
->getNumCharCategories();
1103 // Note The declaration of RBBIStateTableRow is for a table of two columns.
1104 // Therefore we subtract two from numCols when determining
1105 // how much storage to add to a row for the total columns.
1106 rowSize
= sizeof(RBBIStateTableRow
) + sizeof(uint16_t)*(numCols
-2);
1107 size
+= numRows
* rowSize
;
1113 //-----------------------------------------------------------------------------
1115 // exportTable() export the state transition table in the format required
1116 // by the runtime engine. getTableSize() bytes of memory
1117 // must be available at the output address "where".
1119 //-----------------------------------------------------------------------------
1120 void RBBITableBuilder::exportTable(void *where
) {
1121 RBBIStateTable
*table
= (RBBIStateTable
*)where
;
1125 if (U_FAILURE(*fStatus
) || fTree
== NULL
) {
1129 if (fRB
->fSetBuilder
->getNumCharCategories() > 0x7fff ||
1130 fDStates
->size() > 0x7fff) {
1131 *fStatus
= U_BRK_INTERNAL_ERROR
;
1135 table
->fRowLen
= sizeof(RBBIStateTableRow
) +
1136 sizeof(uint16_t) * (fRB
->fSetBuilder
->getNumCharCategories() - 2);
1137 table
->fNumStates
= fDStates
->size();
1139 if (fRB
->fLookAheadHardBreak
) {
1140 table
->fFlags
|= RBBI_LOOKAHEAD_HARD_BREAK
;
1142 if (fRB
->fSetBuilder
->sawBOF()) {
1143 table
->fFlags
|= RBBI_BOF_REQUIRED
;
1145 table
->fReserved
= 0;
1147 for (state
=0; state
<table
->fNumStates
; state
++) {
1148 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(state
);
1149 RBBIStateTableRow
*row
= (RBBIStateTableRow
*)(table
->fTableData
+ state
*table
->fRowLen
);
1150 U_ASSERT (-32768 < sd
->fAccepting
&& sd
->fAccepting
<= 32767);
1151 U_ASSERT (-32768 < sd
->fLookAhead
&& sd
->fLookAhead
<= 32767);
1152 row
->fAccepting
= (int16_t)sd
->fAccepting
;
1153 row
->fLookAhead
= (int16_t)sd
->fLookAhead
;
1154 row
->fTagIdx
= (int16_t)sd
->fTagsIdx
;
1155 for (col
=0; col
<fRB
->fSetBuilder
->getNumCharCategories(); col
++) {
1156 row
->fNextState
[col
] = (uint16_t)sd
->fDtran
->elementAti(col
);
1163 //-----------------------------------------------------------------------------
1165 // printSet Debug function. Print the contents of a UVector
1167 //-----------------------------------------------------------------------------
1169 void RBBITableBuilder::printSet(UVector
*s
) {
1171 for (i
=0; i
<s
->size(); i
++) {
1172 const RBBINode
*v
= static_cast<const RBBINode
*>(s
->elementAt(i
));
1173 RBBIDebugPrintf("%5d", v
==NULL
? -1 : v
->fSerialNum
);
1175 RBBIDebugPrintf("\n");
1180 //-----------------------------------------------------------------------------
1182 // printStates Debug Function. Dump the fully constructed state transition table.
1184 //-----------------------------------------------------------------------------
1186 void RBBITableBuilder::printStates() {
1187 int c
; // input "character"
1188 int n
; // state number
1190 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1191 RBBIDebugPrintf(" | Acc LA Tag");
1192 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1193 RBBIDebugPrintf(" %2d", c
);
1195 RBBIDebugPrintf("\n");
1196 RBBIDebugPrintf(" |---------------");
1197 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1198 RBBIDebugPrintf("---");
1200 RBBIDebugPrintf("\n");
1202 for (n
=0; n
<fDStates
->size(); n
++) {
1203 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
1204 RBBIDebugPrintf(" %3d | " , n
);
1205 RBBIDebugPrintf("%3d %3d %5d ", sd
->fAccepting
, sd
->fLookAhead
, sd
->fTagsIdx
);
1206 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1207 RBBIDebugPrintf(" %2d", sd
->fDtran
->elementAti(c
));
1209 RBBIDebugPrintf("\n");
1211 RBBIDebugPrintf("\n\n");
1217 //-----------------------------------------------------------------------------
1219 // printRuleStatusTable Debug Function. Dump the common rule status table
1221 //-----------------------------------------------------------------------------
1223 void RBBITableBuilder::printRuleStatusTable() {
1224 int32_t thisRecord
= 0;
1225 int32_t nextRecord
= 0;
1227 UVector
*tbl
= fRB
->fRuleStatusVals
;
1229 RBBIDebugPrintf("index | tags \n");
1230 RBBIDebugPrintf("-------------------\n");
1232 while (nextRecord
< tbl
->size()) {
1233 thisRecord
= nextRecord
;
1234 nextRecord
= thisRecord
+ tbl
->elementAti(thisRecord
) + 1;
1235 RBBIDebugPrintf("%4d ", thisRecord
);
1236 for (i
=thisRecord
+1; i
<nextRecord
; i
++) {
1237 RBBIDebugPrintf(" %5d", tbl
->elementAti(i
));
1239 RBBIDebugPrintf("\n");
1241 RBBIDebugPrintf("\n\n");
1246 //-----------------------------------------------------------------------------
1248 // RBBIStateDescriptor Methods. This is a very struct-like class
1249 // Most access is directly to the fields.
1251 //-----------------------------------------------------------------------------
1253 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol
, UErrorCode
*fStatus
) {
1262 fDtran
= new UVector(lastInputSymbol
+1, *fStatus
);
1263 if (U_FAILURE(*fStatus
)) {
1266 if (fDtran
== NULL
) {
1267 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
1270 fDtran
->setSize(lastInputSymbol
+1, *fStatus
); // fDtran needs to be pre-sized.
1271 // It is indexed by input symbols, and will
1272 // hold the next state number for each
1277 RBBIStateDescriptor::~RBBIStateDescriptor() {
1288 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */