2 **********************************************************************
3 * Copyright (c) 2002-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
12 #include "unicode/utypes.h"
14 #if !UCONFIG_NO_BREAK_ITERATION
16 #include "unicode/unistr.h"
27 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder
*rb
, RBBINode
**rootNode
) :
30 fStatus
= fRB
->fStatus
;
31 UErrorCode status
= U_ZERO_ERROR
;
32 fDStates
= new UVector(status
);
33 if (U_FAILURE(*fStatus
)) {
36 if (U_FAILURE(status
)) {
40 if (fDStates
== NULL
) {
41 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;;
47 RBBITableBuilder::~RBBITableBuilder() {
49 for (i
=0; i
<fDStates
->size(); i
++) {
50 delete (RBBIStateDescriptor
*)fDStates
->elementAt(i
);
56 //-----------------------------------------------------------------------------
58 // RBBITableBuilder::build - This is the main function for building the DFA state transtion
59 // table from the RBBI rules parse tree.
61 //-----------------------------------------------------------------------------
62 void RBBITableBuilder::build() {
64 if (U_FAILURE(*fStatus
)) {
68 // If there were no rules, just return. This situation can easily arise
69 // for the reverse rules.
75 // Walk through the tree, replacing any references to $variables with a copy of the
76 // parse tree for the substition expression.
78 fTree
= fTree
->flattenVariables();
80 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "ftree")) {
81 RBBIDebugPuts("Parse tree after flattening variable references.");
82 fTree
->printTree(TRUE
);
87 // If the rules contained any references to {bof}
88 // add a {bof} <cat> <former root of tree> to the
89 // tree. Means that all matches must start out with the
90 // {bof} fake character.
92 if (fRB
->fSetBuilder
->sawBOF()) {
93 RBBINode
*bofTop
= new RBBINode(RBBINode::opCat
);
94 RBBINode
*bofLeaf
= new RBBINode(RBBINode::leafChar
);
95 bofTop
->fLeftChild
= bofLeaf
;
96 bofTop
->fRightChild
= fTree
;
97 bofLeaf
->fParent
= bofTop
;
98 bofLeaf
->fVal
= 2; // Reserved value for {bof}.
103 // Add a unique right-end marker to the expression.
104 // Appears as a cat-node, left child being the original tree,
105 // right child being the end marker.
107 RBBINode
*cn
= new RBBINode(RBBINode::opCat
);
108 cn
->fLeftChild
= fTree
;
110 cn
->fRightChild
= new RBBINode(RBBINode::endMark
);
111 cn
->fRightChild
->fParent
= cn
;
115 // Replace all references to UnicodeSets with the tree for the equivalent
118 fTree
->flattenSets();
120 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "stree")) {
121 RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
122 fTree
->printTree(TRUE
);
128 // calculate the functions nullable, firstpos, lastpos and followpos on
129 // nodes in the parse tree.
130 // See the alogrithm description in Aho.
131 // Understanding how this works by looking at the code alone will be
132 // nearly impossible.
137 calcFollowPos(fTree
);
138 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "pos")) {
144 // For "chained" rules, modify the followPos sets
146 if (fRB
->fChainRules
) {
147 calcChainedFollowPos(fTree
);
151 // BOF (start of input) test fixup.
153 if (fRB
->fSetBuilder
->sawBOF()) {
158 // Build the DFA state transition tables.
161 flagAcceptingStates();
162 flagLookAheadStates();
166 // Update the global table of rule status {tag} values
167 // The rule builder has a global vector of status values that are common
168 // for all tables. Merge the ones from this table into the global set.
170 mergeRuleStatusVals();
172 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "states")) {printStates();};
177 //-----------------------------------------------------------------------------
179 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
181 //-----------------------------------------------------------------------------
182 void RBBITableBuilder::calcNullable(RBBINode
*n
) {
186 if (n
->fType
== RBBINode::setRef
||
187 n
->fType
== RBBINode::endMark
) {
188 // These are non-empty leaf node types.
189 n
->fNullable
= FALSE
;
193 if (n
->fType
== RBBINode::lookAhead
|| n
->fType
== RBBINode::tag
) {
194 // Lookahead marker node. It's a leaf, so no recursion on children.
195 // It's nullable because it does not match any literal text from the input stream.
201 // The node is not a leaf.
202 // Calculate nullable on its children.
203 calcNullable(n
->fLeftChild
);
204 calcNullable(n
->fRightChild
);
206 // Apply functions from table 3.40 in Aho
207 if (n
->fType
== RBBINode::opOr
) {
208 n
->fNullable
= n
->fLeftChild
->fNullable
|| n
->fRightChild
->fNullable
;
210 else if (n
->fType
== RBBINode::opCat
) {
211 n
->fNullable
= n
->fLeftChild
->fNullable
&& n
->fRightChild
->fNullable
;
213 else if (n
->fType
== RBBINode::opStar
|| n
->fType
== RBBINode::opQuestion
) {
217 n
->fNullable
= FALSE
;
224 //-----------------------------------------------------------------------------
226 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
228 //-----------------------------------------------------------------------------
229 void RBBITableBuilder::calcFirstPos(RBBINode
*n
) {
233 if (n
->fType
== RBBINode::leafChar
||
234 n
->fType
== RBBINode::endMark
||
235 n
->fType
== RBBINode::lookAhead
||
236 n
->fType
== RBBINode::tag
) {
237 // These are non-empty leaf node types.
238 // Note: In order to maintain the sort invariant on the set,
239 // this function should only be called on a node whose set is
240 // empty to start with.
241 n
->fFirstPosSet
->addElement(n
, *fStatus
);
245 // The node is not a leaf.
246 // Calculate firstPos on its children.
247 calcFirstPos(n
->fLeftChild
);
248 calcFirstPos(n
->fRightChild
);
250 // Apply functions from table 3.40 in Aho
251 if (n
->fType
== RBBINode::opOr
) {
252 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
253 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
255 else if (n
->fType
== RBBINode::opCat
) {
256 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
257 if (n
->fLeftChild
->fNullable
) {
258 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
261 else if (n
->fType
== RBBINode::opStar
||
262 n
->fType
== RBBINode::opQuestion
||
263 n
->fType
== RBBINode::opPlus
) {
264 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
270 //-----------------------------------------------------------------------------
272 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
274 //-----------------------------------------------------------------------------
275 void RBBITableBuilder::calcLastPos(RBBINode
*n
) {
279 if (n
->fType
== RBBINode::leafChar
||
280 n
->fType
== RBBINode::endMark
||
281 n
->fType
== RBBINode::lookAhead
||
282 n
->fType
== RBBINode::tag
) {
283 // These are non-empty leaf node types.
284 // Note: In order to maintain the sort invariant on the set,
285 // this function should only be called on a node whose set is
286 // empty to start with.
287 n
->fLastPosSet
->addElement(n
, *fStatus
);
291 // The node is not a leaf.
292 // Calculate lastPos on its children.
293 calcLastPos(n
->fLeftChild
);
294 calcLastPos(n
->fRightChild
);
296 // Apply functions from table 3.40 in Aho
297 if (n
->fType
== RBBINode::opOr
) {
298 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
299 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
301 else if (n
->fType
== RBBINode::opCat
) {
302 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
303 if (n
->fRightChild
->fNullable
) {
304 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
307 else if (n
->fType
== RBBINode::opStar
||
308 n
->fType
== RBBINode::opQuestion
||
309 n
->fType
== RBBINode::opPlus
) {
310 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
316 //-----------------------------------------------------------------------------
318 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
320 //-----------------------------------------------------------------------------
321 void RBBITableBuilder::calcFollowPos(RBBINode
*n
) {
323 n
->fType
== RBBINode::leafChar
||
324 n
->fType
== RBBINode::endMark
) {
328 calcFollowPos(n
->fLeftChild
);
329 calcFollowPos(n
->fRightChild
);
332 if (n
->fType
== RBBINode::opCat
) {
333 RBBINode
*i
; // is 'i' in Aho's description
336 UVector
*LastPosOfLeftChild
= n
->fLeftChild
->fLastPosSet
;
338 for (ix
=0; ix
<(uint32_t)LastPosOfLeftChild
->size(); ix
++) {
339 i
= (RBBINode
*)LastPosOfLeftChild
->elementAt(ix
);
340 setAdd(i
->fFollowPos
, n
->fRightChild
->fFirstPosSet
);
345 if (n
->fType
== RBBINode::opStar
||
346 n
->fType
== RBBINode::opPlus
) {
347 RBBINode
*i
; // again, n and i are the names from Aho's description.
350 for (ix
=0; ix
<(uint32_t)n
->fLastPosSet
->size(); ix
++) {
351 i
= (RBBINode
*)n
->fLastPosSet
->elementAt(ix
);
352 setAdd(i
->fFollowPos
, n
->fFirstPosSet
);
361 //-----------------------------------------------------------------------------
363 // calcChainedFollowPos. Modify the previously calculated followPos sets
364 // to implement rule chaining. NOT described by Aho
366 //-----------------------------------------------------------------------------
367 void RBBITableBuilder::calcChainedFollowPos(RBBINode
*tree
) {
369 UVector
endMarkerNodes(*fStatus
);
370 UVector
leafNodes(*fStatus
);
373 if (U_FAILURE(*fStatus
)) {
377 // get a list of all endmarker nodes.
378 tree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
380 // get a list all leaf nodes
381 tree
->findNodes(&leafNodes
, RBBINode::leafChar
, *fStatus
);
382 if (U_FAILURE(*fStatus
)) {
386 // Get all nodes that can be the start a match, which is FirstPosition()
387 // of the portion of the tree corresponding to user-written rules.
388 // See the tree description in bofFixup().
389 RBBINode
*userRuleRoot
= tree
;
390 if (fRB
->fSetBuilder
->sawBOF()) {
391 userRuleRoot
= tree
->fLeftChild
->fRightChild
;
393 U_ASSERT(userRuleRoot
!= NULL
);
394 UVector
*matchStartNodes
= userRuleRoot
->fFirstPosSet
;
397 // Iteratate over all leaf nodes,
402 for (endNodeIx
=0; endNodeIx
<leafNodes
.size(); endNodeIx
++) {
403 RBBINode
*tNode
= (RBBINode
*)leafNodes
.elementAt(endNodeIx
);
404 RBBINode
*endNode
= NULL
;
406 // Identify leaf nodes that correspond to overall rule match positions.
407 // These include an endMarkerNode in their followPos sets.
408 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
409 if (tNode
->fFollowPos
->contains(endMarkerNodes
.elementAt(i
))) {
414 if (endNode
== NULL
) {
415 // node wasn't an end node. Try again with the next.
419 // We've got a node that can end a match.
421 // Line Break Specific hack: If this node's val correspond to the $CM char class,
422 // don't chain from it.
423 // TODO: Add rule syntax for this behavior, get specifics out of here and
424 // into the rule file.
425 if (fRB
->fLBCMNoChain
) {
426 UChar32 c
= this->fRB
->fSetBuilder
->getFirstChar(endNode
->fVal
);
428 // c == -1 occurs with sets containing only the {eof} marker string.
429 ULineBreak cLBProp
= (ULineBreak
)u_getIntPropertyValue(c
, UCHAR_LINE_BREAK
);
430 if (cLBProp
== U_LB_COMBINING_MARK
) {
437 // Now iterate over the nodes that can start a match, looking for ones
438 // with the same char class as our ending node.
440 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
->size(); startNodeIx
++) {
441 startNode
= (RBBINode
*)matchStartNodes
->elementAt(startNodeIx
);
442 if (startNode
->fType
!= RBBINode::leafChar
) {
446 if (endNode
->fVal
== startNode
->fVal
) {
447 // The end val (character class) of one possible match is the
448 // same as the start of another.
450 // Add all nodes from the followPos of the start node to the
451 // followPos set of the end node, which will have the effect of
452 // letting matches transition from a match state at endNode
453 // to the second char of a match starting with startNode.
454 setAdd(endNode
->fFollowPos
, startNode
->fFollowPos
);
461 //-----------------------------------------------------------------------------
463 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
464 // Do an swizzle similar to chaining, modifying the followPos set of
465 // the bofNode to include the followPos nodes from other {bot} nodes
466 // scattered through the tree.
468 // This function has much in common with calcChainedFollowPos().
470 //-----------------------------------------------------------------------------
471 void RBBITableBuilder::bofFixup() {
473 if (U_FAILURE(*fStatus
)) {
477 // The parse tree looks like this ...
478 // fTree root ---> <cat>
485 // We will be adding things to the followPos set of the <bofNode>
487 RBBINode
*bofNode
= fTree
->fLeftChild
->fLeftChild
;
488 U_ASSERT(bofNode
->fType
== RBBINode::leafChar
);
489 U_ASSERT(bofNode
->fVal
== 2);
491 // Get all nodes that can be the start a match of the user-written rules
492 // (excluding the fake bofNode)
493 // We want the nodes that can start a match in the
494 // part labeled "rest of tree"
496 UVector
*matchStartNodes
= fTree
->fLeftChild
->fRightChild
->fFirstPosSet
;
500 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
->size(); startNodeIx
++) {
501 startNode
= (RBBINode
*)matchStartNodes
->elementAt(startNodeIx
);
502 if (startNode
->fType
!= RBBINode::leafChar
) {
506 if (startNode
->fVal
== bofNode
->fVal
) {
507 // We found a leaf node corresponding to a {bof} that was
508 // explicitly written into a rule.
509 // Add everything from the followPos set of this node to the
510 // followPos set of the fake bofNode at the start of the tree.
512 setAdd(bofNode
->fFollowPos
, startNode
->fFollowPos
);
517 //-----------------------------------------------------------------------------
519 // buildStateTable() Determine the set of runtime DFA states and the
520 // transition tables for these states, by the algorithm
521 // of fig. 3.44 in Aho.
523 // Most of the comments are quotes of Aho's psuedo-code.
525 //-----------------------------------------------------------------------------
526 void RBBITableBuilder::buildStateTable() {
527 if (U_FAILURE(*fStatus
)) {
531 // Add a dummy state 0 - the stop state. Not from Aho.
532 int lastInputSymbol
= fRB
->fSetBuilder
->getNumCharCategories() - 1;
533 RBBIStateDescriptor
*failState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
534 failState
->fPositions
= new UVector(*fStatus
);
535 if (U_FAILURE(*fStatus
)) {
538 fDStates
->addElement(failState
, *fStatus
);
539 if (U_FAILURE(*fStatus
)) {
543 // initially, the only unmarked state in Dstates is firstpos(root),
544 // where toot is the root of the syntax tree for (r)#;
545 RBBIStateDescriptor
*initialState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
546 if (U_FAILURE(*fStatus
)) {
549 initialState
->fPositions
= new UVector(*fStatus
);
550 if (U_FAILURE(*fStatus
)) {
553 setAdd(initialState
->fPositions
, fTree
->fFirstPosSet
);
554 fDStates
->addElement(initialState
, *fStatus
);
555 if (U_FAILURE(*fStatus
)) {
559 // while there is an unmarked state T in Dstates do begin
561 RBBIStateDescriptor
*T
= NULL
;
563 for (tx
=1; tx
<fDStates
->size(); tx
++) {
564 RBBIStateDescriptor
*temp
;
565 temp
= (RBBIStateDescriptor
*)fDStates
->elementAt(tx
);
566 if (temp
->fMarked
== FALSE
) {
578 // for each input symbol a do begin
580 for (a
= 1; a
<=lastInputSymbol
; a
++) {
581 // let U be the set of positions that are in followpos(p)
582 // for some position p in T
583 // such that the symbol at position p is a;
587 for (px
=0; px
<T
->fPositions
->size(); px
++) {
588 p
= (RBBINode
*)T
->fPositions
->elementAt(px
);
589 if ((p
->fType
== RBBINode::leafChar
) && (p
->fVal
== a
)) {
591 U
= new UVector(*fStatus
);
593 setAdd(U
, p
->fFollowPos
);
597 // if U is not empty and not in DStates then
599 UBool UinDstates
= FALSE
;
601 U_ASSERT(U
->size() > 0);
603 for (ix
=0; ix
<fDStates
->size(); ix
++) {
604 RBBIStateDescriptor
*temp2
;
605 temp2
= (RBBIStateDescriptor
*)fDStates
->elementAt(ix
);
606 if (setEquals(U
, temp2
->fPositions
)) {
608 U
= temp2
->fPositions
;
615 // Add U as an unmarked state to Dstates
618 RBBIStateDescriptor
*newState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
619 if (U_FAILURE(*fStatus
)) {
622 newState
->fPositions
= U
;
623 fDStates
->addElement(newState
, *fStatus
);
624 if (U_FAILURE(*fStatus
)) {
627 ux
= fDStates
->size()-1;
631 T
->fDtran
->setElementAt(ux
, a
);
639 //-----------------------------------------------------------------------------
641 // flagAcceptingStates Identify accepting states.
642 // First get a list of all of the end marker nodes.
643 // Then, for each state s,
644 // if s contains one of the end marker nodes in its list of tree positions then
645 // s is an accepting state.
647 //-----------------------------------------------------------------------------
648 void RBBITableBuilder::flagAcceptingStates() {
649 if (U_FAILURE(*fStatus
)) {
652 UVector
endMarkerNodes(*fStatus
);
657 if (U_FAILURE(*fStatus
)) {
661 fTree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
662 if (U_FAILURE(*fStatus
)) {
666 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
667 endMarker
= (RBBINode
*)endMarkerNodes
.elementAt(i
);
668 for (n
=0; n
<fDStates
->size(); n
++) {
669 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
670 if (sd
->fPositions
->indexOf(endMarker
) >= 0) {
671 // Any non-zero value for fAccepting means this is an accepting node.
672 // The value is what will be returned to the user as the break status.
673 // If no other value was specified, force it to -1.
675 if (sd
->fAccepting
==0) {
676 // State hasn't been marked as accepting yet. Do it now.
677 sd
->fAccepting
= endMarker
->fVal
;
678 if (sd
->fAccepting
== 0) {
682 if (sd
->fAccepting
==-1 && endMarker
->fVal
!= 0) {
683 // Both lookahead and non-lookahead accepting for this state.
684 // Favor the look-ahead. Expedient for line break.
685 // TODO: need a more elegant resolution for conflicting rules.
686 sd
->fAccepting
= endMarker
->fVal
;
689 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
691 // If the end marker node is from a look-ahead rule, set
692 // the fLookAhead field or this state also.
693 if (endMarker
->fLookAheadEnd
) {
694 // TODO: don't change value if already set?
695 // TODO: allow for more than one active look-ahead rule in engine.
696 // Make value here an index to a side array in engine?
697 sd
->fLookAhead
= sd
->fAccepting
;
705 //-----------------------------------------------------------------------------
707 // flagLookAheadStates Very similar to flagAcceptingStates, above.
709 //-----------------------------------------------------------------------------
710 void RBBITableBuilder::flagLookAheadStates() {
711 if (U_FAILURE(*fStatus
)) {
714 UVector
lookAheadNodes(*fStatus
);
715 RBBINode
*lookAheadNode
;
719 fTree
->findNodes(&lookAheadNodes
, RBBINode::lookAhead
, *fStatus
);
720 if (U_FAILURE(*fStatus
)) {
723 for (i
=0; i
<lookAheadNodes
.size(); i
++) {
724 lookAheadNode
= (RBBINode
*)lookAheadNodes
.elementAt(i
);
726 for (n
=0; n
<fDStates
->size(); n
++) {
727 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
728 if (sd
->fPositions
->indexOf(lookAheadNode
) >= 0) {
729 sd
->fLookAhead
= lookAheadNode
->fVal
;
738 //-----------------------------------------------------------------------------
742 //-----------------------------------------------------------------------------
743 void RBBITableBuilder::flagTaggedStates() {
744 if (U_FAILURE(*fStatus
)) {
747 UVector
tagNodes(*fStatus
);
752 if (U_FAILURE(*fStatus
)) {
755 fTree
->findNodes(&tagNodes
, RBBINode::tag
, *fStatus
);
756 if (U_FAILURE(*fStatus
)) {
759 for (i
=0; i
<tagNodes
.size(); i
++) { // For each tag node t (all of 'em)
760 tagNode
= (RBBINode
*)tagNodes
.elementAt(i
);
762 for (n
=0; n
<fDStates
->size(); n
++) { // For each state s (row in the state table)
763 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
764 if (sd
->fPositions
->indexOf(tagNode
) >= 0) { // if s include the tag node t
765 sortedAdd(&sd
->fTagVals
, tagNode
->fVal
);
774 //-----------------------------------------------------------------------------
776 // mergeRuleStatusVals
778 // Update the global table of rule status {tag} values
779 // The rule builder has a global vector of status values that are common
780 // for all tables. Merge the ones from this table into the global set.
782 //-----------------------------------------------------------------------------
783 void RBBITableBuilder::mergeRuleStatusVals() {
785 // The basic outline of what happens here is this...
787 // for each state in this state table
788 // if the status tag list for this state is in the global statuses list
790 // continue with the next state
792 // add the tag list for this state to the global list.
797 // Pre-set a single tag of {0} into the table.
798 // We will need this as a default, for rule sets with no explicit tagging.
799 if (fRB
->fRuleStatusVals
->size() == 0) {
800 fRB
->fRuleStatusVals
->addElement(1, *fStatus
); // Num of statuses in group
801 fRB
->fRuleStatusVals
->addElement((int32_t)0, *fStatus
); // and our single status of zero
805 for (n
=0; n
<fDStates
->size(); n
++) {
806 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
807 UVector
*thisStatesTagValues
= sd
->fTagVals
;
808 if (thisStatesTagValues
== NULL
) {
809 // No tag values are explicitly associated with this state.
810 // Set the default tag value.
815 // There are tag(s) associated with this state.
816 // fTagsIdx will be the index into the global tag list for this state's tag values.
817 // Initial value of -1 flags that we haven't got it set yet.
819 int32_t thisTagGroupStart
= 0; // indexes into the global rule status vals list
820 int32_t nextTagGroupStart
= 0;
822 // Loop runs once per group of tags in the global list
823 while (nextTagGroupStart
< fRB
->fRuleStatusVals
->size()) {
824 thisTagGroupStart
= nextTagGroupStart
;
825 nextTagGroupStart
+= fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
) + 1;
826 if (thisStatesTagValues
->size() != fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
)) {
827 // The number of tags for this state is different from
828 // the number of tags in this group from the global list.
829 // Continue with the next group from the global list.
832 // The lengths match, go ahead and compare the actual tag values
833 // between this state and the group from the global list.
834 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
835 if (thisStatesTagValues
->elementAti(i
) !=
836 fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
+ 1 + i
) ) {
842 if (i
== thisStatesTagValues
->size()) {
843 // We found a set of tag values in the global list that match
844 // those for this state. Use them.
845 sd
->fTagsIdx
= thisTagGroupStart
;
850 if (sd
->fTagsIdx
== -1) {
851 // No suitable entry in the global tag list already. Add one
852 sd
->fTagsIdx
= fRB
->fRuleStatusVals
->size();
853 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->size(), *fStatus
);
854 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
855 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->elementAti(i
), *fStatus
);
867 //-----------------------------------------------------------------------------
869 // sortedAdd Add a value to a vector of sorted values (ints).
870 // Do not replicate entries; if the value is already there, do not
872 // Lazily create the vector if it does not already exist.
874 //-----------------------------------------------------------------------------
875 void RBBITableBuilder::sortedAdd(UVector
**vector
, int32_t val
) {
878 if (*vector
== NULL
) {
879 *vector
= new UVector(*fStatus
);
881 if (*vector
== NULL
|| U_FAILURE(*fStatus
)) {
884 UVector
*vec
= *vector
;
885 int32_t vSize
= vec
->size();
886 for (i
=0; i
<vSize
; i
++) {
887 int32_t valAtI
= vec
->elementAti(i
);
889 // The value is already in the vector. Don't add it again.
896 vec
->insertElementAt(val
, i
, *fStatus
);
901 //-----------------------------------------------------------------------------
903 // setAdd Set operation on UVector
904 // dest = dest union source
905 // Elements may only appear once and must be sorted.
907 //-----------------------------------------------------------------------------
908 void RBBITableBuilder::setAdd(UVector
*dest
, UVector
*source
) {
909 int destOriginalSize
= dest
->size();
910 int sourceSize
= source
->size();
912 void *(destS
[16]), *(sourceS
[16]); // Handle small cases without malloc
913 void **destH
= 0, **sourceH
= 0;
914 void **destBuff
, **sourceBuff
;
915 void **destLim
, **sourceLim
;
917 if (destOriginalSize
> sizeof(destS
)/sizeof(destS
[0])) {
918 destH
= (void **)uprv_malloc(sizeof(void *) * destOriginalSize
);
927 destLim
= destBuff
+ destOriginalSize
;
929 if (sourceSize
> sizeof(sourceS
)/sizeof(sourceS
[0])) {
930 sourceH
= (void **)uprv_malloc(sizeof(void *) * sourceSize
);
931 sourceBuff
= sourceH
;
934 sourceBuff
= sourceS
;
936 if (sourceBuff
== 0) {
942 sourceLim
= sourceBuff
+ sourceSize
;
944 // Avoid multiple "get element" calls by getting the contents into arrays
945 (void) dest
->toArray(destBuff
);
946 (void) source
->toArray(sourceBuff
);
948 dest
->setSize(sourceSize
+destOriginalSize
);
950 while (sourceBuff
< sourceLim
&& destBuff
< destLim
) {
951 if (*destBuff
< *sourceBuff
) {
952 dest
->setElementAt(*destBuff
++, di
++);
954 else if (*sourceBuff
< *destBuff
) {
955 dest
->setElementAt(*sourceBuff
++, di
++);
958 dest
->setElementAt(*sourceBuff
++, di
++);
963 // At most one of these two cleanup loops will execute
964 while (destBuff
< destLim
) {
965 dest
->setElementAt(*destBuff
++, di
++);
967 while (sourceBuff
< sourceLim
) {
968 dest
->setElementAt(*sourceBuff
++, di
++);
982 //-----------------------------------------------------------------------------
984 // setEqual Set operation on UVector.
985 // Compare for equality.
986 // Elements must be sorted.
988 //-----------------------------------------------------------------------------
989 UBool
RBBITableBuilder::setEquals(UVector
*a
, UVector
*b
) {
990 return a
->equals(*b
);
994 //-----------------------------------------------------------------------------
996 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
997 // for each node in the tree.
999 //-----------------------------------------------------------------------------
1001 void RBBITableBuilder::printPosSets(RBBINode
*n
) {
1006 RBBIDebugPrintf(" Nullable: %s\n", n
->fNullable
?"TRUE":"FALSE");
1008 RBBIDebugPrintf(" firstpos: ");
1009 printSet(n
->fFirstPosSet
);
1011 RBBIDebugPrintf(" lastpos: ");
1012 printSet(n
->fLastPosSet
);
1014 RBBIDebugPrintf(" followpos: ");
1015 printSet(n
->fFollowPos
);
1017 printPosSets(n
->fLeftChild
);
1018 printPosSets(n
->fRightChild
);
1024 //-----------------------------------------------------------------------------
1026 // getTableSize() Calculate the size of the runtime form of this
1027 // state transition table.
1029 //-----------------------------------------------------------------------------
1030 int32_t RBBITableBuilder::getTableSize() const {
1036 if (fTree
== NULL
) {
1040 size
= sizeof(RBBIStateTable
) - 4; // The header, with no rows to the table.
1042 numRows
= fDStates
->size();
1043 numCols
= fRB
->fSetBuilder
->getNumCharCategories();
1045 // Note The declaration of RBBIStateTableRow is for a table of two columns.
1046 // Therefore we subtract two from numCols when determining
1047 // how much storage to add to a row for the total columns.
1048 rowSize
= sizeof(RBBIStateTableRow
) + sizeof(uint16_t)*(numCols
-2);
1049 size
+= numRows
* rowSize
;
1055 //-----------------------------------------------------------------------------
1057 // exportTable() export the state transition table in the format required
1058 // by the runtime engine. getTableSize() bytes of memory
1059 // must be available at the output address "where".
1061 //-----------------------------------------------------------------------------
1062 void RBBITableBuilder::exportTable(void *where
) {
1063 RBBIStateTable
*table
= (RBBIStateTable
*)where
;
1067 if (U_FAILURE(*fStatus
) || fTree
== NULL
) {
1071 if (fRB
->fSetBuilder
->getNumCharCategories() > 0x7fff ||
1072 fDStates
->size() > 0x7fff) {
1073 *fStatus
= U_BRK_INTERNAL_ERROR
;
1077 table
->fRowLen
= sizeof(RBBIStateTableRow
) +
1078 sizeof(uint16_t) * (fRB
->fSetBuilder
->getNumCharCategories() - 2);
1079 table
->fNumStates
= fDStates
->size();
1081 if (fRB
->fLookAheadHardBreak
) {
1082 table
->fFlags
|= RBBI_LOOKAHEAD_HARD_BREAK
;
1084 if (fRB
->fSetBuilder
->sawBOF()) {
1085 table
->fFlags
|= RBBI_BOF_REQUIRED
;
1087 table
->fReserved
= 0;
1089 for (state
=0; state
<table
->fNumStates
; state
++) {
1090 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(state
);
1091 RBBIStateTableRow
*row
= (RBBIStateTableRow
*)(table
->fTableData
+ state
*table
->fRowLen
);
1092 U_ASSERT (-32768 < sd
->fAccepting
&& sd
->fAccepting
<= 32767);
1093 U_ASSERT (-32768 < sd
->fLookAhead
&& sd
->fLookAhead
<= 32767);
1094 row
->fAccepting
= (int16_t)sd
->fAccepting
;
1095 row
->fLookAhead
= (int16_t)sd
->fLookAhead
;
1096 row
->fTagIdx
= (int16_t)sd
->fTagsIdx
;
1097 for (col
=0; col
<fRB
->fSetBuilder
->getNumCharCategories(); col
++) {
1098 row
->fNextState
[col
] = (uint16_t)sd
->fDtran
->elementAti(col
);
1105 //-----------------------------------------------------------------------------
1107 // printSet Debug function. Print the contents of a UVector
1109 //-----------------------------------------------------------------------------
1111 void RBBITableBuilder::printSet(UVector
*s
) {
1113 for (i
=0; i
<s
->size(); i
++) {
1114 void *v
= s
->elementAt(i
);
1115 RBBIDebugPrintf("%10p", v
);
1117 RBBIDebugPrintf("\n");
1122 //-----------------------------------------------------------------------------
1124 // printStates Debug Function. Dump the fully constructed state transition table.
1126 //-----------------------------------------------------------------------------
1128 void RBBITableBuilder::printStates() {
1129 int c
; // input "character"
1130 int n
; // state number
1132 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1133 RBBIDebugPrintf(" | Acc LA Tag");
1134 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1135 RBBIDebugPrintf(" %2d", c
);
1137 RBBIDebugPrintf("\n");
1138 RBBIDebugPrintf(" |---------------");
1139 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1140 RBBIDebugPrintf("---");
1142 RBBIDebugPrintf("\n");
1144 for (n
=0; n
<fDStates
->size(); n
++) {
1145 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
1146 RBBIDebugPrintf(" %3d | " , n
);
1147 RBBIDebugPrintf("%3d %3d %5d ", sd
->fAccepting
, sd
->fLookAhead
, sd
->fTagsIdx
);
1148 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1149 RBBIDebugPrintf(" %2d", sd
->fDtran
->elementAti(c
));
1151 RBBIDebugPrintf("\n");
1153 RBBIDebugPrintf("\n\n");
1159 //-----------------------------------------------------------------------------
1161 // printRuleStatusTable Debug Function. Dump the common rule status table
1163 //-----------------------------------------------------------------------------
1165 void RBBITableBuilder::printRuleStatusTable() {
1166 int32_t thisRecord
= 0;
1167 int32_t nextRecord
= 0;
1169 UVector
*tbl
= fRB
->fRuleStatusVals
;
1171 RBBIDebugPrintf("index | tags \n");
1172 RBBIDebugPrintf("-------------------\n");
1174 while (nextRecord
< tbl
->size()) {
1175 thisRecord
= nextRecord
;
1176 nextRecord
= thisRecord
+ tbl
->elementAti(thisRecord
) + 1;
1177 RBBIDebugPrintf("%4d ", thisRecord
);
1178 for (i
=thisRecord
+1; i
<nextRecord
; i
++) {
1179 RBBIDebugPrintf(" %5d", tbl
->elementAti(i
));
1181 RBBIDebugPrintf("\n");
1183 RBBIDebugPrintf("\n\n");
1188 //-----------------------------------------------------------------------------
1190 // RBBIStateDescriptor Methods. This is a very struct-like class
1191 // Most access is directly to the fields.
1193 //-----------------------------------------------------------------------------
1195 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol
, UErrorCode
*fStatus
) {
1204 fDtran
= new UVector(lastInputSymbol
+1, *fStatus
);
1205 if (U_FAILURE(*fStatus
)) {
1208 if (fDtran
== NULL
) {
1209 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
1212 fDtran
->setSize(lastInputSymbol
+1); // fDtran needs to be pre-sized.
1213 // It is indexed by input symbols, and will
1214 // hold the next state number for each
1219 RBBIStateDescriptor::~RBBIStateDescriptor() {
1230 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */