2 ********************************************************************** 
   3 *   Copyright (c) 2002-2006, International Business Machines 
   4 *   Corporation and others.  All Rights Reserved. 
   5 ********************************************************************** 
  12 #include "unicode/utypes.h" 
  14 #if !UCONFIG_NO_BREAK_ITERATION 
  16 #include "unicode/unistr.h" 
  27 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder 
*rb
, RBBINode 
**rootNode
) : 
  30     fStatus             
= fRB
->fStatus
; 
  31     UErrorCode status   
= U_ZERO_ERROR
; 
  32     fDStates            
= new UVector(status
); 
  33     if (U_FAILURE(*fStatus
)) { 
  36     if (U_FAILURE(status
)) { 
  40     if (fDStates 
== NULL
) { 
  41         *fStatus 
= U_MEMORY_ALLOCATION_ERROR
;; 
  47 RBBITableBuilder::~RBBITableBuilder() { 
  49     for (i
=0; i
<fDStates
->size(); i
++) { 
  50         delete (RBBIStateDescriptor 
*)fDStates
->elementAt(i
); 
  56 //----------------------------------------------------------------------------- 
  58 //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion 
  59 //                               table from the RBBI rules parse tree. 
  61 //----------------------------------------------------------------------------- 
  62 void  RBBITableBuilder::build() { 
  64     if (U_FAILURE(*fStatus
)) { 
  68     // If there were no rules, just return.  This situation can easily arise 
  69     //   for the reverse rules. 
  75     // Walk through the tree, replacing any references to $variables with a copy of the 
  76     //   parse tree for the substition expression. 
  78     fTree 
= fTree
->flattenVariables(); 
  80     if (fRB
->fDebugEnv 
&& uprv_strstr(fRB
->fDebugEnv
, "ftree")) { 
  81         RBBIDebugPuts("Parse tree after flattening variable references."); 
  82         fTree
->printTree(TRUE
); 
  87     // If the rules contained any references to {bof}  
  88     //   add a {bof} <cat> <former root of tree> to the 
  89     //   tree.  Means that all matches must start out with the  
  90     //   {bof} fake character. 
  92     if (fRB
->fSetBuilder
->sawBOF()) { 
  93         RBBINode 
*bofTop    
= new RBBINode(RBBINode::opCat
); 
  94         RBBINode 
*bofLeaf   
= new RBBINode(RBBINode::leafChar
); 
  95         bofTop
->fLeftChild  
= bofLeaf
; 
  96         bofTop
->fRightChild 
= fTree
; 
  97         bofLeaf
->fParent    
= bofTop
; 
  98         bofLeaf
->fVal       
= 2;      // Reserved value for {bof}. 
 103     // Add a unique right-end marker to the expression. 
 104     //   Appears as a cat-node, left child being the original tree, 
 105     //   right child being the end marker. 
 107     RBBINode 
*cn 
= new RBBINode(RBBINode::opCat
); 
 108     cn
->fLeftChild 
= fTree
; 
 110     cn
->fRightChild 
= new RBBINode(RBBINode::endMark
); 
 111     cn
->fRightChild
->fParent 
= cn
; 
 115     //  Replace all references to UnicodeSets with the tree for the equivalent 
 118     fTree
->flattenSets(); 
 120     if (fRB
->fDebugEnv 
&& uprv_strstr(fRB
->fDebugEnv
, "stree")) { 
 121         RBBIDebugPuts("Parse tree after flattening Unicode Set references."); 
 122         fTree
->printTree(TRUE
); 
 128     // calculate the functions nullable, firstpos, lastpos and followpos on 
 129     // nodes in the parse tree. 
 130     //    See the alogrithm description in Aho. 
 131     //    Understanding how this works by looking at the code alone will be 
 132     //       nearly impossible. 
 137     calcFollowPos(fTree
); 
 138     if (fRB
->fDebugEnv 
&& uprv_strstr(fRB
->fDebugEnv
, "pos")) { 
 144     //  For "chained" rules, modify the followPos sets 
 146     if (fRB
->fChainRules
) { 
 147         calcChainedFollowPos(fTree
); 
 151     //  BOF (start of input) test fixup. 
 153     if (fRB
->fSetBuilder
->sawBOF()) { 
 158     // Build the DFA state transition tables. 
 161     flagAcceptingStates(); 
 162     flagLookAheadStates(); 
 166     // Update the global table of rule status {tag} values 
 167     // The rule builder has a global vector of status values that are common 
 168     //    for all tables.  Merge the ones from this table into the global set. 
 170     mergeRuleStatusVals(); 
 172     if (fRB
->fDebugEnv 
&& uprv_strstr(fRB
->fDebugEnv
, "states")) {printStates();}; 
 177 //----------------------------------------------------------------------------- 
 179 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9 
 181 //----------------------------------------------------------------------------- 
 182 void RBBITableBuilder::calcNullable(RBBINode 
*n
) { 
 186     if (n
->fType 
== RBBINode::setRef 
|| 
 187         n
->fType 
== RBBINode::endMark 
) { 
 188         // These are non-empty leaf node types. 
 189         n
->fNullable 
= FALSE
; 
 193     if (n
->fType 
== RBBINode::lookAhead 
|| n
->fType 
== RBBINode::tag
) { 
 194         // Lookahead marker node.  It's a leaf, so no recursion on children. 
 195         // It's nullable because it does not match any literal text from the input stream. 
 201     // The node is not a leaf. 
 202     //  Calculate nullable on its children. 
 203     calcNullable(n
->fLeftChild
); 
 204     calcNullable(n
->fRightChild
); 
 206     // Apply functions from table 3.40 in Aho 
 207     if (n
->fType 
== RBBINode::opOr
) { 
 208         n
->fNullable 
= n
->fLeftChild
->fNullable 
|| n
->fRightChild
->fNullable
; 
 210     else if (n
->fType 
== RBBINode::opCat
) { 
 211         n
->fNullable 
= n
->fLeftChild
->fNullable 
&& n
->fRightChild
->fNullable
; 
 213     else if (n
->fType 
== RBBINode::opStar 
|| n
->fType 
== RBBINode::opQuestion
) { 
 217         n
->fNullable 
= FALSE
; 
 224 //----------------------------------------------------------------------------- 
 226 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9 
 228 //----------------------------------------------------------------------------- 
 229 void RBBITableBuilder::calcFirstPos(RBBINode 
*n
) { 
 233     if (n
->fType 
== RBBINode::leafChar  
|| 
 234         n
->fType 
== RBBINode::endMark   
|| 
 235         n
->fType 
== RBBINode::lookAhead 
|| 
 236         n
->fType 
== RBBINode::tag
) { 
 237         // These are non-empty leaf node types. 
 238         // Note: In order to maintain the sort invariant on the set, 
 239         // this function should only be called on a node whose set is 
 240         // empty to start with. 
 241         n
->fFirstPosSet
->addElement(n
, *fStatus
); 
 245     // The node is not a leaf. 
 246     //  Calculate firstPos on its children. 
 247     calcFirstPos(n
->fLeftChild
); 
 248     calcFirstPos(n
->fRightChild
); 
 250     // Apply functions from table 3.40 in Aho 
 251     if (n
->fType 
== RBBINode::opOr
) { 
 252         setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
); 
 253         setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
); 
 255     else if (n
->fType 
== RBBINode::opCat
) { 
 256         setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
); 
 257         if (n
->fLeftChild
->fNullable
) { 
 258             setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
); 
 261     else if (n
->fType 
== RBBINode::opStar 
|| 
 262              n
->fType 
== RBBINode::opQuestion 
|| 
 263              n
->fType 
== RBBINode::opPlus
) { 
 264         setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
); 
 270 //----------------------------------------------------------------------------- 
 272 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9 
 274 //----------------------------------------------------------------------------- 
 275 void RBBITableBuilder::calcLastPos(RBBINode 
*n
) { 
 279     if (n
->fType 
== RBBINode::leafChar  
|| 
 280         n
->fType 
== RBBINode::endMark   
|| 
 281         n
->fType 
== RBBINode::lookAhead 
|| 
 282         n
->fType 
== RBBINode::tag
) { 
 283         // These are non-empty leaf node types. 
 284         // Note: In order to maintain the sort invariant on the set, 
 285         // this function should only be called on a node whose set is 
 286         // empty to start with. 
 287         n
->fLastPosSet
->addElement(n
, *fStatus
); 
 291     // The node is not a leaf. 
 292     //  Calculate lastPos on its children. 
 293     calcLastPos(n
->fLeftChild
); 
 294     calcLastPos(n
->fRightChild
); 
 296     // Apply functions from table 3.40 in Aho 
 297     if (n
->fType 
== RBBINode::opOr
) { 
 298         setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
); 
 299         setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
); 
 301     else if (n
->fType 
== RBBINode::opCat
) { 
 302         setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
); 
 303         if (n
->fRightChild
->fNullable
) { 
 304             setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
); 
 307     else if (n
->fType 
== RBBINode::opStar     
|| 
 308              n
->fType 
== RBBINode::opQuestion 
|| 
 309              n
->fType 
== RBBINode::opPlus
) { 
 310         setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
); 
 316 //----------------------------------------------------------------------------- 
 318 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9 
 320 //----------------------------------------------------------------------------- 
 321 void RBBITableBuilder::calcFollowPos(RBBINode 
*n
) { 
 323         n
->fType 
== RBBINode::leafChar 
|| 
 324         n
->fType 
== RBBINode::endMark
) { 
 328     calcFollowPos(n
->fLeftChild
); 
 329     calcFollowPos(n
->fRightChild
); 
 332     if (n
->fType 
== RBBINode::opCat
) { 
 333         RBBINode 
*i
;   // is 'i' in Aho's description 
 336         UVector 
*LastPosOfLeftChild 
= n
->fLeftChild
->fLastPosSet
; 
 338         for (ix
=0; ix
<(uint32_t)LastPosOfLeftChild
->size(); ix
++) { 
 339             i 
= (RBBINode 
*)LastPosOfLeftChild
->elementAt(ix
); 
 340             setAdd(i
->fFollowPos
, n
->fRightChild
->fFirstPosSet
); 
 345     if (n
->fType 
== RBBINode::opStar 
|| 
 346         n
->fType 
== RBBINode::opPlus
) { 
 347         RBBINode   
*i
;  // again, n and i are the names from Aho's description. 
 350         for (ix
=0; ix
<(uint32_t)n
->fLastPosSet
->size(); ix
++) { 
 351             i 
= (RBBINode 
*)n
->fLastPosSet
->elementAt(ix
); 
 352             setAdd(i
->fFollowPos
, n
->fFirstPosSet
); 
 361 //----------------------------------------------------------------------------- 
 363 //   calcChainedFollowPos.    Modify the previously calculated followPos sets 
 364 //                            to implement rule chaining.  NOT described by Aho 
 366 //----------------------------------------------------------------------------- 
 367 void RBBITableBuilder::calcChainedFollowPos(RBBINode 
*tree
) { 
 369     UVector         
endMarkerNodes(*fStatus
); 
 370     UVector         
leafNodes(*fStatus
); 
 373     if (U_FAILURE(*fStatus
)) { 
 377     // get a list of all endmarker nodes. 
 378     tree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
); 
 380     // get a list all leaf nodes 
 381     tree
->findNodes(&leafNodes
, RBBINode::leafChar
, *fStatus
); 
 382     if (U_FAILURE(*fStatus
)) { 
 386     // Get all nodes that can be the start a match, which is FirstPosition() 
 387     // of the portion of the tree corresponding to user-written rules. 
 388     // See the tree description in bofFixup(). 
 389     RBBINode 
*userRuleRoot 
= tree
; 
 390     if (fRB
->fSetBuilder
->sawBOF()) { 
 391         userRuleRoot 
= tree
->fLeftChild
->fRightChild
; 
 393     U_ASSERT(userRuleRoot 
!= NULL
); 
 394     UVector 
*matchStartNodes 
= userRuleRoot
->fFirstPosSet
; 
 397     // Iteratate over all leaf nodes, 
 402     for (endNodeIx
=0; endNodeIx
<leafNodes
.size(); endNodeIx
++) { 
 403         RBBINode 
*tNode   
= (RBBINode 
*)leafNodes
.elementAt(endNodeIx
); 
 404         RBBINode 
*endNode 
= NULL
; 
 406         // Identify leaf nodes that correspond to overall rule match positions. 
 407         //   These include an endMarkerNode in their followPos sets. 
 408         for (i
=0; i
<endMarkerNodes
.size(); i
++) { 
 409             if (tNode
->fFollowPos
->contains(endMarkerNodes
.elementAt(i
))) { 
 414         if (endNode 
== NULL
) { 
 415             // node wasn't an end node.  Try again with the next. 
 419         // We've got a node that can end a match. 
 421         // Line Break Specific hack:  If this node's val correspond to the $CM char class, 
 422         //                            don't chain from it. 
 423         // TODO:  Add rule syntax for this behavior, get specifics out of here and 
 424         //        into the rule file. 
 425         if (fRB
->fLBCMNoChain
) { 
 426             UChar32 c 
= this->fRB
->fSetBuilder
->getFirstChar(endNode
->fVal
); 
 428                 // c == -1 occurs with sets containing only the {eof} marker string. 
 429                 ULineBreak cLBProp 
= (ULineBreak
)u_getIntPropertyValue(c
, UCHAR_LINE_BREAK
); 
 430                 if (cLBProp 
== U_LB_COMBINING_MARK
) { 
 437         // Now iterate over the nodes that can start a match, looking for ones 
 438         //   with the same char class as our ending node. 
 440         for (startNodeIx 
= 0; startNodeIx
<matchStartNodes
->size(); startNodeIx
++) { 
 441             startNode 
= (RBBINode 
*)matchStartNodes
->elementAt(startNodeIx
); 
 442             if (startNode
->fType 
!= RBBINode::leafChar
) { 
 446             if (endNode
->fVal 
== startNode
->fVal
) { 
 447                 // The end val (character class) of one possible match is the 
 448                 //   same as the start of another. 
 450                 // Add all nodes from the followPos of the start node to the 
 451                 //  followPos set of the end node, which will have the effect of 
 452                 //  letting matches transition from a match state at endNode 
 453                 //  to the second char of a match starting with startNode. 
 454                 setAdd(endNode
->fFollowPos
, startNode
->fFollowPos
); 
 461 //----------------------------------------------------------------------------- 
 463 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing. 
 464 //                Do an swizzle similar to chaining, modifying the followPos set of 
 465 //                the bofNode to include the followPos nodes from other {bot} nodes 
 466 //                scattered through the tree. 
 468 //                This function has much in common with calcChainedFollowPos(). 
 470 //----------------------------------------------------------------------------- 
 471 void RBBITableBuilder::bofFixup() { 
 473     if (U_FAILURE(*fStatus
)) { 
 477     //   The parse tree looks like this ... 
 478     //         fTree root  --->       <cat> 
 485     //    We will be adding things to the followPos set of the <bofNode> 
 487     RBBINode  
*bofNode 
= fTree
->fLeftChild
->fLeftChild
; 
 488     U_ASSERT(bofNode
->fType 
== RBBINode::leafChar
); 
 489     U_ASSERT(bofNode
->fVal 
== 2); 
 491     // Get all nodes that can be the start a match of the user-written rules 
 492     //  (excluding the fake bofNode) 
 493     //  We want the nodes that can start a match in the 
 494     //     part labeled "rest of tree" 
 496     UVector 
*matchStartNodes 
= fTree
->fLeftChild
->fRightChild
->fFirstPosSet
; 
 500     for (startNodeIx 
= 0; startNodeIx
<matchStartNodes
->size(); startNodeIx
++) { 
 501         startNode 
= (RBBINode 
*)matchStartNodes
->elementAt(startNodeIx
); 
 502         if (startNode
->fType 
!= RBBINode::leafChar
) { 
 506         if (startNode
->fVal 
== bofNode
->fVal
) { 
 507             //  We found a leaf node corresponding to a {bof} that was 
 508             //    explicitly written into a rule. 
 509             //  Add everything from the followPos set of this node to the 
 510             //    followPos set of the fake bofNode at the start of the tree. 
 512             setAdd(bofNode
->fFollowPos
, startNode
->fFollowPos
); 
 517 //----------------------------------------------------------------------------- 
 519 //   buildStateTable()    Determine the set of runtime DFA states and the 
 520 //                        transition tables for these states, by the algorithm 
 521 //                        of fig. 3.44 in Aho. 
 523 //                        Most of the comments are quotes of Aho's psuedo-code. 
 525 //----------------------------------------------------------------------------- 
 526 void RBBITableBuilder::buildStateTable() { 
 527     if (U_FAILURE(*fStatus
)) { 
 531     // Add a dummy state 0 - the stop state.  Not from Aho. 
 532     int      lastInputSymbol 
= fRB
->fSetBuilder
->getNumCharCategories() - 1; 
 533     RBBIStateDescriptor 
*failState 
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
); 
 534     failState
->fPositions 
= new UVector(*fStatus
); 
 535     if (U_FAILURE(*fStatus
)) { 
 538     fDStates
->addElement(failState
, *fStatus
); 
 539     if (U_FAILURE(*fStatus
)) { 
 543     // initially, the only unmarked state in Dstates is firstpos(root), 
 544     //       where toot is the root of the syntax tree for (r)#; 
 545     RBBIStateDescriptor 
*initialState 
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
); 
 546     if (U_FAILURE(*fStatus
)) { 
 549     initialState
->fPositions 
= new UVector(*fStatus
); 
 550     if (U_FAILURE(*fStatus
)) { 
 553     setAdd(initialState
->fPositions
, fTree
->fFirstPosSet
); 
 554     fDStates
->addElement(initialState
, *fStatus
); 
 555     if (U_FAILURE(*fStatus
)) { 
 559     // while there is an unmarked state T in Dstates do begin 
 561         RBBIStateDescriptor 
*T 
= NULL
; 
 563         for (tx
=1; tx
<fDStates
->size(); tx
++) { 
 564             RBBIStateDescriptor 
*temp
; 
 565             temp 
= (RBBIStateDescriptor 
*)fDStates
->elementAt(tx
); 
 566             if (temp
->fMarked 
== FALSE
) { 
 578         // for each input symbol a do begin 
 580         for (a 
= 1; a
<=lastInputSymbol
; a
++) { 
 581             // let U be the set of positions that are in followpos(p) 
 582             //    for some position p in T 
 583             //    such that the symbol at position p is a; 
 587             for (px
=0; px
<T
->fPositions
->size(); px
++) { 
 588                 p 
= (RBBINode 
*)T
->fPositions
->elementAt(px
); 
 589                 if ((p
->fType 
== RBBINode::leafChar
) &&  (p
->fVal 
== a
)) { 
 591                         U 
= new UVector(*fStatus
); 
 593                     setAdd(U
, p
->fFollowPos
); 
 597             // if U is not empty and not in DStates then 
 599             UBool    UinDstates 
= FALSE
; 
 601                 U_ASSERT(U
->size() > 0); 
 603                 for (ix
=0; ix
<fDStates
->size(); ix
++) { 
 604                     RBBIStateDescriptor 
*temp2
; 
 605                     temp2 
= (RBBIStateDescriptor 
*)fDStates
->elementAt(ix
); 
 606                     if (setEquals(U
, temp2
->fPositions
)) { 
 608                         U  
= temp2
->fPositions
; 
 615                 // Add U as an unmarked state to Dstates 
 618                     RBBIStateDescriptor 
*newState 
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
); 
 619                     if (U_FAILURE(*fStatus
)) { 
 622                     newState
->fPositions 
= U
; 
 623                     fDStates
->addElement(newState
, *fStatus
); 
 624                     if (U_FAILURE(*fStatus
)) { 
 627                     ux 
= fDStates
->size()-1; 
 631                 T
->fDtran
->setElementAt(ux
, a
); 
 639 //----------------------------------------------------------------------------- 
 641 //   flagAcceptingStates    Identify accepting states. 
 642 //                          First get a list of all of the end marker nodes. 
 643 //                          Then, for each state s, 
 644 //                              if s contains one of the end marker nodes in its list of tree positions then 
 645 //                                  s is an accepting state. 
 647 //----------------------------------------------------------------------------- 
 648 void     RBBITableBuilder::flagAcceptingStates() { 
 649     if (U_FAILURE(*fStatus
)) { 
 652     UVector     
endMarkerNodes(*fStatus
); 
 657     if (U_FAILURE(*fStatus
)) { 
 661     fTree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
); 
 662     if (U_FAILURE(*fStatus
)) { 
 666     for (i
=0; i
<endMarkerNodes
.size(); i
++) { 
 667         endMarker 
= (RBBINode 
*)endMarkerNodes
.elementAt(i
); 
 668         for (n
=0; n
<fDStates
->size(); n
++) { 
 669             RBBIStateDescriptor 
*sd 
= (RBBIStateDescriptor 
*)fDStates
->elementAt(n
); 
 670             if (sd
->fPositions
->indexOf(endMarker
) >= 0) { 
 671                 // Any non-zero value for fAccepting means this is an accepting node. 
 672                 // The value is what will be returned to the user as the break status. 
 673                 // If no other value was specified, force it to -1. 
 675                 if (sd
->fAccepting
==0) { 
 676                     // State hasn't been marked as accepting yet.  Do it now. 
 677                     sd
->fAccepting 
= endMarker
->fVal
; 
 678                     if (sd
->fAccepting 
== 0) { 
 682                 if (sd
->fAccepting
==-1 && endMarker
->fVal 
!= 0) { 
 683                     // Both lookahead and non-lookahead accepting for this state. 
 684                     // Favor the look-ahead.  Expedient for line break. 
 685                     // TODO:  need a more elegant resolution for conflicting rules. 
 686                     sd
->fAccepting 
= endMarker
->fVal
; 
 689                 // if sd->fAccepting already had a value other than 0 or -1, leave it be. 
 691                 // If the end marker node is from a look-ahead rule, set 
 692                 //   the fLookAhead field or this state also. 
 693                 if (endMarker
->fLookAheadEnd
) { 
 694                     // TODO:  don't change value if already set? 
 695                     // TODO:  allow for more than one active look-ahead rule in engine. 
 696                     //        Make value here an index to a side array in engine? 
 697                     sd
->fLookAhead 
= sd
->fAccepting
; 
 705 //----------------------------------------------------------------------------- 
 707 //    flagLookAheadStates   Very similar to flagAcceptingStates, above. 
 709 //----------------------------------------------------------------------------- 
 710 void     RBBITableBuilder::flagLookAheadStates() { 
 711     if (U_FAILURE(*fStatus
)) { 
 714     UVector     
lookAheadNodes(*fStatus
); 
 715     RBBINode    
*lookAheadNode
; 
 719     fTree
->findNodes(&lookAheadNodes
, RBBINode::lookAhead
, *fStatus
); 
 720     if (U_FAILURE(*fStatus
)) { 
 723     for (i
=0; i
<lookAheadNodes
.size(); i
++) { 
 724         lookAheadNode 
= (RBBINode 
*)lookAheadNodes
.elementAt(i
); 
 726         for (n
=0; n
<fDStates
->size(); n
++) { 
 727             RBBIStateDescriptor 
*sd 
= (RBBIStateDescriptor 
*)fDStates
->elementAt(n
); 
 728             if (sd
->fPositions
->indexOf(lookAheadNode
) >= 0) { 
 729                 sd
->fLookAhead 
= lookAheadNode
->fVal
; 
 738 //----------------------------------------------------------------------------- 
 742 //----------------------------------------------------------------------------- 
 743 void     RBBITableBuilder::flagTaggedStates() { 
 744     if (U_FAILURE(*fStatus
)) { 
 747     UVector     
tagNodes(*fStatus
); 
 752     if (U_FAILURE(*fStatus
)) { 
 755     fTree
->findNodes(&tagNodes
, RBBINode::tag
, *fStatus
); 
 756     if (U_FAILURE(*fStatus
)) { 
 759     for (i
=0; i
<tagNodes
.size(); i
++) {                   // For each tag node t (all of 'em) 
 760         tagNode 
= (RBBINode 
*)tagNodes
.elementAt(i
); 
 762         for (n
=0; n
<fDStates
->size(); n
++) {              //    For each state  s (row in the state table) 
 763             RBBIStateDescriptor 
*sd 
= (RBBIStateDescriptor 
*)fDStates
->elementAt(n
); 
 764             if (sd
->fPositions
->indexOf(tagNode
) >= 0) {  //       if  s include the tag node t 
 765                 sortedAdd(&sd
->fTagVals
, tagNode
->fVal
); 
 774 //----------------------------------------------------------------------------- 
 776 //  mergeRuleStatusVals 
 778 //      Update the global table of rule status {tag} values 
 779 //      The rule builder has a global vector of status values that are common 
 780 //      for all tables.  Merge the ones from this table into the global set. 
 782 //----------------------------------------------------------------------------- 
 783 void  RBBITableBuilder::mergeRuleStatusVals() { 
 785     //  The basic outline of what happens here is this... 
 787     //    for each state in this state table 
 788     //       if the status tag list for this state is in the global statuses list 
 790     //           continue with the next state 
 792     //           add the tag list for this state to the global list. 
 797     // Pre-set a single tag of {0} into the table. 
 798     //   We will need this as a default, for rule sets with no explicit tagging. 
 799     if (fRB
->fRuleStatusVals
->size() == 0) { 
 800         fRB
->fRuleStatusVals
->addElement(1, *fStatus
);  // Num of statuses in group 
 801         fRB
->fRuleStatusVals
->addElement((int32_t)0, *fStatus
);  //   and our single status of zero 
 805     for (n
=0; n
<fDStates
->size(); n
++) { 
 806         RBBIStateDescriptor 
*sd 
= (RBBIStateDescriptor 
*)fDStates
->elementAt(n
); 
 807         UVector 
*thisStatesTagValues 
= sd
->fTagVals
; 
 808         if (thisStatesTagValues 
== NULL
) { 
 809             // No tag values are explicitly associated with this state. 
 810             //   Set the default tag value. 
 815         // There are tag(s) associated with this state. 
 816         //   fTagsIdx will be the index into the global tag list for this state's tag values. 
 817         //   Initial value of -1 flags that we haven't got it set yet. 
 819         int32_t  thisTagGroupStart 
= 0;   // indexes into the global rule status vals list 
 820         int32_t  nextTagGroupStart 
= 0; 
 822         // Loop runs once per group of tags in the global list 
 823         while (nextTagGroupStart 
< fRB
->fRuleStatusVals
->size()) { 
 824             thisTagGroupStart 
= nextTagGroupStart
; 
 825             nextTagGroupStart 
+= fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
) + 1; 
 826             if (thisStatesTagValues
->size() != fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
)) { 
 827                 // The number of tags for this state is different from 
 828                 //    the number of tags in this group from the global list. 
 829                 //    Continue with the next group from the global list. 
 832             // The lengths match, go ahead and compare the actual tag values 
 833             //    between this state and the group from the global list. 
 834             for (i
=0; i
<thisStatesTagValues
->size(); i
++) { 
 835                 if (thisStatesTagValues
->elementAti(i
) != 
 836                     fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart 
+ 1 + i
) ) { 
 842             if (i 
== thisStatesTagValues
->size()) { 
 843                 // We found a set of tag values in the global list that match 
 844                 //   those for this state.  Use them. 
 845                 sd
->fTagsIdx 
= thisTagGroupStart
; 
 850         if (sd
->fTagsIdx 
== -1) { 
 851             // No suitable entry in the global tag list already.  Add one 
 852             sd
->fTagsIdx 
= fRB
->fRuleStatusVals
->size(); 
 853             fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->size(), *fStatus
); 
 854             for (i
=0; i
<thisStatesTagValues
->size(); i
++) { 
 855                 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->elementAti(i
), *fStatus
); 
 867 //----------------------------------------------------------------------------- 
 869 //  sortedAdd  Add a value to a vector of sorted values (ints). 
 870 //             Do not replicate entries; if the value is already there, do not 
 872 //             Lazily create the vector if it does not already exist. 
 874 //----------------------------------------------------------------------------- 
 875 void RBBITableBuilder::sortedAdd(UVector 
**vector
, int32_t val
) { 
 878     if (*vector 
== NULL
) { 
 879         *vector 
= new UVector(*fStatus
); 
 881     if (*vector 
== NULL 
|| U_FAILURE(*fStatus
)) { 
 884     UVector 
*vec 
= *vector
; 
 885     int32_t  vSize 
= vec
->size(); 
 886     for (i
=0; i
<vSize
; i
++) { 
 887         int32_t valAtI 
= vec
->elementAti(i
); 
 889             // The value is already in the vector.  Don't add it again. 
 896     vec
->insertElementAt(val
, i
, *fStatus
); 
 901 //----------------------------------------------------------------------------- 
 903 //  setAdd     Set operation on UVector 
 904 //             dest = dest union source 
 905 //             Elements may only appear once and must be sorted. 
 907 //----------------------------------------------------------------------------- 
 908 void RBBITableBuilder::setAdd(UVector 
*dest
, UVector 
*source
) { 
 909     int destOriginalSize 
= dest
->size(); 
 910     int sourceSize       
= source
->size(); 
 912     void *(destS
[16]), *(sourceS
[16]);  // Handle small cases without malloc 
 913     void **destH 
= 0, **sourceH 
= 0; 
 914     void **destBuff
, **sourceBuff
; 
 915     void **destLim
, **sourceLim
; 
 917     if (destOriginalSize 
> sizeof(destS
)/sizeof(destS
[0])) { 
 918         destH 
= (void **)uprv_malloc(sizeof(void *) * destOriginalSize
); 
 927     destLim 
= destBuff 
+ destOriginalSize
; 
 929     if (sourceSize 
> sizeof(sourceS
)/sizeof(sourceS
[0])) { 
 930         sourceH 
= (void **)uprv_malloc(sizeof(void *) * sourceSize
); 
 931         sourceBuff 
= sourceH
; 
 934         sourceBuff 
= sourceS
; 
 936     if (sourceBuff 
== 0) { 
 942     sourceLim 
= sourceBuff 
+ sourceSize
; 
 944     // Avoid multiple "get element" calls by getting the contents into arrays 
 945     (void) dest
->toArray(destBuff
); 
 946     (void) source
->toArray(sourceBuff
); 
 948     dest
->setSize(sourceSize
+destOriginalSize
); 
 950     while (sourceBuff 
< sourceLim 
&& destBuff 
< destLim
) { 
 951         if (*destBuff 
< *sourceBuff
) { 
 952             dest
->setElementAt(*destBuff
++, di
++); 
 954         else if (*sourceBuff 
< *destBuff
) { 
 955             dest
->setElementAt(*sourceBuff
++, di
++); 
 958             dest
->setElementAt(*sourceBuff
++, di
++); 
 963     // At most one of these two cleanup loops will execute 
 964     while (destBuff 
< destLim
) { 
 965         dest
->setElementAt(*destBuff
++, di
++); 
 967     while (sourceBuff 
< sourceLim
) { 
 968         dest
->setElementAt(*sourceBuff
++, di
++); 
 982 //----------------------------------------------------------------------------- 
 984 //  setEqual    Set operation on UVector. 
 985 //              Compare for equality. 
 986 //              Elements must be sorted. 
 988 //----------------------------------------------------------------------------- 
 989 UBool 
RBBITableBuilder::setEquals(UVector 
*a
, UVector 
*b
) { 
 990     return a
->equals(*b
); 
 994 //----------------------------------------------------------------------------- 
 996 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos 
 997 //                 for each node in the tree. 
 999 //----------------------------------------------------------------------------- 
1001 void RBBITableBuilder::printPosSets(RBBINode 
*n
) { 
1006     RBBIDebugPrintf("         Nullable:  %s\n", n
->fNullable
?"TRUE":"FALSE"); 
1008     RBBIDebugPrintf("         firstpos:  "); 
1009     printSet(n
->fFirstPosSet
); 
1011     RBBIDebugPrintf("         lastpos:   "); 
1012     printSet(n
->fLastPosSet
); 
1014     RBBIDebugPrintf("         followpos: "); 
1015     printSet(n
->fFollowPos
); 
1017     printPosSets(n
->fLeftChild
); 
1018     printPosSets(n
->fRightChild
); 
1024 //----------------------------------------------------------------------------- 
1026 //   getTableSize()    Calculate the size of the runtime form of this 
1027 //                     state transition table. 
1029 //----------------------------------------------------------------------------- 
1030 int32_t  RBBITableBuilder::getTableSize() const { 
1036     if (fTree 
== NULL
) { 
1040     size    
= sizeof(RBBIStateTable
) - 4;    // The header, with no rows to the table. 
1042     numRows 
= fDStates
->size(); 
1043     numCols 
= fRB
->fSetBuilder
->getNumCharCategories(); 
1045     //  Note  The declaration of RBBIStateTableRow is for a table of two columns. 
1046     //        Therefore we subtract two from numCols when determining 
1047     //        how much storage to add to a row for the total columns. 
1048     rowSize 
= sizeof(RBBIStateTableRow
) + sizeof(uint16_t)*(numCols
-2); 
1049     size   
+= numRows 
* rowSize
; 
1055 //----------------------------------------------------------------------------- 
1057 //   exportTable()    export the state transition table in the format required 
1058 //                    by the runtime engine.  getTableSize() bytes of memory 
1059 //                    must be available at the output address "where". 
1061 //----------------------------------------------------------------------------- 
1062 void RBBITableBuilder::exportTable(void *where
) { 
1063     RBBIStateTable    
*table 
= (RBBIStateTable 
*)where
; 
1067     if (U_FAILURE(*fStatus
) || fTree 
== NULL
) { 
1071     if (fRB
->fSetBuilder
->getNumCharCategories() > 0x7fff || 
1072         fDStates
->size() > 0x7fff) { 
1073         *fStatus 
= U_BRK_INTERNAL_ERROR
; 
1077     table
->fRowLen    
= sizeof(RBBIStateTableRow
) + 
1078                             sizeof(uint16_t) * (fRB
->fSetBuilder
->getNumCharCategories() - 2); 
1079     table
->fNumStates 
= fDStates
->size(); 
1081     if (fRB
->fLookAheadHardBreak
) { 
1082         table
->fFlags  
|= RBBI_LOOKAHEAD_HARD_BREAK
; 
1084     if (fRB
->fSetBuilder
->sawBOF()) { 
1085         table
->fFlags  
|= RBBI_BOF_REQUIRED
; 
1087     table
->fReserved  
= 0; 
1089     for (state
=0; state
<table
->fNumStates
; state
++) { 
1090         RBBIStateDescriptor 
*sd 
= (RBBIStateDescriptor 
*)fDStates
->elementAt(state
); 
1091         RBBIStateTableRow   
*row 
= (RBBIStateTableRow 
*)(table
->fTableData 
+ state
*table
->fRowLen
); 
1092         U_ASSERT (-32768 < sd
->fAccepting 
&& sd
->fAccepting 
<= 32767); 
1093         U_ASSERT (-32768 < sd
->fLookAhead 
&& sd
->fLookAhead 
<= 32767); 
1094         row
->fAccepting 
= (int16_t)sd
->fAccepting
; 
1095         row
->fLookAhead 
= (int16_t)sd
->fLookAhead
; 
1096         row
->fTagIdx    
= (int16_t)sd
->fTagsIdx
; 
1097         for (col
=0; col
<fRB
->fSetBuilder
->getNumCharCategories(); col
++) { 
1098             row
->fNextState
[col
] = (uint16_t)sd
->fDtran
->elementAti(col
); 
1105 //----------------------------------------------------------------------------- 
1107 //   printSet    Debug function.   Print the contents of a UVector 
1109 //----------------------------------------------------------------------------- 
1111 void RBBITableBuilder::printSet(UVector 
*s
) { 
1113     for (i
=0; i
<s
->size(); i
++) { 
1114         void *v 
= s
->elementAt(i
); 
1115         RBBIDebugPrintf("%10p", v
); 
1117     RBBIDebugPrintf("\n"); 
1122 //----------------------------------------------------------------------------- 
1124 //   printStates    Debug Function.  Dump the fully constructed state transition table. 
1126 //----------------------------------------------------------------------------- 
1128 void RBBITableBuilder::printStates() { 
1129     int     c
;    // input "character" 
1130     int     n
;    // state number 
1132     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n"); 
1133     RBBIDebugPrintf("      | Acc  LA    Tag"); 
1134     for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) { 
1135         RBBIDebugPrintf(" %2d", c
); 
1137     RBBIDebugPrintf("\n"); 
1138     RBBIDebugPrintf("      |---------------"); 
1139     for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) { 
1140         RBBIDebugPrintf("---"); 
1142     RBBIDebugPrintf("\n"); 
1144     for (n
=0; n
<fDStates
->size(); n
++) { 
1145         RBBIStateDescriptor 
*sd 
= (RBBIStateDescriptor 
*)fDStates
->elementAt(n
); 
1146         RBBIDebugPrintf("  %3d | " , n
); 
1147         RBBIDebugPrintf("%3d %3d %5d ", sd
->fAccepting
, sd
->fLookAhead
, sd
->fTagsIdx
); 
1148         for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) { 
1149             RBBIDebugPrintf(" %2d", sd
->fDtran
->elementAti(c
)); 
1151         RBBIDebugPrintf("\n"); 
1153     RBBIDebugPrintf("\n\n"); 
1159 //----------------------------------------------------------------------------- 
1161 //   printRuleStatusTable    Debug Function.  Dump the common rule status table 
1163 //----------------------------------------------------------------------------- 
1165 void RBBITableBuilder::printRuleStatusTable() { 
1166     int32_t  thisRecord 
= 0; 
1167     int32_t  nextRecord 
= 0; 
1169     UVector  
*tbl 
= fRB
->fRuleStatusVals
; 
1171     RBBIDebugPrintf("index |  tags \n"); 
1172     RBBIDebugPrintf("-------------------\n"); 
1174     while (nextRecord 
< tbl
->size()) { 
1175         thisRecord 
= nextRecord
; 
1176         nextRecord 
= thisRecord 
+ tbl
->elementAti(thisRecord
) + 1; 
1177         RBBIDebugPrintf("%4d   ", thisRecord
); 
1178         for (i
=thisRecord
+1; i
<nextRecord
; i
++) { 
1179             RBBIDebugPrintf("  %5d", tbl
->elementAti(i
)); 
1181         RBBIDebugPrintf("\n"); 
1183     RBBIDebugPrintf("\n\n"); 
1188 //----------------------------------------------------------------------------- 
1190 //   RBBIStateDescriptor     Methods.  This is a very struct-like class 
1191 //                           Most access is directly to the fields. 
1193 //----------------------------------------------------------------------------- 
1195 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol
, UErrorCode 
*fStatus
) { 
1204     fDtran     
= new UVector(lastInputSymbol
+1, *fStatus
); 
1205     if (U_FAILURE(*fStatus
)) { 
1208     if (fDtran 
== NULL
) { 
1209         *fStatus 
= U_MEMORY_ALLOCATION_ERROR
; 
1212     fDtran
->setSize(lastInputSymbol
+1);    // fDtran needs to be pre-sized. 
1213                                            //   It is indexed by input symbols, and will 
1214                                            //   hold  the next state number for each 
1219 RBBIStateDescriptor::~RBBIStateDescriptor() { 
1230 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */