1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
14 #include "unicode/utypes.h"
16 #if !UCONFIG_NO_BREAK_ITERATION
18 #include "unicode/unistr.h"
30 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder
*rb
, RBBINode
**rootNode
, UErrorCode
&status
) :
36 if (U_FAILURE(status
)) {
39 // fDStates is UVector<RBBIStateDescriptor *>
40 fDStates
= new UVector(status
);
41 if (U_SUCCESS(status
) && fDStates
== nullptr ) {
42 status
= U_MEMORY_ALLOCATION_ERROR
;
48 RBBITableBuilder::~RBBITableBuilder() {
50 for (i
=0; i
<fDStates
->size(); i
++) {
51 delete (RBBIStateDescriptor
*)fDStates
->elementAt(i
);
58 //-----------------------------------------------------------------------------
60 // RBBITableBuilder::buildForwardTable - This is the main function for building
61 // the DFA state transition table from the RBBI rules parse tree.
63 //-----------------------------------------------------------------------------
64 void RBBITableBuilder::buildForwardTable() {
66 if (U_FAILURE(*fStatus
)) {
70 // If there were no rules, just return. This situation can easily arise
71 // for the reverse rules.
77 // Walk through the tree, replacing any references to $variables with a copy of the
78 // parse tree for the substition expression.
80 fTree
= fTree
->flattenVariables();
82 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "ftree")) {
83 RBBIDebugPuts("\nParse tree after flattening variable references.");
84 RBBINode::printTree(fTree
, TRUE
);
89 // If the rules contained any references to {bof}
90 // add a {bof} <cat> <former root of tree> to the
91 // tree. Means that all matches must start out with the
92 // {bof} fake character.
94 if (fRB
->fSetBuilder
->sawBOF()) {
95 RBBINode
*bofTop
= new RBBINode(RBBINode::opCat
);
96 RBBINode
*bofLeaf
= new RBBINode(RBBINode::leafChar
);
97 // Delete and exit if memory allocation failed.
98 if (bofTop
== NULL
|| bofLeaf
== NULL
) {
99 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
104 bofTop
->fLeftChild
= bofLeaf
;
105 bofTop
->fRightChild
= fTree
;
106 bofLeaf
->fParent
= bofTop
;
107 bofLeaf
->fVal
= 2; // Reserved value for {bof}.
112 // Add a unique right-end marker to the expression.
113 // Appears as a cat-node, left child being the original tree,
114 // right child being the end marker.
116 RBBINode
*cn
= new RBBINode(RBBINode::opCat
);
117 // Exit if memory allocation failed.
119 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
122 cn
->fLeftChild
= fTree
;
124 cn
->fRightChild
= new RBBINode(RBBINode::endMark
);
125 // Delete and exit if memory allocation failed.
126 if (cn
->fRightChild
== NULL
) {
127 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
131 cn
->fRightChild
->fParent
= cn
;
135 // Replace all references to UnicodeSets with the tree for the equivalent
138 fTree
->flattenSets();
140 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "stree")) {
141 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
142 RBBINode::printTree(fTree
, TRUE
);
148 // calculate the functions nullable, firstpos, lastpos and followpos on
149 // nodes in the parse tree.
150 // See the alogrithm description in Aho.
151 // Understanding how this works by looking at the code alone will be
152 // nearly impossible.
157 calcFollowPos(fTree
);
158 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "pos")) {
164 // For "chained" rules, modify the followPos sets
166 if (fRB
->fChainRules
) {
167 calcChainedFollowPos(fTree
);
171 // BOF (start of input) test fixup.
173 if (fRB
->fSetBuilder
->sawBOF()) {
178 // Build the DFA state transition tables.
181 flagAcceptingStates();
182 flagLookAheadStates();
186 // Update the global table of rule status {tag} values
187 // The rule builder has a global vector of status values that are common
188 // for all tables. Merge the ones from this table into the global set.
190 mergeRuleStatusVals();
195 //-----------------------------------------------------------------------------
197 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
199 //-----------------------------------------------------------------------------
200 void RBBITableBuilder::calcNullable(RBBINode
*n
) {
204 if (n
->fType
== RBBINode::setRef
||
205 n
->fType
== RBBINode::endMark
) {
206 // These are non-empty leaf node types.
207 n
->fNullable
= FALSE
;
211 if (n
->fType
== RBBINode::lookAhead
|| n
->fType
== RBBINode::tag
) {
212 // Lookahead marker node. It's a leaf, so no recursion on children.
213 // It's nullable because it does not match any literal text from the input stream.
219 // The node is not a leaf.
220 // Calculate nullable on its children.
221 calcNullable(n
->fLeftChild
);
222 calcNullable(n
->fRightChild
);
224 // Apply functions from table 3.40 in Aho
225 if (n
->fType
== RBBINode::opOr
) {
226 n
->fNullable
= n
->fLeftChild
->fNullable
|| n
->fRightChild
->fNullable
;
228 else if (n
->fType
== RBBINode::opCat
) {
229 n
->fNullable
= n
->fLeftChild
->fNullable
&& n
->fRightChild
->fNullable
;
231 else if (n
->fType
== RBBINode::opStar
|| n
->fType
== RBBINode::opQuestion
) {
235 n
->fNullable
= FALSE
;
242 //-----------------------------------------------------------------------------
244 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
246 //-----------------------------------------------------------------------------
247 void RBBITableBuilder::calcFirstPos(RBBINode
*n
) {
251 if (n
->fType
== RBBINode::leafChar
||
252 n
->fType
== RBBINode::endMark
||
253 n
->fType
== RBBINode::lookAhead
||
254 n
->fType
== RBBINode::tag
) {
255 // These are non-empty leaf node types.
256 // Note: In order to maintain the sort invariant on the set,
257 // this function should only be called on a node whose set is
258 // empty to start with.
259 n
->fFirstPosSet
->addElement(n
, *fStatus
);
263 // The node is not a leaf.
264 // Calculate firstPos on its children.
265 calcFirstPos(n
->fLeftChild
);
266 calcFirstPos(n
->fRightChild
);
268 // Apply functions from table 3.40 in Aho
269 if (n
->fType
== RBBINode::opOr
) {
270 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
271 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
273 else if (n
->fType
== RBBINode::opCat
) {
274 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
275 if (n
->fLeftChild
->fNullable
) {
276 setAdd(n
->fFirstPosSet
, n
->fRightChild
->fFirstPosSet
);
279 else if (n
->fType
== RBBINode::opStar
||
280 n
->fType
== RBBINode::opQuestion
||
281 n
->fType
== RBBINode::opPlus
) {
282 setAdd(n
->fFirstPosSet
, n
->fLeftChild
->fFirstPosSet
);
288 //-----------------------------------------------------------------------------
290 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
292 //-----------------------------------------------------------------------------
293 void RBBITableBuilder::calcLastPos(RBBINode
*n
) {
297 if (n
->fType
== RBBINode::leafChar
||
298 n
->fType
== RBBINode::endMark
||
299 n
->fType
== RBBINode::lookAhead
||
300 n
->fType
== RBBINode::tag
) {
301 // These are non-empty leaf node types.
302 // Note: In order to maintain the sort invariant on the set,
303 // this function should only be called on a node whose set is
304 // empty to start with.
305 n
->fLastPosSet
->addElement(n
, *fStatus
);
309 // The node is not a leaf.
310 // Calculate lastPos on its children.
311 calcLastPos(n
->fLeftChild
);
312 calcLastPos(n
->fRightChild
);
314 // Apply functions from table 3.40 in Aho
315 if (n
->fType
== RBBINode::opOr
) {
316 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
317 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
319 else if (n
->fType
== RBBINode::opCat
) {
320 setAdd(n
->fLastPosSet
, n
->fRightChild
->fLastPosSet
);
321 if (n
->fRightChild
->fNullable
) {
322 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
325 else if (n
->fType
== RBBINode::opStar
||
326 n
->fType
== RBBINode::opQuestion
||
327 n
->fType
== RBBINode::opPlus
) {
328 setAdd(n
->fLastPosSet
, n
->fLeftChild
->fLastPosSet
);
334 //-----------------------------------------------------------------------------
336 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
338 //-----------------------------------------------------------------------------
339 void RBBITableBuilder::calcFollowPos(RBBINode
*n
) {
341 n
->fType
== RBBINode::leafChar
||
342 n
->fType
== RBBINode::endMark
) {
346 calcFollowPos(n
->fLeftChild
);
347 calcFollowPos(n
->fRightChild
);
350 if (n
->fType
== RBBINode::opCat
) {
351 RBBINode
*i
; // is 'i' in Aho's description
354 UVector
*LastPosOfLeftChild
= n
->fLeftChild
->fLastPosSet
;
356 for (ix
=0; ix
<(uint32_t)LastPosOfLeftChild
->size(); ix
++) {
357 i
= (RBBINode
*)LastPosOfLeftChild
->elementAt(ix
);
358 setAdd(i
->fFollowPos
, n
->fRightChild
->fFirstPosSet
);
363 if (n
->fType
== RBBINode::opStar
||
364 n
->fType
== RBBINode::opPlus
) {
365 RBBINode
*i
; // again, n and i are the names from Aho's description.
368 for (ix
=0; ix
<(uint32_t)n
->fLastPosSet
->size(); ix
++) {
369 i
= (RBBINode
*)n
->fLastPosSet
->elementAt(ix
);
370 setAdd(i
->fFollowPos
, n
->fFirstPosSet
);
378 //-----------------------------------------------------------------------------
380 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
381 // as roots of a rule to a destination vector.
383 //-----------------------------------------------------------------------------
384 void RBBITableBuilder::addRuleRootNodes(UVector
*dest
, RBBINode
*node
) {
385 if (node
== NULL
|| U_FAILURE(*fStatus
)) {
388 if (node
->fRuleRoot
) {
389 dest
->addElement(node
, *fStatus
);
390 // Note: rules cannot nest. If we found a rule start node,
391 // no child node can also be a start node.
394 addRuleRootNodes(dest
, node
->fLeftChild
);
395 addRuleRootNodes(dest
, node
->fRightChild
);
398 //-----------------------------------------------------------------------------
400 // calcChainedFollowPos. Modify the previously calculated followPos sets
401 // to implement rule chaining. NOT described by Aho
403 //-----------------------------------------------------------------------------
404 void RBBITableBuilder::calcChainedFollowPos(RBBINode
*tree
) {
406 UVector
endMarkerNodes(*fStatus
);
407 UVector
leafNodes(*fStatus
);
410 if (U_FAILURE(*fStatus
)) {
414 // get a list of all endmarker nodes.
415 tree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
417 // get a list all leaf nodes
418 tree
->findNodes(&leafNodes
, RBBINode::leafChar
, *fStatus
);
419 if (U_FAILURE(*fStatus
)) {
423 // Collect all leaf nodes that can start matches for rules
424 // with inbound chaining enabled, which is the union of the
425 // firstPosition sets from each of the rule root nodes.
427 UVector
ruleRootNodes(*fStatus
);
428 addRuleRootNodes(&ruleRootNodes
, tree
);
430 UVector
matchStartNodes(*fStatus
);
431 for (int i
=0; i
<ruleRootNodes
.size(); ++i
) {
432 RBBINode
*node
= static_cast<RBBINode
*>(ruleRootNodes
.elementAt(i
));
433 if (node
->fChainIn
) {
434 setAdd(&matchStartNodes
, node
->fFirstPosSet
);
437 if (U_FAILURE(*fStatus
)) {
444 for (endNodeIx
=0; endNodeIx
<leafNodes
.size(); endNodeIx
++) {
445 RBBINode
*tNode
= (RBBINode
*)leafNodes
.elementAt(endNodeIx
);
446 RBBINode
*endNode
= NULL
;
448 // Identify leaf nodes that correspond to overall rule match positions.
449 // These include an endMarkerNode in their followPos sets.
450 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
451 if (tNode
->fFollowPos
->contains(endMarkerNodes
.elementAt(i
))) {
456 if (endNode
== NULL
) {
457 // node wasn't an end node. Try again with the next.
461 // We've got a node that can end a match.
463 // Line Break Specific hack: If this node's val correspond to the $CM char class,
464 // don't chain from it.
465 // TODO: Add rule syntax for this behavior, get specifics out of here and
466 // into the rule file.
467 if (fRB
->fLBCMNoChain
) {
468 UChar32 c
= this->fRB
->fSetBuilder
->getFirstChar(endNode
->fVal
);
470 // c == -1 occurs with sets containing only the {eof} marker string.
471 ULineBreak cLBProp
= (ULineBreak
)u_getIntPropertyValue(c
, UCHAR_LINE_BREAK
);
472 if (cLBProp
== U_LB_COMBINING_MARK
) {
479 // Now iterate over the nodes that can start a match, looking for ones
480 // with the same char class as our ending node.
482 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
.size(); startNodeIx
++) {
483 startNode
= (RBBINode
*)matchStartNodes
.elementAt(startNodeIx
);
484 if (startNode
->fType
!= RBBINode::leafChar
) {
488 if (endNode
->fVal
== startNode
->fVal
) {
489 // The end val (character class) of one possible match is the
490 // same as the start of another.
492 // Add all nodes from the followPos of the start node to the
493 // followPos set of the end node, which will have the effect of
494 // letting matches transition from a match state at endNode
495 // to the second char of a match starting with startNode.
496 setAdd(endNode
->fFollowPos
, startNode
->fFollowPos
);
503 //-----------------------------------------------------------------------------
505 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
506 // Do an swizzle similar to chaining, modifying the followPos set of
507 // the bofNode to include the followPos nodes from other {bot} nodes
508 // scattered through the tree.
510 // This function has much in common with calcChainedFollowPos().
512 //-----------------------------------------------------------------------------
513 void RBBITableBuilder::bofFixup() {
515 if (U_FAILURE(*fStatus
)) {
519 // The parse tree looks like this ...
520 // fTree root ---> <cat>
527 // We will be adding things to the followPos set of the <bofNode>
529 RBBINode
*bofNode
= fTree
->fLeftChild
->fLeftChild
;
530 U_ASSERT(bofNode
->fType
== RBBINode::leafChar
);
531 U_ASSERT(bofNode
->fVal
== 2);
533 // Get all nodes that can be the start a match of the user-written rules
534 // (excluding the fake bofNode)
535 // We want the nodes that can start a match in the
536 // part labeled "rest of tree"
538 UVector
*matchStartNodes
= fTree
->fLeftChild
->fRightChild
->fFirstPosSet
;
542 for (startNodeIx
= 0; startNodeIx
<matchStartNodes
->size(); startNodeIx
++) {
543 startNode
= (RBBINode
*)matchStartNodes
->elementAt(startNodeIx
);
544 if (startNode
->fType
!= RBBINode::leafChar
) {
548 if (startNode
->fVal
== bofNode
->fVal
) {
549 // We found a leaf node corresponding to a {bof} that was
550 // explicitly written into a rule.
551 // Add everything from the followPos set of this node to the
552 // followPos set of the fake bofNode at the start of the tree.
554 setAdd(bofNode
->fFollowPos
, startNode
->fFollowPos
);
559 //-----------------------------------------------------------------------------
561 // buildStateTable() Determine the set of runtime DFA states and the
562 // transition tables for these states, by the algorithm
563 // of fig. 3.44 in Aho.
565 // Most of the comments are quotes of Aho's psuedo-code.
567 //-----------------------------------------------------------------------------
568 void RBBITableBuilder::buildStateTable() {
569 if (U_FAILURE(*fStatus
)) {
572 RBBIStateDescriptor
*failState
;
573 // Set it to NULL to avoid uninitialized warning
574 RBBIStateDescriptor
*initialState
= NULL
;
576 // Add a dummy state 0 - the stop state. Not from Aho.
577 int lastInputSymbol
= fRB
->fSetBuilder
->getNumCharCategories() - 1;
578 failState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
579 if (failState
== NULL
) {
580 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
581 goto ExitBuildSTdeleteall
;
583 failState
->fPositions
= new UVector(*fStatus
);
584 if (failState
->fPositions
== NULL
) {
585 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
587 if (failState
->fPositions
== NULL
|| U_FAILURE(*fStatus
)) {
588 goto ExitBuildSTdeleteall
;
590 fDStates
->addElement(failState
, *fStatus
);
591 if (U_FAILURE(*fStatus
)) {
592 goto ExitBuildSTdeleteall
;
595 // initially, the only unmarked state in Dstates is firstpos(root),
596 // where toot is the root of the syntax tree for (r)#;
597 initialState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
598 if (initialState
== NULL
) {
599 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
601 if (U_FAILURE(*fStatus
)) {
602 goto ExitBuildSTdeleteall
;
604 initialState
->fPositions
= new UVector(*fStatus
);
605 if (initialState
->fPositions
== NULL
) {
606 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
608 if (U_FAILURE(*fStatus
)) {
609 goto ExitBuildSTdeleteall
;
611 setAdd(initialState
->fPositions
, fTree
->fFirstPosSet
);
612 fDStates
->addElement(initialState
, *fStatus
);
613 if (U_FAILURE(*fStatus
)) {
614 goto ExitBuildSTdeleteall
;
617 // while there is an unmarked state T in Dstates do begin
619 RBBIStateDescriptor
*T
= NULL
;
621 for (tx
=1; tx
<fDStates
->size(); tx
++) {
622 RBBIStateDescriptor
*temp
;
623 temp
= (RBBIStateDescriptor
*)fDStates
->elementAt(tx
);
624 if (temp
->fMarked
== FALSE
) {
636 // for each input symbol a do begin
638 for (a
= 1; a
<=lastInputSymbol
; a
++) {
639 // let U be the set of positions that are in followpos(p)
640 // for some position p in T
641 // such that the symbol at position p is a;
645 for (px
=0; px
<T
->fPositions
->size(); px
++) {
646 p
= (RBBINode
*)T
->fPositions
->elementAt(px
);
647 if ((p
->fType
== RBBINode::leafChar
) && (p
->fVal
== a
)) {
649 U
= new UVector(*fStatus
);
651 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
652 goto ExitBuildSTdeleteall
;
655 setAdd(U
, p
->fFollowPos
);
659 // if U is not empty and not in DStates then
661 UBool UinDstates
= FALSE
;
663 U_ASSERT(U
->size() > 0);
665 for (ix
=0; ix
<fDStates
->size(); ix
++) {
666 RBBIStateDescriptor
*temp2
;
667 temp2
= (RBBIStateDescriptor
*)fDStates
->elementAt(ix
);
668 if (setEquals(U
, temp2
->fPositions
)) {
670 U
= temp2
->fPositions
;
677 // Add U as an unmarked state to Dstates
680 RBBIStateDescriptor
*newState
= new RBBIStateDescriptor(lastInputSymbol
, fStatus
);
681 if (newState
== NULL
) {
682 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
684 if (U_FAILURE(*fStatus
)) {
685 goto ExitBuildSTdeleteall
;
687 newState
->fPositions
= U
;
688 fDStates
->addElement(newState
, *fStatus
);
689 if (U_FAILURE(*fStatus
)) {
692 ux
= fDStates
->size()-1;
696 T
->fDtran
->setElementAt(ux
, a
);
701 // delete local pointers only if error occured.
702 ExitBuildSTdeleteall
:
709 //-----------------------------------------------------------------------------
711 // flagAcceptingStates Identify accepting states.
712 // First get a list of all of the end marker nodes.
713 // Then, for each state s,
714 // if s contains one of the end marker nodes in its list of tree positions then
715 // s is an accepting state.
717 //-----------------------------------------------------------------------------
718 void RBBITableBuilder::flagAcceptingStates() {
719 if (U_FAILURE(*fStatus
)) {
722 UVector
endMarkerNodes(*fStatus
);
727 if (U_FAILURE(*fStatus
)) {
731 fTree
->findNodes(&endMarkerNodes
, RBBINode::endMark
, *fStatus
);
732 if (U_FAILURE(*fStatus
)) {
736 for (i
=0; i
<endMarkerNodes
.size(); i
++) {
737 endMarker
= (RBBINode
*)endMarkerNodes
.elementAt(i
);
738 for (n
=0; n
<fDStates
->size(); n
++) {
739 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
740 if (sd
->fPositions
->indexOf(endMarker
) >= 0) {
741 // Any non-zero value for fAccepting means this is an accepting node.
742 // The value is what will be returned to the user as the break status.
743 // If no other value was specified, force it to -1.
745 if (sd
->fAccepting
==0) {
746 // State hasn't been marked as accepting yet. Do it now.
747 sd
->fAccepting
= endMarker
->fVal
;
748 if (sd
->fAccepting
== 0) {
752 if (sd
->fAccepting
==-1 && endMarker
->fVal
!= 0) {
753 // Both lookahead and non-lookahead accepting for this state.
754 // Favor the look-ahead. Expedient for line break.
755 // TODO: need a more elegant resolution for conflicting rules.
756 sd
->fAccepting
= endMarker
->fVal
;
759 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
761 // If the end marker node is from a look-ahead rule, set
762 // the fLookAhead field for this state also.
763 if (endMarker
->fLookAheadEnd
) {
764 // TODO: don't change value if already set?
765 // TODO: allow for more than one active look-ahead rule in engine.
766 // Make value here an index to a side array in engine?
767 sd
->fLookAhead
= sd
->fAccepting
;
775 //-----------------------------------------------------------------------------
777 // flagLookAheadStates Very similar to flagAcceptingStates, above.
779 //-----------------------------------------------------------------------------
780 void RBBITableBuilder::flagLookAheadStates() {
781 if (U_FAILURE(*fStatus
)) {
784 UVector
lookAheadNodes(*fStatus
);
785 RBBINode
*lookAheadNode
;
789 fTree
->findNodes(&lookAheadNodes
, RBBINode::lookAhead
, *fStatus
);
790 if (U_FAILURE(*fStatus
)) {
793 for (i
=0; i
<lookAheadNodes
.size(); i
++) {
794 lookAheadNode
= (RBBINode
*)lookAheadNodes
.elementAt(i
);
796 for (n
=0; n
<fDStates
->size(); n
++) {
797 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
798 if (sd
->fPositions
->indexOf(lookAheadNode
) >= 0) {
799 sd
->fLookAhead
= lookAheadNode
->fVal
;
808 //-----------------------------------------------------------------------------
812 //-----------------------------------------------------------------------------
813 void RBBITableBuilder::flagTaggedStates() {
814 if (U_FAILURE(*fStatus
)) {
817 UVector
tagNodes(*fStatus
);
822 if (U_FAILURE(*fStatus
)) {
825 fTree
->findNodes(&tagNodes
, RBBINode::tag
, *fStatus
);
826 if (U_FAILURE(*fStatus
)) {
829 for (i
=0; i
<tagNodes
.size(); i
++) { // For each tag node t (all of 'em)
830 tagNode
= (RBBINode
*)tagNodes
.elementAt(i
);
832 for (n
=0; n
<fDStates
->size(); n
++) { // For each state s (row in the state table)
833 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
834 if (sd
->fPositions
->indexOf(tagNode
) >= 0) { // if s include the tag node t
835 sortedAdd(&sd
->fTagVals
, tagNode
->fVal
);
844 //-----------------------------------------------------------------------------
846 // mergeRuleStatusVals
848 // Update the global table of rule status {tag} values
849 // The rule builder has a global vector of status values that are common
850 // for all tables. Merge the ones from this table into the global set.
852 //-----------------------------------------------------------------------------
853 void RBBITableBuilder::mergeRuleStatusVals() {
855 // The basic outline of what happens here is this...
857 // for each state in this state table
858 // if the status tag list for this state is in the global statuses list
860 // continue with the next state
862 // add the tag list for this state to the global list.
867 // Pre-set a single tag of {0} into the table.
868 // We will need this as a default, for rule sets with no explicit tagging.
869 if (fRB
->fRuleStatusVals
->size() == 0) {
870 fRB
->fRuleStatusVals
->addElement(1, *fStatus
); // Num of statuses in group
871 fRB
->fRuleStatusVals
->addElement((int32_t)0, *fStatus
); // and our single status of zero
875 for (n
=0; n
<fDStates
->size(); n
++) {
876 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
877 UVector
*thisStatesTagValues
= sd
->fTagVals
;
878 if (thisStatesTagValues
== NULL
) {
879 // No tag values are explicitly associated with this state.
880 // Set the default tag value.
885 // There are tag(s) associated with this state.
886 // fTagsIdx will be the index into the global tag list for this state's tag values.
887 // Initial value of -1 flags that we haven't got it set yet.
889 int32_t thisTagGroupStart
= 0; // indexes into the global rule status vals list
890 int32_t nextTagGroupStart
= 0;
892 // Loop runs once per group of tags in the global list
893 while (nextTagGroupStart
< fRB
->fRuleStatusVals
->size()) {
894 thisTagGroupStart
= nextTagGroupStart
;
895 nextTagGroupStart
+= fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
) + 1;
896 if (thisStatesTagValues
->size() != fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
)) {
897 // The number of tags for this state is different from
898 // the number of tags in this group from the global list.
899 // Continue with the next group from the global list.
902 // The lengths match, go ahead and compare the actual tag values
903 // between this state and the group from the global list.
904 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
905 if (thisStatesTagValues
->elementAti(i
) !=
906 fRB
->fRuleStatusVals
->elementAti(thisTagGroupStart
+ 1 + i
) ) {
912 if (i
== thisStatesTagValues
->size()) {
913 // We found a set of tag values in the global list that match
914 // those for this state. Use them.
915 sd
->fTagsIdx
= thisTagGroupStart
;
920 if (sd
->fTagsIdx
== -1) {
921 // No suitable entry in the global tag list already. Add one
922 sd
->fTagsIdx
= fRB
->fRuleStatusVals
->size();
923 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->size(), *fStatus
);
924 for (i
=0; i
<thisStatesTagValues
->size(); i
++) {
925 fRB
->fRuleStatusVals
->addElement(thisStatesTagValues
->elementAti(i
), *fStatus
);
937 //-----------------------------------------------------------------------------
939 // sortedAdd Add a value to a vector of sorted values (ints).
940 // Do not replicate entries; if the value is already there, do not
942 // Lazily create the vector if it does not already exist.
944 //-----------------------------------------------------------------------------
945 void RBBITableBuilder::sortedAdd(UVector
**vector
, int32_t val
) {
948 if (*vector
== NULL
) {
949 *vector
= new UVector(*fStatus
);
951 if (*vector
== NULL
|| U_FAILURE(*fStatus
)) {
954 UVector
*vec
= *vector
;
955 int32_t vSize
= vec
->size();
956 for (i
=0; i
<vSize
; i
++) {
957 int32_t valAtI
= vec
->elementAti(i
);
959 // The value is already in the vector. Don't add it again.
966 vec
->insertElementAt(val
, i
, *fStatus
);
971 //-----------------------------------------------------------------------------
973 // setAdd Set operation on UVector
974 // dest = dest union source
975 // Elements may only appear once and must be sorted.
977 //-----------------------------------------------------------------------------
978 void RBBITableBuilder::setAdd(UVector
*dest
, UVector
*source
) {
979 int32_t destOriginalSize
= dest
->size();
980 int32_t sourceSize
= source
->size();
982 MaybeStackArray
<void *, 16> destArray
, sourceArray
; // Handle small cases without malloc
983 void **destPtr
, **sourcePtr
;
984 void **destLim
, **sourceLim
;
986 if (destOriginalSize
> destArray
.getCapacity()) {
987 if (destArray
.resize(destOriginalSize
) == NULL
) {
991 destPtr
= destArray
.getAlias();
992 destLim
= destPtr
+ destOriginalSize
; // destArray.getArrayLimit()?
994 if (sourceSize
> sourceArray
.getCapacity()) {
995 if (sourceArray
.resize(sourceSize
) == NULL
) {
999 sourcePtr
= sourceArray
.getAlias();
1000 sourceLim
= sourcePtr
+ sourceSize
; // sourceArray.getArrayLimit()?
1002 // Avoid multiple "get element" calls by getting the contents into arrays
1003 (void) dest
->toArray(destPtr
);
1004 (void) source
->toArray(sourcePtr
);
1006 dest
->setSize(sourceSize
+destOriginalSize
, *fStatus
);
1008 while (sourcePtr
< sourceLim
&& destPtr
< destLim
) {
1009 if (*destPtr
== *sourcePtr
) {
1010 dest
->setElementAt(*sourcePtr
++, di
++);
1013 // This check is required for machines with segmented memory, like i5/OS.
1014 // Direct pointer comparison is not recommended.
1015 else if (uprv_memcmp(destPtr
, sourcePtr
, sizeof(void *)) < 0) {
1016 dest
->setElementAt(*destPtr
++, di
++);
1018 else { /* *sourcePtr < *destPtr */
1019 dest
->setElementAt(*sourcePtr
++, di
++);
1023 // At most one of these two cleanup loops will execute
1024 while (destPtr
< destLim
) {
1025 dest
->setElementAt(*destPtr
++, di
++);
1027 while (sourcePtr
< sourceLim
) {
1028 dest
->setElementAt(*sourcePtr
++, di
++);
1031 dest
->setSize(di
, *fStatus
);
1036 //-----------------------------------------------------------------------------
1038 // setEqual Set operation on UVector.
1039 // Compare for equality.
1040 // Elements must be sorted.
1042 //-----------------------------------------------------------------------------
1043 UBool
RBBITableBuilder::setEquals(UVector
*a
, UVector
*b
) {
1044 return a
->equals(*b
);
1048 //-----------------------------------------------------------------------------
1050 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1051 // for each node in the tree.
1053 //-----------------------------------------------------------------------------
1055 void RBBITableBuilder::printPosSets(RBBINode
*n
) {
1060 RBBINode::printNodeHeader();
1061 RBBINode::printNode(n
);
1062 RBBIDebugPrintf(" Nullable: %s\n", n
->fNullable
?"TRUE":"FALSE");
1064 RBBIDebugPrintf(" firstpos: ");
1065 printSet(n
->fFirstPosSet
);
1067 RBBIDebugPrintf(" lastpos: ");
1068 printSet(n
->fLastPosSet
);
1070 RBBIDebugPrintf(" followpos: ");
1071 printSet(n
->fFollowPos
);
1073 printPosSets(n
->fLeftChild
);
1074 printPosSets(n
->fRightChild
);
1079 // findDuplCharClassFrom()
1081 bool RBBITableBuilder::findDuplCharClassFrom(IntPair
*categories
) {
1082 int32_t numStates
= fDStates
->size();
1083 int32_t numCols
= fRB
->fSetBuilder
->getNumCharCategories();
1085 uint16_t table_base
;
1086 uint16_t table_dupl
;
1087 for (; categories
->first
< numCols
-1; categories
->first
++) {
1088 for (categories
->second
=categories
->first
+1; categories
->second
< numCols
; categories
->second
++) {
1089 for (int32_t state
=0; state
<numStates
; state
++) {
1090 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(state
);
1091 table_base
= (uint16_t)sd
->fDtran
->elementAti(categories
->first
);
1092 table_dupl
= (uint16_t)sd
->fDtran
->elementAti(categories
->second
);
1093 if (table_base
!= table_dupl
) {
1097 if (table_base
== table_dupl
) {
1109 void RBBITableBuilder::removeColumn(int32_t column
) {
1110 int32_t numStates
= fDStates
->size();
1111 for (int32_t state
=0; state
<numStates
; state
++) {
1112 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(state
);
1113 U_ASSERT(column
< sd
->fDtran
->size());
1114 sd
->fDtran
->removeElementAt(column
);
1119 * findDuplicateState
1121 bool RBBITableBuilder::findDuplicateState(IntPair
*states
) {
1122 int32_t numStates
= fDStates
->size();
1123 int32_t numCols
= fRB
->fSetBuilder
->getNumCharCategories();
1125 for (; states
->first
<numStates
-1; states
->first
++) {
1126 RBBIStateDescriptor
*firstSD
= (RBBIStateDescriptor
*)fDStates
->elementAt(states
->first
);
1127 for (states
->second
=states
->first
+1; states
->second
<numStates
; states
->second
++) {
1128 RBBIStateDescriptor
*duplSD
= (RBBIStateDescriptor
*)fDStates
->elementAt(states
->second
);
1129 if (firstSD
->fAccepting
!= duplSD
->fAccepting
||
1130 firstSD
->fLookAhead
!= duplSD
->fLookAhead
||
1131 firstSD
->fTagsIdx
!= duplSD
->fTagsIdx
) {
1134 bool rowsMatch
= true;
1135 for (int32_t col
=0; col
< numCols
; ++col
) {
1136 int32_t firstVal
= firstSD
->fDtran
->elementAti(col
);
1137 int32_t duplVal
= duplSD
->fDtran
->elementAti(col
);
1138 if (!((firstVal
== duplVal
) ||
1139 ((firstVal
== states
->first
|| firstVal
== states
->second
) &&
1140 (duplVal
== states
->first
|| duplVal
== states
->second
)))) {
1154 bool RBBITableBuilder::findDuplicateSafeState(IntPair
*states
) {
1155 int32_t numStates
= fSafeTable
->size();
1157 for (; states
->first
<numStates
-1; states
->first
++) {
1158 UnicodeString
*firstRow
= static_cast<UnicodeString
*>(fSafeTable
->elementAt(states
->first
));
1159 for (states
->second
=states
->first
+1; states
->second
<numStates
; states
->second
++) {
1160 UnicodeString
*duplRow
= static_cast<UnicodeString
*>(fSafeTable
->elementAt(states
->second
));
1161 bool rowsMatch
= true;
1162 int32_t numCols
= firstRow
->length();
1163 for (int32_t col
=0; col
< numCols
; ++col
) {
1164 int32_t firstVal
= firstRow
->charAt(col
);
1165 int32_t duplVal
= duplRow
->charAt(col
);
1166 if (!((firstVal
== duplVal
) ||
1167 ((firstVal
== states
->first
|| firstVal
== states
->second
) &&
1168 (duplVal
== states
->first
|| duplVal
== states
->second
)))) {
1182 void RBBITableBuilder::removeState(IntPair duplStates
) {
1183 const int32_t keepState
= duplStates
.first
;
1184 const int32_t duplState
= duplStates
.second
;
1185 U_ASSERT(keepState
< duplState
);
1186 U_ASSERT(duplState
< fDStates
->size());
1188 RBBIStateDescriptor
*duplSD
= (RBBIStateDescriptor
*)fDStates
->elementAt(duplState
);
1189 fDStates
->removeElementAt(duplState
);
1192 int32_t numStates
= fDStates
->size();
1193 int32_t numCols
= fRB
->fSetBuilder
->getNumCharCategories();
1194 for (int32_t state
=0; state
<numStates
; ++state
) {
1195 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(state
);
1196 for (int32_t col
=0; col
<numCols
; col
++) {
1197 int32_t existingVal
= sd
->fDtran
->elementAti(col
);
1198 int32_t newVal
= existingVal
;
1199 if (existingVal
== duplState
) {
1201 } else if (existingVal
> duplState
) {
1202 newVal
= existingVal
- 1;
1204 sd
->fDtran
->setElementAt(newVal
, col
);
1206 if (sd
->fAccepting
== duplState
) {
1207 sd
->fAccepting
= keepState
;
1208 } else if (sd
->fAccepting
> duplState
) {
1211 if (sd
->fLookAhead
== duplState
) {
1212 sd
->fLookAhead
= keepState
;
1213 } else if (sd
->fLookAhead
> duplState
) {
1219 void RBBITableBuilder::removeSafeState(IntPair duplStates
) {
1220 const int32_t keepState
= duplStates
.first
;
1221 const int32_t duplState
= duplStates
.second
;
1222 U_ASSERT(keepState
< duplState
);
1223 U_ASSERT(duplState
< fSafeTable
->size());
1225 fSafeTable
->removeElementAt(duplState
); // Note that fSafeTable has a deleter function
1226 // and will auto-delete the removed element.
1227 int32_t numStates
= fSafeTable
->size();
1228 for (int32_t state
=0; state
<numStates
; ++state
) {
1229 UnicodeString
*sd
= (UnicodeString
*)fSafeTable
->elementAt(state
);
1230 int32_t numCols
= sd
->length();
1231 for (int32_t col
=0; col
<numCols
; col
++) {
1232 int32_t existingVal
= sd
->charAt(col
);
1233 int32_t newVal
= existingVal
;
1234 if (existingVal
== duplState
) {
1236 } else if (existingVal
> duplState
) {
1237 newVal
= existingVal
- 1;
1239 sd
->setCharAt(col
, newVal
);
1246 * RemoveDuplicateStates
1248 void RBBITableBuilder::removeDuplicateStates() {
1249 IntPair dupls
= {3, 0};
1250 while (findDuplicateState(&dupls
)) {
1251 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1257 //-----------------------------------------------------------------------------
1259 // getTableSize() Calculate the size of the runtime form of this
1260 // state transition table.
1262 //-----------------------------------------------------------------------------
1263 int32_t RBBITableBuilder::getTableSize() const {
1269 if (fTree
== NULL
) {
1273 size
= offsetof(RBBIStateTable
, fTableData
); // The header, with no rows to the table.
1275 numRows
= fDStates
->size();
1276 numCols
= fRB
->fSetBuilder
->getNumCharCategories();
1278 rowSize
= offsetof(RBBIStateTableRow
, fNextState
) + sizeof(uint16_t)*numCols
;
1279 size
+= numRows
* rowSize
;
1284 //-----------------------------------------------------------------------------
1286 // exportTable() export the state transition table in the format required
1287 // by the runtime engine. getTableSize() bytes of memory
1288 // must be available at the output address "where".
1290 //-----------------------------------------------------------------------------
1291 void RBBITableBuilder::exportTable(void *where
) {
1292 RBBIStateTable
*table
= (RBBIStateTable
*)where
;
1296 if (U_FAILURE(*fStatus
) || fTree
== NULL
) {
1300 int32_t catCount
= fRB
->fSetBuilder
->getNumCharCategories();
1301 if (catCount
> 0x7fff ||
1302 fDStates
->size() > 0x7fff) {
1303 *fStatus
= U_BRK_INTERNAL_ERROR
;
1307 table
->fRowLen
= offsetof(RBBIStateTableRow
, fNextState
) + sizeof(uint16_t) * catCount
;
1308 table
->fNumStates
= fDStates
->size();
1310 if (fRB
->fLookAheadHardBreak
) {
1311 table
->fFlags
|= RBBI_LOOKAHEAD_HARD_BREAK
;
1313 if (fRB
->fSetBuilder
->sawBOF()) {
1314 table
->fFlags
|= RBBI_BOF_REQUIRED
;
1316 table
->fReserved
= 0;
1318 for (state
=0; state
<table
->fNumStates
; state
++) {
1319 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(state
);
1320 RBBIStateTableRow
*row
= (RBBIStateTableRow
*)(table
->fTableData
+ state
*table
->fRowLen
);
1321 U_ASSERT (-32768 < sd
->fAccepting
&& sd
->fAccepting
<= 32767);
1322 U_ASSERT (-32768 < sd
->fLookAhead
&& sd
->fLookAhead
<= 32767);
1323 row
->fAccepting
= (int16_t)sd
->fAccepting
;
1324 row
->fLookAhead
= (int16_t)sd
->fLookAhead
;
1325 row
->fTagIdx
= (int16_t)sd
->fTagsIdx
;
1326 for (col
=0; col
<catCount
; col
++) {
1327 row
->fNextState
[col
] = (uint16_t)sd
->fDtran
->elementAti(col
);
1334 * Synthesize a safe state table from the main state table.
1336 void RBBITableBuilder::buildSafeReverseTable(UErrorCode
&status
) {
1337 // The safe table creation has three steps:
1339 // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
1340 // following the pair do not depend on context or state before the pair. To test
1341 // whether a pair is safe, run it through the main forward state table, starting
1342 // from each state. If the the final state is the same, no matter what the starting state,
1343 // the pair is safe.
1345 // 2. Build a state table that recognizes the safe pairs. It's similar to their
1346 // forward table, with a column for each input character [class], and a row for
1347 // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1348 // create an additional state for each input character category; being in
1349 // one of these states means that the character has been seen, and is potentially
1350 // the first of a pair. In each of these rows, the entry for the second character
1351 // of a safe pair is set to the stop state (0), indicating that a match was found.
1352 // All other table entries are set to the state corresponding the current input
1353 // character, allowing that charcter to be the of a start following pair.
1355 // Because the safe rules are to be run in reverse, moving backwards in the text,
1356 // the first and second pair categories are swapped when building the table.
1358 // 3. Compress the table. There are typically many rows (states) that are
1359 // equivalent - that have zeroes (match completed) in the same columns -
1360 // and can be folded together.
1362 // Each safe pair is stored as two UChars in the safePair string.
1363 UnicodeString safePairs
;
1365 int32_t numCharClasses
= fRB
->fSetBuilder
->getNumCharCategories();
1366 int32_t numStates
= fDStates
->size();
1368 for (int32_t c1
=0; c1
<numCharClasses
; ++c1
) {
1369 for (int32_t c2
=0; c2
< numCharClasses
; ++c2
) {
1370 int32_t wantedEndState
= -1;
1371 int32_t endState
= 0;
1372 for (int32_t startState
= 1; startState
< numStates
; ++startState
) {
1373 RBBIStateDescriptor
*startStateD
= static_cast<RBBIStateDescriptor
*>(fDStates
->elementAt(startState
));
1374 int32_t s2
= startStateD
->fDtran
->elementAti(c1
);
1375 RBBIStateDescriptor
*s2StateD
= static_cast<RBBIStateDescriptor
*>(fDStates
->elementAt(s2
));
1376 endState
= s2StateD
->fDtran
->elementAti(c2
);
1377 if (wantedEndState
< 0) {
1378 wantedEndState
= endState
;
1380 if (wantedEndState
!= endState
) {
1385 if (wantedEndState
== endState
) {
1386 safePairs
.append((char16_t)c1
);
1387 safePairs
.append((char16_t)c2
);
1388 // printf("(%d, %d) ", c1, c2);
1394 // Populate the initial safe table.
1395 // The table as a whole is UVector<UnicodeString>
1396 // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1397 // Row 0 is the stop state.
1398 // Row 1 is the start sate.
1399 // Row 2 and beyond are other states, initially one per char class, but
1400 // after initial construction, many of the states will be combined, compacting the table.
1401 // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1402 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1404 U_ASSERT(fSafeTable
== nullptr);
1405 fSafeTable
= new UVector(uprv_deleteUObject
, uhash_compareUnicodeString
, numCharClasses
+ 2, status
);
1406 for (int32_t row
=0; row
<numCharClasses
+ 2; ++row
) {
1407 fSafeTable
->addElement(new UnicodeString(numCharClasses
, 0, numCharClasses
+4), status
);
1410 // From the start state, each input char class transitions to the state for that input.
1411 UnicodeString
&startState
= *static_cast<UnicodeString
*>(fSafeTable
->elementAt(1));
1412 for (int32_t charClass
=0; charClass
< numCharClasses
; ++charClass
) {
1413 // Note: +2 for the start & stop state.
1414 startState
.setCharAt(charClass
, charClass
+2);
1417 // Initially make every other state table row look like the start state row,
1418 for (int32_t row
=2; row
<numCharClasses
+2; ++row
) {
1419 UnicodeString
&rowState
= *static_cast<UnicodeString
*>(fSafeTable
->elementAt(row
));
1420 rowState
= startState
; // UnicodeString assignment, copies contents.
1423 // Run through the safe pairs, set the next state to zero when pair has been seen.
1424 // Zero being the stop state, meaning we found a safe point.
1425 for (int32_t pairIdx
=0; pairIdx
<safePairs
.length(); pairIdx
+=2) {
1426 int32_t c1
= safePairs
.charAt(pairIdx
);
1427 int32_t c2
= safePairs
.charAt(pairIdx
+ 1);
1429 UnicodeString
&rowState
= *static_cast<UnicodeString
*>(fSafeTable
->elementAt(c2
+ 2));
1430 rowState
.setCharAt(c1
, 0);
1433 // Remove duplicate or redundant rows from the table.
1434 IntPair states
= {1, 0};
1435 while (findDuplicateSafeState(&states
)) {
1436 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1437 removeSafeState(states
);
1442 //-----------------------------------------------------------------------------
1444 // getSafeTableSize() Calculate the size of the runtime form of this
1445 // safe state table.
1447 //-----------------------------------------------------------------------------
1448 int32_t RBBITableBuilder::getSafeTableSize() const {
1454 if (fSafeTable
== nullptr) {
1458 size
= offsetof(RBBIStateTable
, fTableData
); // The header, with no rows to the table.
1460 numRows
= fSafeTable
->size();
1461 numCols
= fRB
->fSetBuilder
->getNumCharCategories();
1463 rowSize
= offsetof(RBBIStateTableRow
, fNextState
) + sizeof(uint16_t)*numCols
;
1464 size
+= numRows
* rowSize
;
1469 //-----------------------------------------------------------------------------
1471 // exportSafeTable() export the state transition table in the format required
1472 // by the runtime engine. getTableSize() bytes of memory
1473 // must be available at the output address "where".
1475 //-----------------------------------------------------------------------------
1476 void RBBITableBuilder::exportSafeTable(void *where
) {
1477 RBBIStateTable
*table
= (RBBIStateTable
*)where
;
1481 if (U_FAILURE(*fStatus
) || fSafeTable
== nullptr) {
1485 int32_t catCount
= fRB
->fSetBuilder
->getNumCharCategories();
1486 if (catCount
> 0x7fff ||
1487 fSafeTable
->size() > 0x7fff) {
1488 *fStatus
= U_BRK_INTERNAL_ERROR
;
1492 table
->fRowLen
= offsetof(RBBIStateTableRow
, fNextState
) + sizeof(uint16_t) * catCount
;
1493 table
->fNumStates
= fSafeTable
->size();
1495 table
->fReserved
= 0;
1497 for (state
=0; state
<table
->fNumStates
; state
++) {
1498 UnicodeString
*rowString
= (UnicodeString
*)fSafeTable
->elementAt(state
);
1499 RBBIStateTableRow
*row
= (RBBIStateTableRow
*)(table
->fTableData
+ state
*table
->fRowLen
);
1500 row
->fAccepting
= 0;
1501 row
->fLookAhead
= 0;
1504 for (col
=0; col
<catCount
; col
++) {
1505 row
->fNextState
[col
] = rowString
->charAt(col
);
1513 //-----------------------------------------------------------------------------
1515 // printSet Debug function. Print the contents of a UVector
1517 //-----------------------------------------------------------------------------
1519 void RBBITableBuilder::printSet(UVector
*s
) {
1521 for (i
=0; i
<s
->size(); i
++) {
1522 const RBBINode
*v
= static_cast<const RBBINode
*>(s
->elementAt(i
));
1523 RBBIDebugPrintf("%5d", v
==NULL
? -1 : v
->fSerialNum
);
1525 RBBIDebugPrintf("\n");
1530 //-----------------------------------------------------------------------------
1532 // printStates Debug Function. Dump the fully constructed state transition table.
1534 //-----------------------------------------------------------------------------
1536 void RBBITableBuilder::printStates() {
1537 int c
; // input "character"
1538 int n
; // state number
1540 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1541 RBBIDebugPrintf(" | Acc LA Tag");
1542 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1543 RBBIDebugPrintf(" %2d", c
);
1545 RBBIDebugPrintf("\n");
1546 RBBIDebugPrintf(" |---------------");
1547 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1548 RBBIDebugPrintf("---");
1550 RBBIDebugPrintf("\n");
1552 for (n
=0; n
<fDStates
->size(); n
++) {
1553 RBBIStateDescriptor
*sd
= (RBBIStateDescriptor
*)fDStates
->elementAt(n
);
1554 RBBIDebugPrintf(" %3d | " , n
);
1555 RBBIDebugPrintf("%3d %3d %5d ", sd
->fAccepting
, sd
->fLookAhead
, sd
->fTagsIdx
);
1556 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1557 RBBIDebugPrintf(" %2d", sd
->fDtran
->elementAti(c
));
1559 RBBIDebugPrintf("\n");
1561 RBBIDebugPrintf("\n\n");
1566 //-----------------------------------------------------------------------------
1568 // printSafeTable Debug Function. Dump the fully constructed safe table.
1570 //-----------------------------------------------------------------------------
1572 void RBBITableBuilder::printReverseTable() {
1573 int c
; // input "character"
1574 int n
; // state number
1576 RBBIDebugPrintf(" Safe Reverse Table \n");
1577 if (fSafeTable
== nullptr) {
1578 RBBIDebugPrintf(" --- nullptr ---\n");
1581 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1582 RBBIDebugPrintf(" | Acc LA Tag");
1583 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1584 RBBIDebugPrintf(" %2d", c
);
1586 RBBIDebugPrintf("\n");
1587 RBBIDebugPrintf(" |---------------");
1588 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1589 RBBIDebugPrintf("---");
1591 RBBIDebugPrintf("\n");
1593 for (n
=0; n
<fSafeTable
->size(); n
++) {
1594 UnicodeString
*rowString
= (UnicodeString
*)fSafeTable
->elementAt(n
);
1595 RBBIDebugPrintf(" %3d | " , n
);
1596 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
1597 for (c
=0; c
<fRB
->fSetBuilder
->getNumCharCategories(); c
++) {
1598 RBBIDebugPrintf(" %2d", rowString
->charAt(c
));
1600 RBBIDebugPrintf("\n");
1602 RBBIDebugPrintf("\n\n");
1608 //-----------------------------------------------------------------------------
1610 // printRuleStatusTable Debug Function. Dump the common rule status table
1612 //-----------------------------------------------------------------------------
1614 void RBBITableBuilder::printRuleStatusTable() {
1615 int32_t thisRecord
= 0;
1616 int32_t nextRecord
= 0;
1618 UVector
*tbl
= fRB
->fRuleStatusVals
;
1620 RBBIDebugPrintf("index | tags \n");
1621 RBBIDebugPrintf("-------------------\n");
1623 while (nextRecord
< tbl
->size()) {
1624 thisRecord
= nextRecord
;
1625 nextRecord
= thisRecord
+ tbl
->elementAti(thisRecord
) + 1;
1626 RBBIDebugPrintf("%4d ", thisRecord
);
1627 for (i
=thisRecord
+1; i
<nextRecord
; i
++) {
1628 RBBIDebugPrintf(" %5d", tbl
->elementAti(i
));
1630 RBBIDebugPrintf("\n");
1632 RBBIDebugPrintf("\n\n");
1637 //-----------------------------------------------------------------------------
1639 // RBBIStateDescriptor Methods. This is a very struct-like class
1640 // Most access is directly to the fields.
1642 //-----------------------------------------------------------------------------
1644 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol
, UErrorCode
*fStatus
) {
1653 fDtran
= new UVector32(lastInputSymbol
+1, *fStatus
);
1654 if (U_FAILURE(*fStatus
)) {
1657 if (fDtran
== NULL
) {
1658 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
1661 fDtran
->setSize(lastInputSymbol
+1); // fDtran needs to be pre-sized.
1662 // It is indexed by input symbols, and will
1663 // hold the next state number for each
1668 RBBIStateDescriptor::~RBBIStateDescriptor() {
1679 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */