ICU-62135.0.1.tar.gz

[apple/icu.git] / icuSources / common / rbbitblb.cpp
diff --git a/icuSources/common/rbbitblb.cpp b/icuSources/common/rbbitblb.cpp

index 2fd1063a8e2b9e527125370f7c8633ec67d85b2e..8a6f7c792f33b9339125460f6459515c1b44ff4a 100644 (file)
--- a/icuSources/common/rbbitblb.cpp
+++ b/icuSources/common/rbbitblb.cpp
@@ -1,13 +1,15 @@
-//
-//  rbbitblb.cpp
-//
-
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
  **********************************************************************
-*   Copyright (c) 2002-2003, International Business Machines
+*   Copyright (c) 2002-2016, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  **********************************************************************
  */
+//
+//  rbbitblb.cpp
+//
+
  
  #include "unicode/utypes.h"
  
@@ -20,14 +22,25 @@
  #include "rbbidata.h"
  #include "cstring.h"
  #include "uassert.h"
+#include "uvectr32.h"
+#include "cmemory.h"
  
  U_NAMESPACE_BEGIN
  
-RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
- fTree(*rootNode) {
-    fRB             = rb;
-    fStatus         = fRB->fStatus;
-    fDStates        = new UVector(*fStatus);
+RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
+        fRB(rb),
+        fTree(*rootNode),
+        fStatus(&status),
+        fDStates(nullptr),
+        fSafeTable(nullptr) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    // fDStates is UVector<RBBIStateDescriptor *>
+    fDStates = new UVector(status);
+    if (U_SUCCESS(status) && fDStates == nullptr ) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
  }
  
  
@@ -37,17 +50,18 @@ RBBITableBuilder::~RBBITableBuilder() {
      for (i=0; i<fDStates->size(); i++) {
          delete (RBBIStateDescriptor *)fDStates->elementAt(i);
      }
-    delete   fDStates;
+    delete fDStates;
+    delete fSafeTable;
  }
  
  
  //-----------------------------------------------------------------------------
  //
-//   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
-//                               table from the RBBI rules parse tree.
+//   RBBITableBuilder::buildForwardTable  -  This is the main function for building
+//                               the DFA state transition table from the RBBI rules parse tree.
  //
  //-----------------------------------------------------------------------------
-void  RBBITableBuilder::build() {
+void  RBBITableBuilder::buildForwardTable() {
  
      if (U_FAILURE(*fStatus)) {
          return;
@@ -64,9 +78,34 @@ void  RBBITableBuilder::build() {
      //   parse tree for the substition expression.
      //
      fTree = fTree->flattenVariables();
+#ifdef RBBI_DEBUG
      if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
-        RBBIDebugPrintf("Parse tree after flattening variable references.\n");
-        fTree->printTree(TRUE);
+        RBBIDebugPuts("\nParse tree after flattening variable references.");
+        RBBINode::printTree(fTree, TRUE);
+    }
+#endif
+
+    //
+    // If the rules contained any references to {bof} 
+    //   add a {bof} <cat> <former root of tree> to the
+    //   tree.  Means that all matches must start out with the 
+    //   {bof} fake character.
+    // 
+    if (fRB->fSetBuilder->sawBOF()) {
+        RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
+        RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
+        // Delete and exit if memory allocation failed.
+        if (bofTop == NULL || bofLeaf == NULL) {
+            *fStatus = U_MEMORY_ALLOCATION_ERROR;
+            delete bofTop;
+            delete bofLeaf;
+            return;
+        }
+        bofTop->fLeftChild  = bofLeaf;
+        bofTop->fRightChild = fTree;
+        bofLeaf->fParent    = bofTop;
+        bofLeaf->fVal       = 2;      // Reserved value for {bof}.
+        fTree               = bofTop;
      }
  
      //
@@ -75,9 +114,20 @@ void  RBBITableBuilder::build() {
      //   right child being the end marker.
      //
      RBBINode *cn = new RBBINode(RBBINode::opCat);
+    // Exit if memory allocation failed.
+    if (cn == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
      cn->fLeftChild = fTree;
      fTree->fParent = cn;
      cn->fRightChild = new RBBINode(RBBINode::endMark);
+    // Delete and exit if memory allocation failed.
+    if (cn->fRightChild == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+        delete cn;
+        return;
+    }
      cn->fRightChild->fParent = cn;
      fTree = cn;
  
@@ -86,10 +136,12 @@ void  RBBITableBuilder::build() {
      //      expression.
      //
      fTree->flattenSets();
+#ifdef RBBI_DEBUG
      if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
-        RBBIDebugPrintf("Parse tree after flattening Unicode Set references.\n");
-        fTree->printTree(TRUE);
+        RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
+        RBBINode::printTree(fTree, TRUE);
      }
+#endif
  
  
      //
@@ -104,10 +156,24 @@ void  RBBITableBuilder::build() {
      calcLastPos(fTree);
      calcFollowPos(fTree);
      if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
-        RBBIDebugPrintf("\n\n");
+        RBBIDebugPuts("\n");
          printPosSets(fTree);
      }
  
+    //
+    //  For "chained" rules, modify the followPos sets
+    //
+    if (fRB->fChainRules) {
+        calcChainedFollowPos(fTree);
+    }
+
+    //
+    //  BOF (start of input) test fixup.
+    //
+    if (fRB->fSetBuilder->sawBOF()) {
+        bofFixup();
+    }
+
      //
      // Build the DFA state transition tables.
      //
@@ -115,8 +181,13 @@ void  RBBITableBuilder::build() {
      flagAcceptingStates();
      flagLookAheadStates();
      flagTaggedStates();
-    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
  
+    //
+    // Update the global table of rule status {tag} values
+    // The rule builder has a global vector of status values that are common
+    //    for all tables.  Merge the ones from this table into the global set.
+    //
+    mergeRuleStatusVals();
  }
  
  
@@ -182,6 +253,9 @@ void RBBITableBuilder::calcFirstPos(RBBINode *n) {
          n->fType == RBBINode::lookAhead ||
          n->fType == RBBINode::tag) {
          // These are non-empty leaf node types.
+        // Note: In order to maintain the sort invariant on the set,
+        // this function should only be called on a node whose set is
+        // empty to start with.
          n->fFirstPosSet->addElement(n, *fStatus);
          return;
      }
@@ -225,6 +299,9 @@ void RBBITableBuilder::calcLastPos(RBBINode *n) {
          n->fType == RBBINode::lookAhead ||
          n->fType == RBBINode::tag) {
          // These are non-empty leaf node types.
+        // Note: In order to maintain the sort invariant on the set,
+        // this function should only be called on a node whose set is
+        // empty to start with.
          n->fLastPosSet->addElement(n, *fStatus);
          return;
      }
@@ -298,6 +375,186 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) {
  
  }
  
+//-----------------------------------------------------------------------------
+//
+//    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
+//                        as roots of a rule to a destination vector.
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
+    if (node == NULL || U_FAILURE(*fStatus)) {
+        return;
+    }
+    if (node->fRuleRoot) {
+        dest->addElement(node, *fStatus);
+        // Note: rules cannot nest. If we found a rule start node,
+        //       no child node can also be a start node.
+        return;
+    }
+    addRuleRootNodes(dest, node->fLeftChild);
+    addRuleRootNodes(dest, node->fRightChild);
+}
+
+//-----------------------------------------------------------------------------
+//
+//   calcChainedFollowPos.    Modify the previously calculated followPos sets
+//                            to implement rule chaining.  NOT described by Aho
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
+
+    UVector         endMarkerNodes(*fStatus);
+    UVector         leafNodes(*fStatus);
+    int32_t         i;
+
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // get a list of all endmarker nodes.
+    tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
+
+    // get a list all leaf nodes
+    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // Collect all leaf nodes that can start matches for rules
+    // with inbound chaining enabled, which is the union of the 
+    // firstPosition sets from each of the rule root nodes.
+    
+    UVector ruleRootNodes(*fStatus);
+    addRuleRootNodes(&ruleRootNodes, tree);
+
+    UVector matchStartNodes(*fStatus);
+    for (int i=0; i<ruleRootNodes.size(); ++i) {
+        RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i));
+        if (node->fChainIn) {
+            setAdd(&matchStartNodes, node->fFirstPosSet);
+        }
+    }
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    int32_t  endNodeIx;
+    int32_t  startNodeIx;
+
+    for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
+        RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
+        RBBINode *endNode = NULL;
+
+        // Identify leaf nodes that correspond to overall rule match positions.
+        //   These include an endMarkerNode in their followPos sets.
+        for (i=0; i<endMarkerNodes.size(); i++) {
+            if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
+                endNode = tNode;
+                break;
+            }
+        }
+        if (endNode == NULL) {
+            // node wasn't an end node.  Try again with the next.
+            continue;
+        }
+
+        // We've got a node that can end a match.
+
+        // Line Break Specific hack:  If this node's val correspond to the $CM char class,
+        //                            don't chain from it.
+        // TODO:  Add rule syntax for this behavior, get specifics out of here and
+        //        into the rule file.
+        if (fRB->fLBCMNoChain) {
+            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
+            if (c != -1) {
+                // c == -1 occurs with sets containing only the {eof} marker string.
+                ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
+                if (cLBProp == U_LB_COMBINING_MARK) {
+                    continue;
+                }
+            }
+        }
+
+
+        // Now iterate over the nodes that can start a match, looking for ones
+        //   with the same char class as our ending node.
+        RBBINode *startNode;
+        for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
+            startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
+            if (startNode->fType != RBBINode::leafChar) {
+                continue;
+            }
+
+            if (endNode->fVal == startNode->fVal) {
+                // The end val (character class) of one possible match is the
+                //   same as the start of another.
+
+                // Add all nodes from the followPos of the start node to the
+                //  followPos set of the end node, which will have the effect of
+                //  letting matches transition from a match state at endNode
+                //  to the second char of a match starting with startNode.
+                setAdd(endNode->fFollowPos, startNode->fFollowPos);
+            }
+        }
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+//
+//   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
+//                Do an swizzle similar to chaining, modifying the followPos set of
+//                the bofNode to include the followPos nodes from other {bot} nodes
+//                scattered through the tree.
+//
+//                This function has much in common with calcChainedFollowPos().
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::bofFixup() {
+
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    //   The parse tree looks like this ...
+    //         fTree root  --->       <cat>
+    //                               /     \       .
+    //                            <cat>   <#end node>
+    //                           /     \  .
+    //                     <bofNode>   rest
+    //                               of tree
+    //
+    //    We will be adding things to the followPos set of the <bofNode>
+    //
+    RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
+    U_ASSERT(bofNode->fType == RBBINode::leafChar);
+    U_ASSERT(bofNode->fVal == 2);
+
+    // Get all nodes that can be the start a match of the user-written rules
+    //  (excluding the fake bofNode)
+    //  We want the nodes that can start a match in the
+    //     part labeled "rest of tree"
+    // 
+    UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
+
+    RBBINode *startNode;
+    int       startNodeIx;
+    for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
+        startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
+        if (startNode->fType != RBBINode::leafChar) {
+            continue;
+        }
+
+        if (startNode->fVal == bofNode->fVal) {
+            //  We found a leaf node corresponding to a {bof} that was
+            //    explicitly written into a rule.
+            //  Add everything from the followPos set of this node to the
+            //    followPos set of the fake bofNode at the start of the tree.
+            //  
+            setAdd(bofNode->fFollowPos, startNode->fFollowPos);
+        }
+    }
+}
  
  //-----------------------------------------------------------------------------
  //
@@ -309,19 +566,53 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) {
  //
  //-----------------------------------------------------------------------------
  void RBBITableBuilder::buildStateTable() {
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+    RBBIStateDescriptor *failState;
+    // Set it to NULL to avoid uninitialized warning
+    RBBIStateDescriptor *initialState = NULL; 
      //
      // Add a dummy state 0 - the stop state.  Not from Aho.
      int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
-    RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+    failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+    if (failState == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+        goto ExitBuildSTdeleteall;
+    }
      failState->fPositions = new UVector(*fStatus);
+    if (failState->fPositions == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
+        goto ExitBuildSTdeleteall;
+    }
      fDStates->addElement(failState, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        goto ExitBuildSTdeleteall;
+    }
  
      // initially, the only unmarked state in Dstates is firstpos(root),
      //       where toot is the root of the syntax tree for (r)#;
-    RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+    initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+    if (initialState == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(*fStatus)) {
+        goto ExitBuildSTdeleteall;
+    }
      initialState->fPositions = new UVector(*fStatus);
+    if (initialState->fPositions == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(*fStatus)) {
+        goto ExitBuildSTdeleteall;
+    }
      setAdd(initialState->fPositions, fTree->fFirstPosSet);
      fDStates->addElement(initialState, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        goto ExitBuildSTdeleteall;
+    }
  
      // while there is an unmarked state T in Dstates do begin
      for (;;) {
@@ -356,6 +647,10 @@ void RBBITableBuilder::buildStateTable() {
                  if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
                      if (U == NULL) {
                          U = new UVector(*fStatus);
+                        if (U == NULL) {
+                               *fStatus = U_MEMORY_ALLOCATION_ERROR;
+                               goto ExitBuildSTdeleteall;
+                        }
                      }
                      setAdd(U, p->fFollowPos);
                  }
@@ -383,8 +678,17 @@ void RBBITableBuilder::buildStateTable() {
                  if (!UinDstates)
                  {
                      RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+                    if (newState == NULL) {
+                       *fStatus = U_MEMORY_ALLOCATION_ERROR;
+                    }
+                    if (U_FAILURE(*fStatus)) {
+                        goto ExitBuildSTdeleteall;
+                    }
                      newState->fPositions = U;
                      fDStates->addElement(newState, *fStatus);
+                    if (U_FAILURE(*fStatus)) {
+                        return;
+                    }
                      ux = fDStates->size()-1;
                  }
  
@@ -393,6 +697,11 @@ void RBBITableBuilder::buildStateTable() {
              }
          }
      }
+    return;
+    // delete local pointers only if error occured.
+ExitBuildSTdeleteall:
+    delete initialState;
+    delete failState;
  }
  
  
@@ -407,12 +716,22 @@ void RBBITableBuilder::buildStateTable() {
  //
  //-----------------------------------------------------------------------------
  void     RBBITableBuilder::flagAcceptingStates() {
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
      UVector     endMarkerNodes(*fStatus);
      RBBINode    *endMarker;
      int32_t     i;
      int32_t     n;
  
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
      fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
  
      for (i=0; i<endMarkerNodes.size(); i++) {
          endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
@@ -422,14 +741,29 @@ void     RBBITableBuilder::flagAcceptingStates() {
                  // Any non-zero value for fAccepting means this is an accepting node.
                  // The value is what will be returned to the user as the break status.
                  // If no other value was specified, force it to -1.
-                sd->fAccepting = endMarker->fVal;
-                if (sd->fAccepting == 0) {
-                    sd->fAccepting = -1;
+
+                if (sd->fAccepting==0) {
+                    // State hasn't been marked as accepting yet.  Do it now.
+                    sd->fAccepting = endMarker->fVal;
+                    if (sd->fAccepting == 0) {
+                        sd->fAccepting = -1;
+                    }
+                }
+                if (sd->fAccepting==-1 && endMarker->fVal != 0) {
+                    // Both lookahead and non-lookahead accepting for this state.
+                    // Favor the look-ahead.  Expedient for line break.
+                    // TODO:  need a more elegant resolution for conflicting rules.
+                    sd->fAccepting = endMarker->fVal;
                  }
+                // implicit else:
+                // if sd->fAccepting already had a value other than 0 or -1, leave it be.
  
                  // If the end marker node is from a look-ahead rule, set
-                //   the fLookAhead field or this state also.
+                //   the fLookAhead field for this state also.
                  if (endMarker->fLookAheadEnd) {
+                    // TODO:  don't change value if already set?
+                    // TODO:  allow for more than one active look-ahead rule in engine.
+                    //        Make value here an index to a side array in engine?
                      sd->fLookAhead = sd->fAccepting;
                  }
              }
@@ -444,12 +778,18 @@ void     RBBITableBuilder::flagAcceptingStates() {
  //
  //-----------------------------------------------------------------------------
  void     RBBITableBuilder::flagLookAheadStates() {
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
      UVector     lookAheadNodes(*fStatus);
      RBBINode    *lookAheadNode;
      int32_t     i;
      int32_t     n;
  
      fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
      for (i=0; i<lookAheadNodes.size(); i++) {
          lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
  
@@ -471,26 +811,159 @@ void     RBBITableBuilder::flagLookAheadStates() {
  //
  //-----------------------------------------------------------------------------
  void     RBBITableBuilder::flagTaggedStates() {
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
      UVector     tagNodes(*fStatus);
      RBBINode    *tagNode;
      int32_t     i;
      int32_t     n;
  
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
      fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
      for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
          tagNode = (RBBINode *)tagNodes.elementAt(i);
  
          for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
              RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
              if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
-                if (sd->fTagVal < tagNode->fVal) {
-                    // If more than one rule tag applies to this state, the larger
-                    //   tag takes precedence.
-                sd->fTagVal = tagNode->fVal;
+                sortedAdd(&sd->fTagVals, tagNode->fVal);
+            }
+        }
+    }
+}
+
+
+
+
+//-----------------------------------------------------------------------------
+//
+//  mergeRuleStatusVals
+//
+//      Update the global table of rule status {tag} values
+//      The rule builder has a global vector of status values that are common
+//      for all tables.  Merge the ones from this table into the global set.
+//
+//-----------------------------------------------------------------------------
+void  RBBITableBuilder::mergeRuleStatusVals() {
+    //
+    //  The basic outline of what happens here is this...
+    //
+    //    for each state in this state table
+    //       if the status tag list for this state is in the global statuses list
+    //           record where and
+    //           continue with the next state
+    //       else
+    //           add the tag list for this state to the global list.
+    //
+    int i;
+    int n;
+
+    // Pre-set a single tag of {0} into the table.
+    //   We will need this as a default, for rule sets with no explicit tagging.
+    if (fRB->fRuleStatusVals->size() == 0) {
+        fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
+        fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
+    }
+
+    //    For each state
+    for (n=0; n<fDStates->size(); n++) {
+        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
+        UVector *thisStatesTagValues = sd->fTagVals;
+        if (thisStatesTagValues == NULL) {
+            // No tag values are explicitly associated with this state.
+            //   Set the default tag value.
+            sd->fTagsIdx = 0;
+            continue;
+        }
+
+        // There are tag(s) associated with this state.
+        //   fTagsIdx will be the index into the global tag list for this state's tag values.
+        //   Initial value of -1 flags that we haven't got it set yet.
+        sd->fTagsIdx = -1;
+        int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
+        int32_t  nextTagGroupStart = 0;
+
+        // Loop runs once per group of tags in the global list
+        while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
+            thisTagGroupStart = nextTagGroupStart;
+            nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
+            if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
+                // The number of tags for this state is different from
+                //    the number of tags in this group from the global list.
+                //    Continue with the next group from the global list.
+                continue;
+            }
+            // The lengths match, go ahead and compare the actual tag values
+            //    between this state and the group from the global list.
+            for (i=0; i<thisStatesTagValues->size(); i++) {
+                if (thisStatesTagValues->elementAti(i) !=
+                    fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
+                    // Mismatch.
+                    break;
+                }
+            }
+
+            if (i == thisStatesTagValues->size()) {
+                // We found a set of tag values in the global list that match
+                //   those for this state.  Use them.
+                sd->fTagsIdx = thisTagGroupStart;
+                break;
+            }
+        }
+
+        if (sd->fTagsIdx == -1) {
+            // No suitable entry in the global tag list already.  Add one
+            sd->fTagsIdx = fRB->fRuleStatusVals->size();
+            fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
+            for (i=0; i<thisStatesTagValues->size(); i++) {
+                fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
              }
          }
      }
  }
+
+
+
+
+
+
+
+//-----------------------------------------------------------------------------
+//
+//  sortedAdd  Add a value to a vector of sorted values (ints).
+//             Do not replicate entries; if the value is already there, do not
+//                add a second one.
+//             Lazily create the vector if it does not already exist.
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
+    int32_t i;
+
+    if (*vector == NULL) {
+        *vector = new UVector(*fStatus);
+    }
+    if (*vector == NULL || U_FAILURE(*fStatus)) {
+        return;
+    }
+    UVector *vec = *vector;
+    int32_t  vSize = vec->size();
+    for (i=0; i<vSize; i++) {
+        int32_t valAtI = vec->elementAti(i);
+        if (valAtI == val) {
+            // The value is already in the vector.  Don't add it again.
+            return;
+        }
+        if (valAtI > val) {
+            break;
+        }
+    }
+    vec->insertElementAt(val, i, *fStatus);
  }
  
  
@@ -499,65 +972,76 @@ void     RBBITableBuilder::flagTaggedStates() {
  //
  //  setAdd     Set operation on UVector
  //             dest = dest union source
-//             Elements may only appear once.   Order is unimportant.
+//             Elements may only appear once and must be sorted.
  //
  //-----------------------------------------------------------------------------
  void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
-    int destOriginalSize = dest->size();
-    int sourceSize       = source->size();
-    int32_t  si, di;
-
-    for (si=0; si<sourceSize; si++) {
-        void *elToAdd = source->elementAt(si);
-        for (di=0; di<destOriginalSize; di++) {
-            if (dest->elementAt(di) == elToAdd) {
-                goto  elementAlreadyInDest;
-            }
+    int32_t destOriginalSize = dest->size();
+    int32_t sourceSize       = source->size();
+    int32_t di           = 0;
+    MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
+    void **destPtr, **sourcePtr;
+    void **destLim, **sourceLim;
+
+    if (destOriginalSize > destArray.getCapacity()) {
+        if (destArray.resize(destOriginalSize) == NULL) {
+            return;
          }
-        dest->addElement(elToAdd, *fStatus);
-    elementAlreadyInDest: ;
      }
+    destPtr = destArray.getAlias();
+    destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
+
+    if (sourceSize > sourceArray.getCapacity()) {
+        if (sourceArray.resize(sourceSize) == NULL) {
+            return;
+        }
+    }
+    sourcePtr = sourceArray.getAlias();
+    sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
+
+    // Avoid multiple "get element" calls by getting the contents into arrays
+    (void) dest->toArray(destPtr);
+    (void) source->toArray(sourcePtr);
+
+    dest->setSize(sourceSize+destOriginalSize, *fStatus);
+
+    while (sourcePtr < sourceLim && destPtr < destLim) {
+        if (*destPtr == *sourcePtr) {
+            dest->setElementAt(*sourcePtr++, di++);
+            destPtr++;
+        }
+        // This check is required for machines with segmented memory, like i5/OS.
+        // Direct pointer comparison is not recommended.
+        else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
+            dest->setElementAt(*destPtr++, di++);
+        }
+        else { /* *sourcePtr < *destPtr */
+            dest->setElementAt(*sourcePtr++, di++);
+        }
+    }
+
+    // At most one of these two cleanup loops will execute
+    while (destPtr < destLim) {
+        dest->setElementAt(*destPtr++, di++);
+    }
+    while (sourcePtr < sourceLim) {
+        dest->setElementAt(*sourcePtr++, di++);
+    }
+
+    dest->setSize(di, *fStatus);
  }
  
  
+
  //-----------------------------------------------------------------------------
  //
  //  setEqual    Set operation on UVector.
  //              Compare for equality.
-//              Elements may appear only once.
-//              Elements may appear in any order.
+//              Elements must be sorted.
  //
  //-----------------------------------------------------------------------------
  UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
-    int32_t    aSize = a->size();
-    int32_t    bSize = b->size();
-
-    if (aSize != bSize) {
-        return FALSE;
-    }
-
-    int32_t  ax;
-    int32_t  bx;
-    int32_t  firstBx = 0;
-    void     *aVal;
-    void     *bVal = NULL;
-
-    for (ax=0; ax<aSize; ax++) {
-        aVal = a->elementAt(ax);
-        for (bx=firstBx; bx<bSize; bx++) {
-            bVal = b->elementAt(bx);
-            if (aVal == bVal) {
-                if (bx==firstBx) {
-                    firstBx++;
-                }
-                break;
-            }
-        }
-        if (aVal != bVal) {
-            return FALSE;
-        }
-    }
-    return TRUE;
+    return a->equals(*b);
  }
  
  
@@ -567,12 +1051,14 @@ UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
  //                 for each node in the tree.
  //
  //-----------------------------------------------------------------------------
-void RBBITableBuilder::printPosSets(RBBINode *n) {
  #ifdef RBBI_DEBUG
+void RBBITableBuilder::printPosSets(RBBINode *n) {
      if (n==NULL) {
          return;
      }
-    n->print();
+    printf("\n");
+    RBBINode::printNodeHeader();
+    RBBINode::printNode(n);
      RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
  
      RBBIDebugPrintf("         firstpos:  ");
@@ -586,10 +1072,187 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
  
      printPosSets(n->fLeftChild);
      printPosSets(n->fRightChild);
+}
  #endif
+
+//
+//    findDuplCharClassFrom()
+//
+bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
+    int32_t numStates = fDStates->size();
+    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+
+    uint16_t table_base;
+    uint16_t table_dupl;
+    for (; categories->first < numCols-1; categories->first++) {
+        for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
+             for (int32_t state=0; state<numStates; state++) {
+                 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+                 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
+                 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
+                 if (table_base != table_dupl) {
+                     break;
+                 }
+             }
+             if (table_base == table_dupl) {
+                 return true;
+             }
+        }
+    }
+    return false;
+}
+
+
+//
+//    removeColumn()
+//
+void RBBITableBuilder::removeColumn(int32_t column) {
+    int32_t numStates = fDStates->size();
+    for (int32_t state=0; state<numStates; state++) {
+        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+        U_ASSERT(column < sd->fDtran->size());
+        sd->fDtran->removeElementAt(column);
+    }
+}
+
+/*
+ * findDuplicateState
+ */
+bool RBBITableBuilder::findDuplicateState(IntPair *states) {
+    int32_t numStates = fDStates->size();
+    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+
+    for (; states->first<numStates-1; states->first++) {
+        RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
+        for (states->second=states->first+1; states->second<numStates; states->second++) {
+            RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
+            if (firstSD->fAccepting != duplSD->fAccepting ||
+                firstSD->fLookAhead != duplSD->fLookAhead ||
+                firstSD->fTagsIdx   != duplSD->fTagsIdx) {
+                continue;
+            }
+            bool rowsMatch = true;
+            for (int32_t col=0; col < numCols; ++col) {
+                int32_t firstVal = firstSD->fDtran->elementAti(col);
+                int32_t duplVal = duplSD->fDtran->elementAti(col);
+                if (!((firstVal == duplVal) ||
+                        ((firstVal == states->first || firstVal == states->second) &&
+                        (duplVal  == states->first || duplVal  == states->second)))) {
+                    rowsMatch = false;
+                    break;
+                }
+            }
+            if (rowsMatch) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+
+bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
+    int32_t numStates = fSafeTable->size();
+
+    for (; states->first<numStates-1; states->first++) {
+        UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
+        for (states->second=states->first+1; states->second<numStates; states->second++) {
+            UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
+            bool rowsMatch = true;
+            int32_t numCols = firstRow->length();
+            for (int32_t col=0; col < numCols; ++col) {
+                int32_t firstVal = firstRow->charAt(col);
+                int32_t duplVal = duplRow->charAt(col);
+                if (!((firstVal == duplVal) ||
+                        ((firstVal == states->first || firstVal == states->second) &&
+                        (duplVal  == states->first || duplVal  == states->second)))) {
+                    rowsMatch = false;
+                    break;
+                }
+            }
+            if (rowsMatch) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+
+void RBBITableBuilder::removeState(IntPair duplStates) {
+    const int32_t keepState = duplStates.first;
+    const int32_t duplState = duplStates.second;
+    U_ASSERT(keepState < duplState);
+    U_ASSERT(duplState < fDStates->size());
+
+    RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
+    fDStates->removeElementAt(duplState);
+    delete duplSD;
+
+    int32_t numStates = fDStates->size();
+    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+    for (int32_t state=0; state<numStates; ++state) {
+        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+        for (int32_t col=0; col<numCols; col++) {
+            int32_t existingVal = sd->fDtran->elementAti(col);
+            int32_t newVal = existingVal;
+            if (existingVal == duplState) {
+                newVal = keepState;
+            } else if (existingVal > duplState) {
+                newVal = existingVal - 1;
+            }
+            sd->fDtran->setElementAt(newVal, col);
+        }
+        if (sd->fAccepting == duplState) {
+            sd->fAccepting = keepState;
+        } else if (sd->fAccepting > duplState) {
+            sd->fAccepting--;
+        }
+        if (sd->fLookAhead == duplState) {
+            sd->fLookAhead = keepState;
+        } else if (sd->fLookAhead > duplState) {
+            sd->fLookAhead--;
+        }
+    }
+}
+
+void RBBITableBuilder::removeSafeState(IntPair duplStates) {
+    const int32_t keepState = duplStates.first;
+    const int32_t duplState = duplStates.second;
+    U_ASSERT(keepState < duplState);
+    U_ASSERT(duplState < fSafeTable->size());
+
+    fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
+                                              // and will auto-delete the removed element.
+    int32_t numStates = fSafeTable->size();
+    for (int32_t state=0; state<numStates; ++state) {
+        UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
+        int32_t numCols = sd->length();
+        for (int32_t col=0; col<numCols; col++) {
+            int32_t existingVal = sd->charAt(col);
+            int32_t newVal = existingVal;
+            if (existingVal == duplState) {
+                newVal = keepState;
+            } else if (existingVal > duplState) {
+                newVal = existingVal - 1;
+            }
+            sd->setCharAt(col, newVal);
+        }
+    }
  }
  
  
+/*
+ * RemoveDuplicateStates
+ */
+void RBBITableBuilder::removeDuplicateStates() {
+    IntPair dupls = {3, 0};
+    while (findDuplicateState(&dupls)) {
+        // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
+        removeState(dupls);
+    }
+}
+
  
  //-----------------------------------------------------------------------------
  //
@@ -597,7 +1260,7 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
  //                     state transition table.
  //
  //-----------------------------------------------------------------------------
-int32_t  RBBITableBuilder::getTableSize() {
+int32_t  RBBITableBuilder::getTableSize() const {
      int32_t    size = 0;
      int32_t    numRows;
      int32_t    numCols;
@@ -607,21 +1270,17 @@ int32_t  RBBITableBuilder::getTableSize() {
          return 0;
      }
  
-    size    = sizeof(RBBIStateTable) - 4;    // The header, with no rows to the table.
+    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
  
      numRows = fDStates->size();
      numCols = fRB->fSetBuilder->getNumCharCategories();
  
-    //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
-    //        Therefore we subtract two from numCols when determining
-    //        how much storage to add to a row for the total columns.
-    rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
+    rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
      size   += numRows * rowSize;
      return size;
  }
  
  
-
  //-----------------------------------------------------------------------------
  //
  //   exportTable()    export the state transition table in the format required
@@ -638,15 +1297,23 @@ void RBBITableBuilder::exportTable(void *where) {
          return;
      }
  
-    if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
+    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
+    if (catCount > 0x7fff ||
          fDStates->size() > 0x7fff) {
          *fStatus = U_BRK_INTERNAL_ERROR;
          return;
      }
  
-    table->fRowLen    = sizeof(RBBIStateTableRow) +
-                            sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
+    table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
      table->fNumStates = fDStates->size();
+    table->fFlags     = 0;
+    if (fRB->fLookAheadHardBreak) {
+        table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
+    }
+    if (fRB->fSetBuilder->sawBOF()) {
+        table->fFlags  |= RBBI_BOF_REQUIRED;
+    }
+    table->fReserved  = 0;
  
      for (state=0; state<table->fNumStates; state++) {
          RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
@@ -655,30 +1322,209 @@ void RBBITableBuilder::exportTable(void *where) {
          U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
          row->fAccepting = (int16_t)sd->fAccepting;
          row->fLookAhead = (int16_t)sd->fLookAhead;
-        row->fTag       = (int16_t)sd->fTagVal;
-        for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
+        row->fTagIdx    = (int16_t)sd->fTagsIdx;
+        for (col=0; col<catCount; col++) {
              row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
          }
      }
  }
  
  
+/**
+ *   Synthesize a safe state table from the main state table.
+ */
+void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
+    // The safe table creation has three steps:
+
+    // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
+    // following the pair do not depend on context or state before the pair. To test
+    // whether a pair is safe, run it through the main forward state table, starting
+    // from each state. If the the final state is the same, no matter what the starting state,
+    // the pair is safe.
+    //
+    // 2. Build a state table that recognizes the safe pairs. It's similar to their
+    // forward table, with a column for each input character [class], and a row for
+    // each state. Row 1 is the start state, and row 0 is the stop state. Initially
+    // create an additional state for each input character category; being in
+    // one of these states means that the character has been seen, and is potentially
+    // the first of a pair. In each of these rows, the entry for the second character
+    // of a safe pair is set to the stop state (0), indicating that a match was found.
+    // All other table entries are set to the state corresponding the current input
+    // character, allowing that charcter to be the of a start following pair.
+    //
+    // Because the safe rules are to be run in reverse, moving backwards in the text,
+    // the first and second pair categories are swapped when building the table.
+    //
+    // 3. Compress the table. There are typically many rows (states) that are
+    // equivalent - that have zeroes (match completed) in the same columns -
+    // and can be folded together.
+
+    // Each safe pair is stored as two UChars in the safePair string.
+    UnicodeString safePairs;
+
+    int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
+    int32_t numStates = fDStates->size();
+
+    for (int32_t c1=0; c1<numCharClasses; ++c1) {
+        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
+            int32_t wantedEndState = -1;
+            int32_t endState = 0;
+            for (int32_t startState = 1; startState < numStates; ++startState) {
+                RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
+                int32_t s2 = startStateD->fDtran->elementAti(c1);
+                RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
+                endState = s2StateD->fDtran->elementAti(c2);
+                if (wantedEndState < 0) {
+                    wantedEndState = endState;
+                } else {
+                    if (wantedEndState != endState) {
+                        break;
+                    }
+                }
+            }
+            if (wantedEndState == endState) {
+                safePairs.append((char16_t)c1);
+                safePairs.append((char16_t)c2);
+                // printf("(%d, %d) ", c1, c2);
+            }
+        }
+        // printf("\n");
+    }
+
+    // Populate the initial safe table.
+    // The table as a whole is UVector<UnicodeString>
+    // Each row is represented by a UnicodeString, being used as a Vector<int16>.
+    // Row 0 is the stop state.
+    // Row 1 is the start sate.
+    // Row 2 and beyond are other states, initially one per char class, but
+    //   after initial construction, many of the states will be combined, compacting the table.
+    // The String holds the nextState data only. The four leading fields of a row, fAccepting,
+    // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
+
+    U_ASSERT(fSafeTable == nullptr);
+    fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
+    for (int32_t row=0; row<numCharClasses + 2; ++row) {
+        fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
+    }
+
+    // From the start state, each input char class transitions to the state for that input.
+    UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
+    for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
+        // Note: +2 for the start & stop state.
+        startState.setCharAt(charClass, charClass+2);
+    }
+
+    // Initially make every other state table row look like the start state row,
+    for (int32_t row=2; row<numCharClasses+2; ++row) {
+        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
+        rowState = startState;   // UnicodeString assignment, copies contents.
+    }
+
+    // Run through the safe pairs, set the next state to zero when pair has been seen.
+    // Zero being the stop state, meaning we found a safe point.
+    for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
+        int32_t c1 = safePairs.charAt(pairIdx);
+        int32_t c2 = safePairs.charAt(pairIdx + 1);
+
+        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
+        rowState.setCharAt(c1, 0);
+    }
+
+    // Remove duplicate or redundant rows from the table.
+    IntPair states = {1, 0};
+    while (findDuplicateSafeState(&states)) {
+        // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
+        removeSafeState(states);
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+//
+//   getSafeTableSize()    Calculate the size of the runtime form of this
+//                         safe state table.
+//
+//-----------------------------------------------------------------------------
+int32_t  RBBITableBuilder::getSafeTableSize() const {
+    int32_t    size = 0;
+    int32_t    numRows;
+    int32_t    numCols;
+    int32_t    rowSize;
+
+    if (fSafeTable == nullptr) {
+        return 0;
+    }
+
+    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
+
+    numRows = fSafeTable->size();
+    numCols = fRB->fSetBuilder->getNumCharCategories();
+
+    rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
+    size   += numRows * rowSize;
+    return size;
+}
+
+
+//-----------------------------------------------------------------------------
+//
+//   exportSafeTable()   export the state transition table in the format required
+//                       by the runtime engine.  getTableSize() bytes of memory
+//                       must be available at the output address "where".
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::exportSafeTable(void *where) {
+    RBBIStateTable    *table = (RBBIStateTable *)where;
+    uint32_t           state;
+    int                col;
+
+    if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
+        return;
+    }
+
+    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
+    if (catCount > 0x7fff ||
+            fSafeTable->size() > 0x7fff) {
+        *fStatus = U_BRK_INTERNAL_ERROR;
+        return;
+    }
+
+    table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
+    table->fNumStates = fSafeTable->size();
+    table->fFlags     = 0;
+    table->fReserved  = 0;
+
+    for (state=0; state<table->fNumStates; state++) {
+        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
+        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
+        row->fAccepting = 0;
+        row->fLookAhead = 0;
+        row->fTagIdx    = 0;
+        row->fReserved  = 0;
+        for (col=0; col<catCount; col++) {
+            row->fNextState[col] = rowString->charAt(col);
+        }
+    }
+}
+
+
+
  
  //-----------------------------------------------------------------------------
  //
  //   printSet    Debug function.   Print the contents of a UVector
  //
  //-----------------------------------------------------------------------------
-void RBBITableBuilder::printSet(UVector *s) {
  #ifdef RBBI_DEBUG
+void RBBITableBuilder::printSet(UVector *s) {
      int32_t  i;
      for (i=0; i<s->size(); i++) {
-        void *v = s->elementAt(i);
-        RBBIDebugPrintf("%10p", v);
+        const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
+        RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
      }
      RBBIDebugPrintf("\n");
-#endif
  }
+#endif
  
  
  //-----------------------------------------------------------------------------
@@ -686,34 +1532,106 @@ void RBBITableBuilder::printSet(UVector *s) {
  //   printStates    Debug Function.  Dump the fully constructed state transition table.
  //
  //-----------------------------------------------------------------------------
-void RBBITableBuilder::printStates() {
  #ifdef RBBI_DEBUG
+void RBBITableBuilder::printStates() {
      int     c;    // input "character"
      int     n;    // state number
  
      RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
      RBBIDebugPrintf("      | Acc  LA    Tag");
-    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {RBBIDebugPrintf(" %2d", c);};
+    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+        RBBIDebugPrintf(" %2d", c);
+    }
      RBBIDebugPrintf("\n");
      RBBIDebugPrintf("      |---------------");
-    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {RBBIDebugPrintf("---");};
+    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+        RBBIDebugPrintf("---");
+    }
      RBBIDebugPrintf("\n");
  
      for (n=0; n<fDStates->size(); n++) {
          RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
          RBBIDebugPrintf("  %3d | " , n);
-        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagVal);
+        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
          for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
              RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
          }
          RBBIDebugPrintf("\n");
      }
      RBBIDebugPrintf("\n\n");
+}
  #endif
+
+
+//-----------------------------------------------------------------------------
+//
+//   printSafeTable    Debug Function.  Dump the fully constructed safe table.
+//
+//-----------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
+void RBBITableBuilder::printReverseTable() {
+    int     c;    // input "character"
+    int     n;    // state number
+
+    RBBIDebugPrintf("    Safe Reverse Table \n");
+    if (fSafeTable == nullptr) {
+        RBBIDebugPrintf("   --- nullptr ---\n");
+        return;
+    }
+    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
+    RBBIDebugPrintf("      | Acc  LA    Tag");
+    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+        RBBIDebugPrintf(" %2d", c);
+    }
+    RBBIDebugPrintf("\n");
+    RBBIDebugPrintf("      |---------------");
+    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+        RBBIDebugPrintf("---");
+    }
+    RBBIDebugPrintf("\n");
+
+    for (n=0; n<fSafeTable->size(); n++) {
+        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
+        RBBIDebugPrintf("  %3d | " , n);
+        RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
+        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+            RBBIDebugPrintf(" %2d", rowString->charAt(c));
+        }
+        RBBIDebugPrintf("\n");
+    }
+    RBBIDebugPrintf("\n\n");
  }
+#endif
  
  
  
+//-----------------------------------------------------------------------------
+//
+//   printRuleStatusTable    Debug Function.  Dump the common rule status table
+//
+//-----------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
+void RBBITableBuilder::printRuleStatusTable() {
+    int32_t  thisRecord = 0;
+    int32_t  nextRecord = 0;
+    int      i;
+    UVector  *tbl = fRB->fRuleStatusVals;
+
+    RBBIDebugPrintf("index |  tags \n");
+    RBBIDebugPrintf("-------------------\n");
+
+    while (nextRecord < tbl->size()) {
+        thisRecord = nextRecord;
+        nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
+        RBBIDebugPrintf("%4d   ", thisRecord);
+        for (i=thisRecord+1; i<nextRecord; i++) {
+            RBBIDebugPrintf("  %5d", tbl->elementAti(i));
+        }
+        RBBIDebugPrintf("\n");
+    }
+    RBBIDebugPrintf("\n\n");
+}
+#endif
  
  
  //-----------------------------------------------------------------------------
@@ -727,13 +1645,15 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
      fMarked    = FALSE;
      fAccepting = 0;
      fLookAhead = 0;
-    fTagVal    = 0;
+    fTagsIdx   = 0;
+    fTagVals   = NULL;
      fPositions = NULL;
      fDtran     = NULL;
+
+    fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
      if (U_FAILURE(*fStatus)) {
          return;
      }
-    fDtran     = new UVector(lastInputSymbol+1, *fStatus);
      if (fDtran == NULL) {
          *fStatus = U_MEMORY_ALLOCATION_ERROR;
          return;
@@ -748,8 +1668,10 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
  RBBIStateDescriptor::~RBBIStateDescriptor() {
      delete       fPositions;
      delete       fDtran;
+    delete       fTagVals;
      fPositions = NULL;
      fDtran     = NULL;
+    fTagVals   = NULL;
  }
  
  U_NAMESPACE_END