]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/rbbitblb.cpp
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbitblb.cpp
index 0f1a901ffc542261bce396c0e362e1c20bfeddba..8a6f7c792f33b9339125460f6459515c1b44ff4a 100644 (file)
 #include "rbbidata.h"
 #include "cstring.h"
 #include "uassert.h"
+#include "uvectr32.h"
 #include "cmemory.h"
 
 U_NAMESPACE_BEGIN
 
-RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
- fTree(*rootNode) {
-    fRB                 = rb;
-    fStatus             = fRB->fStatus;
-    UErrorCode status   = U_ZERO_ERROR;
-    fDStates            = new UVector(status);
-    if (U_FAILURE(*fStatus)) {
-        return;
-    }
+RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
+        fRB(rb),
+        fTree(*rootNode),
+        fStatus(&status),
+        fDStates(nullptr),
+        fSafeTable(nullptr) {
     if (U_FAILURE(status)) {
-        *fStatus = status;
         return;
     }
-    if (fDStates == NULL) {
-        *fStatus = U_MEMORY_ALLOCATION_ERROR;;
+    // fDStates is UVector<RBBIStateDescriptor *>
+    fDStates = new UVector(status);
+    if (U_SUCCESS(status) && fDStates == nullptr ) {
+        status = U_MEMORY_ALLOCATION_ERROR;
     }
 }
 
@@ -51,17 +50,18 @@ RBBITableBuilder::~RBBITableBuilder() {
     for (i=0; i<fDStates->size(); i++) {
         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
     }
-    delete   fDStates;
+    delete fDStates;
+    delete fSafeTable;
 }
 
 
 //-----------------------------------------------------------------------------
 //
-//   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
-//                               table from the RBBI rules parse tree.
+//   RBBITableBuilder::buildForwardTable  -  This is the main function for building
+//                               the DFA state transition table from the RBBI rules parse tree.
 //
 //-----------------------------------------------------------------------------
-void  RBBITableBuilder::build() {
+void  RBBITableBuilder::buildForwardTable() {
 
     if (U_FAILURE(*fStatus)) {
         return;
@@ -188,8 +188,6 @@ void  RBBITableBuilder::build() {
     //    for all tables.  Merge the ones from this table into the global set.
     //
     mergeRuleStatusVals();
-
-    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
 }
 
 
@@ -761,7 +759,7 @@ void     RBBITableBuilder::flagAcceptingStates() {
                 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
 
                 // If the end marker node is from a look-ahead rule, set
-                //   the fLookAhead field or this state also.
+                //   the fLookAhead field for this state also.
                 if (endMarker->fLookAheadEnd) {
                     // TODO:  don't change value if already set?
                     // TODO:  allow for more than one active look-ahead rule in engine.
@@ -1077,6 +1075,183 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
 }
 #endif
 
+//
+//    findDuplCharClassFrom()
+//
+bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
+    int32_t numStates = fDStates->size();
+    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+
+    uint16_t table_base;
+    uint16_t table_dupl;
+    for (; categories->first < numCols-1; categories->first++) {
+        for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
+             for (int32_t state=0; state<numStates; state++) {
+                 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+                 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
+                 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
+                 if (table_base != table_dupl) {
+                     break;
+                 }
+             }
+             if (table_base == table_dupl) {
+                 return true;
+             }
+        }
+    }
+    return false;
+}
+
+
+//
+//    removeColumn()
+//
+void RBBITableBuilder::removeColumn(int32_t column) {
+    int32_t numStates = fDStates->size();
+    for (int32_t state=0; state<numStates; state++) {
+        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+        U_ASSERT(column < sd->fDtran->size());
+        sd->fDtran->removeElementAt(column);
+    }
+}
+
+/*
+ * findDuplicateState
+ */
+bool RBBITableBuilder::findDuplicateState(IntPair *states) {
+    int32_t numStates = fDStates->size();
+    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+
+    for (; states->first<numStates-1; states->first++) {
+        RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
+        for (states->second=states->first+1; states->second<numStates; states->second++) {
+            RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
+            if (firstSD->fAccepting != duplSD->fAccepting ||
+                firstSD->fLookAhead != duplSD->fLookAhead ||
+                firstSD->fTagsIdx   != duplSD->fTagsIdx) {
+                continue;
+            }
+            bool rowsMatch = true;
+            for (int32_t col=0; col < numCols; ++col) {
+                int32_t firstVal = firstSD->fDtran->elementAti(col);
+                int32_t duplVal = duplSD->fDtran->elementAti(col);
+                if (!((firstVal == duplVal) ||
+                        ((firstVal == states->first || firstVal == states->second) &&
+                        (duplVal  == states->first || duplVal  == states->second)))) {
+                    rowsMatch = false;
+                    break;
+                }
+            }
+            if (rowsMatch) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+
+bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
+    int32_t numStates = fSafeTable->size();
+
+    for (; states->first<numStates-1; states->first++) {
+        UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
+        for (states->second=states->first+1; states->second<numStates; states->second++) {
+            UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
+            bool rowsMatch = true;
+            int32_t numCols = firstRow->length();
+            for (int32_t col=0; col < numCols; ++col) {
+                int32_t firstVal = firstRow->charAt(col);
+                int32_t duplVal = duplRow->charAt(col);
+                if (!((firstVal == duplVal) ||
+                        ((firstVal == states->first || firstVal == states->second) &&
+                        (duplVal  == states->first || duplVal  == states->second)))) {
+                    rowsMatch = false;
+                    break;
+                }
+            }
+            if (rowsMatch) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+
+void RBBITableBuilder::removeState(IntPair duplStates) {
+    const int32_t keepState = duplStates.first;
+    const int32_t duplState = duplStates.second;
+    U_ASSERT(keepState < duplState);
+    U_ASSERT(duplState < fDStates->size());
+
+    RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
+    fDStates->removeElementAt(duplState);
+    delete duplSD;
+
+    int32_t numStates = fDStates->size();
+    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+    for (int32_t state=0; state<numStates; ++state) {
+        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+        for (int32_t col=0; col<numCols; col++) {
+            int32_t existingVal = sd->fDtran->elementAti(col);
+            int32_t newVal = existingVal;
+            if (existingVal == duplState) {
+                newVal = keepState;
+            } else if (existingVal > duplState) {
+                newVal = existingVal - 1;
+            }
+            sd->fDtran->setElementAt(newVal, col);
+        }
+        if (sd->fAccepting == duplState) {
+            sd->fAccepting = keepState;
+        } else if (sd->fAccepting > duplState) {
+            sd->fAccepting--;
+        }
+        if (sd->fLookAhead == duplState) {
+            sd->fLookAhead = keepState;
+        } else if (sd->fLookAhead > duplState) {
+            sd->fLookAhead--;
+        }
+    }
+}
+
+void RBBITableBuilder::removeSafeState(IntPair duplStates) {
+    const int32_t keepState = duplStates.first;
+    const int32_t duplState = duplStates.second;
+    U_ASSERT(keepState < duplState);
+    U_ASSERT(duplState < fSafeTable->size());
+
+    fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
+                                              // and will auto-delete the removed element.
+    int32_t numStates = fSafeTable->size();
+    for (int32_t state=0; state<numStates; ++state) {
+        UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
+        int32_t numCols = sd->length();
+        for (int32_t col=0; col<numCols; col++) {
+            int32_t existingVal = sd->charAt(col);
+            int32_t newVal = existingVal;
+            if (existingVal == duplState) {
+                newVal = keepState;
+            } else if (existingVal > duplState) {
+                newVal = existingVal - 1;
+            }
+            sd->setCharAt(col, newVal);
+        }
+    }
+}
+
+
+/*
+ * RemoveDuplicateStates
+ */
+void RBBITableBuilder::removeDuplicateStates() {
+    IntPair dupls = {3, 0};
+    while (findDuplicateState(&dupls)) {
+        // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
+        removeState(dupls);
+    }
+}
 
 
 //-----------------------------------------------------------------------------
@@ -1095,21 +1270,17 @@ int32_t  RBBITableBuilder::getTableSize() const {
         return 0;
     }
 
-    size    = sizeof(RBBIStateTable) - 4;    // The header, with no rows to the table.
+    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
 
     numRows = fDStates->size();
     numCols = fRB->fSetBuilder->getNumCharCategories();
 
-    //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
-    //        Therefore we subtract two from numCols when determining
-    //        how much storage to add to a row for the total columns.
-    rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
+    rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
     size   += numRows * rowSize;
     return size;
 }
 
 
-
 //-----------------------------------------------------------------------------
 //
 //   exportTable()    export the state transition table in the format required
@@ -1126,14 +1297,14 @@ void RBBITableBuilder::exportTable(void *where) {
         return;
     }
 
-    if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
+    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
+    if (catCount > 0x7fff ||
         fDStates->size() > 0x7fff) {
         *fStatus = U_BRK_INTERNAL_ERROR;
         return;
     }
 
-    table->fRowLen    = sizeof(RBBIStateTableRow) +
-                            sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
+    table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
     table->fNumStates = fDStates->size();
     table->fFlags     = 0;
     if (fRB->fLookAheadHardBreak) {
@@ -1152,13 +1323,192 @@ void RBBITableBuilder::exportTable(void *where) {
         row->fAccepting = (int16_t)sd->fAccepting;
         row->fLookAhead = (int16_t)sd->fLookAhead;
         row->fTagIdx    = (int16_t)sd->fTagsIdx;
-        for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
+        for (col=0; col<catCount; col++) {
             row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
         }
     }
 }
 
 
+/**
+ *   Synthesize a safe state table from the main state table.
+ */
+void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
+    // The safe table creation has three steps:
+
+    // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
+    // following the pair do not depend on context or state before the pair. To test
+    // whether a pair is safe, run it through the main forward state table, starting
+    // from each state. If the the final state is the same, no matter what the starting state,
+    // the pair is safe.
+    //
+    // 2. Build a state table that recognizes the safe pairs. It's similar to their
+    // forward table, with a column for each input character [class], and a row for
+    // each state. Row 1 is the start state, and row 0 is the stop state. Initially
+    // create an additional state for each input character category; being in
+    // one of these states means that the character has been seen, and is potentially
+    // the first of a pair. In each of these rows, the entry for the second character
+    // of a safe pair is set to the stop state (0), indicating that a match was found.
+    // All other table entries are set to the state corresponding the current input
+    // character, allowing that charcter to be the of a start following pair.
+    //
+    // Because the safe rules are to be run in reverse, moving backwards in the text,
+    // the first and second pair categories are swapped when building the table.
+    //
+    // 3. Compress the table. There are typically many rows (states) that are
+    // equivalent - that have zeroes (match completed) in the same columns -
+    // and can be folded together.
+
+    // Each safe pair is stored as two UChars in the safePair string.
+    UnicodeString safePairs;
+
+    int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
+    int32_t numStates = fDStates->size();
+
+    for (int32_t c1=0; c1<numCharClasses; ++c1) {
+        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
+            int32_t wantedEndState = -1;
+            int32_t endState = 0;
+            for (int32_t startState = 1; startState < numStates; ++startState) {
+                RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
+                int32_t s2 = startStateD->fDtran->elementAti(c1);
+                RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
+                endState = s2StateD->fDtran->elementAti(c2);
+                if (wantedEndState < 0) {
+                    wantedEndState = endState;
+                } else {
+                    if (wantedEndState != endState) {
+                        break;
+                    }
+                }
+            }
+            if (wantedEndState == endState) {
+                safePairs.append((char16_t)c1);
+                safePairs.append((char16_t)c2);
+                // printf("(%d, %d) ", c1, c2);
+            }
+        }
+        // printf("\n");
+    }
+
+    // Populate the initial safe table.
+    // The table as a whole is UVector<UnicodeString>
+    // Each row is represented by a UnicodeString, being used as a Vector<int16>.
+    // Row 0 is the stop state.
+    // Row 1 is the start sate.
+    // Row 2 and beyond are other states, initially one per char class, but
+    //   after initial construction, many of the states will be combined, compacting the table.
+    // The String holds the nextState data only. The four leading fields of a row, fAccepting,
+    // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
+
+    U_ASSERT(fSafeTable == nullptr);
+    fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
+    for (int32_t row=0; row<numCharClasses + 2; ++row) {
+        fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
+    }
+
+    // From the start state, each input char class transitions to the state for that input.
+    UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
+    for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
+        // Note: +2 for the start & stop state.
+        startState.setCharAt(charClass, charClass+2);
+    }
+
+    // Initially make every other state table row look like the start state row,
+    for (int32_t row=2; row<numCharClasses+2; ++row) {
+        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
+        rowState = startState;   // UnicodeString assignment, copies contents.
+    }
+
+    // Run through the safe pairs, set the next state to zero when pair has been seen.
+    // Zero being the stop state, meaning we found a safe point.
+    for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
+        int32_t c1 = safePairs.charAt(pairIdx);
+        int32_t c2 = safePairs.charAt(pairIdx + 1);
+
+        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
+        rowState.setCharAt(c1, 0);
+    }
+
+    // Remove duplicate or redundant rows from the table.
+    IntPair states = {1, 0};
+    while (findDuplicateSafeState(&states)) {
+        // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
+        removeSafeState(states);
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+//
+//   getSafeTableSize()    Calculate the size of the runtime form of this
+//                         safe state table.
+//
+//-----------------------------------------------------------------------------
+int32_t  RBBITableBuilder::getSafeTableSize() const {
+    int32_t    size = 0;
+    int32_t    numRows;
+    int32_t    numCols;
+    int32_t    rowSize;
+
+    if (fSafeTable == nullptr) {
+        return 0;
+    }
+
+    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
+
+    numRows = fSafeTable->size();
+    numCols = fRB->fSetBuilder->getNumCharCategories();
+
+    rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
+    size   += numRows * rowSize;
+    return size;
+}
+
+
+//-----------------------------------------------------------------------------
+//
+//   exportSafeTable()   export the state transition table in the format required
+//                       by the runtime engine.  getTableSize() bytes of memory
+//                       must be available at the output address "where".
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::exportSafeTable(void *where) {
+    RBBIStateTable    *table = (RBBIStateTable *)where;
+    uint32_t           state;
+    int                col;
+
+    if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
+        return;
+    }
+
+    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
+    if (catCount > 0x7fff ||
+            fSafeTable->size() > 0x7fff) {
+        *fStatus = U_BRK_INTERNAL_ERROR;
+        return;
+    }
+
+    table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
+    table->fNumStates = fSafeTable->size();
+    table->fFlags     = 0;
+    table->fReserved  = 0;
+
+    for (state=0; state<table->fNumStates; state++) {
+        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
+        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
+        row->fAccepting = 0;
+        row->fLookAhead = 0;
+        row->fTagIdx    = 0;
+        row->fReserved  = 0;
+        for (col=0; col<catCount; col++) {
+            row->fNextState[col] = rowString->charAt(col);
+        }
+    }
+}
+
+
+
 
 //-----------------------------------------------------------------------------
 //
@@ -1213,6 +1563,47 @@ void RBBITableBuilder::printStates() {
 #endif
 
 
+//-----------------------------------------------------------------------------
+//
+//   printSafeTable    Debug Function.  Dump the fully constructed safe table.
+//
+//-----------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
+void RBBITableBuilder::printReverseTable() {
+    int     c;    // input "character"
+    int     n;    // state number
+
+    RBBIDebugPrintf("    Safe Reverse Table \n");
+    if (fSafeTable == nullptr) {
+        RBBIDebugPrintf("   --- nullptr ---\n");
+        return;
+    }
+    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
+    RBBIDebugPrintf("      | Acc  LA    Tag");
+    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+        RBBIDebugPrintf(" %2d", c);
+    }
+    RBBIDebugPrintf("\n");
+    RBBIDebugPrintf("      |---------------");
+    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+        RBBIDebugPrintf("---");
+    }
+    RBBIDebugPrintf("\n");
+
+    for (n=0; n<fSafeTable->size(); n++) {
+        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
+        RBBIDebugPrintf("  %3d | " , n);
+        RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
+        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+            RBBIDebugPrintf(" %2d", rowString->charAt(c));
+        }
+        RBBIDebugPrintf("\n");
+    }
+    RBBIDebugPrintf("\n\n");
+}
+#endif
+
+
 
 //-----------------------------------------------------------------------------
 //
@@ -1259,7 +1650,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
     fPositions = NULL;
     fDtran     = NULL;
 
-    fDtran     = new UVector(lastInputSymbol+1, *fStatus);
+    fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
     if (U_FAILURE(*fStatus)) {
         return;
     }
@@ -1267,7 +1658,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
         *fStatus = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
-    fDtran->setSize(lastInputSymbol+1, *fStatus);    // fDtran needs to be pre-sized.
+    fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
                                            //   It is indexed by input symbols, and will
                                            //   hold  the next state number for each
                                            //   symbol.