#include "rbbidata.h"
#include "cstring.h"
#include "uassert.h"
+#include "uvectr32.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
-RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
- fTree(*rootNode) {
- fRB = rb;
- fStatus = fRB->fStatus;
- UErrorCode status = U_ZERO_ERROR;
- fDStates = new UVector(status);
- if (U_FAILURE(*fStatus)) {
- return;
- }
+RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
+ fRB(rb),
+ fTree(*rootNode),
+ fStatus(&status),
+ fDStates(nullptr),
+ fSafeTable(nullptr) {
if (U_FAILURE(status)) {
- *fStatus = status;
return;
}
- if (fDStates == NULL) {
- *fStatus = U_MEMORY_ALLOCATION_ERROR;;
+ // fDStates is UVector<RBBIStateDescriptor *>
+ fDStates = new UVector(status);
+ if (U_SUCCESS(status) && fDStates == nullptr ) {
+ status = U_MEMORY_ALLOCATION_ERROR;
}
}
for (i=0; i<fDStates->size(); i++) {
delete (RBBIStateDescriptor *)fDStates->elementAt(i);
}
- delete fDStates;
+ delete fDStates;
+ delete fSafeTable;
}
//-----------------------------------------------------------------------------
//
-// RBBITableBuilder::build - This is the main function for building the DFA state transtion
-// table from the RBBI rules parse tree.
+// RBBITableBuilder::buildForwardTable - This is the main function for building
+// the DFA state transition table from the RBBI rules parse tree.
//
//-----------------------------------------------------------------------------
-void RBBITableBuilder::build() {
+void RBBITableBuilder::buildForwardTable() {
if (U_FAILURE(*fStatus)) {
return;
// for all tables. Merge the ones from this table into the global set.
//
mergeRuleStatusVals();
-
- if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
}
// if sd->fAccepting already had a value other than 0 or -1, leave it be.
// If the end marker node is from a look-ahead rule, set
- // the fLookAhead field or this state also.
+ // the fLookAhead field for this state also.
if (endMarker->fLookAheadEnd) {
// TODO: don't change value if already set?
// TODO: allow for more than one active look-ahead rule in engine.
}
#endif
+//
+// findDuplCharClassFrom()
+//
+bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
+ int32_t numStates = fDStates->size();
+ int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+
+ uint16_t table_base;
+ uint16_t table_dupl;
+ for (; categories->first < numCols-1; categories->first++) {
+ for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
+ for (int32_t state=0; state<numStates; state++) {
+ RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+ table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
+ table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
+ if (table_base != table_dupl) {
+ break;
+ }
+ }
+ if (table_base == table_dupl) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+//
+// removeColumn()
+//
+void RBBITableBuilder::removeColumn(int32_t column) {
+ int32_t numStates = fDStates->size();
+ for (int32_t state=0; state<numStates; state++) {
+ RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+ U_ASSERT(column < sd->fDtran->size());
+ sd->fDtran->removeElementAt(column);
+ }
+}
+
+/*
+ * findDuplicateState
+ */
+bool RBBITableBuilder::findDuplicateState(IntPair *states) {
+ int32_t numStates = fDStates->size();
+ int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+
+ for (; states->first<numStates-1; states->first++) {
+ RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
+ for (states->second=states->first+1; states->second<numStates; states->second++) {
+ RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
+ if (firstSD->fAccepting != duplSD->fAccepting ||
+ firstSD->fLookAhead != duplSD->fLookAhead ||
+ firstSD->fTagsIdx != duplSD->fTagsIdx) {
+ continue;
+ }
+ bool rowsMatch = true;
+ for (int32_t col=0; col < numCols; ++col) {
+ int32_t firstVal = firstSD->fDtran->elementAti(col);
+ int32_t duplVal = duplSD->fDtran->elementAti(col);
+ if (!((firstVal == duplVal) ||
+ ((firstVal == states->first || firstVal == states->second) &&
+ (duplVal == states->first || duplVal == states->second)))) {
+ rowsMatch = false;
+ break;
+ }
+ }
+ if (rowsMatch) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
+ int32_t numStates = fSafeTable->size();
+
+ for (; states->first<numStates-1; states->first++) {
+ UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
+ for (states->second=states->first+1; states->second<numStates; states->second++) {
+ UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
+ bool rowsMatch = true;
+ int32_t numCols = firstRow->length();
+ for (int32_t col=0; col < numCols; ++col) {
+ int32_t firstVal = firstRow->charAt(col);
+ int32_t duplVal = duplRow->charAt(col);
+ if (!((firstVal == duplVal) ||
+ ((firstVal == states->first || firstVal == states->second) &&
+ (duplVal == states->first || duplVal == states->second)))) {
+ rowsMatch = false;
+ break;
+ }
+ }
+ if (rowsMatch) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+void RBBITableBuilder::removeState(IntPair duplStates) {
+ const int32_t keepState = duplStates.first;
+ const int32_t duplState = duplStates.second;
+ U_ASSERT(keepState < duplState);
+ U_ASSERT(duplState < fDStates->size());
+
+ RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
+ fDStates->removeElementAt(duplState);
+ delete duplSD;
+
+ int32_t numStates = fDStates->size();
+ int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
+ for (int32_t state=0; state<numStates; ++state) {
+ RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
+ for (int32_t col=0; col<numCols; col++) {
+ int32_t existingVal = sd->fDtran->elementAti(col);
+ int32_t newVal = existingVal;
+ if (existingVal == duplState) {
+ newVal = keepState;
+ } else if (existingVal > duplState) {
+ newVal = existingVal - 1;
+ }
+ sd->fDtran->setElementAt(newVal, col);
+ }
+ if (sd->fAccepting == duplState) {
+ sd->fAccepting = keepState;
+ } else if (sd->fAccepting > duplState) {
+ sd->fAccepting--;
+ }
+ if (sd->fLookAhead == duplState) {
+ sd->fLookAhead = keepState;
+ } else if (sd->fLookAhead > duplState) {
+ sd->fLookAhead--;
+ }
+ }
+}
+
+void RBBITableBuilder::removeSafeState(IntPair duplStates) {
+ const int32_t keepState = duplStates.first;
+ const int32_t duplState = duplStates.second;
+ U_ASSERT(keepState < duplState);
+ U_ASSERT(duplState < fSafeTable->size());
+
+ fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
+ // and will auto-delete the removed element.
+ int32_t numStates = fSafeTable->size();
+ for (int32_t state=0; state<numStates; ++state) {
+ UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
+ int32_t numCols = sd->length();
+ for (int32_t col=0; col<numCols; col++) {
+ int32_t existingVal = sd->charAt(col);
+ int32_t newVal = existingVal;
+ if (existingVal == duplState) {
+ newVal = keepState;
+ } else if (existingVal > duplState) {
+ newVal = existingVal - 1;
+ }
+ sd->setCharAt(col, newVal);
+ }
+ }
+}
+
+
+/*
+ * RemoveDuplicateStates
+ */
+void RBBITableBuilder::removeDuplicateStates() {
+ IntPair dupls = {3, 0};
+ while (findDuplicateState(&dupls)) {
+ // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
+ removeState(dupls);
+ }
+}
//-----------------------------------------------------------------------------
return 0;
}
- size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table.
+ size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
numRows = fDStates->size();
numCols = fRB->fSetBuilder->getNumCharCategories();
- // Note The declaration of RBBIStateTableRow is for a table of two columns.
- // Therefore we subtract two from numCols when determining
- // how much storage to add to a row for the total columns.
- rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
+ rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
size += numRows * rowSize;
return size;
}
-
//-----------------------------------------------------------------------------
//
// exportTable() export the state transition table in the format required
return;
}
- if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
+ int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
+ if (catCount > 0x7fff ||
fDStates->size() > 0x7fff) {
*fStatus = U_BRK_INTERNAL_ERROR;
return;
}
- table->fRowLen = sizeof(RBBIStateTableRow) +
- sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
+ table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
table->fNumStates = fDStates->size();
table->fFlags = 0;
if (fRB->fLookAheadHardBreak) {
row->fAccepting = (int16_t)sd->fAccepting;
row->fLookAhead = (int16_t)sd->fLookAhead;
row->fTagIdx = (int16_t)sd->fTagsIdx;
- for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
+ for (col=0; col<catCount; col++) {
row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
}
}
}
+/**
+ * Synthesize a safe state table from the main state table.
+ */
+void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
+ // The safe table creation has three steps:
+
+ // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
+ // following the pair do not depend on context or state before the pair. To test
+ // whether a pair is safe, run it through the main forward state table, starting
+ // from each state. If the the final state is the same, no matter what the starting state,
+ // the pair is safe.
+ //
+ // 2. Build a state table that recognizes the safe pairs. It's similar to their
+ // forward table, with a column for each input character [class], and a row for
+ // each state. Row 1 is the start state, and row 0 is the stop state. Initially
+ // create an additional state for each input character category; being in
+ // one of these states means that the character has been seen, and is potentially
+ // the first of a pair. In each of these rows, the entry for the second character
+ // of a safe pair is set to the stop state (0), indicating that a match was found.
+ // All other table entries are set to the state corresponding the current input
+ // character, allowing that charcter to be the of a start following pair.
+ //
+ // Because the safe rules are to be run in reverse, moving backwards in the text,
+ // the first and second pair categories are swapped when building the table.
+ //
+ // 3. Compress the table. There are typically many rows (states) that are
+ // equivalent - that have zeroes (match completed) in the same columns -
+ // and can be folded together.
+
+ // Each safe pair is stored as two UChars in the safePair string.
+ UnicodeString safePairs;
+
+ int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
+ int32_t numStates = fDStates->size();
+
+ for (int32_t c1=0; c1<numCharClasses; ++c1) {
+ for (int32_t c2=0; c2 < numCharClasses; ++c2) {
+ int32_t wantedEndState = -1;
+ int32_t endState = 0;
+ for (int32_t startState = 1; startState < numStates; ++startState) {
+ RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
+ int32_t s2 = startStateD->fDtran->elementAti(c1);
+ RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
+ endState = s2StateD->fDtran->elementAti(c2);
+ if (wantedEndState < 0) {
+ wantedEndState = endState;
+ } else {
+ if (wantedEndState != endState) {
+ break;
+ }
+ }
+ }
+ if (wantedEndState == endState) {
+ safePairs.append((char16_t)c1);
+ safePairs.append((char16_t)c2);
+ // printf("(%d, %d) ", c1, c2);
+ }
+ }
+ // printf("\n");
+ }
+
+ // Populate the initial safe table.
+ // The table as a whole is UVector<UnicodeString>
+ // Each row is represented by a UnicodeString, being used as a Vector<int16>.
+ // Row 0 is the stop state.
+ // Row 1 is the start sate.
+ // Row 2 and beyond are other states, initially one per char class, but
+ // after initial construction, many of the states will be combined, compacting the table.
+ // The String holds the nextState data only. The four leading fields of a row, fAccepting,
+ // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
+
+ U_ASSERT(fSafeTable == nullptr);
+ fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
+ for (int32_t row=0; row<numCharClasses + 2; ++row) {
+ fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
+ }
+
+ // From the start state, each input char class transitions to the state for that input.
+ UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
+ for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
+ // Note: +2 for the start & stop state.
+ startState.setCharAt(charClass, charClass+2);
+ }
+
+ // Initially make every other state table row look like the start state row,
+ for (int32_t row=2; row<numCharClasses+2; ++row) {
+ UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
+ rowState = startState; // UnicodeString assignment, copies contents.
+ }
+
+ // Run through the safe pairs, set the next state to zero when pair has been seen.
+ // Zero being the stop state, meaning we found a safe point.
+ for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
+ int32_t c1 = safePairs.charAt(pairIdx);
+ int32_t c2 = safePairs.charAt(pairIdx + 1);
+
+ UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
+ rowState.setCharAt(c1, 0);
+ }
+
+ // Remove duplicate or redundant rows from the table.
+ IntPair states = {1, 0};
+ while (findDuplicateSafeState(&states)) {
+ // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
+ removeSafeState(states);
+ }
+}
+
+
+//-----------------------------------------------------------------------------
+//
+// getSafeTableSize() Calculate the size of the runtime form of this
+// safe state table.
+//
+//-----------------------------------------------------------------------------
+int32_t RBBITableBuilder::getSafeTableSize() const {
+ int32_t size = 0;
+ int32_t numRows;
+ int32_t numCols;
+ int32_t rowSize;
+
+ if (fSafeTable == nullptr) {
+ return 0;
+ }
+
+ size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
+
+ numRows = fSafeTable->size();
+ numCols = fRB->fSetBuilder->getNumCharCategories();
+
+ rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
+ size += numRows * rowSize;
+ return size;
+}
+
+
+//-----------------------------------------------------------------------------
+//
+// exportSafeTable() export the state transition table in the format required
+// by the runtime engine. getTableSize() bytes of memory
+// must be available at the output address "where".
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::exportSafeTable(void *where) {
+ RBBIStateTable *table = (RBBIStateTable *)where;
+ uint32_t state;
+ int col;
+
+ if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
+ return;
+ }
+
+ int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
+ if (catCount > 0x7fff ||
+ fSafeTable->size() > 0x7fff) {
+ *fStatus = U_BRK_INTERNAL_ERROR;
+ return;
+ }
+
+ table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
+ table->fNumStates = fSafeTable->size();
+ table->fFlags = 0;
+ table->fReserved = 0;
+
+ for (state=0; state<table->fNumStates; state++) {
+ UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
+ RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
+ row->fAccepting = 0;
+ row->fLookAhead = 0;
+ row->fTagIdx = 0;
+ row->fReserved = 0;
+ for (col=0; col<catCount; col++) {
+ row->fNextState[col] = rowString->charAt(col);
+ }
+ }
+}
+
+
+
//-----------------------------------------------------------------------------
//
#endif
+//-----------------------------------------------------------------------------
+//
+// printSafeTable Debug Function. Dump the fully constructed safe table.
+//
+//-----------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
+void RBBITableBuilder::printReverseTable() {
+ int c; // input "character"
+ int n; // state number
+
+ RBBIDebugPrintf(" Safe Reverse Table \n");
+ if (fSafeTable == nullptr) {
+ RBBIDebugPrintf(" --- nullptr ---\n");
+ return;
+ }
+ RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
+ RBBIDebugPrintf(" | Acc LA Tag");
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+ RBBIDebugPrintf(" %2d", c);
+ }
+ RBBIDebugPrintf("\n");
+ RBBIDebugPrintf(" |---------------");
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+ RBBIDebugPrintf("---");
+ }
+ RBBIDebugPrintf("\n");
+
+ for (n=0; n<fSafeTable->size(); n++) {
+ UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
+ RBBIDebugPrintf(" %3d | " , n);
+ RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+ RBBIDebugPrintf(" %2d", rowString->charAt(c));
+ }
+ RBBIDebugPrintf("\n");
+ }
+ RBBIDebugPrintf("\n\n");
+}
+#endif
+
+
//-----------------------------------------------------------------------------
//
fPositions = NULL;
fDtran = NULL;
- fDtran = new UVector(lastInputSymbol+1, *fStatus);
+ fDtran = new UVector32(lastInputSymbol+1, *fStatus);
if (U_FAILURE(*fStatus)) {
return;
}
*fStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
- fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized.
+ fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
// It is indexed by input symbols, and will
// hold the next state number for each
// symbol.