+/**
+ * Synthesize a safe state table from the main state table.
+ */
+void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
+ // The safe table creation has three steps:
+
+ // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
+ // following the pair do not depend on context or state before the pair. To test
+ // whether a pair is safe, run it through the main forward state table, starting
+ // from each state. If the the final state is the same, no matter what the starting state,
+ // the pair is safe.
+ //
+ // 2. Build a state table that recognizes the safe pairs. It's similar to their
+ // forward table, with a column for each input character [class], and a row for
+ // each state. Row 1 is the start state, and row 0 is the stop state. Initially
+ // create an additional state for each input character category; being in
+ // one of these states means that the character has been seen, and is potentially
+ // the first of a pair. In each of these rows, the entry for the second character
+ // of a safe pair is set to the stop state (0), indicating that a match was found.
+ // All other table entries are set to the state corresponding the current input
+ // character, allowing that charcter to be the of a start following pair.
+ //
+ // Because the safe rules are to be run in reverse, moving backwards in the text,
+ // the first and second pair categories are swapped when building the table.
+ //
+ // 3. Compress the table. There are typically many rows (states) that are
+ // equivalent - that have zeroes (match completed) in the same columns -
+ // and can be folded together.
+
+ // Each safe pair is stored as two UChars in the safePair string.
+ UnicodeString safePairs;
+
+ int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
+ int32_t numStates = fDStates->size();
+
+ for (int32_t c1=0; c1<numCharClasses; ++c1) {
+ for (int32_t c2=0; c2 < numCharClasses; ++c2) {
+ int32_t wantedEndState = -1;
+ int32_t endState = 0;
+ for (int32_t startState = 1; startState < numStates; ++startState) {
+ RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
+ int32_t s2 = startStateD->fDtran->elementAti(c1);
+ RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
+ endState = s2StateD->fDtran->elementAti(c2);
+ if (wantedEndState < 0) {
+ wantedEndState = endState;
+ } else {
+ if (wantedEndState != endState) {
+ break;
+ }
+ }
+ }
+ if (wantedEndState == endState) {
+ safePairs.append((char16_t)c1);
+ safePairs.append((char16_t)c2);
+ // printf("(%d, %d) ", c1, c2);
+ }
+ }
+ // printf("\n");
+ }
+
+ // Populate the initial safe table.
+ // The table as a whole is UVector<UnicodeString>
+ // Each row is represented by a UnicodeString, being used as a Vector<int16>.
+ // Row 0 is the stop state.
+ // Row 1 is the start sate.
+ // Row 2 and beyond are other states, initially one per char class, but
+ // after initial construction, many of the states will be combined, compacting the table.
+ // The String holds the nextState data only. The four leading fields of a row, fAccepting,
+ // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
+
+ U_ASSERT(fSafeTable == nullptr);
+ fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
+ for (int32_t row=0; row<numCharClasses + 2; ++row) {
+ fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
+ }
+
+ // From the start state, each input char class transitions to the state for that input.
+ UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
+ for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
+ // Note: +2 for the start & stop state.
+ startState.setCharAt(charClass, charClass+2);
+ }
+
+ // Initially make every other state table row look like the start state row,
+ for (int32_t row=2; row<numCharClasses+2; ++row) {
+ UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
+ rowState = startState; // UnicodeString assignment, copies contents.
+ }
+
+ // Run through the safe pairs, set the next state to zero when pair has been seen.
+ // Zero being the stop state, meaning we found a safe point.
+ for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
+ int32_t c1 = safePairs.charAt(pairIdx);
+ int32_t c2 = safePairs.charAt(pairIdx + 1);
+
+ UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
+ rowState.setCharAt(c1, 0);
+ }
+
+ // Remove duplicate or redundant rows from the table.
+ IntPair states = {1, 0};
+ while (findDuplicateSafeState(&states)) {
+ // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
+ removeSafeState(states);
+ }
+}
+
+
+//-----------------------------------------------------------------------------
+//
+// getSafeTableSize() Calculate the size of the runtime form of this
+// safe state table.
+//
+//-----------------------------------------------------------------------------
+int32_t RBBITableBuilder::getSafeTableSize() const {
+ int32_t size = 0;
+ int32_t numRows;
+ int32_t numCols;
+ int32_t rowSize;
+
+ if (fSafeTable == nullptr) {
+ return 0;
+ }
+
+ size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
+
+ numRows = fSafeTable->size();
+ numCols = fRB->fSetBuilder->getNumCharCategories();
+
+ rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
+ size += numRows * rowSize;
+ return size;
+}
+
+
+//-----------------------------------------------------------------------------
+//
+// exportSafeTable() export the state transition table in the format required
+// by the runtime engine. getTableSize() bytes of memory
+// must be available at the output address "where".
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::exportSafeTable(void *where) {
+ RBBIStateTable *table = (RBBIStateTable *)where;
+ uint32_t state;
+ int col;
+
+ if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
+ return;
+ }
+
+ int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
+ if (catCount > 0x7fff ||
+ fSafeTable->size() > 0x7fff) {
+ *fStatus = U_BRK_INTERNAL_ERROR;
+ return;
+ }
+
+ table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
+ table->fNumStates = fSafeTable->size();
+ table->fFlags = 0;
+ table->fReserved = 0;
+
+ for (state=0; state<table->fNumStates; state++) {
+ UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
+ RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
+ row->fAccepting = 0;
+ row->fLookAhead = 0;
+ row->fTagIdx = 0;
+ row->fReserved = 0;
+ for (col=0; col<catCount; col++) {
+ row->fNextState[col] = rowString->charAt(col);
+ }
+ }
+}
+
+
+