/*
**********************************************************************
-* Copyright (c) 2002-2003, International Business Machines
+* Copyright (c) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
fTree(*rootNode) {
- fRB = rb;
- fStatus = fRB->fStatus;
- fDStates = new UVector(*fStatus);
+ fRB = rb;
+ fStatus = fRB->fStatus;
+ // Construct the state vector against a local, initially clean error code
+ // rather than the rule builder's shared status.
+ UErrorCode status = U_ZERO_ERROR;
+ fDStates = new UVector(status);
+ // A failure already recorded in the shared status takes precedence;
+ // leave it untouched.
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
+ // Otherwise report any error raised while constructing the vector.
+ if (U_FAILURE(status)) {
+ *fStatus = status;
+ return;
+ }
+ // Allocation of the vector object itself failed.
+ if (fDStates == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ }
}
//
fTree = fTree->flattenVariables();
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
- RBBIDebugPrintf("Parse tree after flattening variable references.\n");
+ RBBIDebugPuts("Parse tree after flattening variable references.");
fTree->printTree(TRUE);
}
//
fTree->flattenSets();
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
- RBBIDebugPrintf("Parse tree after flattening Unicode Set references.\n");
+ RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
fTree->printTree(TRUE);
}
calcLastPos(fTree);
calcFollowPos(fTree);
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
- RBBIDebugPrintf("\n\n");
+ RBBIDebugPuts("\n");
printPosSets(fTree);
}
+ //
+ // For "chained" rules, modify the followPos sets
+ //
+ if (fRB->fChainRules) {
+ calcChainedFollowPos(fTree);
+ }
+
//
// Build the DFA state transition tables.
//
flagAcceptingStates();
flagLookAheadStates();
flagTaggedStates();
- if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
+ //
+ // Update the global table of rule status {tag} values
+ // The rule builder has a global vector of status values that are common
+ // for all tables. Merge the ones from this table into the global set.
+ //
+ mergeRuleStatusVals();
+
+ if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
}
}
+//-----------------------------------------------------------------------------
+//
+// calcChainedFollowPos. Modify the previously calculated followPos sets
+// to implement rule chaining. NOT described by Aho
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
+
+    UVector endMarks(*fStatus);
+    UVector leaves(*fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // Gather every end-marker node and every leaf node of the tree.
+    tree->findNodes(&endMarks, RBBINode::endMark, *fStatus);
+    tree->findNodes(&leaves, RBBINode::leafChar, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // The nodes that can begin a match are FirstPosition(root).
+    UVector *firstPosOfRoot = tree->fFirstPosSet;
+
+    // Visit each leaf; only leaves that can end a match are chained.
+    int32_t leafIx;
+    for (leafIx = 0; leafIx < leaves.size(); leafIx++) {
+        RBBINode *leaf = (RBBINode *)leaves.elementAt(leafIx);
+
+        // A leaf corresponds to an overall rule match position when its
+        // followPos set includes one of the end-marker nodes.
+        UBool endsMatch = FALSE;
+        int32_t markIx;
+        for (markIx = 0; markIx < endMarks.size(); markIx++) {
+            if (leaf->fFollowPos->contains(endMarks.elementAt(markIx))) {
+                endsMatch = TRUE;
+                break;
+            }
+        }
+        if (!endsMatch) {
+            // Not a match-ending node; move on to the next leaf.
+            continue;
+        }
+
+        // Line Break specific hack: do not chain from a node whose value
+        //   corresponds to the $CM (combining mark) character class.
+        // TODO: Add rule syntax for this behavior, get specifics out of here
+        //       and into the rule file.
+        if (fRB->fLBCMNoChain) {
+            UChar32 c = this->fRB->fSetBuilder->getFirstChar(leaf->fVal);
+            U_ASSERT(c != -1);
+            ULineBreak lineBreakClass = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
+            if (lineBreakClass == U_LB_COMBINING_MARK) {
+                continue;
+            }
+        }
+
+        // Look through the nodes that can start a match for leaf nodes with
+        // the same value (character class) as this match-ending leaf.
+        int32_t startIx;
+        for (startIx = 0; startIx < firstPosOfRoot->size(); startIx++) {
+            RBBINode *candidate = (RBBINode *)firstPosOfRoot->elementAt(startIx);
+            if (candidate->fType == RBBINode::leafChar && leaf->fVal == candidate->fVal) {
+                // The class that ends one match also starts another.
+                // Merging the start node's followPos into the end node's
+                // followPos lets a match state at the end node continue
+                // into the second character of the match that begins
+                // at the start node.
+                setAdd(leaf->fFollowPos, candidate->fFollowPos);
+            }
+        }
+    }
+}
+
+
//-----------------------------------------------------------------------------
//
// buildStateTable() Determine the set of runtime DFA states and the
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::buildStateTable() {
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
//
// Add a dummy state 0 - the stop state. Not from Aho.
int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
failState->fPositions = new UVector(*fStatus);
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
fDStates->addElement(failState, *fStatus);
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
// initially, the only unmarked state in Dstates is firstpos(root),
// where toot is the root of the syntax tree for (r)#;
RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
initialState->fPositions = new UVector(*fStatus);
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
setAdd(initialState->fPositions, fTree->fFirstPosSet);
fDStates->addElement(initialState, *fStatus);
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
// while there is an unmarked state T in Dstates do begin
for (;;) {
if (!UinDstates)
{
RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
newState->fPositions = U;
fDStates->addElement(newState, *fStatus);
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
ux = fDStates->size()-1;
}
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::flagAcceptingStates() {
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
UVector endMarkerNodes(*fStatus);
RBBINode *endMarker;
int32_t i;
int32_t n;
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
+
fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
for (i=0; i<endMarkerNodes.size(); i++) {
endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::flagLookAheadStates() {
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
UVector lookAheadNodes(*fStatus);
RBBINode *lookAheadNode;
int32_t i;
int32_t n;
fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
for (i=0; i<lookAheadNodes.size(); i++) {
lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::flagTaggedStates() {
+ if (U_FAILURE(*fStatus)) { // Do nothing when an earlier step has already failed.
+ return;
+ }
UVector tagNodes(*fStatus);
RBBINode *tagNode;
int32_t i;
int32_t n;
+ if (U_FAILURE(*fStatus)) { // Constructing tagNodes reported an error.
+ return;
+ }
fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
+ if (U_FAILURE(*fStatus)) { // Collecting the tag nodes reported an error.
+ return;
+ }
for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
tagNode = (RBBINode *)tagNodes.elementAt(i);
-
+
for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
- if (sd->fTagVal < tagNode->fVal) {
- // If more than one rule tag applies to this state, the larger
- // tag takes precedence.
- sd->fTagVal = tagNode->fVal;
+ sortedAdd(&sd->fTagVals, tagNode->fVal); // Record this tag value in the state's tag list (sortedAdd creates the list on first use).
}
}
}
}
+
+
+
+
+//-----------------------------------------------------------------------------
+//
+// mergeRuleStatusVals
+//
+// Update the global table of rule status {tag} values
+// The rule builder has a global vector of status values that are common
+// for all tables. Merge the ones from this table into the global set.
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::mergeRuleStatusVals() {
+    //
+    //  The basic outline of what happens here is this...
+    //
+    //    for each state in this state table
+    //       if the status tag list for this state is in the global statuses list
+    //           record where and
+    //           continue with the next state
+    //       else
+    //           add the tag list for this state to the global list.
+    //
+    //  The global list (fRB->fRuleStatusVals) is a flat sequence of groups,
+    //  each stored as a count followed by that many tag values.
+    //
+    int i;
+    int n;
+
+    // If an earlier build step failed, the state vector may be missing
+    // (the constructor reports a NULL fDStates through *fStatus) or only
+    // partly built.  Leave the global list untouched, as the other build
+    // steps of this class do.
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // Pre-set a single tag of {0} into the table.
+    //   We will need this as a default, for rule sets with no explicit tagging.
+    if (fRB->fRuleStatusVals->size() == 0) {
+        fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
+        fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
+    }
+
+    //    For each state
+    for (n=0; n<fDStates->size(); n++) {
+        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
+        UVector *thisStatesTagValues = sd->fTagVals;
+        if (thisStatesTagValues == NULL) {
+            // No tag values are explicitly associated with this state.
+            //   Set the default tag value.
+            sd->fTagsIdx = 0;
+            continue;
+        }
+
+        // There are tag(s) associated with this state.
+        //   fTagsIdx will be the index into the global tag list for this state's tag values.
+        //   Initial value of -1 flags that we haven't got it set yet.
+        sd->fTagsIdx = -1;
+        int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
+        int32_t  nextTagGroupStart = 0;
+
+        // Loop runs once per group of tags in the global list
+        while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
+            thisTagGroupStart = nextTagGroupStart;
+            // Skip over the count element plus the values it announces.
+            nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
+            if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
+                // The number of tags for this state is different from
+                //    the number of tags in this group from the global list.
+                //    Continue with the next group from the global list.
+                continue;
+            }
+            // The lengths match, go ahead and compare the actual tag values
+            //    between this state and the group from the global list.
+            for (i=0; i<thisStatesTagValues->size(); i++) {
+                if (thisStatesTagValues->elementAti(i) !=
+                    fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
+                    // Mismatch.
+                    break;
+                }
+            }
+
+            if (i == thisStatesTagValues->size()) {
+                // We found a set of tag values in the global list that match
+                //   those for this state.  Use them.
+                sd->fTagsIdx = thisTagGroupStart;
+                break;
+            }
+        }
+
+        if (sd->fTagsIdx == -1) {
+            // No suitable entry in the global tag list already.  Add one
+            sd->fTagsIdx = fRB->fRuleStatusVals->size();
+            fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
+            for (i=0; i<thisStatesTagValues->size(); i++) {
+                fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
+            }
+        }
+    }
+}
+
+
+
+
+
+
+
+//-----------------------------------------------------------------------------
+//
+// sortedAdd Add a value to a vector of sorted values (ints).
+// Do not replicate entries; if the value is already there, do not
+// add a second one.
+// Lazily create the vector if it does not already exist.
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
+    int32_t i;
+
+    // Lazily create the vector the first time a value is added.
+    if (*vector == NULL) {
+        *vector = new UVector(*fStatus);
+        if (*vector == NULL) {
+            // Allocation failed.  Report it instead of silently dropping
+            // the value, but do not overwrite an earlier error.
+            if (U_SUCCESS(*fStatus)) {
+                *fStatus = U_MEMORY_ALLOCATION_ERROR;
+            }
+            return;
+        }
+    }
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    // Scan for the insertion point: stop at the first element greater
+    // than val, or return early when val is already present.
+    UVector *vec = *vector;
+    int32_t  vSize = vec->size();
+    for (i=0; i<vSize; i++) {
+        int32_t valAtI = vec->elementAti(i);
+        if (valAtI == val) {
+            // The value is already in the vector.  Don't add it again.
+            return;
+        }
+        if (valAtI > val) {
+            break;
+        }
+    }
+    // i is now either the index of the first larger element or vSize
+    // (append at the end); inserting there keeps the values ascending.
+    vec->insertElementAt(val, i, *fStatus);
}
int sourceSize = source->size();
int32_t si, di;
- for (si=0; si<sourceSize; si++) {
+ for (si=0; si<sourceSize && U_SUCCESS(*fStatus); si++) {
void *elToAdd = source->elementAt(si);
for (di=0; di<destOriginalSize; di++) {
if (dest->elementAt(di) == elToAdd) {
}
}
dest->addElement(elToAdd, *fStatus);
- elementAlreadyInDest: ;
+ elementAlreadyInDest: ;
}
}
+
//-----------------------------------------------------------------------------
//
// setEqual Set operation on UVector.
// for each node in the tree.
//
//-----------------------------------------------------------------------------
-void RBBITableBuilder::printPosSets(RBBINode *n) {
#ifdef RBBI_DEBUG
+void RBBITableBuilder::printPosSets(RBBINode *n) {
if (n==NULL) {
return;
}
- n->print();
+ n->printNode();
RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
RBBIDebugPrintf(" firstpos: ");
printPosSets(n->fLeftChild);
printPosSets(n->fRightChild);
-#endif
}
+#endif
// state transition table.
//
//-----------------------------------------------------------------------------
-int32_t RBBITableBuilder::getTableSize() {
+int32_t RBBITableBuilder::getTableSize() const {
int32_t size = 0;
int32_t numRows;
int32_t numCols;
table->fRowLen = sizeof(RBBIStateTableRow) +
sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
table->fNumStates = fDStates->size();
+ table->fFlags = 0;
+ if (fRB->fLookAheadHardBreak) {
+ table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
+ }
+ table->fReserved = 0;
for (state=0; state<table->fNumStates; state++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
row->fAccepting = (int16_t)sd->fAccepting;
row->fLookAhead = (int16_t)sd->fLookAhead;
- row->fTag = (int16_t)sd->fTagVal;
+ row->fTagIdx = (int16_t)sd->fTagsIdx;
for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
}
// printSet Debug function. Print the contents of a UVector
//
//-----------------------------------------------------------------------------
-void RBBITableBuilder::printSet(UVector *s) {
#ifdef RBBI_DEBUG
+void RBBITableBuilder::printSet(UVector *s) { // Debug only: prints each element of s as a pointer value ("%10p"), then a newline.
int32_t i;
for (i=0; i<s->size(); i++) {
void *v = s->elementAt(i);
RBBIDebugPrintf("%10p", v);
}
RBBIDebugPrintf("\n");
-#endif
}
+#endif // RBBI_DEBUG -- the whole function is now compiled only in debug builds
//-----------------------------------------------------------------------------
// printStates Debug Function. Dump the fully constructed state transition table.
//
//-----------------------------------------------------------------------------
-void RBBITableBuilder::printStates() {
#ifdef RBBI_DEBUG
+void RBBITableBuilder::printStates() { // Debug only: prints one row per state in fDStates, with one column per character category.
int c; // input "character"
int n; // state number
RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
RBBIDebugPrintf(" | Acc LA Tag");
- for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {RBBIDebugPrintf(" %2d", c);};
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { // Column headings: the character category numbers.
+ RBBIDebugPrintf(" %2d", c);
+ }
RBBIDebugPrintf("\n");
RBBIDebugPrintf(" |---------------");
- for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {RBBIDebugPrintf("---");};
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { // Separator rule under each category column.
+ RBBIDebugPrintf("---");
+ }
RBBIDebugPrintf("\n");
for (n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
RBBIDebugPrintf(" %3d | " , n);
- RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagVal);
+ RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx); // The "Tag" column now shows fTagsIdx, not a single tag value.
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
}
RBBIDebugPrintf("\n");
}
RBBIDebugPrintf("\n\n");
-#endif
}
+#endif // RBBI_DEBUG -- the whole function is now compiled only in debug builds
+//-----------------------------------------------------------------------------
+//
+// printRuleStatusTable Debug Function. Dump the common rule status table
+//
+//-----------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
+void RBBITableBuilder::printRuleStatusTable() {
+    UVector *statusTable = fRB->fRuleStatusVals;
+
+    RBBIDebugPrintf("index | tags \n");
+    RBBIDebugPrintf("-------------------\n");
+
+    // The table is a sequence of records; each record is a count
+    // followed by that many tag values.  Print one line per record:
+    // the record's starting index, then its values.
+    int32_t recordStart = 0;
+    while (recordStart < statusTable->size()) {
+        int32_t recordLimit = recordStart + statusTable->elementAti(recordStart) + 1;
+        RBBIDebugPrintf("%4d ", recordStart);
+        int32_t pos;
+        for (pos = recordStart + 1; pos < recordLimit; pos++) {
+            RBBIDebugPrintf(" %5d", statusTable->elementAti(pos));
+        }
+        RBBIDebugPrintf("\n");
+        recordStart = recordLimit;
+    }
+    RBBIDebugPrintf("\n\n");
+}
+#endif
//-----------------------------------------------------------------------------
fMarked = FALSE;
fAccepting = 0;
fLookAhead = 0;
- fTagVal = 0;
+ fTagsIdx = 0;
+ fTagVals = NULL;
fPositions = NULL;
fDtran = NULL;
+
+ fDtran = new UVector(lastInputSymbol+1, *fStatus);
if (U_FAILURE(*fStatus)) {
return;
}
- fDtran = new UVector(lastInputSymbol+1, *fStatus);
if (fDtran == NULL) {
*fStatus = U_MEMORY_ALLOCATION_ERROR;
return;
RBBIStateDescriptor::~RBBIStateDescriptor() {
delete fPositions;
delete fDtran;
+ delete fTagVals; // Delete the tag-value vector along with the other two vectors.
fPositions = NULL;
fDtran = NULL;
+ fTagVals = NULL; // Clear the pointer after deletion, as is done for the other members.
}
U_NAMESPACE_END