-//
-// rbbitblb.cpp
-//
-
/*
**********************************************************************
-* Copyright (c) 2002-2004, International Business Machines
+* Copyright (c) 2002-2008, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
+//
+// rbbitblb.cpp
+//
+
#include "unicode/utypes.h"
#include "rbbidata.h"
#include "cstring.h"
#include "uassert.h"
+#include "cmemory.h"
U_NAMESPACE_BEGIN
// parse tree for the substition expression.
//
fTree = fTree->flattenVariables();
+#ifdef RBBI_DEBUG
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
RBBIDebugPuts("Parse tree after flattening variable references.");
fTree->printTree(TRUE);
}
+#endif
+
+ //
+ // If the rules contained any references to {bof}
+ // add a {bof} <cat> <former root of tree> to the
+ // tree. Means that all matches must start out with the
+ // {bof} fake character.
+ //
+ if (fRB->fSetBuilder->sawBOF()) {
+ RBBINode *bofTop = new RBBINode(RBBINode::opCat);
+ RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
+ // Delete and exit if memory allocation failed.
+ if (bofTop == NULL || bofLeaf == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ delete bofTop;
+ delete bofLeaf;
+ return;
+ }
+ bofTop->fLeftChild = bofLeaf;
+ bofTop->fRightChild = fTree;
+ bofLeaf->fParent = bofTop;
+ bofLeaf->fVal = 2; // Reserved value for {bof}.
+ fTree = bofTop;
+ }
//
// Add a unique right-end marker to the expression.
// right child being the end marker.
//
RBBINode *cn = new RBBINode(RBBINode::opCat);
+ // Exit if memory allocation failed.
+ if (cn == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
cn->fLeftChild = fTree;
fTree->fParent = cn;
cn->fRightChild = new RBBINode(RBBINode::endMark);
+ // Delete and exit if memory allocation failed.
+ if (cn->fRightChild == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ delete cn;
+ return;
+ }
cn->fRightChild->fParent = cn;
fTree = cn;
// expression.
//
fTree->flattenSets();
+#ifdef RBBI_DEBUG
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
fTree->printTree(TRUE);
}
+#endif
//
calcChainedFollowPos(fTree);
}
+ //
+ // BOF (start of input) test fixup.
+ //
+ if (fRB->fSetBuilder->sawBOF()) {
+ bofFixup();
+ }
+
//
// Build the DFA state transition tables.
//
n->fType == RBBINode::lookAhead ||
n->fType == RBBINode::tag) {
// These are non-empty leaf node types.
+ // Note: In order to maintain the sort invariant on the set,
+ // this function should only be called on a node whose set is
+ // empty to start with.
n->fFirstPosSet->addElement(n, *fStatus);
return;
}
n->fType == RBBINode::lookAhead ||
n->fType == RBBINode::tag) {
// These are non-empty leaf node types.
+ // Note: In order to maintain the sort invariant on the set,
+ // this function should only be called on a node whose set is
+ // empty to start with.
n->fLastPosSet->addElement(n, *fStatus);
return;
}
// get a list of all endmarker nodes.
tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
- // get a list all leaf nodes
+ // get a list all leaf nodes
tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
if (U_FAILURE(*fStatus)) {
return;
}
- // Get all nodes that can be the start a match, which is FirstPosition(root)
- UVector *matchStartNodes = tree->fFirstPosSet;
+ // Get all nodes that can be the start a match, which is FirstPosition()
+ // of the portion of the tree corresponding to user-written rules.
+ // See the tree description in bofFixup().
+ RBBINode *userRuleRoot = tree;
+ if (fRB->fSetBuilder->sawBOF()) {
+ userRuleRoot = tree->fLeftChild->fRightChild;
+ }
+ U_ASSERT(userRuleRoot != NULL);
+ UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
// Iteratate over all leaf nodes,
// into the rule file.
if (fRB->fLBCMNoChain) {
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
- U_ASSERT(c != -1);
- ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
- if (cLBProp == U_LB_COMBINING_MARK) {
- continue;
+ if (c != -1) {
+ // c == -1 occurs with sets containing only the {eof} marker string.
+ ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
+ if (cLBProp == U_LB_COMBINING_MARK) {
+ continue;
+ }
}
}
}
+//-----------------------------------------------------------------------------
+//
+// bofFixup. Fixup for state tables that include {bof} beginning of input testing.
+// Do an swizzle similar to chaining, modifying the followPos set of
+// the bofNode to include the followPos nodes from other {bot} nodes
+// scattered through the tree.
+//
+// This function has much in common with calcChainedFollowPos().
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::bofFixup() {
+
+ if (U_FAILURE(*fStatus)) {
+ return;
+ }
+
+ // The parse tree looks like this ...
+ // fTree root ---> <cat>
+ // / \ .
+ // <cat> <#end node>
+ // / \ .
+ // <bofNode> rest
+ // of tree
+ //
+ // We will be adding things to the followPos set of the <bofNode>
+ //
+ RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
+ U_ASSERT(bofNode->fType == RBBINode::leafChar);
+ U_ASSERT(bofNode->fVal == 2);
+
+ // Get all nodes that can be the start a match of the user-written rules
+ // (excluding the fake bofNode)
+ // We want the nodes that can start a match in the
+ // part labeled "rest of tree"
+ //
+ UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
+
+ RBBINode *startNode;
+ int startNodeIx;
+ for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
+ startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
+ if (startNode->fType != RBBINode::leafChar) {
+ continue;
+ }
+
+ if (startNode->fVal == bofNode->fVal) {
+ // We found a leaf node corresponding to a {bof} that was
+ // explicitly written into a rule.
+ // Add everything from the followPos set of this node to the
+ // followPos set of the fake bofNode at the start of the tree.
+ //
+ setAdd(bofNode->fFollowPos, startNode->fFollowPos);
+ }
+ }
+}
+
//-----------------------------------------------------------------------------
//
// buildStateTable() Determine the set of runtime DFA states and the
if (U_FAILURE(*fStatus)) {
return;
}
+ RBBIStateDescriptor *failState;
+ // Set it to NULL to avoid uninitialized warning
+ RBBIStateDescriptor *initialState = NULL;
//
// Add a dummy state 0 - the stop state. Not from Aho.
int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
- RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+ failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+ if (failState == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ goto ExitBuildSTdeleteall;
+ }
failState->fPositions = new UVector(*fStatus);
- if (U_FAILURE(*fStatus)) {
- return;
+ if (failState->fPositions == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ }
+ if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
+ goto ExitBuildSTdeleteall;
}
fDStates->addElement(failState, *fStatus);
if (U_FAILURE(*fStatus)) {
- return;
+ goto ExitBuildSTdeleteall;
}
// initially, the only unmarked state in Dstates is firstpos(root),
// where toot is the root of the syntax tree for (r)#;
- RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+ initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+ if (initialState == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ }
if (U_FAILURE(*fStatus)) {
- return;
+ goto ExitBuildSTdeleteall;
}
initialState->fPositions = new UVector(*fStatus);
+ if (initialState->fPositions == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ }
if (U_FAILURE(*fStatus)) {
- return;
+ goto ExitBuildSTdeleteall;
}
setAdd(initialState->fPositions, fTree->fFirstPosSet);
fDStates->addElement(initialState, *fStatus);
if (U_FAILURE(*fStatus)) {
- return;
+ goto ExitBuildSTdeleteall;
}
// while there is an unmarked state T in Dstates do begin
if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
if (U == NULL) {
U = new UVector(*fStatus);
+ if (U == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ goto ExitBuildSTdeleteall;
+ }
}
setAdd(U, p->fFollowPos);
}
if (!UinDstates)
{
RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+ if (newState == NULL) {
+ *fStatus = U_MEMORY_ALLOCATION_ERROR;
+ }
if (U_FAILURE(*fStatus)) {
- return;
+ goto ExitBuildSTdeleteall;
}
newState->fPositions = U;
fDStates->addElement(newState, *fStatus);
}
}
}
+ return;
+ // delete local pointers only if error occured.
+ExitBuildSTdeleteall:
+ delete initialState;
+ delete failState;
}
// Any non-zero value for fAccepting means this is an accepting node.
// The value is what will be returned to the user as the break status.
// If no other value was specified, force it to -1.
- sd->fAccepting = endMarker->fVal;
- if (sd->fAccepting == 0) {
- sd->fAccepting = -1;
+
+ if (sd->fAccepting==0) {
+ // State hasn't been marked as accepting yet. Do it now.
+ sd->fAccepting = endMarker->fVal;
+ if (sd->fAccepting == 0) {
+ sd->fAccepting = -1;
+ }
+ }
+ if (sd->fAccepting==-1 && endMarker->fVal != 0) {
+ // Both lookahead and non-lookahead accepting for this state.
+ // Favor the look-ahead. Expedient for line break.
+ // TODO: need a more elegant resolution for conflicting rules.
+ sd->fAccepting = endMarker->fVal;
}
+ // implicit else:
+ // if sd->fAccepting already had a value other than 0 or -1, leave it be.
// If the end marker node is from a look-ahead rule, set
// the fLookAhead field or this state also.
if (endMarker->fLookAheadEnd) {
+ // TODO: don't change value if already set?
+ // TODO: allow for more than one active look-ahead rule in engine.
+ // Make value here an index to a side array in engine?
sd->fLookAhead = sd->fAccepting;
}
}
}
for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
tagNode = (RBBINode *)tagNodes.elementAt(i);
-
+
for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
}
-
- // For each state
- for (n=0; n<fDStates->size(); n++) {
+
+ // For each state
+ for (n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
UVector *thisStatesTagValues = sd->fTagVals;
if (thisStatesTagValues == NULL) {
sd->fTagsIdx = -1;
int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
int32_t nextTagGroupStart = 0;
-
+
// Loop runs once per group of tags in the global list
while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
thisTagGroupStart = nextTagGroupStart;
// The lengths match, go ahead and compare the actual tag values
// between this state and the group from the global list.
for (i=0; i<thisStatesTagValues->size(); i++) {
- if (thisStatesTagValues->elementAti(i) !=
+ if (thisStatesTagValues->elementAti(i) !=
fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
- // Mismatch.
+ // Mismatch.
break;
}
}
-
+
if (i == thisStatesTagValues->size()) {
// We found a set of tag values in the global list that match
// those for this state. Use them.
sd->fTagsIdx = thisTagGroupStart;
- break;
+ break;
}
}
-
+
if (sd->fTagsIdx == -1) {
// No suitable entry in the global tag list already. Add one
sd->fTagsIdx = fRB->fRuleStatusVals->size();
//
// setAdd Set operation on UVector
// dest = dest union source
-// Elements may only appear once. Order is unimportant.
+// Elements may only appear once and must be sorted.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
- int destOriginalSize = dest->size();
- int sourceSize = source->size();
- int32_t si, di;
-
- for (si=0; si<sourceSize && U_SUCCESS(*fStatus); si++) {
- void *elToAdd = source->elementAt(si);
- for (di=0; di<destOriginalSize; di++) {
- if (dest->elementAt(di) == elToAdd) {
- goto elementAlreadyInDest;
- }
+ int32_t destOriginalSize = dest->size();
+ int32_t sourceSize = source->size();
+ int32_t di = 0;
+ void *(destS[16]), *(sourceS[16]); // Handle small cases without malloc
+ void **destH = 0, **sourceH = 0;
+ void **destBuff, **sourceBuff;
+ void **destLim, **sourceLim;
+
+ if (destOriginalSize > (int32_t)(sizeof(destS)/sizeof(destS[0]))) {
+ destH = (void **)uprv_malloc(sizeof(void *) * destOriginalSize);
+ destBuff = destH;
+ }
+ else {
+ destBuff = destS;
+ }
+ if (destBuff == 0) {
+ return;
+ }
+ destLim = destBuff + destOriginalSize;
+
+ if (sourceSize > (int32_t)(sizeof(sourceS)/sizeof(sourceS[0]))) {
+ sourceH = (void **)uprv_malloc(sizeof(void *) * sourceSize);
+ sourceBuff = sourceH;
+ }
+ else {
+ sourceBuff = sourceS;
+ }
+ if (sourceBuff == 0) {
+ if (destH) {
+ uprv_free(destH);
+ }
+ return;
+ }
+ sourceLim = sourceBuff + sourceSize;
+
+ // Avoid multiple "get element" calls by getting the contents into arrays
+ (void) dest->toArray(destBuff);
+ (void) source->toArray(sourceBuff);
+
+ dest->setSize(sourceSize+destOriginalSize, *fStatus);
+
+ while (sourceBuff < sourceLim && destBuff < destLim) {
+ if (*destBuff == *sourceBuff) {
+ dest->setElementAt(*sourceBuff++, di++);
+ destBuff++;
+ }
+ // This check is required for machines with segmented memory, like i5/OS.
+ // Direct pointer comparison is not recommended.
+ else if (uprv_memcmp(destBuff, sourceBuff, sizeof(void *)) < 0) {
+ dest->setElementAt(*destBuff++, di++);
}
- dest->addElement(elToAdd, *fStatus);
- elementAlreadyInDest: ;
+ else { /* *sourceBuff < *destBuff */
+ dest->setElementAt(*sourceBuff++, di++);
+ }
+ }
+
+ // At most one of these two cleanup loops will execute
+ while (destBuff < destLim) {
+ dest->setElementAt(*destBuff++, di++);
+ }
+ while (sourceBuff < sourceLim) {
+ dest->setElementAt(*sourceBuff++, di++);
+ }
+
+ dest->setSize(di, *fStatus);
+ if (destH) {
+ uprv_free(destH);
+ }
+ if (sourceH) {
+ uprv_free(sourceH);
}
}
//
// setEqual Set operation on UVector.
// Compare for equality.
-// Elements may appear only once.
-// Elements may appear in any order.
+// Elements must be sorted.
//
//-----------------------------------------------------------------------------
UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
- int32_t aSize = a->size();
- int32_t bSize = b->size();
-
- if (aSize != bSize) {
- return FALSE;
- }
-
- int32_t ax;
- int32_t bx;
- int32_t firstBx = 0;
- void *aVal;
- void *bVal = NULL;
-
- for (ax=0; ax<aSize; ax++) {
- aVal = a->elementAt(ax);
- for (bx=firstBx; bx<bSize; bx++) {
- bVal = b->elementAt(bx);
- if (aVal == bVal) {
- if (bx==firstBx) {
- firstBx++;
- }
- break;
- }
- }
- if (aVal != bVal) {
- return FALSE;
- }
- }
- return TRUE;
+ return a->equals(*b);
}
if (fRB->fLookAheadHardBreak) {
table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
}
+ if (fRB->fSetBuilder->sawBOF()) {
+ table->fFlags |= RBBI_BOF_REQUIRED;
+ }
table->fReserved = 0;
for (state=0; state<table->fNumStates; state++) {
RBBIDebugPrintf("index | tags \n");
RBBIDebugPrintf("-------------------\n");
-
+
while (nextRecord < tbl->size()) {
thisRecord = nextRecord;
nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
fTagVals = NULL;
fPositions = NULL;
fDtran = NULL;
-
+
fDtran = new UVector(lastInputSymbol+1, *fStatus);
if (U_FAILURE(*fStatus)) {
return;
*fStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
- fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
+ fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized.
// It is indexed by input symbols, and will
// hold the next state number for each
// symbol.