X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..b331163bffd790ced0e88b73f44f86d49ccc48a5:/icuSources/common/rbbitblb.cpp?ds=inline diff --git a/icuSources/common/rbbitblb.cpp b/icuSources/common/rbbitblb.cpp index b979c92a..01b1c4cb 100644 --- a/icuSources/common/rbbitblb.cpp +++ b/icuSources/common/rbbitblb.cpp @@ -1,13 +1,13 @@ -// -// rbbitblb.cpp -// - /* ********************************************************************** -* Copyright (c) 2002-2004, International Business Machines +* Copyright (c) 2002-2009, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ +// +// rbbitblb.cpp +// + #include "unicode/utypes.h" @@ -20,6 +20,7 @@ #include "rbbidata.h" #include "cstring.h" #include "uassert.h" +#include "cmemory.h" U_NAMESPACE_BEGIN @@ -75,10 +76,35 @@ void RBBITableBuilder::build() { // parse tree for the substition expression. // fTree = fTree->flattenVariables(); +#ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) { RBBIDebugPuts("Parse tree after flattening variable references."); fTree->printTree(TRUE); } +#endif + + // + // If the rules contained any references to {bof} + // add a {bof} to the + // tree. Means that all matches must start out with the + // {bof} fake character. + // + if (fRB->fSetBuilder->sawBOF()) { + RBBINode *bofTop = new RBBINode(RBBINode::opCat); + RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar); + // Delete and exit if memory allocation failed. + if (bofTop == NULL || bofLeaf == NULL) { + *fStatus = U_MEMORY_ALLOCATION_ERROR; + delete bofTop; + delete bofLeaf; + return; + } + bofTop->fLeftChild = bofLeaf; + bofTop->fRightChild = fTree; + bofLeaf->fParent = bofTop; + bofLeaf->fVal = 2; // Reserved value for {bof}. + fTree = bofTop; + } // // Add a unique right-end marker to the expression. @@ -86,9 +112,20 @@ void RBBITableBuilder::build() { // right child being the end marker. // RBBINode *cn = new RBBINode(RBBINode::opCat); + // Exit if memory allocation failed. + if (cn == NULL) { + *fStatus = U_MEMORY_ALLOCATION_ERROR; + return; + } cn->fLeftChild = fTree; fTree->fParent = cn; cn->fRightChild = new RBBINode(RBBINode::endMark); + // Delete and exit if memory allocation failed. + if (cn->fRightChild == NULL) { + *fStatus = U_MEMORY_ALLOCATION_ERROR; + delete cn; + return; + } cn->fRightChild->fParent = cn; fTree = cn; @@ -97,10 +134,12 @@ void RBBITableBuilder::build() { // expression. // fTree->flattenSets(); +#ifdef RBBI_DEBUG if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) { RBBIDebugPuts("Parse tree after flattening Unicode Set references."); fTree->printTree(TRUE); } +#endif // @@ -126,6 +165,13 @@ void RBBITableBuilder::build() { calcChainedFollowPos(fTree); } + // + // BOF (start of input) test fixup. + // + if (fRB->fSetBuilder->sawBOF()) { + bofFixup(); + } + // // Build the DFA state transition tables. // @@ -207,6 +253,9 @@ void RBBITableBuilder::calcFirstPos(RBBINode *n) { n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) { // These are non-empty leaf node types. + // Note: In order to maintain the sort invariant on the set, + // this function should only be called on a node whose set is + // empty to start with. n->fFirstPosSet->addElement(n, *fStatus); return; } @@ -250,6 +299,9 @@ void RBBITableBuilder::calcLastPos(RBBINode *n) { n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) { // These are non-empty leaf node types. + // Note: In order to maintain the sort invariant on the set, + // this function should only be called on a node whose set is + // empty to start with. n->fLastPosSet->addElement(n, *fStatus); return; } @@ -343,14 +395,21 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { // get a list of all endmarker nodes. tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); - // get a list all leaf nodes + // get a list all leaf nodes tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus); if (U_FAILURE(*fStatus)) { return; } - // Get all nodes that can be the start a match, which is FirstPosition(root) - UVector *matchStartNodes = tree->fFirstPosSet; + // Get all nodes that can be the start a match, which is FirstPosition() + // of the portion of the tree corresponding to user-written rules. + // See the tree description in bofFixup(). + RBBINode *userRuleRoot = tree; + if (fRB->fSetBuilder->sawBOF()) { + userRuleRoot = tree->fLeftChild->fRightChild; + } + U_ASSERT(userRuleRoot != NULL); + UVector *matchStartNodes = userRuleRoot->fFirstPosSet; // Iteratate over all leaf nodes, @@ -381,12 +440,22 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { // don't chain from it. // TODO: Add rule syntax for this behavior, get specifics out of here and // into the rule file. - if (fRB->fLBCMNoChain) { + if (fRB->fLBCMNoChain || fRB->fRINoChain) { UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal); - U_ASSERT(c != -1); - ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK); - if (cLBProp == U_LB_COMBINING_MARK) { - continue; + if (c != -1) { + // c == -1 occurs with sets containing only the {eof} marker string. + if (fRB->fLBCMNoChain) { + ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK); + if (cLBProp == U_LB_COMBINING_MARK) { + continue; + } + } + if (fRB->fRINoChain) { + UGraphemeClusterBreak cGBProp = (UGraphemeClusterBreak)u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); + if (cGBProp == U_GCB_REGIONAL_INDICATOR) { + continue; + } + } } } @@ -415,6 +484,62 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { } +//----------------------------------------------------------------------------- +// +// bofFixup. Fixup for state tables that include {bof} beginning of input testing. +// Do an swizzle similar to chaining, modifying the followPos set of +// the bofNode to include the followPos nodes from other {bot} nodes +// scattered through the tree. +// +// This function has much in common with calcChainedFollowPos(). +// +//----------------------------------------------------------------------------- +void RBBITableBuilder::bofFixup() { + + if (U_FAILURE(*fStatus)) { + return; + } + + // The parse tree looks like this ... + // fTree root ---> + // / \ . + // <#end node> + // / \ . + // rest + // of tree + // + // We will be adding things to the followPos set of the + // + RBBINode *bofNode = fTree->fLeftChild->fLeftChild; + U_ASSERT(bofNode->fType == RBBINode::leafChar); + U_ASSERT(bofNode->fVal == 2); + + // Get all nodes that can be the start a match of the user-written rules + // (excluding the fake bofNode) + // We want the nodes that can start a match in the + // part labeled "rest of tree" + // + UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet; + + RBBINode *startNode; + int startNodeIx; + for (startNodeIx = 0; startNodeIxsize(); startNodeIx++) { + startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); + if (startNode->fType != RBBINode::leafChar) { + continue; + } + + if (startNode->fVal == bofNode->fVal) { + // We found a leaf node corresponding to a {bof} that was + // explicitly written into a rule. + // Add everything from the followPos set of this node to the + // followPos set of the fake bofNode at the start of the tree. + // + setAdd(bofNode->fFollowPos, startNode->fFollowPos); + } + } +} + //----------------------------------------------------------------------------- // // buildStateTable() Determine the set of runtime DFA states and the @@ -428,33 +553,49 @@ void RBBITableBuilder::buildStateTable() { if (U_FAILURE(*fStatus)) { return; } + RBBIStateDescriptor *failState; + // Set it to NULL to avoid uninitialized warning + RBBIStateDescriptor *initialState = NULL; // // Add a dummy state 0 - the stop state. Not from Aho. int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1; - RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus); + failState = new RBBIStateDescriptor(lastInputSymbol, fStatus); + if (failState == NULL) { + *fStatus = U_MEMORY_ALLOCATION_ERROR; + goto ExitBuildSTdeleteall; + } failState->fPositions = new UVector(*fStatus); - if (U_FAILURE(*fStatus)) { - return; + if (failState->fPositions == NULL) { + *fStatus = U_MEMORY_ALLOCATION_ERROR; + } + if (failState->fPositions == NULL || U_FAILURE(*fStatus)) { + goto ExitBuildSTdeleteall; } fDStates->addElement(failState, *fStatus); if (U_FAILURE(*fStatus)) { - return; + goto ExitBuildSTdeleteall; } // initially, the only unmarked state in Dstates is firstpos(root), // where toot is the root of the syntax tree for (r)#; - RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus); + initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus); + if (initialState == NULL) { + *fStatus = U_MEMORY_ALLOCATION_ERROR; + } if (U_FAILURE(*fStatus)) { - return; + goto ExitBuildSTdeleteall; } initialState->fPositions = new UVector(*fStatus); + if (initialState->fPositions == NULL) { + *fStatus = U_MEMORY_ALLOCATION_ERROR; + } if (U_FAILURE(*fStatus)) { - return; + goto ExitBuildSTdeleteall; } setAdd(initialState->fPositions, fTree->fFirstPosSet); fDStates->addElement(initialState, *fStatus); if (U_FAILURE(*fStatus)) { - return; + goto ExitBuildSTdeleteall; } // while there is an unmarked state T in Dstates do begin @@ -490,6 +631,10 @@ void RBBITableBuilder::buildStateTable() { if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) { if (U == NULL) { U = new UVector(*fStatus); + if (U == NULL) { + *fStatus = U_MEMORY_ALLOCATION_ERROR; + goto ExitBuildSTdeleteall; + } } setAdd(U, p->fFollowPos); } @@ -517,8 +662,11 @@ void RBBITableBuilder::buildStateTable() { if (!UinDstates) { RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus); + if (newState == NULL) { + *fStatus = U_MEMORY_ALLOCATION_ERROR; + } if (U_FAILURE(*fStatus)) { - return; + goto ExitBuildSTdeleteall; } newState->fPositions = U; fDStates->addElement(newState, *fStatus); @@ -533,6 +681,11 @@ void RBBITableBuilder::buildStateTable() { } } } + return; + // delete local pointers only if error occured. +ExitBuildSTdeleteall: + delete initialState; + delete failState; } @@ -572,14 +725,29 @@ void RBBITableBuilder::flagAcceptingStates() { // Any non-zero value for fAccepting means this is an accepting node. // The value is what will be returned to the user as the break status. // If no other value was specified, force it to -1. - sd->fAccepting = endMarker->fVal; - if (sd->fAccepting == 0) { - sd->fAccepting = -1; + + if (sd->fAccepting==0) { + // State hasn't been marked as accepting yet. Do it now. + sd->fAccepting = endMarker->fVal; + if (sd->fAccepting == 0) { + sd->fAccepting = -1; + } + } + if (sd->fAccepting==-1 && endMarker->fVal != 0) { + // Both lookahead and non-lookahead accepting for this state. + // Favor the look-ahead. Expedient for line break. + // TODO: need a more elegant resolution for conflicting rules. + sd->fAccepting = endMarker->fVal; } + // implicit else: + // if sd->fAccepting already had a value other than 0 or -1, leave it be. // If the end marker node is from a look-ahead rule, set // the fLookAhead field or this state also. if (endMarker->fLookAheadEnd) { + // TODO: don't change value if already set? + // TODO: allow for more than one active look-ahead rule in engine. + // Make value here an index to a side array in engine? sd->fLookAhead = sd->fAccepting; } } @@ -644,7 +812,7 @@ void RBBITableBuilder::flagTaggedStates() { } for (i=0; isize(); n++) { // For each state s (row in the state table) RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t @@ -686,9 +854,9 @@ void RBBITableBuilder::mergeRuleStatusVals() { fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero } - - // For each state - for (n=0; nsize(); n++) { + + // For each state + for (n=0; nsize(); n++) { RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); UVector *thisStatesTagValues = sd->fTagVals; if (thisStatesTagValues == NULL) { @@ -704,7 +872,7 @@ void RBBITableBuilder::mergeRuleStatusVals() { sd->fTagsIdx = -1; int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list int32_t nextTagGroupStart = 0; - + // Loop runs once per group of tags in the global list while (nextTagGroupStart < fRB->fRuleStatusVals->size()) { thisTagGroupStart = nextTagGroupStart; @@ -718,21 +886,21 @@ void RBBITableBuilder::mergeRuleStatusVals() { // The lengths match, go ahead and compare the actual tag values // between this state and the group from the global list. for (i=0; isize(); i++) { - if (thisStatesTagValues->elementAti(i) != + if (thisStatesTagValues->elementAti(i) != fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) { - // Mismatch. + // Mismatch. break; } } - + if (i == thisStatesTagValues->size()) { // We found a set of tag values in the global list that match // those for this state. Use them. sd->fTagsIdx = thisTagGroupStart; - break; + break; } } - + if (sd->fTagsIdx == -1) { // No suitable entry in the global tag list already. Add one sd->fTagsIdx = fRB->fRuleStatusVals->size(); @@ -788,24 +956,63 @@ void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) { // // setAdd Set operation on UVector // dest = dest union source -// Elements may only appear once. Order is unimportant. +// Elements may only appear once and must be sorted. // //----------------------------------------------------------------------------- void RBBITableBuilder::setAdd(UVector *dest, UVector *source) { - int destOriginalSize = dest->size(); - int sourceSize = source->size(); - int32_t si, di; - - for (si=0; sielementAt(si); - for (di=0; dielementAt(di) == elToAdd) { - goto elementAlreadyInDest; - } + int32_t destOriginalSize = dest->size(); + int32_t sourceSize = source->size(); + int32_t di = 0; + MaybeStackArray destArray, sourceArray; // Handle small cases without malloc + void **destPtr, **sourcePtr; + void **destLim, **sourceLim; + + if (destOriginalSize > destArray.getCapacity()) { + if (destArray.resize(destOriginalSize) == NULL) { + return; + } + } + destPtr = destArray.getAlias(); + destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()? + + if (sourceSize > sourceArray.getCapacity()) { + if (sourceArray.resize(sourceSize) == NULL) { + return; + } + } + sourcePtr = sourceArray.getAlias(); + sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()? + + // Avoid multiple "get element" calls by getting the contents into arrays + (void) dest->toArray(destPtr); + (void) source->toArray(sourcePtr); + + dest->setSize(sourceSize+destOriginalSize, *fStatus); + + while (sourcePtr < sourceLim && destPtr < destLim) { + if (*destPtr == *sourcePtr) { + dest->setElementAt(*sourcePtr++, di++); + destPtr++; + } + // This check is required for machines with segmented memory, like i5/OS. + // Direct pointer comparison is not recommended. + else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) { + dest->setElementAt(*destPtr++, di++); + } + else { /* *sourcePtr < *destPtr */ + dest->setElementAt(*sourcePtr++, di++); } - dest->addElement(elToAdd, *fStatus); - elementAlreadyInDest: ; } + + // At most one of these two cleanup loops will execute + while (destPtr < destLim) { + dest->setElementAt(*destPtr++, di++); + } + while (sourcePtr < sourceLim) { + dest->setElementAt(*sourcePtr++, di++); + } + + dest->setSize(di, *fStatus); } @@ -814,40 +1021,11 @@ void RBBITableBuilder::setAdd(UVector *dest, UVector *source) { // // setEqual Set operation on UVector. // Compare for equality. -// Elements may appear only once. -// Elements may appear in any order. +// Elements must be sorted. // //----------------------------------------------------------------------------- UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) { - int32_t aSize = a->size(); - int32_t bSize = b->size(); - - if (aSize != bSize) { - return FALSE; - } - - int32_t ax; - int32_t bx; - int32_t firstBx = 0; - void *aVal; - void *bVal = NULL; - - for (ax=0; axelementAt(ax); - for (bx=firstBx; bxelementAt(bx); - if (aVal == bVal) { - if (bx==firstBx) { - firstBx++; - } - break; - } - } - if (aVal != bVal) { - return FALSE; - } - } - return TRUE; + return a->equals(*b); } @@ -941,6 +1119,9 @@ void RBBITableBuilder::exportTable(void *where) { if (fRB->fLookAheadHardBreak) { table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; } + if (fRB->fSetBuilder->sawBOF()) { + table->fFlags |= RBBI_BOF_REQUIRED; + } table->fReserved = 0; for (state=0; statefNumStates; state++) { @@ -1027,7 +1208,7 @@ void RBBITableBuilder::printRuleStatusTable() { RBBIDebugPrintf("index | tags \n"); RBBIDebugPrintf("-------------------\n"); - + while (nextRecord < tbl->size()) { thisRecord = nextRecord; nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1; @@ -1057,7 +1238,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu fTagVals = NULL; fPositions = NULL; fDtran = NULL; - + fDtran = new UVector(lastInputSymbol+1, *fStatus); if (U_FAILURE(*fStatus)) { return; @@ -1066,7 +1247,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu *fStatus = U_MEMORY_ALLOCATION_ERROR; return; } - fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized. + fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized. // It is indexed by input symbols, and will // hold the next state number for each // symbol.