]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/rbbitblb.cpp
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / common / rbbitblb.cpp
index b979c92ac63b20ce1dd41feeefa3952b28c9790f..44c8e9fdd986f2f9532ccc92f173bfd1e1b75474 100644 (file)
@@ -1,13 +1,13 @@
-//
-//  rbbitblb.cpp
-//
-
 /*
 **********************************************************************
-*   Copyright (c) 2002-2004, International Business Machines
+*   Copyright (c) 2002-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
+//
+//  rbbitblb.cpp
+//
+
 
 #include "unicode/utypes.h"
 
@@ -20,6 +20,7 @@
 #include "rbbidata.h"
 #include "cstring.h"
 #include "uassert.h"
+#include "cmemory.h"
 
 U_NAMESPACE_BEGIN
 
@@ -75,10 +76,35 @@ void  RBBITableBuilder::build() {
     //   parse tree for the substition expression.
     //
     fTree = fTree->flattenVariables();
+#ifdef RBBI_DEBUG
     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
         RBBIDebugPuts("Parse tree after flattening variable references.");
         fTree->printTree(TRUE);
     }
+#endif
+
+    //
+    // If the rules contained any references to {bof} 
+    //   add a {bof} <cat> <former root of tree> to the
+    //   tree.  Means that all matches must start out with the 
+    //   {bof} fake character.
+    // 
+    if (fRB->fSetBuilder->sawBOF()) {
+        RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
+        RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
+        // Delete and exit if memory allocation failed.
+        if (bofTop == NULL || bofLeaf == NULL) {
+            *fStatus = U_MEMORY_ALLOCATION_ERROR;
+            delete bofTop;
+            delete bofLeaf;
+            return;
+        }
+        bofTop->fLeftChild  = bofLeaf;
+        bofTop->fRightChild = fTree;
+        bofLeaf->fParent    = bofTop;
+        bofLeaf->fVal       = 2;      // Reserved value for {bof}.
+        fTree               = bofTop;
+    }
 
     //
     // Add a unique right-end marker to the expression.
@@ -86,9 +112,20 @@ void  RBBITableBuilder::build() {
     //   right child being the end marker.
     //
     RBBINode *cn = new RBBINode(RBBINode::opCat);
+    // Exit if memory allocation failed.
+    if (cn == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
     cn->fLeftChild = fTree;
     fTree->fParent = cn;
     cn->fRightChild = new RBBINode(RBBINode::endMark);
+    // Delete and exit if memory allocation failed.
+    if (cn->fRightChild == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+        delete cn;
+        return;
+    }
     cn->fRightChild->fParent = cn;
     fTree = cn;
 
@@ -97,10 +134,12 @@ void  RBBITableBuilder::build() {
     //      expression.
     //
     fTree->flattenSets();
+#ifdef RBBI_DEBUG
     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
         RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
         fTree->printTree(TRUE);
     }
+#endif
 
 
     //
@@ -126,6 +165,13 @@ void  RBBITableBuilder::build() {
         calcChainedFollowPos(fTree);
     }
 
+    //
+    //  BOF (start of input) test fixup.
+    //
+    if (fRB->fSetBuilder->sawBOF()) {
+        bofFixup();
+    }
+
     //
     // Build the DFA state transition tables.
     //
@@ -207,6 +253,9 @@ void RBBITableBuilder::calcFirstPos(RBBINode *n) {
         n->fType == RBBINode::lookAhead ||
         n->fType == RBBINode::tag) {
         // These are non-empty leaf node types.
+        // Note: In order to maintain the sort invariant on the set,
+        // this function should only be called on a node whose set is
+        // empty to start with.
         n->fFirstPosSet->addElement(n, *fStatus);
         return;
     }
@@ -250,6 +299,9 @@ void RBBITableBuilder::calcLastPos(RBBINode *n) {
         n->fType == RBBINode::lookAhead ||
         n->fType == RBBINode::tag) {
         // These are non-empty leaf node types.
+        // Note: In order to maintain the sort invariant on the set,
+        // this function should only be called on a node whose set is
+        // empty to start with.
         n->fLastPosSet->addElement(n, *fStatus);
         return;
     }
@@ -343,14 +395,21 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
     // get a list of all endmarker nodes.
     tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
 
-    // get a list all leaf nodes 
+    // get a list all leaf nodes
     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
     if (U_FAILURE(*fStatus)) {
         return;
     }
 
-    // Get all nodes that can be the start a match, which is FirstPosition(root)
-    UVector *matchStartNodes = tree->fFirstPosSet;
+    // Get all nodes that can be the start a match, which is FirstPosition()
+    // of the portion of the tree corresponding to user-written rules.
+    // See the tree description in bofFixup().
+    RBBINode *userRuleRoot = tree;
+    if (fRB->fSetBuilder->sawBOF()) {
+        userRuleRoot = tree->fLeftChild->fRightChild;
+    }
+    U_ASSERT(userRuleRoot != NULL);
+    UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
 
 
     // Iteratate over all leaf nodes,
@@ -383,10 +442,12 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
         //        into the rule file.
         if (fRB->fLBCMNoChain) {
             UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
-            U_ASSERT(c != -1);
-            ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
-            if (cLBProp == U_LB_COMBINING_MARK) {
-                continue;
+            if (c != -1) {
+                // c == -1 occurs with sets containing only the {eof} marker string.
+                ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
+                if (cLBProp == U_LB_COMBINING_MARK) {
+                    continue;
+                }
             }
         }
 
@@ -415,6 +476,62 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
 }
 
 
+//-----------------------------------------------------------------------------
+//
+//   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
+//                Do an swizzle similar to chaining, modifying the followPos set of
+//                the bofNode to include the followPos nodes from other {bot} nodes
+//                scattered through the tree.
+//
+//                This function has much in common with calcChainedFollowPos().
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::bofFixup() {
+
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
+
+    //   The parse tree looks like this ...
+    //         fTree root  --->       <cat>
+    //                               /     \       .
+    //                            <cat>   <#end node>
+    //                           /     \  .
+    //                     <bofNode>   rest
+    //                               of tree
+    //
+    //    We will be adding things to the followPos set of the <bofNode>
+    //
+    RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
+    U_ASSERT(bofNode->fType == RBBINode::leafChar);
+    U_ASSERT(bofNode->fVal == 2);
+
+    // Get all nodes that can be the start a match of the user-written rules
+    //  (excluding the fake bofNode)
+    //  We want the nodes that can start a match in the
+    //     part labeled "rest of tree"
+    // 
+    UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
+
+    RBBINode *startNode;
+    int       startNodeIx;
+    for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
+        startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
+        if (startNode->fType != RBBINode::leafChar) {
+            continue;
+        }
+
+        if (startNode->fVal == bofNode->fVal) {
+            //  We found a leaf node corresponding to a {bof} that was
+            //    explicitly written into a rule.
+            //  Add everything from the followPos set of this node to the
+            //    followPos set of the fake bofNode at the start of the tree.
+            //  
+            setAdd(bofNode->fFollowPos, startNode->fFollowPos);
+        }
+    }
+}
+
 //-----------------------------------------------------------------------------
 //
 //   buildStateTable()    Determine the set of runtime DFA states and the
@@ -428,33 +545,49 @@ void RBBITableBuilder::buildStateTable() {
     if (U_FAILURE(*fStatus)) {
         return;
     }
+    RBBIStateDescriptor *failState;
+    // Set it to NULL to avoid uninitialized warning
+    RBBIStateDescriptor *initialState = NULL; 
     //
     // Add a dummy state 0 - the stop state.  Not from Aho.
     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
-    RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+    failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+    if (failState == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+        goto ExitBuildSTdeleteall;
+    }
     failState->fPositions = new UVector(*fStatus);
-    if (U_FAILURE(*fStatus)) {
-        return;
+    if (failState->fPositions == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
+        goto ExitBuildSTdeleteall;
     }
     fDStates->addElement(failState, *fStatus);
     if (U_FAILURE(*fStatus)) {
-        return;
+        goto ExitBuildSTdeleteall;
     }
 
     // initially, the only unmarked state in Dstates is firstpos(root),
     //       where toot is the root of the syntax tree for (r)#;
-    RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+    initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+    if (initialState == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+    }
     if (U_FAILURE(*fStatus)) {
-        return;
+        goto ExitBuildSTdeleteall;
     }
     initialState->fPositions = new UVector(*fStatus);
+    if (initialState->fPositions == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+    }
     if (U_FAILURE(*fStatus)) {
-        return;
+        goto ExitBuildSTdeleteall;
     }
     setAdd(initialState->fPositions, fTree->fFirstPosSet);
     fDStates->addElement(initialState, *fStatus);
     if (U_FAILURE(*fStatus)) {
-        return;
+        goto ExitBuildSTdeleteall;
     }
 
     // while there is an unmarked state T in Dstates do begin
@@ -490,6 +623,10 @@ void RBBITableBuilder::buildStateTable() {
                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
                     if (U == NULL) {
                         U = new UVector(*fStatus);
+                        if (U == NULL) {
+                               *fStatus = U_MEMORY_ALLOCATION_ERROR;
+                               goto ExitBuildSTdeleteall;
+                        }
                     }
                     setAdd(U, p->fFollowPos);
                 }
@@ -517,8 +654,11 @@ void RBBITableBuilder::buildStateTable() {
                 if (!UinDstates)
                 {
                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
+                    if (newState == NULL) {
+                       *fStatus = U_MEMORY_ALLOCATION_ERROR;
+                    }
                     if (U_FAILURE(*fStatus)) {
-                        return;
+                        goto ExitBuildSTdeleteall;
                     }
                     newState->fPositions = U;
                     fDStates->addElement(newState, *fStatus);
@@ -533,6 +673,11 @@ void RBBITableBuilder::buildStateTable() {
             }
         }
     }
+    return;
+    // delete local pointers only if error occured.
+ExitBuildSTdeleteall:
+    delete initialState;
+    delete failState;
 }
 
 
@@ -572,14 +717,29 @@ void     RBBITableBuilder::flagAcceptingStates() {
                 // Any non-zero value for fAccepting means this is an accepting node.
                 // The value is what will be returned to the user as the break status.
                 // If no other value was specified, force it to -1.
-                sd->fAccepting = endMarker->fVal;
-                if (sd->fAccepting == 0) {
-                    sd->fAccepting = -1;
+
+                if (sd->fAccepting==0) {
+                    // State hasn't been marked as accepting yet.  Do it now.
+                    sd->fAccepting = endMarker->fVal;
+                    if (sd->fAccepting == 0) {
+                        sd->fAccepting = -1;
+                    }
+                }
+                if (sd->fAccepting==-1 && endMarker->fVal != 0) {
+                    // Both lookahead and non-lookahead accepting for this state.
+                    // Favor the look-ahead.  Expedient for line break.
+                    // TODO:  need a more elegant resolution for conflicting rules.
+                    sd->fAccepting = endMarker->fVal;
                 }
+                // implicit else:
+                // if sd->fAccepting already had a value other than 0 or -1, leave it be.
 
                 // If the end marker node is from a look-ahead rule, set
                 //   the fLookAhead field or this state also.
                 if (endMarker->fLookAheadEnd) {
+                    // TODO:  don't change value if already set?
+                    // TODO:  allow for more than one active look-ahead rule in engine.
+                    //        Make value here an index to a side array in engine?
                     sd->fLookAhead = sd->fAccepting;
                 }
             }
@@ -644,7 +804,7 @@ void     RBBITableBuilder::flagTaggedStates() {
     }
     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
         tagNode = (RBBINode *)tagNodes.elementAt(i);
-        
+
         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
@@ -686,9 +846,9 @@ void  RBBITableBuilder::mergeRuleStatusVals() {
         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
     }
-        
-    //    For each state 
-    for (n=0; n<fDStates->size(); n++) {     
+
+    //    For each state
+    for (n=0; n<fDStates->size(); n++) {
         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
         UVector *thisStatesTagValues = sd->fTagVals;
         if (thisStatesTagValues == NULL) {
@@ -704,7 +864,7 @@ void  RBBITableBuilder::mergeRuleStatusVals() {
         sd->fTagsIdx = -1;
         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
         int32_t  nextTagGroupStart = 0;
-        
+
         // Loop runs once per group of tags in the global list
         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
             thisTagGroupStart = nextTagGroupStart;
@@ -718,21 +878,21 @@ void  RBBITableBuilder::mergeRuleStatusVals() {
             // The lengths match, go ahead and compare the actual tag values
             //    between this state and the group from the global list.
             for (i=0; i<thisStatesTagValues->size(); i++) {
-                if (thisStatesTagValues->elementAti(i) != 
+                if (thisStatesTagValues->elementAti(i) !=
                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
-                    // Mismatch.  
+                    // Mismatch.
                     break;
                 }
             }
-            
+
             if (i == thisStatesTagValues->size()) {
                 // We found a set of tag values in the global list that match
                 //   those for this state.  Use them.
                 sd->fTagsIdx = thisTagGroupStart;
-                break;   
+                break;
             }
         }
-        
+
         if (sd->fTagsIdx == -1) {
             // No suitable entry in the global tag list already.  Add one
             sd->fTagsIdx = fRB->fRuleStatusVals->size();
@@ -788,23 +948,80 @@ void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
 //
 //  setAdd     Set operation on UVector
 //             dest = dest union source
-//             Elements may only appear once.   Order is unimportant.
+//             Elements may only appear once and must be sorted.
 //
 //-----------------------------------------------------------------------------
 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
-    int destOriginalSize = dest->size();
-    int sourceSize       = source->size();
-    int32_t  si, di;
-
-    for (si=0; si<sourceSize && U_SUCCESS(*fStatus); si++) {
-        void *elToAdd = source->elementAt(si);
-        for (di=0; di<destOriginalSize; di++) {
-            if (dest->elementAt(di) == elToAdd) {
-                goto  elementAlreadyInDest;
-            }
+    int32_t destOriginalSize = dest->size();
+    int32_t sourceSize       = source->size();
+    int32_t di           = 0;
+    void *(destS[16]), *(sourceS[16]);  // Handle small cases without malloc
+    void **destH = 0, **sourceH = 0;
+    void **destBuff, **sourceBuff;
+    void **destLim, **sourceLim;
+
+    if (destOriginalSize > (int32_t)(sizeof(destS)/sizeof(destS[0]))) {
+        destH = (void **)uprv_malloc(sizeof(void *) * destOriginalSize);
+        destBuff = destH;
+    }
+    else {
+        destBuff = destS;
+    }
+    if (destBuff == 0) {
+        return;
+    }
+    destLim = destBuff + destOriginalSize;
+
+    if (sourceSize > (int32_t)(sizeof(sourceS)/sizeof(sourceS[0]))) {
+        sourceH = (void **)uprv_malloc(sizeof(void *) * sourceSize);
+        sourceBuff = sourceH;
+    }
+    else {
+        sourceBuff = sourceS;
+    }
+    if (sourceBuff == 0) {
+        if (destH) {
+            uprv_free(destH);
+        }
+        return;
+    }
+    sourceLim = sourceBuff + sourceSize;
+
+    // Avoid multiple "get element" calls by getting the contents into arrays
+    (void) dest->toArray(destBuff);
+    (void) source->toArray(sourceBuff);
+
+    dest->setSize(sourceSize+destOriginalSize, *fStatus);
+
+    while (sourceBuff < sourceLim && destBuff < destLim) {
+        if (*destBuff == *sourceBuff) {
+            dest->setElementAt(*sourceBuff++, di++);
+            destBuff++;
+        }
+        // This check is required for machines with segmented memory, like i5/OS.
+        // Direct pointer comparison is not recommended.
+        else if (uprv_memcmp(destBuff, sourceBuff, sizeof(void *)) < 0) {
+            dest->setElementAt(*destBuff++, di++);
         }
-        dest->addElement(elToAdd, *fStatus);
-        elementAlreadyInDest: ;
+        else { /* *sourceBuff < *destBuff */
+            dest->setElementAt(*sourceBuff++, di++);
+        }
+    }
+
+    // At most one of these two cleanup loops will execute
+    while (destBuff < destLim) {
+        dest->setElementAt(*destBuff++, di++);
+    }
+    while (sourceBuff < sourceLim) {
+        dest->setElementAt(*sourceBuff++, di++);
+    }
+
+    dest->setSize(di, *fStatus);
+    if (destH) {
+        uprv_free(destH);
+    }
+    if (sourceH) {
+        uprv_free(sourceH);
     }
 }
 
@@ -814,40 +1031,11 @@ void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
 //
 //  setEqual    Set operation on UVector.
 //              Compare for equality.
-//              Elements may appear only once.
-//              Elements may appear in any order.
+//              Elements must be sorted.
 //
 //-----------------------------------------------------------------------------
 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
-    int32_t    aSize = a->size();
-    int32_t    bSize = b->size();
-
-    if (aSize != bSize) {
-        return FALSE;
-    }
-
-    int32_t  ax;
-    int32_t  bx;
-    int32_t  firstBx = 0;
-    void     *aVal;
-    void     *bVal = NULL;
-
-    for (ax=0; ax<aSize; ax++) {
-        aVal = a->elementAt(ax);
-        for (bx=firstBx; bx<bSize; bx++) {
-            bVal = b->elementAt(bx);
-            if (aVal == bVal) {
-                if (bx==firstBx) {
-                    firstBx++;
-                }
-                break;
-            }
-        }
-        if (aVal != bVal) {
-            return FALSE;
-        }
-    }
-    return TRUE;
+    return a->equals(*b);
 }
 
 
@@ -941,6 +1129,9 @@ void RBBITableBuilder::exportTable(void *where) {
     if (fRB->fLookAheadHardBreak) {
         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
     }
+    if (fRB->fSetBuilder->sawBOF()) {
+        table->fFlags  |= RBBI_BOF_REQUIRED;
+    }
     table->fReserved  = 0;
 
     for (state=0; state<table->fNumStates; state++) {
@@ -1027,7 +1218,7 @@ void RBBITableBuilder::printRuleStatusTable() {
 
     RBBIDebugPrintf("index |  tags \n");
     RBBIDebugPrintf("-------------------\n");
-    
+
     while (nextRecord < tbl->size()) {
         thisRecord = nextRecord;
         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
@@ -1057,7 +1248,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
     fTagVals   = NULL;
     fPositions = NULL;
     fDtran     = NULL;
-    
+
     fDtran     = new UVector(lastInputSymbol+1, *fStatus);
     if (U_FAILURE(*fStatus)) {
         return;
@@ -1066,7 +1257,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
         *fStatus = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
-    fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
+    fDtran->setSize(lastInputSymbol+1, *fStatus);    // fDtran needs to be pre-sized.
                                            //   It is indexed by input symbols, and will
                                            //   hold  the next state number for each
                                            //   symbol.