]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/rbbisetb.cpp
ICU-64232.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbisetb.cpp
index 261aba2371203c2bf8480fb34b2a32c14c97870e..36e2e07e9c65a0daee4175eaef328c81b1ea2848 100644 (file)
@@ -1,9 +1,11 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 //
 //  rbbisetb.cpp
 //
 /*
 ***************************************************************************
-*   Copyright (C) 2002-2003 International Business Machines Corporation   *
+*   Copyright (C) 2002-2008 International Business Machines Corporation   *
 *   and others. All rights reserved.                                      *
 ***************************************************************************
 */
@@ -33,7 +35,7 @@
 #if !UCONFIG_NO_BREAK_ITERATION
 
 #include "unicode/uniset.h"
-#include "utrie.h"
+#include "utrie2.h"
 #include "uvector.h"
 #include "uassert.h"
 #include "cmemory.h"
 #include "rbbisetb.h"
 #include "rbbinode.h"
 
-
-//------------------------------------------------------------------------
-//
-//   getFoldedRBBIValue        Call-back function used during building of Trie table.
-//                             Folding value: just store the offset (16 bits)
-//                             if there is any non-0 entry.
-//                             (It'd really be nice if the Trie builder would provide a
-//                             simple default, so this function could go away from here.)
-//
-//------------------------------------------------------------------------
-/* folding value: just store the offset (16 bits) if there is any non-0 entry */
-U_CDECL_BEGIN
-static uint32_t U_CALLCONV
-getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
-    uint32_t value;
-    UChar32 limit;
-    UBool inBlockZero;
-
-    limit=start+0x400;
-    while(start<limit) {
-        value=utrie_get32(trie, start, &inBlockZero);
-        if(inBlockZero) {
-            start+=UTRIE_DATA_BLOCK_LENGTH;
-        } else if(value!=0) {
-            return (uint32_t)(offset|0x8000);
-        } else {
-            ++start;
-        }
-    }
-    return 0;
-}
-
-
-U_CDECL_END
-
-
-
 U_NAMESPACE_BEGIN
 
 //------------------------------------------------------------------------
@@ -94,6 +59,7 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
     fTrie           = 0;
     fTrieSize       = 0;
     fGroupCount     = 0;
+    fSawBOF         = FALSE;
 }
 
 
@@ -113,7 +79,7 @@ RBBISetBuilder::~RBBISetBuilder()
         delete r;
     }
 
-    utrie_close(fTrie);
+    utrie2_close(fTrie);
 }
 
 
@@ -125,7 +91,7 @@ RBBISetBuilder::~RBBISetBuilder()
 //                  from the Unicode Sets.
 //
 //------------------------------------------------------------------------
-void RBBISetBuilder::build() {
+void RBBISetBuilder::buildRanges() {
     RBBINode        *usetNode;
     RangeDescriptor *rlRange;
 
@@ -135,16 +101,23 @@ void RBBISetBuilder::build() {
     //  Initialize the process by creating a single range encompassing all characters
     //  that is in no sets.
     //
-    fRangeList                = new RangeDescriptor(*fStatus);
+    fRangeList                = new RangeDescriptor(*fStatus); // will check for status here
+    if (fRangeList == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
     fRangeList->fStartChar    = 0;
     fRangeList->fEndChar      = 0x10ffff;
 
+    if (U_FAILURE(*fStatus)) {
+        return;
+    }
 
     //
     //  Find the set of non-overlapping ranges of characters
     //
     int  ni;
-    for (ni=0; ; ni++) {
+    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
         usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
         if (usetNode==NULL) {
             break;
@@ -176,6 +149,9 @@ void RBBISetBuilder::build() {
             //     over
             if (rlRange->fStartChar < inputSetRangeBegin) {
                 rlRange->split(inputSetRangeBegin, *fStatus);
+                if (U_FAILURE(*fStatus)) {
+                    return;
+                }
                 continue;
             }
 
@@ -186,12 +162,18 @@ void RBBISetBuilder::build() {
             //   wholly inside the Unicode set.
             if (rlRange->fEndChar > inputSetRangeEnd) {
                 rlRange->split(inputSetRangeEnd+1, *fStatus);
+                if (U_FAILURE(*fStatus)) {
+                    return;
+                }
             }
 
             // The current rlRange is now entirely within the UnicodeSet range.
             // Add this unicode set to the list of sets for this rlRange
             if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
                 rlRange->fIncludesSets->addElement(usetNode, *fStatus);
+                if (U_FAILURE(*fStatus)) {
+                    return;
+                }
             }
 
             // Advance over ranges that we are finished with.
@@ -210,6 +192,11 @@ void RBBISetBuilder::build() {
     //    The groups are numbered, and these group numbers are the set of
     //    input symbols recognized by the run-time state machine.
     //
+    //    Numbering: # 0  (state table column 0) is unused.
+    //               # 1  is reserved - table column 1 is for end-of-input
+    //               # 2  is reserved - table column 2 is for beginning-in-input
+    //               # 3  is the first range list.
+    //
     RangeDescriptor *rlSearchRange;
     for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
         for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
@@ -220,45 +207,102 @@ void RBBISetBuilder::build() {
         }
         if (rlRange->fNum == 0) {
             fGroupCount ++;
-            rlRange->fNum = fGroupCount;
+            rlRange->fNum = fGroupCount+2; 
             rlRange->setDictionaryFlag();
-            addValToSets(rlRange->fIncludesSets, fGroupCount);
+            addValToSets(rlRange->fIncludesSets, fGroupCount+2);
+        }
+    }
+
+    // Handle input sets that contain the special string {eof}.
+    //   Column 1 of the state table is reserved for EOF on input.
+    //   Column 2 is reserved for before-the-start-input.
+    //            (This column can be optimized away later if there are no rule
+    //             references to {bof}.)
+    //   Add this column value (1 or 2) to the equivalent expression
+    //     subtree for each UnicodeSet that contains the string {eof}
+    //   Because {bof} and {eof} are not a characters in the normal sense,
+    //   they doesn't affect the computation of ranges or TRIE.
+    static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
+    static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
+
+    UnicodeString eofString(eofUString);
+    UnicodeString bofString(bofUString);
+    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
+        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
+        if (usetNode==NULL) {
+            break;
+        }
+        UnicodeSet      *inputSet = usetNode->fInputSet;
+        if (inputSet->contains(eofString)) {
+            addValToSet(usetNode, 1);
+        }
+        if (inputSet->contains(bofString)) {
+            addValToSet(usetNode, 2);
+            fSawBOF = TRUE;
         }
     }
 
+
     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
+}
 
-    //
-    // Build the Trie table for mapping UChar32 values to the corresponding
-    //   range group number
-    //
-    fTrie = utrie_open(NULL,    //  Pre-existing trie to be filled in
-                      NULL,    //  Data array  (utrie will allocate one)
-                      100000,  //  Max Data Length
-                      0,       //  Initial value for all code points
-                      TRUE);   //  Keep Latin 1 in separately
 
+//
+// Build the Trie table for mapping UChar32 values to the corresponding
+// range group number.
+//
+void RBBISetBuilder::buildTrie() {
+    RangeDescriptor *rlRange;
 
-    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
-        utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
+    fTrie = utrie2_open(0,       //  Initial value for all code points.
+                        0,       //  Error value for out-of-range input.
+                        fStatus);
+
+    for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
+        utrie2_setRange32(fTrie,
+                          rlRange->fStartChar,     // Range start
+                          rlRange->fEndChar,       // Range end (inclusive)
+                          rlRange->fNum,           // value for range
+                          TRUE,                    // Overwrite previously written values
+                          fStatus);
     }
 }
 
 
+void RBBISetBuilder::mergeCategories(IntPair categories) {
+    U_ASSERT(categories.first >= 1);
+    U_ASSERT(categories.second > categories.first);
+    for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
+        int32_t rangeNum = rd->fNum & ~DICT_BIT;
+        int32_t rangeDict = rd->fNum & DICT_BIT;
+        if (rangeNum == categories.second) {
+            rd->fNum = categories.first | rangeDict;
+        } else if (rangeNum > categories.second) {
+            rd->fNum--;
+        }
+    }
+    --fGroupCount;
+}
+
 
 //-----------------------------------------------------------------------------------
 //
 //  getTrieSize()    Return the size that will be required to serialize the Trie.
 //
 //-----------------------------------------------------------------------------------
-int32_t RBBISetBuilder::getTrieSize() {
-    fTrieSize  = utrie_serialize(fTrie,
-                                    NULL,                // Buffer
-                                    0,                   // Capacity
-                                    getFoldedRBBIValue,
-                                    TRUE,                // Reduce to 16 bits
-                                    fStatus);
+int32_t RBBISetBuilder::getTrieSize()  {
+    if (U_FAILURE(*fStatus)) {
+        return 0;
+    }
+    utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
+    fTrieSize  = utrie2_serialize(fTrie,
+                                  NULL,                // Buffer
+                                  0,                   // Capacity
+                                  fStatus);
+    if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
+        *fStatus = U_ZERO_ERROR;
+    }
     // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
     return fTrieSize;
 }
@@ -272,18 +316,16 @@ int32_t RBBISetBuilder::getTrieSize() {
 //
 //-----------------------------------------------------------------------------------
 void RBBISetBuilder::serializeTrie(uint8_t *where) {
-    utrie_serialize(fTrie,
-                    where,                   // Buffer
-                    fTrieSize,               // Capacity
-                    getFoldedRBBIValue,
-                    TRUE,                    // Reduce to 16 bits
-                    fStatus);
+    utrie2_serialize(fTrie,
+                     where,                   // Buffer
+                     fTrieSize,               // Capacity
+                     fStatus);
 }
 
 //------------------------------------------------------------------------
 //
 //  addValToSets     Add a runtime-mapped input value to each uset from a
-//                   list of uset nodes.
+//                   list of uset nodes. (val corresponds to a state table column.)
 //                   For each of the original Unicode sets - which correspond
 //                   directly to uset nodes - a logically equivalent expression
 //                   is constructed in terms of the remapped runtime input
@@ -299,35 +341,75 @@ void  RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
 
     for (ix=0; ix<sets->size(); ix++) {
         RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
-        RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
-        leafNode->fVal = (unsigned short)val;
-        if (usetNode->fLeftChild == NULL) {
-            usetNode->fLeftChild = leafNode;
-            leafNode->fParent    = usetNode;
-        } else {
-            // There are already input symbols present for this set.
-            // Set up an OR node, with the previous stuff as the left child
-            //   and the new value as the right child.
-            RBBINode *orNode = new RBBINode(RBBINode::opOr);
-            orNode->fLeftChild  = usetNode->fLeftChild;
-            orNode->fRightChild = leafNode;
-            orNode->fLeftChild->fParent  = orNode;
-            orNode->fRightChild->fParent = orNode;
-            usetNode->fLeftChild = orNode;
-            orNode->fParent = usetNode;
+        addValToSet(usetNode, val);
+    }
+}
+
+void  RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
+    RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
+    if (leafNode == NULL) {
+        *fStatus = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    leafNode->fVal = (unsigned short)val;
+    if (usetNode->fLeftChild == NULL) {
+        usetNode->fLeftChild = leafNode;
+        leafNode->fParent    = usetNode;
+    } else {
+        // There are already input symbols present for this set.
+        // Set up an OR node, with the previous stuff as the left child
+        //   and the new value as the right child.
+        RBBINode *orNode = new RBBINode(RBBINode::opOr);
+        if (orNode == NULL) {
+            *fStatus = U_MEMORY_ALLOCATION_ERROR;
+            return;
         }
+        orNode->fLeftChild  = usetNode->fLeftChild;
+        orNode->fRightChild = leafNode;
+        orNode->fLeftChild->fParent  = orNode;
+        orNode->fRightChild->fParent = orNode;
+        usetNode->fLeftChild = orNode;
+        orNode->fParent = usetNode;
     }
 }
 
 
+//------------------------------------------------------------------------
+//
+//   getNumCharCategories
+//
+//------------------------------------------------------------------------
+int32_t  RBBISetBuilder::getNumCharCategories() const {
+    return fGroupCount + 3;
+}
+
 
 //------------------------------------------------------------------------
 //
-//   getNumOutputSets
+//   sawBOF
 //
 //------------------------------------------------------------------------
-int32_t  RBBISetBuilder::getNumCharCategories() {
-    return fGroupCount + 1;
+UBool  RBBISetBuilder::sawBOF() const {
+    return fSawBOF;
+}
+
+
+//------------------------------------------------------------------------
+//
+//   getFirstChar      Given a runtime RBBI character category, find
+//                     the first UChar32 that is in the set of chars 
+//                     in the category.
+//------------------------------------------------------------------------
+UChar32  RBBISetBuilder::getFirstChar(int32_t category) const {
+    RangeDescriptor   *rlRange;
+    UChar32            retVal = (UChar32)-1;
+    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
+        if (rlRange->fNum == category) {
+            retVal = rlRange->fStartChar;
+            break;
+        }
+    }
+    return retVal;
 }
 
 
@@ -338,8 +420,8 @@ int32_t  RBBISetBuilder::getNumCharCategories() {
 //                      dump out all of the range definitions.
 //
 //------------------------------------------------------------------------
-void RBBISetBuilder::printRanges() {
 #ifdef RBBI_DEBUG
+void RBBISetBuilder::printRanges() {
     RangeDescriptor       *rlRange;
     int                    i;
 
@@ -349,7 +431,7 @@ void RBBISetBuilder::printRanges() {
 
         for (i=0; i<rlRange->fIncludesSets->size(); i++) {
             RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
-            UnicodeString   setName = "anon";
+            UnicodeString   setName = UNICODE_STRING("anon", 4);
             RBBINode       *setRef = usetNode->fParent;
             if (setRef != NULL) {
                 RBBINode *varRef = setRef->fParent;
@@ -357,12 +439,12 @@ void RBBISetBuilder::printRanges() {
                     setName = varRef->fText;
                 }
             }
-            RBBINode::printUnicodeString(setName); RBBIDebugPrintf("  ");
+            RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf("  ");
         }
         RBBIDebugPrintf("\n");
     }
-#endif
 }
+#endif
 
 
 //------------------------------------------------------------------------
@@ -371,6 +453,7 @@ void RBBISetBuilder::printRanges() {
 //                        dump out all of the range groups.
 //
 //------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
 void RBBISetBuilder::printRangeGroups() {
     RangeDescriptor       *rlRange;
     RangeDescriptor       *tRange;
@@ -384,11 +467,11 @@ void RBBISetBuilder::printRangeGroups() {
             lastPrintedGroupNum = groupNum;
             RBBIDebugPrintf("%2i  ", groupNum);
 
-            if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}
+            if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" <DICT> ");}
 
             for (i=0; i<rlRange->fIncludesSets->size(); i++) {
                 RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
-                UnicodeString   setName = "anon";
+                UnicodeString   setName = UNICODE_STRING("anon", 4);
                 RBBINode       *setRef = usetNode->fParent;
                 if (setRef != NULL) {
                     RBBINode *varRef = setRef->fParent;
@@ -396,7 +479,7 @@ void RBBISetBuilder::printRangeGroups() {
                         setName = varRef->fText;
                     }
                 }
-                RBBINode::printUnicodeString(setName); RBBIDebugPrintf(" ");
+                RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
             }
 
             i = 0;
@@ -413,7 +496,7 @@ void RBBISetBuilder::printRangeGroups() {
     }
     RBBIDebugPrintf("\n");
 }
-
+#endif
 
 
 //------------------------------------------------------------------------
@@ -422,8 +505,8 @@ void RBBISetBuilder::printRangeGroups() {
 //                      dump out all of the set definitions.
 //
 //------------------------------------------------------------------------
-void RBBISetBuilder::printSets() {
 #ifdef RBBI_DEBUG
+void RBBISetBuilder::printSets() {
     int                   i;
 
     RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
@@ -439,7 +522,7 @@ void RBBISetBuilder::printSets() {
         }
 
         RBBIDebugPrintf("%3d    ", i);
-        setName = "anonymous";
+        setName = UNICODE_STRING("anonymous", 9);
         setRef = usetNode->fParent;
         if (setRef != NULL) {
             varRef = setRef->fParent;
@@ -447,17 +530,17 @@ void RBBISetBuilder::printSets() {
                 setName = varRef->fText;
             }
         }
-        RBBINode::printUnicodeString(setName);
+        RBBI_DEBUG_printUnicodeString(setName);
         RBBIDebugPrintf("   ");
-        RBBINode::printUnicodeString(usetNode->fText);
+        RBBI_DEBUG_printUnicodeString(usetNode->fText);
         RBBIDebugPrintf("\n");
         if (usetNode->fLeftChild != NULL) {
-            usetNode->fLeftChild->printTree();
+            RBBINode::printTree(usetNode->fLeftChild, TRUE);
         }
     }
     RBBIDebugPrintf("\n");
-#endif
 }
+#endif
 
 
 
@@ -474,7 +557,14 @@ RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &statu
     this->fEndChar      = other.fEndChar;
     this->fNum          = other.fNum;
     this->fNext         = NULL;
+    UErrorCode oldstatus = status;
     this->fIncludesSets = new UVector(status);
+    if (U_FAILURE(oldstatus)) {
+        status = oldstatus;
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
     /* test for NULL */
     if (this->fIncludesSets == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;
@@ -497,7 +587,14 @@ RangeDescriptor::RangeDescriptor(UErrorCode &status) {
     this->fEndChar      = 0;
     this->fNum          = 0;
     this->fNext         = NULL;
+    UErrorCode oldstatus = status;
     this->fIncludesSets = new UVector(status);
+    if (U_FAILURE(oldstatus)) {
+        status = oldstatus;
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
     /* test for NULL */
     if(this->fIncludesSets == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;
@@ -525,12 +622,14 @@ RangeDescriptor::~RangeDescriptor() {
 void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
     U_ASSERT(where>fStartChar && where<=fEndChar);
     RangeDescriptor *nr = new RangeDescriptor(*this, status);
-    /* test for NULL */
     if(nr == 0) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
-
+    if (U_FAILURE(status)) {
+        delete nr;
+        return;
+    }
     //  RangeDescriptor copy constructor copies all fields.
     //  Only need to update those that are different after the split.
     nr->fStartChar = where;
@@ -561,20 +660,20 @@ void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
 void RangeDescriptor::setDictionaryFlag() {
     int i;
 
-    for (i=0; i<this->fIncludesSets->size(); i++) {
-        RBBINode       *usetNode    = (RBBINode *)fIncludesSets->elementAt(i);
-        UnicodeString   setName;
-        RBBINode       *setRef = usetNode->fParent;
-        if (setRef != NULL) {
+    static const char16_t *dictionary = u"dictionary";
+    for (i=0; i<fIncludesSets->size(); i++) {
+        RBBINode *usetNode  = (RBBINode *)fIncludesSets->elementAt(i);
+        RBBINode *setRef = usetNode->fParent;
+        if (setRef != nullptr) {
             RBBINode *varRef = setRef->fParent;
-            if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
-                setName = varRef->fText;
+            if (varRef && varRef->fType == RBBINode::varRef) {
+                const UnicodeString *setName = &varRef->fText;
+                if (setName->compare(dictionary, -1) == 0) {
+                    fNum |= RBBISetBuilder::DICT_BIT;
+                    break;
+                }
             }
         }
-        if (setName.compare("dictionary") == 0) {   // TODO:  no string literals.
-            this->fNum |= 0x4000;
-            break;
-        }
     }
 }