]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/rbbirb.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / rbbirb.cpp
index a3d32c396ff6c45bf9582acda59e862d5cfe455e..08c577696c22549f905b0b890c083f6f418436c9 100644 (file)
@@ -1,7 +1,9 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 //
 //  file:  rbbirb.cpp
 //
-//  Copyright (C) 2002-2005, International Business Machines Corporation and others.
+//  Copyright (C) 2002-2011, International Business Machines Corporation and others.
 //  All Rights Reserved.
 //
 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
 #include "unicode/uchriter.h"
 #include "unicode/parsepos.h"
 #include "unicode/parseerr.h"
+
 #include "cmemory.h"
 #include "cstring.h"
-
 #include "rbbirb.h"
 #include "rbbinode.h"
-
 #include "rbbiscan.h"
 #include "rbbisetb.h"
 #include "rbbitblb.h"
 #include "rbbidata.h"
+#include "uassert.h"
 
 
 U_NAMESPACE_BEGIN
@@ -43,12 +45,12 @@ U_NAMESPACE_BEGIN
 //
 //----------------------------------------------------------------------------------------
 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
-                                       UParseError     &parseErr,
+                                       UParseError     *parseErr,
                                        UErrorCode      &status)
- : fRules(rules)
+ : fRules(rules), fStrippedRules(rules)
 {
     fStatus = &status; // status is checked below
-    fParseError = &parseErr;
+    fParseError = parseErr;
     fDebugEnv   = NULL;
 #ifdef RBBI_DEBUG
     fDebugEnv   = getenv("U_RBBIDEBUG");
@@ -60,10 +62,7 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
     fSafeFwdTree        = NULL;
     fSafeRevTree        = NULL;
     fDefaultTree        = &fForwardTree;
-    fForwardTables      = NULL;
-    fReverseTables      = NULL;
-    fSafeFwdTables      = NULL;
-    fSafeRevTables      = NULL;
+    fForwardTable       = NULL;
     fRuleStatusVals     = NULL;
     fChainRules         = FALSE;
     fLBCMNoChain        = FALSE;
@@ -72,6 +71,9 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
     fRuleStatusVals     = NULL;
     fScanner            = NULL;
     fSetBuilder         = NULL;
+    if (parseErr) {
+        uprv_memset(parseErr, 0, sizeof(UParseError));
+    }
 
     if (U_FAILURE(status)) {
         return;
@@ -109,11 +111,7 @@ RBBIRuleBuilder::~RBBIRuleBuilder() {
 
     delete fUSetNodes;
     delete fSetBuilder;
-    delete fForwardTables;
-    delete fReverseTables;
-    delete fSafeFwdTables;
-    delete fSafeRevTables;
-
+    delete fForwardTable;
     delete fForwardTree;
     delete fReverseTree;
     delete fSafeFwdTree;
@@ -142,8 +140,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
         return NULL;
     }
 
-    // Remove comments and whitespace from the rules to make it smaller.
-    UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
+    // Remove whitespace from the rules to make it smaller.
+    // The rule parser has already removed comments.
+    fStrippedRules = fScanner->stripRules(fStrippedRules);
 
     // Calculate the size of each section in the data.
     //   Sizes here are padded up to a multiple of 8 for better memory alignment.
@@ -151,16 +150,15 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
     //     without the padding.
     //
     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
-    int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
-    int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
-    int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
-    int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
+    int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
+    int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
-    int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
+    int32_t rulesSize         = align8((fStrippedRules.length()+1) * sizeof(UChar));
 
-    int32_t         totalSize = headerSize + forwardTableSize + reverseTableSize
-                                + safeFwdTableSize + safeRevTableSize 
+    int32_t         totalSize = headerSize
+                                + forwardTableSize
+                                + reverseTableSize
                                 + statusTableSize + trieSize + rulesSize;
 
     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
@@ -172,35 +170,30 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
 
 
     data->fMagic            = 0xb1a0;
-    data->fFormatVersion[0] = 3;
-    data->fFormatVersion[1] = 1;
-    data->fFormatVersion[2] = 0;
-    data->fFormatVersion[3] = 0;
+    data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
+    data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
+    data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
+    data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
     data->fLength           = totalSize;
     data->fCatCount         = fSetBuilder->getNumCharCategories();
 
     data->fFTable        = headerSize;
     data->fFTableLen     = forwardTableSize;
-    data->fRTable        = data->fFTable  + forwardTableSize;
+
+    data->fRTable        = data->fFTable  + data->fFTableLen;
     data->fRTableLen     = reverseTableSize;
-    data->fSFTable       = data->fRTable  + reverseTableSize;
-    data->fSFTableLen    = safeFwdTableSize;
-    data->fSRTable       = data->fSFTable + safeFwdTableSize;
-    data->fSRTableLen    = safeRevTableSize;
 
-    data->fTrie          = data->fSRTable + safeRevTableSize;
+    data->fTrie          = data->fRTable + data->fRTableLen;
     data->fTrieLen       = fSetBuilder->getTrieSize();
     data->fStatusTable   = data->fTrie    + trieSize;
     data->fStatusTableLen= statusTableSize;
     data->fRuleSource    = data->fStatusTable + statusTableSize;
-    data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
+    data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);
 
     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
 
-    fForwardTables->exportTable((uint8_t *)data + data->fFTable);
-    fReverseTables->exportTable((uint8_t *)data + data->fRTable);
-    fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
-    fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
+    fForwardTable->exportTable((uint8_t *)data + data->fFTable);
+    fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
 
     int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
@@ -208,16 +201,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
         ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
     }
 
-    strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
+    fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
 
     return data;
 }
 
 
-
-
-
-
 //----------------------------------------------------------------------------------------
 //
 //  createRuleBasedBreakIterator    construct from source rules that are passed in
@@ -226,85 +215,112 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
 //----------------------------------------------------------------------------------------
 BreakIterator *
 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
-                                    UParseError      &parseError,
+                                    UParseError      *parseError,
                                     UErrorCode       &status)
 {
-    // status checked below
-
     //
     // Read the input rules, generate a parse tree, symbol table,
     // and list of all Unicode Sets referenced by the rules.
     //
     RBBIRuleBuilder  builder(rules, parseError, status);
-    builder.fScanner->parse();
     if (U_FAILURE(status)) { // status checked here bcos build below doesn't
         return NULL;
     }
 
-    //
-    // UnicodeSet processing.
-    //    Munge the Unicode Sets to create a set of character categories.
-    //    Generate the mapping tables (TRIE) from input 32-bit characters to
-    //    the character categories.
-    //
-    builder.fSetBuilder->build();
+    RBBIDataHeader *data = builder.build(status);
 
+    if (U_FAILURE(status)) {
+        return nullptr;
+    }
 
     //
-    //   Generate the DFA state transition table.
+    //  Create a break iterator from the compiled rules.
+    //     (Identical to creation from stored pre-compiled rules)
     //
-    builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
-    builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
-    builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
-    builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
-    if (U_SUCCESS(status)
-        && (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
-            builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)) 
-    {
+    // status is checked after init in construction.
+    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
+    if (U_FAILURE(status)) {
+        delete This;
+        This = NULL;
+    } 
+    else if(This == NULL) { // test for NULL
         status = U_MEMORY_ALLOCATION_ERROR;
-        return NULL;
     }
+    return This;
+}
 
-    builder.fForwardTables->build();
-    builder.fReverseTables->build();
-    builder.fSafeFwdTables->build();
-    builder.fSafeRevTables->build();
+RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
     if (U_FAILURE(status)) {
-        return NULL;
+        return nullptr;
     }
 
-#ifdef RBBI_DEBUG
-    if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
-        builder.fForwardTables->printRuleStatusTable();
+    fScanner->parse();
+    if (U_FAILURE(status)) {
+        return nullptr;
     }
-#endif
 
     //
-    //   Package up the compiled data into a memory image
-    //      in the run-time format.
+    // UnicodeSet processing.
+    //    Munge the Unicode Sets to create a set of character categories.
+    //    Generate the mapping tables (TRIE) from input code points to
+    //    the character categories.
     //
-    RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
-
+    fSetBuilder->buildRanges();
 
     //
-    //  Clean up the compiler related stuff
+    //   Generate the DFA state transition table.
     //
+    fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
+    if (fForwardTable == nullptr) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return nullptr;
+    }
+
+    fForwardTable->buildForwardTable();
+    optimizeTables();
+    fForwardTable->buildSafeReverseTable(status);
+
+
+#ifdef RBBI_DEBUG
+    if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
+        fForwardTable->printStates();
+        fForwardTable->printRuleStatusTable();
+        fForwardTable->printReverseTable();
+    }
+#endif
 
+    fSetBuilder->buildTrie();
 
     //
-    //  Create a break iterator from the compiled rules.
-    //     (Identical to creation from stored pre-compiled rules)
+    //   Package up the compiled data into a memory image
+    //      in the run-time format.
     //
-    // status is checked after init in construction.
-    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
+    RBBIDataHeader *data = flattenData(); // returns NULL if error
     if (U_FAILURE(status)) {
-        delete This;
-        This = NULL;
-    } 
-    else if(This == NULL) { // test for NULL
-        status = U_MEMORY_ALLOCATION_ERROR;
+        return nullptr;
     }
-    return This;
+    return data;
+}
+
+void RBBIRuleBuilder::optimizeTables() {
+    bool didSomething;
+    do {
+        didSomething = false;
+
+        // Begin looking for duplicates with char class 3.
+        // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
+        // and should not have other categories merged into them.
+        IntPair duplPair = {3, 0};
+        while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
+            fSetBuilder->mergeCategories(duplPair);
+            fForwardTable->removeColumn(duplPair.second);
+            didSomething = true;
+        }
+
+        while (fForwardTable->removeDuplicateStates() > 0) {
+            didSomething = true;
+        }
+    } while (didSomething);
 }
 
 U_NAMESPACE_END