X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/f3c0d7a59d99c2a94c6b8822291f0e42be3773c9..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/common/rbbirb.cpp diff --git a/icuSources/common/rbbirb.cpp b/icuSources/common/rbbirb.cpp index f4fa13c9..08c57769 100644 --- a/icuSources/common/rbbirb.cpp +++ b/icuSources/common/rbbirb.cpp @@ -24,16 +24,16 @@ #include "unicode/uchriter.h" #include "unicode/parsepos.h" #include "unicode/parseerr.h" + #include "cmemory.h" #include "cstring.h" - #include "rbbirb.h" #include "rbbinode.h" - #include "rbbiscan.h" #include "rbbisetb.h" #include "rbbitblb.h" #include "rbbidata.h" +#include "uassert.h" U_NAMESPACE_BEGIN @@ -47,7 +47,7 @@ U_NAMESPACE_BEGIN RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, UParseError *parseErr, UErrorCode &status) - : fRules(rules) + : fRules(rules), fStrippedRules(rules) { fStatus = &status; // status is checked below fParseError = parseErr; @@ -62,10 +62,7 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, fSafeFwdTree = NULL; fSafeRevTree = NULL; fDefaultTree = &fForwardTree; - fForwardTables = NULL; - fReverseTables = NULL; - fSafeFwdTables = NULL; - fSafeRevTables = NULL; + fForwardTable = NULL; fRuleStatusVals = NULL; fChainRules = FALSE; fLBCMNoChain = FALSE; @@ -114,11 +111,7 @@ RBBIRuleBuilder::~RBBIRuleBuilder() { delete fUSetNodes; delete fSetBuilder; - delete fForwardTables; - delete fReverseTables; - delete fSafeFwdTables; - delete fSafeRevTables; - + delete fForwardTable; delete fForwardTree; delete fReverseTree; delete fSafeFwdTree; @@ -147,8 +140,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { return NULL; } - // Remove comments and whitespace from the rules to make it smaller. - UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules)); + // Remove whitespace from the rules to make it smaller. + // The rule parser has already removed comments. + fStrippedRules = fScanner->stripRules(fStrippedRules); // Calculate the size of each section in the data. // Sizes here are padded up to a multiple of 8 for better memory alignment. @@ -156,16 +150,15 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { // without the padding. // int32_t headerSize = align8(sizeof(RBBIDataHeader)); - int32_t forwardTableSize = align8(fForwardTables->getTableSize()); - int32_t reverseTableSize = align8(fReverseTables->getTableSize()); - int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize()); - int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); + int32_t forwardTableSize = align8(fForwardTable->getTableSize()); + int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize()); int32_t trieSize = align8(fSetBuilder->getTrieSize()); int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); - int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar)); + int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar)); - int32_t totalSize = headerSize + forwardTableSize + reverseTableSize - + safeFwdTableSize + safeRevTableSize + int32_t totalSize = headerSize + + forwardTableSize + + reverseTableSize + statusTableSize + trieSize + rulesSize; RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); @@ -177,35 +170,30 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { data->fMagic = 0xb1a0; - data->fFormatVersion[0] = 3; - data->fFormatVersion[1] = 1; - data->fFormatVersion[2] = 0; - data->fFormatVersion[3] = 0; + data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0]; + data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1]; + data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2]; + data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3]; data->fLength = totalSize; data->fCatCount = fSetBuilder->getNumCharCategories(); data->fFTable = headerSize; data->fFTableLen = forwardTableSize; - data->fRTable = data->fFTable + forwardTableSize; + + data->fRTable = data->fFTable + data->fFTableLen; data->fRTableLen = reverseTableSize; - data->fSFTable = data->fRTable + reverseTableSize; - data->fSFTableLen = safeFwdTableSize; - data->fSRTable = data->fSFTable + safeFwdTableSize; - data->fSRTableLen = safeRevTableSize; - data->fTrie = data->fSRTable + safeRevTableSize; + data->fTrie = data->fRTable + data->fRTableLen; data->fTrieLen = fSetBuilder->getTrieSize(); data->fStatusTable = data->fTrie + trieSize; data->fStatusTableLen= statusTableSize; data->fRuleSource = data->fStatusTable + statusTableSize; - data->fRuleSourceLen = strippedRules.length() * sizeof(UChar); + data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar); uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); - fForwardTables->exportTable((uint8_t *)data + data->fFTable); - fReverseTables->exportTable((uint8_t *)data + data->fRTable); - fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable); - fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); + fForwardTable->exportTable((uint8_t *)data + data->fFTable); + fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable); fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); @@ -213,16 +201,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { ruleStatusTable[i] = fRuleStatusVals->elementAti(i); } - strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); + fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); return data; } - - - - //---------------------------------------------------------------------------------------- // // createRuleBasedBreakIterator construct from source rules that are passed in @@ -234,8 +218,6 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, UParseError *parseError, UErrorCode &status) { - // status checked below - // // Read the input rules, generate a parse tree, symbol table, // and list of all Unicode Sets referenced by the rules. @@ -244,75 +226,101 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, if (U_FAILURE(status)) { // status checked here bcos build below doesn't return NULL; } - builder.fScanner->parse(); + + RBBIDataHeader *data = builder.build(status); + + if (U_FAILURE(status)) { + return nullptr; + } + + // + // Create a break iterator from the compiled rules. + // (Identical to creation from stored pre-compiled rules) + // + // status is checked after init in construction. + RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); + if (U_FAILURE(status)) { + delete This; + This = NULL; + } + else if(This == NULL) { // test for NULL + status = U_MEMORY_ALLOCATION_ERROR; + } + return This; +} + +RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { + if (U_FAILURE(status)) { + return nullptr; + } + + fScanner->parse(); + if (U_FAILURE(status)) { + return nullptr; + } // // UnicodeSet processing. // Munge the Unicode Sets to create a set of character categories. - // Generate the mapping tables (TRIE) from input 32-bit characters to + // Generate the mapping tables (TRIE) from input code points to // the character categories. // - builder.fSetBuilder->build(); - + fSetBuilder->buildRanges(); // // Generate the DFA state transition table. // - builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree); - builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree); - builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree); - builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree); - if (builder.fForwardTables == NULL || builder.fReverseTables == NULL || - builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL) - { + fForwardTable = new RBBITableBuilder(this, &fForwardTree, status); + if (fForwardTable == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; - delete builder.fForwardTables; builder.fForwardTables = NULL; - delete builder.fReverseTables; builder.fReverseTables = NULL; - delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL; - delete builder.fSafeRevTables; builder.fSafeRevTables = NULL; - return NULL; + return nullptr; } - builder.fForwardTables->build(); - builder.fReverseTables->build(); - builder.fSafeFwdTables->build(); - builder.fSafeRevTables->build(); + fForwardTable->buildForwardTable(); + optimizeTables(); + fForwardTable->buildSafeReverseTable(status); + #ifdef RBBI_DEBUG - if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) { - builder.fForwardTables->printRuleStatusTable(); + if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) { + fForwardTable->printStates(); + fForwardTable->printRuleStatusTable(); + fForwardTable->printReverseTable(); } #endif + fSetBuilder->buildTrie(); + // // Package up the compiled data into a memory image // in the run-time format. // - RBBIDataHeader *data = builder.flattenData(); // returns NULL if error - if (U_FAILURE(*builder.fStatus)) { - return NULL; + RBBIDataHeader *data = flattenData(); // returns NULL if error + if (U_FAILURE(status)) { + return nullptr; } + return data; +} +void RBBIRuleBuilder::optimizeTables() { + bool didSomething; + do { + didSomething = false; + + // Begin looking for duplicates with char class 3. + // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, + // and should not have other categories merged into them. + IntPair duplPair = {3, 0}; + while (fForwardTable->findDuplCharClassFrom(&duplPair)) { + fSetBuilder->mergeCategories(duplPair); + fForwardTable->removeColumn(duplPair.second); + didSomething = true; + } - // - // Clean up the compiler related stuff - // - - - // - // Create a break iterator from the compiled rules. - // (Identical to creation from stored pre-compiled rules) - // - // status is checked after init in construction. - RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); - if (U_FAILURE(status)) { - delete This; - This = NULL; - } - else if(This == NULL) { // test for NULL - status = U_MEMORY_ALLOCATION_ERROR; - } - return This; + while (fForwardTable->removeDuplicateStates() > 0) { + didSomething = true; + } + } while (didSomething); } U_NAMESPACE_END