X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/common/rbbirb.cpp diff --git a/icuSources/common/rbbirb.cpp b/icuSources/common/rbbirb.cpp index 273ef9d1..08c57769 100644 --- a/icuSources/common/rbbirb.cpp +++ b/icuSources/common/rbbirb.cpp @@ -1,7 +1,9 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html // // file: rbbirb.cpp // -// Copyright (C) 2002-2004, International Business Machines Corporation and others. +// Copyright (C) 2002-2011, International Business Machines Corporation and others. // All Rights Reserved. // // This file contains the RBBIRuleBuilder class implementation. This is the main class for @@ -22,16 +24,16 @@ #include "unicode/uchriter.h" #include "unicode/parsepos.h" #include "unicode/parseerr.h" + #include "cmemory.h" #include "cstring.h" - #include "rbbirb.h" #include "rbbinode.h" - #include "rbbiscan.h" #include "rbbisetb.h" #include "rbbitblb.h" #include "rbbidata.h" +#include "uassert.h" U_NAMESPACE_BEGIN @@ -43,12 +45,12 @@ U_NAMESPACE_BEGIN // //---------------------------------------------------------------------------------------- RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, - UParseError &parseErr, + UParseError *parseErr, UErrorCode &status) - : fRules(rules) + : fRules(rules), fStrippedRules(rules) { fStatus = &status; // status is checked below - fParseError = &parseErr; + fParseError = parseErr; fDebugEnv = NULL; #ifdef RBBI_DEBUG fDebugEnv = getenv("U_RBBIDEBUG"); @@ -60,10 +62,7 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, fSafeFwdTree = NULL; fSafeRevTree = NULL; fDefaultTree = &fForwardTree; - fForwardTables = NULL; - fReverseTables = NULL; - fSafeFwdTables = NULL; - fSafeRevTables = NULL; + fForwardTable = NULL; fRuleStatusVals = NULL; fChainRules = FALSE; fLBCMNoChain = FALSE; @@ -72,6 +71,9 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, fRuleStatusVals = NULL; fScanner = NULL; fSetBuilder = NULL; + if (parseErr) { + uprv_memset(parseErr, 0, sizeof(UParseError)); + } if (U_FAILURE(status)) { return; @@ -109,11 +111,7 @@ RBBIRuleBuilder::~RBBIRuleBuilder() { delete fUSetNodes; delete fSetBuilder; - delete fForwardTables; - delete fReverseTables; - delete fSafeFwdTables; - delete fSafeRevTables; - + delete fForwardTable; delete fForwardTree; delete fReverseTree; delete fSafeFwdTree; @@ -142,8 +140,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { return NULL; } - // Remove comments and whitespace from the rules to make it smaller. - UnicodeString strippedRules(RBBIRuleScanner::stripRules(fRules)); + // Remove whitespace from the rules to make it smaller. + // The rule parser has already removed comments. + fStrippedRules = fScanner->stripRules(fStrippedRules); // Calculate the size of each section in the data. // Sizes here are padded up to a multiple of 8 for better memory alignment. @@ -151,16 +150,15 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { // without the padding. // int32_t headerSize = align8(sizeof(RBBIDataHeader)); - int32_t forwardTableSize = align8(fForwardTables->getTableSize()); - int32_t reverseTableSize = align8(fReverseTables->getTableSize()); - int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize()); - int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); + int32_t forwardTableSize = align8(fForwardTable->getTableSize()); + int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize()); int32_t trieSize = align8(fSetBuilder->getTrieSize()); int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); - int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar)); + int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar)); - int32_t totalSize = headerSize + forwardTableSize + reverseTableSize - + safeFwdTableSize + safeRevTableSize + int32_t totalSize = headerSize + + forwardTableSize + + reverseTableSize + statusTableSize + trieSize + rulesSize; RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); @@ -171,33 +169,31 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { uprv_memset(data, 0, totalSize); - data->fMagic = 0xb1a0; - data->fVersion = 1; - data->fLength = totalSize; - data->fCatCount = fSetBuilder->getNumCharCategories(); + data->fMagic = 0xb1a0; + data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0]; + data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1]; + data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2]; + data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3]; + data->fLength = totalSize; + data->fCatCount = fSetBuilder->getNumCharCategories(); data->fFTable = headerSize; data->fFTableLen = forwardTableSize; - data->fRTable = data->fFTable + forwardTableSize; + + data->fRTable = data->fFTable + data->fFTableLen; data->fRTableLen = reverseTableSize; - data->fSFTable = data->fRTable + reverseTableSize; - data->fSFTableLen = safeFwdTableSize; - data->fSRTable = data->fSFTable + safeFwdTableSize; - data->fSRTableLen = safeRevTableSize; - data->fTrie = data->fSRTable + safeRevTableSize; + data->fTrie = data->fRTable + data->fRTableLen; data->fTrieLen = fSetBuilder->getTrieSize(); data->fStatusTable = data->fTrie + trieSize; data->fStatusTableLen= statusTableSize; data->fRuleSource = data->fStatusTable + statusTableSize; - data->fRuleSourceLen = strippedRules.length() * sizeof(UChar); + data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar); uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); - fForwardTables->exportTable((uint8_t *)data + data->fFTable); - fReverseTables->exportTable((uint8_t *)data + data->fRTable); - fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable); - fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); + fForwardTable->exportTable((uint8_t *)data + data->fFTable); + fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable); fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); @@ -205,16 +201,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { ruleStatusTable[i] = fRuleStatusVals->elementAti(i); } - strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); + fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); return data; } - - - - //---------------------------------------------------------------------------------------- // // createRuleBasedBreakIterator construct from source rules that are passed in @@ -223,85 +215,112 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { //---------------------------------------------------------------------------------------- BreakIterator * RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, - UParseError &parseError, + UParseError *parseError, UErrorCode &status) { - // status checked below - // // Read the input rules, generate a parse tree, symbol table, // and list of all Unicode Sets referenced by the rules. // RBBIRuleBuilder builder(rules, parseError, status); - builder.fScanner->parse(); if (U_FAILURE(status)) { // status checked here bcos build below doesn't return NULL; } - // - // UnicodeSet processing. - // Munge the Unicode Sets to create a set of character categories. - // Generate the mapping tables (TRIE) from input 32-bit characters to - // the character categories. - // - builder.fSetBuilder->build(); + RBBIDataHeader *data = builder.build(status); + if (U_FAILURE(status)) { + return nullptr; + } // - // Generate the DFA state transition table. + // Create a break iterator from the compiled rules. + // (Identical to creation from stored pre-compiled rules) // - builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree); - builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree); - builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree); - builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree); - if (U_SUCCESS(status) - && (builder.fForwardTables == NULL || builder.fReverseTables == NULL || - builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)) - { + // status is checked after init in construction. + RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); + if (U_FAILURE(status)) { + delete This; + This = NULL; + } + else if(This == NULL) { // test for NULL status = U_MEMORY_ALLOCATION_ERROR; - return NULL; } + return This; +} - builder.fForwardTables->build(); - builder.fReverseTables->build(); - builder.fSafeFwdTables->build(); - builder.fSafeRevTables->build(); +RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { if (U_FAILURE(status)) { - return NULL; + return nullptr; } -#ifdef RBBI_DEBUG - if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) { - builder.fForwardTables->printRuleStatusTable(); + fScanner->parse(); + if (U_FAILURE(status)) { + return nullptr; } -#endif // - // Package up the compiled data into a memory image - // in the run-time format. + // UnicodeSet processing. + // Munge the Unicode Sets to create a set of character categories. + // Generate the mapping tables (TRIE) from input code points to + // the character categories. // - RBBIDataHeader *data = builder.flattenData(); // returns NULL if error - + fSetBuilder->buildRanges(); // - // Clean up the compiler related stuff + // Generate the DFA state transition table. // + fForwardTable = new RBBITableBuilder(this, &fForwardTree, status); + if (fForwardTable == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + + fForwardTable->buildForwardTable(); + optimizeTables(); + fForwardTable->buildSafeReverseTable(status); + + +#ifdef RBBI_DEBUG + if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) { + fForwardTable->printStates(); + fForwardTable->printRuleStatusTable(); + fForwardTable->printReverseTable(); + } +#endif + fSetBuilder->buildTrie(); // - // Create a break iterator from the compiled rules. - // (Identical to creation from stored pre-compiled rules) + // Package up the compiled data into a memory image + // in the run-time format. // - // status is checked after init in construction. - RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); + RBBIDataHeader *data = flattenData(); // returns NULL if error if (U_FAILURE(status)) { - delete This; - This = NULL; - } - else if(This == NULL) { // test for NULL - status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; } - return This; + return data; +} + +void RBBIRuleBuilder::optimizeTables() { + bool didSomething; + do { + didSomething = false; + + // Begin looking for duplicates with char class 3. + // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, + // and should not have other categories merged into them. + IntPair duplPair = {3, 0}; + while (fForwardTable->findDuplCharClassFrom(&duplPair)) { + fSetBuilder->mergeCategories(duplPair); + fForwardTable->removeColumn(duplPair.second); + didSomething = true; + } + + while (fForwardTable->removeDuplicateStates() > 0) { + didSomething = true; + } + } while (didSomething); } U_NAMESPACE_END