X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..cecc3f9394f261e71def48cf396d137687dbd0a7:/icuSources/common/rbbi.cpp diff --git a/icuSources/common/rbbi.cpp b/icuSources/common/rbbi.cpp index 10216c22..eb83a420 100644 --- a/icuSources/common/rbbi.cpp +++ b/icuSources/common/rbbi.cpp @@ -1,15 +1,19 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* *************************************************************************** -* Copyright (C) 1999-2006 International Business Machines Corporation * -* and others. All rights reserved. * +* Copyright (C) 1999-2016 International Business Machines Corporation +* and others. All rights reserved. *************************************************************************** */ // -// file: rbbi.c Contains the implementation of the rule based break iterator +// file: rbbi.cpp Contains the implementation of the rule based break iterator // runtime engine and the API implementation for // class RuleBasedBreakIterator // +#include "utypeinfo.h" // for 'typeid' to work + #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION @@ -17,33 +21,39 @@ #include "unicode/rbbi.h" #include "unicode/schriter.h" #include "unicode/uchriter.h" -#include "unicode/udata.h" #include "unicode/uclean.h" -#include "rbbidata.h" -#include "rbbirb.h" +#include "unicode/udata.h" + +#include "brkeng.h" +#include "ucln_cmn.h" #include "cmemory.h" #include "cstring.h" -#include "mutex.h" -#include "ucln_cmn.h" -#include "brkeng.h" - +#include "rbbidata.h" +#include "rbbi_cache.h" +#include "rbbirb.h" #include "uassert.h" -#include "uvector.h" +#include "umutex.h" +#include "uvectr32.h" // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. #if U_LOCAL_SERVICE_HOOK #include "localsvc.h" #endif +// Apple specific +//#include + #ifdef RBBI_DEBUG -static UBool fTrace = FALSE; +static UBool gTrace = FALSE; #endif U_NAMESPACE_BEGIN +// The state number of the starting state +constexpr int32_t START_STATE = 1; -static const int16_t START_STATE = 1; // The state number of the starting state -static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop" +// The state-transition value indicating "stop" +constexpr int32_t STOP_STATE = 0; UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) @@ -58,8 +68,9 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) * tables object that is passed in as a parameter. */ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) + : fSCharIter(UnicodeString()) { - init(); + init(status); fData = new RBBIDataWrapper(data, status); // status checked in constructor if (U_FAILURE(status)) {return;} if(fData == 0) { @@ -68,6 +79,37 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode } } +// +// Construct from precompiled binary rules (tables). This constructor is public API, +// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). +// +RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, + uint32_t ruleLength, + UErrorCode &status) + : fSCharIter(UnicodeString()) +{ + init(status); + if (U_FAILURE(status)) { + return; + } + if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; + if (data->fLength > ruleLength) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); + if (U_FAILURE(status)) {return;} + if(fData == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } +} + + //------------------------------------------------------------------------------- // // Constructor from a UDataMemory handle to precompiled break rules @@ -75,8 +117,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode // //------------------------------------------------------------------------------- RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) + : fSCharIter(UnicodeString()) { - init(); + init(status); fData = new RBBIDataWrapper(udm, status); // status checked in constructor if (U_FAILURE(status)) {return;} if(fData == 0) { @@ -95,11 +138,12 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, UParseError &parseError, UErrorCode &status) + : fSCharIter(UnicodeString()) { - init(); + init(status); if (U_FAILURE(status)) {return;} RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) - RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status); + RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that // creates and returns a complete RBBI. From here, in a constructor, we // can't just return the object created by the builder factory, hence @@ -117,8 +161,11 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, // Used when creating a RuleBasedBreakIterator from a set // of rules. //------------------------------------------------------------------------------- -RuleBasedBreakIterator::RuleBasedBreakIterator() { - init(); +RuleBasedBreakIterator::RuleBasedBreakIterator() + : fSCharIter(UnicodeString()) +{ + UErrorCode status = U_ZERO_ERROR; + init(status); } @@ -129,9 +176,11 @@ RuleBasedBreakIterator::RuleBasedBreakIterator() { // //------------------------------------------------------------------------------- RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) -: BreakIterator(other) +: BreakIterator(other), + fSCharIter(UnicodeString()) { - this->init(); + UErrorCode status = U_ZERO_ERROR; + this->init(status); *this = other; } @@ -140,34 +189,32 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth * Destructor */ RuleBasedBreakIterator::~RuleBasedBreakIterator() { - if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { + if (fCharIter != &fSCharIter) { // fCharIter was adopted from the outside. delete fCharIter; } fCharIter = NULL; - delete fSCharIter; - fCharIter = NULL; - delete fDCharIter; - fDCharIter = NULL; - - utext_close(fText); + + utext_close(&fText); if (fData != NULL) { fData->removeReference(); fData = NULL; } - if (fCachedBreakPositions) { - uprv_free(fCachedBreakPositions); - fCachedBreakPositions = NULL; - } - if (fLanguageBreakEngines) { - delete fLanguageBreakEngines; - fLanguageBreakEngines = NULL; - } - if (fUnhandledBreakEngine) { - delete fUnhandledBreakEngine; - fUnhandledBreakEngine = NULL; - } + delete fBreakCache; + fBreakCache = NULL; + + delete fDictionaryCache; + fDictionaryCache = NULL; + + delete fLanguageBreakEngines; + fLanguageBreakEngines = NULL; + + delete fUnhandledBreakEngine; + fUnhandledBreakEngine = NULL; + + delete [] fLatin1Cat; + fLatin1Cat = NULL; } /** @@ -179,27 +226,32 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { if (this == &that) { return *this; } - reset(); // Delete break cache information - fBreakType = that.fBreakType; + BreakIterator::operator=(that); + fLineWordOpts = that.fLineWordOpts; + if (fLanguageBreakEngines != NULL) { delete fLanguageBreakEngines; fLanguageBreakEngines = NULL; // Just rebuild for now } // TODO: clone fLanguageBreakEngines from "that" UErrorCode status = U_ZERO_ERROR; - fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); + utext_clone(&fText, &that.fText, FALSE, TRUE, &status); - if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { + if (fCharIter != &fSCharIter) { delete fCharIter; } - fCharIter = NULL; + fCharIter = &fSCharIter; - if (that.fCharIter != NULL ) { + if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) { // This is a little bit tricky - it will intially appear that // this->fCharIter is adopted, even if that->fCharIter was // not adopted. That's ok. fCharIter = that.fCharIter->clone(); } + fSCharIter = that.fSCharIter; + if (fCharIter == NULL) { + fCharIter = &fSCharIter; + } if (fData != NULL) { fData->removeReference(); @@ -209,6 +261,20 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { fData = that.fData->addReference(); } + delete [] fLatin1Cat; + fLatin1Cat = NULL; + + fPosition = that.fPosition; + fRuleStatusIndex = that.fRuleStatusIndex; + fDone = that.fDone; + + // TODO: both the dictionary and the main cache need to be copied. + // Current position could be within a dictionary range. Trying to continue + // the iteration without the caches present would go to the rules, with + // the assumption that the current position is on a rule boundary. + fBreakCache->reset(fPosition, fRuleStatusIndex); + fDictionaryCache->reset(); + return *this; } @@ -220,31 +286,41 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { // Initializes all fields, leaving the object in a consistent state. // //----------------------------------------------------------------------------- -void RuleBasedBreakIterator::init() { - UErrorCode status = U_ZERO_ERROR; - fBufferClone = FALSE; - fText = utext_openUChars(NULL, NULL, 0, &status); +void RuleBasedBreakIterator::init(UErrorCode &status) { fCharIter = NULL; - fSCharIter = NULL; - fDCharIter = NULL; fData = NULL; - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; + fLatin1Cat = NULL; + fPosition = 0; + fRuleStatusIndex = 0; + fDone = false; fDictionaryCharCount = 0; - fBreakType = -1; + fLanguageBreakEngines = NULL; + fUnhandledBreakEngine = NULL; + fBreakCache = NULL; + fDictionaryCache = NULL; - fCachedBreakPositions = NULL; - fLanguageBreakEngines = NULL; - fUnhandledBreakEngine = NULL; - fNumCachedBreakPositions = 0; - fPositionInCache = 0; + // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER. + // fText = UTEXT_INITIALIZER; + static const UText initializedUText = UTEXT_INITIALIZER; + uprv_memcpy(&fText, &initializedUText, sizeof(UText)); + + if (U_FAILURE(status)) { + return; + } + + utext_openUChars(&fText, NULL, 0, &status); + fDictionaryCache = new DictionaryCache(this, status); + fBreakCache = new BreakCache(this, status); + if (U_SUCCESS(status) && (fDictionaryCache == NULL || fBreakCache == NULL)) { + status = U_MEMORY_ALLOCATION_ERROR; + } #ifdef RBBI_DEBUG static UBool debugInitDone = FALSE; if (debugInitDone == FALSE) { char *debugEnv = getenv("U_RBBIDEBUG"); if (debugEnv && uprv_strstr(debugEnv, "trace")) { - fTrace = TRUE; + gTrace = TRUE; } debugInitDone = TRUE; } @@ -252,6 +328,12 @@ void RuleBasedBreakIterator::init() { } +void RuleBasedBreakIterator::initLatin1Cat(void) { + fLatin1Cat = new uint16_t[256]; + for (UChar32 c = 0; c < 256; ++c) { + fLatin1Cat[c] = UTRIE2_GET16(fData->fTrie, c); + } +} //----------------------------------------------------------------------------- // @@ -271,19 +353,34 @@ RuleBasedBreakIterator::clone(void) const { */ UBool RuleBasedBreakIterator::operator==(const BreakIterator& that) const { - if (that.getDynamicClassID() != getDynamicClassID()) { + if (typeid(*this) != typeid(that)) { return FALSE; } + if (this == &that) { + return TRUE; + } + + // The base class BreakIterator carries no state that participates in equality, + // and does not implement an equality function that would otherwise be + // checked at this point. const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; + if (that2.fLineWordOpts != fLineWordOpts) { + return FALSE; + } - if (!utext_equals(fText, that2.fText)) { + if (!utext_equals(&fText, &that2.fText)) { // The two break iterators are operating on different text, - // or have a different interation position. + // or have a different iteration position. + // Note that fText's position is always the same as the break iterator's position. return FALSE; }; - // TODO: need a check for when in a dictionary region at different offsets. + if (!(fPosition == that2.fPosition && + fRuleStatusIndex == that2.fRuleStatusIndex && + fDone == that2.fDone)) { + return FALSE; + } if (that2.fData == fData || (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { @@ -311,8 +408,9 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { if (U_FAILURE(status)) { return; } - reset(); - fText = utext_clone(fText, ut, FALSE, TRUE, &status); + fBreakCache->reset(); + fDictionaryCache->reset(); + utext_clone(&fText, ut, FALSE, TRUE, &status); // Set up a dummy CharacterIterator to be returned if anyone // calls getText(). With input from UText, there is no reasonable @@ -320,54 +418,30 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { // Return one over an empty string instead - this is the closest // we can come to signaling a failure. // (GetText() is obsolete, this failure is sort of OK) - if (fDCharIter == NULL) { - static UChar c = 0; - fDCharIter = new UCharCharacterIterator(&c, 0); - } + fSCharIter.setText(UnicodeString()); - if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { + if (fCharIter != &fSCharIter) { // existing fCharIter was adopted from the outside. Delete it now. delete fCharIter; } - fCharIter = fDCharIter; + fCharIter = &fSCharIter; this->first(); } UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { - UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); + UText *result = utext_clone(fillIn, &fText, FALSE, TRUE, &status); return result; } - -/** - * Returns the description used to create this iterator - */ -const UnicodeString& -RuleBasedBreakIterator::getRules() const { - if (fData != NULL) { - return fData->getRuleSourceString(); - } else { - static const UnicodeString *s; - if (s == NULL) { - // TODO: something more elegant here. - // perhaps API should return the string by value. - // Note: thread unsafe init & leak are semi-ok, better than - // what was before. Sould be cleaned up, though. - s = new UnicodeString; - } - return *s; - } -} - //======================================================================= // BreakIterator overrides //======================================================================= /** - * Return a CharacterIterator over the text being analyzed. + * Return a CharacterIterator over the text being analyzed. */ CharacterIterator& RuleBasedBreakIterator::getText() const { @@ -381,21 +455,22 @@ RuleBasedBreakIterator::getText() const { */ void RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { - // If we are holding a CharacterIterator adopted from a + // If we are holding a CharacterIterator adopted from a // previous call to this function, delete it now. - if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { + if (fCharIter != &fSCharIter) { delete fCharIter; } fCharIter = newText; UErrorCode status = U_ZERO_ERROR; - reset(); - if (newText==NULL || newText->startIndex() != 0) { + fBreakCache->reset(); + fDictionaryCache->reset(); + if (newText==NULL || newText->startIndex() != 0) { // startIndex !=0 wants to be an error, but there's no way to report it. // Make the iterator text be an empty string. - fText = utext_openUChars(fText, NULL, 0, &status); + utext_openUChars(&fText, NULL, 0, &status); } else { - fText = utext_openCharacterIterator(fText, newText, &status); + utext_openCharacterIterator(&fText, newText, &status); } this->first(); } @@ -408,42 +483,69 @@ RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { void RuleBasedBreakIterator::setText(const UnicodeString& newText) { UErrorCode status = U_ZERO_ERROR; - reset(); - fText = utext_openConstUnicodeString(fText, &newText, &status); + fBreakCache->reset(); + fDictionaryCache->reset(); + utext_openConstUnicodeString(&fText, &newText, &status); - // Set up a character iterator on the string. + // Set up a character iterator on the string. // Needed in case someone calls getText(). // Can not, unfortunately, do this lazily on the (probably never) // call to getText(), because getText is const. - if (fSCharIter == NULL) { - fSCharIter = new StringCharacterIterator(newText); - } else { - fSCharIter->setText(newText); - } + fSCharIter.setText(newText); - if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { + if (fCharIter != &fSCharIter) { // old fCharIter was adopted from the outside. Delete it. delete fCharIter; } - fCharIter = fSCharIter; + fCharIter = &fSCharIter; this->first(); } +/** + * Provide a new UText for the input text. Must reference text with contents identical + * to the original. + * Intended for use with text data originating in Java (garbage collected) environments + * where the data may be moved in memory at arbitrary times. + */ +RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + if (input == NULL) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + int64_t pos = utext_getNativeIndex(&fText); + // Shallow read-only clone of the new UText into the existing input UText + utext_clone(&fText, input, FALSE, TRUE, &status); + if (U_FAILURE(status)) { + return *this; + } + utext_setNativeIndex(&fText, pos); + if (utext_getNativeIndex(&fText) != pos) { + // Sanity check. The new input utext is supposed to have the exact same + // contents as the old. If we can't set to the same position, it doesn't. + // The contents underlying the old utext might be invalid at this point, + // so it's not safe to check directly. + status = U_ILLEGAL_ARGUMENT_ERROR; + } + return *this; +} + /** - * Sets the current iteration position to the beginning of the text. - * @return The offset of the beginning of the text. + * Sets the current iteration position to the beginning of the text, position zero. + * @return The new iterator position, which is zero. */ int32_t RuleBasedBreakIterator::first(void) { - reset(); - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - //if (fText == NULL) - // return BreakIterator::DONE; - - utext_setNativeIndex(fText, 0); + UErrorCode status = U_ZERO_ERROR; + if (!fBreakCache->seek(0)) { + fBreakCache->populateNear(0, status); + } + fBreakCache->current(); + U_ASSERT(fPosition == 0); return 0; } @@ -452,17 +554,12 @@ int32_t RuleBasedBreakIterator::first(void) { * @return The text's past-the-end offset. */ int32_t RuleBasedBreakIterator::last(void) { - reset(); - if (fText == NULL) { - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - return BreakIterator::DONE; - } - - fLastStatusIndexValid = FALSE; - int32_t pos = (int32_t)utext_nativeLength(fText); - utext_setNativeIndex(fText, pos); - return pos; + int32_t endPos = (int32_t)utext_nativeLength(&fText); + UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. + (void)endShouldBeBoundary; + U_ASSERT(endShouldBeBoundary); + U_ASSERT(fPosition == endPos); + return endPos; } /** @@ -475,14 +572,17 @@ int32_t RuleBasedBreakIterator::last(void) { * the current one. */ int32_t RuleBasedBreakIterator::next(int32_t n) { - int32_t result = current(); - while (n > 0) { - result = next(); - --n; - } - while (n < 0) { - result = previous(); - ++n; + int32_t result = 0; + if (n > 0) { + for (; n > 0 && result != UBRK_DONE; --n) { + result = next(); + } + } else if (n < 0) { + for (; n < 0 && result != UBRK_DONE; ++n) { + result = previous(); + } + } else { + result = current(); } return result; } @@ -492,388 +592,120 @@ int32_t RuleBasedBreakIterator::next(int32_t n) { * @return The position of the first boundary after this one. */ int32_t RuleBasedBreakIterator::next(void) { - // if we have cached break positions and we're still in the range - // covered by them, just move one step forward in the cache - if (fCachedBreakPositions != NULL) { - if (fPositionInCache < fNumCachedBreakPositions - 1) { - ++fPositionInCache; - int32_t pos = fCachedBreakPositions[fPositionInCache]; - utext_setNativeIndex(fText, pos); - return pos; - } - else { - reset(); - } - } - - int32_t startPos = current(); - int32_t result = handleNext(fData->fForwardTable); - if (fDictionaryCharCount > 0) { - result = checkDictionary(startPos, result, FALSE); - } - return result; + fBreakCache->next(); + return fDone ? UBRK_DONE : fPosition; } /** - * Advances the iterator backwards, to the last boundary preceding this one. - * @return The position of the last boundary position preceding this one. + * Move the iterator backwards, to the boundary preceding the current one. + * + * Starts from the current position within fText. + * Starting position need not be on a boundary. + * + * @return The position of the boundary position immediately preceding the starting position. */ int32_t RuleBasedBreakIterator::previous(void) { - int32_t result; - int32_t startPos; - - // if we have cached break positions and we're still in the range - // covered by them, just move one step backward in the cache - if (fCachedBreakPositions != NULL) { - if (fPositionInCache > 0) { - --fPositionInCache; - // If we're at the beginning of the cache, need to reevaluate the - // rule status - if (fPositionInCache <= 0) { - fLastStatusIndexValid = FALSE; - } - int32_t pos = fCachedBreakPositions[fPositionInCache]; - utext_setNativeIndex(fText, pos); - return pos; - } - else { - reset(); - } - } - - // if we're already sitting at the beginning of the text, return DONE - if (fText == NULL || (startPos = current()) == 0) { - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - return BreakIterator::DONE; - } - - if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { - result = handlePrevious(fData->fReverseTable); - if (fDictionaryCharCount > 0) { - result = checkDictionary(result, startPos, TRUE); - } - return result; - } - - // old rule syntax - // set things up. handlePrevious() will back us up to some valid - // break position before the current position (we back our internal - // iterator up one step to prevent handlePrevious() from returning - // the current position), but not necessarily the last one before - - // where we started - - int32_t start = current(); - - UTEXT_PREVIOUS32(fText); - int32_t lastResult = handlePrevious(fData->fReverseTable); - if (lastResult == UBRK_DONE) { - lastResult = 0; - utext_setNativeIndex(fText, 0); - } - result = lastResult; - int32_t lastTag = 0; - UBool breakTagValid = FALSE; - - // iterate forward from the known break position until we pass our - // starting point. The last break position before the starting - // point is our return value - - for (;;) { - result = next(); - if (result == BreakIterator::DONE || result >= start) { - break; - } - lastResult = result; - lastTag = fLastRuleStatusIndex; - breakTagValid = TRUE; - } - - // fLastBreakTag wants to have the value for section of text preceding - // the result position that we are to return (in lastResult.) If - // the backwards rules overshot and the above loop had to do two or more - // next()s to move up to the desired return position, we will have a valid - // tag value. But, if handlePrevious() took us to exactly the correct result positon, - // we wont have a tag value for that position, which is only set by handleNext(). - - // set the current iteration position to be the last break position - // before where we started, and then return that value - utext_setNativeIndex(fText, lastResult); - fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() - fLastStatusIndexValid = breakTagValid; - - // No need to check the dictionary; it will have been handled by - // next() - - return lastResult; + UErrorCode status = U_ZERO_ERROR; + fBreakCache->previous(status); + return fDone ? UBRK_DONE : fPosition; } /** * Sets the iterator to refer to the first boundary position following * the specified position. - * @offset The position from which to begin searching for a break position. + * @param startPos The position from which to begin searching for a break position. * @return The position of the first break after the current position. */ -int32_t RuleBasedBreakIterator::following(int32_t offset) { - // if we have cached break positions and offset is in the range - // covered by them, use them - // TODO: could use binary search - // TODO: what if offset is outside range, but break is not? - if (fCachedBreakPositions != NULL) { - if (offset >= fCachedBreakPositions[0] - && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { - fPositionInCache = 0; - // We are guaranteed not to leave the array due to range test above - while (offset >= fCachedBreakPositions[fPositionInCache]) { - ++fPositionInCache; - } - int32_t pos = fCachedBreakPositions[fPositionInCache]; - utext_setNativeIndex(fText, pos); - return pos; - } - else { - reset(); - } - } - - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the +int32_t RuleBasedBreakIterator::following(int32_t startPos) { + // if the supplied position is before the beginning, return the // text's starting offset - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - if (fText == NULL || offset >= utext_nativeLength(fText)) { - last(); - return next(); - } - else if (offset < 0) { + if (startPos < 0) { return first(); } - // otherwise, set our internal iteration position (temporarily) - // to the position passed in. If this is the _beginning_ position, - // then we can just use next() to get our return value - - int32_t result = 0; - - if (fData->fSafeRevTable != NULL) { - // new rule syntax - utext_setNativeIndex(fText, offset); - // move forward one codepoint to prepare for moving back to a - // safe point. - // this handles offset being between a supplementary character - UTEXT_NEXT32(fText); - // handlePrevious will move most of the time to < 1 boundary away - handlePrevious(fData->fSafeRevTable); - int32_t result = next(); - while (result <= offset) { - result = next(); - } - return result; - } - if (fData->fSafeFwdTable != NULL) { - // backup plan if forward safe table is not available - utext_setNativeIndex(fText, offset); - UTEXT_PREVIOUS32(fText); - // handle next will give result >= offset - handleNext(fData->fSafeFwdTable); - // previous will give result 0 or 1 boundary away from offset, - // most of the time - // we have to - int32_t oldresult = previous(); - while (oldresult > offset) { - int32_t result = previous(); - if (result <= offset) { - return oldresult; - } - oldresult = result; - } - int32_t result = next(); - if (result <= offset) { - return next(); - } - return result; - } - // otherwise, we have to sync up first. Use handlePrevious() to back - // up to a known break position before the specified position (if - // we can determine that the specified position is a break position, - // we don't back up at all). This may or may not be the last break - // position at or before our starting position. Advance forward - // from here until we've passed the starting position. The position - // we stop on will be the first break position after the specified one. - // old rule syntax - - utext_setNativeIndex(fText, offset); - if (offset==0 || - offset==1 && utext_getNativeIndex(fText)==0) { - return next(); - } - result = previous(); - - while (result != BreakIterator::DONE && result <= offset) { - result = next(); - } + // Move requested offset to a code point start. It might be on a trail surrogate, + // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text. + utext_setNativeIndex(&fText, startPos); + startPos = (int32_t)utext_getNativeIndex(&fText); - return result; + UErrorCode status = U_ZERO_ERROR; + fBreakCache->following(startPos, status); + return fDone ? UBRK_DONE : fPosition; } /** * Sets the iterator to refer to the last boundary position before the * specified position. - * @offset The position to begin searching for a break from. + * @param offset The position to begin searching for a break from. * @return The position of the last boundary before the starting position. */ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { - // if we have cached break positions and offset is in the range - // covered by them, use them - if (fCachedBreakPositions != NULL) { - // TODO: binary search? - // TODO: What if offset is outside range, but break is not? - if (offset > fCachedBreakPositions[0] - && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { - fPositionInCache = 0; - while (fPositionInCache < fNumCachedBreakPositions - && offset > fCachedBreakPositions[fPositionInCache]) - ++fPositionInCache; - --fPositionInCache; - // If we're at the beginning of the cache, need to reevaluate the - // rule status - if (fPositionInCache <= 0) { - fLastStatusIndexValid = FALSE; - } - utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); - return fCachedBreakPositions[fPositionInCache]; - } - else { - reset(); - } - } - - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the - // text's starting offset - if (fText == NULL || offset > utext_nativeLength(fText)) { - // return BreakIterator::DONE; + if (offset > utext_nativeLength(&fText)) { return last(); } - else if (offset < 0) { - return first(); - } - // if we start by updating the current iteration position to the - // position specified by the caller, we can just use previous() - // to carry out this operation - - if (fData->fSafeFwdTable != NULL) { - // new rule syntax - utext_setNativeIndex(fText, offset); - int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); - if (newOffset != offset) { - // Will come here if specified offset was not a code point boundary AND - // the underlying implmentation is using UText, which snaps any non-code-point-boundary - // indices to the containing code point. - // For breakitereator::preceding only, these non-code-point indices need to be moved - // up to refer to the following codepoint. - UTEXT_NEXT32(fText); - offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); - } + // Move requested offset to a code point start. It might be on a trail surrogate, + // or on a trail byte if the input is UTF-8. - // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, - // rather than adjusting the position unconditionally? - // (Change would interact with safe rules.) - // TODO: change RBBI behavior for off-boundary indices to match that of UText? - // affects only preceding(), seems cleaner, but is slightly different. - UTEXT_PREVIOUS32(fText); - handleNext(fData->fSafeFwdTable); - int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); - while (result >= offset) { - result = previous(); - } - return result; - } - if (fData->fSafeRevTable != NULL) { - // backup plan if forward safe table is not available - // TODO: check whether this path can be discarded - // It's probably OK to say that rules must supply both safe tables - // if they use safe tables at all. We have certainly never described - // to anyone how to work with just one safe table. - utext_setNativeIndex(fText, offset); - UTEXT_NEXT32(fText); - - // handle previous will give result <= offset - handlePrevious(fData->fSafeRevTable); - - // next will give result 0 or 1 boundary away from offset, - // most of the time - // we have to - int32_t oldresult = next(); - while (oldresult < offset) { - int32_t result = next(); - if (result >= offset) { - return oldresult; - } - oldresult = result; - } - int32_t result = previous(); - if (result >= offset) { - return previous(); - } - return result; - } + utext_setNativeIndex(&fText, offset); + int32_t adjustedOffset = utext_getNativeIndex(&fText); - // old rule syntax - utext_setNativeIndex(fText, offset); - return previous(); + UErrorCode status = U_ZERO_ERROR; + fBreakCache->preceding(adjustedOffset, status); + return fDone ? UBRK_DONE : fPosition; } /** * Returns true if the specfied position is a boundary position. As a side * effect, leaves the iterator pointing to the first boundary position at * or after "offset". + * * @param offset the offset to check. * @return True if "offset" is a boundary position. */ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { - // the beginning index of the iterator is always a boundary position by definition - if (offset == 0) { - first(); // For side effects on current position, tag values. - return TRUE; - } - - if (offset == (int32_t)utext_nativeLength(fText)) { - last(); // For side effects on current position, tag values. - return TRUE; - } - // out-of-range indexes are never boundary positions if (offset < 0) { first(); // For side effects on current position, tag values. return FALSE; } - if (offset > utext_nativeLength(fText)) { - last(); // For side effects on current position, tag values. - return FALSE; + // Adjust offset to be on a code point boundary and not beyond the end of the text. + // Note that isBoundary() is always false for offsets that are not on code point boundaries. + // But we still need the side effect of leaving iteration at the following boundary. + + utext_setNativeIndex(&fText, offset); + int32_t adjustedOffset = utext_getNativeIndex(&fText); + + bool result = false; + UErrorCode status = U_ZERO_ERROR; + if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) { + result = (fBreakCache->current() == offset); } - // otherwise, we can use following() on the position before the specified - // one and return true if the position we get back is the one the user - // specified - utext_previous32From(fText, offset); - int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); - UBool result = following(backOne) == offset; + if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) { + // Original offset is beyond the end of the text. Return FALSE, it's not a boundary, + // but the iteration position remains set to the end of the text, which is a boundary. + return FALSE; + } + if (!result) { + // Not on a boundary. isBoundary() must leave iterator on the following boundary. + // Cache->seek(), above, left us on the preceding boundary, so advance one. + next(); + } return result; } + /** * Returns the current iteration position. * @return The current iteration position. */ int32_t RuleBasedBreakIterator::current(void) const { - int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); - return pos; + return fPosition; } - + + //======================================================================= // implementation //======================================================================= @@ -890,48 +722,124 @@ enum RBBIRunMode { }; +// Map from look-ahead break states (corresponds to rules) to boundary positions. +// Allows multiple lookahead break rules to be in flight at the same time. +// +// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers +// in the state table be sequential, then we can just index an array. And the +// table could also tell us in advance how big that array needs to be. +// +// Before ICU 57 there was just a single simple variable for a look-ahead match that +// was in progress. Two rules at once did not work. + +static const int32_t kMaxLookaheads = 8; +struct LookAheadResults { + int32_t fUsedSlotLimit; + int32_t fPositions[8]; + int16_t fKeys[8]; + + LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}; + + int32_t getPosition(int16_t key) { + for (int32_t i=0; i= kMaxLookaheads) { + U_ASSERT(FALSE); + i = kMaxLookaheads - 1; + } + fKeys[i] = key; + fPositions[i] = position; + U_ASSERT(fUsedSlotLimit == i); + fUsedSlotLimit = i + 1; + } +}; + + //----------------------------------------------------------------------------------- // -// handleNext(stateTable) -// This method is the actual implementation of the rbbi next() method. -// This method initializes the state machine to state 1 -// and advances through the text character by character until we reach the end -// of the text or the state machine transitions to state 0. We update our return -// value every time the state machine passes through an accepting state. +// handleNext() +// Run the state machine to find a boundary // //----------------------------------------------------------------------------------- -int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { +// Route handleNext calls through the following to handleNextInternal, +// in order to handle fLineWordOpts. +int32_t RuleBasedBreakIterator::handleNext() { + int32_t result = handleNextInternal(); + while (fLineWordOpts != UBRK_LINEWORD_NORMAL) { + UChar32 prevChr = utext_char32At(&fText, result-1); + UChar32 currChr = utext_char32At(&fText, result); + if (currChr == U_SENTINEL || prevChr == U_SENTINEL) { + break; + } + if (fLineWordOpts == UBRK_LINEWORD_KEEP_HANGUL) { + UErrorCode status = U_ZERO_ERROR; + if (uscript_getScript(currChr, &status) != USCRIPT_HANGUL || uscript_getScript(prevChr, &status) != USCRIPT_HANGUL) { + break; + } + } else { + if (!u_isalpha(currChr) || !u_isalpha(prevChr)) { + break; + } + } + int32_t nextResult = handleNextInternal(); + if (nextResult <= result) { + break; + } + result = nextResult; + } + return result; +} + +int32_t RuleBasedBreakIterator::handleNextInternal() { int32_t state; - int16_t category = 0; + uint16_t category = 0; RBBIRunMode mode; - + RBBIStateTableRow *row; UChar32 c; - int32_t lookaheadStatus = 0; - int32_t lookaheadTagIdx = 0; - int32_t result = 0; - int32_t initialPosition = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; - const char *tableData = statetable->fTableData; - uint32_t tableRowLen = statetable->fRowLen; - + LookAheadResults lookAheadMatches; + int32_t result = 0; + int32_t initialPosition = 0; + const RBBIStateTable *statetable = fData->fForwardTable; + const char *tableData = statetable->fTableData; + uint32_t tableRowLen = statetable->fRowLen; #ifdef RBBI_DEBUG - if (fTrace) { + if (gTrace) { RBBIDebugPuts("Handle Next pos char state category"); } #endif - // No matter what, handleNext alway correctly sets the break tag value. - fLastStatusIndexValid = TRUE; - fLastRuleStatusIndex = 0; + // handleNext alway sets the break tag value. + // Set the default for it. + fRuleStatusIndex = 0; + + fDictionaryCharCount = 0; // if we're already at the end of the text, return DONE. - initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); + initialPosition = fPosition; + UTEXT_SETNATIVEINDEX(&fText, initialPosition); result = initialPosition; - c = UTEXT_NEXT32(fText); - if (fData == NULL || c==U_SENTINEL) { - return BreakIterator::DONE; + c = UTEXT_NEXT32(&fText); + if (c==U_SENTINEL) { + fDone = TRUE; + return UBRK_DONE; } // Set the initial state for the state machine @@ -939,8 +847,8 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { row = (RBBIStateTableRow *) //(statetable->fTableData + (statetable->fRowLen * state)); (tableData + tableRowLen * state); - - + + mode = RBBI_RUN; if (statetable->fFlags & RBBI_BOF_REQUIRED) { category = 2; @@ -954,17 +862,9 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { if (c == U_SENTINEL) { // Reached end of input string. if (mode == RBBI_END) { - // We have already run the loop one last time with the + // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - if (lookaheadResult > result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - } break; } // Run the loop one last time with the fake end-of-input character category. @@ -983,10 +883,10 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, // not the size of the character going in, which is a UChar32. // - UTRIE_GET16(&fData->fTrie, c, category); + category = (fLatin1Cat!=NULL && c<0x100)? fLatin1Cat[c]: UTRIE2_GET16(fData->fTrie, c); // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iterators (subclasses). + // Counter is only used by dictionary based iteration. // Chars that need to be handled by a dictionary have a flag bit set // in their category values. // @@ -997,9 +897,9 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { } } - #ifdef RBBI_DEBUG - if (fTrace) { - RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText)); + #ifdef RBBI_DEBUG + if (gTrace) { + RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(&fText)); if (0x20<=c && c<0x7f) { RBBIDebugPrintf("\"%c\" ", c); } else { @@ -1011,7 +911,10 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // State Transition - move machine to its next state // - state = row->fNextState[category]; + + // fNextState is a variable-length array. + U_ASSERT(categoryfHeader->fCatCount); + state = row->fNextState[category]; /*Not accessing beyond memory*/ row = (RBBIStateTableRow *) // (statetable->fTableData + (statetable->fRowLen * state)); (tableData + tableRowLen * state); @@ -1020,63 +923,46 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { if (row->fAccepting == -1) { // Match found, common case. if (mode != RBBI_START) { - result = (int32_t)UTEXT_GETNATIVEINDEX(fText); + result = (int32_t)UTEXT_GETNATIVEINDEX(&fText); } - fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. + fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. } - if (row->fLookAhead != 0) { - if (lookaheadStatus != 0 - && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if (lookAheadHardBreak) { - utext_setNativeIndex(fText, result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on - // TODO: junk this feature? I don't think it's used anywhwere. - goto continueOn; + int16_t completedRule = row->fAccepting; + if (completedRule > 0) { + // Lookahead match is completed. + int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + fRuleStatusIndex = row->fTagIdx; + fPosition = lookaheadResult; + return lookaheadResult; } - - int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - lookaheadTagIdx = row->fTagIdx; - goto continueOn; } - - - if (row->fAccepting != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relavant. Clear out the pending lookahead status. - lookaheadStatus = 0; // clear out any pending look-ahead match. + int16_t rule = row->fLookAhead; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText); + lookAheadMatches.setPosition(rule, pos); } -continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. // We have advanced through the string until it is certain that no // longer match is possible, no matter what characters follow. break; } - - // Advance to the next character. + + // Advance to the next character. // If this is a beginning-of-input loop iteration, don't advance // the input position. The next iteration will be processing the // first real input character. if (mode == RBBI_RUN) { - c = UTEXT_NEXT32(fText); + c = UTEXT_NEXT32(&fText); } else { if (mode == RBBI_START) { mode = RBBI_RUN; } } - - } // The state machine is done. Check whether it found a match... @@ -1085,15 +971,16 @@ continueOn: // (This really indicates a defect in the break rules. They should always match // at least one character.) if (result == initialPosition) { - utext_setNativeIndex(fText, initialPosition); - UTEXT_NEXT32(fText); - result = (int32_t)UTEXT_GETNATIVEINDEX(fText); + utext_setNativeIndex(&fText, initialPosition); + utext_next32(&fText); + result = (int32_t)utext_getNativeIndex(&fText); + fRuleStatusIndex = 0; } // Leave the iterator at our result position. - utext_setNativeIndex(fText, result); + fPosition = result; #ifdef RBBI_DEBUG - if (fTrace) { + if (gTrace) { RBBIDebugPrintf("result = %d\n\n", result); } #endif @@ -1101,122 +988,57 @@ continueOn: } - //----------------------------------------------------------------------------------- // -// handlePrevious() +// handleSafePrevious() // -// Iterate backwards, according to the logic of the reverse rules. -// This version handles the exact style backwards rules. -// -// The logic of this function is very similar to handleNext(), above. +// Iterate backwards using the safe reverse rules. +// The logic of this function is similar to handleNext(), but simpler +// because the safe table does not require as many options. // //----------------------------------------------------------------------------------- -int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { +int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { int32_t state; - int16_t category = 0; - RBBIRunMode mode; + uint16_t category = 0; RBBIStateTableRow *row; UChar32 c; - int32_t lookaheadStatus = 0; int32_t result = 0; - int32_t initialPosition = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; + const RBBIStateTable *stateTable = fData->fReverseTable; + UTEXT_SETNATIVEINDEX(&fText, fromPosition); #ifdef RBBI_DEBUG - if (fTrace) { + if (gTrace) { RBBIDebugPuts("Handle Previous pos char state category"); } #endif - // handlePrevious() never gets the rule status. - // Flag the status as invalid; if the user ever asks for status, we will need - // to back up, then re-find the break position using handleNext(), which does - // get the status value. - fLastStatusIndexValid = FALSE; - fLastRuleStatusIndex = 0; - // if we're already at the start of the text, return DONE. - if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { + if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) { return BreakIterator::DONE; } - // Set up the starting char. - initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); - result = initialPosition; - c = UTEXT_PREVIOUS32(fText); - // Set the initial state for the state machine + c = UTEXT_PREVIOUS32(&fText); state = START_STATE; row = (RBBIStateTableRow *) - (statetable->fTableData + (statetable->fRowLen * state)); - category = 3; - mode = RBBI_RUN; - if (statetable->fFlags & RBBI_BOF_REQUIRED) { - category = 2; - mode = RBBI_START; - } - + (stateTable->fTableData + (stateTable->fRowLen * state)); // loop until we reach the start of the text or transition to state 0 // - for (;;) { - if (c == U_SENTINEL) { - // Reached end of input string. - if (mode == RBBI_END || - *(int32_t *)fData->fHeader->fFormatVersion == 1 ) { - // We have already run the loop one last time with the - // character set to the psueudo {eof} value. Now it is time - // to unconditionally bail out. - // (Or we have an old format binary rule file that does not support {eof}.) - if (lookaheadResult < result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - lookaheadStatus = 0; - } else if (result == initialPosition) { - // Ran off start, no match found. - // move one index one (towards the start, since we are doing a previous()) - utext_setNativeIndex(fText, initialPosition); - UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. - } - break; - } - // Run the loop one last time with the fake end-of-input character category. - mode = RBBI_END; - category = 1; - } + for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) { + // look up the current character's character category, which tells us + // which column in the state table to look at. + // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, + // not the size of the character going in, which is a UChar32. // - // Get the char category. An incoming category of 1 or 2 means that - // we are preset for doing the beginning or end of input, and - // that we shouldn't get a category from an actual text input character. - // - if (mode == RBBI_RUN) { - // look up the current character's character category, which tells us - // which column in the state table to look at. - // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, - // not the size of the character going in, which is a UChar32. - // - UTRIE_GET16(&fData->fTrie, c, category); - - // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iterators (subclasses). - // Chars that need to be handled by a dictionary have a flag bit set - // in their category values. - // - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - // And off the dictionary flag bit. - category &= ~0x4000; - } - } + // And off the dictionary flag bit. For reverse iteration it is not used. + category = UTRIE2_GET16(fData->fTrie, c); + category &= ~0x4000; #ifdef RBBI_DEBUG - if (fTrace) { - RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); + if (gTrace) { + RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText)); if (0x20<=c && c<0x7f) { RBBIDebugPrintf("\"%c\" ", c); } else { @@ -1228,101 +1050,29 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) // State Transition - move machine to its next state // - state = row->fNextState[category]; + // fNextState is a variable-length array. + U_ASSERT(categoryfHeader->fCatCount); + state = row->fNextState[category]; /*Not accessing beyond memory*/ row = (RBBIStateTableRow *) - (statetable->fTableData + (statetable->fRowLen * state)); - - if (row->fAccepting == -1) { - // Match found, common case. - result = (int32_t)UTEXT_GETNATIVEINDEX(fText); - } - - if (row->fLookAhead != 0) { - if (lookaheadStatus != 0 - && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. - result = lookaheadResult; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if (lookAheadHardBreak) { - utext_setNativeIndex(fText, result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on - // TODO: junk this feature? I don't think it's used anywhwere. - goto continueOn; - } - - int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - goto continueOn; - } - - - if (row->fAccepting != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relavant. Clear out the pending lookahead status. - lookaheadStatus = 0; - } + (stateTable->fTableData + (stateTable->fRowLen * state)); -continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. - // We have advanced through the string until it is certain that no - // longer match is possible, no matter what characters follow. + // Transistion to state zero means we have found a safe point. break; } - - // Move (backwards) to the next character to process. - // If this is a beginning-of-input loop iteration, don't advance - // the input position. The next iteration will be processing the - // first real input character. - if (mode == RBBI_RUN) { - c = UTEXT_PREVIOUS32(fText); - } else { - if (mode == RBBI_START) { - mode = RBBI_RUN; - } - } } // The state machine is done. Check whether it found a match... - - // If the iterator failed to advance in the match engine, force it ahead by one. - // (This really indicates a defect in the break rules. They should always match - // at least one character.) - if (result == initialPosition) { - utext_setNativeIndex(fText, initialPosition); - UTEXT_PREVIOUS32(fText); - result = (int32_t)UTEXT_GETNATIVEINDEX(fText); - } - - // Leave the iterator at our result position. - utext_setNativeIndex(fText, result); + result = (int32_t)UTEXT_GETNATIVEINDEX(&fText); #ifdef RBBI_DEBUG - if (fTrace) { + if (gTrace) { RBBIDebugPrintf("result = %d\n\n", result); } #endif return result; } - -void -RuleBasedBreakIterator::reset() -{ - if (fCachedBreakPositions) { - uprv_free(fCachedBreakPositions); - } - fCachedBreakPositions = NULL; - fNumCachedBreakPositions = 0; - fDictionaryCharCount = 0; - fPositionInCache = 0; -} - - - //------------------------------------------------------------------------------- // // getRuleStatus() Return the break rule tag associated with the current @@ -1330,64 +1080,27 @@ RuleBasedBreakIterator::reset() // position by iterating forwards, the value will have been // cached by the handleNext() function. // -// If no cached status value is available, the status is -// found by doing a previous() followed by a next(), which -// leaves the iterator where it started, and computes the -// status while doing the next(). -// //------------------------------------------------------------------------------- -void RuleBasedBreakIterator::makeRuleStatusValid() { - if (fLastStatusIndexValid == FALSE) { - // No cached status is available. - if (fText == NULL || current() == 0) { - // At start of text, or there is no text. Status is always zero. - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - } else { - // Not at start of text. Find status the tedious way. - int32_t pa = current(); - previous(); - if (fNumCachedBreakPositions > 0) { - reset(); // Blow off the dictionary cache - } - int32_t pb = next(); - if (pa != pb) { - // note: the if (pa != pb) test is here only to eliminate warnings for - // unused local variables on gcc. Logically, it isn't needed. - U_ASSERT(pa == pb); - } - } - } - U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); -} - int32_t RuleBasedBreakIterator::getRuleStatus() const { - RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; - nonConstThis->makeRuleStatusValid(); // fLastRuleStatusIndex indexes to the start of the appropriate status record // (the number of status values.) // This function returns the last (largest) of the array of status values. - int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; + int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex]; int32_t tagVal = fData->fRuleStatusTable[idx]; return tagVal; } - - int32_t RuleBasedBreakIterator::getRuleStatusVec( - int32_t *fillInVec, int32_t capacity, UErrorCode &status) -{ + int32_t *fillInVec, int32_t capacity, UErrorCode &status) { if (U_FAILURE(status)) { return 0; } - RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; - nonConstThis->makeRuleStatusValid(); - int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; + int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex]; int32_t numValsToCopy = numVals; if (numVals > capacity) { status = U_BUFFER_OVERFLOW_ERROR; @@ -1395,12 +1108,70 @@ int32_t RuleBasedBreakIterator::getRuleStatusVec( } int i; for (i=0; ifRuleStatusTable[fLastRuleStatusIndex + i + 1]; + fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1]; } return numVals; } - +// Apple custom addition +int32_t RuleBasedBreakIterator::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags) +{ + //os_log(OS_LOG_DEFAULT, "# tokenize 0: maxT %d; txt idx %lld, len %lld", maxTokens, utext_getNativeIndex(fText), utext_nativeLength(fText)); + if (fDone) { + return 0; + } + RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens; + RuleBasedTokenRange *outTokenP = outTokenRanges; + int32_t lastOffset = fPosition; + //os_log(OS_LOG_DEFAULT, "# tokenize 1"); + while (outTokenP < outTokenLimit) { + // start portion from inlining populateFollowing() + int32_t pos = 0; + int32_t ruleStatusIdx = 0; + int32_t startPos = fPosition; + + if (fDictionaryCache->following(startPos, &pos, &ruleStatusIdx)) { + fPosition = pos; + fRuleStatusIndex = ruleStatusIdx; + } else { + pos = handleNextInternal(); // sets fRuleStatusIndex for the pos it returns, updates fPosition + if (pos == UBRK_DONE) { + // fDone = TRUE; already set by handleNextInternal + break; + } + // Use current result from handleNextInternal(), including fRuleStatusIndex, + // unless overridden by dictionary subdivisions + fPosition = pos; + if (fDictionaryCharCount > 0) { + // The text segment obtained from the rules includes dictionary characters. + // Subdivide it, with subdivided results going into the dictionary cache. + fDictionaryCache->populateDictionary(startPos, pos, fRuleStatusIndex, fRuleStatusIndex); + if (fDictionaryCache->following(startPos, &pos, &ruleStatusIdx)) { + fPosition = pos; + fRuleStatusIndex = ruleStatusIdx; + } + } + } + // end portion from inlining populateFollowing() + int32_t flagCount = fData->fRuleStatusTable[fRuleStatusIndex]; + const int32_t* flagPtr = fData->fRuleStatusTable + fRuleStatusIndex + flagCount; + int32_t flagSet = *flagPtr; // if -1 then skip token + if (flagSet != -1) { + outTokenP->location = lastOffset; + outTokenP++->length = fPosition - lastOffset; + if (outTokenFlags) { + // flagSet should be the OR of all flags returned by getRuleStatusVec; + // here we collect from high-order to low-order. + while (--flagCount > 0) { + flagSet |= *--flagPtr; + } + *outTokenFlags++ = (unsigned long)flagSet; + } + } + lastOffset = fPosition; + } + return (outTokenP - outTokenRanges); +} //------------------------------------------------------------------------------- // @@ -1421,19 +1192,7 @@ const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { } - - -//------------------------------------------------------------------------------- -// -// BufferClone TODO: In my (Andy) opinion, this function should be deprecated. -// Saving one heap allocation isn't worth the trouble. -// Cloning shouldn't be done in tight loops, and -// making the clone copy involves other heap operations anyway. -// And the application code for correctly dealing with buffer -// size problems and the eventual object destruction is ugly. -// -//------------------------------------------------------------------------------- -BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, +BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) { @@ -1441,310 +1200,86 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, return NULL; } - // - // If user buffer size is zero this is a preflight operation to - // obtain the needed buffer size, allowing for worst case misalignment. - // if (bufferSize == 0) { - bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0); + bufferSize = 1; // preflighting for deprecated functionality return NULL; } - - // - // Check the alignment and size of the user supplied buffer. - // Allocate heap memory if the user supplied memory is insufficient. - // - char *buf = (char *)stackBuffer; - uint32_t s = bufferSize; - - if (stackBuffer == NULL) { - s = 0; // Ignore size, force allocation if user didn't give us a buffer. - } - if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { - uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf); - s -= offsetUp; - buf += offsetUp; - } - if (s < sizeof(RuleBasedBreakIterator)) { - // Not enough room in the caller-supplied buffer. - // Do a plain-vanilla heap based clone and return that, along with - // a warning that the clone was allocated. - RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this); - if (clonedBI == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - } else { - status = U_SAFECLONE_ALLOCATED_WARNING; - } - return clonedBI; - } - - // - // Clone the source BI into the caller-supplied buffer. - // TODO: using an overloaded operator new to directly initialize the - // copy in the user's buffer would be better, but it doesn't seem - // to get along with namespaces. Investigate why. - // - // The memcpy is only safe with an empty (default constructed) - // break iterator. Use on others can screw up reference counts - // to data. memcpy-ing objects is not really a good idea... - // - RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy - RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf; - uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part - clone->init(); // Init RuleBasedBreakIterator part, (user default constructor) - *clone = *this; // clone = the real BI we want. - clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code) - - return clone; -} - - -//------------------------------------------------------------------------------- -// -// isDictionaryChar Return true if the category lookup for this char -// indicates that it is in the set of dictionary lookup -// chars. -// -// This function is intended for use by dictionary based -// break iterators. -// -//------------------------------------------------------------------------------- -/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { - if (fData == NULL) { - return FALSE; - } - uint16_t category; - UTRIE_GET16(&fData->fTrie, c, category); - return (category & 0x4000) != 0; -}*/ - - -//------------------------------------------------------------------------------- -// -// checkDictionary This function handles all processing of characters in -// the "dictionary" set. It will determine the appropriate -// course of action, and possibly set up a cache in the -// process. -// -//------------------------------------------------------------------------------- -int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, - int32_t endPos, - UBool reverse) { - // Reset the old break cache first. - uint32_t dictionaryCount = fDictionaryCharCount; - reset(); - - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { - return (reverse ? startPos : endPos); - } - - // Starting from the starting point, scan towards the proposed result, - // looking for the first dictionary character (which may be the one - // we're on, if we're starting in the middle of a range). - utext_setNativeIndex(fText, reverse ? endPos : startPos); - if (reverse) { - UTEXT_PREVIOUS32(fText); - } - - int32_t rangeStart = startPos; - int32_t rangeEnd = endPos; - - uint16_t category; - int32_t current; - UErrorCode status = U_ZERO_ERROR; - UStack breaks(status); - int32_t foundBreakCount = 0; - UChar32 c = utext_current32(fText); - - UTRIE_GET16(&fData->fTrie, c, category); - - // Is the character we're starting on a dictionary character? If so, we - // need to back up to include the entire run; otherwise the results of - // the break algorithm will differ depending on where we start. Since - // the result is cached and there is typically a non-dictionary break - // within a small number of words, there should be little performance impact. - if (category & 0x4000) { - if (reverse) { - do { - utext_next32(fText); // TODO: recast to work directly with postincrement. - c = utext_current32(fText); - UTRIE_GET16(&fData->fTrie, c, category); - } while (c != U_SENTINEL && (category & 0x4000)); - // Back up to the last dictionary character - rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); - if (c == U_SENTINEL) { - // c = fText->last32(); - // TODO: why was this if needed? - c = UTEXT_PREVIOUS32(fText); - } - else { - c = UTEXT_PREVIOUS32(fText); - } - } - else { - do { - c = UTEXT_PREVIOUS32(fText); - UTRIE_GET16(&fData->fTrie, c, category); - } - while (c != U_SENTINEL && (category & 0x4000)); - // Back up to the last dictionary character - if (c == U_SENTINEL) { - // c = fText->first32(); - c = utext_current32(fText); - } - else { - utext_next32(fText); - c = utext_current32(fText); - } - rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; - } - UTRIE_GET16(&fData->fTrie, c, category); - } - - // Loop through the text, looking for ranges of dictionary characters. - // For each span, find the appropriate break engine, and ask it to find - // any breaks within the span. - // Note: we always do this in the forward direction, so that the break - // cache is built in the right order. - if (reverse) { - utext_setNativeIndex(fText, rangeStart); - c = utext_current32(fText); - UTRIE_GET16(&fData->fTrie, c, category); - } - while(U_SUCCESS(status)) { - while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { - utext_next32(fText); // TODO: tweak for post-increment operation - c = utext_current32(fText); - UTRIE_GET16(&fData->fTrie, c, category); - } - if (current >= rangeEnd) { - break; - } - - // We now have a dictionary character. Get the appropriate language object - // to deal with it. - const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); - - // Ask the language object if there are any breaks. It will leave the text - // pointer on the other side of its range, ready to search for the next one. - if (lbe != NULL) { - foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); - } - - // Reload the loop variables for the next go-round - c = utext_current32(fText); - UTRIE_GET16(&fData->fTrie, c, category); - } - - // If we found breaks, build a new break cache. The first and last entries must - // be the original starting and ending position. - if (foundBreakCount > 0) { - int32_t totalBreaks = foundBreakCount; - if (startPos < breaks.elementAti(0)) { - totalBreaks += 1; - } - if (endPos > breaks.peeki()) { - totalBreaks += 1; - } - fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); - if (fCachedBreakPositions != NULL) { - int32_t out = 0; - fNumCachedBreakPositions = totalBreaks; - if (startPos < breaks.elementAti(0)) { - fCachedBreakPositions[out++] = startPos; - } - for (int32_t i = 0; i < foundBreakCount; ++i) { - fCachedBreakPositions[out++] = breaks.elementAti(i); - } - if (endPos > fCachedBreakPositions[out-1]) { - fCachedBreakPositions[out] = endPos; - } - // If there are breaks, then by definition, we are replacing the original - // proposed break by one of the breaks we found. Use following() and - // preceding() to do the work. They should never recurse in this case. - if (reverse) { - return preceding(endPos - 1); - } - else { - return following(startPos); - } - } - // If the allocation failed, just fall through to the "no breaks found" case. + BreakIterator *clonedBI = clone(); + if (clonedBI == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } else { + status = U_SAFECLONE_ALLOCATED_WARNING; } - - // If we get here, there were no language-based breaks. Set the text pointer - // to the original proposed break. - utext_setNativeIndex(fText, reverse ? startPos : endPos); - return (reverse ? startPos : endPos); + return (RuleBasedBreakIterator *)clonedBI; } -static UStack *gLanguageBreakFactories = NULL; - U_NAMESPACE_END -// defined in ucln_cmn.h + +static icu::UStack *gLanguageBreakFactories = nullptr; +static const icu::UnicodeString *gEmptyString = nullptr; +static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; +static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER; /** - * Release all static memory held by breakiterator. + * Release all static memory held by breakiterator. */ U_CDECL_BEGIN -static UBool U_CALLCONV breakiterator_cleanup_dict(void) { - if (gLanguageBreakFactories) { - delete gLanguageBreakFactories; - gLanguageBreakFactories = NULL; - } +static UBool U_CALLCONV rbbi_cleanup(void) { + delete gLanguageBreakFactories; + gLanguageBreakFactories = nullptr; + delete gEmptyString; + gEmptyString = nullptr; + gLanguageBreakFactoriesInitOnce.reset(); + gRBBIInitOnce.reset(); return TRUE; } U_CDECL_END U_CDECL_BEGIN static void U_CALLCONV _deleteFactory(void *obj) { - delete (LanguageBreakFactory *) obj; + delete (icu::LanguageBreakFactory *) obj; } U_CDECL_END U_NAMESPACE_BEGIN -static const LanguageBreakEngine* -getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) -{ - UBool needsInit; - UErrorCode status = U_ZERO_ERROR; - umtx_lock(NULL); - needsInit = (UBool)(gLanguageBreakFactories == NULL); - umtx_unlock(NULL); - - if (needsInit) { - UStack *factories = new UStack(_deleteFactory, NULL, status); - if (U_SUCCESS(status)) { - ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); - factories->push(builtIn, status); +static void U_CALLCONV rbbiInit() { + gEmptyString = new UnicodeString(); + ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup); +} + +static void U_CALLCONV initLanguageFactories() { + UErrorCode status = U_ZERO_ERROR; + U_ASSERT(gLanguageBreakFactories == NULL); + gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); + if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { + ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); + gLanguageBreakFactories->push(builtIn, status); #ifdef U_LOCAL_SERVICE_HOOK - LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); - if (extra != NULL) { - factories->push(extra, status); - } -#endif + LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); + if (extra != NULL) { + gLanguageBreakFactories->push(extra, status); } - umtx_lock(NULL); - if (gLanguageBreakFactories == NULL) { - gLanguageBreakFactories = factories; - factories = NULL; - ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); - } - umtx_unlock(NULL); - delete factories; +#endif } - + ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup); +} + + +static const LanguageBreakEngine* +getLanguageBreakEngineFromFactory(UChar32 c) +{ + umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); if (gLanguageBreakFactories == NULL) { return NULL; } - + int32_t i = gLanguageBreakFactories->size(); const LanguageBreakEngine *lbe = NULL; while (--i >= 0) { LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); - lbe = factory->getEngineFor(c, breakType); + lbe = factory->getEngineFor(c); if (lbe != NULL) { break; } @@ -1756,35 +1291,35 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) //------------------------------------------------------------------------------- // // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the -// the characer c. +// the character c. // //------------------------------------------------------------------------------- const LanguageBreakEngine * RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { const LanguageBreakEngine *lbe = NULL; UErrorCode status = U_ZERO_ERROR; - + if (fLanguageBreakEngines == NULL) { fLanguageBreakEngines = new UStack(status); - if (U_FAILURE(status)) { + if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { delete fLanguageBreakEngines; fLanguageBreakEngines = 0; return NULL; } } - + int32_t i = fLanguageBreakEngines->size(); while (--i >= 0) { lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); - if (lbe->handles(c, fBreakType)) { + if (lbe->handles(c)) { return lbe; } } - + // No existing dictionary took the character. See if a factory wants to // give us a new LanguageBreakEngine for this character. - lbe = getLanguageBreakEngineFromFactory(c, fBreakType); - + lbe = getLanguageBreakEngineFromFactory(c); + // If we got one, use it and push it on our stack. if (lbe != NULL) { fLanguageBreakEngines->push((void *)lbe, status); @@ -1792,13 +1327,14 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { // return it even if the push fails. return lbe; } - + // No engine is forthcoming for this character. Add it to the // reject set. Create the reject break engine if needed. if (fUnhandledBreakEngine == NULL) { fUnhandledBreakEngine = new UnhandledEngine(status); if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; } // Put it last so that scripts for which we have an engine get tried // first. @@ -1810,23 +1346,34 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { return NULL; } } - + // Tell the reject engine about the character; at its discretion, it may // add more than just the one character. - fUnhandledBreakEngine->handleCharacter(c, fBreakType); - + fUnhandledBreakEngine->handleCharacter(c); + return fUnhandledBreakEngine; } +void RuleBasedBreakIterator::dumpCache() { + fBreakCache->dumpCache(); +} +void RuleBasedBreakIterator::dumpTables() { + fData->printData(); +} -/*int32_t RuleBasedBreakIterator::getBreakType() const { - return fBreakType; -}*/ +/** + * Returns the description used to create this iterator + */ -void RuleBasedBreakIterator::setBreakType(int32_t type) { - fBreakType = type; - reset(); +const UnicodeString& +RuleBasedBreakIterator::getRules() const { + if (fData != NULL) { + return fData->getRuleSourceString(); + } else { + umtx_initOnce(gRBBIInitOnce, &rbbiInit); + return *gEmptyString; + } } U_NAMESPACE_END