X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..ef6cf650f4a75c3f97de06b51fa104f2069b9ea2:/icuSources/common/rbbi.cpp?ds=inline diff --git a/icuSources/common/rbbi.cpp b/icuSources/common/rbbi.cpp index 10216c22..43694734 100644 --- a/icuSources/common/rbbi.cpp +++ b/icuSources/common/rbbi.cpp @@ -1,7 +1,7 @@ /* *************************************************************************** -* Copyright (C) 1999-2006 International Business Machines Corporation * -* and others. All rights reserved. * +* Copyright (C) 1999-2016 International Business Machines Corporation +* and others. All rights reserved. *************************************************************************** */ // @@ -10,6 +10,8 @@ // class RuleBasedBreakIterator // +#include "utypeinfo.h" // for 'typeid' to work + #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION @@ -23,7 +25,7 @@ #include "rbbirb.h" #include "cmemory.h" #include "cstring.h" -#include "mutex.h" +#include "umutex.h" #include "ucln_cmn.h" #include "brkeng.h" @@ -41,9 +43,11 @@ static UBool fTrace = FALSE; U_NAMESPACE_BEGIN +// The state number of the starting state +#define START_STATE 1 -static const int16_t START_STATE = 1; // The state number of the starting state -static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop" +// The state-transition value indicating "stop" +#define STOP_STATE 0 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) @@ -68,6 +72,50 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode } } +/** + * Same as above but does not adopt memory + */ +RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) +{ + init(); + fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor + if (U_FAILURE(status)) {return;} + if(fData == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } +} 
+ + +// +// Construct from precompiled binary rules (tables). This constructor is public API, +// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). +// +RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, + uint32_t ruleLength, + UErrorCode &status) { + init(); + if (U_FAILURE(status)) { + return; + } + if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; + if (data->fLength > ruleLength) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); + if (U_FAILURE(status)) {return;} + if(fData == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } +} + + //------------------------------------------------------------------------------- // // Constructor from a UDataMemory handle to precompiled break rules @@ -99,7 +147,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, init(); if (U_FAILURE(status)) {return;} RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) - RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status); + RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that // creates and returns a complete RBBI. 
From here, in a constructor, we // can't just return the object created by the builder factory, hence @@ -179,6 +227,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { if (this == &that) { return *this; } + fKeepAll = that.fKeepAll; reset(); // Delete break cache information fBreakType = that.fBreakType; if (fLanguageBreakEngines != NULL) { @@ -222,7 +271,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { //----------------------------------------------------------------------------- void RuleBasedBreakIterator::init() { UErrorCode status = U_ZERO_ERROR; - fBufferClone = FALSE; fText = utext_openUChars(NULL, NULL, 0, &status); fCharIter = NULL; fSCharIter = NULL; @@ -231,7 +279,10 @@ void RuleBasedBreakIterator::init() { fLastRuleStatusIndex = 0; fLastStatusIndexValid = TRUE; fDictionaryCharCount = 0; - fBreakType = -1; + fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable + // dictionary behavior for Break Iterators that are + // built from rules. Even better would be the ability to + // declare the type in the rules. fCachedBreakPositions = NULL; fLanguageBreakEngines = NULL; @@ -271,11 +322,14 @@ RuleBasedBreakIterator::clone(void) const { */ UBool RuleBasedBreakIterator::operator==(const BreakIterator& that) const { - if (that.getDynamicClassID() != getDynamicClassID()) { + if (typeid(*this) != typeid(that)) { return FALSE; } const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; + if (that2.fKeepAll != fKeepAll) { + return FALSE; + } if (!utext_equals(fText, that2.fText)) { // The two break iterators are operating on different text, @@ -321,8 +375,12 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { // we can come to signaling a failure. 
// (GetText() is obsolete, this failure is sort of OK) if (fDCharIter == NULL) { - static UChar c = 0; + static const UChar c = 0; fDCharIter = new UCharCharacterIterator(&c, 0); + if (fDCharIter == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } } if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { @@ -431,10 +489,41 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) { } +/** + * Provide a new UText for the input text. Must reference text with contents identical + * to the original. + * Intended for use with text data originating in Java (garbage collected) environments + * where the data may be moved in memory at arbitrary times. + */ +RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + if (input == NULL) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + int64_t pos = utext_getNativeIndex(fText); + // Shallow read-only clone of the new UText into the existing input UText + fText = utext_clone(fText, input, FALSE, TRUE, &status); + if (U_FAILURE(status)) { + return *this; + } + utext_setNativeIndex(fText, pos); + if (utext_getNativeIndex(fText) != pos) { + // Sanity check. The new input utext is supposed to have the exact same + // contents as the old. If we can't set to the same position, it doesn't. + // The contents underlying the old utext might be invalid at this point, + // so it's not safe to check directly. + status = U_ILLEGAL_ARGUMENT_ERROR; + } + return *this; +} + /** - * Sets the current iteration position to the beginning of the text. - * @return The offset of the beginning of the text. + * Sets the current iteration position to the beginning of the text, position zero. + * @return The new iterator position, which is zero. 
*/ int32_t RuleBasedBreakIterator::first(void) { reset(); @@ -507,7 +596,20 @@ int32_t RuleBasedBreakIterator::next(void) { } int32_t startPos = current(); + fDictionaryCharCount = 0; int32_t result = handleNext(fData->fForwardTable); + while (fKeepAll) { + UChar32 prevChr = utext_char32At(fText, result-1); + UChar32 currChr = utext_char32At(fText, result); + if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) { + break; + } + int32_t nextResult = handleNext(fData->fForwardTable); + if (nextResult <= result) { + break; + } + result = nextResult; + } if (fDictionaryCharCount > 0) { result = checkDictionary(startPos, result, FALSE); } @@ -550,6 +652,18 @@ int32_t RuleBasedBreakIterator::previous(void) { if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { result = handlePrevious(fData->fReverseTable); + while (fKeepAll) { + UChar32 prevChr = utext_char32At(fText, result-1); + UChar32 currChr = utext_char32At(fText, result); + if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) { + break; + } + int32_t prevResult = handlePrevious(fData->fReverseTable); + if (prevResult >= result) { + break; + } + result = prevResult; + } if (fDictionaryCharCount > 0) { result = checkDictionary(result, startPos, TRUE); } @@ -561,12 +675,11 @@ int32_t RuleBasedBreakIterator::previous(void) { // break position before the current position (we back our internal // iterator up one step to prevent handlePrevious() from returning // the current position), but not necessarily the last one before - // where we started int32_t start = current(); - UTEXT_PREVIOUS32(fText); + (void)UTEXT_PREVIOUS32(fText); int32_t lastResult = handlePrevious(fData->fReverseTable); if (lastResult == UBRK_DONE) { lastResult = 0; @@ -594,11 +707,11 @@ int32_t RuleBasedBreakIterator::previous(void) { // the result position that we are to return (in lastResult.) 
If // the backwards rules overshot and the above loop had to do two or more // next()s to move up to the desired return position, we will have a valid - // tag value. But, if handlePrevious() took us to exactly the correct result positon, + // tag value. But, if handlePrevious() took us to exactly the correct result position, // we wont have a tag value for that position, which is only set by handleNext(). - // set the current iteration position to be the last break position - // before where we started, and then return that value + // Set the current iteration position to be the last break position + // before where we started, and then return that value. utext_setNativeIndex(fText, lastResult); fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() fLastStatusIndexValid = breakTagValid; @@ -616,6 +729,22 @@ int32_t RuleBasedBreakIterator::previous(void) { * @return The position of the first break after the current position. */ int32_t RuleBasedBreakIterator::following(int32_t offset) { + // if the offset passed in is already past the end of the text, + // just return DONE; if it's before the beginning, return the + // text's starting offset + if (fText == NULL || offset >= utext_nativeLength(fText)) { + last(); + return next(); + } + else if (offset < 0) { + return first(); + } + + // Move requested offset to a code point start. It might be on a trail surrogate, + // or on a trail byte if the input is UTF-8. 
+ utext_setNativeIndex(fText, offset); + offset = (int32_t)utext_getNativeIndex(fText); + // if we have cached break positions and offset is in the range // covered by them, use them // TODO: could use binary search @@ -637,20 +766,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { } } - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the - // text's starting offset - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - if (fText == NULL || offset >= utext_nativeLength(fText)) { - last(); - return next(); - } - else if (offset < 0) { - return first(); - } - - // otherwise, set our internal iteration position (temporarily) + // Set our internal iteration position (temporarily) // to the position passed in. If this is the _beginning_ position, // then we can just use next() to get our return value @@ -662,7 +778,8 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { // move forward one codepoint to prepare for moving back to a // safe point. // this handles offset being between a supplementary character - UTEXT_NEXT32(fText); + // TODO: is this still needed, with move to code point boundary handled above? 
+ (void)UTEXT_NEXT32(fText); // handlePrevious will move most of the time to < 1 boundary away handlePrevious(fData->fSafeRevTable); int32_t result = next(); @@ -674,7 +791,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { if (fData->fSafeFwdTable != NULL) { // backup plan if forward safe table is not available utext_setNativeIndex(fText, offset); - UTEXT_PREVIOUS32(fText); + (void)UTEXT_PREVIOUS32(fText); // handle next will give result >= offset handleNext(fData->fSafeFwdTable); // previous will give result 0 or 1 boundary away from offset, @@ -705,7 +822,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { utext_setNativeIndex(fText, offset); if (offset==0 || - offset==1 && utext_getNativeIndex(fText)==0) { + (offset==1 && utext_getNativeIndex(fText)==0)) { return next(); } result = previous(); @@ -724,6 +841,21 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { * @return The position of the last boundary before the starting position. */ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { + // if the offset passed in is already past the end of the text, + // just return DONE; if it's before the beginning, return the + // text's starting offset + if (fText == NULL || offset > utext_nativeLength(fText)) { + return last(); + } + else if (offset < 0) { + return first(); + } + + // Move requested offset to a code point start. It might be on a trail surrogate, + // or on a trail byte if the input is UTF-8. 
+ utext_setNativeIndex(fText, offset); + offset = (int32_t)utext_getNativeIndex(fText); + // if we have cached break positions and offset is in the range // covered by them, use them if (fCachedBreakPositions != NULL) { @@ -749,17 +881,6 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { } } - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the - // text's starting offset - if (fText == NULL || offset > utext_nativeLength(fText)) { - // return BreakIterator::DONE; - return last(); - } - else if (offset < 0) { - return first(); - } - // if we start by updating the current iteration position to the // position specified by the caller, we can just use previous() // to carry out this operation @@ -774,7 +895,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { // indices to the containing code point. // For breakitereator::preceding only, these non-code-point indices need to be moved // up to refer to the following codepoint. - UTEXT_NEXT32(fText); + (void)UTEXT_NEXT32(fText); offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); } @@ -783,7 +904,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { // (Change would interact with safe rules.) // TODO: change RBBI behavior for off-boundary indices to match that of UText? // affects only preceding(), seems cleaner, but is slightly different. - UTEXT_PREVIOUS32(fText); + (void)UTEXT_PREVIOUS32(fText); handleNext(fData->fSafeFwdTable); int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); while (result >= offset) { @@ -798,7 +919,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { // if they use safe tables at all. We have certainly never described // to anyone how to work with just one safe table. 
utext_setNativeIndex(fText, offset); - UTEXT_NEXT32(fText); + (void)UTEXT_NEXT32(fText); // handle previous will give result <= offset handlePrevious(fData->fSafeRevTable); @@ -890,6 +1011,54 @@ enum RBBIRunMode { }; +// Map from look-ahead break states (corresponds to rules) to boundary positions. +// Allows multiple lookahead break rules to be in flight at the same time. +// +// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers +// in the state table be sequential, then we can just index an array. And the +// table could also tell us in advance how big that array needs to be. +// +// Before ICU 57 there was just a single simple variable for a look-ahead match that +// was in progress. Two rules at once did not work. + +static const int32_t kMaxLookaheads = 8; +struct LookAheadResults { + int32_t fUsedSlotLimit; + int32_t fPositions[8]; + int16_t fKeys[8]; + + LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}; + + int32_t getPosition(int16_t key) { + for (int32_t i=0; i<fUsedSlotLimit; ++i) { + if (fKeys[i] == key) { + return fPositions[i]; + } + } + U_ASSERT(FALSE); + return -1; + } + + void setPosition(int16_t key, int32_t position) { + int32_t i; + for (i=0; i<fUsedSlotLimit; ++i) { + if (fKeys[i] == key) { + fPositions[i] = position; + return; + } + } + if (i >= kMaxLookaheads) { + U_ASSERT(FALSE); + i = kMaxLookaheads - 1; + } + fKeys[i] = key; + fPositions[i] = position; + U_ASSERT(fUsedSlotLimit == i); + fUsedSlotLimit = i + 1; + } +}; + + //----------------------------------------------------------------------------------- // // handleNext(stateTable) @@ -902,19 +1071,16 @@ enum RBBIRunMode { //----------------------------------------------------------------------------------- int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { int32_t state; - int16_t category = 0; + uint16_t category = 0; RBBIRunMode mode; RBBIStateTableRow *row; UChar32 c; - int32_t lookaheadStatus = 0; - int32_t lookaheadTagIdx = 0; - int32_t result = 0; - int32_t initialPosition = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; - const char *tableData = statetable->fTableData; - uint32_t tableRowLen = statetable->fRowLen; + 
LookAheadResults lookAheadMatches; + int32_t result = 0; + int32_t initialPosition = 0; + const char *tableData = statetable->fTableData; + uint32_t tableRowLen = statetable->fRowLen; #ifdef RBBI_DEBUG if (fTrace) { @@ -957,14 +1123,6 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - if (lookaheadResult > result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - } break; } // Run the loop one last time with the fake end-of-input character category. @@ -997,9 +1155,9 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { } } - #ifdef RBBI_DEBUG + #ifdef RBBI_DEBUG if (fTrace) { - RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText)); + RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); if (0x20<=c && c<0x7f) { RBBIDebugPrintf("\"%c\" ", c); } else { @@ -1011,7 +1169,12 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // State Transition - move machine to its next state // - state = row->fNextState[category]; + + // Note: fNextState is defined as uint16_t[2], but we are casting + // a generated RBBI table to RBBIStateTableRow and some tables + // actually have more than 2 categories. 
+ U_ASSERT(category<fData->fHeader->fCatCount); + state = row->fNextState[category]; /*Not accessing beyond memory*/ row = (RBBIStateTableRow *) // (statetable->fTableData + (statetable->fRowLen * state)); (tableData + tableRowLen * state); @@ -1025,38 +1188,23 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. } - if (row->fLookAhead != 0) { - if (lookaheadStatus != 0 - && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if (lookAheadHardBreak) { - utext_setNativeIndex(fText, result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on - // TODO: junk this feature? I don't think it's used anywhwere. - goto continueOn; + int16_t completedRule = row->fAccepting; + if (completedRule > 0) { + // Lookahead match is completed. + int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + fLastRuleStatusIndex = row->fTagIdx; + UTEXT_SETNATIVEINDEX(fText, lookaheadResult); + return lookaheadResult; } - - int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - lookaheadTagIdx = row->fTagIdx; - goto continueOn; } - - - if (row->fAccepting != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relavant. Clear out the pending lookahead status. - lookaheadStatus = 0; // clear out any pending look-ahead match. + int16_t rule = row->fLookAhead; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); + lookAheadMatches.setPosition(rule, pos); } -continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. 
// We have advanced through the string until it is certain that no @@ -1085,13 +1233,13 @@ continueOn: // (This really indicates a defect in the break rules. They should always match // at least one character.) if (result == initialPosition) { - utext_setNativeIndex(fText, initialPosition); + UTEXT_SETNATIVEINDEX(fText, initialPosition); UTEXT_NEXT32(fText); result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } // Leave the iterator at our result position. - utext_setNativeIndex(fText, result); + UTEXT_SETNATIVEINDEX(fText, result); #ifdef RBBI_DEBUG if (fTrace) { RBBIDebugPrintf("result = %d\n\n", result); @@ -1114,15 +1262,13 @@ continueOn: //----------------------------------------------------------------------------------- int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { int32_t state; - int16_t category = 0; + uint16_t category = 0; RBBIRunMode mode; RBBIStateTableRow *row; UChar32 c; - int32_t lookaheadStatus = 0; + LookAheadResults lookAheadMatches; int32_t result = 0; int32_t initialPosition = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; #ifdef RBBI_DEBUG if (fTrace) { @@ -1164,23 +1310,15 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) for (;;) { if (c == U_SENTINEL) { // Reached end of input string. - if (mode == RBBI_END || - *(int32_t *)fData->fHeader->fFormatVersion == 1 ) { + if (mode == RBBI_END) { // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - // (Or we have an old format binary rule file that does not support {eof}.) - if (lookaheadResult < result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. 
- result = lookaheadResult; - lookaheadStatus = 0; - } else if (result == initialPosition) { + if (result == initialPosition) { // Ran off start, no match found. // move one index one (towards the start, since we are doing a previous()) - utext_setNativeIndex(fText, initialPosition); - UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. + UTEXT_SETNATIVEINDEX(fText, initialPosition); + (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. } break; } @@ -1228,7 +1366,12 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) // State Transition - move machine to its next state // - state = row->fNextState[category]; + + // Note: fNextState is defined as uint16_t[2], but we are casting + // a generated RBBI table to RBBIStateTableRow and some tables + // actually have more than 2 categories. + U_ASSERT(category<fData->fHeader->fCatCount); + state = row->fNextState[category]; /*Not accessing beyond memory*/ row = (RBBIStateTableRow *) (statetable->fTableData + (statetable->fRowLen * state)); @@ -1237,36 +1380,22 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } - if (row->fLookAhead != 0) { - if (lookaheadStatus != 0 - && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. - result = lookaheadResult; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if (lookAheadHardBreak) { - utext_setNativeIndex(fText, result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on - // TODO: junk this feature? I don't think it's used anywhwere. - goto continueOn; + int16_t completedRule = row->fAccepting; + if (completedRule > 0) { + // Lookahead match is completed. 
+ int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + UTEXT_SETNATIVEINDEX(fText, lookaheadResult); + return lookaheadResult; } - - int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - goto continueOn; } - - - if (row->fAccepting != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relavant. Clear out the pending lookahead status. - lookaheadStatus = 0; + int16_t rule = row->fLookAhead; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); + lookAheadMatches.setPosition(rule, pos); } -continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. // We have advanced through the string until it is certain that no @@ -1293,13 +1422,13 @@ continueOn: // (This really indicates a defect in the break rules. They should always match // at least one character.) if (result == initialPosition) { - utext_setNativeIndex(fText, initialPosition); + UTEXT_SETNATIVEINDEX(fText, initialPosition); UTEXT_PREVIOUS32(fText); result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } // Leave the iterator at our result position. - utext_setNativeIndex(fText, result); + UTEXT_SETNATIVEINDEX(fText, result); #ifdef RBBI_DEBUG if (fTrace) { RBBIDebugPrintf("result = %d\n\n", result); @@ -1421,19 +1550,7 @@ const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { } - - -//------------------------------------------------------------------------------- -// -// BufferClone TODO: In my (Andy) opinion, this function should be deprecated. -// Saving one heap allocation isn't worth the trouble. -// Cloning shouldn't be done in tight loops, and -// making the clone copy involves other heap operations anyway. 
-// And the application code for correctly dealing with buffer -// size problems and the eventual object destruction is ugly. -// -//------------------------------------------------------------------------------- -BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, +BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) { @@ -1441,62 +1558,18 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, return NULL; } - // - // If user buffer size is zero this is a preflight operation to - // obtain the needed buffer size, allowing for worst case misalignment. - // if (bufferSize == 0) { - bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0); + bufferSize = 1; // preflighting for deprecated functionality return NULL; } - - // - // Check the alignment and size of the user supplied buffer. - // Allocate heap memory if the user supplied memory is insufficient. - // - char *buf = (char *)stackBuffer; - uint32_t s = bufferSize; - - if (stackBuffer == NULL) { - s = 0; // Ignore size, force allocation if user didn't give us a buffer. - } - if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { - uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf); - s -= offsetUp; - buf += offsetUp; - } - if (s < sizeof(RuleBasedBreakIterator)) { - // Not enough room in the caller-supplied buffer. - // Do a plain-vanilla heap based clone and return that, along with - // a warning that the clone was allocated. - RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this); - if (clonedBI == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - } else { - status = U_SAFECLONE_ALLOCATED_WARNING; - } - return clonedBI; + BreakIterator *clonedBI = clone(); + if (clonedBI == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } else { + status = U_SAFECLONE_ALLOCATED_WARNING; } - - // - // Clone the source BI into the caller-supplied buffer. 
- // TODO: using an overloaded operator new to directly initialize the - // copy in the user's buffer would be better, but it doesn't seem - // to get along with namespaces. Investigate why. - // - // The memcpy is only safe with an empty (default constructed) - // break iterator. Use on others can screw up reference counts - // to data. memcpy-ing objects is not really a good idea... - // - RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy - RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf; - uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part - clone->init(); // Init RuleBasedBreakIterator part, (user default constructor) - *clone = *this; // clone = the real BI we want. - clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code) - - return clone; + return (RuleBasedBreakIterator *)clonedBI; } @@ -1532,10 +1605,12 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, int32_t endPos, UBool reverse) { // Reset the old break cache first. - uint32_t dictionaryCount = fDictionaryCharCount; reset(); - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { + // note: code segment below assumes that dictionary chars are in the + // startPos-endPos range + // value returned should be next character in sequence + if ((endPos - startPos) <= 1) { return (reverse ? startPos : endPos); } @@ -1640,6 +1715,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, // If we found breaks, build a new break cache. The first and last entries must // be the original starting and ending position. if (foundBreakCount > 0) { + U_ASSERT(foundBreakCount == breaks.size()); int32_t totalBreaks = foundBreakCount; if (startPos < breaks.elementAti(0)) { totalBreaks += 1; @@ -1664,7 +1740,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, // proposed break by one of the breaks we found. 
Use following() and // preceding() to do the work. They should never recurse in this case. if (reverse) { - return preceding(endPos - 1); + return preceding(endPos); } else { return following(startPos); @@ -1679,11 +1755,11 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, return (reverse ? startPos : endPos); } -static UStack *gLanguageBreakFactories = NULL; - U_NAMESPACE_END -// defined in ucln_cmn.h + +static icu::UStack *gLanguageBreakFactories = NULL; +static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; /** * Release all static memory held by breakiterator. @@ -1694,48 +1770,40 @@ static UBool U_CALLCONV breakiterator_cleanup_dict(void) { delete gLanguageBreakFactories; gLanguageBreakFactories = NULL; } + gLanguageBreakFactoriesInitOnce.reset(); return TRUE; } U_CDECL_END U_CDECL_BEGIN static void U_CALLCONV _deleteFactory(void *obj) { - delete (LanguageBreakFactory *) obj; + delete (icu::LanguageBreakFactory *) obj; } U_CDECL_END U_NAMESPACE_BEGIN -static const LanguageBreakEngine* -getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) -{ - UBool needsInit; - UErrorCode status = U_ZERO_ERROR; - umtx_lock(NULL); - needsInit = (UBool)(gLanguageBreakFactories == NULL); - umtx_unlock(NULL); - - if (needsInit) { - UStack *factories = new UStack(_deleteFactory, NULL, status); - if (U_SUCCESS(status)) { - ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); - factories->push(builtIn, status); +static void U_CALLCONV initLanguageFactories() { + UErrorCode status = U_ZERO_ERROR; + U_ASSERT(gLanguageBreakFactories == NULL); + gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); + if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { + ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); + gLanguageBreakFactories->push(builtIn, status); #ifdef U_LOCAL_SERVICE_HOOK - LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", 
&status); - if (extra != NULL) { - factories->push(extra, status); - } -#endif - } - umtx_lock(NULL); - if (gLanguageBreakFactories == NULL) { - gLanguageBreakFactories = factories; - factories = NULL; - ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); + LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); + if (extra != NULL) { + gLanguageBreakFactories->push(extra, status); } - umtx_unlock(NULL); - delete factories; +#endif } - + ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); +} + + +static const LanguageBreakEngine* +getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) +{ + umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); if (gLanguageBreakFactories == NULL) { return NULL; } @@ -1756,7 +1824,7 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) //------------------------------------------------------------------------------- // // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the -// the characer c. +// the character c. // //------------------------------------------------------------------------------- const LanguageBreakEngine * @@ -1766,7 +1834,7 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { if (fLanguageBreakEngines == NULL) { fLanguageBreakEngines = new UStack(status); - if (U_FAILURE(status)) { + if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { delete fLanguageBreakEngines; fLanguageBreakEngines = 0; return NULL;