X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..ef6cf650f4a75c3f97de06b51fa104f2069b9ea2:/icuSources/common/rbbi.cpp?ds=inline diff --git a/icuSources/common/rbbi.cpp b/icuSources/common/rbbi.cpp index 2615a4b3..43694734 100644 --- a/icuSources/common/rbbi.cpp +++ b/icuSources/common/rbbi.cpp @@ -1,6 +1,6 @@ /* *************************************************************************** -* Copyright (C) 1999-2010 International Business Machines Corporation +* Copyright (C) 1999-2016 International Business Machines Corporation * and others. All rights reserved. *************************************************************************** */ @@ -10,7 +10,7 @@ // class RuleBasedBreakIterator // -#include // for 'typeid' to work +#include "utypeinfo.h" // for 'typeid' to work #include "unicode/utypes.h" @@ -86,6 +86,36 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum } } + +// +// Construct from precompiled binary rules (tables). This constructor is public API, +// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). +// +RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, + uint32_t ruleLength, + UErrorCode &status) { + init(); + if (U_FAILURE(status)) { + return; + } + if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; + if (data->fLength > ruleLength) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); + if (U_FAILURE(status)) {return;} + if(fData == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } +} + + //------------------------------------------------------------------------------- // // Constructor from a UDataMemory handle to precompiled break rules @@ -197,6 +227,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { if (this == &that) { return *this; } + fKeepAll = that.fKeepAll; reset(); // Delete break cache information fBreakType = that.fBreakType; if (fLanguageBreakEngines != NULL) { @@ -240,7 +271,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { //----------------------------------------------------------------------------- void RuleBasedBreakIterator::init() { UErrorCode status = U_ZERO_ERROR; - fBufferClone = FALSE; fText = utext_openUChars(NULL, NULL, 0, &status); fCharIter = NULL; fSCharIter = NULL; @@ -297,6 +327,9 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const { } const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; + if (that2.fKeepAll != fKeepAll) { + return FALSE; + } if (!utext_equals(fText, that2.fText)) { // The two break iterators are operating on different text, @@ -456,10 +489,41 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) { } +/** + * Provide a new UText for the input text. Must reference text with contents identical + * to the original. + * Intended for use with text data originating in Java (garbage collected) environments + * where the data may be moved in memory at arbitrary times. + */ +RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + if (input == NULL) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + int64_t pos = utext_getNativeIndex(fText); + // Shallow read-only clone of the new UText into the existing input UText + fText = utext_clone(fText, input, FALSE, TRUE, &status); + if (U_FAILURE(status)) { + return *this; + } + utext_setNativeIndex(fText, pos); + if (utext_getNativeIndex(fText) != pos) { + // Sanity check. The new input utext is supposed to have the exact same + // contents as the old. If we can't set to the same position, it doesn't. + // The contents underlying the old utext might be invalid at this point, + // so it's not safe to check directly. + status = U_ILLEGAL_ARGUMENT_ERROR; + } + return *this; +} + /** - * Sets the current iteration position to the beginning of the text. - * @return The offset of the beginning of the text. + * Sets the current iteration position to the beginning of the text, position zero. + * @return The new iterator position, which is zero. */ int32_t RuleBasedBreakIterator::first(void) { reset(); @@ -532,7 +596,20 @@ int32_t RuleBasedBreakIterator::next(void) { } int32_t startPos = current(); + fDictionaryCharCount = 0; int32_t result = handleNext(fData->fForwardTable); + while (fKeepAll) { + UChar32 prevChr = utext_char32At(fText, result-1); + UChar32 currChr = utext_char32At(fText, result); + if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) { + break; + } + int32_t nextResult = handleNext(fData->fForwardTable); + if (nextResult <= result) { + break; + } + result = nextResult; + } if (fDictionaryCharCount > 0) { result = checkDictionary(startPos, result, FALSE); } @@ -575,6 +652,18 @@ int32_t RuleBasedBreakIterator::previous(void) { if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { result = handlePrevious(fData->fReverseTable); + while (fKeepAll) { + UChar32 prevChr = utext_char32At(fText, result-1); + UChar32 currChr = utext_char32At(fText, result); + if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) { + break; + } + int32_t prevResult = handlePrevious(fData->fReverseTable); + if (prevResult >= result) { + break; + } + result = prevResult; + } if (fDictionaryCharCount > 0) { result = checkDictionary(result, startPos, TRUE); } @@ -586,12 +675,11 @@ int32_t RuleBasedBreakIterator::previous(void) { // break position before the current position (we back our internal // iterator up one step to prevent handlePrevious() from returning // the current position), but not necessarily the last one before - // where we started int32_t start = current(); - UTEXT_PREVIOUS32(fText); + (void)UTEXT_PREVIOUS32(fText); int32_t lastResult = handlePrevious(fData->fReverseTable); if (lastResult == UBRK_DONE) { lastResult = 0; @@ -619,11 +707,11 @@ int32_t RuleBasedBreakIterator::previous(void) { // the result position that we are to return (in lastResult.) If // the backwards rules overshot and the above loop had to do two or more // next()s to move up to the desired return position, we will have a valid - // tag value. But, if handlePrevious() took us to exactly the correct result positon, + // tag value. But, if handlePrevious() took us to exactly the correct result position, // we wont have a tag value for that position, which is only set by handleNext(). - // set the current iteration position to be the last break position - // before where we started, and then return that value + // Set the current iteration position to be the last break position + // before where we started, and then return that value. utext_setNativeIndex(fText, lastResult); fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() fLastStatusIndexValid = breakTagValid; @@ -641,6 +729,22 @@ int32_t RuleBasedBreakIterator::previous(void) { * @return The position of the first break after the current position. */ int32_t RuleBasedBreakIterator::following(int32_t offset) { + // if the offset passed in is already past the end of the text, + // just return DONE; if it's before the beginning, return the + // text's starting offset + if (fText == NULL || offset >= utext_nativeLength(fText)) { + last(); + return next(); + } + else if (offset < 0) { + return first(); + } + + // Move requested offset to a code point start. It might be on a trail surrogate, + // or on a trail byte if the input is UTF-8. + utext_setNativeIndex(fText, offset); + offset = (int32_t)utext_getNativeIndex(fText); + // if we have cached break positions and offset is in the range // covered by them, use them // TODO: could use binary search @@ -662,20 +766,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { } } - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the - // text's starting offset - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - if (fText == NULL || offset >= utext_nativeLength(fText)) { - last(); - return next(); - } - else if (offset < 0) { - return first(); - } - - // otherwise, set our internal iteration position (temporarily) + // Set our internal iteration position (temporarily) // to the position passed in. If this is the _beginning_ position, // then we can just use next() to get our return value @@ -687,7 +778,8 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { // move forward one codepoint to prepare for moving back to a // safe point. // this handles offset being between a supplementary character - UTEXT_NEXT32(fText); + // TODO: is this still needed, with move to code point boundary handled above? + (void)UTEXT_NEXT32(fText); // handlePrevious will move most of the time to < 1 boundary away handlePrevious(fData->fSafeRevTable); int32_t result = next(); @@ -699,7 +791,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { if (fData->fSafeFwdTable != NULL) { // backup plan if forward safe table is not available utext_setNativeIndex(fText, offset); - UTEXT_PREVIOUS32(fText); + (void)UTEXT_PREVIOUS32(fText); // handle next will give result >= offset handleNext(fData->fSafeFwdTable); // previous will give result 0 or 1 boundary away from offset, @@ -749,6 +841,21 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { * @return The position of the last boundary before the starting position. */ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { + // if the offset passed in is already past the end of the text, + // just return DONE; if it's before the beginning, return the + // text's starting offset + if (fText == NULL || offset > utext_nativeLength(fText)) { + return last(); + } + else if (offset < 0) { + return first(); + } + + // Move requested offset to a code point start. It might be on a trail surrogate, + // or on a trail byte if the input is UTF-8. + utext_setNativeIndex(fText, offset); + offset = (int32_t)utext_getNativeIndex(fText); + // if we have cached break positions and offset is in the range // covered by them, use them if (fCachedBreakPositions != NULL) { @@ -774,17 +881,6 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { } } - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the - // text's starting offset - if (fText == NULL || offset > utext_nativeLength(fText)) { - // return BreakIterator::DONE; - return last(); - } - else if (offset < 0) { - return first(); - } - // if we start by updating the current iteration position to the // position specified by the caller, we can just use previous() // to carry out this operation @@ -799,7 +895,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { // indices to the containing code point. // For breakitereator::preceding only, these non-code-point indices need to be moved // up to refer to the following codepoint. - UTEXT_NEXT32(fText); + (void)UTEXT_NEXT32(fText); offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); } @@ -808,7 +904,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { // (Change would interact with safe rules.) // TODO: change RBBI behavior for off-boundary indices to match that of UText? // affects only preceding(), seems cleaner, but is slightly different. - UTEXT_PREVIOUS32(fText); + (void)UTEXT_PREVIOUS32(fText); handleNext(fData->fSafeFwdTable); int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); while (result >= offset) { @@ -823,7 +919,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { // if they use safe tables at all. We have certainly never described // to anyone how to work with just one safe table. utext_setNativeIndex(fText, offset); - UTEXT_NEXT32(fText); + (void)UTEXT_NEXT32(fText); // handle previous will give result <= offset handlePrevious(fData->fSafeRevTable); @@ -915,6 +1011,54 @@ enum RBBIRunMode { }; +// Map from look-ahead break states (corresponds to rules) to boundary positions. +// Allows multiple lookahead break rules to be in flight at the same time. +// +// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers +// in the state table be sequential, then we can just index an array. And the +// table could also tell us in advance how big that array needs to be. +// +// Before ICU 57 there was just a single simple variable for a look-ahead match that +// was in progress. Two rules at once did not work. + +static const int32_t kMaxLookaheads = 8; +struct LookAheadResults { + int32_t fUsedSlotLimit; + int32_t fPositions[8]; + int16_t fKeys[8]; + + LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}; + + int32_t getPosition(int16_t key) { + for (int32_t i=0; i= kMaxLookaheads) { + U_ASSERT(FALSE); + i = kMaxLookaheads - 1; + } + fKeys[i] = key; + fPositions[i] = position; + U_ASSERT(fUsedSlotLimit == i); + fUsedSlotLimit = i + 1; + } +}; + + //----------------------------------------------------------------------------------- // // handleNext(stateTable) @@ -927,19 +1071,16 @@ enum RBBIRunMode { //----------------------------------------------------------------------------------- int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { int32_t state; - int16_t category = 0; + uint16_t category = 0; RBBIRunMode mode; RBBIStateTableRow *row; UChar32 c; - int32_t lookaheadStatus = 0; - int32_t lookaheadTagIdx = 0; - int32_t result = 0; - int32_t initialPosition = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; - const char *tableData = statetable->fTableData; - uint32_t tableRowLen = statetable->fRowLen; + LookAheadResults lookAheadMatches; + int32_t result = 0; + int32_t initialPosition = 0; + const char *tableData = statetable->fTableData; + uint32_t tableRowLen = statetable->fRowLen; #ifdef RBBI_DEBUG if (fTrace) { @@ -982,14 +1123,6 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - if (lookaheadResult > result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - } break; } // Run the loop one last time with the fake end-of-input character category. @@ -1022,7 +1155,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { } } - #ifdef RBBI_DEBUG + #ifdef RBBI_DEBUG if (fTrace) { RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); if (0x20<=c && c<0x7f) { @@ -1036,7 +1169,12 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // State Transition - move machine to its next state // - state = row->fNextState[category]; + + // Note: fNextState is defined as uint16_t[2], but we are casting + // a generated RBBI table to RBBIStateTableRow and some tables + // actually have more than 2 categories. + U_ASSERT(categoryfHeader->fCatCount); + state = row->fNextState[category]; /*Not accessing beyond memory*/ row = (RBBIStateTableRow *) // (statetable->fTableData + (statetable->fRowLen * state)); (tableData + tableRowLen * state); @@ -1050,38 +1188,23 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. } - if (row->fLookAhead != 0) { - if (lookaheadStatus != 0 - && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if (lookAheadHardBreak) { - UTEXT_SETNATIVEINDEX(fText, result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on - // TODO: junk this feature? I don't think it's used anywhwere. - goto continueOn; + int16_t completedRule = row->fAccepting; + if (completedRule > 0) { + // Lookahead match is completed. + int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + fLastRuleStatusIndex = row->fTagIdx; + UTEXT_SETNATIVEINDEX(fText, lookaheadResult); + return lookaheadResult; } - - int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - lookaheadTagIdx = row->fTagIdx; - goto continueOn; } - - - if (row->fAccepting != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relavant. Clear out the pending lookahead status. - lookaheadStatus = 0; // clear out any pending look-ahead match. + int16_t rule = row->fLookAhead; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); + lookAheadMatches.setPosition(rule, pos); } -continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. // We have advanced through the string until it is certain that no @@ -1139,15 +1262,13 @@ continueOn: //----------------------------------------------------------------------------------- int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { int32_t state; - int16_t category = 0; + uint16_t category = 0; RBBIRunMode mode; RBBIStateTableRow *row; UChar32 c; - int32_t lookaheadStatus = 0; + LookAheadResults lookAheadMatches; int32_t result = 0; int32_t initialPosition = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; #ifdef RBBI_DEBUG if (fTrace) { @@ -1193,17 +1314,11 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - if (lookaheadResult < result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - lookaheadStatus = 0; - } else if (result == initialPosition) { + if (result == initialPosition) { // Ran off start, no match found. // move one index one (towards the start, since we are doing a previous()) UTEXT_SETNATIVEINDEX(fText, initialPosition); - UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. + (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. } break; } @@ -1251,7 +1366,12 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) // State Transition - move machine to its next state // - state = row->fNextState[category]; + + // Note: fNextState is defined as uint16_t[2], but we are casting + // a generated RBBI table to RBBIStateTableRow and some tables + // actually have more than 2 categories. + U_ASSERT(categoryfHeader->fCatCount); + state = row->fNextState[category]; /*Not accessing beyond memory*/ row = (RBBIStateTableRow *) (statetable->fTableData + (statetable->fRowLen * state)); @@ -1260,36 +1380,22 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } - if (row->fLookAhead != 0) { - if (lookaheadStatus != 0 - && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. - result = lookaheadResult; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if (lookAheadHardBreak) { - UTEXT_SETNATIVEINDEX(fText, result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on - // TODO: junk this feature? I don't think it's used anywhwere. - goto continueOn; + int16_t completedRule = row->fAccepting; + if (completedRule > 0) { + // Lookahead match is completed. + int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + UTEXT_SETNATIVEINDEX(fText, lookaheadResult); + return lookaheadResult; } - - int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - goto continueOn; } - - - if (row->fAccepting != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relavant. Clear out the pending lookahead status. - lookaheadStatus = 0; + int16_t rule = row->fLookAhead; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); + lookAheadMatches.setPosition(rule, pos); } -continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. // We have advanced through the string until it is certain that no @@ -1444,19 +1550,7 @@ const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { } - - -//------------------------------------------------------------------------------- -// -// BufferClone TODO: In my (Andy) opinion, this function should be deprecated. -// Saving one heap allocation isn't worth the trouble. -// Cloning shouldn't be done in tight loops, and -// making the clone copy involves other heap operations anyway. -// And the application code for correctly dealing with buffer -// size problems and the eventual object destruction is ugly. -// -//------------------------------------------------------------------------------- -BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, +BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) { @@ -1464,62 +1558,18 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, return NULL; } - // - // If user buffer size is zero this is a preflight operation to - // obtain the needed buffer size, allowing for worst case misalignment. - // if (bufferSize == 0) { - bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0); + bufferSize = 1; // preflighting for deprecated functionality return NULL; } - - // - // Check the alignment and size of the user supplied buffer. - // Allocate heap memory if the user supplied memory is insufficient. - // - char *buf = (char *)stackBuffer; - uint32_t s = bufferSize; - - if (stackBuffer == NULL) { - s = 0; // Ignore size, force allocation if user didn't give us a buffer. - } - if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { - uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf); - s -= offsetUp; - buf += offsetUp; - } - if (s < sizeof(RuleBasedBreakIterator)) { - // Not enough room in the caller-supplied buffer. - // Do a plain-vanilla heap based clone and return that, along with - // a warning that the clone was allocated. - RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this); - if (clonedBI == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - } else { - status = U_SAFECLONE_ALLOCATED_WARNING; - } - return clonedBI; + BreakIterator *clonedBI = clone(); + if (clonedBI == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } else { + status = U_SAFECLONE_ALLOCATED_WARNING; } - - // - // Clone the source BI into the caller-supplied buffer. - // TODO: using an overloaded operator new to directly initialize the - // copy in the user's buffer would be better, but it doesn't seem - // to get along with namespaces. Investigate why. - // - // The memcpy is only safe with an empty (default constructed) - // break iterator. Use on others can screw up reference counts - // to data. memcpy-ing objects is not really a good idea... - // - RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy - RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf; - uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part - clone->init(); // Init RuleBasedBreakIterator part, (user default constructor) - *clone = *this; // clone = the real BI we want. - clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code) - - return clone; + return (RuleBasedBreakIterator *)clonedBI; } @@ -1555,37 +1605,15 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, int32_t endPos, UBool reverse) { // Reset the old break cache first. - uint32_t dictionaryCount = fDictionaryCharCount; reset(); - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { + // note: code segment below assumes that dictionary chars are in the + // startPos-endPos range + // value returned should be next character in sequence + if ((endPos - startPos) <= 1) { return (reverse ? startPos : endPos); } - // Bug 5532. The dictionary code will crash if the input text is UTF-8 - // because native indexes are different from UTF-16 indexes. - // Temporary hack: skip dictionary lookup for UTF-8 encoded text. - // It wont give the right breaks, but it's better than a crash. - // - // Check the type of the UText by checking its pFuncs field, which - // is UText's function dispatch table. It will be the same for all - // UTF-8 UTexts and different for any other UText type. - // - // We have no other type of UText available with non-UTF-16 native indexing. - // This whole check will go away once the dictionary code is fixed. - static const void *utext_utf8Funcs; - if (utext_utf8Funcs == NULL) { - // Cache the UTF-8 UText function pointer value. - UErrorCode status = U_ZERO_ERROR; - UText tempUText = UTEXT_INITIALIZER; - utext_openUTF8(&tempUText, NULL, 0, &status); - utext_utf8Funcs = tempUText.pFuncs; - utext_close(&tempUText); - } - if (fText->pFuncs == utext_utf8Funcs) { - return (reverse ? startPos : endPos); - } - // Starting from the starting point, scan towards the proposed result, // looking for the first dictionary character (which may be the one // we're on, if we're starting in the middle of a range). @@ -1687,6 +1715,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, // If we found breaks, build a new break cache. The first and last entries must // be the original starting and ending position. if (foundBreakCount > 0) { + U_ASSERT(foundBreakCount == breaks.size()); int32_t totalBreaks = foundBreakCount; if (startPos < breaks.elementAti(0)) { totalBreaks += 1; @@ -1711,7 +1740,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, // proposed break by one of the breaks we found. Use following() and // preceding() to do the work. They should never recurse in this case. if (reverse) { - return preceding(endPos - 1); + return preceding(endPos); } else { return following(startPos); @@ -1728,9 +1757,9 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, U_NAMESPACE_END -// defined in ucln_cmn.h -static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL; +static icu::UStack *gLanguageBreakFactories = NULL; +static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; /** * Release all static memory held by breakiterator. @@ -1741,46 +1770,40 @@ static UBool U_CALLCONV breakiterator_cleanup_dict(void) { delete gLanguageBreakFactories; gLanguageBreakFactories = NULL; } + gLanguageBreakFactoriesInitOnce.reset(); return TRUE; } U_CDECL_END U_CDECL_BEGIN static void U_CALLCONV _deleteFactory(void *obj) { - delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj; + delete (icu::LanguageBreakFactory *) obj; } U_CDECL_END U_NAMESPACE_BEGIN -static const LanguageBreakEngine* -getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) -{ - UBool needsInit; - UErrorCode status = U_ZERO_ERROR; - UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit); - - if (needsInit) { - UStack *factories = new UStack(_deleteFactory, NULL, status); - if (factories != NULL && U_SUCCESS(status)) { - ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); - factories->push(builtIn, status); +static void U_CALLCONV initLanguageFactories() { + UErrorCode status = U_ZERO_ERROR; + U_ASSERT(gLanguageBreakFactories == NULL); + gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); + if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { + ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); + gLanguageBreakFactories->push(builtIn, status); #ifdef U_LOCAL_SERVICE_HOOK - LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); - if (extra != NULL) { - factories->push(extra, status); - } -#endif - } - umtx_lock(NULL); - if (gLanguageBreakFactories == NULL) { - gLanguageBreakFactories = factories; - factories = NULL; - ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); + LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); + if (extra != NULL) { + gLanguageBreakFactories->push(extra, status); } - umtx_unlock(NULL); - delete factories; +#endif } - + ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); +} + + +static const LanguageBreakEngine* +getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) +{ + umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); if (gLanguageBreakFactories == NULL) { return NULL; } @@ -1801,7 +1824,7 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) //------------------------------------------------------------------------------- // // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the -// the characer c. +// the character c. // //------------------------------------------------------------------------------- const LanguageBreakEngine *