X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/57a6839dcb3bba09e8228b822b290604668416fe..ef6cf650f4a75c3f97de06b51fa104f2069b9ea2:/icuSources/common/rbbi.cpp diff --git a/icuSources/common/rbbi.cpp b/icuSources/common/rbbi.cpp index f091a3ac..43694734 100644 --- a/icuSources/common/rbbi.cpp +++ b/icuSources/common/rbbi.cpp @@ -1,6 +1,6 @@ /* *************************************************************************** -* Copyright (C) 1999-2014 International Business Machines Corporation +* Copyright (C) 1999-2016 International Business Machines Corporation * and others. All rights reserved. *************************************************************************** */ @@ -227,6 +227,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { if (this == &that) { return *this; } + fKeepAll = that.fKeepAll; reset(); // Delete break cache information fBreakType = that.fBreakType; if (fLanguageBreakEngines != NULL) { @@ -326,6 +327,9 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const { } const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; + if (that2.fKeepAll != fKeepAll) { + return FALSE; + } if (!utext_equals(fText, that2.fText)) { // The two break iterators are operating on different text, @@ -518,8 +522,8 @@ RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, U /** - * Sets the current iteration position to the beginning of the text. - * @return The offset of the beginning of the text. + * Sets the current iteration position to the beginning of the text, position zero. + * @return The new iterator position, which is zero. 
*/ int32_t RuleBasedBreakIterator::first(void) { reset(); @@ -594,6 +598,18 @@ int32_t RuleBasedBreakIterator::next(void) { int32_t startPos = current(); fDictionaryCharCount = 0; int32_t result = handleNext(fData->fForwardTable); + while (fKeepAll) { + UChar32 prevChr = utext_char32At(fText, result-1); + UChar32 currChr = utext_char32At(fText, result); + if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) { + break; + } + int32_t nextResult = handleNext(fData->fForwardTable); + if (nextResult <= result) { + break; + } + result = nextResult; + } if (fDictionaryCharCount > 0) { result = checkDictionary(startPos, result, FALSE); } @@ -636,6 +652,18 @@ int32_t RuleBasedBreakIterator::previous(void) { if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { result = handlePrevious(fData->fReverseTable); + while (fKeepAll) { + UChar32 prevChr = utext_char32At(fText, result-1); + UChar32 currChr = utext_char32At(fText, result); + if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) { + break; + } + int32_t prevResult = handlePrevious(fData->fReverseTable); + if (prevResult >= result) { + break; + } + result = prevResult; + } if (fDictionaryCharCount > 0) { result = checkDictionary(result, startPos, TRUE); } @@ -701,6 +729,22 @@ int32_t RuleBasedBreakIterator::previous(void) { * @return The position of the first break after the current position. */ int32_t RuleBasedBreakIterator::following(int32_t offset) { + // if the offset passed in is already past the end of the text, + // just return DONE; if it's before the beginning, return the + // text's starting offset + if (fText == NULL || offset >= utext_nativeLength(fText)) { + last(); + return next(); + } + else if (offset < 0) { + return first(); + } + + // Move requested offset to a code point start. It might be on a trail surrogate, + // or on a trail byte if the input is UTF-8. 
+ utext_setNativeIndex(fText, offset); + offset = (int32_t)utext_getNativeIndex(fText); + // if we have cached break positions and offset is in the range // covered by them, use them // TODO: could use binary search @@ -722,20 +766,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { } } - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the - // text's starting offset - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = TRUE; - if (fText == NULL || offset >= utext_nativeLength(fText)) { - last(); - return next(); - } - else if (offset < 0) { - return first(); - } - - // otherwise, set our internal iteration position (temporarily) + // Set our internal iteration position (temporarily) // to the position passed in. If this is the _beginning_ position, // then we can just use next() to get our return value @@ -747,6 +778,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { // move forward one codepoint to prepare for moving back to a // safe point. // this handles offset being between a supplementary character + // TODO: is this still needed, with move to code point boundary handled above? (void)UTEXT_NEXT32(fText); // handlePrevious will move most of the time to < 1 boundary away handlePrevious(fData->fSafeRevTable); @@ -809,6 +841,21 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { * @return The position of the last boundary before the starting position. */ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { + // if the offset passed in is already past the end of the text, + // just return DONE; if it's before the beginning, return the + // text's starting offset + if (fText == NULL || offset > utext_nativeLength(fText)) { + return last(); + } + else if (offset < 0) { + return first(); + } + + // Move requested offset to a code point start. It might be on a trail surrogate, + // or on a trail byte if the input is UTF-8. 
+ utext_setNativeIndex(fText, offset); + offset = (int32_t)utext_getNativeIndex(fText); + // if we have cached break positions and offset is in the range // covered by them, use them if (fCachedBreakPositions != NULL) { @@ -834,17 +881,6 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { } } - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the - // text's starting offset - if (fText == NULL || offset > utext_nativeLength(fText)) { - // return BreakIterator::DONE; - return last(); - } - else if (offset < 0) { - return first(); - } - // if we start by updating the current iteration position to the // position specified by the caller, we can just use previous() // to carry out this operation @@ -975,6 +1011,54 @@ enum RBBIRunMode { }; +// Map from look-ahead break states (corresponds to rules) to boundary positions. +// Allows multiple lookahead break rules to be in flight at the same time. +// +// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers +// in the state table be sequential, then we can just index an array. And the +// table could also tell us in advance how big that array needs to be. +// +// Before ICU 57 there was just a single simple variable for a look-ahead match that +// was in progress. Two rules at once did not work. 
+ +static const int32_t kMaxLookaheads = 8; +struct LookAheadResults { + int32_t fUsedSlotLimit; + int32_t fPositions[8]; + int16_t fKeys[8]; + + LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}; + + int32_t getPosition(int16_t key) { + for (int32_t i=0; i<fUsedSlotLimit; ++i) { + if (fKeys[i] == key) { + return fPositions[i]; + } + } + U_ASSERT(FALSE); + return -1; + } + + void setPosition(int16_t key, int32_t position) { + int32_t i; + for (i=0; i<fUsedSlotLimit; ++i) { + if (fKeys[i] == key) { + fPositions[i] = position; + return; + } + } + if (i >= kMaxLookaheads) { + U_ASSERT(FALSE); + i = kMaxLookaheads - 1; + } + fKeys[i] = key; + fPositions[i] = position; + U_ASSERT(fUsedSlotLimit == i); + fUsedSlotLimit = i + 1; + } +}; + + //----------------------------------------------------------------------------------- // // handleNext(stateTable) @@ -992,14 +1076,11 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { RBBIStateTableRow *row; UChar32 c; - int32_t lookaheadStatus = 0; - int32_t lookaheadTagIdx = 0; - int32_t result = 0; - int32_t initialPosition = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; - const char *tableData = statetable->fTableData; - uint32_t tableRowLen = statetable->fRowLen; + LookAheadResults lookAheadMatches; + int32_t result = 0; + int32_t initialPosition = 0; + const char *tableData = statetable->fTableData; + uint32_t tableRowLen = statetable->fRowLen; #ifdef RBBI_DEBUG if (fTrace) { @@ -1042,14 +1123,6 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - if (lookaheadResult > result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - } break; } // Run the loop one last time with the fake end-of-input character category. 
@@ -1115,38 +1188,23 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. } - if (row->fLookAhead != 0) { - if (lookaheadStatus != 0 - && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if (lookAheadHardBreak) { - UTEXT_SETNATIVEINDEX(fText, result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on - // TODO: junk this feature? I don't think it's used anywhwere. - goto continueOn; + int16_t completedRule = row->fAccepting; + if (completedRule > 0) { + // Lookahead match is completed. + int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + fLastRuleStatusIndex = row->fTagIdx; + UTEXT_SETNATIVEINDEX(fText, lookaheadResult); + return lookaheadResult; } - - int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - lookaheadTagIdx = row->fTagIdx; - goto continueOn; } - - - if (row->fAccepting != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relavant. Clear out the pending lookahead status. - lookaheadStatus = 0; // clear out any pending look-ahead match. + int16_t rule = row->fLookAhead; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); + lookAheadMatches.setPosition(rule, pos); } -continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. 
// We have advanced through the string until it is certain that no @@ -1208,11 +1266,9 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) RBBIRunMode mode; RBBIStateTableRow *row; UChar32 c; - int32_t lookaheadStatus = 0; + LookAheadResults lookAheadMatches; int32_t result = 0; int32_t initialPosition = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; #ifdef RBBI_DEBUG if (fTrace) { @@ -1258,13 +1314,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. - if (lookaheadResult < result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - lookaheadStatus = 0; - } else if (result == initialPosition) { + if (result == initialPosition) { // Ran off start, no match found. // move one index one (towards the start, since we are doing a previous()) UTEXT_SETNATIVEINDEX(fText, initialPosition); @@ -1330,36 +1380,22 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } - if (row->fLookAhead != 0) { - if (lookaheadStatus != 0 - && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. - result = lookaheadResult; - lookaheadStatus = 0; - // TODO: make a standalone hard break in a rule work. - if (lookAheadHardBreak) { - UTEXT_SETNATIVEINDEX(fText, result); - return result; - } - // Look-ahead completed, but other rules may match further. Continue on - // TODO: junk this feature? I don't think it's used anywhwere. 
- goto continueOn; + int16_t completedRule = row->fAccepting; + if (completedRule > 0) { + // Lookahead match is completed. + int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); + if (lookaheadResult >= 0) { + UTEXT_SETNATIVEINDEX(fText, lookaheadResult); + return lookaheadResult; } - - int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - goto continueOn; } - - - if (row->fAccepting != 0) { - // Because this is an accepting state, any in-progress look-ahead match - // is no longer relavant. Clear out the pending lookahead status. - lookaheadStatus = 0; + int16_t rule = row->fLookAhead; + if (rule != 0) { + // At the position of a '/' in a look-ahead match. Record it. + int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); + lookAheadMatches.setPosition(rule, pos); } -continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. // We have advanced through the string until it is certain that no @@ -1578,30 +1614,6 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, return (reverse ? startPos : endPos); } - // Bug 5532. The dictionary code will crash if the input text is UTF-8 - // because native indexes are different from UTF-16 indexes. - // Temporary hack: skip dictionary lookup for UTF-8 encoded text. - // It wont give the right breaks, but it's better than a crash. - // - // Check the type of the UText by checking its pFuncs field, which - // is UText's function dispatch table. It will be the same for all - // UTF-8 UTexts and different for any other UText type. - // - // We have no other type of UText available with non-UTF-16 native indexing. - // This whole check will go away once the dictionary code is fixed. - static const void *utext_utf8Funcs; - if (utext_utf8Funcs == NULL) { - // Cache the UTF-8 UText function pointer value. 
- UErrorCode status = U_ZERO_ERROR; - UText tempUText = UTEXT_INITIALIZER; - utext_openUTF8(&tempUText, NULL, 0, &status); - utext_utf8Funcs = tempUText.pFuncs; - utext_close(&tempUText); - } - if (fText->pFuncs == utext_utf8Funcs) { - return (reverse ? startPos : endPos); - } - // Starting from the starting point, scan towards the proposed result, // looking for the first dictionary character (which may be the one // we're on, if we're starting in the middle of a range). @@ -1743,8 +1755,6 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, return (reverse ? startPos : endPos); } -// defined in ucln_cmn.h - U_NAMESPACE_END