X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b331163bffd790ced0e88b73f44f86d49ccc48a5..ef6cf650f4a75c3f97de06b51fa104f2069b9ea2:/icuSources/common/filteredbrk.cpp diff --git a/icuSources/common/filteredbrk.cpp b/icuSources/common/filteredbrk.cpp index 549c8687..38c0c606 100644 --- a/icuSources/common/filteredbrk.cpp +++ b/icuSources/common/filteredbrk.cpp @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (C) 2014-2015, International Business Machines Corporation and +* Copyright (C) 2014-2016, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ @@ -43,6 +43,9 @@ static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, #define FB_TRACE(m,s,b,d) #endif +/** + * Used with sortedInsert() + */ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { const UnicodeString &a = *(const UnicodeString*)t1.pointer; const UnicodeString &b = *(const UnicodeString*)t2.pointer; @@ -117,23 +120,46 @@ class U_COMMON_API UStringSet : public UVector { */ UStringSet::~UStringSet() {} +/* ----------------------------------------------------------- */ + +/* Filtered Break constants */ static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie static const int32_t kMATCH = (1<<1); //< exact match - skip this one. static const int32_t kSuppressInReverse = (1<<0); static const int32_t kAddToForward = (1<<1); -static const UChar kFULLSTOP = 0x002E; // '.' +static const UChar kFULLSTOP = 0x002E; // '.' + +/** + * Shared data for SimpleFilteredSentenceBreakIterator + */ +class SimpleFilteredSentenceBreakData : public UMemory { +public: + SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) + : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } + SimpleFilteredSentenceBreakData *incr() { refcount++; return this; } + SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; } + virtual ~SimpleFilteredSentenceBreakData(); + LocalPointer fForwardsPartialTrie; // Has ".a" for "a.M." + LocalPointer fBackwardsTrie; // i.e. ".srM" for Mrs. + int32_t refcount; +}; + +SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} + +/** + * Concrete implementation + */ class SimpleFilteredSentenceBreakIterator : public BreakIterator { public: SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); virtual ~SimpleFilteredSentenceBreakIterator(); private: + SimpleFilteredSentenceBreakData *fData; LocalPointer fDelegate; LocalUTextPointer fText; - LocalPointer fBackwardsTrie; // i.e. ".srM" for Mrs. - LocalPointer fForwardsPartialTrie; // Has ".a" for "a.M." /* -- subclass interface -- */ public: @@ -160,78 +186,82 @@ public: virtual CharacterIterator& getText(void) const { return fDelegate->getText(); } /* -- ITERATION -- */ - virtual int32_t first(void) { return fDelegate->first(); } - virtual UBool isBoundary(int32_t offset) { return fDelegate->isBoundary(offset); } - virtual int32_t current(void) const { return fDelegate->current(); } - virtual int32_t next(int32_t n) { return fDelegate->next(n); } // fallback implementation, undoing r36410 - virtual int32_t last(void) { return fDelegate->last(); } + virtual int32_t first(void); + virtual int32_t preceding(int32_t offset); + virtual int32_t previous(void); + virtual UBool isBoundary(int32_t offset); + virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct. virtual int32_t next(void); + + virtual int32_t next(int32_t n); virtual int32_t following(int32_t offset); - virtual int32_t previous(void); - virtual int32_t preceding(int32_t offset); + virtual int32_t last(void); private: - virtual int32_t nextCore(int32_t n); - virtual int32_t previousCore(int32_t n); - + /** + * Given that the fDelegate has already given its "initial" answer, + * find the NEXT actual (non-excepted) break. + * @param n initial position from delegate + * @return new break position or UBRK_DONE + */ + int32_t internalNext(int32_t n); + /** + * Given that the fDelegate has already given its "initial" answer, + * find the PREV actual (non-excepted) break. + * @param n initial position from delegate + * @return new break position or UBRK_DONE + */ + int32_t internalPrev(int32_t n); + /** + * set up the UText with the value of the fDelegate. + * Call this before calling breakExceptionAt. + * May be able to avoid excess calls + */ + void resetState(UErrorCode &status); + /** + * Is there a match (exception) at this spot? + */ + enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; + /** + * Determine if there is an exception at this spot + * @param n spot to check + * @return kNoExceptionHere or kExceptionHere + **/ + enum EFBMatchResult breakExceptionAt(int32_t n); }; SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) - : BreakIterator(other), fDelegate(other.fDelegate->clone()) + : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) { - /* - TODO: not able to clone Tries. Should be a refcounted hidden master instead. - if(other.fBackwardsTrie.isValid()) { - fBackwardsTrie.adoptInstead(other.fBackwardsTrie->clone()); - } - if(other.fForwardsPartialTrie.isValid()) { - fForwardsPartialTrie.adoptInstead(other.fForwardsPartialTrie->clone()); - } - */ } SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), - fDelegate(adopt), - fBackwardsTrie(backwards), - fForwardsPartialTrie(forwards) + fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), + fDelegate(adopt) { // all set.. } -SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {} - -int32_t SimpleFilteredSentenceBreakIterator::next() { - int32_t n = fDelegate->next(); - return nextCore(n); +SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { + fData = fData->decr(); } -int32_t SimpleFilteredSentenceBreakIterator::following(int32_t offset) { - int32_t n = fDelegate->following(offset); - return nextCore(n); +void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { + fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); } -int32_t SimpleFilteredSentenceBreakIterator::nextCore(int32_t n) { - if(n == UBRK_DONE || // at end or - fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions - return n; - } - // OK, do we need to break here? - UErrorCode status = U_ZERO_ERROR; - // refresh text - fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); - int64_t utextLen = utext_nativeLength(fText.getAlias()); - if(n == utextLen) { - return n; - } - //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); - do { // outer loop runs once per underlying break (from fDelegate). +SimpleFilteredSentenceBreakIterator::EFBMatchResult +SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { + int64_t bestPosn = -1; + int32_t bestValue = -1; // loops while 'n' points to an exception. utext_setNativeIndex(fText.getAlias(), n); // from n.. - fBackwardsTrie->reset(); + fData->fBackwardsTrie->reset(); UChar32 uch; + //if(debug2) u_printf(" n@ %d\n", n); // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? @@ -242,23 +272,21 @@ int32_t SimpleFilteredSentenceBreakIterator::nextCore(int32_t n) { uch = utext_next32(fText.getAlias()); //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); } - UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; - int32_t bestPosn = -1; - int32_t bestValue = -1; + UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. - USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie + USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far bestPosn = utext_getNativeIndex(fText.getAlias()); - bestValue = fBackwardsTrie->getValue(); + bestValue = fData->fBackwardsTrie->getValue(); } //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); } if(USTRINGTRIE_MATCHES(r)) { // exact match? //if(debug2) u_printf("revgetValue(); + bestValue = fData->fBackwardsTrie->getValue(); bestPosn = utext_getNativeIndex(fText.getAlias()); //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); } @@ -270,128 +298,164 @@ int32_t SimpleFilteredSentenceBreakIterator::nextCore(int32_t n) { //int32_t bestValue = fBackwardsTrie->getValue(); ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); + if(bestPosn>0) { + UChar32 prevch = utext_char32At(fText.getAlias(), bestPosn-1); // char before the best match + if (prevch != U_SENTINEL && u_isUAlphabetic(prevch)) { + // The match is preceded by other alphabetic characters, => invalid + return kNoExceptionHere; + } + } + if(bestValue == kMATCH) { // exact match! //if(debug2) u_printf(" exact backward match\n"); - n = fDelegate->next(); // skip this one. Find the next lowerlevel break. - if(n==UBRK_DONE || n==utextLen) return n; - continue; // See if the next is another exception. + return kExceptionHere; // See if the next is another exception. } else if(bestValue == kPARTIAL - && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie + && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie //if(debug2) u_printf(" partial backward match\n"); // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie // to see if it matches something going forward. - fForwardsPartialTrie->reset(); + fData->fForwardsPartialTrie->reset(); UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. //if(debug2) u_printf("Retrying at %d\n", bestPosn); while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && - USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) { + USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) { //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); } if(USTRINGTRIE_MATCHES(rfwd)) { //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch); // only full matches here, nothing to check // skip the next: - n = fDelegate->next(); - if(n==UBRK_DONE || n==utextLen) return n; - continue; + return kExceptionHere; } else { //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch); // no match (no exception) -return the 'underlying' break - return n; + return kNoExceptionHere; } } else { - return n; // internal error and/or no forwards trie + return kNoExceptionHere; // internal error and/or no forwards trie } } else { //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match - return n; // No match - so exit. Not an exception. + return kNoExceptionHere; // No match - so exit. Not an exception. } - } while(n != UBRK_DONE); - return n; } -int32_t SimpleFilteredSentenceBreakIterator::previous() { - int32_t n = fDelegate->previous(); - return previousCore(n); -} +// the workhorse single next. +int32_t +SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { + if(n == UBRK_DONE || // at end or + fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions + return n; + } + // OK, do we need to break here? + UErrorCode status = U_ZERO_ERROR; + // refresh text + resetState(status); + if(U_FAILURE(status)) return UBRK_DONE; // bail out + int64_t utextLen = utext_nativeLength(fText.getAlias()); + + //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); + while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate). + SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); + + switch(m) { + case kExceptionHere: + n = fDelegate->next(); // skip this one. Find the next lowerlevel break. + continue; -int32_t SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { - int32_t n = fDelegate->preceding(offset); - return previousCore(n); + default: + case kNoExceptionHere: + return n; + } + } + return n; } -int32_t SimpleFilteredSentenceBreakIterator::previousCore(int32_t n) { - if(n == UBRK_DONE || n == 0 || // at end or - fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions - return n; +int32_t +SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { + if(n == 0 || n == UBRK_DONE || // at end or + fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions + return n; } // OK, do we need to break here? UErrorCode status = U_ZERO_ERROR; // refresh text - fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); - do { // outer loop runs once per underlying break (from fDelegate). - // loops while 'n' points to an exception. - utext_setNativeIndex(fText.getAlias(), n); // from n.. - fBackwardsTrie->reset(); - UChar32 uch; - // Skip over any space preceding the break - if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? - // TODO only do this the 1st time? - } else { - //restore what we skipped - uch = utext_next32(fText.getAlias()); - } - UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; + resetState(status); + if(U_FAILURE(status)) return UBRK_DONE; // bail out - int32_t bestPosn = -1; - int32_t bestValue = -1; + //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); + while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). + SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); - while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. - USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie - if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far - bestPosn = utext_getNativeIndex(fText.getAlias()); - bestValue = fBackwardsTrie->getValue(); - } - } + switch(m) { + case kExceptionHere: + n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. + continue; - if(USTRINGTRIE_MATCHES(r)) { // exact match? - bestValue = fBackwardsTrie->getValue(); - bestPosn = utext_getNativeIndex(fText.getAlias()); + default: + case kNoExceptionHere: + return n; } + } + return n; +} - if(bestPosn>=0) { - if(bestValue == kMATCH) { // exact match! - n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. - if(n==UBRK_DONE || n==0) return n; - continue; // See if the next is another exception. - } else if(bestValue == kPARTIAL - && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie - fForwardsPartialTrie->reset(); - UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; - utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. - while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && - USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) { - } - if(USTRINGTRIE_MATCHES(rfwd)) { - n = fDelegate->previous(); - if(n==UBRK_DONE || n==0) return n; - continue; - } else { - // no match (no exception) -return the 'underlying' break - return n; - } - } else { - return n; // internal error and/or no forwards trie - } - } else { - return n; // No match - so exit. Not an exception. - } - } while(n != UBRK_DONE); - return n; +int32_t +SimpleFilteredSentenceBreakIterator::next() { + return internalNext(fDelegate->next()); +} + +int32_t +SimpleFilteredSentenceBreakIterator::first(void) { + return internalNext(fDelegate->first()); } +int32_t +SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { + return internalPrev(fDelegate->preceding(offset)); +} + +int32_t +SimpleFilteredSentenceBreakIterator::previous(void) { + return internalPrev(fDelegate->previous()); +} + +UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { + if(!fDelegate->isBoundary(offset)) return false; // no break to suppress + + UErrorCode status = U_ZERO_ERROR; + resetState(status); + + SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); + + switch(m) { + case kExceptionHere: + return false; + default: + case kNoExceptionHere: + return true; + } +} + +int32_t +SimpleFilteredSentenceBreakIterator::next(int32_t offset) { + return internalNext(fDelegate->next(offset)); +} + +int32_t +SimpleFilteredSentenceBreakIterator::following(int32_t offset) { + return internalNext(fDelegate->following(offset)); +} + +int32_t +SimpleFilteredSentenceBreakIterator::last(void) { + // Don't suppress a break opportunity at the end of text. + return fDelegate->last(); +} + + /** * Concrete implementation of builder class. */ @@ -411,7 +475,7 @@ SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() { } -SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) +SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) : fSet(status) { } @@ -483,7 +547,7 @@ SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UEr int32_t subCount = fSet.size(); UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount); - + LocalArray ustrs(ustrs_ptr); LocalMemory partials; @@ -523,7 +587,7 @@ SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UEr FB_TRACE("prefix",&ustrs[j],FALSE,nn+1); //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn if(partials[j]==0) { // hasn't been processed yet - partials[j] = kSuppressInReverse | kAddToForward; + partials[j] = (ustrs[j].length() == nn+1)? (kSuppressInReverse | kAddToForward): kAddToForward; FB_TRACE("suppressing",&ustrs[j],FALSE,j); } else if(partials[j] & kSuppressInReverse) { sameAs = j; // the other entry is already in the reverse table. @@ -540,7 +604,7 @@ SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UEr revCount++; FB_TRACE("Added partial",&prefix,FALSE, i); FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); - partials[i] = kSuppressInReverse | kAddToForward; + partials[i] = kAddToForward; } else { FB_TRACE("NOT adding partial",&prefix,FALSE, i); FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); @@ -548,12 +612,13 @@ SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UEr } } for(int i=0;iadd(ustrs[i], kMATCH, status); revCount++; FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i); - } else { + } + if((partials[i] & kAddToForward) != 0) { FB_TRACE("Adding fwd",&ustrs[i], FALSE, i); // an optimization would be to only add the portion after the '.'