/*
*******************************************************************************
-* Copyright (C) 2014-2015, International Business Machines Corporation and
+* Copyright (C) 2014-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
#define FB_TRACE(m,s,b,d)
#endif
+/**
+ * Used with sortedInsert()
+ */
static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
const UnicodeString &a = *(const UnicodeString*)t1.pointer;
const UnicodeString &b = *(const UnicodeString*)t2.pointer;
*/
UStringSet::~UStringSet() {}
+/* ----------------------------------------------------------- */
+
+/* Filtered Break constants */
static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
static const int32_t kSuppressInReverse = (1<<0);
static const int32_t kAddToForward = (1<<1);
-static const UChar kFULLSTOP = 0x002E; // '.'
+static const UChar kFULLSTOP = 0x002E; // '.'
+
+/**
+ * Shared data for SimpleFilteredSentenceBreakIterator
+ */
+class SimpleFilteredSentenceBreakData : public UMemory {
+public:
+ SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
+ : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
+ SimpleFilteredSentenceBreakData *incr() { refcount++; return this; }
+ SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
+ virtual ~SimpleFilteredSentenceBreakData();
+ LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
+ LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
+ int32_t refcount;
+};
+
+SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
+
+/**
+ * Concrete implementation
+ */
class SimpleFilteredSentenceBreakIterator : public BreakIterator {
public:
SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
virtual ~SimpleFilteredSentenceBreakIterator();
private:
+ SimpleFilteredSentenceBreakData *fData;
LocalPointer<BreakIterator> fDelegate;
LocalUTextPointer fText;
- LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
- LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
/* -- subclass interface -- */
public:
virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
/* -- ITERATION -- */
- virtual int32_t first(void) { return fDelegate->first(); }
- virtual UBool isBoundary(int32_t offset) { return fDelegate->isBoundary(offset); }
- virtual int32_t current(void) const { return fDelegate->current(); }
- virtual int32_t next(int32_t n) { return fDelegate->next(n); } // fallback implementation, undoing r36410
- virtual int32_t last(void) { return fDelegate->last(); }
+ virtual int32_t first(void);
+ virtual int32_t preceding(int32_t offset);
+ virtual int32_t previous(void);
+ virtual UBool isBoundary(int32_t offset);
+ virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
virtual int32_t next(void);
+
+ virtual int32_t next(int32_t n);
virtual int32_t following(int32_t offset);
- virtual int32_t previous(void);
- virtual int32_t preceding(int32_t offset);
+ virtual int32_t last(void);
private:
- virtual int32_t nextCore(int32_t n);
- virtual int32_t previousCore(int32_t n);
-
+ /**
+ * Given that the fDelegate has already given its "initial" answer,
+ * find the NEXT actual (non-excepted) break.
+ * @param n initial position from delegate
+ * @return new break position or UBRK_DONE
+ */
+ int32_t internalNext(int32_t n);
+ /**
+ * Given that the fDelegate has already given its "initial" answer,
+ * find the PREV actual (non-excepted) break.
+ * @param n initial position from delegate
+ * @return new break position or UBRK_DONE
+ */
+ int32_t internalPrev(int32_t n);
+ /**
+ * set up the UText with the value of the fDelegate.
+ * Call this before calling breakExceptionAt.
+ * May be able to avoid excess calls
+ */
+ void resetState(UErrorCode &status);
+ /**
+ * Is there a match (exception) at this spot?
+ */
+ enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
+ /**
+ * Determine if there is an exception at this spot
+ * @param n spot to check
+ * @return kNoExceptionHere or kExceptionHere
+ **/
+ enum EFBMatchResult breakExceptionAt(int32_t n);
};
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
- : BreakIterator(other), fDelegate(other.fDelegate->clone())
+ : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone())
{
- /*
- TODO: not able to clone Tries. Should be a refcounted hidden master instead.
- if(other.fBackwardsTrie.isValid()) {
- fBackwardsTrie.adoptInstead(other.fBackwardsTrie->clone());
- }
- if(other.fForwardsPartialTrie.isValid()) {
- fForwardsPartialTrie.adoptInstead(other.fForwardsPartialTrie->clone());
- }
- */
}
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
- fDelegate(adopt),
- fBackwardsTrie(backwards),
- fForwardsPartialTrie(forwards)
+ fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
+ fDelegate(adopt)
{
// all set..
}
-SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {}
-
-int32_t SimpleFilteredSentenceBreakIterator::next() {
- int32_t n = fDelegate->next();
- return nextCore(n);
+SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
+ fData = fData->decr();
}
-int32_t SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
- int32_t n = fDelegate->following(offset);
- return nextCore(n);
+void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
+ fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
}
-int32_t SimpleFilteredSentenceBreakIterator::nextCore(int32_t n) {
- if(n == UBRK_DONE || // at end or
- fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
- return n;
- }
- // OK, do we need to break here?
- UErrorCode status = U_ZERO_ERROR;
- // refresh text
- fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
- int64_t utextLen = utext_nativeLength(fText.getAlias());
- if(n == utextLen) {
- return n;
- }
- //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
- do { // outer loop runs once per underlying break (from fDelegate).
+SimpleFilteredSentenceBreakIterator::EFBMatchResult
+SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
+ int64_t bestPosn = -1;
+ int32_t bestValue = -1;
// loops while 'n' points to an exception.
utext_setNativeIndex(fText.getAlias(), n); // from n..
- fBackwardsTrie->reset();
+ fData->fBackwardsTrie->reset();
UChar32 uch;
+
//if(debug2) u_printf(" n@ %d\n", n);
// Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here??
uch = utext_next32(fText.getAlias());
//if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
}
- UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
- int32_t bestPosn = -1;
- int32_t bestValue = -1;
+ UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and..
- USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
+ USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
bestPosn = utext_getNativeIndex(fText.getAlias());
- bestValue = fBackwardsTrie->getValue();
+ bestValue = fData->fBackwardsTrie->getValue();
}
//if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
}
if(USTRINGTRIE_MATCHES(r)) { // exact match?
//if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
- bestValue = fBackwardsTrie->getValue();
+ bestValue = fData->fBackwardsTrie->getValue();
bestPosn = utext_getNativeIndex(fText.getAlias());
//if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
}
//int32_t bestValue = fBackwardsTrie->getValue();
////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
+ if(bestPosn>0) {
+ UChar32 prevch = utext_char32At(fText.getAlias(), bestPosn-1); // char before the best match
+ if (prevch != U_SENTINEL && u_isUAlphabetic(prevch)) {
+ // The match is preceded by other alphabetic characters, => invalid
+ return kNoExceptionHere;
+ }
+ }
+
if(bestValue == kMATCH) { // exact match!
//if(debug2) u_printf(" exact backward match\n");
- n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
- if(n==UBRK_DONE || n==utextLen) return n;
- continue; // See if the next is another exception.
+ return kExceptionHere; // See if the next is another exception.
} else if(bestValue == kPARTIAL
- && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
+ && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
//if(debug2) u_printf(" partial backward match\n");
// We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
// to see if it matches something going forward.
- fForwardsPartialTrie->reset();
+ fData->fForwardsPartialTrie->reset();
UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
//if(debug2) u_printf("Retrying at %d\n", bestPosn);
while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
- USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) {
+ USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
//if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
}
if(USTRINGTRIE_MATCHES(rfwd)) {
//if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
// only full matches here, nothing to check
// skip the next:
- n = fDelegate->next();
- if(n==UBRK_DONE || n==utextLen) return n;
- continue;
+ return kExceptionHere;
} else {
//if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
// no match (no exception) -return the 'underlying' break
- return n;
+ return kNoExceptionHere;
}
} else {
- return n; // internal error and/or no forwards trie
+ return kNoExceptionHere; // internal error and/or no forwards trie
}
} else {
//if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match
- return n; // No match - so exit. Not an exception.
+ return kNoExceptionHere; // No match - so exit. Not an exception.
}
- } while(n != UBRK_DONE);
- return n;
}
-int32_t SimpleFilteredSentenceBreakIterator::previous() {
- int32_t n = fDelegate->previous();
- return previousCore(n);
-}
+// the workhorse single next.
+int32_t
+SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
+ if(n == UBRK_DONE || // at end or
+ fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
+ return n;
+ }
+ // OK, do we need to break here?
+ UErrorCode status = U_ZERO_ERROR;
+ // refresh text
+ resetState(status);
+ if(U_FAILURE(status)) return UBRK_DONE; // bail out
+ int64_t utextLen = utext_nativeLength(fText.getAlias());
+
+ //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
+ while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
+ SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
+
+ switch(m) {
+ case kExceptionHere:
+ n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
+ continue;
-int32_t SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
- int32_t n = fDelegate->preceding(offset);
- return previousCore(n);
+ default:
+ case kNoExceptionHere:
+ return n;
+ }
+ }
+ return n;
}
-int32_t SimpleFilteredSentenceBreakIterator::previousCore(int32_t n) {
- if(n == UBRK_DONE || n == 0 || // at end or
- fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
- return n;
+int32_t
+SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
+ if(n == 0 || n == UBRK_DONE || // at end or
+ fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
+ return n;
}
// OK, do we need to break here?
UErrorCode status = U_ZERO_ERROR;
// refresh text
- fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
- do { // outer loop runs once per underlying break (from fDelegate).
- // loops while 'n' points to an exception.
- utext_setNativeIndex(fText.getAlias(), n); // from n..
- fBackwardsTrie->reset();
- UChar32 uch;
- // Skip over any space preceding the break
- if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here??
- // TODO only do this the 1st time?
- } else {
- //restore what we skipped
- uch = utext_next32(fText.getAlias());
- }
- UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
+ resetState(status);
+ if(U_FAILURE(status)) return UBRK_DONE; // bail out
- int32_t bestPosn = -1;
- int32_t bestValue = -1;
+ //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
+ while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
+ SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
- while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and..
- USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
- if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
- bestPosn = utext_getNativeIndex(fText.getAlias());
- bestValue = fBackwardsTrie->getValue();
- }
- }
+ switch(m) {
+ case kExceptionHere:
+ n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
+ continue;
- if(USTRINGTRIE_MATCHES(r)) { // exact match?
- bestValue = fBackwardsTrie->getValue();
- bestPosn = utext_getNativeIndex(fText.getAlias());
+ default:
+ case kNoExceptionHere:
+ return n;
}
+ }
+ return n;
+}
- if(bestPosn>=0) {
- if(bestValue == kMATCH) { // exact match!
- n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
- if(n==UBRK_DONE || n==0) return n;
- continue; // See if the next is another exception.
- } else if(bestValue == kPARTIAL
- && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
- fForwardsPartialTrie->reset();
- UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
- utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
- while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
- USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) {
- }
- if(USTRINGTRIE_MATCHES(rfwd)) {
- n = fDelegate->previous();
- if(n==UBRK_DONE || n==0) return n;
- continue;
- } else {
- // no match (no exception) -return the 'underlying' break
- return n;
- }
- } else {
- return n; // internal error and/or no forwards trie
- }
- } else {
- return n; // No match - so exit. Not an exception.
- }
- } while(n != UBRK_DONE);
- return n;
+int32_t
+SimpleFilteredSentenceBreakIterator::next() {
+ return internalNext(fDelegate->next());
+}
+
+int32_t
+SimpleFilteredSentenceBreakIterator::first(void) {
+ return internalNext(fDelegate->first());
}
+int32_t
+SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
+ return internalPrev(fDelegate->preceding(offset));
+}
+
+int32_t
+SimpleFilteredSentenceBreakIterator::previous(void) {
+ return internalPrev(fDelegate->previous());
+}
+
+UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
+ if(!fDelegate->isBoundary(offset)) return false; // no break to suppress
+
+ UErrorCode status = U_ZERO_ERROR;
+ resetState(status);
+
+ SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
+
+ switch(m) {
+ case kExceptionHere:
+ return false;
+ default:
+ case kNoExceptionHere:
+ return true;
+ }
+}
+
+int32_t
+SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
+ return internalNext(fDelegate->next(offset));
+}
+
+int32_t
+SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
+ return internalNext(fDelegate->following(offset));
+}
+
+int32_t
+SimpleFilteredSentenceBreakIterator::last(void) {
+ // Don't suppress a break opportunity at the end of text.
+ return fDelegate->last();
+}
+
+
/**
* Concrete implementation of builder class.
*/
{
}
-SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
+SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
: fSet(status)
{
}
int32_t subCount = fSet.size();
UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
-
+
LocalArray<UnicodeString> ustrs(ustrs_ptr);
LocalMemory<int> partials;
FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
//UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
if(partials[j]==0) { // hasn't been processed yet
- partials[j] = kSuppressInReverse | kAddToForward;
+ partials[j] = (ustrs[j].length() == nn+1)? (kSuppressInReverse | kAddToForward): kAddToForward;
FB_TRACE("suppressing",&ustrs[j],FALSE,j);
} else if(partials[j] & kSuppressInReverse) {
sameAs = j; // the other entry is already in the reverse table.
revCount++;
FB_TRACE("Added partial",&prefix,FALSE, i);
FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
- partials[i] = kSuppressInReverse | kAddToForward;
+ partials[i] = kAddToForward;
} else {
FB_TRACE("NOT adding partial",&prefix,FALSE, i);
FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
}
}
for(int i=0;i<subCount;i++) {
- if(partials[i]==0) {
+ if((partials[i] & kSuppressInReverse) == 0) {
ustrs[i].reverse();
builder->add(ustrs[i], kMATCH, status);
revCount++;
FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
- } else {
+ }
+ if((partials[i] & kAddToForward) != 0) {
FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
// an optimization would be to only add the portion after the '.'