1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2014-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
10 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
15 #include "unicode/filteredbrk.h"
16 #include "unicode/ucharstriebuilder.h"
17 #include "unicode/ures.h"
19 #include "uresimp.h" // ures_getByKeyWithFallback
20 #include "ubrkimpl.h" // U_ICUDATA_BRKITR
32 static void _fb_trace(const char *m
, const UnicodeString
*s
, UBool b
, int32_t d
, const char *f
, int l
) {
35 s
->extract(0,s
->length(),buf
,2048);
39 fprintf(stderr
,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
40 f
, l
, m
, buf
, (const void*)s
, b
?'T':'F',(int)d
);
43 #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
45 #define FB_TRACE(m,s,b,d)
49 * Used with sortedInsert()
51 static int8_t U_CALLCONV
compareUnicodeString(UElement t1
, UElement t2
) {
52 const UnicodeString
&a
= *(const UnicodeString
*)t1
.pointer
;
53 const UnicodeString
&b
= *(const UnicodeString
*)t2
.pointer
;
58 * A UVector which implements a set of strings.
60 class U_COMMON_API UStringSet
: public UVector
{
62 UStringSet(UErrorCode
&status
) : UVector(uprv_deleteUObject
,
63 uhash_compareUnicodeString
,
66 virtual ~UStringSet();
 * Is this UnicodeString contained in the set?
70 inline UBool
contains(const UnicodeString
& s
) {
71 return contains((void*) &s
);
73 using UVector::contains
;
75 * Return the ith UnicodeString alias
77 inline const UnicodeString
* getStringAt(int32_t i
) const {
78 return (const UnicodeString
*)elementAt(i
);
81 * Adopt the UnicodeString if not already contained.
82 * Caller no longer owns the pointer in any case.
83 * @return true if adopted successfully, false otherwise (error, or else duplicate)
85 inline UBool
adopt(UnicodeString
*str
, UErrorCode
&status
) {
86 if(U_FAILURE(status
) || contains(*str
)) {
90 sortedInsert(str
, compareUnicodeString
, status
);
91 if(U_FAILURE(status
)) {
100 * @return true if successfully adopted.
102 inline UBool
add(const UnicodeString
& str
, UErrorCode
&status
) {
103 if(U_FAILURE(status
)) return false;
104 UnicodeString
*t
= new UnicodeString(str
);
106 status
= U_MEMORY_ALLOCATION_ERROR
; return false;
108 return adopt(t
, status
);
111 * Remove this string.
112 * @return true if successfully removed, false otherwise (error, or else it wasn't there)
114 inline UBool
remove(const UnicodeString
&s
, UErrorCode
&status
) {
115 if(U_FAILURE(status
)) return false;
116 return removeElement((void*) &s
);
121 * Virtual, won't be inlined
123 UStringSet::~UStringSet() {}
125 /* ----------------------------------------------------------- */
128 /* Filtered Break constants */
/*
 * Values attached to strings in the tries and compared against the best
 * match found by breakExceptionAt().
 */
static const int32_t kPARTIAL = 0x1; // bit 0: partial - need to run through forward trie
static const int32_t kMATCH   = 0x2; // bit 1: exact match - skip this one.

/*
 * Per-string flags kept in the partials[] array while build() runs.
 */
static const int32_t kSuppressInReverse = 0x1; // bit 0: string is not added to the backwards builder
static const int32_t kAddToForward      = 0x2; // bit 1: string is added to the forwards builder
133 static const UChar kFULLSTOP
= 0x002E; // '.'
136 * Shared data for SimpleFilteredSentenceBreakIterator
138 class SimpleFilteredSentenceBreakData
: public UMemory
{
140 SimpleFilteredSentenceBreakData(UCharsTrie
*forwards
, UCharsTrie
*backwards
)
141 : fForwardsPartialTrie(forwards
), fBackwardsTrie(backwards
), refcount(1) { }
142 SimpleFilteredSentenceBreakData
*incr() { refcount
++; return this; }
143 SimpleFilteredSentenceBreakData
*decr() { if((--refcount
) <= 0) delete this; return 0; }
144 virtual ~SimpleFilteredSentenceBreakData();
146 LocalPointer
<UCharsTrie
> fForwardsPartialTrie
; // Has ".a" for "a.M."
147 LocalPointer
<UCharsTrie
> fBackwardsTrie
; // i.e. ".srM" for Mrs.
151 SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
154 * Concrete implementation
156 class SimpleFilteredSentenceBreakIterator
: public BreakIterator
{
158 SimpleFilteredSentenceBreakIterator(BreakIterator
*adopt
, UCharsTrie
*forwards
, UCharsTrie
*backwards
, UErrorCode
&status
);
159 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator
& other
);
160 virtual ~SimpleFilteredSentenceBreakIterator();
162 SimpleFilteredSentenceBreakData
*fData
;
163 LocalPointer
<BreakIterator
> fDelegate
;
164 LocalUTextPointer fText
;
166 /* -- subclass interface -- */
168 /* -- cloning and other subclass stuff -- */
169 virtual BreakIterator
* createBufferClone(void * /*stackBuffer*/,
170 int32_t &/*BufferSize*/,
171 UErrorCode
&status
) {
172 // for now - always deep clone
173 status
= U_SAFECLONE_ALLOCATED_WARNING
;
176 virtual BreakIterator
* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); }
177 virtual UClassID
getDynamicClassID(void) const { return NULL
; }
178 virtual UBool
operator==(const BreakIterator
& o
) const { if(this==&o
) return true; return false; }
180 /* -- text modifying -- */
181 virtual void setText(UText
*text
, UErrorCode
&status
) { fDelegate
->setText(text
,status
); }
182 virtual BreakIterator
&refreshInputText(UText
*input
, UErrorCode
&status
) { fDelegate
->refreshInputText(input
,status
); return *this; }
183 virtual void adoptText(CharacterIterator
* it
) { fDelegate
->adoptText(it
); }
184 virtual void setText(const UnicodeString
&text
) { fDelegate
->setText(text
); }
186 /* -- other functions that are just delegated -- */
187 virtual UText
*getUText(UText
*fillIn
, UErrorCode
&status
) const { return fDelegate
->getUText(fillIn
,status
); }
188 virtual CharacterIterator
& getText(void) const { return fDelegate
->getText(); }
190 /* -- ITERATION -- */
191 virtual int32_t first(void);
192 virtual int32_t preceding(int32_t offset
);
193 virtual int32_t previous(void);
194 virtual UBool
isBoundary(int32_t offset
);
195 virtual int32_t current(void) const { return fDelegate
->current(); } // we keep the delegate current, so this should be correct.
197 virtual int32_t next(void);
199 virtual int32_t next(int32_t n
);
200 virtual int32_t following(int32_t offset
);
201 virtual int32_t last(void);
205 * Given that the fDelegate has already given its "initial" answer,
206 * find the NEXT actual (non-excepted) break.
207 * @param n initial position from delegate
208 * @return new break position or UBRK_DONE
210 int32_t internalNext(int32_t n
);
212 * Given that the fDelegate has already given its "initial" answer,
213 * find the PREV actual (non-excepted) break.
214 * @param n initial position from delegate
215 * @return new break position or UBRK_DONE
217 int32_t internalPrev(int32_t n
);
219 * set up the UText with the value of the fDelegate.
220 * Call this before calling breakExceptionAt.
221 * May be able to avoid excess calls
223 void resetState(UErrorCode
&status
);
225 * Is there a match (exception) at this spot?
227 enum EFBMatchResult
{ kNoExceptionHere
, kExceptionHere
};
229 * Determine if there is an exception at this spot
230 * @param n spot to check
231 * @return kNoExceptionHere or kExceptionHere
233 enum EFBMatchResult
breakExceptionAt(int32_t n
);
236 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator
& other
)
237 : BreakIterator(other
), fData(other
.fData
->incr()), fDelegate(other
.fDelegate
->clone())
242 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator
*adopt
, UCharsTrie
*forwards
, UCharsTrie
*backwards
, UErrorCode
&status
) :
243 BreakIterator(adopt
->getLocale(ULOC_VALID_LOCALE
,status
),adopt
->getLocale(ULOC_ACTUAL_LOCALE
,status
)),
244 fData(new SimpleFilteredSentenceBreakData(forwards
, backwards
)),
250 SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
251 fData
= fData
->decr();
254 void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode
&status
) {
255 fText
.adoptInstead(fDelegate
->getUText(fText
.orphan(), status
));
258 SimpleFilteredSentenceBreakIterator::EFBMatchResult
259 SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n
) {
260 int64_t bestPosn
= -1;
261 int32_t bestValue
= -1;
262 // loops while 'n' points to an exception.
263 utext_setNativeIndex(fText
.getAlias(), n
); // from n..
264 fData
->fBackwardsTrie
->reset();
267 //if(debug2) u_printf(" n@ %d\n", n);
268 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
269 if((uch
=utext_previous32(fText
.getAlias()))==(UChar32
)0x0020) { // TODO: skip a class of chars here??
270 // TODO only do this the 1st time?
271 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
273 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
274 uch
= utext_next32(fText
.getAlias());
275 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
278 UStringTrieResult r
= USTRINGTRIE_INTERMEDIATE_VALUE
;
280 while((uch
=utext_previous32(fText
.getAlias()))!=U_SENTINEL
&& // more to consume backwards and..
281 USTRINGTRIE_HAS_NEXT(r
=fData
->fBackwardsTrie
->nextForCodePoint(uch
))) {// more in the trie
282 if(USTRINGTRIE_HAS_VALUE(r
)) { // remember the best match so far
283 bestPosn
= utext_getNativeIndex(fText
.getAlias());
284 bestValue
= fData
->fBackwardsTrie
->getValue();
286 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
289 if(USTRINGTRIE_MATCHES(r
)) { // exact match?
290 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
291 bestValue
= fData
->fBackwardsTrie
->getValue();
292 bestPosn
= utext_getNativeIndex(fText
.getAlias());
293 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
297 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
299 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
300 //int32_t bestValue = fBackwardsTrie->getValue();
301 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
304 UChar32 prevch
= utext_char32At(fText
.getAlias(), bestPosn
-1); // char before the best match
305 if (prevch
!= U_SENTINEL
&& u_isUAlphabetic(prevch
)) {
306 // The match is preceded by other alphabetic characters, => invalid
307 return kNoExceptionHere
;
311 if(bestValue
== kMATCH
) { // exact match!
312 //if(debug2) u_printf(" exact backward match\n");
313 return kExceptionHere
; // See if the next is another exception.
314 } else if(bestValue
== kPARTIAL
315 && fData
->fForwardsPartialTrie
.isValid()) { // make sure there's a forward trie
316 //if(debug2) u_printf(" partial backward match\n");
317 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
318 // to see if it matches something going forward.
319 fData
->fForwardsPartialTrie
->reset();
320 UStringTrieResult rfwd
= USTRINGTRIE_INTERMEDIATE_VALUE
;
321 utext_setNativeIndex(fText
.getAlias(), bestPosn
); // hope that's close ..
322 //if(debug2) u_printf("Retrying at %d\n", bestPosn);
323 while((uch
=utext_next32(fText
.getAlias()))!=U_SENTINEL
&&
324 USTRINGTRIE_HAS_NEXT(rfwd
=fData
->fForwardsPartialTrie
->nextForCodePoint(uch
))) {
325 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
327 if(USTRINGTRIE_MATCHES(rfwd
)) {
328 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
329 // only full matches here, nothing to check
331 return kExceptionHere
;
333 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
334 // no match (no exception) -return the 'underlying' break
335 return kNoExceptionHere
;
338 return kNoExceptionHere
; // internal error and/or no forwards trie
341 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match
342 return kNoExceptionHere
; // No match - so exit. Not an exception.
346 // the workhorse single next.
348 SimpleFilteredSentenceBreakIterator::internalNext(int32_t n
) {
349 if(n
== UBRK_DONE
|| // at end or
350 fData
->fBackwardsTrie
.isNull()) { // .. no backwards table loaded == no exceptions
353 // OK, do we need to break here?
354 UErrorCode status
= U_ZERO_ERROR
;
357 if(U_FAILURE(status
)) return UBRK_DONE
; // bail out
358 int64_t utextLen
= utext_nativeLength(fText
.getAlias());
360 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
361 while (n
!= UBRK_DONE
&& n
!= utextLen
) { // outer loop runs once per underlying break (from fDelegate).
362 SimpleFilteredSentenceBreakIterator::EFBMatchResult m
= breakExceptionAt(n
);
366 n
= fDelegate
->next(); // skip this one. Find the next lowerlevel break.
370 case kNoExceptionHere
:
378 SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n
) {
379 if(n
== 0 || n
== UBRK_DONE
|| // at end or
380 fData
->fBackwardsTrie
.isNull()) { // .. no backwards table loaded == no exceptions
383 // OK, do we need to break here?
384 UErrorCode status
= U_ZERO_ERROR
;
387 if(U_FAILURE(status
)) return UBRK_DONE
; // bail out
389 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
390 while (n
!= UBRK_DONE
&& n
!= 0) { // outer loop runs once per underlying break (from fDelegate).
391 SimpleFilteredSentenceBreakIterator::EFBMatchResult m
= breakExceptionAt(n
);
395 n
= fDelegate
->previous(); // skip this one. Find the next lowerlevel break.
399 case kNoExceptionHere
:
408 SimpleFilteredSentenceBreakIterator::next() {
409 return internalNext(fDelegate
->next());
413 SimpleFilteredSentenceBreakIterator::first(void) {
414 // Don't suppress a break opportunity at the beginning of text.
415 return fDelegate
->first();
419 SimpleFilteredSentenceBreakIterator::preceding(int32_t offset
) {
420 return internalPrev(fDelegate
->preceding(offset
));
424 SimpleFilteredSentenceBreakIterator::previous(void) {
425 return internalPrev(fDelegate
->previous());
428 UBool
SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset
) {
429 if (!fDelegate
->isBoundary(offset
)) return false; // no break to suppress
431 if (fData
->fBackwardsTrie
.isNull()) return true; // no data = no suppressions
433 UErrorCode status
= U_ZERO_ERROR
;
436 SimpleFilteredSentenceBreakIterator::EFBMatchResult m
= breakExceptionAt(offset
);
442 case kNoExceptionHere
:
448 SimpleFilteredSentenceBreakIterator::next(int32_t offset
) {
449 return internalNext(fDelegate
->next(offset
));
453 SimpleFilteredSentenceBreakIterator::following(int32_t offset
) {
454 return internalNext(fDelegate
->following(offset
));
458 SimpleFilteredSentenceBreakIterator::last(void) {
459 // Don't suppress a break opportunity at the end of text.
460 return fDelegate
->last();
465 * Concrete implementation of builder class.
467 class U_COMMON_API SimpleFilteredBreakIteratorBuilder
: public FilteredBreakIteratorBuilder
{
469 virtual ~SimpleFilteredBreakIteratorBuilder();
470 SimpleFilteredBreakIteratorBuilder(const Locale
&fromLocale
, UErrorCode
&status
);
471 SimpleFilteredBreakIteratorBuilder(UErrorCode
&status
);
472 virtual UBool
suppressBreakAfter(const UnicodeString
& exception
, UErrorCode
& status
);
473 virtual UBool
unsuppressBreakAfter(const UnicodeString
& exception
, UErrorCode
& status
);
474 virtual BreakIterator
*build(BreakIterator
* adoptBreakIterator
, UErrorCode
& status
);
479 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
483 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode
&status
)
488 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale
&fromLocale
, UErrorCode
&status
)
491 if(U_SUCCESS(status
)) {
492 UErrorCode subStatus
= U_ZERO_ERROR
;
493 LocalUResourceBundlePointer
b(ures_open(U_ICUDATA_BRKITR
, fromLocale
.getBaseName(), &subStatus
));
494 if (U_FAILURE(subStatus
) || (subStatus
== U_USING_DEFAULT_WARNING
) ) {
495 status
= subStatus
; // copy the failing status
497 fprintf(stderr
, "open BUNDLE %s : %s, %s\n", fromLocale
.getBaseName(), "[exit]", u_errorName(status
));
499 return; // leaves the builder empty, if you try to use it.
501 LocalUResourceBundlePointer
exceptions(ures_getByKeyWithFallback(b
.getAlias(), "exceptions", NULL
, &subStatus
));
502 if (U_FAILURE(subStatus
) || (subStatus
== U_USING_DEFAULT_WARNING
) ) {
503 status
= subStatus
; // copy the failing status
505 fprintf(stderr
, "open EXCEPTIONS %s : %s, %s\n", fromLocale
.getBaseName(), "[exit]", u_errorName(status
));
507 return; // leaves the builder empty, if you try to use it.
509 LocalUResourceBundlePointer
breaks(ures_getByKeyWithFallback(exceptions
.getAlias(), "SentenceBreak", NULL
, &subStatus
));
513 UErrorCode subsub
= subStatus
;
514 fprintf(stderr
, "open SentenceBreak %s => %s, %s\n", fromLocale
.getBaseName(), ures_getLocale(breaks
.getAlias(), &subsub
), u_errorName(subStatus
));
518 if (U_FAILURE(subStatus
) || (subStatus
== U_USING_DEFAULT_WARNING
) ) {
519 status
= subStatus
; // copy the failing status
521 fprintf(stderr
, "open %s : %s, %s\n", fromLocale
.getBaseName(), "[exit]", u_errorName(status
));
523 return; // leaves the builder empty, if you try to use it.
526 LocalUResourceBundlePointer strs
;
527 subStatus
= status
; // Pick up inherited warning status now
529 strs
.adoptInstead(ures_getNextResource(breaks
.getAlias(), strs
.orphan(), &subStatus
));
530 if(strs
.isValid() && U_SUCCESS(subStatus
)) {
531 UnicodeString
str(ures_getUnicodeString(strs
.getAlias(), &status
));
532 suppressBreakAfter(str
, status
); // load the string
534 } while (strs
.isValid() && U_SUCCESS(subStatus
));
535 if(U_FAILURE(subStatus
)&&subStatus
!=U_INDEX_OUTOFBOUNDS_ERROR
&&U_SUCCESS(status
)) {
542 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString
& exception
, UErrorCode
& status
)
544 UBool r
= fSet
.add(exception
, status
);
545 FB_TRACE("suppressBreakAfter",&exception
,r
,0);
550 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString
& exception
, UErrorCode
& status
)
552 UBool r
= fSet
.remove(exception
, status
);
553 FB_TRACE("unsuppressBreakAfter",&exception
,r
,0);
558 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
561 * Note: "new UnicodeString[subCount]" ends up calling global operator new
562 * on MSVC2012 for some reason.
564 static inline UnicodeString
* newUnicodeStringArray(size_t count
) {
565 return new UnicodeString
[count
? count
: 1];
569 SimpleFilteredBreakIteratorBuilder::build(BreakIterator
* adoptBreakIterator
, UErrorCode
& status
) {
570 LocalPointer
<BreakIterator
> adopt(adoptBreakIterator
);
572 LocalPointer
<UCharsTrieBuilder
> builder(new UCharsTrieBuilder(status
), status
);
573 LocalPointer
<UCharsTrieBuilder
> builder2(new UCharsTrieBuilder(status
), status
);
574 if(U_FAILURE(status
)) {
578 int32_t revCount
= 0;
579 int32_t fwdCount
= 0;
581 int32_t subCount
= fSet
.size();
583 UnicodeString
*ustrs_ptr
= newUnicodeStringArray(subCount
);
585 LocalArray
<UnicodeString
> ustrs(ustrs_ptr
);
587 LocalMemory
<int> partials
;
588 partials
.allocateInsteadAndReset(subCount
);
590 LocalPointer
<UCharsTrie
> backwardsTrie
; // i.e. ".srM" for Mrs.
591 LocalPointer
<UCharsTrie
> forwardsPartialTrie
; // Has ".a" for "a.M."
597 const UnicodeString
*abbr
= fSet
.getStringAt(i
);
599 FB_TRACE("build",abbr
,TRUE
,i
);
600 ustrs
[n
] = *abbr
; // copy by value
601 FB_TRACE("ustrs[n]",&ustrs
[n
],TRUE
,i
);
603 FB_TRACE("build",abbr
,FALSE
,i
);
604 status
= U_MEMORY_ALLOCATION_ERROR
;
607 partials
[n
] = 0; // default: not partial
610 // first pass - find partials.
611 for(int i
=0;i
<subCount
;i
++) {
612 int nn
= ustrs
[i
].indexOf(kFULLSTOP
); // TODO: non-'.' abbreviations
613 if(nn
>-1 && (nn
+1)!=ustrs
[i
].length()) {
614 FB_TRACE("partial",&ustrs
[i
],FALSE
,i
);
618 for(int j
=0;j
<subCount
;j
++) {
620 if(ustrs
[i
].compare(0,nn
+1,ustrs
[j
],0,nn
+1)==0) {
621 FB_TRACE("prefix",&ustrs
[j
],FALSE
,nn
+1);
622 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
623 if(partials
[j
]==0) { // hasn't been processed yet
624 partials
[j
] = (ustrs
[j
].length() == nn
+1)? (kSuppressInReverse
| kAddToForward
): kAddToForward
;
625 FB_TRACE("suppressing",&ustrs
[j
],FALSE
,j
);
626 } else if(partials
[j
] & kSuppressInReverse
) {
627 sameAs
= j
; // the other entry is already in the reverse table.
631 FB_TRACE("for partial same-",&ustrs
[i
],FALSE
,sameAs
);
632 FB_TRACE(" == partial #",&ustrs
[i
],FALSE
,partials
[i
]);
633 UnicodeString
prefix(ustrs
[i
], 0, nn
+1);
634 if(sameAs
== -1 && partials
[i
] == 0) {
635 // first one - add the prefix to the reverse table.
637 builder
->add(prefix
, kPARTIAL
, status
);
639 FB_TRACE("Added partial",&prefix
,FALSE
, i
);
640 FB_TRACE(u_errorName(status
),&ustrs
[i
],FALSE
,i
);
641 partials
[i
] = kAddToForward
;
643 FB_TRACE("NOT adding partial",&prefix
,FALSE
, i
);
644 FB_TRACE(u_errorName(status
),&ustrs
[i
],FALSE
,i
);
648 for(int i
=0;i
<subCount
;i
++) {
649 if((partials
[i
] & kSuppressInReverse
) == 0) {
651 builder
->add(ustrs
[i
], kMATCH
, status
);
653 FB_TRACE(u_errorName(status
), &ustrs
[i
], FALSE
, i
);
655 if((partials
[i
] & kAddToForward
) != 0) {
656 FB_TRACE("Adding fwd",&ustrs
[i
], FALSE
, i
);
658 // an optimization would be to only add the portion after the '.'
659 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
660 // instead of "Ph.D." since we already know the "Ph." part is a match.
661 // would need the trie to be able to hold 0-length strings, though.
662 builder2
->add(ustrs
[i
], kMATCH
, status
); // forward
664 //ustrs[i].reverse();
665 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
668 FB_TRACE("AbbrCount",NULL
,FALSE
, subCount
);
671 backwardsTrie
.adoptInstead(builder
->build(USTRINGTRIE_BUILD_FAST
, status
));
672 if(U_FAILURE(status
)) {
673 FB_TRACE(u_errorName(status
),NULL
,FALSE
, -1);
679 forwardsPartialTrie
.adoptInstead(builder2
->build(USTRINGTRIE_BUILD_FAST
, status
));
680 if(U_FAILURE(status
)) {
681 FB_TRACE(u_errorName(status
),NULL
,FALSE
, -1);
686 return new SimpleFilteredSentenceBreakIterator(adopt
.orphan(), forwardsPartialTrie
.orphan(), backwardsTrie
.orphan(), status
);
690 // ----------- Base class implementation
692 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
695 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
698 FilteredBreakIteratorBuilder
*
699 FilteredBreakIteratorBuilder::createInstance(const Locale
& where
, UErrorCode
& status
) {
700 if(U_FAILURE(status
)) return NULL
;
701 LocalPointer
<FilteredBreakIteratorBuilder
> ret(new SimpleFilteredBreakIteratorBuilder(where
, status
), status
);
702 return (U_SUCCESS(status
))? ret
.orphan(): NULL
;
705 FilteredBreakIteratorBuilder
*
706 FilteredBreakIteratorBuilder::createInstance(UErrorCode
&status
) {
707 return createEmptyInstance(status
);
710 FilteredBreakIteratorBuilder
*
711 FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode
& status
) {
712 if(U_FAILURE(status
)) return NULL
;
713 LocalPointer
<FilteredBreakIteratorBuilder
> ret(new SimpleFilteredBreakIteratorBuilder(status
), status
);
714 return (U_SUCCESS(status
))? ret
.orphan(): NULL
;
719 #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION