2 *******************************************************************************
3 * Copyright (C) 2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
8 #include "unicode/filteredbrk.h"
10 #if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION
12 #include <unicode/ucharstriebuilder.h>
24 static const int32_t kPARTIAL
= (1<<0); //< partial - need to run through forward trie
25 static const int32_t kMATCH
= (1<<1); //< exact match - skip this one.
26 static const int32_t kSuppressInReverse
= (1<<0);
27 static const int32_t kAddToForward
= (1<<1);
28 static const UChar kFULLSTOP
= 0x002E; // '.'
30 class ULISentenceBreakIterator
: public BreakIterator
{
32 ULISentenceBreakIterator(BreakIterator
*adopt
, UCharsTrie
*forwards
, UCharsTrie
*backwards
, UErrorCode
&status
);
33 virtual ~ULISentenceBreakIterator() {}
34 ULISentenceBreakIterator(const ULISentenceBreakIterator
& other
);
36 LocalPointer
<BreakIterator
> fDelegate
;
37 LocalUTextPointer fText
;
38 LocalPointer
<UCharsTrie
> fBackwardsTrie
; // i.e. ".srM" for Mrs.
39 LocalPointer
<UCharsTrie
> fForwardsPartialTrie
; // Has ".a" for "a.M."
41 /* -- subclass interface -- */
43 /* -- cloning and other subclass stuff -- */
44 virtual BreakIterator
* createBufferClone(void * /*stackBuffer*/,
45 int32_t &/*BufferSize*/,
47 // for now - always deep clone
48 status
= U_SAFECLONE_ALLOCATED_WARNING
;
51 virtual BreakIterator
* clone(void) const { return new ULISentenceBreakIterator(*this); }
52 virtual UClassID
getDynamicClassID(void) const { return NULL
; }
53 virtual UBool
operator==(const BreakIterator
& o
) const { if(*this==o
) return true; return false; }
55 /* -- text modifying -- */
56 virtual void setText(UText
*text
, UErrorCode
&status
) { fDelegate
->setText(text
,status
); }
57 virtual BreakIterator
&refreshInputText(UText
*input
, UErrorCode
&status
) { fDelegate
->refreshInputText(input
,status
); return *this; }
58 virtual void adoptText(CharacterIterator
* it
) { fDelegate
->adoptText(it
); }
59 virtual void setText(const UnicodeString
&text
) { fDelegate
->setText(text
); }
61 /* -- other functions that are just delegated -- */
62 virtual UText
*getUText(UText
*fillIn
, UErrorCode
&status
) const { return fDelegate
->getUText(fillIn
,status
); }
63 virtual CharacterIterator
& getText(void) const { return fDelegate
->getText(); }
66 virtual int32_t first(void) { return fDelegate
->first(); }
67 virtual int32_t preceding(int32_t offset
) { return fDelegate
->preceding(offset
); }
68 virtual int32_t previous(void) { return fDelegate
->previous(); }
69 virtual UBool
isBoundary(int32_t offset
) { return fDelegate
->isBoundary(offset
); }
70 virtual int32_t current(void) const { return fDelegate
->current(); }
72 virtual int32_t next(void);
74 virtual int32_t next(int32_t n
) { return fDelegate
->next(n
); }
75 virtual int32_t following(int32_t offset
) { return fDelegate
->following(offset
); }
76 virtual int32_t last(void) { return fDelegate
->last(); }
80 ULISentenceBreakIterator::ULISentenceBreakIterator(const ULISentenceBreakIterator
& other
)
81 : BreakIterator(other
), fDelegate(other
.fDelegate
->clone())
84 TODO: not able to clone Tries. Should be a refcounted hidden master instead.
85 if(other.fBackwardsTrie.isValid()) {
86 fBackwardsTrie.adoptInstead(other.fBackwardsTrie->clone());
88 if(other.fForwardsPartialTrie.isValid()) {
89 fForwardsPartialTrie.adoptInstead(other.fForwardsPartialTrie->clone());
95 ULISentenceBreakIterator::ULISentenceBreakIterator(BreakIterator
*adopt
, UCharsTrie
*forwards
, UCharsTrie
*backwards
, UErrorCode
&status
) :
96 BreakIterator(adopt
->getLocale(ULOC_VALID_LOCALE
,status
),adopt
->getLocale(ULOC_ACTUAL_LOCALE
,status
)),
98 fBackwardsTrie(backwards
),
99 fForwardsPartialTrie(forwards
)
104 int32_t ULISentenceBreakIterator::next() {
105 int32_t n
= fDelegate
->next();
106 if(n
== UBRK_DONE
|| // at end or
107 fBackwardsTrie
.isNull()) { // .. no backwards table loaded == no exceptions
110 // OK, do we need to break here?
111 UErrorCode status
= U_ZERO_ERROR
;
113 fText
.adoptInstead(fDelegate
->getUText(fText
.orphan(), status
));
114 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
115 do { // outer loop runs once per underlying break (from fDelegate).
116 // loops while 'n' points to an exception.
117 utext_setNativeIndex(fText
.getAlias(), n
); // from n..
118 fBackwardsTrie
->reset();
120 //if(debug2) u_printf(" n@ %d\n", n);
121 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
122 if((uch
=utext_previous32(fText
.getAlias()))==(UChar32
)0x0020) { // TODO: skip a class of chars here??
123 // TODO only do this the 1st time?
124 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
126 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
127 uch
= utext_next32(fText
.getAlias());
128 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
130 UStringTrieResult r
= USTRINGTRIE_INTERMEDIATE_VALUE
;
132 int32_t bestPosn
= -1;
133 int32_t bestValue
= -1;
135 while((uch
=utext_previous32(fText
.getAlias()))!=U_SENTINEL
&& // more to consume backwards and..
136 USTRINGTRIE_HAS_NEXT(r
=fBackwardsTrie
->nextForCodePoint(uch
))) {// more in the trie
137 if(USTRINGTRIE_HAS_VALUE(r
)) { // remember the best match so far
138 bestPosn
= utext_getNativeIndex(fText
.getAlias());
139 bestValue
= fBackwardsTrie
->getValue();
141 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
144 if(USTRINGTRIE_MATCHES(r
)) { // exact match?
145 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
146 bestValue
= fBackwardsTrie
->getValue();
147 bestPosn
= utext_getNativeIndex(fText
.getAlias());
148 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
152 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
154 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
155 //int32_t bestValue = fBackwardsTrie->getValue();
156 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
158 if(bestValue
== kMATCH
) { // exact match!
159 //if(debug2) u_printf(" exact backward match\n");
160 n
= fDelegate
->next(); // skip this one. Find the next lowerlevel break.
161 if(n
==UBRK_DONE
) return n
;
162 continue; // See if the next is another exception.
163 } else if(bestValue
== kPARTIAL
164 && fForwardsPartialTrie
.isValid()) { // make sure there's a forward trie
165 //if(debug2) u_printf(" partial backward match\n");
166 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
167 // to see if it matches something going forward.
168 fForwardsPartialTrie
->reset();
169 UStringTrieResult rfwd
= USTRINGTRIE_INTERMEDIATE_VALUE
;
170 utext_setNativeIndex(fText
.getAlias(), bestPosn
); // hope that's close ..
171 //if(debug2) u_printf("Retrying at %d\n", bestPosn);
172 while((uch
=utext_next32(fText
.getAlias()))!=U_SENTINEL
&&
173 USTRINGTRIE_HAS_NEXT(rfwd
=fForwardsPartialTrie
->nextForCodePoint(uch
))) {
174 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
176 if(USTRINGTRIE_MATCHES(rfwd
)) {
177 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
178 // only full matches here, nothing to check
180 n
= fDelegate
->next();
181 if(n
==UBRK_DONE
) return n
;
184 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
185 // no match (no exception) -return the 'underlying' break
189 return n
; // internal error and/or no forwards trie
192 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match
193 return n
; // No match - so exit. Not an exception.
195 } while(n
!= UBRK_DONE
);
202 // Would improve performance - but, platform issues.
205 template <> struct hash
<icu::UnicodeString
> {
206 size_t operator()( const UnicodeString
& str
) const {
207 return (size_t)str
.hashCode();
215 class SimpleFilteredBreakIteratorBuilder
: public FilteredBreakIteratorBuilder
{
217 virtual ~SimpleFilteredBreakIteratorBuilder();
218 SimpleFilteredBreakIteratorBuilder(const Locale
&fromLocale
, UErrorCode
&status
);
219 SimpleFilteredBreakIteratorBuilder();
220 virtual UBool
suppressBreakAfter(const UnicodeString
& exception
, UErrorCode
& status
);
221 virtual UBool
unsuppressBreakAfter(const UnicodeString
& exception
, UErrorCode
& status
);
222 virtual BreakIterator
*build(BreakIterator
* adoptBreakIterator
, UErrorCode
& status
);
224 set
<UnicodeString
> fSet
;
227 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
231 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale
&fromLocale
, UErrorCode
&status
)
234 if(U_SUCCESS(status
)) {
235 LocalUResourceBundlePointer
b(ures_open(U_ICUDATA_BRKITR
, fromLocale
.getBaseName(), &status
));
236 LocalUResourceBundlePointer
exceptions(ures_getByKeyWithFallback(b
.getAlias(), "exceptions", NULL
, &status
));
237 LocalUResourceBundlePointer
breaks(ures_getByKeyWithFallback(exceptions
.getAlias(), "SentenceBreak", NULL
, &status
));
238 if(U_FAILURE(status
)) return; // leaves the builder empty, if you try to use it.
240 LocalUResourceBundlePointer strs
;
241 UErrorCode subStatus
= status
;
243 strs
.adoptInstead(ures_getNextResource(breaks
.getAlias(), strs
.orphan(), &subStatus
));
244 if(strs
.isValid() && U_SUCCESS(subStatus
)) {
245 UnicodeString
str(ures_getUnicodeString(strs
.getAlias(), &status
));
246 suppressBreakAfter(str
, status
); // load the string
248 } while (strs
.isValid() && U_SUCCESS(subStatus
));
249 if(U_FAILURE(subStatus
)&&subStatus
!=U_INDEX_OUTOFBOUNDS_ERROR
&&U_SUCCESS(status
)) {
255 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder()
261 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString
& exception
, UErrorCode
& status
)
263 if( U_FAILURE(status
) ) return FALSE
;
264 return fSet
.insert(exception
).second
;
268 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString
& exception
, UErrorCode
& status
)
270 if( U_FAILURE(status
) ) return FALSE
;
271 return ((fSet
.erase(exception
)) != 0);
275 SimpleFilteredBreakIteratorBuilder::build(BreakIterator
* adoptBreakIterator
, UErrorCode
& status
) {
276 LocalPointer
<BreakIterator
> adopt(adoptBreakIterator
);
278 if(U_FAILURE(status
)) {
282 LocalPointer
<UCharsTrieBuilder
> builder(new UCharsTrieBuilder(status
));
283 LocalPointer
<UCharsTrieBuilder
> builder2(new UCharsTrieBuilder(status
));
285 int32_t revCount
= 0;
286 int32_t fwdCount
= 0;
288 int32_t subCount
= fSet
.size();
289 LocalArray
<UnicodeString
> ustrs(new UnicodeString
[subCount
]);
290 LocalArray
<int> partials(new int[subCount
]);
292 LocalPointer
<UCharsTrie
> backwardsTrie
; // i.e. ".srM" for Mrs.
293 LocalPointer
<UCharsTrie
> forwardsPartialTrie
; // Has ".a" for "a.M."
296 for ( set
<UnicodeString
>::iterator i
= fSet
.begin();
299 const UnicodeString
&abbr
= *i
;
301 partials
[n
] = 0; // default: not partial
304 // first pass - find partials.
305 for(int i
=0;i
<subCount
;i
++) {
306 int nn
= ustrs
[i
].indexOf(kFULLSTOP
); // TODO: non-'.' abbreviations
307 if(nn
>-1 && (nn
+1)!=ustrs
[i
].length()) {
308 //if(true) u_printf("Is a partial: /%S/\n", ustrs[i].getTerminatedBuffer());
312 for(int j
=0;j
<subCount
;j
++) {
314 if(ustrs
[i
].compare(0,nn
+1,ustrs
[j
],0,nn
+1)==0) {
315 //if(true) u_printf("Prefix match: /%S/ to %d\n", ustrs[j].getTerminatedBuffer(), nn+1);
316 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
317 if(partials
[j
]==0) { // hasn't been processed yet
318 partials
[j
] = kSuppressInReverse
| kAddToForward
;
319 //if(true) u_printf("Suppressing: /%S/\n", ustrs[j].getTerminatedBuffer());
320 } else if(partials
[j
] & kSuppressInReverse
) {
321 sameAs
= j
; // the other entry is already in the reverse table.
325 //if(debug2) u_printf("for partial /%S/ same=%d partials=%d\n", ustrs[i].getTerminatedBuffer(), sameAs, partials[i]);
326 UnicodeString
prefix(ustrs
[i
], 0, nn
+1);
327 if(sameAs
== -1 && partials
[i
] == 0) {
328 // first one - add the prefix to the reverse table.
330 builder
->add(prefix
, kPARTIAL
, status
);
332 //if(debug2) u_printf("Added Partial: /%S/ from /%S/ status=%s\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer(), u_errorName(status));
333 partials
[i
] = kSuppressInReverse
| kAddToForward
;
335 //if(debug2) u_printf(" // not adding partial for /%S/ from /%S/\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer());
339 for(int i
=0;i
<subCount
;i
++) {
342 builder
->add(ustrs
[i
], kMATCH
, status
);
344 //if(debug2) u_printf("Added: /%S/ status=%s\n", ustrs[i].getTerminatedBuffer(), u_errorName(status));
346 //if(debug2) u_printf(" Adding fwd: /%S/\n", ustrs[i].getTerminatedBuffer());
348 // an optimization would be to only add the portion after the '.'
349 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
350 // instead of "Ph.D." since we already know the "Ph." part is a match.
351 // would need the trie to be able to hold 0-length strings, though.
352 builder2
->add(ustrs
[i
], kMATCH
, status
); // forward
354 //ustrs[i].reverse();
355 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
358 //if(debug) u_printf(" %s has %d abbrs.\n", fJSONSource.c_str(), subCount);
361 backwardsTrie
.adoptInstead(builder
->build(USTRINGTRIE_BUILD_FAST
, status
));
362 if(U_FAILURE(status
)) {
363 //printf("Error %s building backwards\n", u_errorName(status));
369 forwardsPartialTrie
.adoptInstead(builder2
->build(USTRINGTRIE_BUILD_FAST
, status
));
370 if(U_FAILURE(status
)) {
371 //printf("Error %s building forwards\n", u_errorName(status));
376 return new ULISentenceBreakIterator(adopt
.orphan(), forwardsPartialTrie
.orphan(), backwardsTrie
.orphan(), status
);
382 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
385 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
388 FilteredBreakIteratorBuilder
*
389 FilteredBreakIteratorBuilder::createInstance(const Locale
& where
, UErrorCode
& status
) {
390 if(U_FAILURE(status
)) return NULL
;
392 LocalPointer
<FilteredBreakIteratorBuilder
> ret(new SimpleFilteredBreakIteratorBuilder(where
, status
));
393 if(!ret
.isValid()) status
= U_MEMORY_ALLOCATION_ERROR
;
397 FilteredBreakIteratorBuilder
*
398 FilteredBreakIteratorBuilder::createInstance(UErrorCode
& status
) {
399 if(U_FAILURE(status
)) return NULL
;
401 LocalPointer
<FilteredBreakIteratorBuilder
> ret(new SimpleFilteredBreakIteratorBuilder());
402 if(!ret
.isValid()) status
= U_MEMORY_ALLOCATION_ERROR
;
408 #endif //#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION