]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/filteredbrk.cpp
ICU-531.31.tar.gz
[apple/icu.git] / icuSources / i18n / filteredbrk.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7
8 #include "unicode/filteredbrk.h"
9
10 #if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION
11
12 #include <unicode/ucharstriebuilder.h>
13
14 #include <set>
15 #include <string>
16 #include <functional>
17 #include "uresimp.h"
18 #include "ubrkimpl.h"
19
20 U_NAMESPACE_BEGIN
21
22 using namespace std;
23
24 static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
25 static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
26 static const int32_t kSuppressInReverse = (1<<0);
27 static const int32_t kAddToForward = (1<<1);
28 static const UChar kFULLSTOP = 0x002E; // '.'
29
30 class ULISentenceBreakIterator : public BreakIterator {
31 public:
32 ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
33 virtual ~ULISentenceBreakIterator() {}
34 ULISentenceBreakIterator(const ULISentenceBreakIterator& other);
35 private:
36 LocalPointer<BreakIterator> fDelegate;
37 LocalUTextPointer fText;
38 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
39 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
40
41 /* -- subclass interface -- */
42 public:
43 /* -- cloning and other subclass stuff -- */
44 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
45 int32_t &/*BufferSize*/,
46 UErrorCode &status) {
47 // for now - always deep clone
48 status = U_SAFECLONE_ALLOCATED_WARNING;
49 return clone();
50 }
51 virtual BreakIterator* clone(void) const { return new ULISentenceBreakIterator(*this); }
52 virtual UClassID getDynamicClassID(void) const { return NULL; }
53 virtual UBool operator==(const BreakIterator& o) const { if(*this==o) return true; return false; }
54
55 /* -- text modifying -- */
56 virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
57 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
58 virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
59 virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
60
61 /* -- other functions that are just delegated -- */
62 virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
63 virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
64
65 /* -- ITERATION -- */
66 virtual int32_t first(void) { return fDelegate->first(); }
67 virtual int32_t preceding(int32_t offset) { return fDelegate->preceding(offset); }
68 virtual int32_t previous(void) { return fDelegate->previous(); }
69 virtual UBool isBoundary(int32_t offset) { return fDelegate->isBoundary(offset); }
70 virtual int32_t current(void) const { return fDelegate->current(); }
71
72 virtual int32_t next(void);
73
74 virtual int32_t next(int32_t n) { return fDelegate->next(n); }
75 virtual int32_t following(int32_t offset) { return fDelegate->following(offset); }
76 virtual int32_t last(void) { return fDelegate->last(); }
77
78 };
79
80 ULISentenceBreakIterator::ULISentenceBreakIterator(const ULISentenceBreakIterator& other)
81 : BreakIterator(other), fDelegate(other.fDelegate->clone())
82 {
83 /*
84 TODO: not able to clone Tries. Should be a refcounted hidden master instead.
85 if(other.fBackwardsTrie.isValid()) {
86 fBackwardsTrie.adoptInstead(other.fBackwardsTrie->clone());
87 }
88 if(other.fForwardsPartialTrie.isValid()) {
89 fForwardsPartialTrie.adoptInstead(other.fForwardsPartialTrie->clone());
90 }
91 */
92 }
93
94
95 ULISentenceBreakIterator::ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
96 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
97 fDelegate(adopt),
98 fBackwardsTrie(backwards),
99 fForwardsPartialTrie(forwards)
100 {
101 // all set..
102 }
103
104 int32_t ULISentenceBreakIterator::next() {
105 int32_t n = fDelegate->next();
106 if(n == UBRK_DONE || // at end or
107 fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
108 return n;
109 }
110 // OK, do we need to break here?
111 UErrorCode status = U_ZERO_ERROR;
112 // refresh text
113 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
114 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
115 do { // outer loop runs once per underlying break (from fDelegate).
116 // loops while 'n' points to an exception.
117 utext_setNativeIndex(fText.getAlias(), n); // from n..
118 fBackwardsTrie->reset();
119 UChar32 uch;
120 //if(debug2) u_printf(" n@ %d\n", n);
121 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
122 if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here??
123 // TODO only do this the 1st time?
124 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
125 } else {
126 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
127 uch = utext_next32(fText.getAlias());
128 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
129 }
130 UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
131
132 int32_t bestPosn = -1;
133 int32_t bestValue = -1;
134
135 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and..
136 USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
137 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
138 bestPosn = utext_getNativeIndex(fText.getAlias());
139 bestValue = fBackwardsTrie->getValue();
140 }
141 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
142 }
143
144 if(USTRINGTRIE_MATCHES(r)) { // exact match?
145 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
146 bestValue = fBackwardsTrie->getValue();
147 bestPosn = utext_getNativeIndex(fText.getAlias());
148 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
149 }
150
151 if(bestPosn>=0) {
152 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
153
154 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
155 //int32_t bestValue = fBackwardsTrie->getValue();
156 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
157
158 if(bestValue == kMATCH) { // exact match!
159 //if(debug2) u_printf(" exact backward match\n");
160 n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
161 if(n==UBRK_DONE) return n;
162 continue; // See if the next is another exception.
163 } else if(bestValue == kPARTIAL
164 && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
165 //if(debug2) u_printf(" partial backward match\n");
166 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
167 // to see if it matches something going forward.
168 fForwardsPartialTrie->reset();
169 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
170 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
171 //if(debug2) u_printf("Retrying at %d\n", bestPosn);
172 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
173 USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) {
174 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
175 }
176 if(USTRINGTRIE_MATCHES(rfwd)) {
177 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
178 // only full matches here, nothing to check
179 // skip the next:
180 n = fDelegate->next();
181 if(n==UBRK_DONE) return n;
182 continue;
183 } else {
184 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
185 // no match (no exception) -return the 'underlying' break
186 return n;
187 }
188 } else {
189 return n; // internal error and/or no forwards trie
190 }
191 } else {
192 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match
193 return n; // No match - so exit. Not an exception.
194 }
195 } while(n != UBRK_DONE);
196 return n;
197 }
198
199 U_NAMESPACE_END
200
201 #if 0
202 // Would improve performance - but, platform issues.
203 // for the 'set'
204 namespace std {
205 template <> struct hash<icu::UnicodeString> {
206 size_t operator()( const UnicodeString& str ) const {
207 return (size_t)str.hashCode();
208 }
209 };
210 }
211 #endif
212
213 U_NAMESPACE_BEGIN
214
215 class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
216 public:
217 virtual ~SimpleFilteredBreakIteratorBuilder();
218 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
219 SimpleFilteredBreakIteratorBuilder();
220 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
221 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
222 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
223 private:
224 set<UnicodeString> fSet;
225 };
226
227 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
228 {
229 }
230
231 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
232 : fSet()
233 {
234 if(U_SUCCESS(status)) {
235 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status));
236 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &status));
237 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &status));
238 if(U_FAILURE(status)) return; // leaves the builder empty, if you try to use it.
239
240 LocalUResourceBundlePointer strs;
241 UErrorCode subStatus = status;
242 do {
243 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
244 if(strs.isValid() && U_SUCCESS(subStatus)) {
245 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
246 suppressBreakAfter(str, status); // load the string
247 }
248 } while (strs.isValid() && U_SUCCESS(subStatus));
249 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
250 status = subStatus;
251 }
252 }
253 }
254
255 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder()
256 : fSet()
257 {
258 }
259
260 UBool
261 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
262 {
263 if( U_FAILURE(status) ) return FALSE;
264 return fSet.insert(exception).second;
265 }
266
267 UBool
268 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
269 {
270 if( U_FAILURE(status) ) return FALSE;
271 return ((fSet.erase(exception)) != 0);
272 }
273
274 BreakIterator *
275 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
276 LocalPointer<BreakIterator> adopt(adoptBreakIterator);
277
278 if(U_FAILURE(status)) {
279 return NULL;
280 }
281
282 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status));
283 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status));
284
285 int32_t revCount = 0;
286 int32_t fwdCount = 0;
287
288 int32_t subCount = fSet.size();
289 LocalArray<UnicodeString> ustrs(new UnicodeString[subCount]);
290 LocalArray<int> partials(new int[subCount]);
291
292 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs.
293 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
294
295 int n=0;
296 for ( set<UnicodeString>::iterator i = fSet.begin();
297 i != fSet.end();
298 i++) {
299 const UnicodeString &abbr = *i;
300 ustrs[n] = abbr;
301 partials[n] = 0; // default: not partial
302 n++;
303 }
304 // first pass - find partials.
305 for(int i=0;i<subCount;i++) {
306 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
307 if(nn>-1 && (nn+1)!=ustrs[i].length()) {
308 //if(true) u_printf("Is a partial: /%S/\n", ustrs[i].getTerminatedBuffer());
309 // is partial.
310 // is it unique?
311 int sameAs = -1;
312 for(int j=0;j<subCount;j++) {
313 if(j==i) continue;
314 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
315 //if(true) u_printf("Prefix match: /%S/ to %d\n", ustrs[j].getTerminatedBuffer(), nn+1);
316 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
317 if(partials[j]==0) { // hasn't been processed yet
318 partials[j] = kSuppressInReverse | kAddToForward;
319 //if(true) u_printf("Suppressing: /%S/\n", ustrs[j].getTerminatedBuffer());
320 } else if(partials[j] & kSuppressInReverse) {
321 sameAs = j; // the other entry is already in the reverse table.
322 }
323 }
324 }
325 //if(debug2) u_printf("for partial /%S/ same=%d partials=%d\n", ustrs[i].getTerminatedBuffer(), sameAs, partials[i]);
326 UnicodeString prefix(ustrs[i], 0, nn+1);
327 if(sameAs == -1 && partials[i] == 0) {
328 // first one - add the prefix to the reverse table.
329 prefix.reverse();
330 builder->add(prefix, kPARTIAL, status);
331 revCount++;
332 //if(debug2) u_printf("Added Partial: /%S/ from /%S/ status=%s\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer(), u_errorName(status));
333 partials[i] = kSuppressInReverse | kAddToForward;
334 } else {
335 //if(debug2) u_printf(" // not adding partial for /%S/ from /%S/\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer());
336 }
337 }
338 }
339 for(int i=0;i<subCount;i++) {
340 if(partials[i]==0) {
341 ustrs[i].reverse();
342 builder->add(ustrs[i], kMATCH, status);
343 revCount++;
344 //if(debug2) u_printf("Added: /%S/ status=%s\n", ustrs[i].getTerminatedBuffer(), u_errorName(status));
345 } else {
346 //if(debug2) u_printf(" Adding fwd: /%S/\n", ustrs[i].getTerminatedBuffer());
347
348 // an optimization would be to only add the portion after the '.'
349 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
350 // instead of "Ph.D." since we already know the "Ph." part is a match.
351 // would need the trie to be able to hold 0-length strings, though.
352 builder2->add(ustrs[i], kMATCH, status); // forward
353 fwdCount++;
354 //ustrs[i].reverse();
355 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
356 }
357 }
358 //if(debug) u_printf(" %s has %d abbrs.\n", fJSONSource.c_str(), subCount);
359
360 if(revCount>0) {
361 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
362 if(U_FAILURE(status)) {
363 //printf("Error %s building backwards\n", u_errorName(status));
364 return NULL;
365 }
366 }
367
368 if(fwdCount>0) {
369 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
370 if(U_FAILURE(status)) {
371 //printf("Error %s building forwards\n", u_errorName(status));
372 return NULL;
373 }
374 }
375
376 return new ULISentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
377 }
378
379
380 // -----------
381
382 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
383 }
384
385 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
386 }
387
388 FilteredBreakIteratorBuilder *
389 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
390 if(U_FAILURE(status)) return NULL;
391
392 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status));
393 if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR;
394 return ret.orphan();
395 }
396
397 FilteredBreakIteratorBuilder *
398 FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
399 if(U_FAILURE(status)) return NULL;
400
401 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder());
402 if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR;
403 return ret.orphan();
404 }
405
406 U_NAMESPACE_END
407
408 #endif //#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION