]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/filteredbrk.cpp
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / filteredbrk.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
57a6839d
A
3/*
4*******************************************************************************
f3c0d7a5 5* Copyright (C) 2014-2015, International Business Machines Corporation and
57a6839d
A
6* others. All Rights Reserved.
7*******************************************************************************
8*/
9
b331163b
A
10#include "unicode/utypes.h"
11#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
57a6839d 12
b331163b 13#include "cmemory.h"
57a6839d 14
b331163b
A
15#include "unicode/filteredbrk.h"
16#include "unicode/ucharstriebuilder.h"
17#include "unicode/ures.h"
57a6839d 18
b331163b
A
19#include "uresimp.h" // ures_getByKeyWithFallback
20#include "ubrkimpl.h" // U_ICUDATA_BRKITR
21#include "uvector.h"
22#include "cmemory.h"
57a6839d
A
23
24U_NAMESPACE_BEGIN
25
b331163b
A
26#ifndef FB_DEBUG
27#define FB_DEBUG 0
28#endif
29
30#if FB_DEBUG
31#include <stdio.h>
32static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
33 char buf[2048];
34 if(s) {
35 s->extract(0,s->length(),buf,2048);
36 } else {
37 strcpy(buf,"NULL");
38 }
39 fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
40 f, l, m, buf, (const void*)s, b?'T':'F',(int)d);
41}
42
43#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
44#else
45#define FB_TRACE(m,s,b,d)
46#endif
47
2ca993e8
A
48/**
49 * Used with sortedInsert()
50 */
b331163b
A
51static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
52 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
53 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
54 return a.compare(b);
55}
56
57/**
58 * A UVector which implements a set of strings.
59 */
60class U_COMMON_API UStringSet : public UVector {
61 public:
62 UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
63 uhash_compareUnicodeString,
64 1,
65 status) {}
66 virtual ~UStringSet();
67 /**
68 * Is this UnicodeSet contained?
69 */
70 inline UBool contains(const UnicodeString& s) {
71 return contains((void*) &s);
72 }
73 using UVector::contains;
74 /**
75 * Return the ith UnicodeString alias
76 */
77 inline const UnicodeString* getStringAt(int32_t i) const {
78 return (const UnicodeString*)elementAt(i);
79 }
80 /**
81 * Adopt the UnicodeString if not already contained.
82 * Caller no longer owns the pointer in any case.
83 * @return true if adopted successfully, false otherwise (error, or else duplicate)
84 */
85 inline UBool adopt(UnicodeString *str, UErrorCode &status) {
86 if(U_FAILURE(status) || contains(*str)) {
87 delete str;
88 return false;
89 } else {
90 sortedInsert(str, compareUnicodeString, status);
91 if(U_FAILURE(status)) {
92 delete str;
93 return false;
94 }
95 return true;
96 }
97 }
98 /**
99 * Add by value.
100 * @return true if successfully adopted.
101 */
102 inline UBool add(const UnicodeString& str, UErrorCode &status) {
103 if(U_FAILURE(status)) return false;
104 UnicodeString *t = new UnicodeString(str);
105 if(t==NULL) {
106 status = U_MEMORY_ALLOCATION_ERROR; return false;
107 }
108 return adopt(t, status);
109 }
110 /**
111 * Remove this string.
112 * @return true if successfully removed, false otherwise (error, or else it wasn't there)
113 */
114 inline UBool remove(const UnicodeString &s, UErrorCode &status) {
115 if(U_FAILURE(status)) return false;
116 return removeElement((void*) &s);
117 }
118};
119
120/**
121 * Virtual, won't be inlined
122 */
123UStringSet::~UStringSet() {}
124
2ca993e8 125/* ----------------------------------------------------------- */
57a6839d 126
2ca993e8
A
127
128/* Filtered Break constants */
57a6839d
A
129static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
130static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
131static const int32_t kSuppressInReverse = (1<<0);
132static const int32_t kAddToForward = (1<<1);
2ca993e8
A
133static const UChar kFULLSTOP = 0x002E; // '.'
134
135/**
136 * Shared data for SimpleFilteredSentenceBreakIterator
137 */
138class SimpleFilteredSentenceBreakData : public UMemory {
139public:
140 SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
141 : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
142 SimpleFilteredSentenceBreakData *incr() { refcount++; return this; }
143 SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
144 virtual ~SimpleFilteredSentenceBreakData();
57a6839d 145
2ca993e8
A
146 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
147 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
148 int32_t refcount;
149};
150
151SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
152
153/**
154 * Concrete implementation
155 */
b331163b 156class SimpleFilteredSentenceBreakIterator : public BreakIterator {
57a6839d 157public:
b331163b
A
158 SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
159 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
160 virtual ~SimpleFilteredSentenceBreakIterator();
57a6839d 161private:
2ca993e8 162 SimpleFilteredSentenceBreakData *fData;
57a6839d
A
163 LocalPointer<BreakIterator> fDelegate;
164 LocalUTextPointer fText;
57a6839d
A
165
166 /* -- subclass interface -- */
167public:
168 /* -- cloning and other subclass stuff -- */
169 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
170 int32_t &/*BufferSize*/,
171 UErrorCode &status) {
172 // for now - always deep clone
173 status = U_SAFECLONE_ALLOCATED_WARNING;
174 return clone();
175 }
b331163b 176 virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); }
57a6839d 177 virtual UClassID getDynamicClassID(void) const { return NULL; }
b331163b 178 virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; }
57a6839d
A
179
180 /* -- text modifying -- */
181 virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
182 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
183 virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
184 virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
185
186 /* -- other functions that are just delegated -- */
187 virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
188 virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
189
190 /* -- ITERATION -- */
2ca993e8
A
191 virtual int32_t first(void);
192 virtual int32_t preceding(int32_t offset);
193 virtual int32_t previous(void);
194 virtual UBool isBoundary(int32_t offset);
195 virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
57a6839d
A
196
197 virtual int32_t next(void);
2ca993e8
A
198
199 virtual int32_t next(int32_t n);
b331163b 200 virtual int32_t following(int32_t offset);
2ca993e8 201 virtual int32_t last(void);
57a6839d 202
b331163b 203private:
2ca993e8
A
204 /**
205 * Given that the fDelegate has already given its "initial" answer,
206 * find the NEXT actual (non-excepted) break.
207 * @param n initial position from delegate
208 * @return new break position or UBRK_DONE
209 */
210 int32_t internalNext(int32_t n);
211 /**
212 * Given that the fDelegate has already given its "initial" answer,
213 * find the PREV actual (non-excepted) break.
214 * @param n initial position from delegate
215 * @return new break position or UBRK_DONE
216 */
217 int32_t internalPrev(int32_t n);
218 /**
219 * set up the UText with the value of the fDelegate.
220 * Call this before calling breakExceptionAt.
221 * May be able to avoid excess calls
222 */
223 void resetState(UErrorCode &status);
224 /**
225 * Is there a match (exception) at this spot?
226 */
227 enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
228 /**
229 * Determine if there is an exception at this spot
230 * @param n spot to check
231 * @return kNoExceptionHere or kExceptionHere
232 **/
233 enum EFBMatchResult breakExceptionAt(int32_t n);
57a6839d
A
234};
235
b331163b 236SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
2ca993e8 237 : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone())
57a6839d 238{
57a6839d
A
239}
240
241
b331163b 242SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
57a6839d 243 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
2ca993e8
A
244 fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
245 fDelegate(adopt)
57a6839d
A
246{
247 // all set..
248}
249
2ca993e8
A
250SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
251 fData = fData->decr();
b331163b
A
252}
253
2ca993e8
A
254void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
255 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
b331163b
A
256}
257
2ca993e8
A
258SimpleFilteredSentenceBreakIterator::EFBMatchResult
259SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
260 int64_t bestPosn = -1;
261 int32_t bestValue = -1;
57a6839d
A
262 // loops while 'n' points to an exception.
263 utext_setNativeIndex(fText.getAlias(), n); // from n..
2ca993e8 264 fData->fBackwardsTrie->reset();
57a6839d 265 UChar32 uch;
2ca993e8 266
57a6839d
A
267 //if(debug2) u_printf(" n@ %d\n", n);
268 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
269 if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here??
270 // TODO only do this the 1st time?
271 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
272 } else {
273 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
274 uch = utext_next32(fText.getAlias());
275 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
276 }
57a6839d 277
2ca993e8 278 UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
57a6839d
A
279
280 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and..
2ca993e8 281 USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
57a6839d
A
282 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
283 bestPosn = utext_getNativeIndex(fText.getAlias());
2ca993e8 284 bestValue = fData->fBackwardsTrie->getValue();
57a6839d
A
285 }
286 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
287 }
288
289 if(USTRINGTRIE_MATCHES(r)) { // exact match?
290 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
2ca993e8 291 bestValue = fData->fBackwardsTrie->getValue();
57a6839d
A
292 bestPosn = utext_getNativeIndex(fText.getAlias());
293 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
294 }
295
296 if(bestPosn>=0) {
297 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
298
299 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
300 //int32_t bestValue = fBackwardsTrie->getValue();
301 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
302
2ca993e8
A
303 if(bestPosn>0) {
304 UChar32 prevch = utext_char32At(fText.getAlias(), bestPosn-1); // char before the best match
305 if (prevch != U_SENTINEL && u_isUAlphabetic(prevch)) {
306 // The match is preceded by other alphabetic characters, => invalid
307 return kNoExceptionHere;
308 }
309 }
310
57a6839d
A
311 if(bestValue == kMATCH) { // exact match!
312 //if(debug2) u_printf(" exact backward match\n");
2ca993e8 313 return kExceptionHere; // See if the next is another exception.
57a6839d 314 } else if(bestValue == kPARTIAL
2ca993e8 315 && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
57a6839d
A
316 //if(debug2) u_printf(" partial backward match\n");
317 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
318 // to see if it matches something going forward.
2ca993e8 319 fData->fForwardsPartialTrie->reset();
57a6839d
A
320 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
321 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
322 //if(debug2) u_printf("Retrying at %d\n", bestPosn);
323 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
2ca993e8 324 USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
57a6839d
A
325 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
326 }
327 if(USTRINGTRIE_MATCHES(rfwd)) {
328 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
329 // only full matches here, nothing to check
330 // skip the next:
2ca993e8 331 return kExceptionHere;
57a6839d
A
332 } else {
333 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
334 // no match (no exception) -return the 'underlying' break
2ca993e8 335 return kNoExceptionHere;
57a6839d
A
336 }
337 } else {
2ca993e8 338 return kNoExceptionHere; // internal error and/or no forwards trie
57a6839d
A
339 }
340 } else {
341 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match
2ca993e8 342 return kNoExceptionHere; // No match - so exit. Not an exception.
57a6839d 343 }
57a6839d
A
344}
345
2ca993e8
A
346// the workhorse single next.
347int32_t
348SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
349 if(n == UBRK_DONE || // at end or
350 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
351 return n;
352 }
353 // OK, do we need to break here?
354 UErrorCode status = U_ZERO_ERROR;
355 // refresh text
356 resetState(status);
357 if(U_FAILURE(status)) return UBRK_DONE; // bail out
358 int64_t utextLen = utext_nativeLength(fText.getAlias());
359
360 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
361 while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
362 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
363
364 switch(m) {
365 case kExceptionHere:
366 n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
367 continue;
57a6839d 368
2ca993e8
A
369 default:
370 case kNoExceptionHere:
371 return n;
372 }
373 }
374 return n;
57a6839d 375}
57a6839d 376
2ca993e8
A
377int32_t
378SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
379 if(n == 0 || n == UBRK_DONE || // at end or
380 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
381 return n;
b331163b
A
382 }
383 // OK, do we need to break here?
384 UErrorCode status = U_ZERO_ERROR;
385 // refresh text
2ca993e8
A
386 resetState(status);
387 if(U_FAILURE(status)) return UBRK_DONE; // bail out
b331163b 388
2ca993e8
A
389 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
390 while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
391 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
b331163b 392
2ca993e8
A
393 switch(m) {
394 case kExceptionHere:
395 n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
396 continue;
b331163b 397
2ca993e8
A
398 default:
399 case kNoExceptionHere:
400 return n;
b331163b 401 }
2ca993e8
A
402 }
403 return n;
404}
b331163b 405
b331163b 406
2ca993e8
A
407int32_t
408SimpleFilteredSentenceBreakIterator::next() {
409 return internalNext(fDelegate->next());
410}
411
412int32_t
413SimpleFilteredSentenceBreakIterator::first(void) {
f3c0d7a5
A
414 // Don't suppress a break opportunity at the beginning of text.
415 return fDelegate->first();
b331163b 416}
57a6839d 417
2ca993e8
A
418int32_t
419SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
420 return internalPrev(fDelegate->preceding(offset));
421}
422
423int32_t
424SimpleFilteredSentenceBreakIterator::previous(void) {
425 return internalPrev(fDelegate->previous());
426}
427
428UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
f3c0d7a5
A
429 if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
430
431 if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions
2ca993e8
A
432
433 UErrorCode status = U_ZERO_ERROR;
434 resetState(status);
435
436 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
437
438 switch(m) {
439 case kExceptionHere:
440 return false;
441 default:
442 case kNoExceptionHere:
443 return true;
444 }
445}
446
447int32_t
448SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
449 return internalNext(fDelegate->next(offset));
450}
451
452int32_t
453SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
454 return internalNext(fDelegate->following(offset));
455}
456
457int32_t
458SimpleFilteredSentenceBreakIterator::last(void) {
459 // Don't suppress a break opportunity at the end of text.
460 return fDelegate->last();
461}
462
463
b331163b
A
464/**
465 * Concrete implementation of builder class.
466 */
467class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
57a6839d
A
468public:
469 virtual ~SimpleFilteredBreakIteratorBuilder();
470 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
b331163b 471 SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
57a6839d
A
472 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
473 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
474 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
475private:
b331163b 476 UStringSet fSet;
57a6839d
A
477};
478
479SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
480{
481}
482
2ca993e8 483SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
b331163b
A
484 : fSet(status)
485{
486}
487
57a6839d 488SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
b331163b 489 : fSet(status)
57a6839d
A
490{
491 if(U_SUCCESS(status)) {
f3c0d7a5
A
492 UErrorCode subStatus = U_ZERO_ERROR;
493 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
494 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
495 status = subStatus; // copy the failing status
496#if FB_DEBUG
497 fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
498#endif
499 return; // leaves the builder empty, if you try to use it.
500 }
501 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &subStatus));
502 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
503 status = subStatus; // copy the failing status
504#if FB_DEBUG
505 fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
506#endif
507 return; // leaves the builder empty, if you try to use it.
508 }
509 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &subStatus));
510
511#if FB_DEBUG
512 {
513 UErrorCode subsub = subStatus;
514 fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
515 }
516#endif
517
518 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
519 status = subStatus; // copy the failing status
520#if FB_DEBUG
521 fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
522#endif
523 return; // leaves the builder empty, if you try to use it.
524 }
57a6839d
A
525
526 LocalUResourceBundlePointer strs;
f3c0d7a5 527 subStatus = status; // Pick up inherited warning status now
57a6839d
A
528 do {
529 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
530 if(strs.isValid() && U_SUCCESS(subStatus)) {
531 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
532 suppressBreakAfter(str, status); // load the string
533 }
534 } while (strs.isValid() && U_SUCCESS(subStatus));
535 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
536 status = subStatus;
537 }
538 }
539}
540
57a6839d
A
541UBool
542SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
543{
b331163b
A
544 UBool r = fSet.add(exception, status);
545 FB_TRACE("suppressBreakAfter",&exception,r,0);
546 return r;
57a6839d
A
547}
548
549UBool
550SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
551{
b331163b
A
552 UBool r = fSet.remove(exception, status);
553 FB_TRACE("unsuppressBreakAfter",&exception,r,0);
554 return r;
555}
556
557/**
558 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
559 * Work around this.
560 *
561 * Note: "new UnicodeString[subCount]" ends up calling global operator new
562 * on MSVC2012 for some reason.
563 */
564static inline UnicodeString* newUnicodeStringArray(size_t count) {
565 return new UnicodeString[count ? count : 1];
57a6839d
A
566}
567
568BreakIterator *
569SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
570 LocalPointer<BreakIterator> adopt(adoptBreakIterator);
571
b331163b
A
572 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
573 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
57a6839d
A
574 if(U_FAILURE(status)) {
575 return NULL;
576 }
577
57a6839d
A
578 int32_t revCount = 0;
579 int32_t fwdCount = 0;
580
581 int32_t subCount = fSet.size();
b331163b
A
582
583 UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
2ca993e8 584
b331163b
A
585 LocalArray<UnicodeString> ustrs(ustrs_ptr);
586
587 LocalMemory<int> partials;
588 partials.allocateInsteadAndReset(subCount);
57a6839d
A
589
590 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs.
591 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
592
593 int n=0;
b331163b
A
594 for ( int32_t i = 0;
595 i<fSet.size();
57a6839d 596 i++) {
b331163b
A
597 const UnicodeString *abbr = fSet.getStringAt(i);
598 if(abbr) {
599 FB_TRACE("build",abbr,TRUE,i);
600 ustrs[n] = *abbr; // copy by value
601 FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i);
602 } else {
603 FB_TRACE("build",abbr,FALSE,i);
604 status = U_MEMORY_ALLOCATION_ERROR;
605 return NULL;
606 }
57a6839d
A
607 partials[n] = 0; // default: not partial
608 n++;
609 }
610 // first pass - find partials.
611 for(int i=0;i<subCount;i++) {
612 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
613 if(nn>-1 && (nn+1)!=ustrs[i].length()) {
b331163b 614 FB_TRACE("partial",&ustrs[i],FALSE,i);
57a6839d
A
615 // is partial.
616 // is it unique?
617 int sameAs = -1;
618 for(int j=0;j<subCount;j++) {
619 if(j==i) continue;
620 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
b331163b 621 FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
57a6839d
A
622 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
623 if(partials[j]==0) { // hasn't been processed yet
2ca993e8 624 partials[j] = (ustrs[j].length() == nn+1)? (kSuppressInReverse | kAddToForward): kAddToForward;
b331163b 625 FB_TRACE("suppressing",&ustrs[j],FALSE,j);
57a6839d
A
626 } else if(partials[j] & kSuppressInReverse) {
627 sameAs = j; // the other entry is already in the reverse table.
628 }
629 }
630 }
b331163b
A
631 FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs);
632 FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]);
57a6839d
A
633 UnicodeString prefix(ustrs[i], 0, nn+1);
634 if(sameAs == -1 && partials[i] == 0) {
635 // first one - add the prefix to the reverse table.
636 prefix.reverse();
637 builder->add(prefix, kPARTIAL, status);
638 revCount++;
b331163b
A
639 FB_TRACE("Added partial",&prefix,FALSE, i);
640 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
2ca993e8 641 partials[i] = kAddToForward;
57a6839d 642 } else {
b331163b
A
643 FB_TRACE("NOT adding partial",&prefix,FALSE, i);
644 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
57a6839d
A
645 }
646 }
647 }
648 for(int i=0;i<subCount;i++) {
2ca993e8 649 if((partials[i] & kSuppressInReverse) == 0) {
57a6839d
A
650 ustrs[i].reverse();
651 builder->add(ustrs[i], kMATCH, status);
652 revCount++;
b331163b 653 FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
2ca993e8
A
654 }
655 if((partials[i] & kAddToForward) != 0) {
b331163b 656 FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
57a6839d
A
657
658 // an optimization would be to only add the portion after the '.'
659 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
660 // instead of "Ph.D." since we already know the "Ph." part is a match.
661 // would need the trie to be able to hold 0-length strings, though.
662 builder2->add(ustrs[i], kMATCH, status); // forward
663 fwdCount++;
664 //ustrs[i].reverse();
665 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
666 }
667 }
b331163b 668 FB_TRACE("AbbrCount",NULL,FALSE, subCount);
57a6839d
A
669
670 if(revCount>0) {
671 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
672 if(U_FAILURE(status)) {
b331163b 673 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
57a6839d
A
674 return NULL;
675 }
676 }
677
678 if(fwdCount>0) {
679 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
680 if(U_FAILURE(status)) {
b331163b 681 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
57a6839d
A
682 return NULL;
683 }
684 }
685
b331163b 686 return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
57a6839d
A
687}
688
689
b331163b 690// ----------- Base class implementation
57a6839d
A
691
692FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
693}
694
695FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
696}
697
698FilteredBreakIteratorBuilder *
699FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
700 if(U_FAILURE(status)) return NULL;
b331163b
A
701 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
702 return (U_SUCCESS(status))? ret.orphan(): NULL;
57a6839d
A
703}
704
705FilteredBreakIteratorBuilder *
706FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
707 if(U_FAILURE(status)) return NULL;
b331163b
A
708 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
709 return (U_SUCCESS(status))? ret.orphan(): NULL;
57a6839d
A
710}
711
712U_NAMESPACE_END
713
f3c0d7a5 714#endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION