]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/filteredbrk.cpp
ICU-57165.0.1.tar.gz
[apple/icu.git] / icuSources / common / filteredbrk.cpp
CommitLineData
57a6839d
A
1/*
2*******************************************************************************
2ca993e8 3* Copyright (C) 2014-2016, International Business Machines Corporation and
57a6839d
A
4* others. All Rights Reserved.
5*******************************************************************************
6*/
7
b331163b
A
8#include "unicode/utypes.h"
9#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
57a6839d 10
b331163b 11#include "cmemory.h"
57a6839d 12
b331163b
A
13#include "unicode/filteredbrk.h"
14#include "unicode/ucharstriebuilder.h"
15#include "unicode/ures.h"
57a6839d 16
b331163b
A
17#include "uresimp.h" // ures_getByKeyWithFallback
18#include "ubrkimpl.h" // U_ICUDATA_BRKITR
19#include "uvector.h"
20#include "cmemory.h"
57a6839d
A
21
22U_NAMESPACE_BEGIN
23
b331163b
A
#ifndef FB_DEBUG
#define FB_DEBUG 0
#endif

#if FB_DEBUG
#include <stdio.h>
/**
 * Debug-only helper behind FB_TRACE: writes one diagnostic line to stderr.
 * @param m message text
 * @param s string to show, or NULL (printed as "NULL")
 * @param b flag, printed as 'T' or 'F'
 * @param d integer detail value
 * @param f source file of the trace point
 * @param l source line of the trace point
 */
static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
  char buf[2048];
  if(s == NULL) {
    strcpy(buf,"NULL");
  } else {
    s->extract(0,s->length(),buf,2048);
  }
  const char flag = b ? 'T' : 'F';
  fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
          f, l, m, buf, (const void*)s, flag, (int)d);
}

// Tracing enabled: forward to _fb_trace together with the call site.
#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
// Tracing disabled: the macro (and its arguments) expand to nothing.
#define FB_TRACE(m,s,b,d)
#endif
45
2ca993e8
A
46/**
47 * Used with sortedInsert()
48 */
b331163b
A
49static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
50 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
51 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
52 return a.compare(b);
53}
54
55/**
56 * A UVector which implements a set of strings.
57 */
58class U_COMMON_API UStringSet : public UVector {
59 public:
60 UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
61 uhash_compareUnicodeString,
62 1,
63 status) {}
64 virtual ~UStringSet();
65 /**
66 * Is this UnicodeSet contained?
67 */
68 inline UBool contains(const UnicodeString& s) {
69 return contains((void*) &s);
70 }
71 using UVector::contains;
72 /**
73 * Return the ith UnicodeString alias
74 */
75 inline const UnicodeString* getStringAt(int32_t i) const {
76 return (const UnicodeString*)elementAt(i);
77 }
78 /**
79 * Adopt the UnicodeString if not already contained.
80 * Caller no longer owns the pointer in any case.
81 * @return true if adopted successfully, false otherwise (error, or else duplicate)
82 */
83 inline UBool adopt(UnicodeString *str, UErrorCode &status) {
84 if(U_FAILURE(status) || contains(*str)) {
85 delete str;
86 return false;
87 } else {
88 sortedInsert(str, compareUnicodeString, status);
89 if(U_FAILURE(status)) {
90 delete str;
91 return false;
92 }
93 return true;
94 }
95 }
96 /**
97 * Add by value.
98 * @return true if successfully adopted.
99 */
100 inline UBool add(const UnicodeString& str, UErrorCode &status) {
101 if(U_FAILURE(status)) return false;
102 UnicodeString *t = new UnicodeString(str);
103 if(t==NULL) {
104 status = U_MEMORY_ALLOCATION_ERROR; return false;
105 }
106 return adopt(t, status);
107 }
108 /**
109 * Remove this string.
110 * @return true if successfully removed, false otherwise (error, or else it wasn't there)
111 */
112 inline UBool remove(const UnicodeString &s, UErrorCode &status) {
113 if(U_FAILURE(status)) return false;
114 return removeElement((void*) &s);
115 }
116};
117
118/**
119 * Virtual, won't be inlined
120 */
121UStringSet::~UStringSet() {}
122
2ca993e8 123/* ----------------------------------------------------------- */
57a6839d 124
2ca993e8
A
125
126/* Filtered Break constants */
57a6839d
A
127static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
128static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
129static const int32_t kSuppressInReverse = (1<<0);
130static const int32_t kAddToForward = (1<<1);
2ca993e8
A
131static const UChar kFULLSTOP = 0x002E; // '.'
132
133/**
134 * Shared data for SimpleFilteredSentenceBreakIterator
135 */
136class SimpleFilteredSentenceBreakData : public UMemory {
137public:
138 SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
139 : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
140 SimpleFilteredSentenceBreakData *incr() { refcount++; return this; }
141 SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
142 virtual ~SimpleFilteredSentenceBreakData();
57a6839d 143
2ca993e8
A
144 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
145 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
146 int32_t refcount;
147};
148
149SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
150
151/**
152 * Concrete implementation
153 */
b331163b 154class SimpleFilteredSentenceBreakIterator : public BreakIterator {
57a6839d 155public:
b331163b
A
156 SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
157 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
158 virtual ~SimpleFilteredSentenceBreakIterator();
57a6839d 159private:
2ca993e8 160 SimpleFilteredSentenceBreakData *fData;
57a6839d
A
161 LocalPointer<BreakIterator> fDelegate;
162 LocalUTextPointer fText;
57a6839d
A
163
164 /* -- subclass interface -- */
165public:
166 /* -- cloning and other subclass stuff -- */
167 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
168 int32_t &/*BufferSize*/,
169 UErrorCode &status) {
170 // for now - always deep clone
171 status = U_SAFECLONE_ALLOCATED_WARNING;
172 return clone();
173 }
b331163b 174 virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); }
57a6839d 175 virtual UClassID getDynamicClassID(void) const { return NULL; }
b331163b 176 virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; }
57a6839d
A
177
178 /* -- text modifying -- */
179 virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
180 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
181 virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
182 virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
183
184 /* -- other functions that are just delegated -- */
185 virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
186 virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
187
188 /* -- ITERATION -- */
2ca993e8
A
189 virtual int32_t first(void);
190 virtual int32_t preceding(int32_t offset);
191 virtual int32_t previous(void);
192 virtual UBool isBoundary(int32_t offset);
193 virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
57a6839d
A
194
195 virtual int32_t next(void);
2ca993e8
A
196
197 virtual int32_t next(int32_t n);
b331163b 198 virtual int32_t following(int32_t offset);
2ca993e8 199 virtual int32_t last(void);
57a6839d 200
b331163b 201private:
2ca993e8
A
202 /**
203 * Given that the fDelegate has already given its "initial" answer,
204 * find the NEXT actual (non-excepted) break.
205 * @param n initial position from delegate
206 * @return new break position or UBRK_DONE
207 */
208 int32_t internalNext(int32_t n);
209 /**
210 * Given that the fDelegate has already given its "initial" answer,
211 * find the PREV actual (non-excepted) break.
212 * @param n initial position from delegate
213 * @return new break position or UBRK_DONE
214 */
215 int32_t internalPrev(int32_t n);
216 /**
217 * set up the UText with the value of the fDelegate.
218 * Call this before calling breakExceptionAt.
219 * May be able to avoid excess calls
220 */
221 void resetState(UErrorCode &status);
222 /**
223 * Is there a match (exception) at this spot?
224 */
225 enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
226 /**
227 * Determine if there is an exception at this spot
228 * @param n spot to check
229 * @return kNoExceptionHere or kExceptionHere
230 **/
231 enum EFBMatchResult breakExceptionAt(int32_t n);
57a6839d
A
232};
233
b331163b 234SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
2ca993e8 235 : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone())
57a6839d 236{
57a6839d
A
237}
238
239
b331163b 240SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
57a6839d 241 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
2ca993e8
A
242 fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
243 fDelegate(adopt)
57a6839d
A
244{
245 // all set..
246}
247
2ca993e8
A
248SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
249 fData = fData->decr();
b331163b
A
250}
251
2ca993e8
A
252void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
253 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
b331163b
A
254}
255
2ca993e8
A
256SimpleFilteredSentenceBreakIterator::EFBMatchResult
257SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
258 int64_t bestPosn = -1;
259 int32_t bestValue = -1;
57a6839d
A
260 // loops while 'n' points to an exception.
261 utext_setNativeIndex(fText.getAlias(), n); // from n..
2ca993e8 262 fData->fBackwardsTrie->reset();
57a6839d 263 UChar32 uch;
2ca993e8 264
57a6839d
A
265 //if(debug2) u_printf(" n@ %d\n", n);
266 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
267 if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here??
268 // TODO only do this the 1st time?
269 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
270 } else {
271 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
272 uch = utext_next32(fText.getAlias());
273 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
274 }
57a6839d 275
2ca993e8 276 UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
57a6839d
A
277
278 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and..
2ca993e8 279 USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
57a6839d
A
280 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
281 bestPosn = utext_getNativeIndex(fText.getAlias());
2ca993e8 282 bestValue = fData->fBackwardsTrie->getValue();
57a6839d
A
283 }
284 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
285 }
286
287 if(USTRINGTRIE_MATCHES(r)) { // exact match?
288 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
2ca993e8 289 bestValue = fData->fBackwardsTrie->getValue();
57a6839d
A
290 bestPosn = utext_getNativeIndex(fText.getAlias());
291 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
292 }
293
294 if(bestPosn>=0) {
295 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
296
297 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
298 //int32_t bestValue = fBackwardsTrie->getValue();
299 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
300
2ca993e8
A
301 if(bestPosn>0) {
302 UChar32 prevch = utext_char32At(fText.getAlias(), bestPosn-1); // char before the best match
303 if (prevch != U_SENTINEL && u_isUAlphabetic(prevch)) {
304 // The match is preceded by other alphabetic characters, => invalid
305 return kNoExceptionHere;
306 }
307 }
308
57a6839d
A
309 if(bestValue == kMATCH) { // exact match!
310 //if(debug2) u_printf(" exact backward match\n");
2ca993e8 311 return kExceptionHere; // See if the next is another exception.
57a6839d 312 } else if(bestValue == kPARTIAL
2ca993e8 313 && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
57a6839d
A
314 //if(debug2) u_printf(" partial backward match\n");
315 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
316 // to see if it matches something going forward.
2ca993e8 317 fData->fForwardsPartialTrie->reset();
57a6839d
A
318 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
319 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
320 //if(debug2) u_printf("Retrying at %d\n", bestPosn);
321 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
2ca993e8 322 USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
57a6839d
A
323 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
324 }
325 if(USTRINGTRIE_MATCHES(rfwd)) {
326 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
327 // only full matches here, nothing to check
328 // skip the next:
2ca993e8 329 return kExceptionHere;
57a6839d
A
330 } else {
331 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
332 // no match (no exception) -return the 'underlying' break
2ca993e8 333 return kNoExceptionHere;
57a6839d
A
334 }
335 } else {
2ca993e8 336 return kNoExceptionHere; // internal error and/or no forwards trie
57a6839d
A
337 }
338 } else {
339 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match
2ca993e8 340 return kNoExceptionHere; // No match - so exit. Not an exception.
57a6839d 341 }
57a6839d
A
342}
343
2ca993e8
A
344// the workhorse single next.
345int32_t
346SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
347 if(n == UBRK_DONE || // at end or
348 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
349 return n;
350 }
351 // OK, do we need to break here?
352 UErrorCode status = U_ZERO_ERROR;
353 // refresh text
354 resetState(status);
355 if(U_FAILURE(status)) return UBRK_DONE; // bail out
356 int64_t utextLen = utext_nativeLength(fText.getAlias());
357
358 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
359 while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
360 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
361
362 switch(m) {
363 case kExceptionHere:
364 n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
365 continue;
57a6839d 366
2ca993e8
A
367 default:
368 case kNoExceptionHere:
369 return n;
370 }
371 }
372 return n;
57a6839d 373}
57a6839d 374
2ca993e8
A
375int32_t
376SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
377 if(n == 0 || n == UBRK_DONE || // at end or
378 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
379 return n;
b331163b
A
380 }
381 // OK, do we need to break here?
382 UErrorCode status = U_ZERO_ERROR;
383 // refresh text
2ca993e8
A
384 resetState(status);
385 if(U_FAILURE(status)) return UBRK_DONE; // bail out
b331163b 386
2ca993e8
A
387 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
388 while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
389 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
b331163b 390
2ca993e8
A
391 switch(m) {
392 case kExceptionHere:
393 n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
394 continue;
b331163b 395
2ca993e8
A
396 default:
397 case kNoExceptionHere:
398 return n;
b331163b 399 }
2ca993e8
A
400 }
401 return n;
402}
b331163b 403
b331163b 404
2ca993e8
A
405int32_t
406SimpleFilteredSentenceBreakIterator::next() {
407 return internalNext(fDelegate->next());
408}
409
410int32_t
411SimpleFilteredSentenceBreakIterator::first(void) {
412 return internalNext(fDelegate->first());
b331163b 413}
57a6839d 414
2ca993e8
A
415int32_t
416SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
417 return internalPrev(fDelegate->preceding(offset));
418}
419
420int32_t
421SimpleFilteredSentenceBreakIterator::previous(void) {
422 return internalPrev(fDelegate->previous());
423}
424
425UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
426 if(!fDelegate->isBoundary(offset)) return false; // no break to suppress
427
428 UErrorCode status = U_ZERO_ERROR;
429 resetState(status);
430
431 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
432
433 switch(m) {
434 case kExceptionHere:
435 return false;
436 default:
437 case kNoExceptionHere:
438 return true;
439 }
440}
441
442int32_t
443SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
444 return internalNext(fDelegate->next(offset));
445}
446
447int32_t
448SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
449 return internalNext(fDelegate->following(offset));
450}
451
452int32_t
453SimpleFilteredSentenceBreakIterator::last(void) {
454 // Don't suppress a break opportunity at the end of text.
455 return fDelegate->last();
456}
457
458
b331163b
A
459/**
460 * Concrete implementation of builder class.
461 */
462class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
57a6839d
A
463public:
464 virtual ~SimpleFilteredBreakIteratorBuilder();
465 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
b331163b 466 SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
57a6839d
A
467 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
468 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
469 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
470private:
b331163b 471 UStringSet fSet;
57a6839d
A
472};
473
474SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
475{
476}
477
2ca993e8 478SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
b331163b
A
479 : fSet(status)
480{
481}
482
57a6839d 483SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
b331163b 484 : fSet(status)
57a6839d
A
485{
486 if(U_SUCCESS(status)) {
487 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status));
488 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &status));
489 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &status));
490 if(U_FAILURE(status)) return; // leaves the builder empty, if you try to use it.
491
492 LocalUResourceBundlePointer strs;
493 UErrorCode subStatus = status;
494 do {
495 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
496 if(strs.isValid() && U_SUCCESS(subStatus)) {
497 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
498 suppressBreakAfter(str, status); // load the string
499 }
500 } while (strs.isValid() && U_SUCCESS(subStatus));
501 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
502 status = subStatus;
503 }
504 }
505}
506
57a6839d
A
507UBool
508SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
509{
b331163b
A
510 UBool r = fSet.add(exception, status);
511 FB_TRACE("suppressBreakAfter",&exception,r,0);
512 return r;
57a6839d
A
513}
514
515UBool
516SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
517{
b331163b
A
518 UBool r = fSet.remove(exception, status);
519 FB_TRACE("unsuppressBreakAfter",&exception,r,0);
520 return r;
521}
522
523/**
524 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
525 * Work around this.
526 *
527 * Note: "new UnicodeString[subCount]" ends up calling global operator new
528 * on MSVC2012 for some reason.
529 */
530static inline UnicodeString* newUnicodeStringArray(size_t count) {
531 return new UnicodeString[count ? count : 1];
57a6839d
A
532}
533
534BreakIterator *
535SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
536 LocalPointer<BreakIterator> adopt(adoptBreakIterator);
537
b331163b
A
538 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
539 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
57a6839d
A
540 if(U_FAILURE(status)) {
541 return NULL;
542 }
543
57a6839d
A
544 int32_t revCount = 0;
545 int32_t fwdCount = 0;
546
547 int32_t subCount = fSet.size();
b331163b
A
548
549 UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
2ca993e8 550
b331163b
A
551 LocalArray<UnicodeString> ustrs(ustrs_ptr);
552
553 LocalMemory<int> partials;
554 partials.allocateInsteadAndReset(subCount);
57a6839d
A
555
556 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs.
557 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
558
559 int n=0;
b331163b
A
560 for ( int32_t i = 0;
561 i<fSet.size();
57a6839d 562 i++) {
b331163b
A
563 const UnicodeString *abbr = fSet.getStringAt(i);
564 if(abbr) {
565 FB_TRACE("build",abbr,TRUE,i);
566 ustrs[n] = *abbr; // copy by value
567 FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i);
568 } else {
569 FB_TRACE("build",abbr,FALSE,i);
570 status = U_MEMORY_ALLOCATION_ERROR;
571 return NULL;
572 }
57a6839d
A
573 partials[n] = 0; // default: not partial
574 n++;
575 }
576 // first pass - find partials.
577 for(int i=0;i<subCount;i++) {
578 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
579 if(nn>-1 && (nn+1)!=ustrs[i].length()) {
b331163b 580 FB_TRACE("partial",&ustrs[i],FALSE,i);
57a6839d
A
581 // is partial.
582 // is it unique?
583 int sameAs = -1;
584 for(int j=0;j<subCount;j++) {
585 if(j==i) continue;
586 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
b331163b 587 FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
57a6839d
A
588 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
589 if(partials[j]==0) { // hasn't been processed yet
2ca993e8 590 partials[j] = (ustrs[j].length() == nn+1)? (kSuppressInReverse | kAddToForward): kAddToForward;
b331163b 591 FB_TRACE("suppressing",&ustrs[j],FALSE,j);
57a6839d
A
592 } else if(partials[j] & kSuppressInReverse) {
593 sameAs = j; // the other entry is already in the reverse table.
594 }
595 }
596 }
b331163b
A
597 FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs);
598 FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]);
57a6839d
A
599 UnicodeString prefix(ustrs[i], 0, nn+1);
600 if(sameAs == -1 && partials[i] == 0) {
601 // first one - add the prefix to the reverse table.
602 prefix.reverse();
603 builder->add(prefix, kPARTIAL, status);
604 revCount++;
b331163b
A
605 FB_TRACE("Added partial",&prefix,FALSE, i);
606 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
2ca993e8 607 partials[i] = kAddToForward;
57a6839d 608 } else {
b331163b
A
609 FB_TRACE("NOT adding partial",&prefix,FALSE, i);
610 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
57a6839d
A
611 }
612 }
613 }
614 for(int i=0;i<subCount;i++) {
2ca993e8 615 if((partials[i] & kSuppressInReverse) == 0) {
57a6839d
A
616 ustrs[i].reverse();
617 builder->add(ustrs[i], kMATCH, status);
618 revCount++;
b331163b 619 FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
2ca993e8
A
620 }
621 if((partials[i] & kAddToForward) != 0) {
b331163b 622 FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
57a6839d
A
623
624 // an optimization would be to only add the portion after the '.'
625 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
626 // instead of "Ph.D." since we already know the "Ph." part is a match.
627 // would need the trie to be able to hold 0-length strings, though.
628 builder2->add(ustrs[i], kMATCH, status); // forward
629 fwdCount++;
630 //ustrs[i].reverse();
631 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
632 }
633 }
b331163b 634 FB_TRACE("AbbrCount",NULL,FALSE, subCount);
57a6839d
A
635
636 if(revCount>0) {
637 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
638 if(U_FAILURE(status)) {
b331163b 639 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
57a6839d
A
640 return NULL;
641 }
642 }
643
644 if(fwdCount>0) {
645 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
646 if(U_FAILURE(status)) {
b331163b 647 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
57a6839d
A
648 return NULL;
649 }
650 }
651
b331163b 652 return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
57a6839d
A
653}
654
655
b331163b 656// ----------- Base class implementation
57a6839d
A
657
658FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
659}
660
661FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
662}
663
664FilteredBreakIteratorBuilder *
665FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
666 if(U_FAILURE(status)) return NULL;
b331163b
A
667 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
668 return (U_SUCCESS(status))? ret.orphan(): NULL;
57a6839d
A
669}
670
671FilteredBreakIteratorBuilder *
672FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
673 if(U_FAILURE(status)) return NULL;
b331163b
A
674 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
675 return (U_SUCCESS(status))? ret.orphan(): NULL;
57a6839d
A
676}
677
678U_NAMESPACE_END
679
680#endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION