]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/filteredbrk.cpp
ICU-64232.0.1.tar.gz
[apple/icu.git] / icuSources / common / filteredbrk.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2014-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
12
13 #include "cmemory.h"
14
15 #include "unicode/filteredbrk.h"
16 #include "unicode/ucharstriebuilder.h"
17 #include "unicode/ures.h"
18
19 #include "uresimp.h" // ures_getByKeyWithFallback
20 #include "ubrkimpl.h" // U_ICUDATA_BRKITR
21 #include "uvector.h"
22 #include "cmemory.h"
23
24 U_NAMESPACE_BEGIN
25
#ifndef FB_DEBUG
#define FB_DEBUG 0
#endif

#if FB_DEBUG
#include <stdio.h>
/**
 * Debug-only trace helper; compiled only when FB_DEBUG is nonzero.
 * Prints one line to stderr containing the call site, a message, the string
 * (or "NULL"), the string's address, a boolean flag and an integer.
 * @param m message text
 * @param s string to show, may be NULL
 * @param b flag, printed as 'T' or 'F'
 * @param d integer detail value
 * @param f source file name of the call site
 * @param l source line number of the call site
 */
static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
  char buf[2048];
  const char *shown = "NULL";
  if(s) {
    // Hand extract() one byte less than the buffer and terminate explicitly,
    // so that buf is a valid C string even when the converted text fills it.
    s->extract(0, s->length(), buf, (uint32_t)(sizeof(buf)-1));
    buf[sizeof(buf)-1] = 0;
    shown = buf;
  }
  fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
          f, l, m, shown, (const void*)s, b?'T':'F',(int)d);
}

#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
#define FB_TRACE(m,s,b,d)
#endif
47
48 /**
49 * Used with sortedInsert()
50 */
51 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
52 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
53 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
54 return a.compare(b);
55 }
56
57 /**
58 * A UVector which implements a set of strings.
59 */
60 class U_COMMON_API UStringSet : public UVector {
61 public:
62 UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
63 uhash_compareUnicodeString,
64 1,
65 status) {}
66 virtual ~UStringSet();
67 /**
68 * Is this UnicodeSet contained?
69 */
70 inline UBool contains(const UnicodeString& s) {
71 return contains((void*) &s);
72 }
73 using UVector::contains;
74 /**
75 * Return the ith UnicodeString alias
76 */
77 inline const UnicodeString* getStringAt(int32_t i) const {
78 return (const UnicodeString*)elementAt(i);
79 }
80 /**
81 * Adopt the UnicodeString if not already contained.
82 * Caller no longer owns the pointer in any case.
83 * @return true if adopted successfully, false otherwise (error, or else duplicate)
84 */
85 inline UBool adopt(UnicodeString *str, UErrorCode &status) {
86 if(U_FAILURE(status) || contains(*str)) {
87 delete str;
88 return false;
89 } else {
90 sortedInsert(str, compareUnicodeString, status);
91 if(U_FAILURE(status)) {
92 delete str;
93 return false;
94 }
95 return true;
96 }
97 }
98 /**
99 * Add by value.
100 * @return true if successfully adopted.
101 */
102 inline UBool add(const UnicodeString& str, UErrorCode &status) {
103 if(U_FAILURE(status)) return false;
104 UnicodeString *t = new UnicodeString(str);
105 if(t==NULL) {
106 status = U_MEMORY_ALLOCATION_ERROR; return false;
107 }
108 return adopt(t, status);
109 }
110 /**
111 * Remove this string.
112 * @return true if successfully removed, false otherwise (error, or else it wasn't there)
113 */
114 inline UBool remove(const UnicodeString &s, UErrorCode &status) {
115 if(U_FAILURE(status)) return false;
116 return removeElement((void*) &s);
117 }
118 };
119
120 /**
121 * Virtual, won't be inlined
122 */
123 UStringSet::~UStringSet() {}
124
125 /* ----------------------------------------------------------- */
126
127
128 /* Filtered Break constants */
129 static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
130 static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
131 static const int32_t kSuppressInReverse = (1<<0);
132 static const int32_t kAddToForward = (1<<1);
133 static const UChar kFULLSTOP = 0x002E; // '.'
134
135 /**
136 * Shared data for SimpleFilteredSentenceBreakIterator
137 */
138 class SimpleFilteredSentenceBreakData : public UMemory {
139 public:
140 SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
141 : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
142 SimpleFilteredSentenceBreakData *incr() { refcount++; return this; }
143 SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
144 virtual ~SimpleFilteredSentenceBreakData();
145
146 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
147 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
148 int32_t refcount;
149 };
150
151 SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
152
153 /**
154 * Concrete implementation
155 */
156 class SimpleFilteredSentenceBreakIterator : public BreakIterator {
157 public:
158 SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
159 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
160 virtual ~SimpleFilteredSentenceBreakIterator();
161 private:
162 SimpleFilteredSentenceBreakData *fData;
163 LocalPointer<BreakIterator> fDelegate;
164 LocalUTextPointer fText;
165
166 /* -- subclass interface -- */
167 public:
168 /* -- cloning and other subclass stuff -- */
169 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
170 int32_t &/*BufferSize*/,
171 UErrorCode &status) {
172 // for now - always deep clone
173 status = U_SAFECLONE_ALLOCATED_WARNING;
174 return clone();
175 }
176 virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); }
177 virtual UClassID getDynamicClassID(void) const { return NULL; }
178 virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; }
179
180 /* -- text modifying -- */
181 virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
182 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
183 virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
184 virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
185
186 /* -- other functions that are just delegated -- */
187 virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
188 virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
189
190 /* -- ITERATION -- */
191 virtual int32_t first(void);
192 virtual int32_t preceding(int32_t offset);
193 virtual int32_t previous(void);
194 virtual UBool isBoundary(int32_t offset);
195 virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
196
197 virtual int32_t next(void);
198
199 virtual int32_t next(int32_t n);
200 virtual int32_t following(int32_t offset);
201 virtual int32_t last(void);
202
203 private:
204 /**
205 * Given that the fDelegate has already given its "initial" answer,
206 * find the NEXT actual (non-excepted) break.
207 * @param n initial position from delegate
208 * @return new break position or UBRK_DONE
209 */
210 int32_t internalNext(int32_t n);
211 /**
212 * Given that the fDelegate has already given its "initial" answer,
213 * find the PREV actual (non-excepted) break.
214 * @param n initial position from delegate
215 * @return new break position or UBRK_DONE
216 */
217 int32_t internalPrev(int32_t n);
218 /**
219 * set up the UText with the value of the fDelegate.
220 * Call this before calling breakExceptionAt.
221 * May be able to avoid excess calls
222 */
223 void resetState(UErrorCode &status);
224 /**
225 * Is there a match (exception) at this spot?
226 */
227 enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
228 /**
229 * Determine if there is an exception at this spot
230 * @param n spot to check
231 * @return kNoExceptionHere or kExceptionHere
232 **/
233 enum EFBMatchResult breakExceptionAt(int32_t n);
234 };
235
236 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
237 : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone())
238 {
239 }
240
241
242 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
243 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
244 fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
245 fDelegate(adopt)
246 {
247 // all set..
248 }
249
250 SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
251 fData = fData->decr();
252 }
253
254 void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
255 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
256 }
257
258 SimpleFilteredSentenceBreakIterator::EFBMatchResult
259 SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
260 int64_t bestPosn = -1;
261 int32_t bestValue = -1;
262 // loops while 'n' points to an exception.
263 utext_setNativeIndex(fText.getAlias(), n); // from n..
264 fData->fBackwardsTrie->reset();
265 UChar32 uch;
266
267 //if(debug2) u_printf(" n@ %d\n", n);
268 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
269 if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here??
270 // TODO only do this the 1st time?
271 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
272 } else {
273 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
274 uch = utext_next32(fText.getAlias());
275 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
276 }
277
278 UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
279
280 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and..
281 USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
282 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
283 bestPosn = utext_getNativeIndex(fText.getAlias());
284 bestValue = fData->fBackwardsTrie->getValue();
285 }
286 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
287 }
288
289 if(USTRINGTRIE_MATCHES(r)) { // exact match?
290 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
291 bestValue = fData->fBackwardsTrie->getValue();
292 bestPosn = utext_getNativeIndex(fText.getAlias());
293 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
294 }
295
296 if(bestPosn>=0) {
297 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
298
299 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
300 //int32_t bestValue = fBackwardsTrie->getValue();
301 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
302
303 if(bestPosn>0) {
304 UChar32 prevch = utext_char32At(fText.getAlias(), bestPosn-1); // char before the best match
305 if (prevch != U_SENTINEL && u_isUAlphabetic(prevch)) {
306 // The match is preceded by other alphabetic characters, => invalid
307 return kNoExceptionHere;
308 }
309 }
310
311 if(bestValue == kMATCH) { // exact match!
312 //if(debug2) u_printf(" exact backward match\n");
313 return kExceptionHere; // See if the next is another exception.
314 } else if(bestValue == kPARTIAL
315 && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
316 //if(debug2) u_printf(" partial backward match\n");
317 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
318 // to see if it matches something going forward.
319 fData->fForwardsPartialTrie->reset();
320 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
321 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
322 //if(debug2) u_printf("Retrying at %d\n", bestPosn);
323 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
324 USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
325 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
326 }
327 if(USTRINGTRIE_MATCHES(rfwd)) {
328 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
329 // only full matches here, nothing to check
330 // skip the next:
331 return kExceptionHere;
332 } else {
333 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
334 // no match (no exception) -return the 'underlying' break
335 return kNoExceptionHere;
336 }
337 } else {
338 return kNoExceptionHere; // internal error and/or no forwards trie
339 }
340 } else {
341 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match
342 return kNoExceptionHere; // No match - so exit. Not an exception.
343 }
344 }
345
346 // the workhorse single next.
347 int32_t
348 SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
349 if(n == UBRK_DONE || // at end or
350 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
351 return n;
352 }
353 // OK, do we need to break here?
354 UErrorCode status = U_ZERO_ERROR;
355 // refresh text
356 resetState(status);
357 if(U_FAILURE(status)) return UBRK_DONE; // bail out
358 int64_t utextLen = utext_nativeLength(fText.getAlias());
359
360 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
361 while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
362 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
363
364 switch(m) {
365 case kExceptionHere:
366 n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
367 continue;
368
369 default:
370 case kNoExceptionHere:
371 return n;
372 }
373 }
374 return n;
375 }
376
377 int32_t
378 SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
379 if(n == 0 || n == UBRK_DONE || // at end or
380 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
381 return n;
382 }
383 // OK, do we need to break here?
384 UErrorCode status = U_ZERO_ERROR;
385 // refresh text
386 resetState(status);
387 if(U_FAILURE(status)) return UBRK_DONE; // bail out
388
389 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
390 while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
391 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
392
393 switch(m) {
394 case kExceptionHere:
395 n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
396 continue;
397
398 default:
399 case kNoExceptionHere:
400 return n;
401 }
402 }
403 return n;
404 }
405
406
407 int32_t
408 SimpleFilteredSentenceBreakIterator::next() {
409 return internalNext(fDelegate->next());
410 }
411
412 int32_t
413 SimpleFilteredSentenceBreakIterator::first(void) {
414 // Don't suppress a break opportunity at the beginning of text.
415 return fDelegate->first();
416 }
417
418 int32_t
419 SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
420 return internalPrev(fDelegate->preceding(offset));
421 }
422
423 int32_t
424 SimpleFilteredSentenceBreakIterator::previous(void) {
425 return internalPrev(fDelegate->previous());
426 }
427
428 UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
429 if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
430
431 if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions
432
433 UErrorCode status = U_ZERO_ERROR;
434 resetState(status);
435
436 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
437
438 switch(m) {
439 case kExceptionHere:
440 return false;
441 default:
442 case kNoExceptionHere:
443 return true;
444 }
445 }
446
447 int32_t
448 SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
449 return internalNext(fDelegate->next(offset));
450 }
451
452 int32_t
453 SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
454 return internalNext(fDelegate->following(offset));
455 }
456
457 int32_t
458 SimpleFilteredSentenceBreakIterator::last(void) {
459 // Don't suppress a break opportunity at the end of text.
460 return fDelegate->last();
461 }
462
463
464 /**
465 * Concrete implementation of builder class.
466 */
467 class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
468 public:
469 virtual ~SimpleFilteredBreakIteratorBuilder();
470 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
471 SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
472 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
473 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
474 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
475 private:
476 UStringSet fSet;
477 };
478
479 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
480 {
481 }
482
483 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
484 : fSet(status)
485 {
486 }
487
488 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
489 : fSet(status)
490 {
491 if(U_SUCCESS(status)) {
492 UErrorCode subStatus = U_ZERO_ERROR;
493 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
494 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
495 status = subStatus; // copy the failing status
496 #if FB_DEBUG
497 fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
498 #endif
499 return; // leaves the builder empty, if you try to use it.
500 }
501 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &subStatus));
502 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
503 status = subStatus; // copy the failing status
504 #if FB_DEBUG
505 fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
506 #endif
507 return; // leaves the builder empty, if you try to use it.
508 }
509 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &subStatus));
510
511 #if FB_DEBUG
512 {
513 UErrorCode subsub = subStatus;
514 fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
515 }
516 #endif
517
518 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
519 status = subStatus; // copy the failing status
520 #if FB_DEBUG
521 fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
522 #endif
523 return; // leaves the builder empty, if you try to use it.
524 }
525
526 LocalUResourceBundlePointer strs;
527 subStatus = status; // Pick up inherited warning status now
528 do {
529 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
530 if(strs.isValid() && U_SUCCESS(subStatus)) {
531 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
532 suppressBreakAfter(str, status); // load the string
533 }
534 } while (strs.isValid() && U_SUCCESS(subStatus));
535 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
536 status = subStatus;
537 }
538 }
539 }
540
541 UBool
542 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
543 {
544 UBool r = fSet.add(exception, status);
545 FB_TRACE("suppressBreakAfter",&exception,r,0);
546 return r;
547 }
548
549 UBool
550 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
551 {
552 UBool r = fSet.remove(exception, status);
553 FB_TRACE("unsuppressBreakAfter",&exception,r,0);
554 return r;
555 }
556
557 /**
558 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
559 * Work around this.
560 *
561 * Note: "new UnicodeString[subCount]" ends up calling global operator new
562 * on MSVC2012 for some reason.
563 */
564 static inline UnicodeString* newUnicodeStringArray(size_t count) {
565 return new UnicodeString[count ? count : 1];
566 }
567
568 BreakIterator *
569 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
570 LocalPointer<BreakIterator> adopt(adoptBreakIterator);
571
572 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
573 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
574 if(U_FAILURE(status)) {
575 return NULL;
576 }
577
578 int32_t revCount = 0;
579 int32_t fwdCount = 0;
580
581 int32_t subCount = fSet.size();
582
583 UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
584
585 LocalArray<UnicodeString> ustrs(ustrs_ptr);
586
587 LocalMemory<int> partials;
588 partials.allocateInsteadAndReset(subCount);
589
590 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs.
591 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
592
593 int n=0;
594 for ( int32_t i = 0;
595 i<fSet.size();
596 i++) {
597 const UnicodeString *abbr = fSet.getStringAt(i);
598 if(abbr) {
599 FB_TRACE("build",abbr,TRUE,i);
600 ustrs[n] = *abbr; // copy by value
601 FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i);
602 } else {
603 FB_TRACE("build",abbr,FALSE,i);
604 status = U_MEMORY_ALLOCATION_ERROR;
605 return NULL;
606 }
607 partials[n] = 0; // default: not partial
608 n++;
609 }
610 // first pass - find partials.
611 for(int i=0;i<subCount;i++) {
612 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
613 if(nn>-1 && (nn+1)!=ustrs[i].length()) {
614 FB_TRACE("partial",&ustrs[i],FALSE,i);
615 // is partial.
616 // is it unique?
617 int sameAs = -1;
618 for(int j=0;j<subCount;j++) {
619 if(j==i) continue;
620 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
621 FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
622 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
623 if(partials[j]==0) { // hasn't been processed yet
624 partials[j] = (ustrs[j].length() == nn+1)? (kSuppressInReverse | kAddToForward): kAddToForward;
625 FB_TRACE("suppressing",&ustrs[j],FALSE,j);
626 } else if(partials[j] & kSuppressInReverse) {
627 sameAs = j; // the other entry is already in the reverse table.
628 }
629 }
630 }
631 FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs);
632 FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]);
633 UnicodeString prefix(ustrs[i], 0, nn+1);
634 if(sameAs == -1 && partials[i] == 0) {
635 // first one - add the prefix to the reverse table.
636 prefix.reverse();
637 builder->add(prefix, kPARTIAL, status);
638 revCount++;
639 FB_TRACE("Added partial",&prefix,FALSE, i);
640 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
641 partials[i] = kAddToForward;
642 } else {
643 FB_TRACE("NOT adding partial",&prefix,FALSE, i);
644 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
645 }
646 }
647 }
648 for(int i=0;i<subCount;i++) {
649 if((partials[i] & kSuppressInReverse) == 0) {
650 ustrs[i].reverse();
651 builder->add(ustrs[i], kMATCH, status);
652 revCount++;
653 FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
654 }
655 if((partials[i] & kAddToForward) != 0) {
656 FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
657
658 // an optimization would be to only add the portion after the '.'
659 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
660 // instead of "Ph.D." since we already know the "Ph." part is a match.
661 // would need the trie to be able to hold 0-length strings, though.
662 builder2->add(ustrs[i], kMATCH, status); // forward
663 fwdCount++;
664 //ustrs[i].reverse();
665 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
666 }
667 }
668 FB_TRACE("AbbrCount",NULL,FALSE, subCount);
669
670 if(revCount>0) {
671 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
672 if(U_FAILURE(status)) {
673 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
674 return NULL;
675 }
676 }
677
678 if(fwdCount>0) {
679 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
680 if(U_FAILURE(status)) {
681 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
682 return NULL;
683 }
684 }
685
686 return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
687 }
688
689
690 // ----------- Base class implementation
691
692 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
693 }
694
695 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
696 }
697
698 FilteredBreakIteratorBuilder *
699 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
700 if(U_FAILURE(status)) return NULL;
701 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
702 return (U_SUCCESS(status))? ret.orphan(): NULL;
703 }
704
705 FilteredBreakIteratorBuilder *
706 FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) {
707 return createEmptyInstance(status);
708 }
709
710 FilteredBreakIteratorBuilder *
711 FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
712 if(U_FAILURE(status)) return NULL;
713 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
714 return (U_SUCCESS(status))? ret.orphan(): NULL;
715 }
716
717 U_NAMESPACE_END
718
719 #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION