]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
2ca993e8 | 3 | * Copyright (C) 2014-2016, International Business Machines Corporation and |
57a6839d A |
4 | * others. All Rights Reserved. |
5 | ******************************************************************************* | |
6 | */ | |
7 | ||
b331163b A |
8 | #include "unicode/utypes.h" |
9 | #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION | |
57a6839d | 10 | |
b331163b | 11 | #include "cmemory.h" |
57a6839d | 12 | |
b331163b A |
13 | #include "unicode/filteredbrk.h" |
14 | #include "unicode/ucharstriebuilder.h" | |
15 | #include "unicode/ures.h" | |
57a6839d | 16 | |
b331163b A |
17 | #include "uresimp.h" // ures_getByKeyWithFallback |
18 | #include "ubrkimpl.h" // U_ICUDATA_BRKITR | |
19 | #include "uvector.h" | |
20 | #include "cmemory.h" | |
57a6839d A |
21 | |
22 | U_NAMESPACE_BEGIN | |
23 | ||
b331163b A |
24 | #ifndef FB_DEBUG |
25 | #define FB_DEBUG 0 | |
26 | #endif | |
27 | ||
28 | #if FB_DEBUG | |
29 | #include <stdio.h> | |
30 | static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) { | |
31 | char buf[2048]; | |
32 | if(s) { | |
33 | s->extract(0,s->length(),buf,2048); | |
34 | } else { | |
35 | strcpy(buf,"NULL"); | |
36 | } | |
37 | fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n", | |
38 | f, l, m, buf, (const void*)s, b?'T':'F',(int)d); | |
39 | } | |
40 | ||
41 | #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__) | |
42 | #else | |
43 | #define FB_TRACE(m,s,b,d) | |
44 | #endif | |
45 | ||
2ca993e8 A |
46 | /** |
47 | * Used with sortedInsert() | |
48 | */ | |
b331163b A |
49 | static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { |
50 | const UnicodeString &a = *(const UnicodeString*)t1.pointer; | |
51 | const UnicodeString &b = *(const UnicodeString*)t2.pointer; | |
52 | return a.compare(b); | |
53 | } | |
54 | ||
55 | /** | |
56 | * A UVector which implements a set of strings. | |
57 | */ | |
58 | class U_COMMON_API UStringSet : public UVector { | |
59 | public: | |
60 | UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject, | |
61 | uhash_compareUnicodeString, | |
62 | 1, | |
63 | status) {} | |
64 | virtual ~UStringSet(); | |
65 | /** | |
66 | * Is this UnicodeSet contained? | |
67 | */ | |
68 | inline UBool contains(const UnicodeString& s) { | |
69 | return contains((void*) &s); | |
70 | } | |
71 | using UVector::contains; | |
72 | /** | |
73 | * Return the ith UnicodeString alias | |
74 | */ | |
75 | inline const UnicodeString* getStringAt(int32_t i) const { | |
76 | return (const UnicodeString*)elementAt(i); | |
77 | } | |
78 | /** | |
79 | * Adopt the UnicodeString if not already contained. | |
80 | * Caller no longer owns the pointer in any case. | |
81 | * @return true if adopted successfully, false otherwise (error, or else duplicate) | |
82 | */ | |
83 | inline UBool adopt(UnicodeString *str, UErrorCode &status) { | |
84 | if(U_FAILURE(status) || contains(*str)) { | |
85 | delete str; | |
86 | return false; | |
87 | } else { | |
88 | sortedInsert(str, compareUnicodeString, status); | |
89 | if(U_FAILURE(status)) { | |
90 | delete str; | |
91 | return false; | |
92 | } | |
93 | return true; | |
94 | } | |
95 | } | |
96 | /** | |
97 | * Add by value. | |
98 | * @return true if successfully adopted. | |
99 | */ | |
100 | inline UBool add(const UnicodeString& str, UErrorCode &status) { | |
101 | if(U_FAILURE(status)) return false; | |
102 | UnicodeString *t = new UnicodeString(str); | |
103 | if(t==NULL) { | |
104 | status = U_MEMORY_ALLOCATION_ERROR; return false; | |
105 | } | |
106 | return adopt(t, status); | |
107 | } | |
108 | /** | |
109 | * Remove this string. | |
110 | * @return true if successfully removed, false otherwise (error, or else it wasn't there) | |
111 | */ | |
112 | inline UBool remove(const UnicodeString &s, UErrorCode &status) { | |
113 | if(U_FAILURE(status)) return false; | |
114 | return removeElement((void*) &s); | |
115 | } | |
116 | }; | |
117 | ||
118 | /** | |
119 | * Virtual, won't be inlined | |
120 | */ | |
121 | UStringSet::~UStringSet() {} | |
122 | ||
2ca993e8 | 123 | /* ----------------------------------------------------------- */ |
57a6839d | 124 | |
2ca993e8 A |
125 | |
126 | /* Filtered Break constants */ | |
57a6839d A |
127 | static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie |
128 | static const int32_t kMATCH = (1<<1); //< exact match - skip this one. | |
129 | static const int32_t kSuppressInReverse = (1<<0); | |
130 | static const int32_t kAddToForward = (1<<1); | |
2ca993e8 A |
131 | static const UChar kFULLSTOP = 0x002E; // '.' |
132 | ||
133 | /** | |
134 | * Shared data for SimpleFilteredSentenceBreakIterator | |
135 | */ | |
136 | class SimpleFilteredSentenceBreakData : public UMemory { | |
137 | public: | |
138 | SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) | |
139 | : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } | |
140 | SimpleFilteredSentenceBreakData *incr() { refcount++; return this; } | |
141 | SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; } | |
142 | virtual ~SimpleFilteredSentenceBreakData(); | |
57a6839d | 143 | |
2ca993e8 A |
144 | LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." |
145 | LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. | |
146 | int32_t refcount; | |
147 | }; | |
148 | ||
149 | SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} | |
150 | ||
151 | /** | |
152 | * Concrete implementation | |
153 | */ | |
b331163b | 154 | class SimpleFilteredSentenceBreakIterator : public BreakIterator { |
57a6839d | 155 | public: |
b331163b A |
156 | SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); |
157 | SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); | |
158 | virtual ~SimpleFilteredSentenceBreakIterator(); | |
57a6839d | 159 | private: |
2ca993e8 | 160 | SimpleFilteredSentenceBreakData *fData; |
57a6839d A |
161 | LocalPointer<BreakIterator> fDelegate; |
162 | LocalUTextPointer fText; | |
57a6839d A |
163 | |
164 | /* -- subclass interface -- */ | |
165 | public: | |
166 | /* -- cloning and other subclass stuff -- */ | |
167 | virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, | |
168 | int32_t &/*BufferSize*/, | |
169 | UErrorCode &status) { | |
170 | // for now - always deep clone | |
171 | status = U_SAFECLONE_ALLOCATED_WARNING; | |
172 | return clone(); | |
173 | } | |
b331163b | 174 | virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); } |
57a6839d | 175 | virtual UClassID getDynamicClassID(void) const { return NULL; } |
b331163b | 176 | virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; } |
57a6839d A |
177 | |
178 | /* -- text modifying -- */ | |
179 | virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); } | |
180 | virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; } | |
181 | virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); } | |
182 | virtual void setText(const UnicodeString &text) { fDelegate->setText(text); } | |
183 | ||
184 | /* -- other functions that are just delegated -- */ | |
185 | virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); } | |
186 | virtual CharacterIterator& getText(void) const { return fDelegate->getText(); } | |
187 | ||
188 | /* -- ITERATION -- */ | |
2ca993e8 A |
189 | virtual int32_t first(void); |
190 | virtual int32_t preceding(int32_t offset); | |
191 | virtual int32_t previous(void); | |
192 | virtual UBool isBoundary(int32_t offset); | |
193 | virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct. | |
57a6839d A |
194 | |
195 | virtual int32_t next(void); | |
2ca993e8 A |
196 | |
197 | virtual int32_t next(int32_t n); | |
b331163b | 198 | virtual int32_t following(int32_t offset); |
2ca993e8 | 199 | virtual int32_t last(void); |
57a6839d | 200 | |
b331163b | 201 | private: |
2ca993e8 A |
202 | /** |
203 | * Given that the fDelegate has already given its "initial" answer, | |
204 | * find the NEXT actual (non-excepted) break. | |
205 | * @param n initial position from delegate | |
206 | * @return new break position or UBRK_DONE | |
207 | */ | |
208 | int32_t internalNext(int32_t n); | |
209 | /** | |
210 | * Given that the fDelegate has already given its "initial" answer, | |
211 | * find the PREV actual (non-excepted) break. | |
212 | * @param n initial position from delegate | |
213 | * @return new break position or UBRK_DONE | |
214 | */ | |
215 | int32_t internalPrev(int32_t n); | |
216 | /** | |
217 | * set up the UText with the value of the fDelegate. | |
218 | * Call this before calling breakExceptionAt. | |
219 | * May be able to avoid excess calls | |
220 | */ | |
221 | void resetState(UErrorCode &status); | |
222 | /** | |
223 | * Is there a match (exception) at this spot? | |
224 | */ | |
225 | enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; | |
226 | /** | |
227 | * Determine if there is an exception at this spot | |
228 | * @param n spot to check | |
229 | * @return kNoExceptionHere or kExceptionHere | |
230 | **/ | |
231 | enum EFBMatchResult breakExceptionAt(int32_t n); | |
57a6839d A |
232 | }; |
233 | ||
b331163b | 234 | SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) |
2ca993e8 | 235 | : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) |
57a6839d | 236 | { |
57a6839d A |
237 | } |
238 | ||
239 | ||
b331163b | 240 | SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : |
57a6839d | 241 | BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), |
2ca993e8 A |
242 | fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), |
243 | fDelegate(adopt) | |
57a6839d A |
244 | { |
245 | // all set.. | |
246 | } | |
247 | ||
2ca993e8 A |
248 | SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { |
249 | fData = fData->decr(); | |
b331163b A |
250 | } |
251 | ||
2ca993e8 A |
252 | void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { |
253 | fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); | |
b331163b A |
254 | } |
255 | ||
2ca993e8 A |
256 | SimpleFilteredSentenceBreakIterator::EFBMatchResult |
257 | SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { | |
258 | int64_t bestPosn = -1; | |
259 | int32_t bestValue = -1; | |
57a6839d A |
260 | // loops while 'n' points to an exception. |
261 | utext_setNativeIndex(fText.getAlias(), n); // from n.. | |
2ca993e8 | 262 | fData->fBackwardsTrie->reset(); |
57a6839d | 263 | UChar32 uch; |
2ca993e8 | 264 | |
57a6839d A |
265 | //if(debug2) u_printf(" n@ %d\n", n); |
266 | // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") | |
267 | if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? | |
268 | // TODO only do this the 1st time? | |
269 | //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch); | |
270 | } else { | |
271 | //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch); | |
272 | uch = utext_next32(fText.getAlias()); | |
273 | //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); | |
274 | } | |
57a6839d | 275 | |
2ca993e8 | 276 | UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; |
57a6839d A |
277 | |
278 | while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. | |
2ca993e8 | 279 | USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie |
57a6839d A |
280 | if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far |
281 | bestPosn = utext_getNativeIndex(fText.getAlias()); | |
2ca993e8 | 282 | bestValue = fData->fBackwardsTrie->getValue(); |
57a6839d A |
283 | } |
284 | //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); | |
285 | } | |
286 | ||
287 | if(USTRINGTRIE_MATCHES(r)) { // exact match? | |
288 | //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); | |
2ca993e8 | 289 | bestValue = fData->fBackwardsTrie->getValue(); |
57a6839d A |
290 | bestPosn = utext_getNativeIndex(fText.getAlias()); |
291 | //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); | |
292 | } | |
293 | ||
294 | if(bestPosn>=0) { | |
295 | //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); | |
296 | ||
297 | //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? | |
298 | //int32_t bestValue = fBackwardsTrie->getValue(); | |
299 | ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); | |
300 | ||
2ca993e8 A |
301 | if(bestPosn>0) { |
302 | UChar32 prevch = utext_char32At(fText.getAlias(), bestPosn-1); // char before the best match | |
303 | if (prevch != U_SENTINEL && u_isUAlphabetic(prevch)) { | |
304 | // The match is preceded by other alphabetic characters, => invalid | |
305 | return kNoExceptionHere; | |
306 | } | |
307 | } | |
308 | ||
57a6839d A |
309 | if(bestValue == kMATCH) { // exact match! |
310 | //if(debug2) u_printf(" exact backward match\n"); | |
2ca993e8 | 311 | return kExceptionHere; // See if the next is another exception. |
57a6839d | 312 | } else if(bestValue == kPARTIAL |
2ca993e8 | 313 | && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie |
57a6839d A |
314 | //if(debug2) u_printf(" partial backward match\n"); |
315 | // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie | |
316 | // to see if it matches something going forward. | |
2ca993e8 | 317 | fData->fForwardsPartialTrie->reset(); |
57a6839d A |
318 | UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; |
319 | utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. | |
320 | //if(debug2) u_printf("Retrying at %d\n", bestPosn); | |
321 | while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && | |
2ca993e8 | 322 | USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) { |
57a6839d A |
323 | //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); |
324 | } | |
325 | if(USTRINGTRIE_MATCHES(rfwd)) { | |
326 | //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch); | |
327 | // only full matches here, nothing to check | |
328 | // skip the next: | |
2ca993e8 | 329 | return kExceptionHere; |
57a6839d A |
330 | } else { |
331 | //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch); | |
332 | // no match (no exception) -return the 'underlying' break | |
2ca993e8 | 333 | return kNoExceptionHere; |
57a6839d A |
334 | } |
335 | } else { | |
2ca993e8 | 336 | return kNoExceptionHere; // internal error and/or no forwards trie |
57a6839d A |
337 | } |
338 | } else { | |
339 | //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match | |
2ca993e8 | 340 | return kNoExceptionHere; // No match - so exit. Not an exception. |
57a6839d | 341 | } |
57a6839d A |
342 | } |
343 | ||
2ca993e8 A |
344 | // the workhorse single next. |
345 | int32_t | |
346 | SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { | |
347 | if(n == UBRK_DONE || // at end or | |
348 | fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions | |
349 | return n; | |
350 | } | |
351 | // OK, do we need to break here? | |
352 | UErrorCode status = U_ZERO_ERROR; | |
353 | // refresh text | |
354 | resetState(status); | |
355 | if(U_FAILURE(status)) return UBRK_DONE; // bail out | |
356 | int64_t utextLen = utext_nativeLength(fText.getAlias()); | |
357 | ||
358 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); | |
359 | while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate). | |
360 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); | |
361 | ||
362 | switch(m) { | |
363 | case kExceptionHere: | |
364 | n = fDelegate->next(); // skip this one. Find the next lowerlevel break. | |
365 | continue; | |
57a6839d | 366 | |
2ca993e8 A |
367 | default: |
368 | case kNoExceptionHere: | |
369 | return n; | |
370 | } | |
371 | } | |
372 | return n; | |
57a6839d | 373 | } |
57a6839d | 374 | |
2ca993e8 A |
375 | int32_t |
376 | SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { | |
377 | if(n == 0 || n == UBRK_DONE || // at end or | |
378 | fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions | |
379 | return n; | |
b331163b A |
380 | } |
381 | // OK, do we need to break here? | |
382 | UErrorCode status = U_ZERO_ERROR; | |
383 | // refresh text | |
2ca993e8 A |
384 | resetState(status); |
385 | if(U_FAILURE(status)) return UBRK_DONE; // bail out | |
b331163b | 386 | |
2ca993e8 A |
387 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
388 | while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). | |
389 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); | |
b331163b | 390 | |
2ca993e8 A |
391 | switch(m) { |
392 | case kExceptionHere: | |
393 | n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. | |
394 | continue; | |
b331163b | 395 | |
2ca993e8 A |
396 | default: |
397 | case kNoExceptionHere: | |
398 | return n; | |
b331163b | 399 | } |
2ca993e8 A |
400 | } |
401 | return n; | |
402 | } | |
b331163b | 403 | |
b331163b | 404 | |
2ca993e8 A |
405 | int32_t |
406 | SimpleFilteredSentenceBreakIterator::next() { | |
407 | return internalNext(fDelegate->next()); | |
408 | } | |
409 | ||
410 | int32_t | |
411 | SimpleFilteredSentenceBreakIterator::first(void) { | |
412 | return internalNext(fDelegate->first()); | |
b331163b | 413 | } |
57a6839d | 414 | |
2ca993e8 A |
415 | int32_t |
416 | SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { | |
417 | return internalPrev(fDelegate->preceding(offset)); | |
418 | } | |
419 | ||
420 | int32_t | |
421 | SimpleFilteredSentenceBreakIterator::previous(void) { | |
422 | return internalPrev(fDelegate->previous()); | |
423 | } | |
424 | ||
425 | UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { | |
426 | if(!fDelegate->isBoundary(offset)) return false; // no break to suppress | |
427 | ||
428 | UErrorCode status = U_ZERO_ERROR; | |
429 | resetState(status); | |
430 | ||
431 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); | |
432 | ||
433 | switch(m) { | |
434 | case kExceptionHere: | |
435 | return false; | |
436 | default: | |
437 | case kNoExceptionHere: | |
438 | return true; | |
439 | } | |
440 | } | |
441 | ||
442 | int32_t | |
443 | SimpleFilteredSentenceBreakIterator::next(int32_t offset) { | |
444 | return internalNext(fDelegate->next(offset)); | |
445 | } | |
446 | ||
447 | int32_t | |
448 | SimpleFilteredSentenceBreakIterator::following(int32_t offset) { | |
449 | return internalNext(fDelegate->following(offset)); | |
450 | } | |
451 | ||
452 | int32_t | |
453 | SimpleFilteredSentenceBreakIterator::last(void) { | |
454 | // Don't suppress a break opportunity at the end of text. | |
455 | return fDelegate->last(); | |
456 | } | |
457 | ||
458 | ||
b331163b A |
459 | /** |
460 | * Concrete implementation of builder class. | |
461 | */ | |
462 | class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { | |
57a6839d A |
463 | public: |
464 | virtual ~SimpleFilteredBreakIteratorBuilder(); | |
465 | SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); | |
b331163b | 466 | SimpleFilteredBreakIteratorBuilder(UErrorCode &status); |
57a6839d A |
467 | virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status); |
468 | virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status); | |
469 | virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status); | |
470 | private: | |
b331163b | 471 | UStringSet fSet; |
57a6839d A |
472 | }; |
473 | ||
474 | SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() | |
475 | { | |
476 | } | |
477 | ||
2ca993e8 | 478 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) |
b331163b A |
479 | : fSet(status) |
480 | { | |
481 | } | |
482 | ||
57a6839d | 483 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) |
b331163b | 484 | : fSet(status) |
57a6839d A |
485 | { |
486 | if(U_SUCCESS(status)) { | |
487 | LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status)); | |
488 | LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &status)); | |
489 | LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &status)); | |
490 | if(U_FAILURE(status)) return; // leaves the builder empty, if you try to use it. | |
491 | ||
492 | LocalUResourceBundlePointer strs; | |
493 | UErrorCode subStatus = status; | |
494 | do { | |
495 | strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); | |
496 | if(strs.isValid() && U_SUCCESS(subStatus)) { | |
497 | UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); | |
498 | suppressBreakAfter(str, status); // load the string | |
499 | } | |
500 | } while (strs.isValid() && U_SUCCESS(subStatus)); | |
501 | if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { | |
502 | status = subStatus; | |
503 | } | |
504 | } | |
505 | } | |
506 | ||
57a6839d A |
507 | UBool |
508 | SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) | |
509 | { | |
b331163b A |
510 | UBool r = fSet.add(exception, status); |
511 | FB_TRACE("suppressBreakAfter",&exception,r,0); | |
512 | return r; | |
57a6839d A |
513 | } |
514 | ||
515 | UBool | |
516 | SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) | |
517 | { | |
b331163b A |
518 | UBool r = fSet.remove(exception, status); |
519 | FB_TRACE("unsuppressBreakAfter",&exception,r,0); | |
520 | return r; | |
521 | } | |
522 | ||
523 | /** | |
524 | * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly. | |
525 | * Work around this. | |
526 | * | |
527 | * Note: "new UnicodeString[subCount]" ends up calling global operator new | |
528 | * on MSVC2012 for some reason. | |
529 | */ | |
530 | static inline UnicodeString* newUnicodeStringArray(size_t count) { | |
531 | return new UnicodeString[count ? count : 1]; | |
57a6839d A |
532 | } |
533 | ||
534 | BreakIterator * | |
535 | SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { | |
536 | LocalPointer<BreakIterator> adopt(adoptBreakIterator); | |
537 | ||
b331163b A |
538 | LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status); |
539 | LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status); | |
57a6839d A |
540 | if(U_FAILURE(status)) { |
541 | return NULL; | |
542 | } | |
543 | ||
57a6839d A |
544 | int32_t revCount = 0; |
545 | int32_t fwdCount = 0; | |
546 | ||
547 | int32_t subCount = fSet.size(); | |
b331163b A |
548 | |
549 | UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount); | |
2ca993e8 | 550 | |
b331163b A |
551 | LocalArray<UnicodeString> ustrs(ustrs_ptr); |
552 | ||
553 | LocalMemory<int> partials; | |
554 | partials.allocateInsteadAndReset(subCount); | |
57a6839d A |
555 | |
556 | LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. | |
557 | LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." | |
558 | ||
559 | int n=0; | |
b331163b A |
560 | for ( int32_t i = 0; |
561 | i<fSet.size(); | |
57a6839d | 562 | i++) { |
b331163b A |
563 | const UnicodeString *abbr = fSet.getStringAt(i); |
564 | if(abbr) { | |
565 | FB_TRACE("build",abbr,TRUE,i); | |
566 | ustrs[n] = *abbr; // copy by value | |
567 | FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i); | |
568 | } else { | |
569 | FB_TRACE("build",abbr,FALSE,i); | |
570 | status = U_MEMORY_ALLOCATION_ERROR; | |
571 | return NULL; | |
572 | } | |
57a6839d A |
573 | partials[n] = 0; // default: not partial |
574 | n++; | |
575 | } | |
576 | // first pass - find partials. | |
577 | for(int i=0;i<subCount;i++) { | |
578 | int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations | |
579 | if(nn>-1 && (nn+1)!=ustrs[i].length()) { | |
b331163b | 580 | FB_TRACE("partial",&ustrs[i],FALSE,i); |
57a6839d A |
581 | // is partial. |
582 | // is it unique? | |
583 | int sameAs = -1; | |
584 | for(int j=0;j<subCount;j++) { | |
585 | if(j==i) continue; | |
586 | if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { | |
b331163b | 587 | FB_TRACE("prefix",&ustrs[j],FALSE,nn+1); |
57a6839d A |
588 | //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn |
589 | if(partials[j]==0) { // hasn't been processed yet | |
2ca993e8 | 590 | partials[j] = (ustrs[j].length() == nn+1)? (kSuppressInReverse | kAddToForward): kAddToForward; |
b331163b | 591 | FB_TRACE("suppressing",&ustrs[j],FALSE,j); |
57a6839d A |
592 | } else if(partials[j] & kSuppressInReverse) { |
593 | sameAs = j; // the other entry is already in the reverse table. | |
594 | } | |
595 | } | |
596 | } | |
b331163b A |
597 | FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs); |
598 | FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]); | |
57a6839d A |
599 | UnicodeString prefix(ustrs[i], 0, nn+1); |
600 | if(sameAs == -1 && partials[i] == 0) { | |
601 | // first one - add the prefix to the reverse table. | |
602 | prefix.reverse(); | |
603 | builder->add(prefix, kPARTIAL, status); | |
604 | revCount++; | |
b331163b A |
605 | FB_TRACE("Added partial",&prefix,FALSE, i); |
606 | FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); | |
2ca993e8 | 607 | partials[i] = kAddToForward; |
57a6839d | 608 | } else { |
b331163b A |
609 | FB_TRACE("NOT adding partial",&prefix,FALSE, i); |
610 | FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); | |
57a6839d A |
611 | } |
612 | } | |
613 | } | |
614 | for(int i=0;i<subCount;i++) { | |
2ca993e8 | 615 | if((partials[i] & kSuppressInReverse) == 0) { |
57a6839d A |
616 | ustrs[i].reverse(); |
617 | builder->add(ustrs[i], kMATCH, status); | |
618 | revCount++; | |
b331163b | 619 | FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i); |
2ca993e8 A |
620 | } |
621 | if((partials[i] & kAddToForward) != 0) { | |
b331163b | 622 | FB_TRACE("Adding fwd",&ustrs[i], FALSE, i); |
57a6839d A |
623 | |
624 | // an optimization would be to only add the portion after the '.' | |
625 | // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, | |
626 | // instead of "Ph.D." since we already know the "Ph." part is a match. | |
627 | // would need the trie to be able to hold 0-length strings, though. | |
628 | builder2->add(ustrs[i], kMATCH, status); // forward | |
629 | fwdCount++; | |
630 | //ustrs[i].reverse(); | |
631 | ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); | |
632 | } | |
633 | } | |
b331163b | 634 | FB_TRACE("AbbrCount",NULL,FALSE, subCount); |
57a6839d A |
635 | |
636 | if(revCount>0) { | |
637 | backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); | |
638 | if(U_FAILURE(status)) { | |
b331163b | 639 | FB_TRACE(u_errorName(status),NULL,FALSE, -1); |
57a6839d A |
640 | return NULL; |
641 | } | |
642 | } | |
643 | ||
644 | if(fwdCount>0) { | |
645 | forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); | |
646 | if(U_FAILURE(status)) { | |
b331163b | 647 | FB_TRACE(u_errorName(status),NULL,FALSE, -1); |
57a6839d A |
648 | return NULL; |
649 | } | |
650 | } | |
651 | ||
b331163b | 652 | return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); |
57a6839d A |
653 | } |
654 | ||
655 | ||
b331163b | 656 | // ----------- Base class implementation |
57a6839d A |
657 | |
658 | FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { | |
659 | } | |
660 | ||
661 | FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { | |
662 | } | |
663 | ||
664 | FilteredBreakIteratorBuilder * | |
665 | FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { | |
666 | if(U_FAILURE(status)) return NULL; | |
b331163b A |
667 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status); |
668 | return (U_SUCCESS(status))? ret.orphan(): NULL; | |
57a6839d A |
669 | } |
670 | ||
671 | FilteredBreakIteratorBuilder * | |
672 | FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) { | |
673 | if(U_FAILURE(status)) return NULL; | |
b331163b A |
674 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status); |
675 | return (U_SUCCESS(status))? ret.orphan(): NULL; | |
57a6839d A |
676 | } |
677 | ||
678 | U_NAMESPACE_END | |
679 | ||
680 | #endif // !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION