]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
57a6839d A |
3 | /* |
4 | ******************************************************************************* | |
f3c0d7a5 | 5 | * Copyright (C) 2014-2015, International Business Machines Corporation and |
57a6839d A |
6 | * others. All Rights Reserved. |
7 | ******************************************************************************* | |
8 | */ | |
9 | ||
b331163b A |
10 | #include "unicode/utypes.h" |
11 | #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION | |
57a6839d | 12 | |
b331163b | 13 | #include "cmemory.h" |
57a6839d | 14 | |
b331163b A |
15 | #include "unicode/filteredbrk.h" |
16 | #include "unicode/ucharstriebuilder.h" | |
17 | #include "unicode/ures.h" | |
57a6839d | 18 | |
b331163b A |
19 | #include "uresimp.h" // ures_getByKeyWithFallback |
20 | #include "ubrkimpl.h" // U_ICUDATA_BRKITR | |
21 | #include "uvector.h" | |
22 | #include "cmemory.h" | |
57a6839d A |
23 | |
24 | U_NAMESPACE_BEGIN | |
25 | ||
b331163b A |
26 | #ifndef FB_DEBUG |
27 | #define FB_DEBUG 0 | |
28 | #endif | |
29 | ||
30 | #if FB_DEBUG | |
31 | #include <stdio.h> | |
32 | static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) { | |
33 | char buf[2048]; | |
34 | if(s) { | |
35 | s->extract(0,s->length(),buf,2048); | |
36 | } else { | |
37 | strcpy(buf,"NULL"); | |
38 | } | |
39 | fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n", | |
40 | f, l, m, buf, (const void*)s, b?'T':'F',(int)d); | |
41 | } | |
42 | ||
43 | #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__) | |
44 | #else | |
45 | #define FB_TRACE(m,s,b,d) | |
46 | #endif | |
47 | ||
2ca993e8 A |
48 | /** |
49 | * Used with sortedInsert() | |
50 | */ | |
b331163b A |
51 | static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { |
52 | const UnicodeString &a = *(const UnicodeString*)t1.pointer; | |
53 | const UnicodeString &b = *(const UnicodeString*)t2.pointer; | |
54 | return a.compare(b); | |
55 | } | |
56 | ||
57 | /** | |
58 | * A UVector which implements a set of strings. | |
59 | */ | |
60 | class U_COMMON_API UStringSet : public UVector { | |
61 | public: | |
62 | UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject, | |
63 | uhash_compareUnicodeString, | |
64 | 1, | |
65 | status) {} | |
66 | virtual ~UStringSet(); | |
67 | /** | |
68 | * Is this UnicodeSet contained? | |
69 | */ | |
70 | inline UBool contains(const UnicodeString& s) { | |
71 | return contains((void*) &s); | |
72 | } | |
73 | using UVector::contains; | |
74 | /** | |
75 | * Return the ith UnicodeString alias | |
76 | */ | |
77 | inline const UnicodeString* getStringAt(int32_t i) const { | |
78 | return (const UnicodeString*)elementAt(i); | |
79 | } | |
80 | /** | |
81 | * Adopt the UnicodeString if not already contained. | |
82 | * Caller no longer owns the pointer in any case. | |
83 | * @return true if adopted successfully, false otherwise (error, or else duplicate) | |
84 | */ | |
85 | inline UBool adopt(UnicodeString *str, UErrorCode &status) { | |
86 | if(U_FAILURE(status) || contains(*str)) { | |
87 | delete str; | |
88 | return false; | |
89 | } else { | |
90 | sortedInsert(str, compareUnicodeString, status); | |
91 | if(U_FAILURE(status)) { | |
92 | delete str; | |
93 | return false; | |
94 | } | |
95 | return true; | |
96 | } | |
97 | } | |
98 | /** | |
99 | * Add by value. | |
100 | * @return true if successfully adopted. | |
101 | */ | |
102 | inline UBool add(const UnicodeString& str, UErrorCode &status) { | |
103 | if(U_FAILURE(status)) return false; | |
104 | UnicodeString *t = new UnicodeString(str); | |
105 | if(t==NULL) { | |
106 | status = U_MEMORY_ALLOCATION_ERROR; return false; | |
107 | } | |
108 | return adopt(t, status); | |
109 | } | |
110 | /** | |
111 | * Remove this string. | |
112 | * @return true if successfully removed, false otherwise (error, or else it wasn't there) | |
113 | */ | |
114 | inline UBool remove(const UnicodeString &s, UErrorCode &status) { | |
115 | if(U_FAILURE(status)) return false; | |
116 | return removeElement((void*) &s); | |
117 | } | |
118 | }; | |
119 | ||
120 | /** | |
121 | * Virtual, won't be inlined | |
122 | */ | |
123 | UStringSet::~UStringSet() {} | |
124 | ||
2ca993e8 | 125 | /* ----------------------------------------------------------- */ |
57a6839d | 126 | |
2ca993e8 A |
127 | |
128 | /* Filtered Break constants */ | |
57a6839d A |
129 | static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie |
130 | static const int32_t kMATCH = (1<<1); //< exact match - skip this one. | |
131 | static const int32_t kSuppressInReverse = (1<<0); | |
132 | static const int32_t kAddToForward = (1<<1); | |
2ca993e8 A |
133 | static const UChar kFULLSTOP = 0x002E; // '.' |
134 | ||
135 | /** | |
136 | * Shared data for SimpleFilteredSentenceBreakIterator | |
137 | */ | |
138 | class SimpleFilteredSentenceBreakData : public UMemory { | |
139 | public: | |
140 | SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) | |
141 | : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } | |
142 | SimpleFilteredSentenceBreakData *incr() { refcount++; return this; } | |
143 | SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; } | |
144 | virtual ~SimpleFilteredSentenceBreakData(); | |
57a6839d | 145 | |
2ca993e8 A |
146 | LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." |
147 | LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. | |
148 | int32_t refcount; | |
149 | }; | |
150 | ||
151 | SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} | |
152 | ||
153 | /** | |
154 | * Concrete implementation | |
155 | */ | |
b331163b | 156 | class SimpleFilteredSentenceBreakIterator : public BreakIterator { |
57a6839d | 157 | public: |
b331163b A |
158 | SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); |
159 | SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); | |
160 | virtual ~SimpleFilteredSentenceBreakIterator(); | |
57a6839d | 161 | private: |
2ca993e8 | 162 | SimpleFilteredSentenceBreakData *fData; |
57a6839d A |
163 | LocalPointer<BreakIterator> fDelegate; |
164 | LocalUTextPointer fText; | |
57a6839d A |
165 | |
166 | /* -- subclass interface -- */ | |
167 | public: | |
168 | /* -- cloning and other subclass stuff -- */ | |
169 | virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, | |
170 | int32_t &/*BufferSize*/, | |
171 | UErrorCode &status) { | |
172 | // for now - always deep clone | |
173 | status = U_SAFECLONE_ALLOCATED_WARNING; | |
174 | return clone(); | |
175 | } | |
b331163b | 176 | virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); } |
57a6839d | 177 | virtual UClassID getDynamicClassID(void) const { return NULL; } |
b331163b | 178 | virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; } |
57a6839d A |
179 | |
180 | /* -- text modifying -- */ | |
181 | virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); } | |
182 | virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; } | |
183 | virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); } | |
184 | virtual void setText(const UnicodeString &text) { fDelegate->setText(text); } | |
185 | ||
186 | /* -- other functions that are just delegated -- */ | |
187 | virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); } | |
188 | virtual CharacterIterator& getText(void) const { return fDelegate->getText(); } | |
189 | ||
190 | /* -- ITERATION -- */ | |
2ca993e8 A |
191 | virtual int32_t first(void); |
192 | virtual int32_t preceding(int32_t offset); | |
193 | virtual int32_t previous(void); | |
194 | virtual UBool isBoundary(int32_t offset); | |
195 | virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct. | |
57a6839d A |
196 | |
197 | virtual int32_t next(void); | |
2ca993e8 A |
198 | |
199 | virtual int32_t next(int32_t n); | |
b331163b | 200 | virtual int32_t following(int32_t offset); |
2ca993e8 | 201 | virtual int32_t last(void); |
57a6839d | 202 | |
b331163b | 203 | private: |
2ca993e8 A |
204 | /** |
205 | * Given that the fDelegate has already given its "initial" answer, | |
206 | * find the NEXT actual (non-excepted) break. | |
207 | * @param n initial position from delegate | |
208 | * @return new break position or UBRK_DONE | |
209 | */ | |
210 | int32_t internalNext(int32_t n); | |
211 | /** | |
212 | * Given that the fDelegate has already given its "initial" answer, | |
213 | * find the PREV actual (non-excepted) break. | |
214 | * @param n initial position from delegate | |
215 | * @return new break position or UBRK_DONE | |
216 | */ | |
217 | int32_t internalPrev(int32_t n); | |
218 | /** | |
219 | * set up the UText with the value of the fDelegate. | |
220 | * Call this before calling breakExceptionAt. | |
221 | * May be able to avoid excess calls | |
222 | */ | |
223 | void resetState(UErrorCode &status); | |
224 | /** | |
225 | * Is there a match (exception) at this spot? | |
226 | */ | |
227 | enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; | |
228 | /** | |
229 | * Determine if there is an exception at this spot | |
230 | * @param n spot to check | |
231 | * @return kNoExceptionHere or kExceptionHere | |
232 | **/ | |
233 | enum EFBMatchResult breakExceptionAt(int32_t n); | |
57a6839d A |
234 | }; |
235 | ||
b331163b | 236 | SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) |
2ca993e8 | 237 | : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) |
57a6839d | 238 | { |
57a6839d A |
239 | } |
240 | ||
241 | ||
b331163b | 242 | SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : |
57a6839d | 243 | BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), |
2ca993e8 A |
244 | fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), |
245 | fDelegate(adopt) | |
57a6839d A |
246 | { |
247 | // all set.. | |
248 | } | |
249 | ||
2ca993e8 A |
250 | SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { |
251 | fData = fData->decr(); | |
b331163b A |
252 | } |
253 | ||
2ca993e8 A |
254 | void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { |
255 | fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); | |
b331163b A |
256 | } |
257 | ||
2ca993e8 A |
258 | SimpleFilteredSentenceBreakIterator::EFBMatchResult |
259 | SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { | |
260 | int64_t bestPosn = -1; | |
261 | int32_t bestValue = -1; | |
57a6839d A |
262 | // loops while 'n' points to an exception. |
263 | utext_setNativeIndex(fText.getAlias(), n); // from n.. | |
2ca993e8 | 264 | fData->fBackwardsTrie->reset(); |
57a6839d | 265 | UChar32 uch; |
2ca993e8 | 266 | |
57a6839d A |
267 | //if(debug2) u_printf(" n@ %d\n", n); |
268 | // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") | |
269 | if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? | |
270 | // TODO only do this the 1st time? | |
271 | //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch); | |
272 | } else { | |
273 | //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch); | |
274 | uch = utext_next32(fText.getAlias()); | |
275 | //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); | |
276 | } | |
57a6839d | 277 | |
2ca993e8 | 278 | UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; |
57a6839d A |
279 | |
280 | while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. | |
2ca993e8 | 281 | USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie |
57a6839d A |
282 | if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far |
283 | bestPosn = utext_getNativeIndex(fText.getAlias()); | |
2ca993e8 | 284 | bestValue = fData->fBackwardsTrie->getValue(); |
57a6839d A |
285 | } |
286 | //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); | |
287 | } | |
288 | ||
289 | if(USTRINGTRIE_MATCHES(r)) { // exact match? | |
290 | //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); | |
2ca993e8 | 291 | bestValue = fData->fBackwardsTrie->getValue(); |
57a6839d A |
292 | bestPosn = utext_getNativeIndex(fText.getAlias()); |
293 | //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); | |
294 | } | |
295 | ||
296 | if(bestPosn>=0) { | |
297 | //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); | |
298 | ||
299 | //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? | |
300 | //int32_t bestValue = fBackwardsTrie->getValue(); | |
301 | ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); | |
302 | ||
2ca993e8 A |
303 | if(bestPosn>0) { |
304 | UChar32 prevch = utext_char32At(fText.getAlias(), bestPosn-1); // char before the best match | |
305 | if (prevch != U_SENTINEL && u_isUAlphabetic(prevch)) { | |
306 | // The match is preceded by other alphabetic characters, => invalid | |
307 | return kNoExceptionHere; | |
308 | } | |
309 | } | |
310 | ||
57a6839d A |
311 | if(bestValue == kMATCH) { // exact match! |
312 | //if(debug2) u_printf(" exact backward match\n"); | |
2ca993e8 | 313 | return kExceptionHere; // See if the next is another exception. |
57a6839d | 314 | } else if(bestValue == kPARTIAL |
2ca993e8 | 315 | && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie |
57a6839d A |
316 | //if(debug2) u_printf(" partial backward match\n"); |
317 | // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie | |
318 | // to see if it matches something going forward. | |
2ca993e8 | 319 | fData->fForwardsPartialTrie->reset(); |
57a6839d A |
320 | UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; |
321 | utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. | |
322 | //if(debug2) u_printf("Retrying at %d\n", bestPosn); | |
323 | while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && | |
2ca993e8 | 324 | USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) { |
57a6839d A |
325 | //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); |
326 | } | |
327 | if(USTRINGTRIE_MATCHES(rfwd)) { | |
328 | //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch); | |
329 | // only full matches here, nothing to check | |
330 | // skip the next: | |
2ca993e8 | 331 | return kExceptionHere; |
57a6839d A |
332 | } else { |
333 | //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch); | |
334 | // no match (no exception) -return the 'underlying' break | |
2ca993e8 | 335 | return kNoExceptionHere; |
57a6839d A |
336 | } |
337 | } else { | |
2ca993e8 | 338 | return kNoExceptionHere; // internal error and/or no forwards trie |
57a6839d A |
339 | } |
340 | } else { | |
341 | //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match | |
2ca993e8 | 342 | return kNoExceptionHere; // No match - so exit. Not an exception. |
57a6839d | 343 | } |
57a6839d A |
344 | } |
345 | ||
2ca993e8 A |
346 | // the workhorse single next. |
347 | int32_t | |
348 | SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { | |
349 | if(n == UBRK_DONE || // at end or | |
350 | fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions | |
351 | return n; | |
352 | } | |
353 | // OK, do we need to break here? | |
354 | UErrorCode status = U_ZERO_ERROR; | |
355 | // refresh text | |
356 | resetState(status); | |
357 | if(U_FAILURE(status)) return UBRK_DONE; // bail out | |
358 | int64_t utextLen = utext_nativeLength(fText.getAlias()); | |
359 | ||
360 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); | |
361 | while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate). | |
362 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); | |
363 | ||
364 | switch(m) { | |
365 | case kExceptionHere: | |
366 | n = fDelegate->next(); // skip this one. Find the next lowerlevel break. | |
367 | continue; | |
57a6839d | 368 | |
2ca993e8 A |
369 | default: |
370 | case kNoExceptionHere: | |
371 | return n; | |
372 | } | |
373 | } | |
374 | return n; | |
57a6839d | 375 | } |
57a6839d | 376 | |
2ca993e8 A |
377 | int32_t |
378 | SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { | |
379 | if(n == 0 || n == UBRK_DONE || // at end or | |
380 | fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions | |
381 | return n; | |
b331163b A |
382 | } |
383 | // OK, do we need to break here? | |
384 | UErrorCode status = U_ZERO_ERROR; | |
385 | // refresh text | |
2ca993e8 A |
386 | resetState(status); |
387 | if(U_FAILURE(status)) return UBRK_DONE; // bail out | |
b331163b | 388 | |
2ca993e8 A |
389 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); |
390 | while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). | |
391 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); | |
b331163b | 392 | |
2ca993e8 A |
393 | switch(m) { |
394 | case kExceptionHere: | |
395 | n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. | |
396 | continue; | |
b331163b | 397 | |
2ca993e8 A |
398 | default: |
399 | case kNoExceptionHere: | |
400 | return n; | |
b331163b | 401 | } |
2ca993e8 A |
402 | } |
403 | return n; | |
404 | } | |
b331163b | 405 | |
b331163b | 406 | |
2ca993e8 A |
407 | int32_t |
408 | SimpleFilteredSentenceBreakIterator::next() { | |
409 | return internalNext(fDelegate->next()); | |
410 | } | |
411 | ||
412 | int32_t | |
413 | SimpleFilteredSentenceBreakIterator::first(void) { | |
f3c0d7a5 A |
414 | // Don't suppress a break opportunity at the beginning of text. |
415 | return fDelegate->first(); | |
b331163b | 416 | } |
57a6839d | 417 | |
2ca993e8 A |
418 | int32_t |
419 | SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { | |
420 | return internalPrev(fDelegate->preceding(offset)); | |
421 | } | |
422 | ||
423 | int32_t | |
424 | SimpleFilteredSentenceBreakIterator::previous(void) { | |
425 | return internalPrev(fDelegate->previous()); | |
426 | } | |
427 | ||
428 | UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { | |
f3c0d7a5 A |
429 | if (!fDelegate->isBoundary(offset)) return false; // no break to suppress |
430 | ||
431 | if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions | |
2ca993e8 A |
432 | |
433 | UErrorCode status = U_ZERO_ERROR; | |
434 | resetState(status); | |
435 | ||
436 | SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); | |
437 | ||
438 | switch(m) { | |
439 | case kExceptionHere: | |
440 | return false; | |
441 | default: | |
442 | case kNoExceptionHere: | |
443 | return true; | |
444 | } | |
445 | } | |
446 | ||
447 | int32_t | |
448 | SimpleFilteredSentenceBreakIterator::next(int32_t offset) { | |
449 | return internalNext(fDelegate->next(offset)); | |
450 | } | |
451 | ||
452 | int32_t | |
453 | SimpleFilteredSentenceBreakIterator::following(int32_t offset) { | |
454 | return internalNext(fDelegate->following(offset)); | |
455 | } | |
456 | ||
457 | int32_t | |
458 | SimpleFilteredSentenceBreakIterator::last(void) { | |
459 | // Don't suppress a break opportunity at the end of text. | |
460 | return fDelegate->last(); | |
461 | } | |
462 | ||
463 | ||
b331163b A |
464 | /** |
465 | * Concrete implementation of builder class. | |
466 | */ | |
467 | class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { | |
57a6839d A |
468 | public: |
469 | virtual ~SimpleFilteredBreakIteratorBuilder(); | |
470 | SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); | |
b331163b | 471 | SimpleFilteredBreakIteratorBuilder(UErrorCode &status); |
57a6839d A |
472 | virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status); |
473 | virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status); | |
474 | virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status); | |
475 | private: | |
b331163b | 476 | UStringSet fSet; |
57a6839d A |
477 | }; |
478 | ||
479 | SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() | |
480 | { | |
481 | } | |
482 | ||
2ca993e8 | 483 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) |
b331163b A |
484 | : fSet(status) |
485 | { | |
486 | } | |
487 | ||
57a6839d | 488 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) |
b331163b | 489 | : fSet(status) |
57a6839d A |
490 | { |
491 | if(U_SUCCESS(status)) { | |
f3c0d7a5 A |
492 | UErrorCode subStatus = U_ZERO_ERROR; |
493 | LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus)); | |
494 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { | |
495 | status = subStatus; // copy the failing status | |
496 | #if FB_DEBUG | |
497 | fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); | |
498 | #endif | |
499 | return; // leaves the builder empty, if you try to use it. | |
500 | } | |
501 | LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &subStatus)); | |
502 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { | |
503 | status = subStatus; // copy the failing status | |
504 | #if FB_DEBUG | |
505 | fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); | |
506 | #endif | |
507 | return; // leaves the builder empty, if you try to use it. | |
508 | } | |
509 | LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &subStatus)); | |
510 | ||
511 | #if FB_DEBUG | |
512 | { | |
513 | UErrorCode subsub = subStatus; | |
514 | fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus)); | |
515 | } | |
516 | #endif | |
517 | ||
518 | if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { | |
519 | status = subStatus; // copy the failing status | |
520 | #if FB_DEBUG | |
521 | fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); | |
522 | #endif | |
523 | return; // leaves the builder empty, if you try to use it. | |
524 | } | |
57a6839d A |
525 | |
526 | LocalUResourceBundlePointer strs; | |
f3c0d7a5 | 527 | subStatus = status; // Pick up inherited warning status now |
57a6839d A |
528 | do { |
529 | strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); | |
530 | if(strs.isValid() && U_SUCCESS(subStatus)) { | |
531 | UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); | |
532 | suppressBreakAfter(str, status); // load the string | |
533 | } | |
534 | } while (strs.isValid() && U_SUCCESS(subStatus)); | |
535 | if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { | |
536 | status = subStatus; | |
537 | } | |
538 | } | |
539 | } | |
540 | ||
57a6839d A |
541 | UBool |
542 | SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) | |
543 | { | |
b331163b A |
544 | UBool r = fSet.add(exception, status); |
545 | FB_TRACE("suppressBreakAfter",&exception,r,0); | |
546 | return r; | |
57a6839d A |
547 | } |
548 | ||
549 | UBool | |
550 | SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) | |
551 | { | |
b331163b A |
552 | UBool r = fSet.remove(exception, status); |
553 | FB_TRACE("unsuppressBreakAfter",&exception,r,0); | |
554 | return r; | |
555 | } | |
556 | ||
557 | /** | |
558 | * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly. | |
559 | * Work around this. | |
560 | * | |
561 | * Note: "new UnicodeString[subCount]" ends up calling global operator new | |
562 | * on MSVC2012 for some reason. | |
563 | */ | |
564 | static inline UnicodeString* newUnicodeStringArray(size_t count) { | |
565 | return new UnicodeString[count ? count : 1]; | |
57a6839d A |
566 | } |
567 | ||
568 | BreakIterator * | |
569 | SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { | |
570 | LocalPointer<BreakIterator> adopt(adoptBreakIterator); | |
571 | ||
b331163b A |
572 | LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status); |
573 | LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status); | |
57a6839d A |
574 | if(U_FAILURE(status)) { |
575 | return NULL; | |
576 | } | |
577 | ||
57a6839d A |
578 | int32_t revCount = 0; |
579 | int32_t fwdCount = 0; | |
580 | ||
581 | int32_t subCount = fSet.size(); | |
b331163b A |
582 | |
583 | UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount); | |
2ca993e8 | 584 | |
b331163b A |
585 | LocalArray<UnicodeString> ustrs(ustrs_ptr); |
586 | ||
587 | LocalMemory<int> partials; | |
588 | partials.allocateInsteadAndReset(subCount); | |
57a6839d A |
589 | |
590 | LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. | |
591 | LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." | |
592 | ||
593 | int n=0; | |
b331163b A |
594 | for ( int32_t i = 0; |
595 | i<fSet.size(); | |
57a6839d | 596 | i++) { |
b331163b A |
597 | const UnicodeString *abbr = fSet.getStringAt(i); |
598 | if(abbr) { | |
599 | FB_TRACE("build",abbr,TRUE,i); | |
600 | ustrs[n] = *abbr; // copy by value | |
601 | FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i); | |
602 | } else { | |
603 | FB_TRACE("build",abbr,FALSE,i); | |
604 | status = U_MEMORY_ALLOCATION_ERROR; | |
605 | return NULL; | |
606 | } | |
57a6839d A |
607 | partials[n] = 0; // default: not partial |
608 | n++; | |
609 | } | |
610 | // first pass - find partials. | |
611 | for(int i=0;i<subCount;i++) { | |
612 | int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations | |
613 | if(nn>-1 && (nn+1)!=ustrs[i].length()) { | |
b331163b | 614 | FB_TRACE("partial",&ustrs[i],FALSE,i); |
57a6839d A |
615 | // is partial. |
616 | // is it unique? | |
617 | int sameAs = -1; | |
618 | for(int j=0;j<subCount;j++) { | |
619 | if(j==i) continue; | |
620 | if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { | |
b331163b | 621 | FB_TRACE("prefix",&ustrs[j],FALSE,nn+1); |
57a6839d A |
622 | //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn |
623 | if(partials[j]==0) { // hasn't been processed yet | |
2ca993e8 | 624 | partials[j] = (ustrs[j].length() == nn+1)? (kSuppressInReverse | kAddToForward): kAddToForward; |
b331163b | 625 | FB_TRACE("suppressing",&ustrs[j],FALSE,j); |
57a6839d A |
626 | } else if(partials[j] & kSuppressInReverse) { |
627 | sameAs = j; // the other entry is already in the reverse table. | |
628 | } | |
629 | } | |
630 | } | |
b331163b A |
631 | FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs); |
632 | FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]); | |
57a6839d A |
633 | UnicodeString prefix(ustrs[i], 0, nn+1); |
634 | if(sameAs == -1 && partials[i] == 0) { | |
635 | // first one - add the prefix to the reverse table. | |
636 | prefix.reverse(); | |
637 | builder->add(prefix, kPARTIAL, status); | |
638 | revCount++; | |
b331163b A |
639 | FB_TRACE("Added partial",&prefix,FALSE, i); |
640 | FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); | |
2ca993e8 | 641 | partials[i] = kAddToForward; |
57a6839d | 642 | } else { |
b331163b A |
643 | FB_TRACE("NOT adding partial",&prefix,FALSE, i); |
644 | FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i); | |
57a6839d A |
645 | } |
646 | } | |
647 | } | |
648 | for(int i=0;i<subCount;i++) { | |
2ca993e8 | 649 | if((partials[i] & kSuppressInReverse) == 0) { |
57a6839d A |
650 | ustrs[i].reverse(); |
651 | builder->add(ustrs[i], kMATCH, status); | |
652 | revCount++; | |
b331163b | 653 | FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i); |
2ca993e8 A |
654 | } |
655 | if((partials[i] & kAddToForward) != 0) { | |
b331163b | 656 | FB_TRACE("Adding fwd",&ustrs[i], FALSE, i); |
57a6839d A |
657 | |
658 | // an optimization would be to only add the portion after the '.' | |
659 | // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, | |
660 | // instead of "Ph.D." since we already know the "Ph." part is a match. | |
661 | // would need the trie to be able to hold 0-length strings, though. | |
662 | builder2->add(ustrs[i], kMATCH, status); // forward | |
663 | fwdCount++; | |
664 | //ustrs[i].reverse(); | |
665 | ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); | |
666 | } | |
667 | } | |
b331163b | 668 | FB_TRACE("AbbrCount",NULL,FALSE, subCount); |
57a6839d A |
669 | |
670 | if(revCount>0) { | |
671 | backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); | |
672 | if(U_FAILURE(status)) { | |
b331163b | 673 | FB_TRACE(u_errorName(status),NULL,FALSE, -1); |
57a6839d A |
674 | return NULL; |
675 | } | |
676 | } | |
677 | ||
678 | if(fwdCount>0) { | |
679 | forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); | |
680 | if(U_FAILURE(status)) { | |
b331163b | 681 | FB_TRACE(u_errorName(status),NULL,FALSE, -1); |
57a6839d A |
682 | return NULL; |
683 | } | |
684 | } | |
685 | ||
b331163b | 686 | return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); |
57a6839d A |
687 | } |
688 | ||
689 | ||
b331163b | 690 | // ----------- Base class implementation |
57a6839d A |
691 | |
692 | FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { | |
693 | } | |
694 | ||
695 | FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { | |
696 | } | |
697 | ||
698 | FilteredBreakIteratorBuilder * | |
699 | FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { | |
700 | if(U_FAILURE(status)) return NULL; | |
b331163b A |
701 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status); |
702 | return (U_SUCCESS(status))? ret.orphan(): NULL; | |
57a6839d A |
703 | } |
704 | ||
705 | FilteredBreakIteratorBuilder * | |
706 | FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) { | |
707 | if(U_FAILURE(status)) return NULL; | |
b331163b A |
708 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status); |
709 | return (U_SUCCESS(status))? ret.orphan(): NULL; | |
57a6839d A |
710 | } |
711 | ||
712 | U_NAMESPACE_END | |
713 | ||
f3c0d7a5 | 714 | #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION |