]>
Commit | Line | Data |
---|---|---|
57a6839d A |
1 | /* |
2 | ******************************************************************************* | |
3 | * Copyright (C) 2014, International Business Machines Corporation and | |
4 | * others. All Rights Reserved. | |
5 | ******************************************************************************* | |
6 | */ | |
7 | ||
8 | #include "unicode/filteredbrk.h" | |
9 | ||
10 | #if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION | |
11 | ||
12 | #include <unicode/ucharstriebuilder.h> | |
13 | ||
14 | #include <set> | |
15 | #include <string> | |
16 | #include <functional> | |
17 | #include "uresimp.h" | |
18 | #include "ubrkimpl.h" | |
19 | ||
20 | U_NAMESPACE_BEGIN | |
21 | ||
22 | using namespace std; | |
23 | ||
24 | static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie | |
25 | static const int32_t kMATCH = (1<<1); //< exact match - skip this one. | |
26 | static const int32_t kSuppressInReverse = (1<<0); | |
27 | static const int32_t kAddToForward = (1<<1); | |
28 | static const UChar kFULLSTOP = 0x002E; // '.' | |
29 | ||
30 | class ULISentenceBreakIterator : public BreakIterator { | |
31 | public: | |
32 | ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); | |
33 | virtual ~ULISentenceBreakIterator() {} | |
34 | ULISentenceBreakIterator(const ULISentenceBreakIterator& other); | |
35 | private: | |
36 | LocalPointer<BreakIterator> fDelegate; | |
37 | LocalUTextPointer fText; | |
38 | LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. | |
39 | LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." | |
40 | ||
41 | /* -- subclass interface -- */ | |
42 | public: | |
43 | /* -- cloning and other subclass stuff -- */ | |
44 | virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, | |
45 | int32_t &/*BufferSize*/, | |
46 | UErrorCode &status) { | |
47 | // for now - always deep clone | |
48 | status = U_SAFECLONE_ALLOCATED_WARNING; | |
49 | return clone(); | |
50 | } | |
51 | virtual BreakIterator* clone(void) const { return new ULISentenceBreakIterator(*this); } | |
52 | virtual UClassID getDynamicClassID(void) const { return NULL; } | |
53 | virtual UBool operator==(const BreakIterator& o) const { if(*this==o) return true; return false; } | |
54 | ||
55 | /* -- text modifying -- */ | |
56 | virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); } | |
57 | virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; } | |
58 | virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); } | |
59 | virtual void setText(const UnicodeString &text) { fDelegate->setText(text); } | |
60 | ||
61 | /* -- other functions that are just delegated -- */ | |
62 | virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); } | |
63 | virtual CharacterIterator& getText(void) const { return fDelegate->getText(); } | |
64 | ||
65 | /* -- ITERATION -- */ | |
66 | virtual int32_t first(void) { return fDelegate->first(); } | |
67 | virtual int32_t preceding(int32_t offset) { return fDelegate->preceding(offset); } | |
68 | virtual int32_t previous(void) { return fDelegate->previous(); } | |
69 | virtual UBool isBoundary(int32_t offset) { return fDelegate->isBoundary(offset); } | |
70 | virtual int32_t current(void) const { return fDelegate->current(); } | |
71 | ||
72 | virtual int32_t next(void); | |
73 | ||
74 | virtual int32_t next(int32_t n) { return fDelegate->next(n); } | |
75 | virtual int32_t following(int32_t offset) { return fDelegate->following(offset); } | |
76 | virtual int32_t last(void) { return fDelegate->last(); } | |
77 | ||
78 | }; | |
79 | ||
80 | ULISentenceBreakIterator::ULISentenceBreakIterator(const ULISentenceBreakIterator& other) | |
81 | : BreakIterator(other), fDelegate(other.fDelegate->clone()) | |
82 | { | |
83 | /* | |
84 | TODO: not able to clone Tries. Should be a refcounted hidden master instead. | |
85 | if(other.fBackwardsTrie.isValid()) { | |
86 | fBackwardsTrie.adoptInstead(other.fBackwardsTrie->clone()); | |
87 | } | |
88 | if(other.fForwardsPartialTrie.isValid()) { | |
89 | fForwardsPartialTrie.adoptInstead(other.fForwardsPartialTrie->clone()); | |
90 | } | |
91 | */ | |
92 | } | |
93 | ||
94 | ||
95 | ULISentenceBreakIterator::ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : | |
96 | BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), | |
97 | fDelegate(adopt), | |
98 | fBackwardsTrie(backwards), | |
99 | fForwardsPartialTrie(forwards) | |
100 | { | |
101 | // all set.. | |
102 | } | |
103 | ||
104 | int32_t ULISentenceBreakIterator::next() { | |
105 | int32_t n = fDelegate->next(); | |
106 | if(n == UBRK_DONE || // at end or | |
107 | fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions | |
108 | return n; | |
109 | } | |
110 | // OK, do we need to break here? | |
111 | UErrorCode status = U_ZERO_ERROR; | |
112 | // refresh text | |
113 | fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); | |
114 | //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); | |
115 | do { // outer loop runs once per underlying break (from fDelegate). | |
116 | // loops while 'n' points to an exception. | |
117 | utext_setNativeIndex(fText.getAlias(), n); // from n.. | |
118 | fBackwardsTrie->reset(); | |
119 | UChar32 uch; | |
120 | //if(debug2) u_printf(" n@ %d\n", n); | |
121 | // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") | |
122 | if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here?? | |
123 | // TODO only do this the 1st time? | |
124 | //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch); | |
125 | } else { | |
126 | //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch); | |
127 | uch = utext_next32(fText.getAlias()); | |
128 | //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch); | |
129 | } | |
130 | UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE; | |
131 | ||
132 | int32_t bestPosn = -1; | |
133 | int32_t bestValue = -1; | |
134 | ||
135 | while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and.. | |
136 | USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie | |
137 | if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far | |
138 | bestPosn = utext_getNativeIndex(fText.getAlias()); | |
139 | bestValue = fBackwardsTrie->getValue(); | |
140 | } | |
141 | //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias())); | |
142 | } | |
143 | ||
144 | if(USTRINGTRIE_MATCHES(r)) { // exact match? | |
145 | //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); | |
146 | bestValue = fBackwardsTrie->getValue(); | |
147 | bestPosn = utext_getNativeIndex(fText.getAlias()); | |
148 | //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); | |
149 | } | |
150 | ||
151 | if(bestPosn>=0) { | |
152 | //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue); | |
153 | ||
154 | //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? | |
155 | //int32_t bestValue = fBackwardsTrie->getValue(); | |
156 | ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue); | |
157 | ||
158 | if(bestValue == kMATCH) { // exact match! | |
159 | //if(debug2) u_printf(" exact backward match\n"); | |
160 | n = fDelegate->next(); // skip this one. Find the next lowerlevel break. | |
161 | if(n==UBRK_DONE) return n; | |
162 | continue; // See if the next is another exception. | |
163 | } else if(bestValue == kPARTIAL | |
164 | && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie | |
165 | //if(debug2) u_printf(" partial backward match\n"); | |
166 | // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie | |
167 | // to see if it matches something going forward. | |
168 | fForwardsPartialTrie->reset(); | |
169 | UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; | |
170 | utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. | |
171 | //if(debug2) u_printf("Retrying at %d\n", bestPosn); | |
172 | while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && | |
173 | USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) { | |
174 | //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias())); | |
175 | } | |
176 | if(USTRINGTRIE_MATCHES(rfwd)) { | |
177 | //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch); | |
178 | // only full matches here, nothing to check | |
179 | // skip the next: | |
180 | n = fDelegate->next(); | |
181 | if(n==UBRK_DONE) return n; | |
182 | continue; | |
183 | } else { | |
184 | //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch); | |
185 | // no match (no exception) -return the 'underlying' break | |
186 | return n; | |
187 | } | |
188 | } else { | |
189 | return n; // internal error and/or no forwards trie | |
190 | } | |
191 | } else { | |
192 | //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match | |
193 | return n; // No match - so exit. Not an exception. | |
194 | } | |
195 | } while(n != UBRK_DONE); | |
196 | return n; | |
197 | } | |
198 | ||
199 | U_NAMESPACE_END | |
200 | ||
201 | #if 0 | |
202 | // Would improve performance - but, platform issues. | |
203 | // for the 'set' | |
204 | namespace std { | |
205 | template <> struct hash<icu::UnicodeString> { | |
206 | size_t operator()( const UnicodeString& str ) const { | |
207 | return (size_t)str.hashCode(); | |
208 | } | |
209 | }; | |
210 | } | |
211 | #endif | |
212 | ||
213 | U_NAMESPACE_BEGIN | |
214 | ||
215 | class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { | |
216 | public: | |
217 | virtual ~SimpleFilteredBreakIteratorBuilder(); | |
218 | SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); | |
219 | SimpleFilteredBreakIteratorBuilder(); | |
220 | virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status); | |
221 | virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status); | |
222 | virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status); | |
223 | private: | |
224 | set<UnicodeString> fSet; | |
225 | }; | |
226 | ||
227 | SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() | |
228 | { | |
229 | } | |
230 | ||
231 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) | |
232 | : fSet() | |
233 | { | |
234 | if(U_SUCCESS(status)) { | |
235 | LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status)); | |
236 | LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &status)); | |
237 | LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &status)); | |
238 | if(U_FAILURE(status)) return; // leaves the builder empty, if you try to use it. | |
239 | ||
240 | LocalUResourceBundlePointer strs; | |
241 | UErrorCode subStatus = status; | |
242 | do { | |
243 | strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); | |
244 | if(strs.isValid() && U_SUCCESS(subStatus)) { | |
245 | UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); | |
246 | suppressBreakAfter(str, status); // load the string | |
247 | } | |
248 | } while (strs.isValid() && U_SUCCESS(subStatus)); | |
249 | if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { | |
250 | status = subStatus; | |
251 | } | |
252 | } | |
253 | } | |
254 | ||
255 | SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder() | |
256 | : fSet() | |
257 | { | |
258 | } | |
259 | ||
260 | UBool | |
261 | SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) | |
262 | { | |
263 | if( U_FAILURE(status) ) return FALSE; | |
264 | return fSet.insert(exception).second; | |
265 | } | |
266 | ||
267 | UBool | |
268 | SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) | |
269 | { | |
270 | if( U_FAILURE(status) ) return FALSE; | |
271 | return ((fSet.erase(exception)) != 0); | |
272 | } | |
273 | ||
274 | BreakIterator * | |
275 | SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { | |
276 | LocalPointer<BreakIterator> adopt(adoptBreakIterator); | |
277 | ||
278 | if(U_FAILURE(status)) { | |
279 | return NULL; | |
280 | } | |
281 | ||
282 | LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status)); | |
283 | LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status)); | |
284 | ||
285 | int32_t revCount = 0; | |
286 | int32_t fwdCount = 0; | |
287 | ||
288 | int32_t subCount = fSet.size(); | |
289 | LocalArray<UnicodeString> ustrs(new UnicodeString[subCount]); | |
290 | LocalArray<int> partials(new int[subCount]); | |
291 | ||
292 | LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. | |
293 | LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." | |
294 | ||
295 | int n=0; | |
296 | for ( set<UnicodeString>::iterator i = fSet.begin(); | |
297 | i != fSet.end(); | |
298 | i++) { | |
299 | const UnicodeString &abbr = *i; | |
300 | ustrs[n] = abbr; | |
301 | partials[n] = 0; // default: not partial | |
302 | n++; | |
303 | } | |
304 | // first pass - find partials. | |
305 | for(int i=0;i<subCount;i++) { | |
306 | int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations | |
307 | if(nn>-1 && (nn+1)!=ustrs[i].length()) { | |
308 | //if(true) u_printf("Is a partial: /%S/\n", ustrs[i].getTerminatedBuffer()); | |
309 | // is partial. | |
310 | // is it unique? | |
311 | int sameAs = -1; | |
312 | for(int j=0;j<subCount;j++) { | |
313 | if(j==i) continue; | |
314 | if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { | |
315 | //if(true) u_printf("Prefix match: /%S/ to %d\n", ustrs[j].getTerminatedBuffer(), nn+1); | |
316 | //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn | |
317 | if(partials[j]==0) { // hasn't been processed yet | |
318 | partials[j] = kSuppressInReverse | kAddToForward; | |
319 | //if(true) u_printf("Suppressing: /%S/\n", ustrs[j].getTerminatedBuffer()); | |
320 | } else if(partials[j] & kSuppressInReverse) { | |
321 | sameAs = j; // the other entry is already in the reverse table. | |
322 | } | |
323 | } | |
324 | } | |
325 | //if(debug2) u_printf("for partial /%S/ same=%d partials=%d\n", ustrs[i].getTerminatedBuffer(), sameAs, partials[i]); | |
326 | UnicodeString prefix(ustrs[i], 0, nn+1); | |
327 | if(sameAs == -1 && partials[i] == 0) { | |
328 | // first one - add the prefix to the reverse table. | |
329 | prefix.reverse(); | |
330 | builder->add(prefix, kPARTIAL, status); | |
331 | revCount++; | |
332 | //if(debug2) u_printf("Added Partial: /%S/ from /%S/ status=%s\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer(), u_errorName(status)); | |
333 | partials[i] = kSuppressInReverse | kAddToForward; | |
334 | } else { | |
335 | //if(debug2) u_printf(" // not adding partial for /%S/ from /%S/\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer()); | |
336 | } | |
337 | } | |
338 | } | |
339 | for(int i=0;i<subCount;i++) { | |
340 | if(partials[i]==0) { | |
341 | ustrs[i].reverse(); | |
342 | builder->add(ustrs[i], kMATCH, status); | |
343 | revCount++; | |
344 | //if(debug2) u_printf("Added: /%S/ status=%s\n", ustrs[i].getTerminatedBuffer(), u_errorName(status)); | |
345 | } else { | |
346 | //if(debug2) u_printf(" Adding fwd: /%S/\n", ustrs[i].getTerminatedBuffer()); | |
347 | ||
348 | // an optimization would be to only add the portion after the '.' | |
349 | // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, | |
350 | // instead of "Ph.D." since we already know the "Ph." part is a match. | |
351 | // would need the trie to be able to hold 0-length strings, though. | |
352 | builder2->add(ustrs[i], kMATCH, status); // forward | |
353 | fwdCount++; | |
354 | //ustrs[i].reverse(); | |
355 | ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); | |
356 | } | |
357 | } | |
358 | //if(debug) u_printf(" %s has %d abbrs.\n", fJSONSource.c_str(), subCount); | |
359 | ||
360 | if(revCount>0) { | |
361 | backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); | |
362 | if(U_FAILURE(status)) { | |
363 | //printf("Error %s building backwards\n", u_errorName(status)); | |
364 | return NULL; | |
365 | } | |
366 | } | |
367 | ||
368 | if(fwdCount>0) { | |
369 | forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); | |
370 | if(U_FAILURE(status)) { | |
371 | //printf("Error %s building forwards\n", u_errorName(status)); | |
372 | return NULL; | |
373 | } | |
374 | } | |
375 | ||
376 | return new ULISentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); | |
377 | } | |
378 | ||
379 | ||
380 | // ----------- | |
381 | ||
382 | FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { | |
383 | } | |
384 | ||
385 | FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { | |
386 | } | |
387 | ||
388 | FilteredBreakIteratorBuilder * | |
389 | FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { | |
390 | if(U_FAILURE(status)) return NULL; | |
391 | ||
392 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status)); | |
393 | if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR; | |
394 | return ret.orphan(); | |
395 | } | |
396 | ||
397 | FilteredBreakIteratorBuilder * | |
398 | FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) { | |
399 | if(U_FAILURE(status)) return NULL; | |
400 | ||
401 | LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder()); | |
402 | if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR; | |
403 | return ret.orphan(); | |
404 | } | |
405 | ||
406 | U_NAMESPACE_END | |
407 | ||
408 | #endif //#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION |