]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /** |
2 | ******************************************************************************* | |
57a6839d | 3 | * Copyright (C) 2006-2014, International Business Machines Corporation |
51004dcb | 4 | * and others. All Rights Reserved. |
73c04bcf A |
5 | ******************************************************************************* |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_BREAK_ITERATION | |
11 | ||
12 | #include "brkeng.h" | |
13 | #include "dictbe.h" | |
14 | #include "unicode/uniset.h" | |
15 | #include "unicode/chariter.h" | |
16 | #include "unicode/ubrk.h" | |
17 | #include "uvector.h" | |
51004dcb A |
18 | #include "uassert.h" |
19 | #include "unicode/normlzr.h" | |
20 | #include "cmemory.h" | |
21 | #include "dictionarydata.h" | |
73c04bcf A |
22 | |
23 | U_NAMESPACE_BEGIN | |
24 | ||
25 | /* | |
26 | ****************************************************************** | |
27 | */ | |
28 | ||
73c04bcf A |
29 | DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { |
30 | fTypes = breakTypes; | |
31 | } | |
32 | ||
// Destructor. No resources are owned directly here; subclass destructors
// delete the dictionaries they adopted.
DictionaryBreakEngine::~DictionaryBreakEngine() {
}
35 | ||
36 | UBool | |
37 | DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { | |
38 | return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) | |
39 | && fSet.contains(c)); | |
40 | } | |
41 | ||
/*
 * Locate the contiguous run of fSet (dictionary) characters around the
 * current iterator position, then hand that run to the subclass's
 * divideUpDictionaryRange(). Returns the number of breaks found (0 if this
 * engine does not handle breakType).
 */
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                   int32_t startPos,
                                   int32_t endPos,
                                   UBool reverse,
                                   int32_t breakType,
                                   UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    // The span to break begins at the current position in the text, and
    // extends towards the start or end of the text, depending on 'reverse'.

    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        // Walk backwards while still inside the dictionary-character set,
        // not passing startPos.
        UBool   isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        // If the scan stopped on a non-dictionary char, the range starts one
        // code point after it; clamp to startPos.
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        // Walk forwards while still inside the dictionary-character set,
        // not passing endPos.
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO: recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    // Same break-type gate as handles(): only divide the range if this
    // engine is configured for breakType.
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        utext_setNativeIndex(text, current);
    }

    return result;
}
84 | ||
// Record (by copy) the set of characters this engine handles; consulted by
// handles() and by the span scan in findBreaks().
void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet = set;
    // Compact for caching
    fSet.compact();
}
91 | ||
73c04bcf A |
92 | /* |
93 | ****************************************************************** | |
57a6839d | 94 | * PossibleWord |
73c04bcf A |
95 | */ |
96 | ||
57a6839d | 97 | // Helper class for improving readability of the Thai/Lao/Khmer word break |
73c04bcf A |
98 | // algorithm. The implementation is completely inline. |
99 | ||
100 | // List size, limited by the maximum number of words in the dictionary | |
101 | // that form a nested sequence. | |
102 | #define POSSIBLE_WORD_LIST_MAX 20 | |
103 | ||
// Caches the dictionary-word candidates that start at one text offset.
// A single instance is reused across offsets; 'offset' keys the cache.
class PossibleWord {
private:
    // list of word candidate lengths, in increasing length order
    int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
    int32_t   count;      // Count of candidates
    int32_t   prefix;     // The longest match with a dictionary word
    int32_t   offset;     // Offset in the text of these candidates
    int       mark;       // Index in lengths[] of the preferred candidate
    int       current;    // Index in lengths[] of the candidate we're currently looking at

public:
    PossibleWord();
    ~PossibleWord();

    // Fill the list of candidates if needed, select the longest, and return the number found
    int       candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );

    // Select the currently marked candidate, point after it in the text, and invalidate self
    int32_t   acceptMarked( UText *text );

    // Back up from the current candidate to the next shorter one; return TRUE if that exists
    // and point the text after it
    UBool     backUp( UText *text );

    // Return the longest prefix this candidate location shares with a dictionary word
    int32_t   longestPrefix();

    // Mark the current candidate as the one we like
    void      markCurrent();
};
134 | ||
135 | inline | |
136 | PossibleWord::PossibleWord() { | |
137 | offset = -1; | |
138 | } | |
139 | ||
// Destructor; nothing owned, nothing to release.
inline
PossibleWord::~PossibleWord() {
}
143 | ||
// Find all dictionary words starting at the current text position, caching
// the result by offset so repeated queries at the same position skip the
// dictionary. Leaves the text iterator after the longest candidate (if any),
// selects that longest candidate as both 'current' and 'mark', and returns
// the number of candidates found.
inline int
PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        offset = start;
        // matches() fills lengths[] / count (presumably via the reference
        // parameter) and returns the longest prefix matched — confirm
        // against DictionaryMatcher.
        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        utext_setNativeIndex(text, start+lengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}
163 | ||
164 | inline int32_t | |
165 | PossibleWord::acceptMarked( UText *text ) { | |
166 | utext_setNativeIndex(text, offset + lengths[mark]); | |
167 | return lengths[mark]; | |
168 | } | |
169 | ||
170 | inline UBool | |
171 | PossibleWord::backUp( UText *text ) { | |
172 | if (current > 0) { | |
173 | utext_setNativeIndex(text, offset + lengths[--current]); | |
174 | return TRUE; | |
175 | } | |
176 | return FALSE; | |
177 | } | |
178 | ||
// Longest dictionary-word prefix recorded by the most recent candidates() call.
inline int32_t
PossibleWord::longestPrefix() {
    return prefix;
}
183 | ||
// Remember the current candidate as the preferred one for acceptMarked().
inline void
PossibleWord::markCurrent() {
    mark = current;
}
188 | ||
57a6839d A |
189 | /* |
190 | ****************************************************************** | |
191 | * ThaiBreakEngine | |
192 | */ | |
193 | ||
73c04bcf A |
194 | // How many words in a row are "good enough"? |
195 | #define THAI_LOOKAHEAD 3 | |
196 | ||
197 | // Will not combine a non-word with a preceding dictionary word longer than this | |
198 | #define THAI_ROOT_COMBINE_THRESHOLD 3 | |
199 | ||
200 | // Will not combine a non-word that shares at least this much prefix with a | |
201 | // dictionary word, with a preceding word | |
202 | #define THAI_PREFIX_COMBINE_THRESHOLD 3 | |
203 | ||
204 | // Elision character | |
205 | #define THAI_PAIYANNOI 0x0E2F | |
206 | ||
207 | // Repeat character | |
208 | #define THAI_MAIYAMOK 0x0E46 | |
209 | ||
210 | // Minimum word size | |
211 | #define THAI_MIN_WORD 2 | |
212 | ||
213 | // Minimum number of characters for two words | |
214 | #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2) | |
215 | ||
// Construct a Thai break engine, adopting (taking ownership of) the given
// dictionary. On failure 'status' is set; subsequent ICU calls taking the
// same status are presumed no-ops per ICU error convention — the code relies
// on that rather than re-checking after each call.
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine handles: Thai script, dictionary line-break class.
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);   // SPACE is scanned over like a mark in divideUpDictionaryRange()
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}
240 | ||
// Destructor. Deletes the dictionary adopted in the constructor.
ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;
}
244 | ||
/*
 * Divide a run of Thai dictionary characters [rangeStart, rangeEnd) into
 * words, pushing one break position per word onto foundBreaks and returning
 * the number of words found. Uses up to THAI_LOOKAHEAD PossibleWord slots to
 * prefer the segmentation that carries the scan forward the most words, with
 * heuristic resynchronization across non-dictionary runs and special
 * handling of the PAIYANNOI / MAIYAMOK suffix characters.
 */
int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[THAI_LOOKAHEAD];   // ring buffer indexed by wordsFound % THAI_LOOKAHEAD
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A plausible boundary: previous char can end a word and this
                    // one can begin a word.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): if no words were found (and foundBreaks came in empty),
    // this peeks an empty stack — presumably benign, but a
    // foundBreaks.size() != 0 guard would be safer; confirm UStack semantics.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
425 | ||
57a6839d A |
426 | /* |
427 | ****************************************************************** | |
428 | * LaoBreakEngine | |
429 | */ | |
430 | ||
431 | // How many words in a row are "good enough"? | |
432 | #define LAO_LOOKAHEAD 3 | |
433 | ||
434 | // Will not combine a non-word with a preceding dictionary word longer than this | |
435 | #define LAO_ROOT_COMBINE_THRESHOLD 3 | |
436 | ||
437 | // Will not combine a non-word that shares at least this much prefix with a | |
438 | // dictionary word, with a preceding word | |
439 | #define LAO_PREFIX_COMBINE_THRESHOLD 3 | |
440 | ||
441 | // Minimum word size | |
442 | #define LAO_MIN_WORD 2 | |
443 | ||
444 | // Minimum number of characters for two words | |
445 | #define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2) | |
446 | ||
// Construct a Lao break engine, adopting (taking ownership of) the given
// dictionary. Mirrors the Thai constructor but with Lao character classes
// and no suffix set.
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine handles: Lao script, dictionary line-break class.
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fLaoWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);   // SPACE is scanned over like a mark in divideUpDictionaryRange()
    fEndWordSet = fLaoWordSet;
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
}
468 | ||
// Destructor. Deletes the dictionary adopted in the constructor.
LaoBreakEngine::~LaoBreakEngine() {
    delete fDictionary;
}
472 | ||
/*
 * Divide a run of Lao dictionary characters [rangeStart, rangeEnd) into
 * words, pushing one break position per word onto foundBreaks and returning
 * the number of words found. Same lookahead/resynchronization algorithm as
 * the Thai engine, minus the suffix-character handling.
 */
int32_t
LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[LAO_LOOKAHEAD];   // ring buffer indexed by wordsFound % LAO_LOOKAHEAD
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A plausible boundary: previous char can end a word and this
                    // one can begin a word.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO LAO

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): if no words were found (and foundBreaks came in empty),
    // this peeks an empty stack — presumably benign, but a
    // foundBreaks.size() != 0 guard would be safer; confirm UStack semantics.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
617 | ||
618 | /* | |
619 | ****************************************************************** | |
620 | * KhmerBreakEngine | |
621 | */ | |
622 | ||
4388f060 A |
623 | // How many words in a row are "good enough"? |
624 | #define KHMER_LOOKAHEAD 3 | |
625 | ||
626 | // Will not combine a non-word with a preceding dictionary word longer than this | |
627 | #define KHMER_ROOT_COMBINE_THRESHOLD 3 | |
628 | ||
629 | // Will not combine a non-word that shares at least this much prefix with a | |
630 | // dictionary word, with a preceding word | |
631 | #define KHMER_PREFIX_COMBINE_THRESHOLD 3 | |
632 | ||
633 | // Minimum word size | |
634 | #define KHMER_MIN_WORD 2 | |
635 | ||
636 | // Minimum number of characters for two words | |
637 | #define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2) | |
638 | ||
51004dcb A |
// Construct a Khmer break engine, adopting (taking ownership of) the given
// dictionary. Modeled on the Thai constructor; the commented-out lines below
// are Thai-specific tweaks kept for reference but not applicable to Khmer.
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine handles: Khmer script, dictionary line-break class.
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fKhmerWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);   // SPACE is scanned over like a mark in divideUpDictionaryRange()
    fEndWordSet = fKhmerWordSet;
    fBeginWordSet.add(0x1780, 0x17B3);
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
    // fEndWordSet.remove(0x0E31);            // MAI HAN-AKAT
    // fEndWordSet.remove(0x0E40, 0x0E44);    // SARA E through SARA AI MAIMALAI
    // fBeginWordSet.add(0x0E01, 0x0E2E);     // KO KAI through HO NOKHUK
    // fBeginWordSet.add(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    // fSuffixSet.add(THAI_PAIYANNOI);
    // fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    // fSuffixSet.compact();
}
669 | ||
// Destructor. Deletes the dictionary adopted in the constructor.
KhmerBreakEngine::~KhmerBreakEngine() {
    delete fDictionary;
}
673 | ||
674 | int32_t | |
675 | KhmerBreakEngine::divideUpDictionaryRange( UText *text, | |
676 | int32_t rangeStart, | |
677 | int32_t rangeEnd, | |
678 | UStack &foundBreaks ) const { | |
679 | if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { | |
680 | return 0; // Not enough characters for two words | |
681 | } | |
682 | ||
683 | uint32_t wordsFound = 0; | |
684 | int32_t wordLength; | |
685 | int32_t current; | |
686 | UErrorCode status = U_ZERO_ERROR; | |
687 | PossibleWord words[KHMER_LOOKAHEAD]; | |
688 | UChar32 uc; | |
689 | ||
690 | utext_setNativeIndex(text, rangeStart); | |
691 | ||
692 | while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { | |
693 | wordLength = 0; | |
694 | ||
695 | // Look for candidate words at the current position | |
696 | int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); | |
697 | ||
698 | // If we found exactly one, use that | |
699 | if (candidates == 1) { | |
700 | wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); | |
701 | wordsFound += 1; | |
702 | } | |
703 | ||
704 | // If there was more than one, see which one can take us forward the most words | |
705 | else if (candidates > 1) { | |
706 | // If we're already at the end of the range, we're done | |
707 | if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | |
708 | goto foundBest; | |
709 | } | |
710 | do { | |
711 | int wordsMatched = 1; | |
51004dcb | 712 | if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { |
4388f060 A |
713 | if (wordsMatched < 2) { |
714 | // Followed by another dictionary word; mark first word as a good candidate | |
51004dcb | 715 | words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); |
4388f060 A |
716 | wordsMatched = 2; |
717 | } | |
718 | ||
719 | // If we're already at the end of the range, we're done | |
720 | if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | |
721 | goto foundBest; | |
722 | } | |
723 | ||
724 | // See if any of the possible second words is followed by a third word | |
725 | do { | |
726 | // If we find a third word, stop right away | |
51004dcb A |
727 | if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { |
728 | words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); | |
4388f060 A |
729 | goto foundBest; |
730 | } | |
731 | } | |
51004dcb | 732 | while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); |
4388f060 A |
733 | } |
734 | } | |
51004dcb | 735 | while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text)); |
4388f060 | 736 | foundBest: |
51004dcb | 737 | wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); |
4388f060 A |
738 | wordsFound += 1; |
739 | } | |
740 | ||
741 | // We come here after having either found a word or not. We look ahead to the | |
742 | // next word. If it's not a dictionary word, we will combine it with the word we | |
743 | // just found (if there is one), but only if the preceding word does not exceed | |
744 | // the threshold. | |
745 | // The text iterator should now be positioned at the end of the word we found. | |
746 | if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) { | |
747 | // if it is a dictionary word, do nothing. If it isn't, then if there is | |
748 | // no preceding word, or the non-word shares less than the minimum threshold | |
749 | // of characters with a dictionary word, then scan to resynchronize | |
51004dcb | 750 | if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 |
4388f060 | 751 | && (wordLength == 0 |
51004dcb | 752 | || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { |
4388f060 A |
753 | // Look for a plausible word boundary |
754 | //TODO: This section will need a rework for UText. | |
755 | int32_t remaining = rangeEnd - (current+wordLength); | |
756 | UChar32 pc = utext_current32(text); | |
757 | int32_t chars = 0; | |
758 | for (;;) { | |
759 | utext_next32(text); | |
760 | uc = utext_current32(text); | |
761 | // TODO: Here we're counting on the fact that the SA languages are all | |
762 | // in the BMP. This should get fixed with the UText rework. | |
763 | chars += 1; | |
764 | if (--remaining <= 0) { | |
765 | break; | |
766 | } | |
767 | if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { | |
768 | // Maybe. See if it's in the dictionary. | |
51004dcb | 769 | int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); |
4388f060 A |
770 | utext_setNativeIndex(text, current+wordLength+chars); |
771 | if (candidates > 0) { | |
772 | break; | |
773 | } | |
774 | } | |
775 | pc = uc; | |
776 | } | |
777 | ||
778 | // Bump the word count if there wasn't already one | |
779 | if (wordLength <= 0) { | |
780 | wordsFound += 1; | |
781 | } | |
782 | ||
783 | // Update the length with the passed-over characters | |
784 | wordLength += chars; | |
785 | } | |
786 | else { | |
787 | // Back up to where we were for next iteration | |
788 | utext_setNativeIndex(text, current+wordLength); | |
789 | } | |
790 | } | |
791 | ||
792 | // Never stop before a combining mark. | |
793 | int32_t currPos; | |
794 | while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { | |
795 | utext_next32(text); | |
796 | wordLength += (int32_t)utext_getNativeIndex(text) - currPos; | |
797 | } | |
798 | ||
799 | // Look ahead for possible suffixes if a dictionary word does not follow. | |
800 | // We do this in code rather than using a rule so that the heuristic | |
801 | // resynch continues to function. For example, one of the suffix characters | |
802 | // could be a typo in the middle of a word. | |
803 | // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { | |
804 | // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 | |
805 | // && fSuffixSet.contains(uc = utext_current32(text))) { | |
806 | // if (uc == KHMER_PAIYANNOI) { | |
807 | // if (!fSuffixSet.contains(utext_previous32(text))) { | |
808 | // // Skip over previous end and PAIYANNOI | |
809 | // utext_next32(text); | |
810 | // utext_next32(text); | |
811 | // wordLength += 1; // Add PAIYANNOI to word | |
812 | // uc = utext_current32(text); // Fetch next character | |
813 | // } | |
814 | // else { | |
815 | // // Restore prior position | |
816 | // utext_next32(text); | |
817 | // } | |
818 | // } | |
819 | // if (uc == KHMER_MAIYAMOK) { | |
820 | // if (utext_previous32(text) != KHMER_MAIYAMOK) { | |
821 | // // Skip over previous end and MAIYAMOK | |
822 | // utext_next32(text); | |
823 | // utext_next32(text); | |
824 | // wordLength += 1; // Add MAIYAMOK to word | |
825 | // } | |
826 | // else { | |
827 | // // Restore prior position | |
828 | // utext_next32(text); | |
829 | // } | |
830 | // } | |
831 | // } | |
832 | // else { | |
833 | // utext_setNativeIndex(text, current+wordLength); | |
834 | // } | |
835 | // } | |
836 | ||
73c04bcf A |
837 | // Did we find a word on this iteration? If so, push it on the break stack |
838 | if (wordLength > 0) { | |
839 | foundBreaks.push((current+wordLength), status); | |
840 | } | |
841 | } | |
842 | ||
843 | // Don't return a break for the end of the dictionary range if there is one there. | |
844 | if (foundBreaks.peeki() >= rangeEnd) { | |
845 | (void) foundBreaks.popi(); | |
846 | wordsFound -= 1; | |
847 | } | |
848 | ||
849 | return wordsFound; | |
850 | } | |
851 | ||
51004dcb A |
852 | #if !UCONFIG_NO_NORMALIZATION |
853 | /* | |
854 | ****************************************************************** | |
855 | * CjkBreakEngine | |
856 | */ | |
857 | static const uint32_t kuint32max = 0xFFFFFFFF; | |
858 | CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status) | |
859 | : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) { | |
860 | // Korean dictionary only includes Hangul syllables | |
861 | fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); | |
862 | fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); | |
863 | fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); | |
864 | fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); | |
865 | ||
866 | if (U_SUCCESS(status)) { | |
867 | // handle Korean and Japanese/Chinese using different dictionaries | |
868 | if (type == kKorean) { | |
869 | setCharacters(fHangulWordSet); | |
870 | } else { //Chinese and Japanese | |
871 | UnicodeSet cjSet; | |
872 | cjSet.addAll(fHanWordSet); | |
873 | cjSet.addAll(fKatakanaWordSet); | |
874 | cjSet.addAll(fHiraganaWordSet); | |
57a6839d A |
875 | cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK |
876 | cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK | |
51004dcb A |
877 | setCharacters(cjSet); |
878 | } | |
879 | } | |
880 | } | |
881 | ||
CjkBreakEngine::~CjkBreakEngine(){
    // The dictionary was adopted in the constructor; this engine owns it.
    delete fDictionary;
}
885 | ||
// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int kMaxKatakanaLength = 8;
static const int kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

/**
 * Default segmentation cost for a run of katakana of the given length.
 * Index 0 of the table doubles as the out-of-range penalty, so a run
 * longer than kMaxKatakanaLength (or a non-positive length, which would
 * otherwise index out of bounds) gets the maximum cost.
 */
static inline uint32_t getKatakanaCost(int wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    // Use katakanaCost[0] as the shared "unlikely" cost instead of a second
    // hard-coded 8192; also guard negative lengths against UB.
    return (wordLength < 0 || wordLength > kMaxKatakanaLength)
            ? katakanaCost[0] : katakanaCost[wordLength];
}
898 | ||
// True for code units treated as Katakana by the run-grouping heuristic:
// the main Katakana block U+30A1..U+30FE excluding the middle dot U+30FB,
// plus the halfwidth forms U+FF66..U+FF9F.
static inline bool isKatakana(uint16_t value) {
    if (value >= 0x30A1u && value <= 0x30FEu) {
        return value != 0x30FBu;
    }
    return value >= 0xFF66u && value <= 0xFF9Fu;
}
903 | ||
904 | // A very simple helper class to streamline the buffer handling in | |
905 | // divideUpDictionaryRange. | |
906 | template<class T, size_t N> | |
907 | class AutoBuffer { | |
908 | public: | |
909 | AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { | |
910 | if (size > N) { | |
911 | buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); | |
912 | capacity = size; | |
913 | } | |
914 | } | |
915 | ~AutoBuffer() { | |
916 | if (buffer != stackBuffer) | |
917 | uprv_free(buffer); | |
918 | } | |
919 | ||
920 | T* elems() { | |
921 | return buffer; | |
922 | } | |
923 | ||
924 | const T& operator[] (size_t i) const { | |
925 | return buffer[i]; | |
926 | } | |
927 | ||
928 | T& operator[] (size_t i) { | |
929 | return buffer[i]; | |
930 | } | |
931 | ||
932 | // resize without copy | |
933 | void resize(size_t size) { | |
934 | if (size <= capacity) | |
935 | return; | |
936 | if (buffer != stackBuffer) | |
937 | uprv_free(buffer); | |
938 | buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); | |
939 | capacity = size; | |
940 | } | |
941 | ||
942 | private: | |
943 | T stackBuffer[N]; | |
944 | T* buffer; | |
945 | AutoBuffer(); | |
946 | size_t capacity; | |
947 | }; | |
948 | ||
949 | ||
/*
 * Divide up a range of CJK dictionary characters handled by this engine.
 *
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output UStack; break positions are pushed in ascending order
 * @return The number of breaks found
 */
// Segment a run of CJK text with a dynamic program that minimizes the total
// word cost ("snlp" — presumably sum of negative log probabilities; smaller
// means a more likely segmentation — TODO confirm against dictionary docs).
// The text is NFKC-normalized first; charPositions maps normalized indices
// back to raw input offsets so the breaks pushed to foundBreaks are in the
// caller's coordinate space.
int32_t 
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    // Empty or inverted range: nothing to segment.
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    // TODO: Use Normalizer2.
    UNormalizationMode norm_mode = UNORM_NFKC;
    // quickCheck is cheap but may answer "maybe"; fall back to the full
    // isNormalized check in that case.
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // charPositions[i] = raw-input offset of the i-th normalized character
    // boundary; entry 0 is always 0 and entry numChars is the total length.
    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Already normalized: the mapping is the identity over code points
        // (moveIndex32 steps one code point, so surrogate pairs count once).
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        // Normalize, then walk the raw input with an incremental Normalizer
        // to record, for each normalized character, where it came from.
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            /* UChar32 uc = */ normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched. kuint32max marks positions
    // no segmentation has reached yet.
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in 
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i = 0; i <= numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // Scratch arrays for DictionaryMatcher::matches: per-candidate costs and
    // lengths. At most maxWordSize candidates are requested per position.
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int32_t i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        // Skip positions no segmentation can reach.
        if (bestSnlp[i] == kuint32max)
            continue;

        int32_t count;
        // limit maximum word length matched to size of current substring
        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary 
        // starting with this character, treat character as a 1-character word 
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        // (Relies on matches() returning candidates sorted by length, shortest
        // first, so lengths[0] == 1 iff a single-character word matched —
        // TODO confirm against DictionaryMatcher contract.)
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        // Relax: extend the best path ending at i by each candidate word.
        for (int j = 0; j < count; j++) {
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        // Only treat the run as a candidate when i is the start of the run.
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            // Runs of kMaxKatakanaGroupLength or more get no candidate edge.
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    // TODO: Replace by UVector32.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        // Walk the prev[] chain backwards from the end of the range.
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_bdry[] (indices in 
    // the normalized input string) back to indices in the raw input string
    // while reversing t_bdry and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
1143 | #endif | |
1144 | ||
73c04bcf A |
1145 | U_NAMESPACE_END |
1146 | ||
1147 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
51004dcb | 1148 |