]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /** |
2 | ******************************************************************************* | |
51004dcb A |
3 | * Copyright (C) 2006-2012, International Business Machines Corporation |
4 | * and others. All Rights Reserved. | |
73c04bcf A |
5 | ******************************************************************************* |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_BREAK_ITERATION | |
11 | ||
12 | #include "brkeng.h" | |
13 | #include "dictbe.h" | |
14 | #include "unicode/uniset.h" | |
15 | #include "unicode/chariter.h" | |
16 | #include "unicode/ubrk.h" | |
17 | #include "uvector.h" | |
51004dcb A |
18 | #include "uassert.h" |
19 | #include "unicode/normlzr.h" | |
20 | #include "cmemory.h" | |
21 | #include "dictionarydata.h" | |
73c04bcf A |
22 | |
23 | U_NAMESPACE_BEGIN | |
24 | ||
25 | /* | |
26 | ****************************************************************** | |
27 | */ | |
28 | ||
73c04bcf A |
29 | DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { |
30 | fTypes = breakTypes; | |
31 | } | |
32 | ||
// Virtual destructor; fSet cleans itself up (no owned raw resources here).
DictionaryBreakEngine::~DictionaryBreakEngine() {
}
35 | ||
36 | UBool | |
37 | DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { | |
38 | return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) | |
39 | && fSet.contains(c)); | |
40 | } | |
41 | ||
// Scan from the current text position for the contiguous run of characters
// belonging to this engine's set (fSet), then hand that run to the
// subclass's divideUpDictionaryRange() to locate word breaks.
//
//   text        iterator positioned at the point to start scanning
//   startPos    lower bound of the scan (used when reverse is TRUE)
//   endPos      upper bound of the scan (used when reverse is FALSE)
//   reverse     TRUE to scan backwards from the current position
//   breakType   which break type is being requested
//   foundBreaks output stack; break positions are pushed onto it
// Returns the number of breaks found, or 0 if breakType is not handled.
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                   int32_t startPos,
                                   int32_t endPos,
                                   UBool reverse,
                                   int32_t breakType,
                                   UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        // Walk backwards while characters remain in the dictionary set.
        UBool isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        // If the loop stopped on a non-set character, the range starts one
        // position after it; clamp to startPos.
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        // Walk forwards while characters remain in the dictionary set.
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO: recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    // Only divide up the range if this engine handles the requested break type.
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        // Leave the iterator at the end of the scanned run.
        utext_setNativeIndex(text, current);
    }

    return result;
}
81 | ||
82 | void | |
46f4442e | 83 | DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { |
73c04bcf | 84 | fSet = set; |
46f4442e A |
85 | // Compact for caching |
86 | fSet.compact(); | |
73c04bcf A |
87 | } |
88 | ||
73c04bcf A |
89 | /* |
90 | ****************************************************************** | |
91 | */ | |
92 | ||
93 | ||
94 | // Helper class for improving readability of the Thai word break | |
95 | // algorithm. The implementation is completely inline. | |
96 | ||
97 | // List size, limited by the maximum number of words in the dictionary | |
98 | // that form a nested sequence. | |
99 | #define POSSIBLE_WORD_LIST_MAX 20 | |
100 | ||
// Tracks the dictionary-word candidates that start at one text position.
// candidates() fills the list; the other methods navigate it. The list is
// keyed by `offset` so repeated calls at the same position reuse it.
class PossibleWord {
private:
    // list of word candidate lengths, in increasing length order
    int32_t lengths[POSSIBLE_WORD_LIST_MAX];
    int32_t count;      // Count of candidates
    int32_t prefix;     // The longest match with a dictionary word
    int32_t offset;     // Offset in the text of these candidates; -1 = list not filled
    int mark;           // Index (into lengths) of the preferred candidate
    int current;        // Index (into lengths) of the candidate we're currently looking at

public:
    PossibleWord();
    ~PossibleWord();

    // Fill the list of candidates if needed, select the longest, and return the number found
    int candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );

    // Select the currently marked candidate, point after it in the text, and invalidate self
    int32_t acceptMarked( UText *text );

    // Back up from the current candidate to the next shorter one; return TRUE if that exists
    // and point the text after it
    UBool backUp( UText *text );

    // Return the longest prefix this candidate location shares with a dictionary word
    int32_t longestPrefix();

    // Mark the current candidate as the one we like
    void markCurrent();
};
131 | ||
132 | inline | |
133 | PossibleWord::PossibleWord() { | |
134 | offset = -1; | |
135 | } | |
136 | ||
// Trivial destructor; PossibleWord owns no resources.
inline
PossibleWord::~PossibleWord() {
}
140 | ||
// Fill the candidate list for the current text position (if not already
// cached for this offset), position the text after the longest candidate,
// and return the number of candidates found. Also records in `prefix` the
// longest partial match with any dictionary word. If no candidate is found
// the text position is left unchanged.
inline int
PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        // Not cached for this position: query the dictionary.
        offset = start;
        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        // Advance past the longest candidate (lengths is in increasing order).
        utext_setNativeIndex(text, start+lengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}
160 | ||
161 | inline int32_t | |
162 | PossibleWord::acceptMarked( UText *text ) { | |
163 | utext_setNativeIndex(text, offset + lengths[mark]); | |
164 | return lengths[mark]; | |
165 | } | |
166 | ||
167 | inline UBool | |
168 | PossibleWord::backUp( UText *text ) { | |
169 | if (current > 0) { | |
170 | utext_setNativeIndex(text, offset + lengths[--current]); | |
171 | return TRUE; | |
172 | } | |
173 | return FALSE; | |
174 | } | |
175 | ||
// Return the longest prefix this candidate location shares with a
// dictionary word, as recorded by the last candidates() call.
inline int32_t
PossibleWord::longestPrefix() {
    return prefix;
}
180 | ||
// Remember the current candidate as the preferred one for acceptMarked().
inline void
PossibleWord::markCurrent() {
    mark = current;
}
185 | ||
186 | // How many words in a row are "good enough"? | |
187 | #define THAI_LOOKAHEAD 3 | |
188 | ||
189 | // Will not combine a non-word with a preceding dictionary word longer than this | |
190 | #define THAI_ROOT_COMBINE_THRESHOLD 3 | |
191 | ||
192 | // Will not combine a non-word that shares at least this much prefix with a | |
193 | // dictionary word, with a preceding word | |
194 | #define THAI_PREFIX_COMBINE_THRESHOLD 3 | |
195 | ||
// Elision character
197 | #define THAI_PAIYANNOI 0x0E2F | |
198 | ||
199 | // Repeat character | |
200 | #define THAI_MAIYAMOK 0x0E46 | |
201 | ||
202 | // Minimum word size | |
203 | #define THAI_MIN_WORD 2 | |
204 | ||
205 | // Minimum number of characters for two words | |
206 | #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2) | |
207 | ||
// Construct a Thai break engine over the given dictionary (adopted: deleted
// in the destructor). Handles UBRK_WORD and UBRK_LINE. Builds the character
// classes used by divideUpDictionaryRange(). On pattern failure, `status`
// carries the error and the engine's set is left empty.
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // All Thai characters with line-break class SA (South-East Asian).
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);               // include SPACE (U+0020) in the mark set
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);         // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);  // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);  // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}
232 | ||
// Destructor releases the adopted dictionary.
ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;
}
236 | ||
// Divide the run [rangeStart, rangeEnd) of Thai text into words, pushing each
// word-end position onto foundBreaks. Uses a greedy longest-match with up to
// THAI_LOOKAHEAD words of lookahead, a resynchronization heuristic for
// non-dictionary spans, and special handling for combining marks and the
// PAIYANNOI / MAIYAMOK suffix characters. Returns the number of words found.
int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate lists, indexed by wordsFound % THAI_LOOKAHEAD.
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                // Try a second word starting after the current first-word candidate.
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A plausible boundary: previous char can end a word, next can begin one.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): if this invocation pushed nothing, peeki()/popi() act on
    // whatever was already on foundBreaks (or an empty stack) -- confirm the
    // callers guarantee at least one push, or guard on wordsFound.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
417 | ||
418 | // How many words in a row are "good enough"? | |
419 | #define KHMER_LOOKAHEAD 3 | |
420 | ||
421 | // Will not combine a non-word with a preceding dictionary word longer than this | |
422 | #define KHMER_ROOT_COMBINE_THRESHOLD 3 | |
423 | ||
424 | // Will not combine a non-word that shares at least this much prefix with a | |
425 | // dictionary word, with a preceding word | |
426 | #define KHMER_PREFIX_COMBINE_THRESHOLD 3 | |
427 | ||
428 | // Minimum word size | |
429 | #define KHMER_MIN_WORD 2 | |
430 | ||
431 | // Minimum number of characters for two words | |
432 | #define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2) | |
433 | ||
// Construct a Khmer break engine over the given dictionary (adopted: deleted
// in the destructor). Handles UBRK_WORD and UBRK_LINE. Builds the character
// classes used by divideUpDictionaryRange(); the commented-out lines record
// candidate refinements carried over from the Thai engine.
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // All Khmer characters with line-break class SA (South-East Asian).
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fKhmerWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);               // include SPACE (U+0020) in the mark set
    fEndWordSet = fKhmerWordSet;
    fBeginWordSet.add(0x1780, 0x17B3);
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    fEndWordSet.remove(0x17D2);               // KHMER SIGN COENG that combines some following characters
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
//    fSuffixSet.add(THAI_PAIYANNOI);
//    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();
}
464 | ||
// Destructor releases the adopted dictionary.
KhmerBreakEngine::~KhmerBreakEngine() {
    delete fDictionary;
}
468 | ||
// Divide the run [rangeStart, rangeEnd) of Khmer text into words, pushing
// each word-end position onto foundBreaks. Same structure as the Thai
// engine: greedy longest-match with KHMER_LOOKAHEAD words of lookahead and
// a resynchronization heuristic; the suffix handling is retained only as
// commented-out code. Returns the number of words found.
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate lists, indexed by wordsFound % KHMER_LOOKAHEAD.
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                // Try a second word starting after the current first-word candidate.
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A plausible boundary: previous char can end a word, next can begin one.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): same caveat as the Thai engine -- peeki()/popi() are not
    // guarded against an empty stack; confirm callers guarantee a prior push.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
646 | ||
51004dcb A |
647 | #if !UCONFIG_NO_NORMALIZATION |
648 | /* | |
649 | ****************************************************************** | |
650 | * CjkBreakEngine | |
651 | */ | |
// Sentinel for "no path found yet" in the dynamic-programming cost arrays.
static const uint32_t kuint32max = 0xFFFFFFFF;
// Construct a CJK break engine over the given dictionary (adopted: deleted
// in the destructor). Handles UBRK_WORD only. The handled character set
// depends on `type`: Hangul syllables for Korean, otherwise the union of
// Han, Katakana (incl. halfwidth voicing marks), Hiragana and the two
// prolonged-sound marks.
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { //Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            cjSet.add(0xFF70);      // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            cjSet.add(0x30FC);      // KATAKANA-HIRAGANA PROLONGED SOUND MARK
            setCharacters(cjSet);
        }
    }
}
676 | ||
// Destructor releases the adopted dictionary.
CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
}
680 | ||
// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int kMaxKatakanaLength = 8;
static const int kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

// Cost (negative-log-probability style) of treating a katakana run of the
// given length as a single word; runs longer than kMaxKatakanaLength get
// the maximum table cost.
static inline uint32_t getKatakanaCost(int wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    if (wordLength > kMaxKatakanaLength) {
        return 8192;
    }
    return katakanaCost[wordLength];
}
693 | ||
// True if the UTF-16 code unit is a katakana character: the main Katakana
// block (U+30A1..U+30FE, excluding U+30FB KATAKANA MIDDLE DOT) or the
// halfwidth forms U+FF66..U+FF9F.
static inline bool isKatakana(uint16_t value) {
    if (value >= 0x30A1u && value <= 0x30FEu) {
        return value != 0x30FBu;
    }
    return value >= 0xFF66u && value <= 0xFF9fu;
}
698 | ||
699 | // A very simple helper class to streamline the buffer handling in | |
700 | // divideUpDictionaryRange. | |
701 | template<class T, size_t N> | |
702 | class AutoBuffer { | |
703 | public: | |
704 | AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { | |
705 | if (size > N) { | |
706 | buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); | |
707 | capacity = size; | |
708 | } | |
709 | } | |
710 | ~AutoBuffer() { | |
711 | if (buffer != stackBuffer) | |
712 | uprv_free(buffer); | |
713 | } | |
714 | ||
715 | T* elems() { | |
716 | return buffer; | |
717 | } | |
718 | ||
719 | const T& operator[] (size_t i) const { | |
720 | return buffer[i]; | |
721 | } | |
722 | ||
723 | T& operator[] (size_t i) { | |
724 | return buffer[i]; | |
725 | } | |
726 | ||
727 | // resize without copy | |
728 | void resize(size_t size) { | |
729 | if (size <= capacity) | |
730 | return; | |
731 | if (buffer != stackBuffer) | |
732 | uprv_free(buffer); | |
733 | buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); | |
734 | capacity = size; | |
735 | } | |
736 | ||
737 | private: | |
738 | T stackBuffer[N]; | |
739 | T* buffer; | |
740 | AutoBuffer(); | |
741 | size_t capacity; | |
742 | }; | |
743 | ||
744 | ||
/*
 * Divides up a range of dictionary (CJK) characters into words.
 *
 * The input range is NFKC-normalized first (when not already normalized),
 * and a map from normalized indices back to raw input indices is kept in
 * charPositions.  A dynamic-programming pass then finds the segmentation
 * of the normalized text with the smallest total cost ("snlp"), combining
 * dictionary match costs with two heuristics: an unmatched single
 * character is treated as a word with maximum cost, and a continuous run
 * of Katakana is offered as a candidate word with a length-based cost.
 *
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
int32_t 
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    // Empty or inverted range: nothing to segment.
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    // TODO: Use Normalizer2.
    UNormalizationMode norm_mode = UNORM_NFKC;
    // Quick check may answer "maybe"; fall back to the full check so an
    // already-normalized input can skip the normalization pass below.
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;   // code point count of the normalized text
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Input is already normalized: charPositions[k] is simply the
        // UTF-16 index of the k-th code point in the input itself.
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        // Normalize, then replay the normalization incrementally so that
        // charPositions[k] maps the k-th normalized code point back to an
        // index in the raw input.
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            /* UChar32 uc = */ normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;   // "unreachable" until some word ends here
    }

    // prev[i] is the index of the last CJK character in the previous word in 
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i = 0; i <= numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int32_t i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        // Skip positions no valid segmentation can reach.
        if (bestSnlp[i] == kuint32max)
            continue;

        int32_t count;
        // limit maximum word length matched to size of current substring
        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary 
        // starting with this character, treat character as a 1-character word 
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        // Relaxation step: extend the best path ending at i by each
        // candidate word found above.
        for (int j = 0; j < count; j++) {
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        // Only evaluate a run starting at its first Katakana character.
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            // Runs of kMaxKatakanaGroupLength or longer get no special cost.
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    // TODO: Replace by UVector32.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        // Walk the prev[] back-pointer chain from the end of the range.
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_bdry[] (indices in 
    // the normalized input string) back to indices in the raw input string
    // while reversing t_bdry and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
938 | #endif | |
939 | ||
73c04bcf A |
940 | U_NAMESPACE_END |
941 | ||
942 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
51004dcb | 943 |