]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /** |
2 | ******************************************************************************* | |
4388f060 A |
3 | * Copyright (C) 2006-2008,2011, International Business Machines Corporation * |
4 | * and others. All Rights Reserved. * | |
73c04bcf A |
5 | ******************************************************************************* |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_BREAK_ITERATION | |
11 | ||
12 | #include "brkeng.h" | |
13 | #include "dictbe.h" | |
14 | #include "unicode/uniset.h" | |
15 | #include "unicode/chariter.h" | |
16 | #include "unicode/ubrk.h" | |
17 | #include "uvector.h" | |
18 | #include "triedict.h" | |
19 | ||
20 | U_NAMESPACE_BEGIN | |
21 | ||
22 | /* | |
23 | ****************************************************************** | |
24 | */ | |
25 | ||
26 | /*DictionaryBreakEngine::DictionaryBreakEngine() { | |
27 | fTypes = 0; | |
28 | }*/ | |
29 | ||
30 | DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { | |
31 | fTypes = breakTypes; | |
32 | } | |
33 | ||
34 | DictionaryBreakEngine::~DictionaryBreakEngine() { | |
35 | } | |
36 | ||
37 | UBool | |
38 | DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { | |
39 | return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) | |
40 | && fSet.contains(c)); | |
41 | } | |
42 | ||
43 | int32_t | |
44 | DictionaryBreakEngine::findBreaks( UText *text, | |
45 | int32_t startPos, | |
46 | int32_t endPos, | |
47 | UBool reverse, | |
48 | int32_t breakType, | |
49 | UStack &foundBreaks ) const { | |
50 | int32_t result = 0; | |
51 | ||
52 | // Find the span of characters included in the set. | |
53 | int32_t start = (int32_t)utext_getNativeIndex(text); | |
54 | int32_t current; | |
55 | int32_t rangeStart; | |
56 | int32_t rangeEnd; | |
57 | UChar32 c = utext_current32(text); | |
58 | if (reverse) { | |
59 | UBool isDict = fSet.contains(c); | |
60 | while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { | |
61 | c = utext_previous32(text); | |
62 | isDict = fSet.contains(c); | |
63 | } | |
64 | rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1); | |
65 | rangeEnd = start + 1; | |
66 | } | |
67 | else { | |
68 | while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { | |
69 | utext_next32(text); // TODO: recast loop for postincrement | |
70 | c = utext_current32(text); | |
71 | } | |
72 | rangeStart = start; | |
73 | rangeEnd = current; | |
74 | } | |
75 | if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { | |
76 | result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); | |
77 | utext_setNativeIndex(text, current); | |
78 | } | |
79 | ||
80 | return result; | |
81 | } | |
82 | ||
83 | void | |
46f4442e | 84 | DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { |
73c04bcf | 85 | fSet = set; |
46f4442e A |
86 | // Compact for caching |
87 | fSet.compact(); | |
73c04bcf A |
88 | } |
89 | ||
90 | /*void | |
91 | DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) { | |
92 | fTypes = breakTypes; | |
93 | }*/ | |
94 | ||
95 | /* | |
96 | ****************************************************************** | |
97 | */ | |
98 | ||
99 | ||
100 | // Helper class for improving readability of the Thai word break | |
101 | // algorithm. The implementation is completely inline. | |
102 | ||
103 | // List size, limited by the maximum number of words in the dictionary | |
104 | // that form a nested sequence. | |
105 | #define POSSIBLE_WORD_LIST_MAX 20 | |
106 | ||
107 | class PossibleWord { | |
108 | private: | |
109 | // list of word candidate lengths, in increasing length order | |
110 | int32_t lengths[POSSIBLE_WORD_LIST_MAX]; | |
111 | int count; // Count of candidates | |
112 | int32_t prefix; // The longest match with a dictionary word | |
113 | int32_t offset; // Offset in the text of these candidates | |
114 | int mark; // The preferred candidate's offset | |
115 | int current; // The candidate we're currently looking at | |
116 | ||
117 | public: | |
118 | PossibleWord(); | |
119 | ~PossibleWord(); | |
120 | ||
121 | // Fill the list of candidates if needed, select the longest, and return the number found | |
122 | int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ); | |
123 | ||
124 | // Select the currently marked candidate, point after it in the text, and invalidate self | |
125 | int32_t acceptMarked( UText *text ); | |
126 | ||
127 | // Back up from the current candidate to the next shorter one; return TRUE if that exists | |
128 | // and point the text after it | |
129 | UBool backUp( UText *text ); | |
130 | ||
131 | // Return the longest prefix this candidate location shares with a dictionary word | |
132 | int32_t longestPrefix(); | |
133 | ||
134 | // Mark the current candidate as the one we like | |
135 | void markCurrent(); | |
136 | }; | |
137 | ||
138 | inline | |
139 | PossibleWord::PossibleWord() { | |
140 | offset = -1; | |
141 | } | |
142 | ||
143 | inline | |
144 | PossibleWord::~PossibleWord() { | |
145 | } | |
146 | ||
147 | inline int | |
148 | PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) { | |
149 | // TODO: If getIndex is too slow, use offset < 0 and add discardAll() | |
150 | int32_t start = (int32_t)utext_getNativeIndex(text); | |
151 | if (start != offset) { | |
152 | offset = start; | |
153 | prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0])); | |
154 | // Dictionary leaves text after longest prefix, not longest word. Back up. | |
155 | if (count <= 0) { | |
156 | utext_setNativeIndex(text, start); | |
157 | } | |
158 | } | |
159 | if (count > 0) { | |
160 | utext_setNativeIndex(text, start+lengths[count-1]); | |
161 | } | |
162 | current = count-1; | |
163 | mark = current; | |
164 | return count; | |
165 | } | |
166 | ||
167 | inline int32_t | |
168 | PossibleWord::acceptMarked( UText *text ) { | |
169 | utext_setNativeIndex(text, offset + lengths[mark]); | |
170 | return lengths[mark]; | |
171 | } | |
172 | ||
173 | inline UBool | |
174 | PossibleWord::backUp( UText *text ) { | |
175 | if (current > 0) { | |
176 | utext_setNativeIndex(text, offset + lengths[--current]); | |
177 | return TRUE; | |
178 | } | |
179 | return FALSE; | |
180 | } | |
181 | ||
182 | inline int32_t | |
183 | PossibleWord::longestPrefix() { | |
184 | return prefix; | |
185 | } | |
186 | ||
187 | inline void | |
188 | PossibleWord::markCurrent() { | |
189 | mark = current; | |
190 | } | |
191 | ||
// Tuning constants used by ThaiBreakEngine.

// How many words in a row are "good enough"?
// Also the number of PossibleWord slots the Thai engine cycles through.
#define THAI_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this.
// NOTE: the check in divideUpDictionaryRange is `wordLength < THAI_ROOT_COMBINE_THRESHOLD`,
// so a preceding word of exactly this length is not combined either.
#define THAI_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define THAI_PREFIX_COMBINE_THRESHOLD 3

// Elision character
#define THAI_PAIYANNOI 0x0E2F

// Repeat character
#define THAI_MAIYAMOK 0x0E46

// Minimum word size
#define THAI_MIN_WORD 2

// Minimum number of characters for two words; ranges shorter than this are
// not divided at all.
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
213 | ||
214 | ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) | |
215 | : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), | |
216 | fDictionary(adoptDictionary) | |
217 | { | |
218 | fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); | |
219 | if (U_SUCCESS(status)) { | |
220 | setCharacters(fThaiWordSet); | |
221 | } | |
222 | fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); | |
46f4442e | 223 | fMarkSet.add(0x0020); |
73c04bcf A |
224 | fEndWordSet = fThaiWordSet; |
225 | fEndWordSet.remove(0x0E31); // MAI HAN-AKAT | |
226 | fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI | |
227 | fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK | |
228 | fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI | |
229 | fSuffixSet.add(THAI_PAIYANNOI); | |
230 | fSuffixSet.add(THAI_MAIYAMOK); | |
46f4442e A |
231 | |
232 | // Compact for caching. | |
233 | fMarkSet.compact(); | |
234 | fEndWordSet.compact(); | |
235 | fBeginWordSet.compact(); | |
236 | fSuffixSet.compact(); | |
73c04bcf A |
237 | } |
238 | ||
239 | ThaiBreakEngine::~ThaiBreakEngine() { | |
240 | delete fDictionary; | |
241 | } | |
242 | ||
/**
 * Divide the range [rangeStart, rangeEnd) of text into words using the
 * dictionary, pushing the end position of each word found onto foundBreaks.
 *
 * At each position the dictionary is asked for candidate words. With several
 * candidates, the one that allows the most following dictionary words (up to
 * THAI_LOOKAHEAD in total) is preferred. Text that matches no dictionary word
 * is scanned for a plausible boundary and either combined with a short
 * preceding word or treated as a word of its own. Trailing marks and the
 * suffix characters PAIYANNOI and MAIYAMOK are attached to the preceding word.
 *
 * @param text        Text being broken; its iterator position is modified.
 * @param rangeStart  Native index of the first character of the range.
 * @param rangeEnd    Native index just past the last character of the range.
 * @param foundBreaks Stack that receives the break positions.
 * @return The number of words found, not counting one whose break at or
 *         beyond rangeEnd was removed again; 0 if the range is shorter than
 *         THAI_MIN_WORD_SPAN.
 */
int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                          int32_t rangeStart,
                                          int32_t rangeEnd,
                                          UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Candidate slots, reused cyclically: slot wordsFound%THAI_LOOKAHEAD is the
    // word at the current position, the next slots hold the look-ahead words.
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    // Each iteration starts at `current` and determines one word of length
    // wordLength (possibly 0). The loop stops on error or at the end of the range.
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            // Try each first-word candidate, longest first.
            do {
                int wordsMatched = 1;
                if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
foundBest:
            // Commit to whichever first-word candidate was marked last.
            wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                // Advance one character at a time; pc is the character just passed,
                // uc the one now under the iterator.
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        // candidates() may have moved the text; put it back where the scan is.
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    // Attach PAIYANNOI only if the character before it is not itself a suffix character.
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        // NOTE(review): adds 1 rather than a native-index delta; presumably
                        // relies on the BMP-only assumption noted above - confirm.
                        wordLength += 1;            // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    // Attach MAIYAMOK only if it does not directly follow another MAIYAMOK.
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
424 | ||
// Tuning constants used by KhmerBreakEngine.

// How many words in a row are "good enough"?
// Also the number of PossibleWord slots the Khmer engine cycles through.
#define KHMER_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this.
// NOTE: the check in divideUpDictionaryRange is `wordLength < KHMER_ROOT_COMBINE_THRESHOLD`,
// so a preceding word of exactly this length is not combined either.
#define KHMER_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define KHMER_PREFIX_COMBINE_THRESHOLD 3

// Minimum word size
#define KHMER_MIN_WORD 2

// Minimum number of characters for two words; ranges shorter than this are
// not divided at all.
#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)
440 | ||
441 | KhmerBreakEngine::KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) | |
442 | : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), | |
443 | fDictionary(adoptDictionary) | |
444 | { | |
445 | fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); | |
446 | if (U_SUCCESS(status)) { | |
447 | setCharacters(fKhmerWordSet); | |
448 | } | |
449 | fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); | |
450 | fMarkSet.add(0x0020); | |
451 | fEndWordSet = fKhmerWordSet; | |
452 | fBeginWordSet.add(0x1780, 0x17B3); | |
453 | //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels | |
454 | //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word | |
455 | //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word | |
456 | fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters | |
457 | //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels | |
458 | // fEndWordSet.remove(0x0E31); // MAI HAN-AKAT | |
459 | // fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI | |
460 | // fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK | |
461 | // fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI | |
462 | // fSuffixSet.add(THAI_PAIYANNOI); | |
463 | // fSuffixSet.add(THAI_MAIYAMOK); | |
464 | ||
465 | // Compact for caching. | |
466 | fMarkSet.compact(); | |
467 | fEndWordSet.compact(); | |
468 | fBeginWordSet.compact(); | |
469 | // fSuffixSet.compact(); | |
470 | } | |
471 | ||
472 | KhmerBreakEngine::~KhmerBreakEngine() { | |
473 | delete fDictionary; | |
474 | } | |
475 | ||
/**
 * Divide the range [rangeStart, rangeEnd) of text into words using the
 * dictionary, pushing the end position of each word found onto foundBreaks.
 *
 * At each position the dictionary is asked for candidate words. With several
 * candidates, the one that allows the most following dictionary words (up to
 * KHMER_LOOKAHEAD in total) is preferred. Text that matches no dictionary word
 * is scanned for a plausible boundary and either combined with a short
 * preceding word or treated as a word of its own. Trailing marks are attached
 * to the preceding word. The suffix handling present in the Thai engine is
 * commented out here.
 *
 * @param text        Text being broken; its iterator position is modified.
 * @param rangeStart  Native index of the first character of the range.
 * @param rangeEnd    Native index just past the last character of the range.
 * @param foundBreaks Stack that receives the break positions.
 * @return The number of words found, not counting one whose break at or
 *         beyond rangeEnd was removed again; 0 if the range is shorter than
 *         KHMER_MIN_WORD_SPAN.
 */
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Candidate slots, reused cyclically: slot wordsFound%KHMER_LOOKAHEAD is the
    // word at the current position, the next slots hold the look-ahead words.
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    // Each iteration starts at `current` and determines one word of length
    // wordLength (possibly 0). The loop stops on error or at the end of the range.
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            // Try each first-word candidate, longest first.
            do {
                int wordsMatched = 1;
                if (words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound+2)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound+1)%KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound%KHMER_LOOKAHEAD].backUp(text));
foundBest:
            // Commit to whichever first-word candidate was marked last.
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                // Advance one character at a time; pc is the character just passed,
                // uc the one now under the iterator.
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        // candidates() may have moved the text; put it back where the scan is.
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // (Disabled for Khmer: the block below is kept commented out.)
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
653 | ||
654 | U_NAMESPACE_END | |
655 | ||
656 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |