]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/dictbe.cpp
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / common / dictbe.cpp
1 /**
2 *******************************************************************************
3 * Copyright (C) 2006-2014, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *******************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_BREAK_ITERATION
11
12 #include "brkeng.h"
13 #include "dictbe.h"
14 #include "unicode/uniset.h"
15 #include "unicode/chariter.h"
16 #include "unicode/ubrk.h"
17 #include "uvector.h"
18 #include "uassert.h"
19 #include "unicode/normlzr.h"
20 #include "cmemory.h"
21 #include "dictionarydata.h"
22
23 U_NAMESPACE_BEGIN
24
25 /*
26 ******************************************************************
27 */
28
29 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
30 fTypes = breakTypes;
31 }
32
// Virtual destructor. Nothing to release here: fSet owns its own storage,
// and subclasses delete the dictionaries they adopted.
DictionaryBreakEngine::~DictionaryBreakEngine() {
}
35
36 UBool
37 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
38 return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
39 && fSet.contains(c));
40 }
41
// Locate the run of dictionary characters around the text's current position
// and hand that run to the subclass's divideUpDictionaryRange().
//   text       - iterator; its current position is the anchor of the scan
//   startPos   - lower bound when scanning in reverse
//   endPos     - upper bound when scanning forward
//   reverse    - scan direction from the anchor
//   breakType  - must be enabled in fTypes for any breaks to be produced
//   foundBreaks- output stack of break positions
// Returns the number of breaks found (0 if breakType is not serviced).
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                   int32_t startPos,
                                   int32_t endPos,
                                   UBool reverse,
                                   int32_t breakType,
                                   UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    // The span to break begins at the current position in the text, and
    // extends towards the start or end of the text, depending on 'reverse'.

    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        UBool isDict = fSet.contains(c);
        // Walk backwards while we remain inside the dictionary character set.
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        // If the walk stopped on a non-dictionary character, the range starts
        // one code point after it; otherwise it starts where the walk stopped.
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        // Walk forwards to the end of the run of dictionary characters.
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO: recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    // Only divide the range if this engine services the requested break type.
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        // Leave the iterator at the end of the scanned run.
        utext_setNativeIndex(text, current);
    }

    return result;
}
84
// Set the character set handled by this engine. Called by subclass
// constructors once their script-specific UnicodeSet has been built.
void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet = set;
    // Compact for caching
    fSet.compact();
}
91
92 /*
93 ******************************************************************
94 * PossibleWord
95 */
96
// Helper class for improving readability of the Thai/Lao/Khmer word break
// algorithm. Holds the set of dictionary words that start at one text
// position, and supports walking that set from longest to shortest.
// The implementation is completely inline.

// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
#define POSSIBLE_WORD_LIST_MAX 20

class PossibleWord {
private:
    // list of word candidate lengths, in increasing length order
    int32_t lengths[POSSIBLE_WORD_LIST_MAX];
    int32_t count; // Count of candidates
    int32_t prefix; // The longest match with a dictionary word
    int32_t offset; // Offset in the text of these candidates
    int mark; // Index into lengths[] of the preferred candidate
    int current; // Index into lengths[] of the candidate we're currently looking at

public:
    PossibleWord();
    ~PossibleWord();

    // Fill the list of candidates if needed, select the longest, and return the number found
    int candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );

    // Select the currently marked candidate, point after it in the text, and invalidate self
    int32_t acceptMarked( UText *text );

    // Back up from the current candidate to the next shorter one; return TRUE if that exists
    // and point the text after it
    UBool backUp( UText *text );

    // Return the longest prefix this candidate location shares with a dictionary word
    int32_t longestPrefix();

    // Mark the current candidate as the one we like
    void markCurrent();
};
134
135 inline
136 PossibleWord::PossibleWord() {
137 offset = -1;
138 }
139
// Destructor; no owned resources.
inline
PossibleWord::~PossibleWord() {
}
143
// Fill the candidate list for the current text position (if the position has
// changed since the last call), leave the text positioned after the longest
// candidate, and return the number of candidates found.
// On return, 'current' and 'mark' both index the longest candidate.
inline int
PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    // Cache: only re-query the dictionary when the position has changed.
    if (start != offset) {
        offset = start;
        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        // Position after the longest candidate (lengths[] is in increasing order).
        utext_setNativeIndex(text, start+lengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}
163
164 inline int32_t
165 PossibleWord::acceptMarked( UText *text ) {
166 utext_setNativeIndex(text, offset + lengths[mark]);
167 return lengths[mark];
168 }
169
170 inline UBool
171 PossibleWord::backUp( UText *text ) {
172 if (current > 0) {
173 utext_setNativeIndex(text, offset + lengths[--current]);
174 return TRUE;
175 }
176 return FALSE;
177 }
178
// Longest prefix (as reported by the last dictionary query) that this
// position shares with a dictionary word, even when no full word matched.
inline int32_t
PossibleWord::longestPrefix() {
    return prefix;
}
183
// Remember the current candidate as the preferred one for acceptMarked().
inline void
PossibleWord::markCurrent() {
    mark = current;
}
188
189 /*
190 ******************************************************************
191 * ThaiBreakEngine
192 */
193
194 // How many words in a row are "good enough"?
195 #define THAI_LOOKAHEAD 3
196
197 // Will not combine a non-word with a preceding dictionary word longer than this
198 #define THAI_ROOT_COMBINE_THRESHOLD 3
199
200 // Will not combine a non-word that shares at least this much prefix with a
201 // dictionary word, with a preceding word
202 #define THAI_PREFIX_COMBINE_THRESHOLD 3
203
204 // Ellision character
205 #define THAI_PAIYANNOI 0x0E2F
206
207 // Repeat character
208 #define THAI_MAIYAMOK 0x0E46
209
210 // Minimum word size
211 #define THAI_MIN_WORD 2
212
213 // Minimum number of characters for two words
214 #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
215
// Construct a Thai break engine. Adopts (takes ownership of and will delete)
// 'adoptDictionary'. Builds the character classes used by the heuristics in
// divideUpDictionaryRange(). Per ICU's error-code convention, the later
// applyPattern call is a no-op if 'status' is already a failure.
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);           // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);   // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);    // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);    // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}
240
// Destructor: releases the adopted dictionary.
ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;
}
244
// Divide up a range of known dictionary characters handled by this engine.
// Implements a greedy longest-match strategy with a two-word lookahead:
// at each position, prefer the candidate word that allows the most following
// dictionary words. Non-words are merged with an adjacent short word via a
// resynchronization scan. Break positions are pushed onto 'foundBreaks';
// the return value is the number of words found.
int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                          int32_t rangeStart,
                                          int32_t rangeEnd,
                                          UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate sets, indexed by wordsFound % THAI_LOOKAHEAD.
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                // Try each first-word candidate (longest first) and see whether a
                // second dictionary word follows it.
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    // NOTE(review): wordsMatched is reset to 1 each iteration, so this
                    // test is always true here; it looks like it was meant to mark the
                    // first word only once across iterations.
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
foundBest:
            // Accept whichever first-word candidate is currently marked.
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it withe the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && (wordLength == 0
                    || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A word-final character followed by a word-initial one is a
                    // plausible boundary; confirm against the dictionary.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add PAIYANNOI to word
                        uc = utext_current32(text); // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;            // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
425
426 /*
427 ******************************************************************
428 * LaoBreakEngine
429 */
430
431 // How many words in a row are "good enough"?
432 #define LAO_LOOKAHEAD 3
433
434 // Will not combine a non-word with a preceding dictionary word longer than this
435 #define LAO_ROOT_COMBINE_THRESHOLD 3
436
437 // Will not combine a non-word that shares at least this much prefix with a
438 // dictionary word, with a preceding word
439 #define LAO_PREFIX_COMBINE_THRESHOLD 3
440
441 // Minimum word size
442 #define LAO_MIN_WORD 2
443
444 // Minimum number of characters for two words
445 #define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)
446
// Construct a Lao break engine. Adopts (takes ownership of and will delete)
// 'adoptDictionary'. Builds the character classes used by the heuristics in
// divideUpDictionaryRange(). Per ICU's error-code convention, the later
// applyPattern call is a no-op if 'status' is already a failure.
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fLaoWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fLaoWordSet;
    fEndWordSet.remove(0x0EC0, 0x0EC4);   // prefix vowels
    fBeginWordSet.add(0x0E81, 0x0EAE);    // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.add(0x0EDC, 0x0EDD);    // digraph consonants (no Thai equivalent)
    fBeginWordSet.add(0x0EC0, 0x0EC4);    // prefix vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
}
468
// Destructor: releases the adopted dictionary.
LaoBreakEngine::~LaoBreakEngine() {
    delete fDictionary;
}
472
// Divide up a range of known dictionary characters handled by this engine.
// Same greedy longest-match-with-lookahead algorithm as the Thai engine
// (see ThaiBreakEngine::divideUpDictionaryRange), minus the suffix handling,
// which does not apply to Lao. Break positions are pushed onto 'foundBreaks';
// the return value is the number of words found.
int32_t
LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                         int32_t rangeStart,
                                         int32_t rangeEnd,
                                         UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate sets, indexed by wordsFound % LAO_LOOKAHEAD.
    PossibleWord words[LAO_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                // Try each first-word candidate (longest first) and see whether a
                // second dictionary word follows it.
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    // NOTE(review): wordsMatched is reset to 1 each iteration, so this
                    // test is always true here (same pattern as the Thai engine).
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
foundBest:
            // Accept whichever first-word candidate is currently marked.
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it withe the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && (wordLength == 0
                    || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A word-final character followed by a word-initial one is a
                    // plausible boundary; confirm against the dictionary.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO LAO

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
617
618 /*
619 ******************************************************************
620 * KhmerBreakEngine
621 */
622
623 // How many words in a row are "good enough"?
624 #define KHMER_LOOKAHEAD 3
625
626 // Will not combine a non-word with a preceding dictionary word longer than this
627 #define KHMER_ROOT_COMBINE_THRESHOLD 3
628
629 // Will not combine a non-word that shares at least this much prefix with a
630 // dictionary word, with a preceding word
631 #define KHMER_PREFIX_COMBINE_THRESHOLD 3
632
633 // Minimum word size
634 #define KHMER_MIN_WORD 2
635
636 // Minimum number of characters for two words
637 #define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)
638
// Construct a Khmer break engine. Adopts (takes ownership of and will delete)
// 'adoptDictionary'. Builds the character classes used by the heuristics in
// divideUpDictionaryRange(). The commented-out lines below preserve disabled
// tuning from the Thai engine this code was derived from. Per ICU's
// error-code convention, the later applyPattern call is a no-op if 'status'
// is already a failure.
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fKhmerWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fKhmerWordSet;
    fBeginWordSet.add(0x1780, 0x17B3);
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    fEndWordSet.remove(0x17D2);               // KHMER SIGN COENG that combines some following characters
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
//    fSuffixSet.add(THAI_PAIYANNOI);
//    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();
}
669
// Destructor: releases the adopted dictionary.
KhmerBreakEngine::~KhmerBreakEngine() {
    delete fDictionary;
}
673
// Divide up a range of known dictionary characters handled by this engine.
// Same greedy longest-match-with-lookahead algorithm as the Thai engine
// (see ThaiBreakEngine::divideUpDictionaryRange); the Thai-style suffix
// handling is retained below in commented-out form only. Break positions are
// pushed onto 'foundBreaks'; the return value is the number of words found.
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring buffer of candidate sets, indexed by wordsFound % KHMER_LOOKAHEAD.
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                // Try each first-word candidate (longest first) and see whether a
                // second dictionary word follows it.
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    // NOTE(review): wordsMatched is reset to 1 each iteration, so this
                    // test is always true here (same pattern as the Thai engine).
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            // Accept whichever first-word candidate is currently marked.
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && (wordLength == 0
                    || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A word-final character followed by a word-initial one is a
                    // plausible boundary; confirm against the dictionary.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text); // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
851
852 #if !UCONFIG_NO_NORMALIZATION
/*
 ******************************************************************
 * CjkBreakEngine
 */
static const uint32_t kuint32max = 0xFFFFFFFF;

// Construct a CJK break engine. Adopts (takes ownership of and will delete)
// 'adoptDictionary'. The character set serviced depends on 'type': Hangul
// syllables for Korean; Han + Katakana + Hiragana (plus the two prolonged
// sound marks) for Chinese/Japanese.
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { //Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
            setCharacters(cjSet);
        }
    }
}
881
// Destructor: releases the adopted dictionary.
CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
}
885
// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int kMaxKatakanaLength = 8;
static const int kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

// Penalty assigned to a katakana run of the given length. Lengths outside
// the table (including 0) take the maximum penalty of 8192.
static inline uint32_t getKatakanaCost(int wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                          = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    if (wordLength > kMaxKatakanaLength) {
        return 8192;
    }
    return katakanaCost[wordLength];
}
898
// TRUE for code units in the Katakana block (U+30A1..U+30FE, excluding the
// middle dot U+30FB) and the halfwidth Katakana forms (U+FF66..U+FF9F).
static inline bool isKatakana(uint16_t value) {
    if (value >= 0x30A1u && value <= 0x30FEu) {
        return value != 0x30FBu;
    }
    return value >= 0xFF66u && value <= 0xFF9fu;
}
903
904 // A very simple helper class to streamline the buffer handling in
905 // divideUpDictionaryRange.
906 template<class T, size_t N>
907 class AutoBuffer {
908 public:
909 AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
910 if (size > N) {
911 buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
912 capacity = size;
913 }
914 }
915 ~AutoBuffer() {
916 if (buffer != stackBuffer)
917 uprv_free(buffer);
918 }
919
920 T* elems() {
921 return buffer;
922 }
923
924 const T& operator[] (size_t i) const {
925 return buffer[i];
926 }
927
928 T& operator[] (size_t i) {
929 return buffer[i];
930 }
931
932 // resize without copy
933 void resize(size_t size) {
934 if (size <= capacity)
935 return;
936 if (buffer != stackBuffer)
937 uprv_free(buffer);
938 buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
939 capacity = size;
940 }
941
942 private:
943 T stackBuffer[N];
944 T* buffer;
945 AutoBuffer();
946 size_t capacity;
947 };
948
949
/*
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output UStack of int32_t break positions, in
 *                    ascending order
 * @return The number of breaks found
 */
// Divides up a run of CJK dictionary characters [rangeStart, rangeEnd) into
// words by dynamic programming: "snlp" appears to stand for a summed
// negative-log-probability-style cost (TODO confirm) accumulated from the
// dictionary's per-word values; the segmentation minimizing the total cost
// wins. Katakana runs get a special length-based cost since single-katakana
// words are rare. Breaks are pushed onto foundBreaks in ascending order and
// the number of breaks pushed is returned.
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    // Empty or inverted range: nothing to do.
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    // TODO: Use Normalizer2.
    UNormalizationMode norm_mode = UNORM_NFKC;
    // quickCheck is cheap but may answer "maybe"; fall back to the full
    // isNormalized test in that case.
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // charPositions[i] = UTF-16 index (into the raw extracted string) of the
    // i-th normalized character boundary; numChars counts code points.
    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Already NFKC: the mapping is the identity on code-point boundaries.
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        // Normalize, then walk the un-normalized text with a Normalizer so
        // getIndex() yields, per normalized character, its source position.
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            /* UChar32 uc = */ normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched. kuint32max marks "unreachable".
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i = 0; i <= numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // Per-position scratch for dictionary match values/lengths.
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int32_t i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        // Skip positions no segmentation can reach.
        if (bestSnlp[i] == kuint32max)
            continue;

        int32_t count;
        // limit maximum word length matched to size of current substring
        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary
        // starting with this charcter, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        // Relax: extend the best path ending at i by each candidate word.
        for (int j = 0; j < count; j++) {
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        // Only start the run-scan at the first katakana of a run
        // (is_prev_katakana guards against re-scanning inside a run).
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            // Runs of kMaxKatakanaGroupLength or more get no candidate here.
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    // TODO: Replace by UVector32.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        // Walk the prev[] chain backwards from the end of the range.
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_bdry[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_bdry and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
1143 #endif
1144
1145 U_NAMESPACE_END
1146
1147 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1148