// icuSources/common/dictbe.cpp (from Apple ICU-531.48)
/**
 *******************************************************************************
 * Copyright (C) 2006-2014, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
#include "uvector.h"
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"
#include "dictionarydata.h"

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 */

DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
    fTypes = breakTypes;
}

DictionaryBreakEngine::~DictionaryBreakEngine() {
}

UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
            && fSet.contains(c));
}
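// For example, an engine constructed with ((1 << UBRK_WORD) | (1 << UBRK_LINE)),
// as the Thai, Lao, and Khmer engines below are, reports that it handles a
// character only for word and line break types, and only when that character
// is in fSet.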

int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                   int32_t startPos,
                                   int32_t endPos,
                                   UBool reverse,
                                   int32_t breakType,
                                   UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    //   The span to break begins at the current position in the text, and
    //   extends towards the start or end of the text, depending on 'reverse'.

    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        UBool isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO: recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        utext_setNativeIndex(text, current);
    }

    return result;
}

void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    fSet = set;
    // Compact for caching
    fSet.compact();
}

/*
 ******************************************************************
 * PossibleWord
 */

// Helper class for improving readability of the Thai/Lao/Khmer word break
// algorithm. The implementation is completely inline.

// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
#define POSSIBLE_WORD_LIST_MAX 20

class PossibleWord {
private:
    // list of word candidate lengths, in increasing length order
    int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
    int32_t   count;      // Count of candidates
    int32_t   prefix;     // The longest match with a dictionary word
    int32_t   offset;     // Offset in the text of these candidates
    int       mark;       // The preferred candidate's offset
    int       current;    // The candidate we're currently looking at

public:
    PossibleWord();
    ~PossibleWord();

    // Fill the list of candidates if needed, select the longest, and return the number found
    int       candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );

    // Select the currently marked candidate, point after it in the text, and invalidate self
    int32_t   acceptMarked( UText *text );

    // Back up from the current candidate to the next shorter one; return TRUE if that exists
    // and point the text after it
    UBool     backUp( UText *text );

    // Return the longest prefix this candidate location shares with a dictionary word
    int32_t   longestPrefix();

    // Mark the current candidate as the one we like
    void      markCurrent();
};

inline
PossibleWord::PossibleWord() {
    offset = -1;
}

inline
PossibleWord::~PossibleWord() {
}

inline int
PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        offset = start;
        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        utext_setNativeIndex(text, start+lengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}

inline int32_t
PossibleWord::acceptMarked( UText *text ) {
    utext_setNativeIndex(text, offset + lengths[mark]);
    return lengths[mark];
}

inline UBool
PossibleWord::backUp( UText *text ) {
    if (current > 0) {
        utext_setNativeIndex(text, offset + lengths[--current]);
        return TRUE;
    }
    return FALSE;
}

inline int32_t
PossibleWord::longestPrefix() {
    return prefix;
}

inline void
PossibleWord::markCurrent() {
    mark = current;
}

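// A sketch of how the engines below use PossibleWord as a ring of lookahead
// slots (see divideUpDictionaryRange): candidates() fills a slot with the
// dictionary matches at the current position and moves the text past the
// longest one, markCurrent() remembers the candidate that led to the longest
// chain of following words, backUp() retries the next-shorter candidate, and
// acceptMarked() commits the marked length and repositions the text after it.
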
/*
 ******************************************************************
 * ThaiBreakEngine
 */

// How many words in a row are "good enough"?
#define THAI_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define THAI_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define THAI_PREFIX_COMBINE_THRESHOLD 3

// Elision character
#define THAI_PAIYANNOI 0x0E2F

// Repeat character
#define THAI_MAIYAMOK 0x0E46

// Minimum word size
#define THAI_MIN_WORD 2

// Minimum number of characters for two words
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)

ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
}

ThaiBreakEngine::~ThaiBreakEngine() {
    delete fDictionary;
}

int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                          int32_t rangeStart,
                                          int32_t rangeEnd,
                                          UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[THAI_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;                // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        utext_next32(text);
                        utext_next32(text);
                        wordLength += 1;                // Add MAIYAMOK to word
                    }
                    else {
                        // Restore prior position
                        utext_next32(text);
                    }
                }
            }
            else {
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
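
// A minimal sketch of how a caller might drive this engine (hypothetical;
// in ICU the break iterator invokes findBreaks internally). It assumes
// 'engine' was already constructed with an adopted DictionaryMatcher and
// 'thaiText' is a UnicodeString of Thai text:
//
//     UErrorCode status = U_ZERO_ERROR;
//     UText *ut = utext_openUnicodeString(NULL, &thaiText, &status);
//     UStack breaks(status);                     // collects boundary offsets
//     utext_setNativeIndex(ut, 0);               // findBreaks scans from here
//     int32_t words = engine->findBreaks(ut, 0, thaiText.length(),
//                                        FALSE /* forward */, UBRK_WORD, breaks);
//     // 'breaks' now holds the word boundary offsets found in the scanned run.
//     utext_close(ut);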

/*
 ******************************************************************
 * LaoBreakEngine
 */

// How many words in a row are "good enough"?
#define LAO_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define LAO_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define LAO_PREFIX_COMBINE_THRESHOLD 3

// Minimum word size
#define LAO_MIN_WORD 2

// Minimum number of characters for two words
#define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)

LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fLaoWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fLaoWordSet;
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
}

LaoBreakEngine::~LaoBreakEngine() {
    delete fDictionary;
}

int32_t
LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                         int32_t rangeStart,
                                         int32_t rangeEnd,
                                         UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[LAO_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO LAO

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

/*
 ******************************************************************
 * KhmerBreakEngine
 */

// How many words in a row are "good enough"?
#define KHMER_LOOKAHEAD 3

// Will not combine a non-word with a preceding dictionary word longer than this
#define KHMER_ROOT_COMBINE_THRESHOLD 3

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define KHMER_PREFIX_COMBINE_THRESHOLD 3

// Minimum word size
#define KHMER_MIN_WORD 2

// Minimum number of characters for two words
#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)

KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fKhmerWordSet);
    }
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fKhmerWordSet;
    fBeginWordSet.add(0x1780, 0x17B3);
    //fBeginWordSet.add(0x17A3, 0x17A4);    // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);   // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);           // Khmer independent vowel that can't end a word
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    //fEndWordSet.remove(0x17B6, 0x17C5);   // Remove dependent vowels
//    fEndWordSet.remove(0x0E31);           // MAI HAN-AKAT
//    fEndWordSet.remove(0x0E40, 0x0E44);   // SARA E through SARA AI MAIMALAI
//    fBeginWordSet.add(0x0E01, 0x0E2E);    // KO KAI through HO NOKHUK
//    fBeginWordSet.add(0x0E40, 0x0E44);    // SARA E through SARA AI MAIMALAI
//    fSuffixSet.add(THAI_PAIYANNOI);
//    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();
}

KhmerBreakEngine::~KhmerBreakEngine() {
    delete fDictionary;
}

int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;                // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;                // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

#if !UCONFIG_NO_NORMALIZATION
/*
 ******************************************************************
 * CjkBreakEngine
 */
static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { //Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            cjSet.add(0xFF70);   // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            cjSet.add(0x30FC);   // KATAKANA-HIRAGANA PROLONGED SOUND MARK
            setCharacters(cjSet);
        }
    }
}

CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
}

// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int kMaxKatakanaLength = 8;
static const int kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

static inline uint32_t getKatakanaCost(int wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                          = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
}
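
// Note on the cost scale: the dictionary values accumulated in bestSnlp[]
// below appear to act as scaled negative-log-probability costs ("snlp"), so a
// lower running total marks a more plausible segmentation. maxSnlp (255) is
// the worst single-character cost, used when a character has no dictionary
// match, and the katakanaCost table above feeds the same scale.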

static inline bool isKatakana(uint16_t value) {
    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
            (value >= 0xFF66u && value <= 0xFF9fu);
}

// A very simple helper class to streamline the buffer handling in
// divideUpDictionaryRange.
template<class T, size_t N>
class AutoBuffer {
public:
    AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
        if (size > N) {
            buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
            capacity = size;
        }
    }
    ~AutoBuffer() {
        if (buffer != stackBuffer)
            uprv_free(buffer);
    }

    T* elems() {
        return buffer;
    }

    const T& operator[] (size_t i) const {
        return buffer[i];
    }

    T& operator[] (size_t i) {
        return buffer[i];
    }

    // resize without copy
    void resize(size_t size) {
        if (size <= capacity)
            return;
        if (buffer != stackBuffer)
            uprv_free(buffer);
        buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
        capacity = size;
    }

private:
    T stackBuffer[N];
    T* buffer;
    AutoBuffer();
    size_t capacity;
};
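
// Usage sketch: AutoBuffer<UChar, 80> buf(n) stays on the stack when n <= 80
// and falls back to a single uprv_malloc() otherwise; buf.elems() exposes the
// raw pointer for C APIs such as utext_extract(). Note that resize() does not
// preserve existing contents, which is why the code below only resizes a
// buffer before filling it.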

/*
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
                                         int32_t rangeStart,
                                         int32_t rangeEnd,
                                         UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    // TODO: Use Normalizer2.
    UNormalizationMode norm_mode = UNORM_NFKC;
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            /* UChar32 uc = */ normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }
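    // For example, if the raw input holds a halfwidth katakana letter plus a
    // voicing mark that NFKC composes into one code point, the normalized text
    // is shorter than the raw text; charPositions[k] then records the raw
    // UTF-16 offset (relative to rangeStart) at which the k-th normalized
    // character ends, so breaks computed on normalized indices can be mapped
    // back to raw offsets when they are pushed onto foundBreaks.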

    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i = 0; i <= numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
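    // Concretely, for each reachable position i and each dictionary match of
    // length len starting there, the loop below relaxes
    //     bestSnlp[i + len] = min(bestSnlp[i + len], bestSnlp[i] + cost)
    // and records prev[i + len] = i whenever the minimum improves; the best
    // segmentation is recovered later by walking prev[] back from numChars.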
    bool is_prev_katakana = false;
    for (int32_t i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        if (bestSnlp[i] == kuint32max)
            continue;

        int32_t count;
        // limit maximum word length matched to size of current substring
        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        for (int j = 0; j < count; j++) {
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese, a single-character Katakana word is pretty rare, so we
        // apply the following heuristic to Katakana: any continuous run of
        // Katakana characters is considered a candidate word with a default
        // cost specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    // TODO: Replace by UVector32.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_bdry[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_bdry and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
#endif

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */