2 *******************************************************************************
3 * Copyright (C) 2006-2016, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_BREAK_ITERATION
14 #include "unicode/uniset.h"
15 #include "unicode/chariter.h"
16 #include "unicode/ubrk.h"
20 #include "unicode/normlzr.h"
22 #include "dictionarydata.h"
27 ******************************************************************
30 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes
) {
34 DictionaryBreakEngine::~DictionaryBreakEngine() {
38 DictionaryBreakEngine::handles(UChar32 c
, int32_t breakType
) const {
39 return (breakType
>= 0 && breakType
< 32 && (((uint32_t)1 << breakType
) & fTypes
)
44 DictionaryBreakEngine::findBreaks( UText
*text
,
49 UStack
&foundBreaks
) const {
52 // Find the span of characters included in the set.
53 // The span to break begins at the current position in the text, and
54 // extends towards the start or end of the text, depending on 'reverse'.
56 int32_t start
= (int32_t)utext_getNativeIndex(text
);
60 UChar32 c
= utext_current32(text
);
62 UBool isDict
= fSet
.contains(c
);
63 while((current
= (int32_t)utext_getNativeIndex(text
)) > startPos
&& isDict
) {
64 c
= utext_previous32(text
);
65 isDict
= fSet
.contains(c
);
67 if (current
< startPos
) {
68 rangeStart
= startPos
;
73 rangeStart
= (int32_t)utext_getNativeIndex(text
);
76 // rangeEnd = start + 1;
77 utext_setNativeIndex(text
, start
);
79 rangeEnd
= (int32_t)utext_getNativeIndex(text
);
82 while((current
= (int32_t)utext_getNativeIndex(text
)) < endPos
&& fSet
.contains(c
)) {
83 utext_next32(text
); // TODO: recast loop for postincrement
84 c
= utext_current32(text
);
89 if (breakType
>= 0 && breakType
< 32 && (((uint32_t)1 << breakType
) & fTypes
)) {
90 result
= divideUpDictionaryRange(text
, rangeStart
, rangeEnd
, foundBreaks
);
91 utext_setNativeIndex(text
, current
);
98 DictionaryBreakEngine::setCharacters( const UnicodeSet
&set
) {
100 // Compact for caching
105 ******************************************************************
109 // Helper class for improving readability of the Thai/Lao/Khmer word break
110 // algorithm. The implementation is completely inline.
112 // List size, limited by the maximum number of words in the dictionary
113 // that form a nested sequence.
114 static const int32_t POSSIBLE_WORD_LIST_MAX
= 20;
118 // list of word candidate lengths, in increasing length order
119 // TODO: bytes would be sufficient for word lengths.
120 int32_t count
; // Count of candidates
121 int32_t prefix
; // The longest match with a dictionary word
122 int32_t offset
; // Offset in the text of these candidates
123 int32_t mark
; // The preferred candidate's offset
124 int32_t current
; // The candidate we're currently looking at
125 int32_t cuLengths
[POSSIBLE_WORD_LIST_MAX
]; // Word Lengths, in code units.
126 int32_t cpLengths
[POSSIBLE_WORD_LIST_MAX
]; // Word Lengths, in code points.
129 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
132 // Fill the list of candidates if needed, select the longest, and return the number found
133 int32_t candidates( UText
*text
, DictionaryMatcher
*dict
, int32_t rangeEnd
);
135 // Select the currently marked candidate, point after it in the text, and invalidate self
136 int32_t acceptMarked( UText
*text
);
138 // Back up from the current candidate to the next shorter one; return TRUE if that exists
139 // and point the text after it
140 UBool
backUp( UText
*text
);
142 // Return the longest prefix this candidate location shares with a dictionary word
143 // Return value is in code points.
144 int32_t longestPrefix() { return prefix
; };
146 // Mark the current candidate as the one we like
147 void markCurrent() { mark
= current
; };
149 // Get length in code points of the marked word.
150 int32_t markedCPLength() { return cpLengths
[mark
]; };
154 int32_t PossibleWord::candidates( UText
*text
, DictionaryMatcher
*dict
, int32_t rangeEnd
) {
155 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
156 int32_t start
= (int32_t)utext_getNativeIndex(text
);
157 if (start
!= offset
) {
159 count
= dict
->matches(text
, rangeEnd
-start
, UPRV_LENGTHOF(cuLengths
), cuLengths
, cpLengths
, NULL
, &prefix
);
160 // Dictionary leaves text after longest prefix, not longest word. Back up.
162 utext_setNativeIndex(text
, start
);
166 utext_setNativeIndex(text
, start
+cuLengths
[count
-1]);
174 PossibleWord::acceptMarked( UText
*text
) {
175 utext_setNativeIndex(text
, offset
+ cuLengths
[mark
]);
176 return cuLengths
[mark
];
181 PossibleWord::backUp( UText
*text
) {
183 utext_setNativeIndex(text
, offset
+ cuLengths
[--current
]);
190 ******************************************************************
194 // How many words in a row are "good enough"?
195 static const int32_t THAI_LOOKAHEAD
= 3;
197 // Will not combine a non-word with a preceding dictionary word longer than this
198 static const int32_t THAI_ROOT_COMBINE_THRESHOLD
= 3;
200 // Will not combine a non-word that shares at least this much prefix with a
201 // dictionary word, with a preceding word
202 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD
= 3;
204 // Ellision character
205 static const int32_t THAI_PAIYANNOI
= 0x0E2F;
208 static const int32_t THAI_MAIYAMOK
= 0x0E46;
211 static const int32_t THAI_MIN_WORD
= 2;
213 // Minimum number of characters for two words
214 static const int32_t THAI_MIN_WORD_SPAN
= THAI_MIN_WORD
* 2;
216 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
)
217 : DictionaryBreakEngine((1<<UBRK_WORD
) | (1<<UBRK_LINE
)),
218 fDictionary(adoptDictionary
)
220 fThaiWordSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status
);
221 if (U_SUCCESS(status
)) {
222 setCharacters(fThaiWordSet
);
224 fMarkSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status
);
225 fMarkSet
.add(0x0020);
226 fEndWordSet
= fThaiWordSet
;
227 fEndWordSet
.remove(0x0E31); // MAI HAN-AKAT
228 fEndWordSet
.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
229 fBeginWordSet
.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
230 fBeginWordSet
.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
231 fSuffixSet
.add(THAI_PAIYANNOI
);
232 fSuffixSet
.add(THAI_MAIYAMOK
);
234 // Compact for caching.
236 fEndWordSet
.compact();
237 fBeginWordSet
.compact();
238 fSuffixSet
.compact();
241 ThaiBreakEngine::~ThaiBreakEngine() {
246 ThaiBreakEngine::divideUpDictionaryRange( UText
*text
,
249 UStack
&foundBreaks
) const {
250 utext_setNativeIndex(text
, rangeStart
);
251 utext_moveIndex32(text
, THAI_MIN_WORD_SPAN
);
252 if (utext_getNativeIndex(text
) >= rangeEnd
) {
253 return 0; // Not enough characters for two words
255 utext_setNativeIndex(text
, rangeStart
);
258 uint32_t wordsFound
= 0;
259 int32_t cpWordLength
= 0; // Word Length in Code Points.
260 int32_t cuWordLength
= 0; // Word length in code units (UText native indexing)
262 UErrorCode status
= U_ZERO_ERROR
;
263 PossibleWord words
[THAI_LOOKAHEAD
];
265 utext_setNativeIndex(text
, rangeStart
);
267 while (U_SUCCESS(status
) && (current
= (int32_t)utext_getNativeIndex(text
)) < rangeEnd
) {
271 // Look for candidate words at the current position
272 int32_t candidates
= words
[wordsFound%THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
);
274 // If we found exactly one, use that
275 if (candidates
== 1) {
276 cuWordLength
= words
[wordsFound
% THAI_LOOKAHEAD
].acceptMarked(text
);
277 cpWordLength
= words
[wordsFound
% THAI_LOOKAHEAD
].markedCPLength();
280 // If there was more than one, see which one can take us forward the most words
281 else if (candidates
> 1) {
282 // If we're already at the end of the range, we're done
283 if ((int32_t)utext_getNativeIndex(text
) >= rangeEnd
) {
287 int32_t wordsMatched
= 1;
288 if (words
[(wordsFound
+ 1) % THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) > 0) {
289 if (wordsMatched
< 2) {
290 // Followed by another dictionary word; mark first word as a good candidate
291 words
[wordsFound%THAI_LOOKAHEAD
].markCurrent();
295 // If we're already at the end of the range, we're done
296 if ((int32_t)utext_getNativeIndex(text
) >= rangeEnd
) {
300 // See if any of the possible second words is followed by a third word
302 // If we find a third word, stop right away
303 if (words
[(wordsFound
+ 2) % THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
)) {
304 words
[wordsFound
% THAI_LOOKAHEAD
].markCurrent();
308 while (words
[(wordsFound
+ 1) % THAI_LOOKAHEAD
].backUp(text
));
311 while (words
[wordsFound
% THAI_LOOKAHEAD
].backUp(text
));
313 // Set UText position to after the accepted word.
314 cuWordLength
= words
[wordsFound
% THAI_LOOKAHEAD
].acceptMarked(text
);
315 cpWordLength
= words
[wordsFound
% THAI_LOOKAHEAD
].markedCPLength();
319 // We come here after having either found a word or not. We look ahead to the
320 // next word. If it's not a dictionary word, we will combine it with the word we
321 // just found (if there is one), but only if the preceding word does not exceed
323 // The text iterator should now be positioned at the end of the word we found.
326 if ((int32_t)utext_getNativeIndex(text
) < rangeEnd
&& cpWordLength
< THAI_ROOT_COMBINE_THRESHOLD
) {
327 // if it is a dictionary word, do nothing. If it isn't, then if there is
328 // no preceding word, or the non-word shares less than the minimum threshold
329 // of characters with a dictionary word, then scan to resynchronize
330 if (words
[wordsFound
% THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) <= 0
331 && (cuWordLength
== 0
332 || words
[wordsFound%THAI_LOOKAHEAD
].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD
)) {
333 // Look for a plausible word boundary
334 int32_t remaining
= rangeEnd
- (current
+cuWordLength
);
338 int32_t pcIndex
= (int32_t)utext_getNativeIndex(text
);
339 pc
= utext_next32(text
);
340 int32_t pcSize
= (int32_t)utext_getNativeIndex(text
) - pcIndex
;
343 if (remaining
<= 0) {
346 uc
= utext_current32(text
);
347 if (fEndWordSet
.contains(pc
) && fBeginWordSet
.contains(uc
)) {
348 // Maybe. See if it's in the dictionary.
349 // NOTE: In the original Apple code, checked that the next
350 // two characters after uc were not 0x0E4C THANTHAKHAT before
351 // checking the dictionary. That is just a performance filter,
352 // but it's not clear it's faster than checking the trie.
353 int32_t candidates
= words
[(wordsFound
+ 1) % THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
);
354 utext_setNativeIndex(text
, current
+ cuWordLength
+ chars
);
355 if (candidates
> 0) {
361 // Bump the word count if there wasn't already one
362 if (cuWordLength
<= 0) {
366 // Update the length with the passed-over characters
367 cuWordLength
+= chars
;
370 // Back up to where we were for next iteration
371 utext_setNativeIndex(text
, current
+cuWordLength
);
375 // Never stop before a combining mark.
377 while ((currPos
= (int32_t)utext_getNativeIndex(text
)) < rangeEnd
&& fMarkSet
.contains(utext_current32(text
))) {
379 cuWordLength
+= (int32_t)utext_getNativeIndex(text
) - currPos
;
382 // Look ahead for possible suffixes if a dictionary word does not follow.
383 // We do this in code rather than using a rule so that the heuristic
384 // resynch continues to function. For example, one of the suffix characters
385 // could be a typo in the middle of a word.
386 if ((int32_t)utext_getNativeIndex(text
) < rangeEnd
&& cuWordLength
> 0) {
387 if (words
[wordsFound%THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) <= 0
388 && fSuffixSet
.contains(uc
= utext_current32(text
))) {
389 if (uc
== THAI_PAIYANNOI
) {
390 if (!fSuffixSet
.contains(utext_previous32(text
))) {
391 // Skip over previous end and PAIYANNOI
393 int32_t paiyannoiIndex
= (int32_t)utext_getNativeIndex(text
);
395 cuWordLength
+= (int32_t)utext_getNativeIndex(text
) - paiyannoiIndex
; // Add PAIYANNOI to word
396 uc
= utext_current32(text
); // Fetch next character
399 // Restore prior position
403 if (uc
== THAI_MAIYAMOK
) {
404 if (utext_previous32(text
) != THAI_MAIYAMOK
) {
405 // Skip over previous end and MAIYAMOK
407 int32_t maiyamokIndex
= (int32_t)utext_getNativeIndex(text
);
409 cuWordLength
+= (int32_t)utext_getNativeIndex(text
) - maiyamokIndex
; // Add MAIYAMOK to word
412 // Restore prior position
418 utext_setNativeIndex(text
, current
+cuWordLength
);
422 // Did we find a word on this iteration? If so, push it on the break stack
423 if (cuWordLength
> 0) {
424 foundBreaks
.push((current
+cuWordLength
), status
);
428 // Don't return a break for the end of the dictionary range if there is one there.
429 if (foundBreaks
.peeki() >= rangeEnd
) {
430 (void) foundBreaks
.popi();
438 ******************************************************************
442 // How many words in a row are "good enough"?
443 static const int32_t LAO_LOOKAHEAD
= 3;
445 // Will not combine a non-word with a preceding dictionary word longer than this
446 static const int32_t LAO_ROOT_COMBINE_THRESHOLD
= 3;
448 // Will not combine a non-word that shares at least this much prefix with a
449 // dictionary word, with a preceding word
450 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD
= 3;
453 static const int32_t LAO_MIN_WORD
= 2;
455 // Minimum number of characters for two words
456 static const int32_t LAO_MIN_WORD_SPAN
= LAO_MIN_WORD
* 2;
458 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
)
459 : DictionaryBreakEngine((1<<UBRK_WORD
) | (1<<UBRK_LINE
)),
460 fDictionary(adoptDictionary
)
462 fLaoWordSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status
);
463 if (U_SUCCESS(status
)) {
464 setCharacters(fLaoWordSet
);
466 fMarkSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status
);
467 fMarkSet
.add(0x0020);
468 fEndWordSet
= fLaoWordSet
;
469 fEndWordSet
.remove(0x0EC0, 0x0EC4); // prefix vowels
470 fBeginWordSet
.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
471 fBeginWordSet
.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
472 fBeginWordSet
.add(0x0EC0, 0x0EC4); // prefix vowels
474 // Compact for caching.
476 fEndWordSet
.compact();
477 fBeginWordSet
.compact();
480 LaoBreakEngine::~LaoBreakEngine() {
485 LaoBreakEngine::divideUpDictionaryRange( UText
*text
,
488 UStack
&foundBreaks
) const {
489 if ((rangeEnd
- rangeStart
) < LAO_MIN_WORD_SPAN
) {
490 return 0; // Not enough characters for two words
493 uint32_t wordsFound
= 0;
494 int32_t cpWordLength
= 0;
495 int32_t cuWordLength
= 0;
497 UErrorCode status
= U_ZERO_ERROR
;
498 PossibleWord words
[LAO_LOOKAHEAD
];
500 utext_setNativeIndex(text
, rangeStart
);
502 while (U_SUCCESS(status
) && (current
= (int32_t)utext_getNativeIndex(text
)) < rangeEnd
) {
506 // Look for candidate words at the current position
507 int32_t candidates
= words
[wordsFound%LAO_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
);
509 // If we found exactly one, use that
510 if (candidates
== 1) {
511 cuWordLength
= words
[wordsFound
% LAO_LOOKAHEAD
].acceptMarked(text
);
512 cpWordLength
= words
[wordsFound
% LAO_LOOKAHEAD
].markedCPLength();
515 // If there was more than one, see which one can take us forward the most words
516 else if (candidates
> 1) {
517 // If we're already at the end of the range, we're done
518 if (utext_getNativeIndex(text
) >= rangeEnd
) {
522 int32_t wordsMatched
= 1;
523 if (words
[(wordsFound
+ 1) % LAO_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) > 0) {
524 if (wordsMatched
< 2) {
525 // Followed by another dictionary word; mark first word as a good candidate
526 words
[wordsFound%LAO_LOOKAHEAD
].markCurrent();
530 // If we're already at the end of the range, we're done
531 if ((int32_t)utext_getNativeIndex(text
) >= rangeEnd
) {
535 // See if any of the possible second words is followed by a third word
537 // If we find a third word, stop right away
538 if (words
[(wordsFound
+ 2) % LAO_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
)) {
539 words
[wordsFound
% LAO_LOOKAHEAD
].markCurrent();
543 while (words
[(wordsFound
+ 1) % LAO_LOOKAHEAD
].backUp(text
));
546 while (words
[wordsFound
% LAO_LOOKAHEAD
].backUp(text
));
548 cuWordLength
= words
[wordsFound
% LAO_LOOKAHEAD
].acceptMarked(text
);
549 cpWordLength
= words
[wordsFound
% LAO_LOOKAHEAD
].markedCPLength();
553 // We come here after having either found a word or not. We look ahead to the
554 // next word. If it's not a dictionary word, we will combine it withe the word we
555 // just found (if there is one), but only if the preceding word does not exceed
557 // The text iterator should now be positioned at the end of the word we found.
558 if ((int32_t)utext_getNativeIndex(text
) < rangeEnd
&& cpWordLength
< LAO_ROOT_COMBINE_THRESHOLD
) {
559 // if it is a dictionary word, do nothing. If it isn't, then if there is
560 // no preceding word, or the non-word shares less than the minimum threshold
561 // of characters with a dictionary word, then scan to resynchronize
562 if (words
[wordsFound
% LAO_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) <= 0
563 && (cuWordLength
== 0
564 || words
[wordsFound%LAO_LOOKAHEAD
].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD
)) {
565 // Look for a plausible word boundary
566 int32_t remaining
= rangeEnd
- (current
+ cuWordLength
);
571 int32_t pcIndex
= (int32_t)utext_getNativeIndex(text
);
572 pc
= utext_next32(text
);
573 int32_t pcSize
= (int32_t)utext_getNativeIndex(text
) - pcIndex
;
576 if (remaining
<= 0) {
579 uc
= utext_current32(text
);
580 if (fEndWordSet
.contains(pc
) && fBeginWordSet
.contains(uc
)) {
581 // Maybe. See if it's in the dictionary.
582 // TODO: this looks iffy; compare with old code.
583 int32_t candidates
= words
[(wordsFound
+ 1) % LAO_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
);
584 utext_setNativeIndex(text
, current
+ cuWordLength
+ chars
);
585 if (candidates
> 0) {
591 // Bump the word count if there wasn't already one
592 if (cuWordLength
<= 0) {
596 // Update the length with the passed-over characters
597 cuWordLength
+= chars
;
600 // Back up to where we were for next iteration
601 utext_setNativeIndex(text
, current
+ cuWordLength
);
605 // Never stop before a combining mark.
607 while ((currPos
= (int32_t)utext_getNativeIndex(text
)) < rangeEnd
&& fMarkSet
.contains(utext_current32(text
))) {
609 cuWordLength
+= (int32_t)utext_getNativeIndex(text
) - currPos
;
612 // Look ahead for possible suffixes if a dictionary word does not follow.
613 // We do this in code rather than using a rule so that the heuristic
614 // resynch continues to function. For example, one of the suffix characters
615 // could be a typo in the middle of a word.
616 // NOT CURRENTLY APPLICABLE TO LAO
618 // Did we find a word on this iteration? If so, push it on the break stack
619 if (cuWordLength
> 0) {
620 foundBreaks
.push((current
+cuWordLength
), status
);
624 // Don't return a break for the end of the dictionary range if there is one there.
625 if (foundBreaks
.peeki() >= rangeEnd
) {
626 (void) foundBreaks
.popi();
634 ******************************************************************
638 // How many words in a row are "good enough"?
639 static const int32_t BURMESE_LOOKAHEAD
= 3;
641 // Will not combine a non-word with a preceding dictionary word longer than this
642 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD
= 3;
644 // Will not combine a non-word that shares at least this much prefix with a
645 // dictionary word, with a preceding word
646 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD
= 3;
649 static const int32_t BURMESE_MIN_WORD
= 2;
651 // Minimum number of characters for two words
652 static const int32_t BURMESE_MIN_WORD_SPAN
= BURMESE_MIN_WORD
* 2;
654 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
)
655 : DictionaryBreakEngine((1<<UBRK_WORD
) | (1<<UBRK_LINE
)),
656 fDictionary(adoptDictionary
)
658 fBurmeseWordSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status
);
659 if (U_SUCCESS(status
)) {
660 setCharacters(fBurmeseWordSet
);
662 fMarkSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status
);
663 fMarkSet
.add(0x0020);
664 fEndWordSet
= fBurmeseWordSet
;
665 fBeginWordSet
.add(0x1000, 0x102A); // basic consonants and independent vowels
667 // Compact for caching.
669 fEndWordSet
.compact();
670 fBeginWordSet
.compact();
673 BurmeseBreakEngine::~BurmeseBreakEngine() {
678 BurmeseBreakEngine::divideUpDictionaryRange( UText
*text
,
681 UStack
&foundBreaks
) const {
682 if ((rangeEnd
- rangeStart
) < BURMESE_MIN_WORD_SPAN
) {
683 return 0; // Not enough characters for two words
686 uint32_t wordsFound
= 0;
687 int32_t cpWordLength
= 0;
688 int32_t cuWordLength
= 0;
690 UErrorCode status
= U_ZERO_ERROR
;
691 PossibleWord words
[BURMESE_LOOKAHEAD
];
693 utext_setNativeIndex(text
, rangeStart
);
695 while (U_SUCCESS(status
) && (current
= (int32_t)utext_getNativeIndex(text
)) < rangeEnd
) {
699 // Look for candidate words at the current position
700 int32_t candidates
= words
[wordsFound%BURMESE_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
);
702 // If we found exactly one, use that
703 if (candidates
== 1) {
704 cuWordLength
= words
[wordsFound
% BURMESE_LOOKAHEAD
].acceptMarked(text
);
705 cpWordLength
= words
[wordsFound
% BURMESE_LOOKAHEAD
].markedCPLength();
708 // If there was more than one, see which one can take us forward the most words
709 else if (candidates
> 1) {
710 // If we're already at the end of the range, we're done
711 if (utext_getNativeIndex(text
) >= rangeEnd
) {
715 int32_t wordsMatched
= 1;
716 if (words
[(wordsFound
+ 1) % BURMESE_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) > 0) {
717 if (wordsMatched
< 2) {
718 // Followed by another dictionary word; mark first word as a good candidate
719 words
[wordsFound%BURMESE_LOOKAHEAD
].markCurrent();
723 // If we're already at the end of the range, we're done
724 if ((int32_t)utext_getNativeIndex(text
) >= rangeEnd
) {
728 // See if any of the possible second words is followed by a third word
730 // If we find a third word, stop right away
731 if (words
[(wordsFound
+ 2) % BURMESE_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
)) {
732 words
[wordsFound
% BURMESE_LOOKAHEAD
].markCurrent();
736 while (words
[(wordsFound
+ 1) % BURMESE_LOOKAHEAD
].backUp(text
));
739 while (words
[wordsFound
% BURMESE_LOOKAHEAD
].backUp(text
));
741 cuWordLength
= words
[wordsFound
% BURMESE_LOOKAHEAD
].acceptMarked(text
);
742 cpWordLength
= words
[wordsFound
% BURMESE_LOOKAHEAD
].markedCPLength();
746 // We come here after having either found a word or not. We look ahead to the
747 // next word. If it's not a dictionary word, we will combine it withe the word we
748 // just found (if there is one), but only if the preceding word does not exceed
750 // The text iterator should now be positioned at the end of the word we found.
751 if ((int32_t)utext_getNativeIndex(text
) < rangeEnd
&& cpWordLength
< BURMESE_ROOT_COMBINE_THRESHOLD
) {
752 // if it is a dictionary word, do nothing. If it isn't, then if there is
753 // no preceding word, or the non-word shares less than the minimum threshold
754 // of characters with a dictionary word, then scan to resynchronize
755 if (words
[wordsFound
% BURMESE_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) <= 0
756 && (cuWordLength
== 0
757 || words
[wordsFound%BURMESE_LOOKAHEAD
].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD
)) {
758 // Look for a plausible word boundary
759 int32_t remaining
= rangeEnd
- (current
+ cuWordLength
);
764 int32_t pcIndex
= (int32_t)utext_getNativeIndex(text
);
765 pc
= utext_next32(text
);
766 int32_t pcSize
= (int32_t)utext_getNativeIndex(text
) - pcIndex
;
769 if (remaining
<= 0) {
772 uc
= utext_current32(text
);
773 if (fEndWordSet
.contains(pc
) && fBeginWordSet
.contains(uc
)) {
774 // Maybe. See if it's in the dictionary.
775 // TODO: this looks iffy; compare with old code.
776 int32_t candidates
= words
[(wordsFound
+ 1) % BURMESE_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
);
777 utext_setNativeIndex(text
, current
+ cuWordLength
+ chars
);
778 if (candidates
> 0) {
784 // Bump the word count if there wasn't already one
785 if (cuWordLength
<= 0) {
789 // Update the length with the passed-over characters
790 cuWordLength
+= chars
;
793 // Back up to where we were for next iteration
794 utext_setNativeIndex(text
, current
+ cuWordLength
);
798 // Never stop before a combining mark.
800 while ((currPos
= (int32_t)utext_getNativeIndex(text
)) < rangeEnd
&& fMarkSet
.contains(utext_current32(text
))) {
802 cuWordLength
+= (int32_t)utext_getNativeIndex(text
) - currPos
;
805 // Look ahead for possible suffixes if a dictionary word does not follow.
806 // We do this in code rather than using a rule so that the heuristic
807 // resynch continues to function. For example, one of the suffix characters
808 // could be a typo in the middle of a word.
809 // NOT CURRENTLY APPLICABLE TO BURMESE
811 // Did we find a word on this iteration? If so, push it on the break stack
812 if (cuWordLength
> 0) {
813 foundBreaks
.push((current
+cuWordLength
), status
);
817 // Don't return a break for the end of the dictionary range if there is one there.
818 if (foundBreaks
.peeki() >= rangeEnd
) {
819 (void) foundBreaks
.popi();
827 ******************************************************************
831 // How many words in a row are "good enough"?
832 static const int32_t KHMER_LOOKAHEAD
= 3;
834 // Will not combine a non-word with a preceding dictionary word longer than this
835 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD
= 3;
837 // Will not combine a non-word that shares at least this much prefix with a
838 // dictionary word, with a preceding word
839 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD
= 3;
842 static const int32_t KHMER_MIN_WORD
= 2;
844 // Minimum number of characters for two words
845 static const int32_t KHMER_MIN_WORD_SPAN
= KHMER_MIN_WORD
* 2;
847 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
)
848 : DictionaryBreakEngine((1 << UBRK_WORD
) | (1 << UBRK_LINE
)),
849 fDictionary(adoptDictionary
)
851 fKhmerWordSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status
);
852 if (U_SUCCESS(status
)) {
853 setCharacters(fKhmerWordSet
);
855 fMarkSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status
);
856 fMarkSet
.add(0x0020);
857 fEndWordSet
= fKhmerWordSet
;
858 fBeginWordSet
.add(0x1780, 0x17B3);
859 //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
860 //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
861 //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
862 fEndWordSet
.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
863 //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
864 // fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
865 // fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
866 // fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
867 // fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
868 // fSuffixSet.add(THAI_PAIYANNOI);
869 // fSuffixSet.add(THAI_MAIYAMOK);
871 // Compact for caching.
873 fEndWordSet
.compact();
874 fBeginWordSet
.compact();
875 // fSuffixSet.compact();
878 KhmerBreakEngine::~KhmerBreakEngine() {
883 KhmerBreakEngine::divideUpDictionaryRange( UText
*text
,
886 UStack
&foundBreaks
) const {
887 if ((rangeEnd
- rangeStart
) < KHMER_MIN_WORD_SPAN
) {
888 return 0; // Not enough characters for two words
891 uint32_t wordsFound
= 0;
892 int32_t cpWordLength
= 0;
893 int32_t cuWordLength
= 0;
895 UErrorCode status
= U_ZERO_ERROR
;
896 PossibleWord words
[KHMER_LOOKAHEAD
];
898 utext_setNativeIndex(text
, rangeStart
);
900 while (U_SUCCESS(status
) && (current
= (int32_t)utext_getNativeIndex(text
)) < rangeEnd
) {
904 // Look for candidate words at the current position
905 int32_t candidates
= words
[wordsFound%KHMER_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
);
907 // If we found exactly one, use that
908 if (candidates
== 1) {
909 cuWordLength
= words
[wordsFound
% KHMER_LOOKAHEAD
].acceptMarked(text
);
910 cpWordLength
= words
[wordsFound
% KHMER_LOOKAHEAD
].markedCPLength();
914 // If there was more than one, see which one can take us forward the most words
915 else if (candidates
> 1) {
916 // If we're already at the end of the range, we're done
917 if ((int32_t)utext_getNativeIndex(text
) >= rangeEnd
) {
921 int32_t wordsMatched
= 1;
922 if (words
[(wordsFound
+ 1) % KHMER_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) > 0) {
923 if (wordsMatched
< 2) {
924 // Followed by another dictionary word; mark first word as a good candidate
925 words
[wordsFound
% KHMER_LOOKAHEAD
].markCurrent();
929 // If we're already at the end of the range, we're done
930 if ((int32_t)utext_getNativeIndex(text
) >= rangeEnd
) {
934 // See if any of the possible second words is followed by a third word
936 // If we find a third word, stop right away
937 if (words
[(wordsFound
+ 2) % KHMER_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
)) {
938 words
[wordsFound
% KHMER_LOOKAHEAD
].markCurrent();
942 while (words
[(wordsFound
+ 1) % KHMER_LOOKAHEAD
].backUp(text
));
945 while (words
[wordsFound
% KHMER_LOOKAHEAD
].backUp(text
));
947 cuWordLength
= words
[wordsFound
% KHMER_LOOKAHEAD
].acceptMarked(text
);
948 cpWordLength
= words
[wordsFound
% KHMER_LOOKAHEAD
].markedCPLength();
952 // We come here after having either found a word or not. We look ahead to the
953 // next word. If it's not a dictionary word, we will combine it with the word we
954 // just found (if there is one), but only if the preceding word does not exceed
956 // The text iterator should now be positioned at the end of the word we found.
957 if ((int32_t)utext_getNativeIndex(text
) < rangeEnd
&& cpWordLength
< KHMER_ROOT_COMBINE_THRESHOLD
) {
958 // if it is a dictionary word, do nothing. If it isn't, then if there is
959 // no preceding word, or the non-word shares less than the minimum threshold
960 // of characters with a dictionary word, then scan to resynchronize
961 if (words
[wordsFound
% KHMER_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) <= 0
962 && (cuWordLength
== 0
963 || words
[wordsFound
% KHMER_LOOKAHEAD
].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD
)) {
964 // Look for a plausible word boundary
965 int32_t remaining
= rangeEnd
- (current
+cuWordLength
);
970 int32_t pcIndex
= (int32_t)utext_getNativeIndex(text
);
971 pc
= utext_next32(text
);
972 int32_t pcSize
= (int32_t)utext_getNativeIndex(text
) - pcIndex
;
975 if (remaining
<= 0) {
978 uc
= utext_current32(text
);
979 if (fEndWordSet
.contains(pc
) && fBeginWordSet
.contains(uc
)) {
980 // Maybe. See if it's in the dictionary.
981 int32_t candidates
= words
[(wordsFound
+ 1) % KHMER_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
);
982 utext_setNativeIndex(text
, current
+cuWordLength
+chars
);
983 if (candidates
> 0) {
989 // Bump the word count if there wasn't already one
990 if (cuWordLength
<= 0) {
994 // Update the length with the passed-over characters
995 cuWordLength
+= chars
;
998 // Back up to where we were for next iteration
999 utext_setNativeIndex(text
, current
+cuWordLength
);
1003 // Never stop before a combining mark.
1005 while ((currPos
= (int32_t)utext_getNativeIndex(text
)) < rangeEnd
&& fMarkSet
.contains(utext_current32(text
))) {
1007 cuWordLength
+= (int32_t)utext_getNativeIndex(text
) - currPos
;
1010 // Look ahead for possible suffixes if a dictionary word does not follow.
1011 // We do this in code rather than using a rule so that the heuristic
1012 // resynch continues to function. For example, one of the suffix characters
1013 // could be a typo in the middle of a word.
1014 // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1015 // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1016 // && fSuffixSet.contains(uc = utext_current32(text))) {
1017 // if (uc == KHMER_PAIYANNOI) {
1018 // if (!fSuffixSet.contains(utext_previous32(text))) {
1019 // // Skip over previous end and PAIYANNOI
1020 // utext_next32(text);
1021 // utext_next32(text);
1022 // wordLength += 1; // Add PAIYANNOI to word
1023 // uc = utext_current32(text); // Fetch next character
1026 // // Restore prior position
1027 // utext_next32(text);
1030 // if (uc == KHMER_MAIYAMOK) {
1031 // if (utext_previous32(text) != KHMER_MAIYAMOK) {
1032 // // Skip over previous end and MAIYAMOK
1033 // utext_next32(text);
1034 // utext_next32(text);
1035 // wordLength += 1; // Add MAIYAMOK to word
1038 // // Restore prior position
1039 // utext_next32(text);
1044 // utext_setNativeIndex(text, current+wordLength);
1048 // Did we find a word on this iteration? If so, push it on the break stack
1049 if (cuWordLength
> 0) {
1050 foundBreaks
.push((current
+cuWordLength
), status
);
1054 // Don't return a break for the end of the dictionary range if there is one there.
1055 if (foundBreaks
.peeki() >= rangeEnd
) {
1056 (void) foundBreaks
.popi();
1063 #if !UCONFIG_NO_NORMALIZATION
1065 ******************************************************************
1068 static const uint32_t kuint32max
= 0xFFFFFFFF;
1069 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher
*adoptDictionary
, LanguageType type
, UErrorCode
&status
)
1070 : DictionaryBreakEngine(1 << UBRK_WORD
), fDictionary(adoptDictionary
) {
1071 // Korean dictionary only includes Hangul syllables
1072 fHangulWordSet
.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status
);
1073 fHanWordSet
.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status
);
1074 fKatakanaWordSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status
);
1075 fHiraganaWordSet
.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status
);
1076 nfkcNorm2
= Normalizer2::getNFKCInstance(status
);
1078 if (U_SUCCESS(status
)) {
1079 // handle Korean and Japanese/Chinese using different dictionaries
1080 if (type
== kKorean
) {
1081 setCharacters(fHangulWordSet
);
1082 } else { //Chinese and Japanese
1084 cjSet
.addAll(fHanWordSet
);
1085 cjSet
.addAll(fKatakanaWordSet
);
1086 cjSet
.addAll(fHiraganaWordSet
);
1087 cjSet
.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1088 cjSet
.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1089 setCharacters(cjSet
);
1094 CjkBreakEngine::~CjkBreakEngine(){
1098 // The katakanaCost values below are based on the length frequencies of all
1099 // katakana phrases in the dictionary
1100 static const int32_t kMaxKatakanaLength
= 8;
1101 static const int32_t kMaxKatakanaGroupLength
= 20;
1102 static const uint32_t maxSnlp
= 255;
1104 static inline uint32_t getKatakanaCost(int32_t wordLength
){
1105 //TODO: fill array with actual values from dictionary!
1106 static const uint32_t katakanaCost
[kMaxKatakanaLength
+ 1]
1107 = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1108 return (wordLength
> kMaxKatakanaLength
) ? 8192 : katakanaCost
[wordLength
];
1111 static inline bool isKatakana(uint16_t value
) {
1112 return (value
>= 0x30A1u
&& value
<= 0x30FEu
&& value
!= 0x30FBu
) ||
1113 (value
>= 0xFF66u
&& value
<= 0xFF9fu
);
1117 // Function for accessing internal utext flags.
1118 // Replicates an internal UText function.
1120 static inline int32_t utext_i32_flag(int32_t bitIndex
) {
1121 return (int32_t)1 << bitIndex
;
1126 * @param text A UText representing the text
1127 * @param rangeStart The start of the range of dictionary characters
1128 * @param rangeEnd The end of the range of dictionary characters
1129 * @param foundBreaks Output of C array of int32_t break positions, or 0
1130 * @return The number of breaks found
1133 CjkBreakEngine::divideUpDictionaryRange( UText
*inText
,
1136 UStack
&foundBreaks
) const {
1137 if (rangeStart
>= rangeEnd
) {
1141 // UnicodeString version of input UText, NFKC normalized if necessary.
1142 UnicodeString inString
;
1144 // inputMap[inStringIndex] = corresponding native index from UText inText.
1145 // If NULL then mapping is 1:1
1146 LocalPointer
<UVector32
> inputMap
;
1148 UErrorCode status
= U_ZERO_ERROR
;
1151 // if UText has the input string as one contiguous UTF-16 chunk
1152 if ((inText
->providerProperties
& utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS
)) &&
1153 inText
->chunkNativeStart
<= rangeStart
&&
1154 inText
->chunkNativeLimit
>= rangeEnd
&&
1155 inText
->nativeIndexingLimit
>= rangeEnd
- inText
->chunkNativeStart
) {
1157 // Input UText is in one contiguous UTF-16 chunk.
1158 // Use Read-only aliasing UnicodeString.
1159 inString
.setTo(FALSE
,
1160 inText
->chunkContents
+ rangeStart
- inText
->chunkNativeStart
,
1161 rangeEnd
- rangeStart
);
1163 // Copy the text from the original inText (UText) to inString (UnicodeString).
1164 // Create a map from UnicodeString indices -> UText offsets.
1165 utext_setNativeIndex(inText
, rangeStart
);
1166 int32_t limit
= rangeEnd
;
1167 U_ASSERT(limit
<= utext_nativeLength(inText
));
1168 if (limit
> utext_nativeLength(inText
)) {
1169 limit
= (int32_t)utext_nativeLength(inText
);
1171 inputMap
.adoptInsteadAndCheckErrorCode(new UVector32(status
), status
);
1172 if (U_FAILURE(status
)) {
1175 while (utext_getNativeIndex(inText
) < limit
) {
1176 int32_t nativePosition
= (int32_t)utext_getNativeIndex(inText
);
1177 UChar32 c
= utext_next32(inText
);
1178 U_ASSERT(c
!= U_SENTINEL
);
1180 while (inputMap
->size() < inString
.length()) {
1181 inputMap
->addElement(nativePosition
, status
);
1184 inputMap
->addElement(limit
, status
);
1188 if (!nfkcNorm2
->isNormalized(inString
, status
)) {
1189 UnicodeString normalizedInput
;
1190 // normalizedMap[normalizedInput position] == original UText position.
1191 LocalPointer
<UVector32
> normalizedMap(new UVector32(status
), status
);
1192 if (U_FAILURE(status
)) {
1196 UnicodeString fragment
;
1197 UnicodeString normalizedFragment
;
1198 for (int32_t srcI
= 0; srcI
< inString
.length();) { // Once per normalization chunk
1200 int32_t fragmentStartI
= srcI
;
1201 UChar32 c
= inString
.char32At(srcI
);
1204 srcI
= inString
.moveIndex32(srcI
, 1);
1205 if (srcI
== inString
.length()) {
1208 c
= inString
.char32At(srcI
);
1209 if (nfkcNorm2
->hasBoundaryBefore(c
)) {
1213 nfkcNorm2
->normalize(fragment
, normalizedFragment
, status
);
1214 normalizedInput
.append(normalizedFragment
);
1216 // Map every position in the normalized chunk to the start of the chunk
1217 // in the original input.
1218 int32_t fragmentOriginalStart
= inputMap
.isValid() ?
1219 inputMap
->elementAti(fragmentStartI
) : fragmentStartI
+rangeStart
;
1220 while (normalizedMap
->size() < normalizedInput
.length()) {
1221 normalizedMap
->addElement(fragmentOriginalStart
, status
);
1222 if (U_FAILURE(status
)) {
1227 U_ASSERT(normalizedMap
->size() == normalizedInput
.length());
1228 int32_t nativeEnd
= inputMap
.isValid() ?
1229 inputMap
->elementAti(inString
.length()) : inString
.length()+rangeStart
;
1230 normalizedMap
->addElement(nativeEnd
, status
);
1232 inputMap
.moveFrom(normalizedMap
);
1233 inString
.moveFrom(normalizedInput
);
1236 int32_t numCodePts
= inString
.countChar32();
1237 if (numCodePts
!= inString
.length()) {
1238 // There are supplementary characters in the input.
1239 // The dictionary will produce boundary positions in terms of code point indexes,
1240 // not in terms of code unit string indexes.
1241 // Use the inputMap mechanism to take care of this in addition to indexing differences
1242 // from normalization and/or UTF-8 input.
1243 UBool hadExistingMap
= inputMap
.isValid();
1244 if (!hadExistingMap
) {
1245 inputMap
.adoptInsteadAndCheckErrorCode(new UVector32(status
), status
);
1246 if (U_FAILURE(status
)) {
1251 for (int32_t cuIdx
= 0; ; cuIdx
= inString
.moveIndex32(cuIdx
, 1)) {
1252 U_ASSERT(cuIdx
>= cpIdx
);
1253 if (hadExistingMap
) {
1254 inputMap
->setElementAt(inputMap
->elementAti(cuIdx
), cpIdx
);
1256 inputMap
->addElement(cuIdx
+rangeStart
, status
);
1259 if (cuIdx
== inString
.length()) {
1265 // bestSnlp[i] is the snlp of the best segmentation of the first i
1266 // code points in the range to be matched.
1267 UVector32
bestSnlp(numCodePts
+ 1, status
);
1268 bestSnlp
.addElement(0, status
);
1269 for(int32_t i
= 1; i
<= numCodePts
; i
++) {
1270 bestSnlp
.addElement(kuint32max
, status
);
1274 // prev[i] is the index of the last CJK code point in the previous word in
1275 // the best segmentation of the first i characters.
1276 UVector32
prev(numCodePts
+ 1, status
);
1277 for(int32_t i
= 0; i
<= numCodePts
; i
++){
1278 prev
.addElement(-1, status
);
1281 const int32_t maxWordSize
= 20;
1282 UVector32
values(numCodePts
, status
);
1283 values
.setSize(numCodePts
);
1284 UVector32
lengths(numCodePts
, status
);
1285 lengths
.setSize(numCodePts
);
1287 UText fu
= UTEXT_INITIALIZER
;
1288 utext_openUnicodeString(&fu
, &inString
, &status
);
1290 // Dynamic programming to find the best segmentation.
1292 // In outer loop, i is the code point index,
1293 // ix is the corresponding string (code unit) index.
1294 // They differ when the string contains supplementary characters.
1296 for (int32_t i
= 0; i
< numCodePts
; ++i
, ix
= inString
.moveIndex32(ix
, 1)) {
1297 if ((uint32_t)bestSnlp
.elementAti(i
) == kuint32max
) {
1302 utext_setNativeIndex(&fu
, ix
);
1303 count
= fDictionary
->matches(&fu
, maxWordSize
, numCodePts
,
1304 NULL
, lengths
.getBuffer(), values
.getBuffer(), NULL
);
1305 // Note: lengths is filled with code point lengths
1306 // The NULL parameter is the ignored code unit lengths.
1308 // if there are no single character matches found in the dictionary
1309 // starting with this charcter, treat character as a 1-character word
1310 // with the highest value possible, i.e. the least likely to occur.
1311 // Exclude Korean characters from this treatment, as they should be left
1312 // together by default.
1313 if ((count
== 0 || lengths
.elementAti(0) != 1) &&
1314 !fHangulWordSet
.contains(inString
.char32At(ix
))) {
1315 values
.setElementAt(maxSnlp
, count
); // 255
1316 lengths
.setElementAt(1, count
++);
1319 for (int32_t j
= 0; j
< count
; j
++) {
1320 uint32_t newSnlp
= (uint32_t)bestSnlp
.elementAti(i
) + (uint32_t)values
.elementAti(j
);
1321 int32_t ln_j_i
= lengths
.elementAti(j
) + i
;
1322 if (newSnlp
< (uint32_t)bestSnlp
.elementAti(ln_j_i
)) {
1323 bestSnlp
.setElementAt(newSnlp
, ln_j_i
);
1324 prev
.setElementAt(i
, ln_j_i
);
1329 // Katakana word in single character is pretty rare. So we apply
1330 // the following heuristic to Katakana: any continuous run of Katakana
1331 // characters is considered a candidate word with a default cost
1332 // specified in the katakanaCost table according to its length.
1334 bool is_prev_katakana
= false;
1335 bool is_katakana
= isKatakana(inString
.char32At(ix
));
1336 int32_t katakanaRunLength
= 1;
1337 if (!is_prev_katakana
&& is_katakana
) {
1338 int32_t j
= inString
.moveIndex32(ix
, 1);
1339 // Find the end of the continuous run of Katakana characters
1340 while (j
< inString
.length() && katakanaRunLength
< kMaxKatakanaGroupLength
&&
1341 isKatakana(inString
.char32At(j
))) {
1342 j
= inString
.moveIndex32(j
, 1);
1343 katakanaRunLength
++;
1345 if (katakanaRunLength
< kMaxKatakanaGroupLength
) {
1346 uint32_t newSnlp
= bestSnlp
.elementAti(i
) + getKatakanaCost(katakanaRunLength
);
1347 if (newSnlp
< (uint32_t)bestSnlp
.elementAti(j
)) {
1348 bestSnlp
.setElementAt(newSnlp
, j
);
1349 prev
.setElementAt(i
, i
+katakanaRunLength
); // prev[j] = i;
1353 is_prev_katakana
= is_katakana
;
1357 // Start pushing the optimal offset index into t_boundary (t for tentative).
1358 // prev[numCodePts] is guaranteed to be meaningful.
1359 // We'll first push in the reverse order, i.e.,
1360 // t_boundary[0] = numCodePts, and afterwards do a swap.
1361 UVector32
t_boundary(numCodePts
+1, status
);
1363 int32_t numBreaks
= 0;
1364 // No segmentation found, set boundary to end of range
1365 if ((uint32_t)bestSnlp
.elementAti(numCodePts
) == kuint32max
) {
1366 t_boundary
.addElement(numCodePts
, status
);
1369 for (int32_t i
= numCodePts
; i
> 0; i
= prev
.elementAti(i
)) {
1370 t_boundary
.addElement(i
, status
);
1373 U_ASSERT(prev
.elementAti(t_boundary
.elementAti(numBreaks
- 1)) == 0);
1376 // Add a break for the start of the dictionary range if there is not one
1378 if (foundBreaks
.size() == 0 || foundBreaks
.peeki() < rangeStart
) {
1379 t_boundary
.addElement(0, status
);
1383 // Now that we're done, convert positions in t_boundary[] (indices in
1384 // the normalized input string) back to indices in the original input UText
1385 // while reversing t_boundary and pushing values to foundBreaks.
1386 for (int32_t i
= numBreaks
-1; i
>= 0; i
--) {
1387 int32_t cpPos
= t_boundary
.elementAti(i
);
1388 int32_t utextPos
= inputMap
.isValid() ? inputMap
->elementAti(cpPos
) : cpPos
+ rangeStart
;
1389 // Boundaries are added to foundBreaks output in ascending order.
1390 U_ASSERT(foundBreaks
.size() == 0 ||foundBreaks
.peeki() < utextPos
);
1391 foundBreaks
.push(utextPos
, status
);
1394 // inString goes out of scope
1395 // inputMap goes out of scope
1402 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */