// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
 *******************************************************************************
 * Copyright (C) 2006-2016, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
#include "unicode/normlzr.h"
#include "dictionarydata.h"
 ******************************************************************

DictionaryBreakEngine::DictionaryBreakEngine() {

DictionaryBreakEngine::~DictionaryBreakEngine() {

DictionaryBreakEngine::handles(UChar32 c) const {
    return fSet.contains(c);
DictionaryBreakEngine::findBreaks( UText *text,
                                   UVector32 &foundBreaks ) const {
    (void)startPos;            // TODO: remove this param?
    // Find the span of characters included in the set.
    // The span to break begins at the current position in the text and
    // extends forward to endPos.
    int32_t start = (int32_t)utext_getNativeIndex(text);
    UChar32 c = utext_current32(text);
    while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
        utext_next32(text);         // TODO: recast loop for postincrement
        c = utext_current32(text);
    result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
    utext_setNativeIndex(text, current);
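    // Note: divideUpDictionaryRange() is the script-specific virtual hook; each of the
    // engines below (Thai, Lao, Burmese, Khmer, CJK) overrides it to place the breaks
    // within the dictionary-character range located here.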
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
    // Compact for caching

 ******************************************************************

// Helper class for improving readability of the Thai/Lao/Khmer word break
// algorithm. The implementation is completely inline.

// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
    // list of word candidate lengths, in increasing length order
    // TODO: bytes would be sufficient for word lengths.
    int32_t   count;      // Count of candidates
    int32_t   prefix;     // The longest match with a dictionary word
    int32_t   offset;     // Offset in the text of these candidates
    int32_t   mark;       // The preferred candidate's offset
    int32_t   current;    // The candidate we're currently looking at
    int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
    int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.

    PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}

    // Fill the list of candidates if needed, select the longest, and return the number found
    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );

    // Select the currently marked candidate, point after it in the text, and invalidate self
    int32_t   acceptMarked( UText *text );

    // Back up from the current candidate to the next shorter one; return TRUE if that exists
    // and point the text after it
    UBool     backUp( UText *text );

    // Return the longest prefix this candidate location shares with a dictionary word
    // Return value is in code points.
    int32_t   longestPrefix() { return prefix; }

    // Mark the current candidate as the one we like
    void      markCurrent() { mark = current; }

    // Get length in code points of the marked word.
    int32_t   markedCPLength() { return cpLengths[mark]; }
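    // Note: the break engines below keep a small array of PossibleWord objects and index
    // it as words[wordsFound % <SCRIPT>_LOOKAHEAD], so each instance serves as one slot
    // of a ring buffer of look-ahead word candidates.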
int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
        // Dictionary leaves text after longest prefix, not longest word. Back up.
            utext_setNativeIndex(text, start);
        utext_setNativeIndex(text, start+cuLengths[count-1]);

PossibleWord::acceptMarked( UText *text ) {
    utext_setNativeIndex(text, offset + cuLengths[mark]);
    return cuLengths[mark];

PossibleWord::backUp( UText *text ) {
        utext_setNativeIndex(text, offset + cuLengths[--current]);
 ******************************************************************

// How many words in a row are "good enough"?
static const int32_t THAI_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
// Elision character
static const int32_t THAI_PAIYANNOI = 0x0E2F;
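// Repetition character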
static const int32_t THAI_MAIYAMOK = 0x0E46;

static const int32_t THAI_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;

ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();

ThaiBreakEngine::~ThaiBreakEngine() {
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                          UVector32 &foundBreaks ) const {
    utext_setNativeIndex(text, rangeStart);
    utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
    if (utext_getNativeIndex(text) >= rangeEnd) {
        return 0;       // Not enough characters for two words
    utext_setNativeIndex(text, rangeStart);

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;    // Word Length in Code Points.
    int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)

    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[THAI_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

                    // See if any of the possible second words is followed by a third word
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));

            // Set UText position to after the accepted word.
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
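        // For example, with THAI_ROOT_COMBINE_THRESHOLD == 3, a following non-word may be
        // combined with a preceding one- or two-code-point dictionary word, but not with a
        // longer one.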
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current+cuWordLength);
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
                    if (remaining <= 0) {
                    uc = utext_current32(text);
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // NOTE: In the original Apple code, checked that the next
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
                        // checking the dictionary. That is just a performance filter,
                        // but it's not clear it's faster than checking the trie.
                        int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (num_candidates > 0) {

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {

                // Update the length with the passed-over characters
                cuWordLength += chars;

                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+cuWordLength);

        // Never stop before a combining mark.
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
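        // (The suffixes handled below are THAI_PAIYANNOI, U+0E2F, and THAI_MAIYAMOK,
        // U+0E46; each is absorbed into the preceding word when it follows one.)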
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                && fSuffixSet.contains(uc = utext_current32(text))) {
                if (uc == THAI_PAIYANNOI) {
                    if (!fSuffixSet.contains(utext_previous32(text))) {
                        // Skip over previous end and PAIYANNOI
                        int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
                        uc = utext_current32(text);     // Fetch next character
                        // Restore prior position
                if (uc == THAI_MAIYAMOK) {
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
                        // Skip over previous end and MAIYAMOK
                        int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;     // Add MAIYAMOK to word
                        // Restore prior position
                utext_setNativeIndex(text, current+cuWordLength);

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();

 ******************************************************************

// How many words in a row are "good enough"?
static const int32_t LAO_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;

static const int32_t LAO_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fLaoWordSet);
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fLaoWordSet;
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels

    // Compact for caching.
    fEndWordSet.compact();
    fBeginWordSet.compact();

LaoBreakEngine::~LaoBreakEngine() {

LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                         UVector32 &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;
    int32_t cuWordLength = 0;

    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[LAO_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if (utext_getNativeIndex(text) >= rangeEnd) {
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

                    // See if any of the possible second words is followed by a third word
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current + cuWordLength);
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
                    if (remaining <= 0) {
                    uc = utext_current32(text);
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // TODO: this looks iffy; compare with old code.
                        int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (num_candidates > 0) {

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {

                // Update the length with the passed-over characters
                cuWordLength += chars;

                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current + cuWordLength);

        // Never stop before a combining mark.
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO LAO

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();

 ******************************************************************

// How many words in a row are "good enough"?
static const int32_t BURMESE_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;

static const int32_t BURMESE_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fBurmeseWordSet);
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fBurmeseWordSet;
    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels

    // Compact for caching.
    fEndWordSet.compact();
    fBeginWordSet.compact();

BurmeseBreakEngine::~BurmeseBreakEngine() {

BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
                                             UVector32 &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;
    int32_t cuWordLength = 0;

    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[BURMESE_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if (utext_getNativeIndex(text) >= rangeEnd) {
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

                    // See if any of the possible second words is followed by a third word
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
                    while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
            while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current + cuWordLength);
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
                    if (remaining <= 0) {
                    uc = utext_current32(text);
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        // TODO: this looks iffy; compare with old code.
                        int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (num_candidates > 0) {

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {

                // Update the length with the passed-over characters
                cuWordLength += chars;

                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current + cuWordLength);

        // Never stop before a combining mark.
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO BURMESE

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();

 ******************************************************************

// How many words in a row are "good enough"?
static const int32_t KHMER_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;

static const int32_t KHMER_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(fKhmerWordSet);
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = fKhmerWordSet;
    fBeginWordSet.add(0x1780, 0x17B3);
    //fBeginWordSet.add(0x17A3, 0x17A4);    // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);   // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);           // Khmer independent vowel that can't end a word
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
    //fEndWordSet.remove(0x17B6, 0x17C5);   // Remove dependent vowels
//    fEndWordSet.remove(0x0E31);           // MAI HAN-AKAT
//    fEndWordSet.remove(0x0E40, 0x0E44);   // SARA E through SARA AI MAIMALAI
//    fBeginWordSet.add(0x0E01, 0x0E2E);    // KO KAI through HO NOKHUK
//    fBeginWordSet.add(0x0E40, 0x0E44);    // SARA E through SARA AI MAIMALAI
//    fSuffixSet.add(THAI_PAIYANNOI);
//    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();

KhmerBreakEngine::~KhmerBreakEngine() {

KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                           UVector32 &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;
    int32_t cuWordLength = 0;

    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[KHMER_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

                    // See if any of the possible second words is followed by a third word
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current+cuWordLength);
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
                    if (remaining <= 0) {
                    uc = utext_current32(text);
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+cuWordLength+chars);
                        if (num_candidates > 0) {

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {

                // Update the length with the passed-over characters
                cuWordLength += chars;

                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+cuWordLength);

        // Never stop before a combining mark.
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                        // Restore prior position
//                        utext_next32(text);
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                        // Restore prior position
//                        utext_next32(text);
//        utext_setNativeIndex(text, current+wordLength);

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();

#if !UCONFIG_NO_NORMALIZATION

 ******************************************************************

static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
    : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
    nfkcNorm2 = Normalizer2::getNFKCInstance(status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { //Chinese and Japanese
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            cjSet.add(0xFF70);      // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            cjSet.add(0x30FC);      // KATAKANA-HIRAGANA PROLONGED SOUND MARK
            setCharacters(cjSet);

CjkBreakEngine::~CjkBreakEngine(){

// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int32_t kMaxKatakanaLength = 8;
static const int32_t kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

static inline uint32_t getKatakanaCost(int32_t wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
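// For example, a three-character Katakana run gets the default cost katakanaCost[3] == 240,
// while any run longer than kMaxKatakanaLength (8) costs 8192.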
static inline bool isKatakana(UChar32 value) {
    return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
           (value >= 0xFF66 && value <= 0xFF9f);
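// (Covers U+30A1..U+30FE, excluding U+30FB KATAKANA MIDDLE DOT, plus the halfwidth
// Katakana forms at U+FF66..U+FF9F.)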
// Function for accessing internal utext flags.
//   Replicates an internal UText function.

static inline int32_t utext_i32_flag(int32_t bitIndex) {
    return (int32_t)1 << bitIndex;

 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks vector<int32> to receive the break positions
 * @return The number of breaks found
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
                                         UVector32 &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {

    // UnicodeString version of input UText, NFKC normalized if necessary.
    UnicodeString inString;

    // inputMap[inStringIndex] = corresponding native index from UText inText.
    // If NULL then mapping is 1:1
    LocalPointer<UVector32> inputMap;

    UErrorCode status = U_ZERO_ERROR;

    // if UText has the input string as one contiguous UTF-16 chunk
    if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
         inText->chunkNativeStart <= rangeStart &&
         inText->chunkNativeLimit >= rangeEnd   &&
         inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {

        // Input UText is in one contiguous UTF-16 chunk.
        // Use Read-only aliasing UnicodeString.
        inString.setTo(FALSE,
                       inText->chunkContents + rangeStart - inText->chunkNativeStart,
                       rangeEnd - rangeStart);

        // Copy the text from the original inText (UText) to inString (UnicodeString).
        // Create a map from UnicodeString indices -> UText offsets.
        utext_setNativeIndex(inText, rangeStart);
        int32_t limit = rangeEnd;
        U_ASSERT(limit <= utext_nativeLength(inText));
        if (limit > utext_nativeLength(inText)) {
            limit = (int32_t)utext_nativeLength(inText);
        inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
        if (U_FAILURE(status)) {
        while (utext_getNativeIndex(inText) < limit) {
            int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
            UChar32 c = utext_next32(inText);
            U_ASSERT(c != U_SENTINEL);
            while (inputMap->size() < inString.length()) {
                inputMap->addElement(nativePosition, status);
        inputMap->addElement(limit, status);

    if (!nfkcNorm2->isNormalized(inString, status)) {
        UnicodeString normalizedInput;
        //  normalizedMap[normalizedInput position] == original UText position.
        LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
        if (U_FAILURE(status)) {

        UnicodeString fragment;
        UnicodeString normalizedFragment;
        for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
            int32_t fragmentStartI = srcI;
            UChar32 c = inString.char32At(srcI);
                srcI = inString.moveIndex32(srcI, 1);
                if (srcI == inString.length()) {
                c = inString.char32At(srcI);
                if (nfkcNorm2->hasBoundaryBefore(c)) {
            nfkcNorm2->normalize(fragment, normalizedFragment, status);
            normalizedInput.append(normalizedFragment);

            // Map every position in the normalized chunk to the start of the chunk
            //   in the original input.
            int32_t fragmentOriginalStart = inputMap.isValid() ?
                    inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
            while (normalizedMap->size() < normalizedInput.length()) {
                normalizedMap->addElement(fragmentOriginalStart, status);
                if (U_FAILURE(status)) {

        U_ASSERT(normalizedMap->size() == normalizedInput.length());
        int32_t nativeEnd = inputMap.isValid() ?
                inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
        normalizedMap->addElement(nativeEnd, status);

        inputMap = std::move(normalizedMap);
        inString = std::move(normalizedInput);

    int32_t numCodePts = inString.countChar32();
    if (numCodePts != inString.length()) {
        // There are supplementary characters in the input.
        // The dictionary will produce boundary positions in terms of code point indexes,
        //   not in terms of code unit string indexes.
        // Use the inputMap mechanism to take care of this in addition to indexing differences
        //   from normalization and/or UTF-8 input.
        UBool hadExistingMap = inputMap.isValid();
        if (!hadExistingMap) {
            inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
            if (U_FAILURE(status)) {
        for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
            U_ASSERT(cuIdx >= cpIdx);
            if (hadExistingMap) {
                inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
                inputMap->addElement(cuIdx+rangeStart, status);
            if (cuIdx == inString.length()) {
    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // code points in the range to be matched.
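    // ("snlp" is the accumulated dictionary cost, roughly a sum of negative log
    // probabilities, so smaller values mark more likely segmentations.)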
    UVector32 bestSnlp(numCodePts + 1, status);
    bestSnlp.addElement(0, status);
    for(int32_t i = 1; i <= numCodePts; i++) {
        bestSnlp.addElement(kuint32max, status);

    // prev[i] is the index of the last CJK code point in the previous word in
    // the best segmentation of the first i characters.
    UVector32 prev(numCodePts + 1, status);
    for(int32_t i = 0; i <= numCodePts; i++){
        prev.addElement(-1, status);

    const int32_t maxWordSize = 20;
    UVector32 values(numCodePts, status);
    values.setSize(numCodePts);
    UVector32 lengths(numCodePts, status);
    lengths.setSize(numCodePts);

    UText fu = UTEXT_INITIALIZER;
    utext_openUnicodeString(&fu, &inString, &status);

    // Dynamic programming to find the best segmentation.

    // In outer loop, i  is the code point index,
    //                ix is the corresponding string (code unit) index.
    //    They differ when the string contains supplementary characters.
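    // At each reachable position i, every dictionary match of length len starting there
    // relaxes bestSnlp[i + len] to min(bestSnlp[i + len], bestSnlp[i] + cost) and records
    // the backpointer prev[i + len] = i used later to recover the boundaries.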
    bool is_prev_katakana = false;
    for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
        utext_setNativeIndex(&fu, ix);
        count = fDictionary->matches(&fu, maxWordSize, numCodePts,
                             NULL, lengths.getBuffer(), values.getBuffer(), NULL);
        // Note: lengths is filled with code point lengths
        //       The NULL parameter is the ignored code unit lengths.

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if ((count == 0 || lengths.elementAti(0) != 1) &&
                !fHangulWordSet.contains(inString.char32At(ix))) {
            values.setElementAt(maxSnlp, count);   // 255
            lengths.setElementAt(1, count++);

        for (int32_t j = 0; j < count; j++) {
            uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
            int32_t ln_j_i = lengths.elementAti(j) + i;
            if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
                bestSnlp.setElementAt(newSnlp, ln_j_i);
                prev.setElementAt(i, ln_j_i);
        // A Katakana word consisting of a single character is pretty rare, so we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
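        // (Runs of kMaxKatakanaGroupLength (20) characters or more do not receive this
        // default-cost candidate.)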
        bool is_katakana = isKatakana(inString.char32At(ix));
        int32_t katakanaRunLength = 1;
        if (!is_prev_katakana && is_katakana) {
            int32_t j = inString.moveIndex32(ix, 1);
            // Find the end of the continuous run of Katakana characters
            while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
                    isKatakana(inString.char32At(j))) {
                j = inString.moveIndex32(j, 1);
                katakanaRunLength++;
            if (katakanaRunLength < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
                if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
                    bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
                    prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
        is_prev_katakana = is_katakana;
    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numCodePts] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e., t_boundary[0] = numCodePts,
    // and reverse the order when the positions are copied to foundBreaks below.
    UVector32 t_boundary(numCodePts+1, status);

    int32_t numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
        t_boundary.addElement(numCodePts, status);
        for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
            t_boundary.addElement(i, status);
        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
    // Add a break for the start of the dictionary range if there is not one there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary.addElement(0, status);

    // Now that we're done, convert positions in t_boundary[] (indices in
    // the normalized input string) back to indices in the original input UText
    // while reversing t_boundary and pushing values to foundBreaks.
    int32_t prevCPPos = -1;
    int32_t prevUTextPos = -1;
    for (int32_t i = numBreaks-1; i >= 0; i--) {
        int32_t cpPos = t_boundary.elementAti(i);
        U_ASSERT(cpPos > prevCPPos);
        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
        U_ASSERT(utextPos >= prevUTextPos);
        if (utextPos > prevUTextPos) {
            // Boundaries are added to foundBreaks output in ascending order.
            U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
            foundBreaks.push(utextPos, status);
            // Normalization expanded the input text, the dictionary found a boundary
            // within the expansion, giving two boundaries with the same index in the
            // original text. Ignore the second. See ticket #12918.
        prevUTextPos = utextPos;
    (void)prevCPPos;    // suppress compiler warnings about unused variable

    // inString goes out of scope
    // inputMap goes out of scope

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */