apple/icu (ICU-59173.0.1): icuSources/common/dictbe.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4 *******************************************************************************
5 * Copyright (C) 2006-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_BREAK_ITERATION
13
14 #include "brkeng.h"
15 #include "dictbe.h"
16 #include "unicode/uniset.h"
17 #include "unicode/chariter.h"
18 #include "unicode/ubrk.h"
19 #include "uvectr32.h"
20 #include "uvector.h"
21 #include "uassert.h"
22 #include "unicode/normlzr.h"
23 #include "cmemory.h"
24 #include "dictionarydata.h"
25
26 U_NAMESPACE_BEGIN
27
28 /*
29 ******************************************************************
30 */
31
32 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
33 fTypes = breakTypes;
34 }
35
36 DictionaryBreakEngine::~DictionaryBreakEngine() {
37 }
38
39 UBool
40 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
41 return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
42 && fSet.contains(c));
43 }
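// Illustrative note: ThaiBreakEngine, for example, is constructed with
// breakTypes = (1<<UBRK_WORD) | (1<<UBRK_LINE), so its handles() returns TRUE only
// for word and line break iteration over characters in its Thai set, and FALSE for
// other break types such as character or sentence breaks.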
44
45 int32_t
46 DictionaryBreakEngine::findBreaks( UText *text,
47 int32_t startPos,
48 int32_t endPos,
49 UBool reverse,
50 int32_t breakType,
51 UStack &foundBreaks ) const {
52 int32_t result = 0;
53
54 // Find the span of characters included in the set.
55 // The span to break begins at the current position in the text, and
56 // extends towards the start or end of the text, depending on 'reverse'.
57
58 int32_t start = (int32_t)utext_getNativeIndex(text);
59 int32_t current;
60 int32_t rangeStart;
61 int32_t rangeEnd;
62 UChar32 c = utext_current32(text);
63 if (reverse) {
64 UBool isDict = fSet.contains(c);
65 while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
66 c = utext_previous32(text);
67 isDict = fSet.contains(c);
68 }
69 if (current < startPos) {
70 rangeStart = startPos;
71 } else {
72 rangeStart = current;
73 if (!isDict) {
74 utext_next32(text);
75 rangeStart = (int32_t)utext_getNativeIndex(text);
76 }
77 }
78 // rangeEnd = start + 1;
79 utext_setNativeIndex(text, start);
80 utext_next32(text);
81 rangeEnd = (int32_t)utext_getNativeIndex(text);
82 }
83 else {
84 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
85 utext_next32(text); // TODO: recast loop for postincrement
86 c = utext_current32(text);
87 }
88 rangeStart = start;
89 rangeEnd = current;
90 }
91 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
92 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
93 utext_setNativeIndex(text, current);
94 }
95
96 return result;
97 }
98
99 void
100 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
101 fSet = set;
102 // Compact for caching
103 fSet.compact();
104 }
105
106 /*
107 ******************************************************************
108 * PossibleWord
109 */
110
111 // Helper class for improving readability of the Thai/Lao/Burmese/Khmer word break
112 // algorithm. The implementation is completely inline.
113
114 // List size, limited by the maximum number of words in the dictionary
115 // that form a nested sequence.
116 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
117
118 class PossibleWord {
119 private:
120 // list of word candidate lengths, in increasing length order
121 // TODO: bytes would be sufficient for word lengths.
122 int32_t count; // Count of candidates
123 int32_t prefix; // The longest match with a dictionary word
124 int32_t offset; // Offset in the text of these candidates
125 int32_t mark; // The preferred candidate's offset
126 int32_t current; // The candidate we're currently looking at
127 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.
128 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.
129
130 public:
131 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
132 ~PossibleWord() {};
133
134 // Fill the list of candidates if needed, select the longest, and return the number found
135 int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
136
137 // Select the currently marked candidate, point after it in the text, and invalidate self
138 int32_t acceptMarked( UText *text );
139
140 // Back up from the current candidate to the next shorter one; return TRUE if one exists,
141 // and leave the text positioned after it
142 UBool backUp( UText *text );
143
144 // Return the longest prefix this candidate location shares with a dictionary word
145 // Return value is in code points.
146 int32_t longestPrefix() { return prefix; };
147
148 // Mark the current candidate as the one we like
149 void markCurrent() { mark = current; };
150
151 // Get length in code points of the marked word.
152 int32_t markedCPLength() { return cpLengths[mark]; };
153 };
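// Informal usage sketch of the candidate cycle, as exercised by the engines below
// (k and dict are stand-ins for the engine's loop index and its fDictionary member):
//
//     PossibleWord words[LOOKAHEAD];                          // LOOKAHEAD = THAI_LOOKAHEAD etc.
//     int32_t n = words[k].candidates(text, dict, rangeEnd);  // fill list; text is left
//                                                             // after the longest candidate
//     words[k].markCurrent();                                 // remember the preferred candidate
//     while (words[k].backUp(text)) { /* try shorter candidates */ }
//     int32_t cuLen = words[k].acceptMarked(text);            // commit; text ends up after the word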
154
155
156 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
157 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
158 int32_t start = (int32_t)utext_getNativeIndex(text);
159 if (start != offset) {
160 offset = start;
161 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
162 // Dictionary leaves text after longest prefix, not longest word. Back up.
163 if (count <= 0) {
164 utext_setNativeIndex(text, start);
165 }
166 }
167 if (count > 0) {
168 utext_setNativeIndex(text, start+cuLengths[count-1]);
169 }
170 current = count-1;
171 mark = current;
172 return count;
173 }
174
175 int32_t
176 PossibleWord::acceptMarked( UText *text ) {
177 utext_setNativeIndex(text, offset + cuLengths[mark]);
178 return cuLengths[mark];
179 }
180
181
182 UBool
183 PossibleWord::backUp( UText *text ) {
184 if (current > 0) {
185 utext_setNativeIndex(text, offset + cuLengths[--current]);
186 return TRUE;
187 }
188 return FALSE;
189 }
190
191 /*
192 ******************************************************************
193 * ThaiBreakEngine
194 */
195
196 // How many words in a row are "good enough"?
197 static const int32_t THAI_LOOKAHEAD = 3;
198
199 // Will not combine a non-word with a preceding dictionary word longer than this
200 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
201
202 // Will not combine a non-word with a preceding word if the non-word shares at
203 // least this much prefix with a dictionary word
204 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
205
206 // Elision character
207 static const int32_t THAI_PAIYANNOI = 0x0E2F;
208
209 // Repeat character
210 static const int32_t THAI_MAIYAMOK = 0x0E46;
211
212 // Minimum word size
213 static const int32_t THAI_MIN_WORD = 2;
214
215 // Minimum number of characters for two words
216 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
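// Rough outline of how these constants drive divideUpDictionaryRange() below: at each
// position the engine collects candidate dictionary words; when several match, it
// prefers a candidate that is itself followed by one (ideally two) further dictionary
// words within a THAI_LOOKAHEAD-word window. A run of characters that is not a
// dictionary word may be merged into the word just found, but only if that word is
// shorter than THAI_ROOT_COMBINE_THRESHOLD code points and the non-word shares fewer
// than THAI_PREFIX_COMBINE_THRESHOLD leading characters with any dictionary word.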
217
218 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
219 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
220 fDictionary(adoptDictionary)
221 {
222 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
223 if (U_SUCCESS(status)) {
224 setCharacters(fThaiWordSet);
225 }
226 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
227 fMarkSet.add(0x0020);
228 fEndWordSet = fThaiWordSet;
229 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
230 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
231 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
232 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
233 fSuffixSet.add(THAI_PAIYANNOI);
234 fSuffixSet.add(THAI_MAIYAMOK);
235
236 // Compact for caching.
237 fMarkSet.compact();
238 fEndWordSet.compact();
239 fBeginWordSet.compact();
240 fSuffixSet.compact();
241 }
242
243 ThaiBreakEngine::~ThaiBreakEngine() {
244 delete fDictionary;
245 }
246
247 int32_t
248 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
249 int32_t rangeStart,
250 int32_t rangeEnd,
251 UStack &foundBreaks ) const {
252 utext_setNativeIndex(text, rangeStart);
253 utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
254 if (utext_getNativeIndex(text) >= rangeEnd) {
255 return 0; // Not enough characters for two words
256 }
257 utext_setNativeIndex(text, rangeStart);
258
259
260 uint32_t wordsFound = 0;
261 int32_t cpWordLength = 0; // Word Length in Code Points.
262 int32_t cuWordLength = 0; // Word length in code units (UText native indexing)
263 int32_t current;
264 UErrorCode status = U_ZERO_ERROR;
265 PossibleWord words[THAI_LOOKAHEAD];
266
267 utext_setNativeIndex(text, rangeStart);
268
269 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
270 cpWordLength = 0;
271 cuWordLength = 0;
272
273 // Look for candidate words at the current position
274 int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
275
276 // If we found exactly one, use that
277 if (candidates == 1) {
278 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
279 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
280 wordsFound += 1;
281 }
282 // If there was more than one, see which one can take us forward the most words
283 else if (candidates > 1) {
284 // If we're already at the end of the range, we're done
285 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
286 goto foundBest;
287 }
288 do {
289 int32_t wordsMatched = 1;
290 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
291 if (wordsMatched < 2) {
292 // Followed by another dictionary word; mark first word as a good candidate
293 words[wordsFound%THAI_LOOKAHEAD].markCurrent();
294 wordsMatched = 2;
295 }
296
297 // If we're already at the end of the range, we're done
298 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
299 goto foundBest;
300 }
301
302 // See if any of the possible second words is followed by a third word
303 do {
304 // If we find a third word, stop right away
305 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
306 words[wordsFound % THAI_LOOKAHEAD].markCurrent();
307 goto foundBest;
308 }
309 }
310 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
311 }
312 }
313 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
314 foundBest:
315 // Set UText position to after the accepted word.
316 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
317 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
318 wordsFound += 1;
319 }
320
321 // We come here whether or not we found a word. We look ahead to the
322 // next word. If it's not a dictionary word, we will combine it with the word we
323 // just found (if there is one), but only if the preceding word does not exceed
324 // the threshold.
325 // The text iterator should now be positioned at the end of the word we found.
326
327 UChar32 uc = 0;
328 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
329 // if it is a dictionary word, do nothing. If it isn't, then if there is
330 // no preceding word, or the non-word shares less than the minimum threshold
331 // of characters with a dictionary word, then scan to resynchronize
332 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
333 && (cuWordLength == 0
334 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
335 // Look for a plausible word boundary
336 int32_t remaining = rangeEnd - (current+cuWordLength);
337 UChar32 pc;
338 int32_t chars = 0;
339 for (;;) {
340 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
341 pc = utext_next32(text);
342 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
343 chars += pcSize;
344 remaining -= pcSize;
345 if (remaining <= 0) {
346 break;
347 }
348 uc = utext_current32(text);
349 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
350 // Maybe. See if it's in the dictionary.
351 // NOTE: The original Apple code checked that the next
352 // two characters after uc were not 0x0E4C THANTHAKHAT before
353 // checking the dictionary. That is just a performance filter,
354 // but it's not clear it's faster than checking the trie.
355 int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
356 utext_setNativeIndex(text, current + cuWordLength + chars);
357 if (candidates > 0) {
358 break;
359 }
360 }
361 }
362
363 // Bump the word count if there wasn't already one
364 if (cuWordLength <= 0) {
365 wordsFound += 1;
366 }
367
368 // Update the length with the passed-over characters
369 cuWordLength += chars;
370 }
371 else {
372 // Back up to where we were for next iteration
373 utext_setNativeIndex(text, current+cuWordLength);
374 }
375 }
376
377 // Never stop before a combining mark.
378 int32_t currPos;
379 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
380 utext_next32(text);
381 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
382 }
383
384 // Look ahead for possible suffixes if a dictionary word does not follow.
385 // We do this in code rather than using a rule so that the heuristic
386 // resynch continues to function. For example, one of the suffix characters
387 // could be a typo in the middle of a word.
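// Concretely, the two suffix characters handled below are THAI_PAIYANNOI (U+0E2F, an
// abbreviation/elision sign) and THAI_MAIYAMOK (U+0E46, a repetition sign). Each is
// absorbed into the preceding word unless it immediately follows another suffix
// character (for PAIYANNOI) or another MAIYAMOK (for MAIYAMOK).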
388 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
389 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
390 && fSuffixSet.contains(uc = utext_current32(text))) {
391 if (uc == THAI_PAIYANNOI) {
392 if (!fSuffixSet.contains(utext_previous32(text))) {
393 // Skip over previous end and PAIYANNOI
394 utext_next32(text);
395 int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
396 utext_next32(text);
397 cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word
398 uc = utext_current32(text); // Fetch next character
399 }
400 else {
401 // Restore prior position
402 utext_next32(text);
403 }
404 }
405 if (uc == THAI_MAIYAMOK) {
406 if (utext_previous32(text) != THAI_MAIYAMOK) {
407 // Skip over previous end and MAIYAMOK
408 utext_next32(text);
409 int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
410 utext_next32(text);
411 cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word
412 }
413 else {
414 // Restore prior position
415 utext_next32(text);
416 }
417 }
418 }
419 else {
420 utext_setNativeIndex(text, current+cuWordLength);
421 }
422 }
423
424 // Did we find a word on this iteration? If so, push it on the break stack
425 if (cuWordLength > 0) {
426 foundBreaks.push((current+cuWordLength), status);
427 }
428 }
429
430 // Don't return a break for the end of the dictionary range if there is one there.
431 if (foundBreaks.peeki() >= rangeEnd) {
432 (void) foundBreaks.popi();
433 wordsFound -= 1;
434 }
435
436 return wordsFound;
437 }
438
439 /*
440 ******************************************************************
441 * LaoBreakEngine
442 */
443
444 // How many words in a row are "good enough"?
445 static const int32_t LAO_LOOKAHEAD = 3;
446
447 // Will not combine a non-word with a preceding dictionary word longer than this
448 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
449
450 // Will not combine a non-word with a preceding word if the non-word shares at
451 // least this much prefix with a dictionary word
452 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
453
454 // Minimum word size
455 static const int32_t LAO_MIN_WORD = 2;
456
457 // Minimum number of characters for two words
458 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
459
460 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
461 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
462 fDictionary(adoptDictionary)
463 {
464 fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
465 if (U_SUCCESS(status)) {
466 setCharacters(fLaoWordSet);
467 }
468 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
469 fMarkSet.add(0x0020);
470 fEndWordSet = fLaoWordSet;
471 fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
472 fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
473 fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
474 fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels
475
476 // Compact for caching.
477 fMarkSet.compact();
478 fEndWordSet.compact();
479 fBeginWordSet.compact();
480 }
481
482 LaoBreakEngine::~LaoBreakEngine() {
483 delete fDictionary;
484 }
485
486 int32_t
487 LaoBreakEngine::divideUpDictionaryRange( UText *text,
488 int32_t rangeStart,
489 int32_t rangeEnd,
490 UStack &foundBreaks ) const {
491 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
492 return 0; // Not enough characters for two words
493 }
494
495 uint32_t wordsFound = 0;
496 int32_t cpWordLength = 0;
497 int32_t cuWordLength = 0;
498 int32_t current;
499 UErrorCode status = U_ZERO_ERROR;
500 PossibleWord words[LAO_LOOKAHEAD];
501
502 utext_setNativeIndex(text, rangeStart);
503
504 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
505 cuWordLength = 0;
506 cpWordLength = 0;
507
508 // Look for candidate words at the current position
509 int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
510
511 // If we found exactly one, use that
512 if (candidates == 1) {
513 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
514 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
515 wordsFound += 1;
516 }
517 // If there was more than one, see which one can take us forward the most words
518 else if (candidates > 1) {
519 // If we're already at the end of the range, we're done
520 if (utext_getNativeIndex(text) >= rangeEnd) {
521 goto foundBest;
522 }
523 do {
524 int32_t wordsMatched = 1;
525 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
526 if (wordsMatched < 2) {
527 // Followed by another dictionary word; mark first word as a good candidate
528 words[wordsFound%LAO_LOOKAHEAD].markCurrent();
529 wordsMatched = 2;
530 }
531
532 // If we're already at the end of the range, we're done
533 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
534 goto foundBest;
535 }
536
537 // See if any of the possible second words is followed by a third word
538 do {
539 // If we find a third word, stop right away
540 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
541 words[wordsFound % LAO_LOOKAHEAD].markCurrent();
542 goto foundBest;
543 }
544 }
545 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
546 }
547 }
548 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
549 foundBest:
550 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
551 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
552 wordsFound += 1;
553 }
554
555 // We come here whether or not we found a word. We look ahead to the
556 // next word. If it's not a dictionary word, we will combine it with the word we
557 // just found (if there is one), but only if the preceding word does not exceed
558 // the threshold.
559 // The text iterator should now be positioned at the end of the word we found.
560 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
561 // if it is a dictionary word, do nothing. If it isn't, then if there is
562 // no preceding word, or the non-word shares less than the minimum threshold
563 // of characters with a dictionary word, then scan to resynchronize
564 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
565 && (cuWordLength == 0
566 || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
567 // Look for a plausible word boundary
568 int32_t remaining = rangeEnd - (current + cuWordLength);
569 UChar32 pc;
570 UChar32 uc;
571 int32_t chars = 0;
572 for (;;) {
573 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
574 pc = utext_next32(text);
575 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
576 chars += pcSize;
577 remaining -= pcSize;
578 if (remaining <= 0) {
579 break;
580 }
581 uc = utext_current32(text);
582 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
583 // Maybe. See if it's in the dictionary.
584 // TODO: this looks iffy; compare with old code.
585 int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
586 utext_setNativeIndex(text, current + cuWordLength + chars);
587 if (candidates > 0) {
588 break;
589 }
590 }
591 }
592
593 // Bump the word count if there wasn't already one
594 if (cuWordLength <= 0) {
595 wordsFound += 1;
596 }
597
598 // Update the length with the passed-over characters
599 cuWordLength += chars;
600 }
601 else {
602 // Back up to where we were for next iteration
603 utext_setNativeIndex(text, current + cuWordLength);
604 }
605 }
606
607 // Never stop before a combining mark.
608 int32_t currPos;
609 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
610 utext_next32(text);
611 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
612 }
613
614 // Look ahead for possible suffixes if a dictionary word does not follow.
615 // We do this in code rather than using a rule so that the heuristic
616 // resynch continues to function. For example, one of the suffix characters
617 // could be a typo in the middle of a word.
618 // NOT CURRENTLY APPLICABLE TO LAO
619
620 // Did we find a word on this iteration? If so, push it on the break stack
621 if (cuWordLength > 0) {
622 foundBreaks.push((current+cuWordLength), status);
623 }
624 }
625
626 // Don't return a break for the end of the dictionary range if there is one there.
627 if (foundBreaks.peeki() >= rangeEnd) {
628 (void) foundBreaks.popi();
629 wordsFound -= 1;
630 }
631
632 return wordsFound;
633 }
634
635 /*
636 ******************************************************************
637 * BurmeseBreakEngine
638 */
639
640 // How many words in a row are "good enough"?
641 static const int32_t BURMESE_LOOKAHEAD = 3;
642
643 // Will not combine a non-word with a preceding dictionary word longer than this
644 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
645
646 // Will not combine a non-word with a preceding word if the non-word shares at
647 // least this much prefix with a dictionary word
648 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
649
650 // Minimum word size
651 static const int32_t BURMESE_MIN_WORD = 2;
652
653 // Minimum number of characters for two words
654 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
655
656 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
657 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
658 fDictionary(adoptDictionary)
659 {
660 fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
661 if (U_SUCCESS(status)) {
662 setCharacters(fBurmeseWordSet);
663 }
664 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
665 fMarkSet.add(0x0020);
666 fEndWordSet = fBurmeseWordSet;
667 fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
668
669 // Compact for caching.
670 fMarkSet.compact();
671 fEndWordSet.compact();
672 fBeginWordSet.compact();
673 }
674
675 BurmeseBreakEngine::~BurmeseBreakEngine() {
676 delete fDictionary;
677 }
678
679 int32_t
680 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
681 int32_t rangeStart,
682 int32_t rangeEnd,
683 UStack &foundBreaks ) const {
684 if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
685 return 0; // Not enough characters for two words
686 }
687
688 uint32_t wordsFound = 0;
689 int32_t cpWordLength = 0;
690 int32_t cuWordLength = 0;
691 int32_t current;
692 UErrorCode status = U_ZERO_ERROR;
693 PossibleWord words[BURMESE_LOOKAHEAD];
694
695 utext_setNativeIndex(text, rangeStart);
696
697 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
698 cuWordLength = 0;
699 cpWordLength = 0;
700
701 // Look for candidate words at the current position
702 int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
703
704 // If we found exactly one, use that
705 if (candidates == 1) {
706 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
707 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
708 wordsFound += 1;
709 }
710 // If there was more than one, see which one can take us forward the most words
711 else if (candidates > 1) {
712 // If we're already at the end of the range, we're done
713 if (utext_getNativeIndex(text) >= rangeEnd) {
714 goto foundBest;
715 }
716 do {
717 int32_t wordsMatched = 1;
718 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
719 if (wordsMatched < 2) {
720 // Followed by another dictionary word; mark first word as a good candidate
721 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
722 wordsMatched = 2;
723 }
724
725 // If we're already at the end of the range, we're done
726 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
727 goto foundBest;
728 }
729
730 // See if any of the possible second words is followed by a third word
731 do {
732 // If we find a third word, stop right away
733 if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
734 words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
735 goto foundBest;
736 }
737 }
738 while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
739 }
740 }
741 while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
742 foundBest:
743 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
744 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
745 wordsFound += 1;
746 }
747
748 // We come here whether or not we found a word. We look ahead to the
749 // next word. If it's not a dictionary word, we will combine it with the word we
750 // just found (if there is one), but only if the preceding word does not exceed
751 // the threshold.
752 // The text iterator should now be positioned at the end of the word we found.
753 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
754 // if it is a dictionary word, do nothing. If it isn't, then if there is
755 // no preceding word, or the non-word shares less than the minimum threshold
756 // of characters with a dictionary word, then scan to resynchronize
757 if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
758 && (cuWordLength == 0
759 || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
760 // Look for a plausible word boundary
761 int32_t remaining = rangeEnd - (current + cuWordLength);
762 UChar32 pc;
763 UChar32 uc;
764 int32_t chars = 0;
765 for (;;) {
766 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
767 pc = utext_next32(text);
768 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
769 chars += pcSize;
770 remaining -= pcSize;
771 if (remaining <= 0) {
772 break;
773 }
774 uc = utext_current32(text);
775 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
776 // Maybe. See if it's in the dictionary.
777 // TODO: this looks iffy; compare with old code.
778 int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
779 utext_setNativeIndex(text, current + cuWordLength + chars);
780 if (candidates > 0) {
781 break;
782 }
783 }
784 }
785
786 // Bump the word count if there wasn't already one
787 if (cuWordLength <= 0) {
788 wordsFound += 1;
789 }
790
791 // Update the length with the passed-over characters
792 cuWordLength += chars;
793 }
794 else {
795 // Back up to where we were for next iteration
796 utext_setNativeIndex(text, current + cuWordLength);
797 }
798 }
799
800 // Never stop before a combining mark.
801 int32_t currPos;
802 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
803 utext_next32(text);
804 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
805 }
806
807 // Look ahead for possible suffixes if a dictionary word does not follow.
808 // We do this in code rather than using a rule so that the heuristic
809 // resynch continues to function. For example, one of the suffix characters
810 // could be a typo in the middle of a word.
811 // NOT CURRENTLY APPLICABLE TO BURMESE
812
813 // Did we find a word on this iteration? If so, push it on the break stack
814 if (cuWordLength > 0) {
815 foundBreaks.push((current+cuWordLength), status);
816 }
817 }
818
819 // Don't return a break for the end of the dictionary range if there is one there.
820 if (foundBreaks.peeki() >= rangeEnd) {
821 (void) foundBreaks.popi();
822 wordsFound -= 1;
823 }
824
825 return wordsFound;
826 }
827
828 /*
829 ******************************************************************
830 * KhmerBreakEngine
831 */
832
833 // How many words in a row are "good enough"?
834 static const int32_t KHMER_LOOKAHEAD = 3;
835
836 // Will not combine a non-word with a preceding dictionary word longer than this
837 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
838
839 // Will not combine a non-word with a preceding word if the non-word shares at
840 // least this much prefix with a dictionary word
841 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
842
843 // Minimum word size
844 static const int32_t KHMER_MIN_WORD = 2;
845
846 // Minimum number of characters for two words
847 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
848
849 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
850 : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
851 fDictionary(adoptDictionary)
852 {
853 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
854 if (U_SUCCESS(status)) {
855 setCharacters(fKhmerWordSet);
856 }
857 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
858 fMarkSet.add(0x0020);
859 fEndWordSet = fKhmerWordSet;
860 fBeginWordSet.add(0x1780, 0x17B3);
861 //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
862 //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
863 //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
864 fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
865 //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
866 // fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
867 // fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
868 // fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
869 // fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
870 // fSuffixSet.add(THAI_PAIYANNOI);
871 // fSuffixSet.add(THAI_MAIYAMOK);
872
873 // Compact for caching.
874 fMarkSet.compact();
875 fEndWordSet.compact();
876 fBeginWordSet.compact();
877 // fSuffixSet.compact();
878 }
879
880 KhmerBreakEngine::~KhmerBreakEngine() {
881 delete fDictionary;
882 }
883
884 int32_t
885 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
886 int32_t rangeStart,
887 int32_t rangeEnd,
888 UStack &foundBreaks ) const {
889 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
890 return 0; // Not enough characters for two words
891 }
892
893 uint32_t wordsFound = 0;
894 int32_t cpWordLength = 0;
895 int32_t cuWordLength = 0;
896 int32_t current;
897 UErrorCode status = U_ZERO_ERROR;
898 PossibleWord words[KHMER_LOOKAHEAD];
899
900 utext_setNativeIndex(text, rangeStart);
901
902 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
903 cuWordLength = 0;
904 cpWordLength = 0;
905
906 // Look for candidate words at the current position
907 int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
908
909 // If we found exactly one, use that
910 if (candidates == 1) {
911 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
912 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
913 wordsFound += 1;
914 }
915
916 // If there was more than one, see which one can take us forward the most words
917 else if (candidates > 1) {
918 // If we're already at the end of the range, we're done
919 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
920 goto foundBest;
921 }
922 do {
923 int32_t wordsMatched = 1;
924 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
925 if (wordsMatched < 2) {
926 // Followed by another dictionary word; mark first word as a good candidate
927 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
928 wordsMatched = 2;
929 }
930
931 // If we're already at the end of the range, we're done
932 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
933 goto foundBest;
934 }
935
936 // See if any of the possible second words is followed by a third word
937 do {
938 // If we find a third word, stop right away
939 if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
940 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
941 goto foundBest;
942 }
943 }
944 while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
945 }
946 }
947 while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
948 foundBest:
949 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
950 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
951 wordsFound += 1;
952 }
953
954 // We come here whether or not we found a word. We look ahead to the
955 // next word. If it's not a dictionary word, we will combine it with the word we
956 // just found (if there is one), but only if the preceding word does not exceed
957 // the threshold.
958 // The text iterator should now be positioned at the end of the word we found.
959 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
960 // if it is a dictionary word, do nothing. If it isn't, then if there is
961 // no preceding word, or the non-word shares less than the minimum threshold
962 // of characters with a dictionary word, then scan to resynchronize
963 if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
964 && (cuWordLength == 0
965 || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
966 // Look for a plausible word boundary
967 int32_t remaining = rangeEnd - (current+cuWordLength);
968 UChar32 pc;
969 UChar32 uc;
970 int32_t chars = 0;
971 for (;;) {
972 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
973 pc = utext_next32(text);
974 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
975 chars += pcSize;
976 remaining -= pcSize;
977 if (remaining <= 0) {
978 break;
979 }
980 uc = utext_current32(text);
981 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
982 // Maybe. See if it's in the dictionary.
983 int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
984 utext_setNativeIndex(text, current+cuWordLength+chars);
985 if (candidates > 0) {
986 break;
987 }
988 }
989 }
990
991 // Bump the word count if there wasn't already one
992 if (cuWordLength <= 0) {
993 wordsFound += 1;
994 }
995
996 // Update the length with the passed-over characters
997 cuWordLength += chars;
998 }
999 else {
1000 // Back up to where we were for next iteration
1001 utext_setNativeIndex(text, current+cuWordLength);
1002 }
1003 }
1004
1005 // Never stop before a combining mark.
1006 int32_t currPos;
1007 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
1008 utext_next32(text);
1009 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
1010 }
1011
1012 // Look ahead for possible suffixes if a dictionary word does not follow.
1013 // We do this in code rather than using a rule so that the heuristic
1014 // resynch continues to function. For example, one of the suffix characters
1015 // could be a typo in the middle of a word.
1016 // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1017 // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1018 // && fSuffixSet.contains(uc = utext_current32(text))) {
1019 // if (uc == KHMER_PAIYANNOI) {
1020 // if (!fSuffixSet.contains(utext_previous32(text))) {
1021 // // Skip over previous end and PAIYANNOI
1022 // utext_next32(text);
1023 // utext_next32(text);
1024 // wordLength += 1; // Add PAIYANNOI to word
1025 // uc = utext_current32(text); // Fetch next character
1026 // }
1027 // else {
1028 // // Restore prior position
1029 // utext_next32(text);
1030 // }
1031 // }
1032 // if (uc == KHMER_MAIYAMOK) {
1033 // if (utext_previous32(text) != KHMER_MAIYAMOK) {
1034 // // Skip over previous end and MAIYAMOK
1035 // utext_next32(text);
1036 // utext_next32(text);
1037 // wordLength += 1; // Add MAIYAMOK to word
1038 // }
1039 // else {
1040 // // Restore prior position
1041 // utext_next32(text);
1042 // }
1043 // }
1044 // }
1045 // else {
1046 // utext_setNativeIndex(text, current+wordLength);
1047 // }
1048 // }
1049
1050 // Did we find a word on this iteration? If so, push it on the break stack
1051 if (cuWordLength > 0) {
1052 foundBreaks.push((current+cuWordLength), status);
1053 }
1054 }
1055
1056 // Don't return a break for the end of the dictionary range if there is one there.
1057 if (foundBreaks.peeki() >= rangeEnd) {
1058 (void) foundBreaks.popi();
1059 wordsFound -= 1;
1060 }
1061
1062 return wordsFound;
1063 }
1064
1065 #if !UCONFIG_NO_NORMALIZATION
1066 /*
1067 ******************************************************************
1068 * CjkBreakEngine
1069 */
1070 static const uint32_t kuint32max = 0xFFFFFFFF;
1071 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1072 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
1073 // Korean dictionary only includes Hangul syllables
1074 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1075 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1076 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1077 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1078 nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1079
1080 if (U_SUCCESS(status)) {
1081 // handle Korean and Japanese/Chinese using different dictionaries
1082 if (type == kKorean) {
1083 setCharacters(fHangulWordSet);
1084 } else { //Chinese and Japanese
1085 UnicodeSet cjSet;
1086 cjSet.addAll(fHanWordSet);
1087 cjSet.addAll(fKatakanaWordSet);
1088 cjSet.addAll(fHiraganaWordSet);
1089 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1090 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1091 setCharacters(cjSet);
1092 }
1093 }
1094 }
1095
1096 CjkBreakEngine::~CjkBreakEngine(){
1097 delete fDictionary;
1098 }
1099
1100 // The katakanaCost values below are based on the length frequencies of all
1101 // katakana phrases in the dictionary
1102 static const int32_t kMaxKatakanaLength = 8;
1103 static const int32_t kMaxKatakanaGroupLength = 20;
1104 static const uint32_t maxSnlp = 255;
1105
1106 static inline uint32_t getKatakanaCost(int32_t wordLength){
1107 //TODO: fill array with actual values from dictionary!
1108 static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1109 = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1110 return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1111 }
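// Worked example: a run of three Katakana code points is charged katakanaCost[3],
// i.e. 240, when considered as a single candidate word, while any run longer than
// kMaxKatakanaLength (8) gets the flat maximum of 8192, making very long runs
// unattractive as single segments.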
1112
1113 static inline bool isKatakana(uint16_t value) {
1114 return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
1115 (value >= 0xFF66u && value <= 0xFF9fu);
1116 }
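// For example, isKatakana(0x30AB) (KATAKANA LETTER KA) and isKatakana(0xFF76)
// (HALFWIDTH KATAKANA LETTER KA) are true, while 0x30FB (KATAKANA MIDDLE DOT)
// is deliberately excluded, so a run of Katakana stops at the middle dot.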
1117
1118
1119 // Function for accessing internal utext flags.
1120 // Replicates an internal UText function.
1121
1122 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1123 return (int32_t)1 << bitIndex;
1124 }
1125
1126
1127 /*
1128 * @param text A UText representing the text
1129 * @param rangeStart The start of the range of dictionary characters
1130 * @param rangeEnd The end of the range of dictionary characters
1131 * @param foundBreaks Output UStack to receive int32_t break positions
1132 * @return The number of breaks found
1133 */
1134 int32_t
1135 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1136 int32_t rangeStart,
1137 int32_t rangeEnd,
1138 UStack &foundBreaks ) const {
1139 if (rangeStart >= rangeEnd) {
1140 return 0;
1141 }
1142
1143 // UnicodeString version of input UText, NFKC normalized if necessary.
1144 UnicodeString inString;
1145
1146 // inputMap[inStringIndex] = corresponding native index from UText inText.
1147 // If NULL then mapping is 1:1
1148 LocalPointer<UVector32> inputMap;
1149
1150 UErrorCode status = U_ZERO_ERROR;
1151
1152
1153 // if UText has the input string as one contiguous UTF-16 chunk
1154 if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1155 inText->chunkNativeStart <= rangeStart &&
1156 inText->chunkNativeLimit >= rangeEnd &&
1157 inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1158
1159 // Input UText is in one contiguous UTF-16 chunk.
1160 // Use Read-only aliasing UnicodeString.
1161 inString.setTo(FALSE,
1162 inText->chunkContents + rangeStart - inText->chunkNativeStart,
1163 rangeEnd - rangeStart);
1164 } else {
1165 // Copy the text from the original inText (UText) to inString (UnicodeString).
1166 // Create a map from UnicodeString indices -> UText offsets.
1167 utext_setNativeIndex(inText, rangeStart);
1168 int32_t limit = rangeEnd;
1169 U_ASSERT(limit <= utext_nativeLength(inText));
1170 if (limit > utext_nativeLength(inText)) {
1171 limit = (int32_t)utext_nativeLength(inText);
1172 }
1173 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1174 if (U_FAILURE(status)) {
1175 return 0;
1176 }
1177 while (utext_getNativeIndex(inText) < limit) {
1178 int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1179 UChar32 c = utext_next32(inText);
1180 U_ASSERT(c != U_SENTINEL);
1181 inString.append(c);
1182 while (inputMap->size() < inString.length()) {
1183 inputMap->addElement(nativePosition, status);
1184 }
1185 }
1186 inputMap->addElement(limit, status);
1187 }
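// Illustrative note: with UTF-8 input, a single code point can span several native
// indexes but only one or two UTF-16 units in inString, so inputMap records, for
// every UTF-16 unit appended, the native index at which that code point starts.
// Boundary positions computed on inString are mapped back to native UText offsets
// at the end of this function.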
1188
1189
1190 if (!nfkcNorm2->isNormalized(inString, status)) {
1191 UnicodeString normalizedInput;
1192 // normalizedMap[normalizedInput position] == original UText position.
1193 LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1194 if (U_FAILURE(status)) {
1195 return 0;
1196 }
1197
1198 UnicodeString fragment;
1199 UnicodeString normalizedFragment;
1200 for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk
1201 fragment.remove();
1202 int32_t fragmentStartI = srcI;
1203 UChar32 c = inString.char32At(srcI);
1204 for (;;) {
1205 fragment.append(c);
1206 srcI = inString.moveIndex32(srcI, 1);
1207 if (srcI == inString.length()) {
1208 break;
1209 }
1210 c = inString.char32At(srcI);
1211 if (nfkcNorm2->hasBoundaryBefore(c)) {
1212 break;
1213 }
1214 }
1215 nfkcNorm2->normalize(fragment, normalizedFragment, status);
1216 normalizedInput.append(normalizedFragment);
1217
1218 // Map every position in the normalized chunk to the start of the chunk
1219 // in the original input.
1220 int32_t fragmentOriginalStart = inputMap.isValid() ?
1221 inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1222 while (normalizedMap->size() < normalizedInput.length()) {
1223 normalizedMap->addElement(fragmentOriginalStart, status);
1224 if (U_FAILURE(status)) {
1225 break;
1226 }
1227 }
1228 }
1229 U_ASSERT(normalizedMap->size() == normalizedInput.length());
1230 int32_t nativeEnd = inputMap.isValid() ?
1231 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1232 normalizedMap->addElement(nativeEnd, status);
1233
1234 inputMap.moveFrom(normalizedMap);
1235 inString.moveFrom(normalizedInput);
1236 }
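// Example of why the separate normalizedMap is needed (assuming standard NFKC
// behavior): HALFWIDTH KATAKANA LETTER KA (U+FF76) followed by HALFWIDTH KATAKANA
// VOICED SOUND MARK (U+FF9E) normalizes to the single code point KATAKANA LETTER GA
// (U+30AC), so indexes in the normalized string no longer line up 1:1 with indexes
// in the original text.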
1237
1238 int32_t numCodePts = inString.countChar32();
1239 if (numCodePts != inString.length()) {
1240 // There are supplementary characters in the input.
1241 // The dictionary will produce boundary positions in terms of code point indexes,
1242 // not in terms of code unit string indexes.
1243 // Use the inputMap mechanism to take care of this in addition to indexing differences
1244 // from normalization and/or UTF-8 input.
1245 UBool hadExistingMap = inputMap.isValid();
1246 if (!hadExistingMap) {
1247 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1248 if (U_FAILURE(status)) {
1249 return 0;
1250 }
1251 }
1252 int32_t cpIdx = 0;
1253 for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1254 U_ASSERT(cuIdx >= cpIdx);
1255 if (hadExistingMap) {
1256 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1257 } else {
1258 inputMap->addElement(cuIdx+rangeStart, status);
1259 }
1260 cpIdx++;
1261 if (cuIdx == inString.length()) {
1262 break;
1263 }
1264 }
1265 }
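// Example: if inString contains one supplementary character (two UTF-16 code units),
// every code point index after it is one less than the corresponding code unit index;
// the loop above rewrites inputMap in place so that it can be indexed by code point,
// which is how the dictionary reports word lengths below.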
1266
1267 // bestSnlp[i] is the snlp of the best segmentation of the first i
1268 // code points in the range to be matched.
1269 UVector32 bestSnlp(numCodePts + 1, status);
1270 bestSnlp.addElement(0, status);
1271 for(int32_t i = 1; i <= numCodePts; i++) {
1272 bestSnlp.addElement(kuint32max, status);
1273 }
1274
1275
1276 // prev[i] is the index of the last CJK code point in the previous word in
1277 // the best segmentation of the first i characters.
1278 UVector32 prev(numCodePts + 1, status);
1279 for(int32_t i = 0; i <= numCodePts; i++){
1280 prev.addElement(-1, status);
1281 }
1282
1283 const int32_t maxWordSize = 20;
1284 UVector32 values(numCodePts, status);
1285 values.setSize(numCodePts);
1286 UVector32 lengths(numCodePts, status);
1287 lengths.setSize(numCodePts);
1288
1289 UText fu = UTEXT_INITIALIZER;
1290 utext_openUnicodeString(&fu, &inString, &status);
1291
1292 // Dynamic programming to find the best segmentation.
1293
1294 // In outer loop, i is the code point index,
1295 // ix is the corresponding string (code unit) index.
1296 // They differ when the string contains supplementary characters.
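// Sketch of the recurrence evaluated in the loop below (snlp is read here as a
// "sum of negative log probabilities" style cost, so lower is better):
//
//     for each dictionary word of length len (code points) starting at code point i:
//         bestSnlp[i + len] = min(bestSnlp[i + len], bestSnlp[i] + value(word))
//
// bestSnlp[numCodePts] then holds the cost of the cheapest segmentation of the
// whole range, and prev[] holds the backtracking links used to recover it afterwards.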
1297 int32_t ix = 0;
1298 bool is_prev_katakana = false;
1299 for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) {
1300 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1301 continue;
1302 }
1303
1304 int32_t count;
1305 utext_setNativeIndex(&fu, ix);
1306 count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1307 NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1308 // Note: lengths is filled with code point lengths
1309 // The NULL parameter is the ignored code unit lengths.
1310
1311 // if there are no single character matches found in the dictionary
1312 // starting with this character, treat the character as a 1-character word
1313 // with the highest value possible, i.e. the least likely to occur.
1314 // Exclude Korean characters from this treatment, as they should be left
1315 // together by default.
1316 if ((count == 0 || lengths.elementAti(0) != 1) &&
1317 !fHangulWordSet.contains(inString.char32At(ix))) {
1318 values.setElementAt(maxSnlp, count); // 255
1319 lengths.setElementAt(1, count++);
1320 }
1321
1322 for (int32_t j = 0; j < count; j++) {
1323 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1324 int32_t ln_j_i = lengths.elementAti(j) + i;
1325 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1326 bestSnlp.setElementAt(newSnlp, ln_j_i);
1327 prev.setElementAt(i, ln_j_i);
1328 }
1329 }
1330
1331 // In Japanese, a Katakana word consisting of a single
1332 // character is pretty rare. So we apply
1333 // the following heuristic to Katakana: any continuous run of Katakana
1334 // characters is considered a candidate word with a default cost
1335 // specified in the katakanaCost table according to its length.
1336
1337 bool is_katakana = isKatakana(inString.char32At(ix));
1338 int32_t katakanaRunLength = 1;
1339 if (!is_prev_katakana && is_katakana) {
1340 int32_t j = inString.moveIndex32(ix, 1);
1341 // Find the end of the continuous run of Katakana characters
1342 while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1343 isKatakana(inString.char32At(j))) {
1344 j = inString.moveIndex32(j, 1);
1345 katakanaRunLength++;
1346 }
1347 if (katakanaRunLength < kMaxKatakanaGroupLength) {
1348 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1349 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
1350 bestSnlp.setElementAt(newSnlp, j);
1351 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
1352 }
1353 }
1354 }
1355 is_prev_katakana = is_katakana;
1356 }
1357 utext_close(&fu);
1358
1359 // Start pushing the optimal offset index into t_boundary (t for tentative).
1360 // prev[numCodePts] is guaranteed to be meaningful.
1361 // We'll first push in reverse order, i.e. t_boundary[0] = numCodePts, and
1362 // afterwards walk t_boundary backwards when copying boundaries to foundBreaks.
1363 UVector32 t_boundary(numCodePts+1, status);
1364
1365 int32_t numBreaks = 0;
1366 // No segmentation found, set boundary to end of range
1367 if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1368 t_boundary.addElement(numCodePts, status);
1369 numBreaks++;
1370 } else {
1371 for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1372 t_boundary.addElement(i, status);
1373 numBreaks++;
1374 }
1375 U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1376 }
1377
1378 // Add a break for the start of the dictionary range if there is not one
1379 // there already.
1380 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1381 t_boundary.addElement(0, status);
1382 numBreaks++;
1383 }
1384
1385 // Now that we're done, convert positions in t_boundary[] (indices in
1386 // the normalized input string) back to indices in the original input UText
1387 // while reversing t_boundary and pushing values to foundBreaks.
1388 int32_t prevCPPos = -1;
1389 int32_t prevUTextPos = -1;
1390 for (int32_t i = numBreaks-1; i >= 0; i--) {
1391 int32_t cpPos = t_boundary.elementAti(i);
1392 U_ASSERT(cpPos > prevCPPos);
1393 int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1394 U_ASSERT(utextPos >= prevUTextPos);
1395 if (utextPos > prevUTextPos) {
1396 // Boundaries are added to foundBreaks output in ascending order.
1397 U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1398 foundBreaks.push(utextPos, status);
1399 } else {
1400 // Normalization expanded the input text, the dictionary found a boundary
1401 // within the expansion, giving two boundaries with the same index in the
1402 // original text. Ignore the second. See ticket #12918.
1403 --numBreaks;
1404 }
1405 prevCPPos = cpPos;
1406 prevUTextPos = utextPos;
1407 }
1408
1409 // inString goes out of scope
1410 // inputMap goes out of scope
1411 return numBreaks;
1412 }
1413 #endif
1414
1415 U_NAMESPACE_END
1416
1417 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1418