]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/dictbe.cpp
ICU-59117.0.1.tar.gz
[apple/icu.git] / icuSources / common / dictbe.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/**
4 *******************************************************************************
2ca993e8 5 * Copyright (C) 2006-2016, International Business Machines Corporation
51004dcb 6 * and others. All Rights Reserved.
73c04bcf
A
7 *******************************************************************************
8 */
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_BREAK_ITERATION
13
14#include "brkeng.h"
15#include "dictbe.h"
16#include "unicode/uniset.h"
17#include "unicode/chariter.h"
18#include "unicode/ubrk.h"
b331163b 19#include "uvectr32.h"
73c04bcf 20#include "uvector.h"
51004dcb
A
21#include "uassert.h"
22#include "unicode/normlzr.h"
23#include "cmemory.h"
24#include "dictionarydata.h"
73c04bcf
A
25
26U_NAMESPACE_BEGIN
27
28/*
29 ******************************************************************
30 */
31
73c04bcf
A
32DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
33 fTypes = breakTypes;
34}
35
36DictionaryBreakEngine::~DictionaryBreakEngine() {
37}
38
39UBool
40DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
41 return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
42 && fSet.contains(c));
43}
44
45int32_t
46DictionaryBreakEngine::findBreaks( UText *text,
47 int32_t startPos,
48 int32_t endPos,
49 UBool reverse,
50 int32_t breakType,
51 UStack &foundBreaks ) const {
52 int32_t result = 0;
53
54 // Find the span of characters included in the set.
57a6839d
A
55 // The span to break begins at the current position in the text, and
56 // extends towards the start or end of the text, depending on 'reverse'.
57
73c04bcf
A
58 int32_t start = (int32_t)utext_getNativeIndex(text);
59 int32_t current;
60 int32_t rangeStart;
61 int32_t rangeEnd;
62 UChar32 c = utext_current32(text);
63 if (reverse) {
64 UBool isDict = fSet.contains(c);
65 while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
66 c = utext_previous32(text);
67 isDict = fSet.contains(c);
68 }
b331163b
A
69 if (current < startPos) {
70 rangeStart = startPos;
71 } else {
72 rangeStart = current;
73 if (!isDict) {
74 utext_next32(text);
2ca993e8 75 rangeStart = (int32_t)utext_getNativeIndex(text);
b331163b
A
76 }
77 }
78 // rangeEnd = start + 1;
79 utext_setNativeIndex(text, start);
80 utext_next32(text);
2ca993e8 81 rangeEnd = (int32_t)utext_getNativeIndex(text);
73c04bcf
A
82 }
83 else {
84 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
85 utext_next32(text); // TODO: recast loop for postincrement
86 c = utext_current32(text);
87 }
88 rangeStart = start;
89 rangeEnd = current;
90 }
91 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
92 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
93 utext_setNativeIndex(text, current);
94 }
95
96 return result;
97}
98
99void
46f4442e 100DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
73c04bcf 101 fSet = set;
46f4442e
A
102 // Compact for caching
103 fSet.compact();
73c04bcf
A
104}
105
73c04bcf
A
106/*
107 ******************************************************************
57a6839d 108 * PossibleWord
73c04bcf
A
109 */
110
57a6839d 111// Helper class for improving readability of the Thai/Lao/Khmer word break
73c04bcf
A
112// algorithm. The implementation is completely inline.
113
114// List size, limited by the maximum number of words in the dictionary
115// that form a nested sequence.
b331163b 116static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
73c04bcf
A
117
118class PossibleWord {
51004dcb
A
119private:
120 // list of word candidate lengths, in increasing length order
b331163b 121 // TODO: bytes would be sufficient for word lengths.
51004dcb
A
122 int32_t count; // Count of candidates
123 int32_t prefix; // The longest match with a dictionary word
124 int32_t offset; // Offset in the text of these candidates
b331163b
A
125 int32_t mark; // The preferred candidate's offset
126 int32_t current; // The candidate we're currently looking at
127 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.
128 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.
51004dcb
A
129
130public:
b331163b
A
131 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
132 ~PossibleWord() {};
73c04bcf 133
51004dcb 134 // Fill the list of candidates if needed, select the longest, and return the number found
b331163b 135 int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
73c04bcf 136
51004dcb
A
137 // Select the currently marked candidate, point after it in the text, and invalidate self
138 int32_t acceptMarked( UText *text );
73c04bcf 139
51004dcb
A
140 // Back up from the current candidate to the next shorter one; return TRUE if that exists
141 // and point the text after it
142 UBool backUp( UText *text );
73c04bcf 143
51004dcb 144 // Return the longest prefix this candidate location shares with a dictionary word
b331163b
A
145 // Return value is in code points.
146 int32_t longestPrefix() { return prefix; };
73c04bcf 147
51004dcb 148 // Mark the current candidate as the one we like
b331163b
A
149 void markCurrent() { mark = current; };
150
151 // Get length in code points of the marked word.
152 int32_t markedCPLength() { return cpLengths[mark]; };
73c04bcf
A
153};
154
73c04bcf 155
b331163b 156int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
73c04bcf
A
157 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
158 int32_t start = (int32_t)utext_getNativeIndex(text);
159 if (start != offset) {
160 offset = start;
b331163b 161 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
73c04bcf
A
162 // Dictionary leaves text after longest prefix, not longest word. Back up.
163 if (count <= 0) {
164 utext_setNativeIndex(text, start);
165 }
166 }
167 if (count > 0) {
b331163b 168 utext_setNativeIndex(text, start+cuLengths[count-1]);
73c04bcf
A
169 }
170 current = count-1;
171 mark = current;
172 return count;
173}
174
b331163b 175int32_t
73c04bcf 176PossibleWord::acceptMarked( UText *text ) {
b331163b
A
177 utext_setNativeIndex(text, offset + cuLengths[mark]);
178 return cuLengths[mark];
73c04bcf
A
179}
180
b331163b
A
181
182UBool
73c04bcf
A
183PossibleWord::backUp( UText *text ) {
184 if (current > 0) {
b331163b 185 utext_setNativeIndex(text, offset + cuLengths[--current]);
73c04bcf
A
186 return TRUE;
187 }
188 return FALSE;
189}
190
57a6839d
A
191/*
192 ******************************************************************
193 * ThaiBreakEngine
194 */
195
73c04bcf 196// How many words in a row are "good enough"?
b331163b 197static const int32_t THAI_LOOKAHEAD = 3;
73c04bcf
A
198
199// Will not combine a non-word with a preceding dictionary word longer than this
b331163b 200static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
73c04bcf
A
201
202// Will not combine a non-word that shares at least this much prefix with a
203// dictionary word, with a preceding word
b331163b 204static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
73c04bcf
A
205
206// Ellision character
b331163b 207static const int32_t THAI_PAIYANNOI = 0x0E2F;
73c04bcf
A
208
209// Repeat character
b331163b 210static const int32_t THAI_MAIYAMOK = 0x0E46;
73c04bcf
A
211
212// Minimum word size
b331163b 213static const int32_t THAI_MIN_WORD = 2;
73c04bcf
A
214
215// Minimum number of characters for two words
b331163b 216static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
73c04bcf 217
51004dcb 218ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
73c04bcf
A
219 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
220 fDictionary(adoptDictionary)
221{
222 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
223 if (U_SUCCESS(status)) {
224 setCharacters(fThaiWordSet);
225 }
226 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
46f4442e 227 fMarkSet.add(0x0020);
73c04bcf
A
228 fEndWordSet = fThaiWordSet;
229 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
230 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
231 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
232 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
233 fSuffixSet.add(THAI_PAIYANNOI);
234 fSuffixSet.add(THAI_MAIYAMOK);
46f4442e
A
235
236 // Compact for caching.
237 fMarkSet.compact();
238 fEndWordSet.compact();
239 fBeginWordSet.compact();
240 fSuffixSet.compact();
73c04bcf
A
241}
242
243ThaiBreakEngine::~ThaiBreakEngine() {
244 delete fDictionary;
245}
246
247int32_t
248ThaiBreakEngine::divideUpDictionaryRange( UText *text,
249 int32_t rangeStart,
250 int32_t rangeEnd,
251 UStack &foundBreaks ) const {
b331163b
A
252 utext_setNativeIndex(text, rangeStart);
253 utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
254 if (utext_getNativeIndex(text) >= rangeEnd) {
73c04bcf
A
255 return 0; // Not enough characters for two words
256 }
b331163b
A
257 utext_setNativeIndex(text, rangeStart);
258
73c04bcf
A
259
260 uint32_t wordsFound = 0;
b331163b
A
261 int32_t cpWordLength = 0; // Word Length in Code Points.
262 int32_t cuWordLength = 0; // Word length in code units (UText native indexing)
73c04bcf
A
263 int32_t current;
264 UErrorCode status = U_ZERO_ERROR;
265 PossibleWord words[THAI_LOOKAHEAD];
73c04bcf
A
266
267 utext_setNativeIndex(text, rangeStart);
268
269 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
b331163b
A
270 cpWordLength = 0;
271 cuWordLength = 0;
73c04bcf
A
272
273 // Look for candidate words at the current position
b331163b 274 int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
73c04bcf
A
275
276 // If we found exactly one, use that
277 if (candidates == 1) {
b331163b
A
278 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
279 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
73c04bcf
A
280 wordsFound += 1;
281 }
73c04bcf
A
282 // If there was more than one, see which one can take us forward the most words
283 else if (candidates > 1) {
284 // If we're already at the end of the range, we're done
285 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
286 goto foundBest;
287 }
288 do {
b331163b 289 int32_t wordsMatched = 1;
51004dcb 290 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
73c04bcf
A
291 if (wordsMatched < 2) {
292 // Followed by another dictionary word; mark first word as a good candidate
293 words[wordsFound%THAI_LOOKAHEAD].markCurrent();
294 wordsMatched = 2;
295 }
296
297 // If we're already at the end of the range, we're done
298 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
299 goto foundBest;
300 }
301
302 // See if any of the possible second words is followed by a third word
303 do {
304 // If we find a third word, stop right away
51004dcb
A
305 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
306 words[wordsFound % THAI_LOOKAHEAD].markCurrent();
73c04bcf
A
307 goto foundBest;
308 }
309 }
51004dcb 310 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
73c04bcf
A
311 }
312 }
51004dcb 313 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
73c04bcf 314foundBest:
b331163b
A
315 // Set UText position to after the accepted word.
316 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
317 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
73c04bcf
A
318 wordsFound += 1;
319 }
320
321 // We come here after having either found a word or not. We look ahead to the
b331163b 322 // next word. If it's not a dictionary word, we will combine it with the word we
73c04bcf
A
323 // just found (if there is one), but only if the preceding word does not exceed
324 // the threshold.
325 // The text iterator should now be positioned at the end of the word we found.
b331163b
A
326
327 UChar32 uc = 0;
328 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
73c04bcf
A
329 // if it is a dictionary word, do nothing. If it isn't, then if there is
330 // no preceding word, or the non-word shares less than the minimum threshold
331 // of characters with a dictionary word, then scan to resynchronize
51004dcb 332 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
b331163b 333 && (cuWordLength == 0
73c04bcf
A
334 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
335 // Look for a plausible word boundary
b331163b
A
336 int32_t remaining = rangeEnd - (current+cuWordLength);
337 UChar32 pc;
73c04bcf 338 int32_t chars = 0;
46f4442e 339 for (;;) {
2ca993e8 340 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
b331163b 341 pc = utext_next32(text);
2ca993e8 342 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
b331163b
A
343 chars += pcSize;
344 remaining -= pcSize;
345 if (remaining <= 0) {
73c04bcf
A
346 break;
347 }
b331163b 348 uc = utext_current32(text);
73c04bcf
A
349 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
350 // Maybe. See if it's in the dictionary.
351 // NOTE: In the original Apple code, checked that the next
352 // two characters after uc were not 0x0E4C THANTHAKHAT before
353 // checking the dictionary. That is just a performance filter,
354 // but it's not clear it's faster than checking the trie.
b331163b
A
355 int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
356 utext_setNativeIndex(text, current + cuWordLength + chars);
73c04bcf
A
357 if (candidates > 0) {
358 break;
359 }
360 }
73c04bcf
A
361 }
362
363 // Bump the word count if there wasn't already one
b331163b 364 if (cuWordLength <= 0) {
73c04bcf
A
365 wordsFound += 1;
366 }
367
368 // Update the length with the passed-over characters
b331163b 369 cuWordLength += chars;
73c04bcf
A
370 }
371 else {
372 // Back up to where we were for next iteration
b331163b 373 utext_setNativeIndex(text, current+cuWordLength);
73c04bcf
A
374 }
375 }
376
377 // Never stop before a combining mark.
378 int32_t currPos;
379 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
380 utext_next32(text);
b331163b 381 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
73c04bcf
A
382 }
383
384 // Look ahead for possible suffixes if a dictionary word does not follow.
385 // We do this in code rather than using a rule so that the heuristic
386 // resynch continues to function. For example, one of the suffix characters
387 // could be a typo in the middle of a word.
b331163b 388 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
73c04bcf
A
389 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
390 && fSuffixSet.contains(uc = utext_current32(text))) {
391 if (uc == THAI_PAIYANNOI) {
392 if (!fSuffixSet.contains(utext_previous32(text))) {
393 // Skip over previous end and PAIYANNOI
394 utext_next32(text);
2ca993e8 395 int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
73c04bcf 396 utext_next32(text);
2ca993e8 397 cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word
73c04bcf
A
398 uc = utext_current32(text); // Fetch next character
399 }
400 else {
401 // Restore prior position
402 utext_next32(text);
403 }
404 }
405 if (uc == THAI_MAIYAMOK) {
406 if (utext_previous32(text) != THAI_MAIYAMOK) {
407 // Skip over previous end and MAIYAMOK
408 utext_next32(text);
2ca993e8 409 int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
73c04bcf 410 utext_next32(text);
2ca993e8 411 cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word
73c04bcf
A
412 }
413 else {
414 // Restore prior position
415 utext_next32(text);
416 }
417 }
418 }
419 else {
b331163b 420 utext_setNativeIndex(text, current+cuWordLength);
73c04bcf
A
421 }
422 }
4388f060
A
423
424 // Did we find a word on this iteration? If so, push it on the break stack
b331163b
A
425 if (cuWordLength > 0) {
426 foundBreaks.push((current+cuWordLength), status);
4388f060
A
427 }
428 }
429
430 // Don't return a break for the end of the dictionary range if there is one there.
431 if (foundBreaks.peeki() >= rangeEnd) {
432 (void) foundBreaks.popi();
433 wordsFound -= 1;
434 }
435
436 return wordsFound;
437}
438
57a6839d
A
439/*
440 ******************************************************************
441 * LaoBreakEngine
442 */
443
444// How many words in a row are "good enough"?
b331163b 445static const int32_t LAO_LOOKAHEAD = 3;
57a6839d
A
446
447// Will not combine a non-word with a preceding dictionary word longer than this
b331163b 448static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
57a6839d
A
449
450// Will not combine a non-word that shares at least this much prefix with a
451// dictionary word, with a preceding word
b331163b 452static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
57a6839d
A
453
454// Minimum word size
b331163b 455static const int32_t LAO_MIN_WORD = 2;
57a6839d
A
456
457// Minimum number of characters for two words
b331163b 458static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
57a6839d
A
459
460LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
461 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
462 fDictionary(adoptDictionary)
463{
464 fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
465 if (U_SUCCESS(status)) {
466 setCharacters(fLaoWordSet);
467 }
468 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
469 fMarkSet.add(0x0020);
470 fEndWordSet = fLaoWordSet;
471 fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
472 fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
473 fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
474 fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels
475
476 // Compact for caching.
477 fMarkSet.compact();
478 fEndWordSet.compact();
479 fBeginWordSet.compact();
480}
481
482LaoBreakEngine::~LaoBreakEngine() {
483 delete fDictionary;
484}
485
486int32_t
487LaoBreakEngine::divideUpDictionaryRange( UText *text,
488 int32_t rangeStart,
489 int32_t rangeEnd,
490 UStack &foundBreaks ) const {
491 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
492 return 0; // Not enough characters for two words
493 }
494
495 uint32_t wordsFound = 0;
b331163b
A
496 int32_t cpWordLength = 0;
497 int32_t cuWordLength = 0;
57a6839d
A
498 int32_t current;
499 UErrorCode status = U_ZERO_ERROR;
500 PossibleWord words[LAO_LOOKAHEAD];
57a6839d
A
501
502 utext_setNativeIndex(text, rangeStart);
503
504 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
b331163b
A
505 cuWordLength = 0;
506 cpWordLength = 0;
57a6839d
A
507
508 // Look for candidate words at the current position
b331163b 509 int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
57a6839d
A
510
511 // If we found exactly one, use that
512 if (candidates == 1) {
b331163b
A
513 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
514 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
57a6839d
A
515 wordsFound += 1;
516 }
517 // If there was more than one, see which one can take us forward the most words
518 else if (candidates > 1) {
519 // If we're already at the end of the range, we're done
b331163b 520 if (utext_getNativeIndex(text) >= rangeEnd) {
57a6839d
A
521 goto foundBest;
522 }
523 do {
b331163b 524 int32_t wordsMatched = 1;
57a6839d
A
525 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
526 if (wordsMatched < 2) {
527 // Followed by another dictionary word; mark first word as a good candidate
528 words[wordsFound%LAO_LOOKAHEAD].markCurrent();
529 wordsMatched = 2;
530 }
531
532 // If we're already at the end of the range, we're done
533 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
534 goto foundBest;
535 }
536
537 // See if any of the possible second words is followed by a third word
538 do {
539 // If we find a third word, stop right away
540 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
541 words[wordsFound % LAO_LOOKAHEAD].markCurrent();
542 goto foundBest;
543 }
544 }
545 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
546 }
547 }
548 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
549foundBest:
b331163b
A
550 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
551 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
57a6839d
A
552 wordsFound += 1;
553 }
554
555 // We come here after having either found a word or not. We look ahead to the
556 // next word. If it's not a dictionary word, we will combine it withe the word we
557 // just found (if there is one), but only if the preceding word does not exceed
558 // the threshold.
559 // The text iterator should now be positioned at the end of the word we found.
b331163b 560 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
57a6839d
A
561 // if it is a dictionary word, do nothing. If it isn't, then if there is
562 // no preceding word, or the non-word shares less than the minimum threshold
563 // of characters with a dictionary word, then scan to resynchronize
564 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
b331163b 565 && (cuWordLength == 0
57a6839d
A
566 || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
567 // Look for a plausible word boundary
b331163b
A
568 int32_t remaining = rangeEnd - (current + cuWordLength);
569 UChar32 pc;
570 UChar32 uc;
57a6839d
A
571 int32_t chars = 0;
572 for (;;) {
2ca993e8 573 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
b331163b 574 pc = utext_next32(text);
2ca993e8 575 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
b331163b
A
576 chars += pcSize;
577 remaining -= pcSize;
578 if (remaining <= 0) {
57a6839d
A
579 break;
580 }
b331163b 581 uc = utext_current32(text);
57a6839d
A
582 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
583 // Maybe. See if it's in the dictionary.
b331163b
A
584 // TODO: this looks iffy; compare with old code.
585 int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
586 utext_setNativeIndex(text, current + cuWordLength + chars);
57a6839d
A
587 if (candidates > 0) {
588 break;
589 }
590 }
57a6839d
A
591 }
592
593 // Bump the word count if there wasn't already one
b331163b 594 if (cuWordLength <= 0) {
57a6839d
A
595 wordsFound += 1;
596 }
597
598 // Update the length with the passed-over characters
b331163b 599 cuWordLength += chars;
57a6839d
A
600 }
601 else {
602 // Back up to where we were for next iteration
b331163b 603 utext_setNativeIndex(text, current + cuWordLength);
57a6839d
A
604 }
605 }
606
607 // Never stop before a combining mark.
608 int32_t currPos;
609 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
610 utext_next32(text);
b331163b 611 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
57a6839d
A
612 }
613
614 // Look ahead for possible suffixes if a dictionary word does not follow.
615 // We do this in code rather than using a rule so that the heuristic
616 // resynch continues to function. For example, one of the suffix characters
617 // could be a typo in the middle of a word.
618 // NOT CURRENTLY APPLICABLE TO LAO
619
620 // Did we find a word on this iteration? If so, push it on the break stack
b331163b
A
621 if (cuWordLength > 0) {
622 foundBreaks.push((current+cuWordLength), status);
623 }
624 }
625
626 // Don't return a break for the end of the dictionary range if there is one there.
627 if (foundBreaks.peeki() >= rangeEnd) {
628 (void) foundBreaks.popi();
629 wordsFound -= 1;
630 }
631
632 return wordsFound;
633}
634
635/*
636 ******************************************************************
637 * BurmeseBreakEngine
638 */
639
640// How many words in a row are "good enough"?
641static const int32_t BURMESE_LOOKAHEAD = 3;
642
643// Will not combine a non-word with a preceding dictionary word longer than this
644static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
645
646// Will not combine a non-word that shares at least this much prefix with a
647// dictionary word, with a preceding word
648static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
649
650// Minimum word size
651static const int32_t BURMESE_MIN_WORD = 2;
652
653// Minimum number of characters for two words
654static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
655
656BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
657 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
658 fDictionary(adoptDictionary)
659{
660 fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
661 if (U_SUCCESS(status)) {
662 setCharacters(fBurmeseWordSet);
663 }
664 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
665 fMarkSet.add(0x0020);
666 fEndWordSet = fBurmeseWordSet;
667 fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
668
669 // Compact for caching.
670 fMarkSet.compact();
671 fEndWordSet.compact();
672 fBeginWordSet.compact();
673}
674
675BurmeseBreakEngine::~BurmeseBreakEngine() {
676 delete fDictionary;
677}
678
679int32_t
680BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
681 int32_t rangeStart,
682 int32_t rangeEnd,
683 UStack &foundBreaks ) const {
684 if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
685 return 0; // Not enough characters for two words
686 }
687
688 uint32_t wordsFound = 0;
689 int32_t cpWordLength = 0;
690 int32_t cuWordLength = 0;
691 int32_t current;
692 UErrorCode status = U_ZERO_ERROR;
693 PossibleWord words[BURMESE_LOOKAHEAD];
694
695 utext_setNativeIndex(text, rangeStart);
696
697 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
698 cuWordLength = 0;
699 cpWordLength = 0;
700
701 // Look for candidate words at the current position
702 int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
703
704 // If we found exactly one, use that
705 if (candidates == 1) {
706 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
707 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
708 wordsFound += 1;
709 }
710 // If there was more than one, see which one can take us forward the most words
711 else if (candidates > 1) {
712 // If we're already at the end of the range, we're done
713 if (utext_getNativeIndex(text) >= rangeEnd) {
714 goto foundBest;
715 }
716 do {
717 int32_t wordsMatched = 1;
718 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
719 if (wordsMatched < 2) {
720 // Followed by another dictionary word; mark first word as a good candidate
721 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
722 wordsMatched = 2;
723 }
724
725 // If we're already at the end of the range, we're done
726 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
727 goto foundBest;
728 }
729
730 // See if any of the possible second words is followed by a third word
731 do {
732 // If we find a third word, stop right away
733 if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
734 words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
735 goto foundBest;
736 }
737 }
738 while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
739 }
740 }
741 while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
742foundBest:
743 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
744 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
745 wordsFound += 1;
746 }
747
748 // We come here after having either found a word or not. We look ahead to the
749 // next word. If it's not a dictionary word, we will combine it withe the word we
750 // just found (if there is one), but only if the preceding word does not exceed
751 // the threshold.
752 // The text iterator should now be positioned at the end of the word we found.
753 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
754 // if it is a dictionary word, do nothing. If it isn't, then if there is
755 // no preceding word, or the non-word shares less than the minimum threshold
756 // of characters with a dictionary word, then scan to resynchronize
757 if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
758 && (cuWordLength == 0
759 || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
760 // Look for a plausible word boundary
761 int32_t remaining = rangeEnd - (current + cuWordLength);
762 UChar32 pc;
763 UChar32 uc;
764 int32_t chars = 0;
765 for (;;) {
2ca993e8 766 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
b331163b 767 pc = utext_next32(text);
2ca993e8 768 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
b331163b
A
769 chars += pcSize;
770 remaining -= pcSize;
771 if (remaining <= 0) {
772 break;
773 }
774 uc = utext_current32(text);
775 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
776 // Maybe. See if it's in the dictionary.
777 // TODO: this looks iffy; compare with old code.
778 int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
779 utext_setNativeIndex(text, current + cuWordLength + chars);
780 if (candidates > 0) {
781 break;
782 }
783 }
784 }
785
786 // Bump the word count if there wasn't already one
787 if (cuWordLength <= 0) {
788 wordsFound += 1;
789 }
790
791 // Update the length with the passed-over characters
792 cuWordLength += chars;
793 }
794 else {
795 // Back up to where we were for next iteration
796 utext_setNativeIndex(text, current + cuWordLength);
797 }
798 }
799
800 // Never stop before a combining mark.
801 int32_t currPos;
802 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
803 utext_next32(text);
804 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
805 }
806
807 // Look ahead for possible suffixes if a dictionary word does not follow.
808 // We do this in code rather than using a rule so that the heuristic
809 // resynch continues to function. For example, one of the suffix characters
810 // could be a typo in the middle of a word.
811 // NOT CURRENTLY APPLICABLE TO BURMESE
812
813 // Did we find a word on this iteration? If so, push it on the break stack
814 if (cuWordLength > 0) {
815 foundBreaks.push((current+cuWordLength), status);
57a6839d
A
816 }
817 }
818
819 // Don't return a break for the end of the dictionary range if there is one there.
820 if (foundBreaks.peeki() >= rangeEnd) {
821 (void) foundBreaks.popi();
822 wordsFound -= 1;
823 }
824
825 return wordsFound;
826}
827
828/*
829 ******************************************************************
830 * KhmerBreakEngine
831 */
832
4388f060 833// How many words in a row are "good enough"?
b331163b 834static const int32_t KHMER_LOOKAHEAD = 3;
4388f060
A
835
836// Will not combine a non-word with a preceding dictionary word longer than this
b331163b 837static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
4388f060
A
838
839// Will not combine a non-word that shares at least this much prefix with a
840// dictionary word, with a preceding word
b331163b 841static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
4388f060
A
842
843// Minimum word size
b331163b 844static const int32_t KHMER_MIN_WORD = 2;
4388f060
A
845
846// Minimum number of characters for two words
b331163b 847static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
4388f060 848
51004dcb
A
849KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
850 : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
4388f060
A
851 fDictionary(adoptDictionary)
852{
853 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
854 if (U_SUCCESS(status)) {
855 setCharacters(fKhmerWordSet);
856 }
857 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
858 fMarkSet.add(0x0020);
859 fEndWordSet = fKhmerWordSet;
860 fBeginWordSet.add(0x1780, 0x17B3);
861 //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
862 //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
863 //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
864 fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
865 //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
866// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
867// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
868// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
869// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
870// fSuffixSet.add(THAI_PAIYANNOI);
871// fSuffixSet.add(THAI_MAIYAMOK);
872
873 // Compact for caching.
874 fMarkSet.compact();
875 fEndWordSet.compact();
876 fBeginWordSet.compact();
877// fSuffixSet.compact();
878}
879
880KhmerBreakEngine::~KhmerBreakEngine() {
881 delete fDictionary;
882}
883
884int32_t
885KhmerBreakEngine::divideUpDictionaryRange( UText *text,
886 int32_t rangeStart,
887 int32_t rangeEnd,
888 UStack &foundBreaks ) const {
889 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
890 return 0; // Not enough characters for two words
891 }
892
893 uint32_t wordsFound = 0;
b331163b
A
894 int32_t cpWordLength = 0;
895 int32_t cuWordLength = 0;
4388f060
A
896 int32_t current;
897 UErrorCode status = U_ZERO_ERROR;
898 PossibleWord words[KHMER_LOOKAHEAD];
4388f060
A
899
900 utext_setNativeIndex(text, rangeStart);
901
902 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
b331163b
A
903 cuWordLength = 0;
904 cpWordLength = 0;
4388f060
A
905
906 // Look for candidate words at the current position
b331163b 907 int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
4388f060
A
908
909 // If we found exactly one, use that
910 if (candidates == 1) {
b331163b
A
911 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
912 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
4388f060
A
913 wordsFound += 1;
914 }
915
916 // If there was more than one, see which one can take us forward the most words
917 else if (candidates > 1) {
918 // If we're already at the end of the range, we're done
919 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
920 goto foundBest;
921 }
922 do {
b331163b 923 int32_t wordsMatched = 1;
51004dcb 924 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
4388f060
A
925 if (wordsMatched < 2) {
926 // Followed by another dictionary word; mark first word as a good candidate
51004dcb 927 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
4388f060
A
928 wordsMatched = 2;
929 }
930
931 // If we're already at the end of the range, we're done
932 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
933 goto foundBest;
934 }
935
936 // See if any of the possible second words is followed by a third word
937 do {
938 // If we find a third word, stop right away
51004dcb
A
939 if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
940 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
4388f060
A
941 goto foundBest;
942 }
943 }
51004dcb 944 while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
4388f060
A
945 }
946 }
51004dcb 947 while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
4388f060 948foundBest:
b331163b
A
949 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
950 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
4388f060
A
951 wordsFound += 1;
952 }
953
954 // We come here after having either found a word or not. We look ahead to the
955 // next word. If it's not a dictionary word, we will combine it with the word we
956 // just found (if there is one), but only if the preceding word does not exceed
957 // the threshold.
958 // The text iterator should now be positioned at the end of the word we found.
b331163b 959 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
4388f060
A
960 // if it is a dictionary word, do nothing. If it isn't, then if there is
961 // no preceding word, or the non-word shares less than the minimum threshold
962 // of characters with a dictionary word, then scan to resynchronize
51004dcb 963 if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
b331163b 964 && (cuWordLength == 0
51004dcb 965 || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
4388f060 966 // Look for a plausible word boundary
b331163b
A
967 int32_t remaining = rangeEnd - (current+cuWordLength);
968 UChar32 pc;
969 UChar32 uc;
4388f060
A
970 int32_t chars = 0;
971 for (;;) {
2ca993e8 972 int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
b331163b 973 pc = utext_next32(text);
2ca993e8 974 int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
b331163b
A
975 chars += pcSize;
976 remaining -= pcSize;
977 if (remaining <= 0) {
4388f060
A
978 break;
979 }
b331163b 980 uc = utext_current32(text);
4388f060
A
981 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
982 // Maybe. See if it's in the dictionary.
b331163b
A
983 int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
984 utext_setNativeIndex(text, current+cuWordLength+chars);
4388f060
A
985 if (candidates > 0) {
986 break;
987 }
988 }
4388f060
A
989 }
990
991 // Bump the word count if there wasn't already one
b331163b 992 if (cuWordLength <= 0) {
4388f060
A
993 wordsFound += 1;
994 }
995
996 // Update the length with the passed-over characters
b331163b 997 cuWordLength += chars;
4388f060
A
998 }
999 else {
1000 // Back up to where we were for next iteration
b331163b 1001 utext_setNativeIndex(text, current+cuWordLength);
4388f060
A
1002 }
1003 }
1004
1005 // Never stop before a combining mark.
1006 int32_t currPos;
1007 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
1008 utext_next32(text);
b331163b 1009 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
4388f060
A
1010 }
1011
1012 // Look ahead for possible suffixes if a dictionary word does not follow.
1013 // We do this in code rather than using a rule so that the heuristic
1014 // resynch continues to function. For example, one of the suffix characters
1015 // could be a typo in the middle of a word.
1016// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1017// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1018// && fSuffixSet.contains(uc = utext_current32(text))) {
1019// if (uc == KHMER_PAIYANNOI) {
1020// if (!fSuffixSet.contains(utext_previous32(text))) {
1021// // Skip over previous end and PAIYANNOI
1022// utext_next32(text);
1023// utext_next32(text);
1024// wordLength += 1; // Add PAIYANNOI to word
1025// uc = utext_current32(text); // Fetch next character
1026// }
1027// else {
1028// // Restore prior position
1029// utext_next32(text);
1030// }
1031// }
1032// if (uc == KHMER_MAIYAMOK) {
1033// if (utext_previous32(text) != KHMER_MAIYAMOK) {
1034// // Skip over previous end and MAIYAMOK
1035// utext_next32(text);
1036// utext_next32(text);
1037// wordLength += 1; // Add MAIYAMOK to word
1038// }
1039// else {
1040// // Restore prior position
1041// utext_next32(text);
1042// }
1043// }
1044// }
1045// else {
1046// utext_setNativeIndex(text, current+wordLength);
1047// }
1048// }
1049
73c04bcf 1050 // Did we find a word on this iteration? If so, push it on the break stack
b331163b
A
1051 if (cuWordLength > 0) {
1052 foundBreaks.push((current+cuWordLength), status);
73c04bcf
A
1053 }
1054 }
1055
1056 // Don't return a break for the end of the dictionary range if there is one there.
1057 if (foundBreaks.peeki() >= rangeEnd) {
1058 (void) foundBreaks.popi();
1059 wordsFound -= 1;
1060 }
1061
1062 return wordsFound;
1063}
1064
51004dcb
A
1065#if !UCONFIG_NO_NORMALIZATION
1066/*
1067 ******************************************************************
1068 * CjkBreakEngine
1069 */
1070static const uint32_t kuint32max = 0xFFFFFFFF;
1071CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1072: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
1073 // Korean dictionary only includes Hangul syllables
1074 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1075 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1076 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1077 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
b331163b 1078 nfkcNorm2 = Normalizer2::getNFKCInstance(status);
51004dcb
A
1079
1080 if (U_SUCCESS(status)) {
1081 // handle Korean and Japanese/Chinese using different dictionaries
1082 if (type == kKorean) {
1083 setCharacters(fHangulWordSet);
1084 } else { //Chinese and Japanese
1085 UnicodeSet cjSet;
1086 cjSet.addAll(fHanWordSet);
1087 cjSet.addAll(fKatakanaWordSet);
1088 cjSet.addAll(fHiraganaWordSet);
57a6839d
A
1089 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1090 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
51004dcb
A
1091 setCharacters(cjSet);
1092 }
1093 }
1094}
1095
1096CjkBreakEngine::~CjkBreakEngine(){
1097 delete fDictionary;
1098}
1099
1100// The katakanaCost values below are based on the length frequencies of all
1101// katakana phrases in the dictionary
b331163b
A
1102static const int32_t kMaxKatakanaLength = 8;
1103static const int32_t kMaxKatakanaGroupLength = 20;
51004dcb
A
1104static const uint32_t maxSnlp = 255;
1105
b331163b 1106static inline uint32_t getKatakanaCost(int32_t wordLength){
51004dcb
A
1107 //TODO: fill array with actual values from dictionary!
1108 static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1109 = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1110 return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1111}
1112
1113static inline bool isKatakana(uint16_t value) {
1114 return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
1115 (value >= 0xFF66u && value <= 0xFF9fu);
1116}
1117
51004dcb 1118
b331163b
A
1119// Function for accessing internal utext flags.
1120// Replicates an internal UText function.
51004dcb 1121
b331163b
A
1122static inline int32_t utext_i32_flag(int32_t bitIndex) {
1123 return (int32_t)1 << bitIndex;
1124}
51004dcb 1125
b331163b 1126
51004dcb
A
1127/*
1128 * @param text A UText representing the text
1129 * @param rangeStart The start of the range of dictionary characters
1130 * @param rangeEnd The end of the range of dictionary characters
1131 * @param foundBreaks Output of C array of int32_t break positions, or 0
1132 * @return The number of breaks found
1133 */
1134int32_t
b331163b 1135CjkBreakEngine::divideUpDictionaryRange( UText *inText,
51004dcb
A
1136 int32_t rangeStart,
1137 int32_t rangeEnd,
1138 UStack &foundBreaks ) const {
1139 if (rangeStart >= rangeEnd) {
1140 return 0;
1141 }
1142
2ca993e8
A
1143 // UnicodeString version of input UText, NFKC normalized if necessary.
1144 UnicodeString inString;
b331163b
A
1145
1146 // inputMap[inStringIndex] = corresponding native index from UText inText.
1147 // If NULL then mapping is 1:1
2ca993e8 1148 LocalPointer<UVector32> inputMap;
b331163b
A
1149
1150 UErrorCode status = U_ZERO_ERROR;
51004dcb 1151
51004dcb 1152
b331163b
A
1153 // if UText has the input string as one contiguous UTF-16 chunk
1154 if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1155 inText->chunkNativeStart <= rangeStart &&
1156 inText->chunkNativeLimit >= rangeEnd &&
1157 inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
2ca993e8
A
1158
1159 // Input UText is in one contiguous UTF-16 chunk.
1160 // Use Read-only aliasing UnicodeString.
1161 inString.setTo(FALSE,
1162 inText->chunkContents + rangeStart - inText->chunkNativeStart,
1163 rangeEnd - rangeStart);
b331163b
A
1164 } else {
1165 // Copy the text from the original inText (UText) to inString (UnicodeString).
1166 // Create a map from UnicodeString indices -> UText offsets.
1167 utext_setNativeIndex(inText, rangeStart);
1168 int32_t limit = rangeEnd;
1169 U_ASSERT(limit <= utext_nativeLength(inText));
1170 if (limit > utext_nativeLength(inText)) {
2ca993e8
A
1171 limit = (int32_t)utext_nativeLength(inText);
1172 }
1173 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1174 if (U_FAILURE(status)) {
1175 return 0;
b331163b 1176 }
b331163b 1177 while (utext_getNativeIndex(inText) < limit) {
2ca993e8 1178 int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
b331163b
A
1179 UChar32 c = utext_next32(inText);
1180 U_ASSERT(c != U_SENTINEL);
2ca993e8
A
1181 inString.append(c);
1182 while (inputMap->size() < inString.length()) {
b331163b
A
1183 inputMap->addElement(nativePosition, status);
1184 }
51004dcb 1185 }
b331163b 1186 inputMap->addElement(limit, status);
51004dcb 1187 }
b331163b
A
1188
1189
2ca993e8
A
1190 if (!nfkcNorm2->isNormalized(inString, status)) {
1191 UnicodeString normalizedInput;
b331163b 1192 // normalizedMap[normalizedInput position] == original UText position.
2ca993e8 1193 LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
51004dcb
A
1194 if (U_FAILURE(status)) {
1195 return 0;
1196 }
b331163b
A
1197
1198 UnicodeString fragment;
1199 UnicodeString normalizedFragment;
2ca993e8 1200 for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk
b331163b
A
1201 fragment.remove();
1202 int32_t fragmentStartI = srcI;
2ca993e8 1203 UChar32 c = inString.char32At(srcI);
b331163b
A
1204 for (;;) {
1205 fragment.append(c);
2ca993e8
A
1206 srcI = inString.moveIndex32(srcI, 1);
1207 if (srcI == inString.length()) {
b331163b
A
1208 break;
1209 }
2ca993e8 1210 c = inString.char32At(srcI);
b331163b
A
1211 if (nfkcNorm2->hasBoundaryBefore(c)) {
1212 break;
1213 }
1214 }
1215 nfkcNorm2->normalize(fragment, normalizedFragment, status);
2ca993e8 1216 normalizedInput.append(normalizedFragment);
b331163b
A
1217
1218 // Map every position in the normalized chunk to the start of the chunk
1219 // in the original input.
2ca993e8
A
1220 int32_t fragmentOriginalStart = inputMap.isValid() ?
1221 inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1222 while (normalizedMap->size() < normalizedInput.length()) {
b331163b
A
1223 normalizedMap->addElement(fragmentOriginalStart, status);
1224 if (U_FAILURE(status)) {
1225 break;
1226 }
1227 }
51004dcb 1228 }
2ca993e8
A
1229 U_ASSERT(normalizedMap->size() == normalizedInput.length());
1230 int32_t nativeEnd = inputMap.isValid() ?
1231 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
b331163b
A
1232 normalizedMap->addElement(nativeEnd, status);
1233
2ca993e8
A
1234 inputMap.moveFrom(normalizedMap);
1235 inString.moveFrom(normalizedInput);
51004dcb
A
1236 }
1237
2ca993e8
A
1238 int32_t numCodePts = inString.countChar32();
1239 if (numCodePts != inString.length()) {
b331163b
A
1240 // There are supplementary characters in the input.
1241 // The dictionary will produce boundary positions in terms of code point indexes,
1242 // not in terms of code unit string indexes.
1243 // Use the inputMap mechanism to take care of this in addition to indexing differences
1244 // from normalization and/or UTF-8 input.
2ca993e8 1245 UBool hadExistingMap = inputMap.isValid();
b331163b 1246 if (!hadExistingMap) {
2ca993e8
A
1247 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1248 if (U_FAILURE(status)) {
1249 return 0;
1250 }
b331163b
A
1251 }
1252 int32_t cpIdx = 0;
2ca993e8 1253 for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
b331163b
A
1254 U_ASSERT(cuIdx >= cpIdx);
1255 if (hadExistingMap) {
1256 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1257 } else {
1258 inputMap->addElement(cuIdx+rangeStart, status);
1259 }
1260 cpIdx++;
2ca993e8 1261 if (cuIdx == inString.length()) {
b331163b
A
1262 break;
1263 }
1264 }
51004dcb 1265 }
b331163b 1266
51004dcb 1267 // bestSnlp[i] is the snlp of the best segmentation of the first i
b331163b
A
1268 // code points in the range to be matched.
1269 UVector32 bestSnlp(numCodePts + 1, status);
1270 bestSnlp.addElement(0, status);
1271 for(int32_t i = 1; i <= numCodePts; i++) {
1272 bestSnlp.addElement(kuint32max, status);
51004dcb
A
1273 }
1274
b331163b
A
1275
1276 // prev[i] is the index of the last CJK code point in the previous word in
51004dcb 1277 // the best segmentation of the first i characters.
b331163b
A
1278 UVector32 prev(numCodePts + 1, status);
1279 for(int32_t i = 0; i <= numCodePts; i++){
1280 prev.addElement(-1, status);
51004dcb
A
1281 }
1282
b331163b
A
1283 const int32_t maxWordSize = 20;
1284 UVector32 values(numCodePts, status);
1285 values.setSize(numCodePts);
1286 UVector32 lengths(numCodePts, status);
1287 lengths.setSize(numCodePts);
1288
1289 UText fu = UTEXT_INITIALIZER;
2ca993e8 1290 utext_openUnicodeString(&fu, &inString, &status);
51004dcb
A
1291
1292 // Dynamic programming to find the best segmentation.
b331163b
A
1293
1294 // In outer loop, i is the code point index,
1295 // ix is the corresponding string (code unit) index.
1296 // They differ when the string contains supplementary characters.
1297 int32_t ix = 0;
f3c0d7a5 1298 bool is_prev_katakana = false;
2ca993e8 1299 for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) {
b331163b 1300 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
51004dcb 1301 continue;
b331163b 1302 }
51004dcb
A
1303
1304 int32_t count;
b331163b
A
1305 utext_setNativeIndex(&fu, ix);
1306 count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1307 NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1308 // Note: lengths is filled with code point lengths
1309 // The NULL parameter is the ignored code unit lengths.
51004dcb
A
1310
1311 // if there are no single character matches found in the dictionary
f3c0d7a5 1312 // starting with this character, treat character as a 1-character word
51004dcb
A
1313 // with the highest value possible, i.e. the least likely to occur.
1314 // Exclude Korean characters from this treatment, as they should be left
1315 // together by default.
b331163b 1316 if ((count == 0 || lengths.elementAti(0) != 1) &&
2ca993e8 1317 !fHangulWordSet.contains(inString.char32At(ix))) {
b331163b
A
1318 values.setElementAt(maxSnlp, count); // 255
1319 lengths.setElementAt(1, count++);
51004dcb
A
1320 }
1321
b331163b
A
1322 for (int32_t j = 0; j < count; j++) {
1323 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1324 int32_t ln_j_i = lengths.elementAti(j) + i;
1325 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1326 bestSnlp.setElementAt(newSnlp, ln_j_i);
1327 prev.setElementAt(i, ln_j_i);
51004dcb
A
1328 }
1329 }
1330
1331 // In Japanese,
1332 // Katakana word in single character is pretty rare. So we apply
1333 // the following heuristic to Katakana: any continuous run of Katakana
1334 // characters is considered a candidate word with a default cost
1335 // specified in the katakanaCost table according to its length.
b331163b 1336
2ca993e8 1337 bool is_katakana = isKatakana(inString.char32At(ix));
b331163b 1338 int32_t katakanaRunLength = 1;
51004dcb 1339 if (!is_prev_katakana && is_katakana) {
2ca993e8 1340 int32_t j = inString.moveIndex32(ix, 1);
51004dcb 1341 // Find the end of the continuous run of Katakana characters
2ca993e8
A
1342 while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1343 isKatakana(inString.char32At(j))) {
1344 j = inString.moveIndex32(j, 1);
b331163b 1345 katakanaRunLength++;
51004dcb 1346 }
b331163b
A
1347 if (katakanaRunLength < kMaxKatakanaGroupLength) {
1348 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1349 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
1350 bestSnlp.setElementAt(newSnlp, j);
1351 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
51004dcb
A
1352 }
1353 }
1354 }
1355 is_prev_katakana = is_katakana;
1356 }
b331163b 1357 utext_close(&fu);
51004dcb
A
1358
1359 // Start pushing the optimal offset index into t_boundary (t for tentative).
b331163b 1360 // prev[numCodePts] is guaranteed to be meaningful.
51004dcb 1361 // We'll first push in the reverse order, i.e.,
b331163b
A
1362 // t_boundary[0] = numCodePts, and afterwards do a swap.
1363 UVector32 t_boundary(numCodePts+1, status);
51004dcb 1364
b331163b 1365 int32_t numBreaks = 0;
51004dcb 1366 // No segmentation found, set boundary to end of range
b331163b
A
1367 if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1368 t_boundary.addElement(numCodePts, status);
1369 numBreaks++;
51004dcb 1370 } else {
b331163b
A
1371 for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1372 t_boundary.addElement(i, status);
1373 numBreaks++;
51004dcb 1374 }
b331163b 1375 U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
51004dcb
A
1376 }
1377
b331163b 1378 // Add a break for the start of the dictionary range if there is not one
51004dcb
A
1379 // there already.
1380 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
b331163b
A
1381 t_boundary.addElement(0, status);
1382 numBreaks++;
51004dcb
A
1383 }
1384
b331163b
A
1385 // Now that we're done, convert positions in t_boundary[] (indices in
1386 // the normalized input string) back to indices in the original input UText
1387 // while reversing t_boundary and pushing values to foundBreaks.
f3c0d7a5
A
1388 int32_t prevCPPos = -1;
1389 int32_t prevUTextPos = -1;
b331163b
A
1390 for (int32_t i = numBreaks-1; i >= 0; i--) {
1391 int32_t cpPos = t_boundary.elementAti(i);
f3c0d7a5 1392 U_ASSERT(cpPos > prevCPPos);
2ca993e8 1393 int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
f3c0d7a5
A
1394 U_ASSERT(utextPos >= prevUTextPos);
1395 if (utextPos > prevUTextPos) {
1396 // Boundaries are added to foundBreaks output in ascending order.
1397 U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1398 foundBreaks.push(utextPos, status);
1399 } else {
1400 // Normalization expanded the input text, the dictionary found a boundary
1401 // within the expansion, giving two boundaries with the same index in the
1402 // original text. Ignore the second. See ticket #12918.
1403 --numBreaks;
1404 }
1405 prevCPPos = cpPos;
1406 prevUTextPos = utextPos;
51004dcb
A
1407 }
1408
2ca993e8
A
1409 // inString goes out of scope
1410 // inputMap goes out of scope
51004dcb
A
1411 return numBreaks;
1412}
1413#endif
1414
73c04bcf
A
1415U_NAMESPACE_END
1416
1417#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
51004dcb 1418