icuSources/common/dictbe.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /**
   4  *******************************************************************************
   5  * Copyright (C) 2006-2016, International Business Machines Corporation
   6  * and others. All Rights Reserved.
   7  *******************************************************************************
   8  */
   9
  10 #include <utility>
  11
  12 #include "unicode/utypes.h"
  13
  14 #if !UCONFIG_NO_BREAK_ITERATION
  15
  16 #include "brkeng.h"
  17 #include "dictbe.h"
  18 #include "unicode/uniset.h"
  19 #include "unicode/chariter.h"
  20 #include "unicode/ubrk.h"
  21 #include "uvectr32.h"
  22 #include "uvector.h"
  23 #include "uassert.h"
  24 #include "unicode/normlzr.h"
  25 #include "cmemory.h"
  26 #include "dictionarydata.h"
  27
  28 U_NAMESPACE_BEGIN
  29
  30 /*
  31  ******************************************************************
  32  */
  33
  34 DictionaryBreakEngine::DictionaryBreakEngine() {
  35 }
  36
  37 DictionaryBreakEngine::~DictionaryBreakEngine() {
  38 }
  39
  40 UBool
  41 DictionaryBreakEngine::handles(UChar32 c) const {
  42     return fSet.contains(c);
  43 }
  44
  45 int32_t
  46 DictionaryBreakEngine::findBreaks( UText *text,
  47                                  int32_t startPos,
  48                                  int32_t endPos,
  49                                  UVector32 &foundBreaks ) const {
  50     (void)startPos;            // TODO: remove this param?
  51     int32_t result = 0;
  52
  53     // Find the span of characters included in the set.
  54     //   The span to break begins at the current position in the text, and
  55     //   extends towards the start or end of the text, depending on 'reverse'.
  56
  57     int32_t start = (int32_t)utext_getNativeIndex(text);
  58     int32_t current;
  59     int32_t rangeStart;
  60     int32_t rangeEnd;
  61     UChar32 c = utext_current32(text);
  62     while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
  63         utext_next32(text);         // TODO:  recast loop for postincrement
  64         c = utext_current32(text);
  65     }
  66     rangeStart = start;
  67     rangeEnd = current;
  68     result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
  69     utext_setNativeIndex(text, current);
  70
  71     return result;
  72 }
  73
  74 void
  75 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
  76     fSet = set;
  77     // Compact for caching
  78     fSet.compact();
  79 }
  80
  81 /*
  82  ******************************************************************
  83  * PossibleWord
  84  */
  85
  86 // Helper class for improving readability of the Thai/Lao/Khmer word break
  87 // algorithm. The implementation is completely inline.
  88
  89 // List size, limited by the maximum number of words in the dictionary
  90 // that form a nested sequence.
  91 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
  92
  93 class PossibleWord {
  94 private:
  95     // list of word candidate lengths, in increasing length order
  96     // TODO: bytes would be sufficient for word lengths.
  97     int32_t   count;      // Count of candidates
  98     int32_t   prefix;     // The longest match with a dictionary word
  99     int32_t   offset;     // Offset in the text of these candidates
 100     int32_t   mark;       // The preferred candidate's offset
 101     int32_t   current;    // The candidate we're currently looking at
 102     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
 103     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
 104
 105 public:
 106     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
 107     ~PossibleWord() {}
 108
 109     // Fill the list of candidates if needed, select the longest, and return the number found
 110     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
 111
 112     // Select the currently marked candidate, point after it in the text, and invalidate self
 113     int32_t   acceptMarked( UText *text );
 114
 115     // Back up from the current candidate to the next shorter one; return TRUE if that exists
 116     // and point the text after it
 117     UBool     backUp( UText *text );
 118
 119     // Return the longest prefix this candidate location shares with a dictionary word
 120     // Return value is in code points.
 121     int32_t   longestPrefix() { return prefix; }
 122
 123     // Mark the current candidate as the one we like
 124     void      markCurrent() { mark = current; }
 125
 126     // Get length in code points of the marked word.
 127     int32_t   markedCPLength() { return cpLengths[mark]; }
 128 };
 129
 130
 131 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
 132     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
 133     int32_t start = (int32_t)utext_getNativeIndex(text);
 134     if (start != offset) {
 135         offset = start;
 136         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
 137         // Dictionary leaves text after longest prefix, not longest word. Back up.
 138         if (count <= 0) {
 139             utext_setNativeIndex(text, start);
 140         }
 141     }
 142     if (count > 0) {
 143         utext_setNativeIndex(text, start+cuLengths[count-1]);
 144     }
 145     current = count-1;
 146     mark = current;
 147     return count;
 148 }
 149
 150 int32_t
 151 PossibleWord::acceptMarked( UText *text ) {
 152     utext_setNativeIndex(text, offset + cuLengths[mark]);
 153     return cuLengths[mark];
 154 }
 155
 156
 157 UBool
 158 PossibleWord::backUp( UText *text ) {
 159     if (current > 0) {
 160         utext_setNativeIndex(text, offset + cuLengths[--current]);
 161         return TRUE;
 162     }
 163     return FALSE;
 164 }
 165
 166 /*
 167  ******************************************************************
 168  * ThaiBreakEngine
 169  */
 170
 171 // How many words in a row are "good enough"?
 172 static const int32_t THAI_LOOKAHEAD = 3;
 173
 174 // Will not combine a non-word with a preceding dictionary word longer than this
 175 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
 176
 177 // Will not combine a non-word that shares at least this much prefix with a
 178 // dictionary word, with a preceding word
 179 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
 180
 181 // Ellision character
 182 static const int32_t THAI_PAIYANNOI = 0x0E2F;
 183
 184 // Repeat character
 185 static const int32_t THAI_MAIYAMOK = 0x0E46;
 186
 187 // Minimum word size
 188 static const int32_t THAI_MIN_WORD = 2;
 189
 190 // Minimum number of characters for two words
 191 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
 192
 193 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 194     : DictionaryBreakEngine(),
 195       fDictionary(adoptDictionary)
 196 {
 197     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
 198     if (U_SUCCESS(status)) {
 199         setCharacters(fThaiWordSet);
 200     }
 201     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
 202     fMarkSet.add(0x0020);
 203     fEndWordSet = fThaiWordSet;
 204     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 205     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 206     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 207     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 208     fSuffixSet.add(THAI_PAIYANNOI);
 209     fSuffixSet.add(THAI_MAIYAMOK);
 210
 211     // Compact for caching.
 212     fMarkSet.compact();
 213     fEndWordSet.compact();
 214     fBeginWordSet.compact();
 215     fSuffixSet.compact();
 216 }
 217
 218 ThaiBreakEngine::~ThaiBreakEngine() {
 219     delete fDictionary;
 220 }
 221
 222 int32_t
 223 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
 224                                                 int32_t rangeStart,
 225                                                 int32_t rangeEnd,
 226                                                 UVector32 &foundBreaks ) const {
 227     utext_setNativeIndex(text, rangeStart);
 228     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
 229     if (utext_getNativeIndex(text) >= rangeEnd) {
 230         return 0;       // Not enough characters for two words
 231     }
 232     utext_setNativeIndex(text, rangeStart);
 233
 234
 235     uint32_t wordsFound = 0;
 236     int32_t cpWordLength = 0;    // Word Length in Code Points.
 237     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
 238     int32_t current;
 239     UErrorCode status = U_ZERO_ERROR;
 240     PossibleWord words[THAI_LOOKAHEAD];
 241
 242     utext_setNativeIndex(text, rangeStart);
 243
 244     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 245         cpWordLength = 0;
 246         cuWordLength = 0;
 247
 248         // Look for candidate words at the current position
 249         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 250
 251         // If we found exactly one, use that
 252         if (candidates == 1) {
 253             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
 254             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
 255             wordsFound += 1;
 256         }
 257         // If there was more than one, see which one can take us forward the most words
 258         else if (candidates > 1) {
 259             // If we're already at the end of the range, we're done
 260             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 261                 goto foundBest;
 262             }
 263             do {
 264                 int32_t wordsMatched = 1;
 265                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 266                     if (wordsMatched < 2) {
 267                         // Followed by another dictionary word; mark first word as a good candidate
 268                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
 269                         wordsMatched = 2;
 270                     }
 271
 272                     // If we're already at the end of the range, we're done
 273                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 274                         goto foundBest;
 275                     }
 276
 277                     // See if any of the possible second words is followed by a third word
 278                     do {
 279                         // If we find a third word, stop right away
 280                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 281                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
 282                             goto foundBest;
 283                         }
 284                     }
 285                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
 286                 }
 287             }
 288             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
 289 foundBest:
 290             // Set UText position to after the accepted word.
 291             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
 292             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
 293             wordsFound += 1;
 294         }
 295
 296         // We come here after having either found a word or not. We look ahead to the
 297         // next word. If it's not a dictionary word, we will combine it with the word we
 298         // just found (if there is one), but only if the preceding word does not exceed
 299         // the threshold.
 300         // The text iterator should now be positioned at the end of the word we found.
 301
 302         UChar32 uc = 0;
 303         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
 304             // if it is a dictionary word, do nothing. If it isn't, then if there is
 305             // no preceding word, or the non-word shares less than the minimum threshold
 306             // of characters with a dictionary word, then scan to resynchronize
 307             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 308                   && (cuWordLength == 0
 309                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
 310                 // Look for a plausible word boundary
 311                 int32_t remaining = rangeEnd - (current+cuWordLength);
 312                 UChar32 pc;
 313                 int32_t chars = 0;
 314                 for (;;) {
 315                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 316                     pc = utext_next32(text);
 317                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 318                     chars += pcSize;
 319                     remaining -= pcSize;
 320                     if (remaining <= 0) {
 321                         break;
 322                     }
 323                     uc = utext_current32(text);
 324                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 325                         // Maybe. See if it's in the dictionary.
 326                         // NOTE: In the original Apple code, checked that the next
 327                         // two characters after uc were not 0x0E4C THANTHAKHAT before
 328                         // checking the dictionary. That is just a performance filter,
 329                         // but it's not clear it's faster than checking the trie.
 330                         int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 331                         utext_setNativeIndex(text, current + cuWordLength + chars);
 332                         if (num_candidates > 0) {
 333                             break;
 334                         }
 335                     }
 336                 }
 337
 338                 // Bump the word count if there wasn't already one
 339                 if (cuWordLength <= 0) {
 340                     wordsFound += 1;
 341                 }
 342
 343                 // Update the length with the passed-over characters
 344                 cuWordLength += chars;
 345             }
 346             else {
 347                 // Back up to where we were for next iteration
 348                 utext_setNativeIndex(text, current+cuWordLength);
 349             }
 350         }
 351
 352         // Never stop before a combining mark.
 353         int32_t currPos;
 354         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 355             utext_next32(text);
 356             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 357         }
 358
 359         // Look ahead for possible suffixes if a dictionary word does not follow.
 360         // We do this in code rather than using a rule so that the heuristic
 361         // resynch continues to function. For example, one of the suffix characters
 362         // could be a typo in the middle of a word.
 363         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
 364             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 365                 && fSuffixSet.contains(uc = utext_current32(text))) {
 366                 if (uc == THAI_PAIYANNOI) {
 367                     if (!fSuffixSet.contains(utext_previous32(text))) {
 368                         // Skip over previous end and PAIYANNOI
 369                         utext_next32(text);
 370                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
 371                         utext_next32(text);
 372                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
 373                         uc = utext_current32(text);     // Fetch next character
 374                     }
 375                     else {
 376                         // Restore prior position
 377                         utext_next32(text);
 378                     }
 379                 }
 380                 if (uc == THAI_MAIYAMOK) {
 381                     if (utext_previous32(text) != THAI_MAIYAMOK) {
 382                         // Skip over previous end and MAIYAMOK
 383                         utext_next32(text);
 384                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
 385                         utext_next32(text);
 386                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
 387                     }
 388                     else {
 389                         // Restore prior position
 390                         utext_next32(text);
 391                     }
 392                 }
 393             }
 394             else {
 395                 utext_setNativeIndex(text, current+cuWordLength);
 396             }
 397         }
 398
 399         // Did we find a word on this iteration? If so, push it on the break stack
 400         if (cuWordLength > 0) {
 401             foundBreaks.push((current+cuWordLength), status);
 402         }
 403     }
 404
 405     // Don't return a break for the end of the dictionary range if there is one there.
 406     if (foundBreaks.peeki() >= rangeEnd) {
 407         (void) foundBreaks.popi();
 408         wordsFound -= 1;
 409     }
 410
 411     return wordsFound;
 412 }
 413
 414 /*
 415  ******************************************************************
 416  * LaoBreakEngine
 417  */
 418
 419 // How many words in a row are "good enough"?
 420 static const int32_t LAO_LOOKAHEAD = 3;
 421
 422 // Will not combine a non-word with a preceding dictionary word longer than this
 423 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
 424
 425 // Will not combine a non-word that shares at least this much prefix with a
 426 // dictionary word, with a preceding word
 427 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
 428
 429 // Minimum word size
 430 static const int32_t LAO_MIN_WORD = 2;
 431
 432 // Minimum number of characters for two words
 433 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
 434
 435 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 436     : DictionaryBreakEngine(),
 437       fDictionary(adoptDictionary)
 438 {
 439     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
 440     if (U_SUCCESS(status)) {
 441         setCharacters(fLaoWordSet);
 442     }
 443     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
 444     fMarkSet.add(0x0020);
 445     fEndWordSet = fLaoWordSet;
 446     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
 447     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
 448     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
 449     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
 450
 451     // Compact for caching.
 452     fMarkSet.compact();
 453     fEndWordSet.compact();
 454     fBeginWordSet.compact();
 455 }
 456
 457 LaoBreakEngine::~LaoBreakEngine() {
 458     delete fDictionary;
 459 }
 460
 461 int32_t
 462 LaoBreakEngine::divideUpDictionaryRange( UText *text,
 463                                                 int32_t rangeStart,
 464                                                 int32_t rangeEnd,
 465                                                 UVector32 &foundBreaks ) const {
 466     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
 467         return 0;       // Not enough characters for two words
 468     }
 469
 470     uint32_t wordsFound = 0;
 471     int32_t cpWordLength = 0;
 472     int32_t cuWordLength = 0;
 473     int32_t current;
 474     UErrorCode status = U_ZERO_ERROR;
 475     PossibleWord words[LAO_LOOKAHEAD];
 476
 477     utext_setNativeIndex(text, rangeStart);
 478
 479     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 480         cuWordLength = 0;
 481         cpWordLength = 0;
 482
 483         // Look for candidate words at the current position
 484         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 485
 486         // If we found exactly one, use that
 487         if (candidates == 1) {
 488             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
 489             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
 490             wordsFound += 1;
 491         }
 492         // If there was more than one, see which one can take us forward the most words
 493         else if (candidates > 1) {
 494             // If we're already at the end of the range, we're done
 495             if (utext_getNativeIndex(text) >= rangeEnd) {
 496                 goto foundBest;
 497             }
 498             do {
 499                 int32_t wordsMatched = 1;
 500                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 501                     if (wordsMatched < 2) {
 502                         // Followed by another dictionary word; mark first word as a good candidate
 503                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
 504                         wordsMatched = 2;
 505                     }
 506
 507                     // If we're already at the end of the range, we're done
 508                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 509                         goto foundBest;
 510                     }
 511
 512                     // See if any of the possible second words is followed by a third word
 513                     do {
 514                         // If we find a third word, stop right away
 515                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 516                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
 517                             goto foundBest;
 518                         }
 519                     }
 520                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
 521                 }
 522             }
 523             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
 524 foundBest:
 525             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
 526             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
 527             wordsFound += 1;
 528         }
 529
 530         // We come here after having either found a word or not. We look ahead to the
 531         // next word. If it's not a dictionary word, we will combine it withe the word we
 532         // just found (if there is one), but only if the preceding word does not exceed
 533         // the threshold.
 534         // The text iterator should now be positioned at the end of the word we found.
 535         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
 536             // if it is a dictionary word, do nothing. If it isn't, then if there is
 537             // no preceding word, or the non-word shares less than the minimum threshold
 538             // of characters with a dictionary word, then scan to resynchronize
 539             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 540                   && (cuWordLength == 0
 541                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
 542                 // Look for a plausible word boundary
 543                 int32_t remaining = rangeEnd - (current + cuWordLength);
 544                 UChar32 pc;
 545                 UChar32 uc;
 546                 int32_t chars = 0;
 547                 for (;;) {
 548                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 549                     pc = utext_next32(text);
 550                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 551                     chars += pcSize;
 552                     remaining -= pcSize;
 553                     if (remaining <= 0) {
 554                         break;
 555                     }
 556                     uc = utext_current32(text);
 557                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 558                         // Maybe. See if it's in the dictionary.
 559                         // TODO: this looks iffy; compare with old code.
 560                         int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 561                         utext_setNativeIndex(text, current + cuWordLength + chars);
 562                         if (num_candidates > 0) {
 563                             break;
 564                         }
 565                     }
 566                 }
 567
 568                 // Bump the word count if there wasn't already one
 569                 if (cuWordLength <= 0) {
 570                     wordsFound += 1;
 571                 }
 572
 573                 // Update the length with the passed-over characters
 574                 cuWordLength += chars;
 575             }
 576             else {
 577                 // Back up to where we were for next iteration
 578                 utext_setNativeIndex(text, current + cuWordLength);
 579             }
 580         }
 581
 582         // Never stop before a combining mark.
 583         int32_t currPos;
 584         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 585             utext_next32(text);
 586             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 587         }
 588
 589         // Look ahead for possible suffixes if a dictionary word does not follow.
 590         // We do this in code rather than using a rule so that the heuristic
 591         // resynch continues to function. For example, one of the suffix characters
 592         // could be a typo in the middle of a word.
 593         // NOT CURRENTLY APPLICABLE TO LAO
 594
 595         // Did we find a word on this iteration? If so, push it on the break stack
 596         if (cuWordLength > 0) {
 597             foundBreaks.push((current+cuWordLength), status);
 598         }
 599     }
 600
 601     // Don't return a break for the end of the dictionary range if there is one there.
 602     if (foundBreaks.peeki() >= rangeEnd) {
 603         (void) foundBreaks.popi();
 604         wordsFound -= 1;
 605     }
 606
 607     return wordsFound;
 608 }
 609
 610 /*
 611  ******************************************************************
 612  * BurmeseBreakEngine
 613  */
 614
 615 // How many words in a row are "good enough"?
 616 static const int32_t BURMESE_LOOKAHEAD = 3;
 617
 618 // Will not combine a non-word with a preceding dictionary word longer than this
 619 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
 620
 621 // Will not combine a non-word that shares at least this much prefix with a
 622 // dictionary word, with a preceding word
 623 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
 624
 625 // Minimum word size
 626 static const int32_t BURMESE_MIN_WORD = 2;
 627
 628 // Minimum number of characters for two words
 629 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
 630
 631 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 632     : DictionaryBreakEngine(),
 633       fDictionary(adoptDictionary)
 634 {
 635     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
 636     if (U_SUCCESS(status)) {
 637         setCharacters(fBurmeseWordSet);
 638     }
 639     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
 640     fMarkSet.add(0x0020);
 641     fEndWordSet = fBurmeseWordSet;
 642     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
 643
 644     // Compact for caching.
 645     fMarkSet.compact();
 646     fEndWordSet.compact();
 647     fBeginWordSet.compact();
 648 }
 649
 650 BurmeseBreakEngine::~BurmeseBreakEngine() {
 651     delete fDictionary;
 652 }
 653
 654 int32_t
 655 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
 656                                                 int32_t rangeStart,
 657                                                 int32_t rangeEnd,
 658                                                 UVector32 &foundBreaks ) const {
 659     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
 660         return 0;       // Not enough characters for two words
 661     }
 662
 663     uint32_t wordsFound = 0;
 664     int32_t cpWordLength = 0;
 665     int32_t cuWordLength = 0;
 666     int32_t current;
 667     UErrorCode status = U_ZERO_ERROR;
 668     PossibleWord words[BURMESE_LOOKAHEAD];
 669
 670     utext_setNativeIndex(text, rangeStart);
 671
 672     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 673         cuWordLength = 0;
 674         cpWordLength = 0;
 675
 676         // Look for candidate words at the current position
 677         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 678
 679         // If we found exactly one, use that
 680         if (candidates == 1) {
 681             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
 682             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
 683             wordsFound += 1;
 684         }
 685         // If there was more than one, see which one can take us forward the most words
 686         else if (candidates > 1) {
 687             // If we're already at the end of the range, we're done
 688             if (utext_getNativeIndex(text) >= rangeEnd) {
 689                 goto foundBest;
 690             }
 691             do {
 692                 int32_t wordsMatched = 1;
 693                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 694                     if (wordsMatched < 2) {
 695                         // Followed by another dictionary word; mark first word as a good candidate
 696                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
 697                         wordsMatched = 2;
 698                     }
 699
 700                     // If we're already at the end of the range, we're done
 701                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 702                         goto foundBest;
 703                     }
 704
 705                     // See if any of the possible second words is followed by a third word
 706                     do {
 707                         // If we find a third word, stop right away
 708                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 709                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
 710                             goto foundBest;
 711                         }
 712                     }
 713                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
 714                 }
 715             }
 716             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
 717 foundBest:
 718             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
 719             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
 720             wordsFound += 1;
 721         }
 722
 723         // We come here after having either found a word or not. We look ahead to the
 724         // next word. If it's not a dictionary word, we will combine it withe the word we
 725         // just found (if there is one), but only if the preceding word does not exceed
 726         // the threshold.
 727         // The text iterator should now be positioned at the end of the word we found.
 728         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
 729             // if it is a dictionary word, do nothing. If it isn't, then if there is
 730             // no preceding word, or the non-word shares less than the minimum threshold
 731             // of characters with a dictionary word, then scan to resynchronize
 732             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 733                   && (cuWordLength == 0
 734                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
 735                 // Look for a plausible word boundary
 736                 int32_t remaining = rangeEnd - (current + cuWordLength);
 737                 UChar32 pc;
 738                 UChar32 uc;
 739                 int32_t chars = 0;
 740                 for (;;) {
 741                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 742                     pc = utext_next32(text);
 743                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 744                     chars += pcSize;
 745                     remaining -= pcSize;
 746                     if (remaining <= 0) {
 747                         break;
 748                     }
 749                     uc = utext_current32(text);
 750                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 751                         // Maybe. See if it's in the dictionary.
 752                         // TODO: this looks iffy; compare with old code.
 753                         int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 754                         utext_setNativeIndex(text, current + cuWordLength + chars);
 755                         if (num_candidates > 0) {
 756                             break;
 757                         }
 758                     }
 759                 }
 760
 761                 // Bump the word count if there wasn't already one
 762                 if (cuWordLength <= 0) {
 763                     wordsFound += 1;
 764                 }
 765
 766                 // Update the length with the passed-over characters
 767                 cuWordLength += chars;
 768             }
 769             else {
 770                 // Back up to where we were for next iteration
 771                 utext_setNativeIndex(text, current + cuWordLength);
 772             }
 773         }
 774
 775         // Never stop before a combining mark.
 776         int32_t currPos;
 777         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 778             utext_next32(text);
 779             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 780         }
 781
 782         // Look ahead for possible suffixes if a dictionary word does not follow.
 783         // We do this in code rather than using a rule so that the heuristic
 784         // resynch continues to function. For example, one of the suffix characters
 785         // could be a typo in the middle of a word.
 786         // NOT CURRENTLY APPLICABLE TO BURMESE
 787
 788         // Did we find a word on this iteration? If so, push it on the break stack
 789         if (cuWordLength > 0) {
 790             foundBreaks.push((current+cuWordLength), status);
 791         }
 792     }
 793
 794     // Don't return a break for the end of the dictionary range if there is one there.
 795     if (foundBreaks.peeki() >= rangeEnd) {
 796         (void) foundBreaks.popi();
 797         wordsFound -= 1;
 798     }
 799
 800     return wordsFound;
 801 }
 802
 803 /*
 804  ******************************************************************
 805  * KhmerBreakEngine
 806  */
 807
 808 // How many words in a row are "good enough"?
 809 static const int32_t KHMER_LOOKAHEAD = 3;
 810
 811 // Will not combine a non-word with a preceding dictionary word longer than this
 812 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
 813
 814 // Will not combine a non-word that shares at least this much prefix with a
 815 // dictionary word, with a preceding word
 816 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
 817
 818 // Minimum word size
 819 static const int32_t KHMER_MIN_WORD = 2;
 820
 821 // Minimum number of characters for two words
 822 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
 823
 824 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 825     : DictionaryBreakEngine(),
 826       fDictionary(adoptDictionary)
 827 {
 828     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
 829     if (U_SUCCESS(status)) {
 830         setCharacters(fKhmerWordSet);
 831     }
 832     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
 833     fMarkSet.add(0x0020);
 834     fEndWordSet = fKhmerWordSet;
 835     fBeginWordSet.add(0x1780, 0x17B3);
 836     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
 837     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
 838     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
 839     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
 840     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
 841 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 842 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 843 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 844 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 845 //    fSuffixSet.add(THAI_PAIYANNOI);
 846 //    fSuffixSet.add(THAI_MAIYAMOK);
 847
 848     // Compact for caching.
 849     fMarkSet.compact();
 850     fEndWordSet.compact();
 851     fBeginWordSet.compact();
 852 //    fSuffixSet.compact();
 853 }
 854
 855 KhmerBreakEngine::~KhmerBreakEngine() {
 856     delete fDictionary;
 857 }
 858
 859 int32_t
 860 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
 861                                                 int32_t rangeStart,
 862                                                 int32_t rangeEnd,
 863                                                 UVector32 &foundBreaks ) const {
 864     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
 865         return 0;       // Not enough characters for two words
 866     }
 867
 868     uint32_t wordsFound = 0;
 869     int32_t cpWordLength = 0;
 870     int32_t cuWordLength = 0;
 871     int32_t current;
 872     UErrorCode status = U_ZERO_ERROR;
 873     PossibleWord words[KHMER_LOOKAHEAD];
 874
 875     utext_setNativeIndex(text, rangeStart);
 876
 877     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 878         cuWordLength = 0;
 879         cpWordLength = 0;
 880
 881         // Look for candidate words at the current position
 882         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 883
 884         // If we found exactly one, use that
 885         if (candidates == 1) {
 886             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 887             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 888             wordsFound += 1;
 889         }
 890
 891         // If there was more than one, see which one can take us forward the most words
 892         else if (candidates > 1) {
 893             // If we're already at the end of the range, we're done
 894             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 895                 goto foundBest;
 896             }
 897             do {
 898                 int32_t wordsMatched = 1;
 899                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 900                     if (wordsMatched < 2) {
 901                         // Followed by another dictionary word; mark first word as a good candidate
 902                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 903                         wordsMatched = 2;
 904                     }
 905
 906                     // If we're already at the end of the range, we're done
 907                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 908                         goto foundBest;
 909                     }
 910
 911                     // See if any of the possible second words is followed by a third word
 912                     do {
 913                         // If we find a third word, stop right away
 914                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 915                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 916                             goto foundBest;
 917                         }
 918                     }
 919                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
 920                 }
 921             }
 922             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
 923 foundBest:
 924             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 925             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 926             wordsFound += 1;
 927         }
 928
 929         // We come here after having either found a word or not. We look ahead to the
 930         // next word. If it's not a dictionary word, we will combine it with the word we
 931         // just found (if there is one), but only if the preceding word does not exceed
 932         // the threshold.
 933         // The text iterator should now be positioned at the end of the word we found.
 934         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
 935             // if it is a dictionary word, do nothing. If it isn't, then if there is
 936             // no preceding word, or the non-word shares less than the minimum threshold
 937             // of characters with a dictionary word, then scan to resynchronize
 938             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 939                   && (cuWordLength == 0
 940                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
 941                 // Look for a plausible word boundary
 942                 int32_t remaining = rangeEnd - (current+cuWordLength);
 943                 UChar32 pc;
 944                 UChar32 uc;
 945                 int32_t chars = 0;
 946                 for (;;) {
 947                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 948                     pc = utext_next32(text);
 949                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 950                     chars += pcSize;
 951                     remaining -= pcSize;
 952                     if (remaining <= 0) {
 953                         break;
 954                     }
 955                     uc = utext_current32(text);
 956                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 957                         // Maybe. See if it's in the dictionary.
 958                         int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 959                         utext_setNativeIndex(text, current+cuWordLength+chars);
 960                         if (num_candidates > 0) {
 961                             break;
 962                         }
 963                     }
 964                 }
 965
 966                 // Bump the word count if there wasn't already one
 967                 if (cuWordLength <= 0) {
 968                     wordsFound += 1;
 969                 }
 970
 971                 // Update the length with the passed-over characters
 972                 cuWordLength += chars;
 973             }
 974             else {
 975                 // Back up to where we were for next iteration
 976                 utext_setNativeIndex(text, current+cuWordLength);
 977             }
 978         }
 979
 980         // Never stop before a combining mark.
 981         int32_t currPos;
 982         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 983             utext_next32(text);
 984             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 985         }
 986
 987         // Look ahead for possible suffixes if a dictionary word does not follow.
 988         // We do this in code rather than using a rule so that the heuristic
 989         // resynch continues to function. For example, one of the suffix characters
 990         // could be a typo in the middle of a word.
 991 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
 992 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 993 //                && fSuffixSet.contains(uc = utext_current32(text))) {
 994 //                if (uc == KHMER_PAIYANNOI) {
 995 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
 996 //                        // Skip over previous end and PAIYANNOI
 997 //                        utext_next32(text);
 998 //                        utext_next32(text);
 999 //                        wordLength += 1;            // Add PAIYANNOI to word
1000 //                        uc = utext_current32(text);     // Fetch next character
1001 //                    }
1002 //                    else {
1003 //                        // Restore prior position
1004 //                        utext_next32(text);
1005 //                    }
1006 //                }
1007 //                if (uc == KHMER_MAIYAMOK) {
1008 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1009 //                        // Skip over previous end and MAIYAMOK
1010 //                        utext_next32(text);
1011 //                        utext_next32(text);
1012 //                        wordLength += 1;            // Add MAIYAMOK to word
1013 //                    }
1014 //                    else {
1015 //                        // Restore prior position
1016 //                        utext_next32(text);
1017 //                    }
1018 //                }
1019 //            }
1020 //            else {
1021 //                utext_setNativeIndex(text, current+wordLength);
1022 //            }
1023 //        }
1024
1025         // Did we find a word on this iteration? If so, push it on the break stack
1026         if (cuWordLength > 0) {
1027             foundBreaks.push((current+cuWordLength), status);
1028         }
1029     }
1030
1031     // Don't return a break for the end of the dictionary range if there is one there.
1032     if (foundBreaks.peeki() >= rangeEnd) {
1033         (void) foundBreaks.popi();
1034         wordsFound -= 1;
1035     }
1036
1037     return wordsFound;
1038 }
1039
1040 #if !UCONFIG_NO_NORMALIZATION
1041 /*
1042  ******************************************************************
1043  * CjkBreakEngine
1044  */
1045 static const uint32_t kuint32max = 0xFFFFFFFF;
1046 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1047 : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1048     // Korean dictionary only includes Hangul syllables
1049     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1050     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1051     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1052     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1053     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1054
1055     if (U_SUCCESS(status)) {
1056         // handle Korean and Japanese/Chinese using different dictionaries
1057         if (type == kKorean) {
1058             setCharacters(fHangulWordSet);
1059         } else { //Chinese and Japanese
1060             UnicodeSet cjSet;
1061             cjSet.addAll(fHanWordSet);
1062             cjSet.addAll(fKatakanaWordSet);
1063             cjSet.addAll(fHiraganaWordSet);
1064             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1065             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1066             setCharacters(cjSet);
1067         }
1068     }
1069 }
1070
1071 CjkBreakEngine::~CjkBreakEngine(){
1072     delete fDictionary;
1073 }
1074
1075 // The katakanaCost values below are based on the length frequencies of all
1076 // katakana phrases in the dictionary
1077 static const int32_t kMaxKatakanaLength = 8;
1078 static const int32_t kMaxKatakanaGroupLength = 20;
1079 static const uint32_t maxSnlp = 255;
1080
1081 static inline uint32_t getKatakanaCost(int32_t wordLength){
1082     //TODO: fill array with actual values from dictionary!
1083     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1084                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1085     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1086 }
1087
1088 static inline bool isKatakana(UChar32 value) {
1089     return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1090             (value >= 0xFF66 && value <= 0xFF9f);
1091 }
1092
1093
1094 // Function for accessing internal utext flags.
1095 //   Replicates an internal UText function.
1096
1097 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1098     return (int32_t)1 << bitIndex;
1099 }
1100
1101
1102 /*
1103  * @param text A UText representing the text
1104  * @param rangeStart The start of the range of dictionary characters
1105  * @param rangeEnd The end of the range of dictionary characters
1106  * @param foundBreaks vector<int32> to receive the break positions
1107  * @return The number of breaks found
1108  */
1109 int32_t
1110 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1111         int32_t rangeStart,
1112         int32_t rangeEnd,
1113         UVector32 &foundBreaks ) const {
1114     if (rangeStart >= rangeEnd) {
1115         return 0;
1116     }
1117
1118     // UnicodeString version of input UText, NFKC normalized if necessary.
1119     UnicodeString inString;
1120
1121     // inputMap[inStringIndex] = corresponding native index from UText inText.
1122     // If NULL then mapping is 1:1
1123     LocalPointer<UVector32>     inputMap;
1124
1125     UErrorCode     status      = U_ZERO_ERROR;
1126
1127
1128     // if UText has the input string as one contiguous UTF-16 chunk
1129     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1130          inText->chunkNativeStart <= rangeStart &&
1131          inText->chunkNativeLimit >= rangeEnd   &&
1132          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1133
1134         // Input UText is in one contiguous UTF-16 chunk.
1135         // Use Read-only aliasing UnicodeString.
1136         inString.setTo(FALSE,
1137                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1138                        rangeEnd - rangeStart);
1139     } else {
1140         // Copy the text from the original inText (UText) to inString (UnicodeString).
1141         // Create a map from UnicodeString indices -> UText offsets.
1142         utext_setNativeIndex(inText, rangeStart);
1143         int32_t limit = rangeEnd;
1144         U_ASSERT(limit <= utext_nativeLength(inText));
1145         if (limit > utext_nativeLength(inText)) {
1146             limit = (int32_t)utext_nativeLength(inText);
1147         }
1148         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1149         if (U_FAILURE(status)) {
1150             return 0;
1151         }
1152         while (utext_getNativeIndex(inText) < limit) {
1153             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1154             UChar32 c = utext_next32(inText);
1155             U_ASSERT(c != U_SENTINEL);
1156             inString.append(c);
1157             while (inputMap->size() < inString.length()) {
1158                 inputMap->addElement(nativePosition, status);
1159             }
1160         }
1161         inputMap->addElement(limit, status);
1162     }
1163
1164
1165     if (!nfkcNorm2->isNormalized(inString, status)) {
1166         UnicodeString normalizedInput;
1167         //  normalizedMap[normalizedInput position] ==  original UText position.
1168         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1169         if (U_FAILURE(status)) {
1170             return 0;
1171         }
1172
1173         UnicodeString fragment;
1174         UnicodeString normalizedFragment;
1175         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1176             fragment.remove();
1177             int32_t fragmentStartI = srcI;
1178             UChar32 c = inString.char32At(srcI);
1179             for (;;) {
1180                 fragment.append(c);
1181                 srcI = inString.moveIndex32(srcI, 1);
1182                 if (srcI == inString.length()) {
1183                     break;
1184                 }
1185                 c = inString.char32At(srcI);
1186                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1187                     break;
1188                 }
1189             }
1190             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1191             normalizedInput.append(normalizedFragment);
1192
1193             // Map every position in the normalized chunk to the start of the chunk
1194             //   in the original input.
1195             int32_t fragmentOriginalStart = inputMap.isValid() ?
1196                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1197             while (normalizedMap->size() < normalizedInput.length()) {
1198                 normalizedMap->addElement(fragmentOriginalStart, status);
1199                 if (U_FAILURE(status)) {
1200                     break;
1201                 }
1202             }
1203         }
1204         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1205         int32_t nativeEnd = inputMap.isValid() ?
1206                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1207         normalizedMap->addElement(nativeEnd, status);
1208
1209         inputMap = std::move(normalizedMap);
1210         inString = std::move(normalizedInput);
1211     }
1212
1213     int32_t numCodePts = inString.countChar32();
1214     if (numCodePts != inString.length()) {
1215         // There are supplementary characters in the input.
1216         // The dictionary will produce boundary positions in terms of code point indexes,
1217         //   not in terms of code unit string indexes.
1218         // Use the inputMap mechanism to take care of this in addition to indexing differences
1219         //    from normalization and/or UTF-8 input.
1220         UBool hadExistingMap = inputMap.isValid();
1221         if (!hadExistingMap) {
1222             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1223             if (U_FAILURE(status)) {
1224                 return 0;
1225             }
1226         }
1227         int32_t cpIdx = 0;
1228         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1229             U_ASSERT(cuIdx >= cpIdx);
1230             if (hadExistingMap) {
1231                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1232             } else {
1233                 inputMap->addElement(cuIdx+rangeStart, status);
1234             }
1235             cpIdx++;
1236             if (cuIdx == inString.length()) {
1237                break;
1238             }
1239         }
1240     }
1241
1242     // bestSnlp[i] is the snlp of the best segmentation of the first i
1243     // code points in the range to be matched.
1244     UVector32 bestSnlp(numCodePts + 1, status);
1245     bestSnlp.addElement(0, status);
1246     for(int32_t i = 1; i <= numCodePts; i++) {
1247         bestSnlp.addElement(kuint32max, status);
1248     }
1249
1250
1251     // prev[i] is the index of the last CJK code point in the previous word in
1252     // the best segmentation of the first i characters.
1253     UVector32 prev(numCodePts + 1, status);
1254     for(int32_t i = 0; i <= numCodePts; i++){
1255         prev.addElement(-1, status);
1256     }
1257
1258     const int32_t maxWordSize = 20;
1259     UVector32 values(numCodePts, status);
1260     values.setSize(numCodePts);
1261     UVector32 lengths(numCodePts, status);
1262     lengths.setSize(numCodePts);
1263
1264     UText fu = UTEXT_INITIALIZER;
1265     utext_openUnicodeString(&fu, &inString, &status);
1266
1267     // Dynamic programming to find the best segmentation.
1268
1269     // In outer loop, i  is the code point index,
1270     //                ix is the corresponding string (code unit) index.
1271     //    They differ when the string contains supplementary characters.
1272     int32_t ix = 0;
1273     bool is_prev_katakana = false;
1274     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1275         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1276             continue;
1277         }
1278
1279         int32_t count;
1280         utext_setNativeIndex(&fu, ix);
1281         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1282                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1283                              // Note: lengths is filled with code point lengths
1284                              //       The NULL parameter is the ignored code unit lengths.
1285
1286         // if there are no single character matches found in the dictionary
1287         // starting with this character, treat character as a 1-character word
1288         // with the highest value possible, i.e. the least likely to occur.
1289         // Exclude Korean characters from this treatment, as they should be left
1290         // together by default.
1291         if ((count == 0 || lengths.elementAti(0) != 1) &&
1292                 !fHangulWordSet.contains(inString.char32At(ix))) {
1293             values.setElementAt(maxSnlp, count);   // 255
1294             lengths.setElementAt(1, count++);
1295         }
1296
1297         for (int32_t j = 0; j < count; j++) {
1298             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1299             int32_t ln_j_i = lengths.elementAti(j) + i;
1300             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1301                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1302                 prev.setElementAt(i, ln_j_i);
1303             }
1304         }
1305
1306         // In Japanese,
1307         // Katakana word in single character is pretty rare. So we apply
1308         // the following heuristic to Katakana: any continuous run of Katakana
1309         // characters is considered a candidate word with a default cost
1310         // specified in the katakanaCost table according to its length.
1311
1312         bool is_katakana = isKatakana(inString.char32At(ix));
1313         int32_t katakanaRunLength = 1;
1314         if (!is_prev_katakana && is_katakana) {
1315             int32_t j = inString.moveIndex32(ix, 1);
1316             // Find the end of the continuous run of Katakana characters
1317             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1318                     isKatakana(inString.char32At(j))) {
1319                 j = inString.moveIndex32(j, 1);
1320                 katakanaRunLength++;
1321             }
1322             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1323                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1324                 if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1325                     bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1326                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1327                 }
1328             }
1329         }
1330         is_prev_katakana = is_katakana;
1331     }
1332     utext_close(&fu);
1333
1334     // Start pushing the optimal offset index into t_boundary (t for tentative).
1335     // prev[numCodePts] is guaranteed to be meaningful.
1336     // We'll first push in the reverse order, i.e.,
1337     // t_boundary[0] = numCodePts, and afterwards do a swap.
1338     UVector32 t_boundary(numCodePts+1, status);
1339
1340     int32_t numBreaks = 0;
1341     // No segmentation found, set boundary to end of range
1342     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1343         t_boundary.addElement(numCodePts, status);
1344         numBreaks++;
1345     } else {
1346         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1347             t_boundary.addElement(i, status);
1348             numBreaks++;
1349         }
1350         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1351     }
1352
1353     // Add a break for the start of the dictionary range if there is not one
1354     // there already.
1355     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1356         t_boundary.addElement(0, status);
1357         numBreaks++;
1358     }
1359
1360     // Now that we're done, convert positions in t_boundary[] (indices in
1361     // the normalized input string) back to indices in the original input UText
1362     // while reversing t_boundary and pushing values to foundBreaks.
1363     int32_t prevCPPos = -1;
1364     int32_t prevUTextPos = -1;
1365     for (int32_t i = numBreaks-1; i >= 0; i--) {
1366         int32_t cpPos = t_boundary.elementAti(i);
1367         U_ASSERT(cpPos > prevCPPos);
1368         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1369         U_ASSERT(utextPos >= prevUTextPos);
1370         if (utextPos > prevUTextPos) {
1371             // Boundaries are added to foundBreaks output in ascending order.
1372             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1373             foundBreaks.push(utextPos, status);
1374         } else {
1375             // Normalization expanded the input text, the dictionary found a boundary
1376             // within the expansion, giving two boundaries with the same index in the
1377             // original text. Ignore the second. See ticket #12918.
1378             --numBreaks;
1379         }
1380         prevCPPos = cpPos;
1381         prevUTextPos = utextPos;
1382     }
1383     (void)prevCPPos; // suppress compiler warnings about unused variable
1384
1385     // inString goes out of scope
1386     // inputMap goes out of scope
1387     return numBreaks;
1388 }
1389 #endif
1390
1391 U_NAMESPACE_END
1392
1393 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1394