icuSources/common/dictbe.cpp

   1 /**
   2  *******************************************************************************
   3  * Copyright (C) 2006-2016, International Business Machines Corporation
   4  * and others. All Rights Reserved.
   5  *******************************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_BREAK_ITERATION
  11
  12 #include "brkeng.h"
  13 #include "dictbe.h"
  14 #include "unicode/uniset.h"
  15 #include "unicode/chariter.h"
  16 #include "unicode/ubrk.h"
  17 #include "uvectr32.h"
  18 #include "uvector.h"
  19 #include "uassert.h"
  20 #include "unicode/normlzr.h"
  21 #include "cmemory.h"
  22 #include "dictionarydata.h"
  23
  24 U_NAMESPACE_BEGIN
  25
  26 /*
  27  ******************************************************************
  28  */
  29
  30 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
  31     fTypes = breakTypes;
  32 }
  33
  34 DictionaryBreakEngine::~DictionaryBreakEngine() {
  35 }
  36
  37 UBool
  38 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
  39     return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
  40             && fSet.contains(c));
  41 }
  42
  43 int32_t
  44 DictionaryBreakEngine::findBreaks( UText *text,
  45                                  int32_t startPos,
  46                                  int32_t endPos,
  47                                  UBool reverse,
  48                                  int32_t breakType,
  49                                  UStack &foundBreaks ) const {
  50     int32_t result = 0;
  51
  52     // Find the span of characters included in the set.
  53     //   The span to break begins at the current position in the text, and
  54     //   extends towards the start or end of the text, depending on 'reverse'.
  55
  56     int32_t start = (int32_t)utext_getNativeIndex(text);
  57     int32_t current;
  58     int32_t rangeStart;
  59     int32_t rangeEnd;
  60     UChar32 c = utext_current32(text);
  61     if (reverse) {
  62         UBool   isDict = fSet.contains(c);
  63         while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
  64             c = utext_previous32(text);
  65             isDict = fSet.contains(c);
  66         }
  67         if (current < startPos) {
  68             rangeStart = startPos;
  69         } else {
  70             rangeStart = current;
  71             if (!isDict) {
  72                 utext_next32(text);
  73                 rangeStart = (int32_t)utext_getNativeIndex(text);
  74             }
  75         }
  76         // rangeEnd = start + 1;
  77         utext_setNativeIndex(text, start);
  78         utext_next32(text);
  79         rangeEnd = (int32_t)utext_getNativeIndex(text);
  80     }
  81     else {
  82         while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
  83             utext_next32(text);         // TODO:  recast loop for postincrement
  84             c = utext_current32(text);
  85         }
  86         rangeStart = start;
  87         rangeEnd = current;
  88     }
  89     if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
  90         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
  91         utext_setNativeIndex(text, current);
  92     }
  93
  94     return result;
  95 }
  96
  97 void
  98 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
  99     fSet = set;
 100     // Compact for caching
 101     fSet.compact();
 102 }
 103
 104 /*
 105  ******************************************************************
 106  * PossibleWord
 107  */
 108
 109 // Helper class for improving readability of the Thai/Lao/Khmer word break
 110 // algorithm. The implementation is completely inline.
 111
 112 // List size, limited by the maximum number of words in the dictionary
 113 // that form a nested sequence.
 114 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
 115
 116 class PossibleWord {
 117 private:
 118     // list of word candidate lengths, in increasing length order
 119     // TODO: bytes would be sufficient for word lengths.
 120     int32_t   count;      // Count of candidates
 121     int32_t   prefix;     // The longest match with a dictionary word
 122     int32_t   offset;     // Offset in the text of these candidates
 123     int32_t   mark;       // The preferred candidate's offset
 124     int32_t   current;    // The candidate we're currently looking at
 125     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
 126     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
 127
 128 public:
 129     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
 130     ~PossibleWord() {};
 131
 132     // Fill the list of candidates if needed, select the longest, and return the number found
 133     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
 134
 135     // Select the currently marked candidate, point after it in the text, and invalidate self
 136     int32_t   acceptMarked( UText *text );
 137
 138     // Back up from the current candidate to the next shorter one; return TRUE if that exists
 139     // and point the text after it
 140     UBool     backUp( UText *text );
 141
 142     // Return the longest prefix this candidate location shares with a dictionary word
 143     // Return value is in code points.
 144     int32_t   longestPrefix() { return prefix; };
 145
 146     // Mark the current candidate as the one we like
 147     void      markCurrent() { mark = current; };
 148
 149     // Get length in code points of the marked word.
 150     int32_t   markedCPLength() { return cpLengths[mark]; };
 151 };
 152
 153
 154 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
 155     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
 156     int32_t start = (int32_t)utext_getNativeIndex(text);
 157     if (start != offset) {
 158         offset = start;
 159         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
 160         // Dictionary leaves text after longest prefix, not longest word. Back up.
 161         if (count <= 0) {
 162             utext_setNativeIndex(text, start);
 163         }
 164     }
 165     if (count > 0) {
 166         utext_setNativeIndex(text, start+cuLengths[count-1]);
 167     }
 168     current = count-1;
 169     mark = current;
 170     return count;
 171 }
 172
 173 int32_t
 174 PossibleWord::acceptMarked( UText *text ) {
 175     utext_setNativeIndex(text, offset + cuLengths[mark]);
 176     return cuLengths[mark];
 177 }
 178
 179
 180 UBool
 181 PossibleWord::backUp( UText *text ) {
 182     if (current > 0) {
 183         utext_setNativeIndex(text, offset + cuLengths[--current]);
 184         return TRUE;
 185     }
 186     return FALSE;
 187 }
 188
 189 /*
 190  ******************************************************************
 191  * ThaiBreakEngine
 192  */
 193
 194 // How many words in a row are "good enough"?
 195 static const int32_t THAI_LOOKAHEAD = 3;
 196
 197 // Will not combine a non-word with a preceding dictionary word longer than this
 198 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
 199
 200 // Will not combine a non-word that shares at least this much prefix with a
 201 // dictionary word, with a preceding word
 202 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
 203
 204 // Ellision character
 205 static const int32_t THAI_PAIYANNOI = 0x0E2F;
 206
 207 // Repeat character
 208 static const int32_t THAI_MAIYAMOK = 0x0E46;
 209
 210 // Minimum word size
 211 static const int32_t THAI_MIN_WORD = 2;
 212
 213 // Minimum number of characters for two words
 214 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
 215
 216 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 217     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
 218       fDictionary(adoptDictionary)
 219 {
 220     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
 221     if (U_SUCCESS(status)) {
 222         setCharacters(fThaiWordSet);
 223     }
 224     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
 225     fMarkSet.add(0x0020);
 226     fEndWordSet = fThaiWordSet;
 227     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 228     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 229     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 230     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 231     fSuffixSet.add(THAI_PAIYANNOI);
 232     fSuffixSet.add(THAI_MAIYAMOK);
 233
 234     // Compact for caching.
 235     fMarkSet.compact();
 236     fEndWordSet.compact();
 237     fBeginWordSet.compact();
 238     fSuffixSet.compact();
 239 }
 240
 241 ThaiBreakEngine::~ThaiBreakEngine() {
 242     delete fDictionary;
 243 }
 244
 245 int32_t
 246 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
 247                                                 int32_t rangeStart,
 248                                                 int32_t rangeEnd,
 249                                                 UStack &foundBreaks ) const {
 250     utext_setNativeIndex(text, rangeStart);
 251     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
 252     if (utext_getNativeIndex(text) >= rangeEnd) {
 253         return 0;       // Not enough characters for two words
 254     }
 255     utext_setNativeIndex(text, rangeStart);
 256
 257
 258     uint32_t wordsFound = 0;
 259     int32_t cpWordLength = 0;    // Word Length in Code Points.
 260     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
 261     int32_t current;
 262     UErrorCode status = U_ZERO_ERROR;
 263     PossibleWord words[THAI_LOOKAHEAD];
 264
 265     utext_setNativeIndex(text, rangeStart);
 266
 267     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 268         cpWordLength = 0;
 269         cuWordLength = 0;
 270
 271         // Look for candidate words at the current position
 272         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 273
 274         // If we found exactly one, use that
 275         if (candidates == 1) {
 276             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
 277             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
 278             wordsFound += 1;
 279         }
 280         // If there was more than one, see which one can take us forward the most words
 281         else if (candidates > 1) {
 282             // If we're already at the end of the range, we're done
 283             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 284                 goto foundBest;
 285             }
 286             do {
 287                 int32_t wordsMatched = 1;
 288                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 289                     if (wordsMatched < 2) {
 290                         // Followed by another dictionary word; mark first word as a good candidate
 291                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
 292                         wordsMatched = 2;
 293                     }
 294
 295                     // If we're already at the end of the range, we're done
 296                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 297                         goto foundBest;
 298                     }
 299
 300                     // See if any of the possible second words is followed by a third word
 301                     do {
 302                         // If we find a third word, stop right away
 303                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 304                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
 305                             goto foundBest;
 306                         }
 307                     }
 308                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
 309                 }
 310             }
 311             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
 312 foundBest:
 313             // Set UText position to after the accepted word.
 314             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
 315             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
 316             wordsFound += 1;
 317         }
 318
 319         // We come here after having either found a word or not. We look ahead to the
 320         // next word. If it's not a dictionary word, we will combine it with the word we
 321         // just found (if there is one), but only if the preceding word does not exceed
 322         // the threshold.
 323         // The text iterator should now be positioned at the end of the word we found.
 324
 325         UChar32 uc = 0;
 326         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
 327             // if it is a dictionary word, do nothing. If it isn't, then if there is
 328             // no preceding word, or the non-word shares less than the minimum threshold
 329             // of characters with a dictionary word, then scan to resynchronize
 330             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 331                   && (cuWordLength == 0
 332                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
 333                 // Look for a plausible word boundary
 334                 int32_t remaining = rangeEnd - (current+cuWordLength);
 335                 UChar32 pc;
 336                 int32_t chars = 0;
 337                 for (;;) {
 338                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 339                     pc = utext_next32(text);
 340                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 341                     chars += pcSize;
 342                     remaining -= pcSize;
 343                     if (remaining <= 0) {
 344                         break;
 345                     }
 346                     uc = utext_current32(text);
 347                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 348                         // Maybe. See if it's in the dictionary.
 349                         // NOTE: In the original Apple code, checked that the next
 350                         // two characters after uc were not 0x0E4C THANTHAKHAT before
 351                         // checking the dictionary. That is just a performance filter,
 352                         // but it's not clear it's faster than checking the trie.
 353                         int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 354                         utext_setNativeIndex(text, current + cuWordLength + chars);
 355                         if (candidates > 0) {
 356                             break;
 357                         }
 358                     }
 359                 }
 360
 361                 // Bump the word count if there wasn't already one
 362                 if (cuWordLength <= 0) {
 363                     wordsFound += 1;
 364                 }
 365
 366                 // Update the length with the passed-over characters
 367                 cuWordLength += chars;
 368             }
 369             else {
 370                 // Back up to where we were for next iteration
 371                 utext_setNativeIndex(text, current+cuWordLength);
 372             }
 373         }
 374
 375         // Never stop before a combining mark.
 376         int32_t currPos;
 377         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 378             utext_next32(text);
 379             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 380         }
 381
 382         // Look ahead for possible suffixes if a dictionary word does not follow.
 383         // We do this in code rather than using a rule so that the heuristic
 384         // resynch continues to function. For example, one of the suffix characters
 385         // could be a typo in the middle of a word.
 386         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
 387             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 388                 && fSuffixSet.contains(uc = utext_current32(text))) {
 389                 if (uc == THAI_PAIYANNOI) {
 390                     if (!fSuffixSet.contains(utext_previous32(text))) {
 391                         // Skip over previous end and PAIYANNOI
 392                         utext_next32(text);
 393                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
 394                         utext_next32(text);
 395                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
 396                         uc = utext_current32(text);     // Fetch next character
 397                     }
 398                     else {
 399                         // Restore prior position
 400                         utext_next32(text);
 401                     }
 402                 }
 403                 if (uc == THAI_MAIYAMOK) {
 404                     if (utext_previous32(text) != THAI_MAIYAMOK) {
 405                         // Skip over previous end and MAIYAMOK
 406                         utext_next32(text);
 407                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
 408                         utext_next32(text);
 409                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
 410                     }
 411                     else {
 412                         // Restore prior position
 413                         utext_next32(text);
 414                     }
 415                 }
 416             }
 417             else {
 418                 utext_setNativeIndex(text, current+cuWordLength);
 419             }
 420         }
 421
 422         // Did we find a word on this iteration? If so, push it on the break stack
 423         if (cuWordLength > 0) {
 424             foundBreaks.push((current+cuWordLength), status);
 425         }
 426     }
 427
 428     // Don't return a break for the end of the dictionary range if there is one there.
 429     if (foundBreaks.peeki() >= rangeEnd) {
 430         (void) foundBreaks.popi();
 431         wordsFound -= 1;
 432     }
 433
 434     return wordsFound;
 435 }
 436
 437 /*
 438  ******************************************************************
 439  * LaoBreakEngine
 440  */
 441
 442 // How many words in a row are "good enough"?
 443 static const int32_t LAO_LOOKAHEAD = 3;
 444
 445 // Will not combine a non-word with a preceding dictionary word longer than this
 446 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
 447
 448 // Will not combine a non-word that shares at least this much prefix with a
 449 // dictionary word, with a preceding word
 450 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
 451
 452 // Minimum word size
 453 static const int32_t LAO_MIN_WORD = 2;
 454
 455 // Minimum number of characters for two words
 456 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
 457
 458 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 459     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
 460       fDictionary(adoptDictionary)
 461 {
 462     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
 463     if (U_SUCCESS(status)) {
 464         setCharacters(fLaoWordSet);
 465     }
 466     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
 467     fMarkSet.add(0x0020);
 468     fEndWordSet = fLaoWordSet;
 469     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
 470     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
 471     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
 472     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
 473
 474     // Compact for caching.
 475     fMarkSet.compact();
 476     fEndWordSet.compact();
 477     fBeginWordSet.compact();
 478 }
 479
 480 LaoBreakEngine::~LaoBreakEngine() {
 481     delete fDictionary;
 482 }
 483
 484 int32_t
 485 LaoBreakEngine::divideUpDictionaryRange( UText *text,
 486                                                 int32_t rangeStart,
 487                                                 int32_t rangeEnd,
 488                                                 UStack &foundBreaks ) const {
 489     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
 490         return 0;       // Not enough characters for two words
 491     }
 492
 493     uint32_t wordsFound = 0;
 494     int32_t cpWordLength = 0;
 495     int32_t cuWordLength = 0;
 496     int32_t current;
 497     UErrorCode status = U_ZERO_ERROR;
 498     PossibleWord words[LAO_LOOKAHEAD];
 499
 500     utext_setNativeIndex(text, rangeStart);
 501
 502     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 503         cuWordLength = 0;
 504         cpWordLength = 0;
 505
 506         // Look for candidate words at the current position
 507         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 508
 509         // If we found exactly one, use that
 510         if (candidates == 1) {
 511             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
 512             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
 513             wordsFound += 1;
 514         }
 515         // If there was more than one, see which one can take us forward the most words
 516         else if (candidates > 1) {
 517             // If we're already at the end of the range, we're done
 518             if (utext_getNativeIndex(text) >= rangeEnd) {
 519                 goto foundBest;
 520             }
 521             do {
 522                 int32_t wordsMatched = 1;
 523                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 524                     if (wordsMatched < 2) {
 525                         // Followed by another dictionary word; mark first word as a good candidate
 526                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
 527                         wordsMatched = 2;
 528                     }
 529
 530                     // If we're already at the end of the range, we're done
 531                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 532                         goto foundBest;
 533                     }
 534
 535                     // See if any of the possible second words is followed by a third word
 536                     do {
 537                         // If we find a third word, stop right away
 538                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 539                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
 540                             goto foundBest;
 541                         }
 542                     }
 543                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
 544                 }
 545             }
 546             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
 547 foundBest:
 548             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
 549             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
 550             wordsFound += 1;
 551         }
 552
 553         // We come here after having either found a word or not. We look ahead to the
 554         // next word. If it's not a dictionary word, we will combine it withe the word we
 555         // just found (if there is one), but only if the preceding word does not exceed
 556         // the threshold.
 557         // The text iterator should now be positioned at the end of the word we found.
 558         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
 559             // if it is a dictionary word, do nothing. If it isn't, then if there is
 560             // no preceding word, or the non-word shares less than the minimum threshold
 561             // of characters with a dictionary word, then scan to resynchronize
 562             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 563                   && (cuWordLength == 0
 564                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
 565                 // Look for a plausible word boundary
 566                 int32_t remaining = rangeEnd - (current + cuWordLength);
 567                 UChar32 pc;
 568                 UChar32 uc;
 569                 int32_t chars = 0;
 570                 for (;;) {
 571                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 572                     pc = utext_next32(text);
 573                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 574                     chars += pcSize;
 575                     remaining -= pcSize;
 576                     if (remaining <= 0) {
 577                         break;
 578                     }
 579                     uc = utext_current32(text);
 580                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 581                         // Maybe. See if it's in the dictionary.
 582                         // TODO: this looks iffy; compare with old code.
 583                         int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 584                         utext_setNativeIndex(text, current + cuWordLength + chars);
 585                         if (candidates > 0) {
 586                             break;
 587                         }
 588                     }
 589                 }
 590
 591                 // Bump the word count if there wasn't already one
 592                 if (cuWordLength <= 0) {
 593                     wordsFound += 1;
 594                 }
 595
 596                 // Update the length with the passed-over characters
 597                 cuWordLength += chars;
 598             }
 599             else {
 600                 // Back up to where we were for next iteration
 601                 utext_setNativeIndex(text, current + cuWordLength);
 602             }
 603         }
 604
 605         // Never stop before a combining mark.
 606         int32_t currPos;
 607         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 608             utext_next32(text);
 609             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 610         }
 611
 612         // Look ahead for possible suffixes if a dictionary word does not follow.
 613         // We do this in code rather than using a rule so that the heuristic
 614         // resynch continues to function. For example, one of the suffix characters
 615         // could be a typo in the middle of a word.
 616         // NOT CURRENTLY APPLICABLE TO LAO
 617
 618         // Did we find a word on this iteration? If so, push it on the break stack
 619         if (cuWordLength > 0) {
 620             foundBreaks.push((current+cuWordLength), status);
 621         }
 622     }
 623
 624     // Don't return a break for the end of the dictionary range if there is one there.
 625     if (foundBreaks.peeki() >= rangeEnd) {
 626         (void) foundBreaks.popi();
 627         wordsFound -= 1;
 628     }
 629
 630     return wordsFound;
 631 }
 632
 633 /*
 634  ******************************************************************
 635  * BurmeseBreakEngine
 636  */
 637
 638 // How many words in a row are "good enough"?
 639 static const int32_t BURMESE_LOOKAHEAD = 3;
 640
 641 // Will not combine a non-word with a preceding dictionary word longer than this
 642 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
 643
 644 // Will not combine a non-word that shares at least this much prefix with a
 645 // dictionary word, with a preceding word
 646 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
 647
 648 // Minimum word size
 649 static const int32_t BURMESE_MIN_WORD = 2;
 650
 651 // Minimum number of characters for two words
 652 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
 653
 654 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 655     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
 656       fDictionary(adoptDictionary)
 657 {
 658     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
 659     if (U_SUCCESS(status)) {
 660         setCharacters(fBurmeseWordSet);
 661     }
 662     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
 663     fMarkSet.add(0x0020);
 664     fEndWordSet = fBurmeseWordSet;
 665     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
 666
 667     // Compact for caching.
 668     fMarkSet.compact();
 669     fEndWordSet.compact();
 670     fBeginWordSet.compact();
 671 }
 672
 673 BurmeseBreakEngine::~BurmeseBreakEngine() {
 674     delete fDictionary;
 675 }
 676
 677 int32_t
 678 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
 679                                                 int32_t rangeStart,
 680                                                 int32_t rangeEnd,
 681                                                 UStack &foundBreaks ) const {
 682     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
 683         return 0;       // Not enough characters for two words
 684     }
 685
 686     uint32_t wordsFound = 0;
 687     int32_t cpWordLength = 0;
 688     int32_t cuWordLength = 0;
 689     int32_t current;
 690     UErrorCode status = U_ZERO_ERROR;
 691     PossibleWord words[BURMESE_LOOKAHEAD];
 692
 693     utext_setNativeIndex(text, rangeStart);
 694
 695     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 696         cuWordLength = 0;
 697         cpWordLength = 0;
 698
 699         // Look for candidate words at the current position
 700         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 701
 702         // If we found exactly one, use that
 703         if (candidates == 1) {
 704             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
 705             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
 706             wordsFound += 1;
 707         }
 708         // If there was more than one, see which one can take us forward the most words
 709         else if (candidates > 1) {
 710             // If we're already at the end of the range, we're done
 711             if (utext_getNativeIndex(text) >= rangeEnd) {
 712                 goto foundBest;
 713             }
 714             do {
 715                 int32_t wordsMatched = 1;
 716                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 717                     if (wordsMatched < 2) {
 718                         // Followed by another dictionary word; mark first word as a good candidate
 719                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
 720                         wordsMatched = 2;
 721                     }
 722
 723                     // If we're already at the end of the range, we're done
 724                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 725                         goto foundBest;
 726                     }
 727
 728                     // See if any of the possible second words is followed by a third word
 729                     do {
 730                         // If we find a third word, stop right away
 731                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 732                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
 733                             goto foundBest;
 734                         }
 735                     }
 736                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
 737                 }
 738             }
 739             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
 740 foundBest:
 741             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
 742             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
 743             wordsFound += 1;
 744         }
 745
 746         // We come here after having either found a word or not. We look ahead to the
 747         // next word. If it's not a dictionary word, we will combine it withe the word we
 748         // just found (if there is one), but only if the preceding word does not exceed
 749         // the threshold.
 750         // The text iterator should now be positioned at the end of the word we found.
 751         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
 752             // if it is a dictionary word, do nothing. If it isn't, then if there is
 753             // no preceding word, or the non-word shares less than the minimum threshold
 754             // of characters with a dictionary word, then scan to resynchronize
 755             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 756                   && (cuWordLength == 0
 757                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
 758                 // Look for a plausible word boundary
 759                 int32_t remaining = rangeEnd - (current + cuWordLength);
 760                 UChar32 pc;
 761                 UChar32 uc;
 762                 int32_t chars = 0;
 763                 for (;;) {
 764                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 765                     pc = utext_next32(text);
 766                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 767                     chars += pcSize;
 768                     remaining -= pcSize;
 769                     if (remaining <= 0) {
 770                         break;
 771                     }
 772                     uc = utext_current32(text);
 773                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 774                         // Maybe. See if it's in the dictionary.
 775                         // TODO: this looks iffy; compare with old code.
 776                         int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 777                         utext_setNativeIndex(text, current + cuWordLength + chars);
 778                         if (candidates > 0) {
 779                             break;
 780                         }
 781                     }
 782                 }
 783
 784                 // Bump the word count if there wasn't already one
 785                 if (cuWordLength <= 0) {
 786                     wordsFound += 1;
 787                 }
 788
 789                 // Update the length with the passed-over characters
 790                 cuWordLength += chars;
 791             }
 792             else {
 793                 // Back up to where we were for next iteration
 794                 utext_setNativeIndex(text, current + cuWordLength);
 795             }
 796         }
 797
 798         // Never stop before a combining mark.
 799         int32_t currPos;
 800         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 801             utext_next32(text);
 802             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 803         }
 804
 805         // Look ahead for possible suffixes if a dictionary word does not follow.
 806         // We do this in code rather than using a rule so that the heuristic
 807         // resynch continues to function. For example, one of the suffix characters
 808         // could be a typo in the middle of a word.
 809         // NOT CURRENTLY APPLICABLE TO BURMESE
 810
 811         // Did we find a word on this iteration? If so, push it on the break stack
 812         if (cuWordLength > 0) {
 813             foundBreaks.push((current+cuWordLength), status);
 814         }
 815     }
 816
 817     // Don't return a break for the end of the dictionary range if there is one there.
 818     if (foundBreaks.peeki() >= rangeEnd) {
 819         (void) foundBreaks.popi();
 820         wordsFound -= 1;
 821     }
 822
 823     return wordsFound;
 824 }
 825
 826 /*
 827  ******************************************************************
 828  * KhmerBreakEngine
 829  */
 830
 831 // How many words in a row are "good enough"?
 832 static const int32_t KHMER_LOOKAHEAD = 3;
 833
 834 // Will not combine a non-word with a preceding dictionary word longer than this
 835 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
 836
 837 // Will not combine a non-word that shares at least this much prefix with a
 838 // dictionary word, with a preceding word
 839 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
 840
 841 // Minimum word size
 842 static const int32_t KHMER_MIN_WORD = 2;
 843
 844 // Minimum number of characters for two words
 845 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
 846
 847 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 848     : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
 849       fDictionary(adoptDictionary)
 850 {
 851     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
 852     if (U_SUCCESS(status)) {
 853         setCharacters(fKhmerWordSet);
 854     }
 855     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
 856     fMarkSet.add(0x0020);
 857     fEndWordSet = fKhmerWordSet;
 858     fBeginWordSet.add(0x1780, 0x17B3);
 859     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
 860     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
 861     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
 862     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
 863     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
 864 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 865 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 866 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 867 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 868 //    fSuffixSet.add(THAI_PAIYANNOI);
 869 //    fSuffixSet.add(THAI_MAIYAMOK);
 870
 871     // Compact for caching.
 872     fMarkSet.compact();
 873     fEndWordSet.compact();
 874     fBeginWordSet.compact();
 875 //    fSuffixSet.compact();
 876 }
 877
 878 KhmerBreakEngine::~KhmerBreakEngine() {
 879     delete fDictionary;
 880 }
 881
 882 int32_t
 883 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
 884                                                 int32_t rangeStart,
 885                                                 int32_t rangeEnd,
 886                                                 UStack &foundBreaks ) const {
 887     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
 888         return 0;       // Not enough characters for two words
 889     }
 890
 891     uint32_t wordsFound = 0;
 892     int32_t cpWordLength = 0;
 893     int32_t cuWordLength = 0;
 894     int32_t current;
 895     UErrorCode status = U_ZERO_ERROR;
 896     PossibleWord words[KHMER_LOOKAHEAD];
 897
 898     utext_setNativeIndex(text, rangeStart);
 899
 900     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 901         cuWordLength = 0;
 902         cpWordLength = 0;
 903
 904         // Look for candidate words at the current position
 905         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 906
 907         // If we found exactly one, use that
 908         if (candidates == 1) {
 909             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 910             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 911             wordsFound += 1;
 912         }
 913
 914         // If there was more than one, see which one can take us forward the most words
 915         else if (candidates > 1) {
 916             // If we're already at the end of the range, we're done
 917             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 918                 goto foundBest;
 919             }
 920             do {
 921                 int32_t wordsMatched = 1;
 922                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 923                     if (wordsMatched < 2) {
 924                         // Followed by another dictionary word; mark first word as a good candidate
 925                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 926                         wordsMatched = 2;
 927                     }
 928
 929                     // If we're already at the end of the range, we're done
 930                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 931                         goto foundBest;
 932                     }
 933
 934                     // See if any of the possible second words is followed by a third word
 935                     do {
 936                         // If we find a third word, stop right away
 937                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 938                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 939                             goto foundBest;
 940                         }
 941                     }
 942                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
 943                 }
 944             }
 945             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
 946 foundBest:
 947             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 948             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 949             wordsFound += 1;
 950         }
 951
 952         // We come here after having either found a word or not. We look ahead to the
 953         // next word. If it's not a dictionary word, we will combine it with the word we
 954         // just found (if there is one), but only if the preceding word does not exceed
 955         // the threshold.
 956         // The text iterator should now be positioned at the end of the word we found.
 957         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
 958             // if it is a dictionary word, do nothing. If it isn't, then if there is
 959             // no preceding word, or the non-word shares less than the minimum threshold
 960             // of characters with a dictionary word, then scan to resynchronize
 961             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 962                   && (cuWordLength == 0
 963                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
 964                 // Look for a plausible word boundary
 965                 int32_t remaining = rangeEnd - (current+cuWordLength);
 966                 UChar32 pc;
 967                 UChar32 uc;
 968                 int32_t chars = 0;
 969                 for (;;) {
 970                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 971                     pc = utext_next32(text);
 972                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 973                     chars += pcSize;
 974                     remaining -= pcSize;
 975                     if (remaining <= 0) {
 976                         break;
 977                     }
 978                     uc = utext_current32(text);
 979                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 980                         // Maybe. See if it's in the dictionary.
 981                         int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 982                         utext_setNativeIndex(text, current+cuWordLength+chars);
 983                         if (candidates > 0) {
 984                             break;
 985                         }
 986                     }
 987                 }
 988
 989                 // Bump the word count if there wasn't already one
 990                 if (cuWordLength <= 0) {
 991                     wordsFound += 1;
 992                 }
 993
 994                 // Update the length with the passed-over characters
 995                 cuWordLength += chars;
 996             }
 997             else {
 998                 // Back up to where we were for next iteration
 999                 utext_setNativeIndex(text, current+cuWordLength);
1000             }
1001         }
1002
1003         // Never stop before a combining mark.
1004         int32_t currPos;
1005         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
1006             utext_next32(text);
1007             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
1008         }
1009
1010         // Look ahead for possible suffixes if a dictionary word does not follow.
1011         // We do this in code rather than using a rule so that the heuristic
1012         // resynch continues to function. For example, one of the suffix characters
1013         // could be a typo in the middle of a word.
1014 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1015 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1016 //                && fSuffixSet.contains(uc = utext_current32(text))) {
1017 //                if (uc == KHMER_PAIYANNOI) {
1018 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
1019 //                        // Skip over previous end and PAIYANNOI
1020 //                        utext_next32(text);
1021 //                        utext_next32(text);
1022 //                        wordLength += 1;            // Add PAIYANNOI to word
1023 //                        uc = utext_current32(text);     // Fetch next character
1024 //                    }
1025 //                    else {
1026 //                        // Restore prior position
1027 //                        utext_next32(text);
1028 //                    }
1029 //                }
1030 //                if (uc == KHMER_MAIYAMOK) {
1031 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1032 //                        // Skip over previous end and MAIYAMOK
1033 //                        utext_next32(text);
1034 //                        utext_next32(text);
1035 //                        wordLength += 1;            // Add MAIYAMOK to word
1036 //                    }
1037 //                    else {
1038 //                        // Restore prior position
1039 //                        utext_next32(text);
1040 //                    }
1041 //                }
1042 //            }
1043 //            else {
1044 //                utext_setNativeIndex(text, current+wordLength);
1045 //            }
1046 //        }
1047
1048         // Did we find a word on this iteration? If so, push it on the break stack
1049         if (cuWordLength > 0) {
1050             foundBreaks.push((current+cuWordLength), status);
1051         }
1052     }
1053
1054     // Don't return a break for the end of the dictionary range if there is one there.
1055     if (foundBreaks.peeki() >= rangeEnd) {
1056         (void) foundBreaks.popi();
1057         wordsFound -= 1;
1058     }
1059
1060     return wordsFound;
1061 }
1062
1063 #if !UCONFIG_NO_NORMALIZATION
1064 /*
1065  ******************************************************************
1066  * CjkBreakEngine
1067  */
1068 static const uint32_t kuint32max = 0xFFFFFFFF;
1069 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1070 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
1071     // Korean dictionary only includes Hangul syllables
1072     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1073     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1074     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1075     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1076     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1077
1078     if (U_SUCCESS(status)) {
1079         // handle Korean and Japanese/Chinese using different dictionaries
1080         if (type == kKorean) {
1081             setCharacters(fHangulWordSet);
1082         } else { //Chinese and Japanese
1083             UnicodeSet cjSet;
1084             cjSet.addAll(fHanWordSet);
1085             cjSet.addAll(fKatakanaWordSet);
1086             cjSet.addAll(fHiraganaWordSet);
1087             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1088             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1089             setCharacters(cjSet);
1090         }
1091     }
1092 }
1093
1094 CjkBreakEngine::~CjkBreakEngine(){
1095     delete fDictionary;
1096 }
1097
1098 // The katakanaCost values below are based on the length frequencies of all
1099 // katakana phrases in the dictionary
1100 static const int32_t kMaxKatakanaLength = 8;
1101 static const int32_t kMaxKatakanaGroupLength = 20;
1102 static const uint32_t maxSnlp = 255;
1103
1104 static inline uint32_t getKatakanaCost(int32_t wordLength){
1105     //TODO: fill array with actual values from dictionary!
1106     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1107                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1108     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1109 }
1110
1111 static inline bool isKatakana(uint16_t value) {
1112     return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
1113             (value >= 0xFF66u && value <= 0xFF9fu);
1114 }
1115
1116
1117 // Function for accessing internal utext flags.
1118 //   Replicates an internal UText function.
1119
1120 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1121     return (int32_t)1 << bitIndex;
1122 }
1123
1124
1125 /*
1126  * @param text A UText representing the text
1127  * @param rangeStart The start of the range of dictionary characters
1128  * @param rangeEnd The end of the range of dictionary characters
1129  * @param foundBreaks Output of C array of int32_t break positions, or 0
1130  * @return The number of breaks found
1131  */
1132 int32_t
1133 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1134         int32_t rangeStart,
1135         int32_t rangeEnd,
1136         UStack &foundBreaks ) const {
1137     if (rangeStart >= rangeEnd) {
1138         return 0;
1139     }
1140
1141     // UnicodeString version of input UText, NFKC normalized if necessary.
1142     UnicodeString inString;
1143
1144     // inputMap[inStringIndex] = corresponding native index from UText inText.
1145     // If NULL then mapping is 1:1
1146     LocalPointer<UVector32>     inputMap;
1147
1148     UErrorCode     status      = U_ZERO_ERROR;
1149
1150
1151     // if UText has the input string as one contiguous UTF-16 chunk
1152     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1153          inText->chunkNativeStart <= rangeStart &&
1154          inText->chunkNativeLimit >= rangeEnd   &&
1155          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1156
1157         // Input UText is in one contiguous UTF-16 chunk.
1158         // Use Read-only aliasing UnicodeString.
1159         inString.setTo(FALSE,
1160                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1161                        rangeEnd - rangeStart);
1162     } else {
1163         // Copy the text from the original inText (UText) to inString (UnicodeString).
1164         // Create a map from UnicodeString indices -> UText offsets.
1165         utext_setNativeIndex(inText, rangeStart);
1166         int32_t limit = rangeEnd;
1167         U_ASSERT(limit <= utext_nativeLength(inText));
1168         if (limit > utext_nativeLength(inText)) {
1169             limit = (int32_t)utext_nativeLength(inText);
1170         }
1171         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1172         if (U_FAILURE(status)) {
1173             return 0;
1174         }
1175         while (utext_getNativeIndex(inText) < limit) {
1176             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1177             UChar32 c = utext_next32(inText);
1178             U_ASSERT(c != U_SENTINEL);
1179             inString.append(c);
1180             while (inputMap->size() < inString.length()) {
1181                 inputMap->addElement(nativePosition, status);
1182             }
1183         }
1184         inputMap->addElement(limit, status);
1185     }
1186
1187
1188     if (!nfkcNorm2->isNormalized(inString, status)) {
1189         UnicodeString normalizedInput;
1190         //  normalizedMap[normalizedInput position] ==  original UText position.
1191         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1192         if (U_FAILURE(status)) {
1193             return 0;
1194         }
1195
1196         UnicodeString fragment;
1197         UnicodeString normalizedFragment;
1198         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1199             fragment.remove();
1200             int32_t fragmentStartI = srcI;
1201             UChar32 c = inString.char32At(srcI);
1202             for (;;) {
1203                 fragment.append(c);
1204                 srcI = inString.moveIndex32(srcI, 1);
1205                 if (srcI == inString.length()) {
1206                     break;
1207                 }
1208                 c = inString.char32At(srcI);
1209                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1210                     break;
1211                 }
1212             }
1213             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1214             normalizedInput.append(normalizedFragment);
1215
1216             // Map every position in the normalized chunk to the start of the chunk
1217             //   in the original input.
1218             int32_t fragmentOriginalStart = inputMap.isValid() ?
1219                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1220             while (normalizedMap->size() < normalizedInput.length()) {
1221                 normalizedMap->addElement(fragmentOriginalStart, status);
1222                 if (U_FAILURE(status)) {
1223                     break;
1224                 }
1225             }
1226         }
1227         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1228         int32_t nativeEnd = inputMap.isValid() ?
1229                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1230         normalizedMap->addElement(nativeEnd, status);
1231
1232         inputMap.moveFrom(normalizedMap);
1233         inString.moveFrom(normalizedInput);
1234     }
1235
1236     int32_t numCodePts = inString.countChar32();
1237     if (numCodePts != inString.length()) {
1238         // There are supplementary characters in the input.
1239         // The dictionary will produce boundary positions in terms of code point indexes,
1240         //   not in terms of code unit string indexes.
1241         // Use the inputMap mechanism to take care of this in addition to indexing differences
1242         //    from normalization and/or UTF-8 input.
1243         UBool hadExistingMap = inputMap.isValid();
1244         if (!hadExistingMap) {
1245             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1246             if (U_FAILURE(status)) {
1247                 return 0;
1248             }
1249         }
1250         int32_t cpIdx = 0;
1251         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1252             U_ASSERT(cuIdx >= cpIdx);
1253             if (hadExistingMap) {
1254                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1255             } else {
1256                 inputMap->addElement(cuIdx+rangeStart, status);
1257             }
1258             cpIdx++;
1259             if (cuIdx == inString.length()) {
1260                break;
1261             }
1262         }
1263     }
1264
1265     // bestSnlp[i] is the snlp of the best segmentation of the first i
1266     // code points in the range to be matched.
1267     UVector32 bestSnlp(numCodePts + 1, status);
1268     bestSnlp.addElement(0, status);
1269     for(int32_t i = 1; i <= numCodePts; i++) {
1270         bestSnlp.addElement(kuint32max, status);
1271     }
1272
1273
1274     // prev[i] is the index of the last CJK code point in the previous word in
1275     // the best segmentation of the first i characters.
1276     UVector32 prev(numCodePts + 1, status);
1277     for(int32_t i = 0; i <= numCodePts; i++){
1278         prev.addElement(-1, status);
1279     }
1280
1281     const int32_t maxWordSize = 20;
1282     UVector32 values(numCodePts, status);
1283     values.setSize(numCodePts);
1284     UVector32 lengths(numCodePts, status);
1285     lengths.setSize(numCodePts);
1286
1287     UText fu = UTEXT_INITIALIZER;
1288     utext_openUnicodeString(&fu, &inString, &status);
1289
1290     // Dynamic programming to find the best segmentation.
1291
1292     // In outer loop, i  is the code point index,
1293     //                ix is the corresponding string (code unit) index.
1294     //    They differ when the string contains supplementary characters.
1295     int32_t ix = 0;
1296     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1297         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1298             continue;
1299         }
1300
1301         int32_t count;
1302         utext_setNativeIndex(&fu, ix);
1303         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1304                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1305                              // Note: lengths is filled with code point lengths
1306                              //       The NULL parameter is the ignored code unit lengths.
1307
1308         // if there are no single character matches found in the dictionary
1309         // starting with this charcter, treat character as a 1-character word
1310         // with the highest value possible, i.e. the least likely to occur.
1311         // Exclude Korean characters from this treatment, as they should be left
1312         // together by default.
1313         if ((count == 0 || lengths.elementAti(0) != 1) &&
1314                 !fHangulWordSet.contains(inString.char32At(ix))) {
1315             values.setElementAt(maxSnlp, count);   // 255
1316             lengths.setElementAt(1, count++);
1317         }
1318
1319         for (int32_t j = 0; j < count; j++) {
1320             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1321             int32_t ln_j_i = lengths.elementAti(j) + i;
1322             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1323                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1324                 prev.setElementAt(i, ln_j_i);
1325             }
1326         }
1327
1328         // In Japanese,
1329         // Katakana word in single character is pretty rare. So we apply
1330         // the following heuristic to Katakana: any continuous run of Katakana
1331         // characters is considered a candidate word with a default cost
1332         // specified in the katakanaCost table according to its length.
1333
1334         bool is_prev_katakana = false;
1335         bool is_katakana = isKatakana(inString.char32At(ix));
1336         int32_t katakanaRunLength = 1;
1337         if (!is_prev_katakana && is_katakana) {
1338             int32_t j = inString.moveIndex32(ix, 1);
1339             // Find the end of the continuous run of Katakana characters
1340             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1341                     isKatakana(inString.char32At(j))) {
1342                 j = inString.moveIndex32(j, 1);
1343                 katakanaRunLength++;
1344             }
1345             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1346                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1347                 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
1348                     bestSnlp.setElementAt(newSnlp, j);
1349                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1350                 }
1351             }
1352         }
1353         is_prev_katakana = is_katakana;
1354     }
1355     utext_close(&fu);
1356
1357     // Start pushing the optimal offset index into t_boundary (t for tentative).
1358     // prev[numCodePts] is guaranteed to be meaningful.
1359     // We'll first push in the reverse order, i.e.,
1360     // t_boundary[0] = numCodePts, and afterwards do a swap.
1361     UVector32 t_boundary(numCodePts+1, status);
1362
1363     int32_t numBreaks = 0;
1364     // No segmentation found, set boundary to end of range
1365     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1366         t_boundary.addElement(numCodePts, status);
1367         numBreaks++;
1368     } else {
1369         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1370             t_boundary.addElement(i, status);
1371             numBreaks++;
1372         }
1373         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1374     }
1375
1376     // Add a break for the start of the dictionary range if there is not one
1377     // there already.
1378     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1379         t_boundary.addElement(0, status);
1380         numBreaks++;
1381     }
1382
1383     // Now that we're done, convert positions in t_boundary[] (indices in
1384     // the normalized input string) back to indices in the original input UText
1385     // while reversing t_boundary and pushing values to foundBreaks.
1386     for (int32_t i = numBreaks-1; i >= 0; i--) {
1387         int32_t cpPos = t_boundary.elementAti(i);
1388         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1389         // Boundaries are added to foundBreaks output in ascending order.
1390         U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
1391         foundBreaks.push(utextPos, status);
1392     }
1393
1394     // inString goes out of scope
1395     // inputMap goes out of scope
1396     return numBreaks;
1397 }
1398 #endif
1399
1400 U_NAMESPACE_END
1401
1402 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1403