icuSources/common/dictbe.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /**
   4  *******************************************************************************
   5  * Copyright (C) 2006-2016, International Business Machines Corporation
   6  * and others. All Rights Reserved.
   7  *******************************************************************************
   8  */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_BREAK_ITERATION
  13
  14 #include "brkeng.h"
  15 #include "dictbe.h"
  16 #include "unicode/uniset.h"
  17 #include "unicode/chariter.h"
  18 #include "unicode/ubrk.h"
  19 #include "uvectr32.h"
  20 #include "uvector.h"
  21 #include "uassert.h"
  22 #include "unicode/normlzr.h"
  23 #include "cmemory.h"
  24 #include "dictionarydata.h"
  25
  26 U_NAMESPACE_BEGIN
  27
  28 /*
  29  ******************************************************************
  30  */
  31
  32 DictionaryBreakEngine::DictionaryBreakEngine() {
  33 }
  34
  35 DictionaryBreakEngine::~DictionaryBreakEngine() {
  36 }
  37
  38 UBool
  39 DictionaryBreakEngine::handles(UChar32 c) const {
  40     return fSet.contains(c);
  41 }
  42
  43 int32_t
  44 DictionaryBreakEngine::findBreaks( UText *text,
  45                                  int32_t startPos,
  46                                  int32_t endPos,
  47                                  UVector32 &foundBreaks ) const {
  48     (void)startPos;            // TODO: remove this param?
  49     int32_t result = 0;
  50
  51     // Find the span of characters included in the set.
  52     //   The span to break begins at the current position in the text, and
  53     //   extends towards the start or end of the text, depending on 'reverse'.
  54
  55     int32_t start = (int32_t)utext_getNativeIndex(text);
  56     int32_t current;
  57     int32_t rangeStart;
  58     int32_t rangeEnd;
  59     UChar32 c = utext_current32(text);
  60     while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
  61         utext_next32(text);         // TODO:  recast loop for postincrement
  62         c = utext_current32(text);
  63     }
  64     rangeStart = start;
  65     rangeEnd = current;
  66     result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
  67     utext_setNativeIndex(text, current);
  68
  69     return result;
  70 }
  71
  72 void
  73 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
  74     fSet = set;
  75     // Compact for caching
  76     fSet.compact();
  77 }
  78
  79 /*
  80  ******************************************************************
  81  * PossibleWord
  82  */
  83
  84 // Helper class for improving readability of the Thai/Lao/Khmer word break
  85 // algorithm. The implementation is completely inline.
  86
  87 // List size, limited by the maximum number of words in the dictionary
  88 // that form a nested sequence.
  89 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
  90
  91 class PossibleWord {
  92 private:
  93     // list of word candidate lengths, in increasing length order
  94     // TODO: bytes would be sufficient for word lengths.
  95     int32_t   count;      // Count of candidates
  96     int32_t   prefix;     // The longest match with a dictionary word
  97     int32_t   offset;     // Offset in the text of these candidates
  98     int32_t   mark;       // The preferred candidate's offset
  99     int32_t   current;    // The candidate we're currently looking at
 100     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
 101     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
 102
 103 public:
 104     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
 105     ~PossibleWord() {};
 106
 107     // Fill the list of candidates if needed, select the longest, and return the number found
 108     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
 109
 110     // Select the currently marked candidate, point after it in the text, and invalidate self
 111     int32_t   acceptMarked( UText *text );
 112
 113     // Back up from the current candidate to the next shorter one; return TRUE if that exists
 114     // and point the text after it
 115     UBool     backUp( UText *text );
 116
 117     // Return the longest prefix this candidate location shares with a dictionary word
 118     // Return value is in code points.
 119     int32_t   longestPrefix() { return prefix; };
 120
 121     // Mark the current candidate as the one we like
 122     void      markCurrent() { mark = current; };
 123
 124     // Get length in code points of the marked word.
 125     int32_t   markedCPLength() { return cpLengths[mark]; };
 126 };
 127
 128
 129 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
 130     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
 131     int32_t start = (int32_t)utext_getNativeIndex(text);
 132     if (start != offset) {
 133         offset = start;
 134         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
 135         // Dictionary leaves text after longest prefix, not longest word. Back up.
 136         if (count <= 0) {
 137             utext_setNativeIndex(text, start);
 138         }
 139     }
 140     if (count > 0) {
 141         utext_setNativeIndex(text, start+cuLengths[count-1]);
 142     }
 143     current = count-1;
 144     mark = current;
 145     return count;
 146 }
 147
 148 int32_t
 149 PossibleWord::acceptMarked( UText *text ) {
 150     utext_setNativeIndex(text, offset + cuLengths[mark]);
 151     return cuLengths[mark];
 152 }
 153
 154
 155 UBool
 156 PossibleWord::backUp( UText *text ) {
 157     if (current > 0) {
 158         utext_setNativeIndex(text, offset + cuLengths[--current]);
 159         return TRUE;
 160     }
 161     return FALSE;
 162 }
 163
 164 /*
 165  ******************************************************************
 166  * ThaiBreakEngine
 167  */
 168
 169 // How many words in a row are "good enough"?
 170 static const int32_t THAI_LOOKAHEAD = 3;
 171
 172 // Will not combine a non-word with a preceding dictionary word longer than this
 173 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
 174
 175 // Will not combine a non-word that shares at least this much prefix with a
 176 // dictionary word, with a preceding word
 177 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
 178
 179 // Ellision character
 180 static const int32_t THAI_PAIYANNOI = 0x0E2F;
 181
 182 // Repeat character
 183 static const int32_t THAI_MAIYAMOK = 0x0E46;
 184
 185 // Minimum word size
 186 static const int32_t THAI_MIN_WORD = 2;
 187
 188 // Minimum number of characters for two words
 189 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
 190
 191 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 192     : DictionaryBreakEngine(),
 193       fDictionary(adoptDictionary)
 194 {
 195     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
 196     if (U_SUCCESS(status)) {
 197         setCharacters(fThaiWordSet);
 198     }
 199     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
 200     fMarkSet.add(0x0020);
 201     fEndWordSet = fThaiWordSet;
 202     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 203     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 204     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 205     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 206     fSuffixSet.add(THAI_PAIYANNOI);
 207     fSuffixSet.add(THAI_MAIYAMOK);
 208
 209     // Compact for caching.
 210     fMarkSet.compact();
 211     fEndWordSet.compact();
 212     fBeginWordSet.compact();
 213     fSuffixSet.compact();
 214 }
 215
 216 ThaiBreakEngine::~ThaiBreakEngine() {
 217     delete fDictionary;
 218 }
 219
 220 int32_t
 221 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
 222                                                 int32_t rangeStart,
 223                                                 int32_t rangeEnd,
 224                                                 UVector32 &foundBreaks ) const {
 225     utext_setNativeIndex(text, rangeStart);
 226     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
 227     if (utext_getNativeIndex(text) >= rangeEnd) {
 228         return 0;       // Not enough characters for two words
 229     }
 230     utext_setNativeIndex(text, rangeStart);
 231
 232
 233     uint32_t wordsFound = 0;
 234     int32_t cpWordLength = 0;    // Word Length in Code Points.
 235     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
 236     int32_t current;
 237     UErrorCode status = U_ZERO_ERROR;
 238     PossibleWord words[THAI_LOOKAHEAD];
 239
 240     utext_setNativeIndex(text, rangeStart);
 241
 242     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 243         cpWordLength = 0;
 244         cuWordLength = 0;
 245
 246         // Look for candidate words at the current position
 247         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 248
 249         // If we found exactly one, use that
 250         if (candidates == 1) {
 251             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
 252             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
 253             wordsFound += 1;
 254         }
 255         // If there was more than one, see which one can take us forward the most words
 256         else if (candidates > 1) {
 257             // If we're already at the end of the range, we're done
 258             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 259                 goto foundBest;
 260             }
 261             do {
 262                 int32_t wordsMatched = 1;
 263                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 264                     if (wordsMatched < 2) {
 265                         // Followed by another dictionary word; mark first word as a good candidate
 266                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
 267                         wordsMatched = 2;
 268                     }
 269
 270                     // If we're already at the end of the range, we're done
 271                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 272                         goto foundBest;
 273                     }
 274
 275                     // See if any of the possible second words is followed by a third word
 276                     do {
 277                         // If we find a third word, stop right away
 278                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 279                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
 280                             goto foundBest;
 281                         }
 282                     }
 283                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
 284                 }
 285             }
 286             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
 287 foundBest:
 288             // Set UText position to after the accepted word.
 289             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
 290             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
 291             wordsFound += 1;
 292         }
 293
 294         // We come here after having either found a word or not. We look ahead to the
 295         // next word. If it's not a dictionary word, we will combine it with the word we
 296         // just found (if there is one), but only if the preceding word does not exceed
 297         // the threshold.
 298         // The text iterator should now be positioned at the end of the word we found.
 299
 300         UChar32 uc = 0;
 301         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
 302             // if it is a dictionary word, do nothing. If it isn't, then if there is
 303             // no preceding word, or the non-word shares less than the minimum threshold
 304             // of characters with a dictionary word, then scan to resynchronize
 305             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 306                   && (cuWordLength == 0
 307                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
 308                 // Look for a plausible word boundary
 309                 int32_t remaining = rangeEnd - (current+cuWordLength);
 310                 UChar32 pc;
 311                 int32_t chars = 0;
 312                 for (;;) {
 313                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 314                     pc = utext_next32(text);
 315                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 316                     chars += pcSize;
 317                     remaining -= pcSize;
 318                     if (remaining <= 0) {
 319                         break;
 320                     }
 321                     uc = utext_current32(text);
 322                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 323                         // Maybe. See if it's in the dictionary.
 324                         // NOTE: In the original Apple code, checked that the next
 325                         // two characters after uc were not 0x0E4C THANTHAKHAT before
 326                         // checking the dictionary. That is just a performance filter,
 327                         // but it's not clear it's faster than checking the trie.
 328                         int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 329                         utext_setNativeIndex(text, current + cuWordLength + chars);
 330                         if (candidates > 0) {
 331                             break;
 332                         }
 333                     }
 334                 }
 335
 336                 // Bump the word count if there wasn't already one
 337                 if (cuWordLength <= 0) {
 338                     wordsFound += 1;
 339                 }
 340
 341                 // Update the length with the passed-over characters
 342                 cuWordLength += chars;
 343             }
 344             else {
 345                 // Back up to where we were for next iteration
 346                 utext_setNativeIndex(text, current+cuWordLength);
 347             }
 348         }
 349
 350         // Never stop before a combining mark.
 351         int32_t currPos;
 352         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 353             utext_next32(text);
 354             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 355         }
 356
 357         // Look ahead for possible suffixes if a dictionary word does not follow.
 358         // We do this in code rather than using a rule so that the heuristic
 359         // resynch continues to function. For example, one of the suffix characters
 360         // could be a typo in the middle of a word.
 361         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
 362             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 363                 && fSuffixSet.contains(uc = utext_current32(text))) {
 364                 if (uc == THAI_PAIYANNOI) {
 365                     if (!fSuffixSet.contains(utext_previous32(text))) {
 366                         // Skip over previous end and PAIYANNOI
 367                         utext_next32(text);
 368                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
 369                         utext_next32(text);
 370                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
 371                         uc = utext_current32(text);     // Fetch next character
 372                     }
 373                     else {
 374                         // Restore prior position
 375                         utext_next32(text);
 376                     }
 377                 }
 378                 if (uc == THAI_MAIYAMOK) {
 379                     if (utext_previous32(text) != THAI_MAIYAMOK) {
 380                         // Skip over previous end and MAIYAMOK
 381                         utext_next32(text);
 382                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
 383                         utext_next32(text);
 384                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
 385                     }
 386                     else {
 387                         // Restore prior position
 388                         utext_next32(text);
 389                     }
 390                 }
 391             }
 392             else {
 393                 utext_setNativeIndex(text, current+cuWordLength);
 394             }
 395         }
 396
 397         // Did we find a word on this iteration? If so, push it on the break stack
 398         if (cuWordLength > 0) {
 399             foundBreaks.push((current+cuWordLength), status);
 400         }
 401     }
 402
 403     // Don't return a break for the end of the dictionary range if there is one there.
 404     if (foundBreaks.peeki() >= rangeEnd) {
 405         (void) foundBreaks.popi();
 406         wordsFound -= 1;
 407     }
 408
 409     return wordsFound;
 410 }
 411
 412 /*
 413  ******************************************************************
 414  * LaoBreakEngine
 415  */
 416
 417 // How many words in a row are "good enough"?
 418 static const int32_t LAO_LOOKAHEAD = 3;
 419
 420 // Will not combine a non-word with a preceding dictionary word longer than this
 421 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
 422
 423 // Will not combine a non-word that shares at least this much prefix with a
 424 // dictionary word, with a preceding word
 425 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
 426
 427 // Minimum word size
 428 static const int32_t LAO_MIN_WORD = 2;
 429
 430 // Minimum number of characters for two words
 431 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
 432
 433 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 434     : DictionaryBreakEngine(),
 435       fDictionary(adoptDictionary)
 436 {
 437     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
 438     if (U_SUCCESS(status)) {
 439         setCharacters(fLaoWordSet);
 440     }
 441     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
 442     fMarkSet.add(0x0020);
 443     fEndWordSet = fLaoWordSet;
 444     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
 445     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
 446     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
 447     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
 448
 449     // Compact for caching.
 450     fMarkSet.compact();
 451     fEndWordSet.compact();
 452     fBeginWordSet.compact();
 453 }
 454
 455 LaoBreakEngine::~LaoBreakEngine() {
 456     delete fDictionary;
 457 }
 458
 459 int32_t
 460 LaoBreakEngine::divideUpDictionaryRange( UText *text,
 461                                                 int32_t rangeStart,
 462                                                 int32_t rangeEnd,
 463                                                 UVector32 &foundBreaks ) const {
 464     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
 465         return 0;       // Not enough characters for two words
 466     }
 467
 468     uint32_t wordsFound = 0;
 469     int32_t cpWordLength = 0;
 470     int32_t cuWordLength = 0;
 471     int32_t current;
 472     UErrorCode status = U_ZERO_ERROR;
 473     PossibleWord words[LAO_LOOKAHEAD];
 474
 475     utext_setNativeIndex(text, rangeStart);
 476
 477     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 478         cuWordLength = 0;
 479         cpWordLength = 0;
 480
 481         // Look for candidate words at the current position
 482         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 483
 484         // If we found exactly one, use that
 485         if (candidates == 1) {
 486             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
 487             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
 488             wordsFound += 1;
 489         }
 490         // If there was more than one, see which one can take us forward the most words
 491         else if (candidates > 1) {
 492             // If we're already at the end of the range, we're done
 493             if (utext_getNativeIndex(text) >= rangeEnd) {
 494                 goto foundBest;
 495             }
 496             do {
 497                 int32_t wordsMatched = 1;
 498                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 499                     if (wordsMatched < 2) {
 500                         // Followed by another dictionary word; mark first word as a good candidate
 501                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
 502                         wordsMatched = 2;
 503                     }
 504
 505                     // If we're already at the end of the range, we're done
 506                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 507                         goto foundBest;
 508                     }
 509
 510                     // See if any of the possible second words is followed by a third word
 511                     do {
 512                         // If we find a third word, stop right away
 513                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 514                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
 515                             goto foundBest;
 516                         }
 517                     }
 518                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
 519                 }
 520             }
 521             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
 522 foundBest:
 523             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
 524             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
 525             wordsFound += 1;
 526         }
 527
 528         // We come here after having either found a word or not. We look ahead to the
 529         // next word. If it's not a dictionary word, we will combine it withe the word we
 530         // just found (if there is one), but only if the preceding word does not exceed
 531         // the threshold.
 532         // The text iterator should now be positioned at the end of the word we found.
 533         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
 534             // if it is a dictionary word, do nothing. If it isn't, then if there is
 535             // no preceding word, or the non-word shares less than the minimum threshold
 536             // of characters with a dictionary word, then scan to resynchronize
 537             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 538                   && (cuWordLength == 0
 539                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
 540                 // Look for a plausible word boundary
 541                 int32_t remaining = rangeEnd - (current + cuWordLength);
 542                 UChar32 pc;
 543                 UChar32 uc;
 544                 int32_t chars = 0;
 545                 for (;;) {
 546                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 547                     pc = utext_next32(text);
 548                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 549                     chars += pcSize;
 550                     remaining -= pcSize;
 551                     if (remaining <= 0) {
 552                         break;
 553                     }
 554                     uc = utext_current32(text);
 555                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 556                         // Maybe. See if it's in the dictionary.
 557                         // TODO: this looks iffy; compare with old code.
 558                         int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 559                         utext_setNativeIndex(text, current + cuWordLength + chars);
 560                         if (candidates > 0) {
 561                             break;
 562                         }
 563                     }
 564                 }
 565
 566                 // Bump the word count if there wasn't already one
 567                 if (cuWordLength <= 0) {
 568                     wordsFound += 1;
 569                 }
 570
 571                 // Update the length with the passed-over characters
 572                 cuWordLength += chars;
 573             }
 574             else {
 575                 // Back up to where we were for next iteration
 576                 utext_setNativeIndex(text, current + cuWordLength);
 577             }
 578         }
 579
 580         // Never stop before a combining mark.
 581         int32_t currPos;
 582         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 583             utext_next32(text);
 584             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 585         }
 586
 587         // Look ahead for possible suffixes if a dictionary word does not follow.
 588         // We do this in code rather than using a rule so that the heuristic
 589         // resynch continues to function. For example, one of the suffix characters
 590         // could be a typo in the middle of a word.
 591         // NOT CURRENTLY APPLICABLE TO LAO
 592
 593         // Did we find a word on this iteration? If so, push it on the break stack
 594         if (cuWordLength > 0) {
 595             foundBreaks.push((current+cuWordLength), status);
 596         }
 597     }
 598
 599     // Don't return a break for the end of the dictionary range if there is one there.
 600     if (foundBreaks.peeki() >= rangeEnd) {
 601         (void) foundBreaks.popi();
 602         wordsFound -= 1;
 603     }
 604
 605     return wordsFound;
 606 }
 607
 608 /*
 609  ******************************************************************
 610  * BurmeseBreakEngine
 611  */
 612
 613 // How many words in a row are "good enough"?
 614 static const int32_t BURMESE_LOOKAHEAD = 3;
 615
 616 // Will not combine a non-word with a preceding dictionary word longer than this
 617 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
 618
 619 // Will not combine a non-word that shares at least this much prefix with a
 620 // dictionary word, with a preceding word
 621 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
 622
 623 // Minimum word size
 624 static const int32_t BURMESE_MIN_WORD = 2;
 625
 626 // Minimum number of characters for two words
 627 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
 628
 629 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 630     : DictionaryBreakEngine(),
 631       fDictionary(adoptDictionary)
 632 {
 633     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
 634     if (U_SUCCESS(status)) {
 635         setCharacters(fBurmeseWordSet);
 636     }
 637     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
 638     fMarkSet.add(0x0020);
 639     fEndWordSet = fBurmeseWordSet;
 640     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
 641
 642     // Compact for caching.
 643     fMarkSet.compact();
 644     fEndWordSet.compact();
 645     fBeginWordSet.compact();
 646 }
 647
 648 BurmeseBreakEngine::~BurmeseBreakEngine() {
 649     delete fDictionary;
 650 }
 651
 652 int32_t
 653 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
 654                                                 int32_t rangeStart,
 655                                                 int32_t rangeEnd,
 656                                                 UVector32 &foundBreaks ) const {
 657     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
 658         return 0;       // Not enough characters for two words
 659     }
 660
 661     uint32_t wordsFound = 0;
 662     int32_t cpWordLength = 0;
 663     int32_t cuWordLength = 0;
 664     int32_t current;
 665     UErrorCode status = U_ZERO_ERROR;
 666     PossibleWord words[BURMESE_LOOKAHEAD];
 667
 668     utext_setNativeIndex(text, rangeStart);
 669
 670     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 671         cuWordLength = 0;
 672         cpWordLength = 0;
 673
 674         // Look for candidate words at the current position
 675         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 676
 677         // If we found exactly one, use that
 678         if (candidates == 1) {
 679             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
 680             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
 681             wordsFound += 1;
 682         }
 683         // If there was more than one, see which one can take us forward the most words
 684         else if (candidates > 1) {
 685             // If we're already at the end of the range, we're done
 686             if (utext_getNativeIndex(text) >= rangeEnd) {
 687                 goto foundBest;
 688             }
 689             do {
 690                 int32_t wordsMatched = 1;
 691                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 692                     if (wordsMatched < 2) {
 693                         // Followed by another dictionary word; mark first word as a good candidate
 694                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
 695                         wordsMatched = 2;
 696                     }
 697
 698                     // If we're already at the end of the range, we're done
 699                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 700                         goto foundBest;
 701                     }
 702
 703                     // See if any of the possible second words is followed by a third word
 704                     do {
 705                         // If we find a third word, stop right away
 706                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 707                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
 708                             goto foundBest;
 709                         }
 710                     }
 711                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
 712                 }
 713             }
 714             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
 715 foundBest:
 716             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
 717             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
 718             wordsFound += 1;
 719         }
 720
 721         // We come here after having either found a word or not. We look ahead to the
 722         // next word. If it's not a dictionary word, we will combine it withe the word we
 723         // just found (if there is one), but only if the preceding word does not exceed
 724         // the threshold.
 725         // The text iterator should now be positioned at the end of the word we found.
 726         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
 727             // if it is a dictionary word, do nothing. If it isn't, then if there is
 728             // no preceding word, or the non-word shares less than the minimum threshold
 729             // of characters with a dictionary word, then scan to resynchronize
 730             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 731                   && (cuWordLength == 0
 732                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
 733                 // Look for a plausible word boundary
 734                 int32_t remaining = rangeEnd - (current + cuWordLength);
 735                 UChar32 pc;
 736                 UChar32 uc;
 737                 int32_t chars = 0;
 738                 for (;;) {
 739                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 740                     pc = utext_next32(text);
 741                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 742                     chars += pcSize;
 743                     remaining -= pcSize;
 744                     if (remaining <= 0) {
 745                         break;
 746                     }
 747                     uc = utext_current32(text);
 748                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 749                         // Maybe. See if it's in the dictionary.
 750                         // TODO: this looks iffy; compare with old code.
 751                         int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 752                         utext_setNativeIndex(text, current + cuWordLength + chars);
 753                         if (candidates > 0) {
 754                             break;
 755                         }
 756                     }
 757                 }
 758
 759                 // Bump the word count if there wasn't already one
 760                 if (cuWordLength <= 0) {
 761                     wordsFound += 1;
 762                 }
 763
 764                 // Update the length with the passed-over characters
 765                 cuWordLength += chars;
 766             }
 767             else {
 768                 // Back up to where we were for next iteration
 769                 utext_setNativeIndex(text, current + cuWordLength);
 770             }
 771         }
 772
 773         // Never stop before a combining mark.
 774         int32_t currPos;
 775         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 776             utext_next32(text);
 777             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 778         }
 779
 780         // Look ahead for possible suffixes if a dictionary word does not follow.
 781         // We do this in code rather than using a rule so that the heuristic
 782         // resynch continues to function. For example, one of the suffix characters
 783         // could be a typo in the middle of a word.
 784         // NOT CURRENTLY APPLICABLE TO BURMESE
 785
 786         // Did we find a word on this iteration? If so, push it on the break stack
 787         if (cuWordLength > 0) {
 788             foundBreaks.push((current+cuWordLength), status);
 789         }
 790     }
 791
 792     // Don't return a break for the end of the dictionary range if there is one there.
 793     if (foundBreaks.peeki() >= rangeEnd) {
 794         (void) foundBreaks.popi();
 795         wordsFound -= 1;
 796     }
 797
 798     return wordsFound;
 799 }
 800
 801 /*
 802  ******************************************************************
 803  * KhmerBreakEngine
 804  */
 805
 806 // How many words in a row are "good enough"?
 807 static const int32_t KHMER_LOOKAHEAD = 3;
 808
 809 // Will not combine a non-word with a preceding dictionary word longer than this
 810 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
 811
 812 // Will not combine a non-word that shares at least this much prefix with a
 813 // dictionary word, with a preceding word
 814 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
 815
 816 // Minimum word size
 817 static const int32_t KHMER_MIN_WORD = 2;
 818
 819 // Minimum number of characters for two words
 820 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
 821
 822 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 823     : DictionaryBreakEngine(),
 824       fDictionary(adoptDictionary)
 825 {
 826     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
 827     if (U_SUCCESS(status)) {
 828         setCharacters(fKhmerWordSet);
 829     }
 830     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
 831     fMarkSet.add(0x0020);
 832     fEndWordSet = fKhmerWordSet;
 833     fBeginWordSet.add(0x1780, 0x17B3);
 834     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
 835     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
 836     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
 837     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
 838     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
 839 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 840 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 841 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 842 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 843 //    fSuffixSet.add(THAI_PAIYANNOI);
 844 //    fSuffixSet.add(THAI_MAIYAMOK);
 845
 846     // Compact for caching.
 847     fMarkSet.compact();
 848     fEndWordSet.compact();
 849     fBeginWordSet.compact();
 850 //    fSuffixSet.compact();
 851 }
 852
 853 KhmerBreakEngine::~KhmerBreakEngine() {
 854     delete fDictionary;
 855 }
 856
 857 int32_t
 858 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
 859                                                 int32_t rangeStart,
 860                                                 int32_t rangeEnd,
 861                                                 UVector32 &foundBreaks ) const {
 862     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
 863         return 0;       // Not enough characters for two words
 864     }
 865
 866     uint32_t wordsFound = 0;
 867     int32_t cpWordLength = 0;
 868     int32_t cuWordLength = 0;
 869     int32_t current;
 870     UErrorCode status = U_ZERO_ERROR;
 871     PossibleWord words[KHMER_LOOKAHEAD];
 872
 873     utext_setNativeIndex(text, rangeStart);
 874
 875     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 876         cuWordLength = 0;
 877         cpWordLength = 0;
 878
 879         // Look for candidate words at the current position
 880         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 881
 882         // If we found exactly one, use that
 883         if (candidates == 1) {
 884             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 885             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 886             wordsFound += 1;
 887         }
 888
 889         // If there was more than one, see which one can take us forward the most words
 890         else if (candidates > 1) {
 891             // If we're already at the end of the range, we're done
 892             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 893                 goto foundBest;
 894             }
 895             do {
 896                 int32_t wordsMatched = 1;
 897                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 898                     if (wordsMatched < 2) {
 899                         // Followed by another dictionary word; mark first word as a good candidate
 900                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 901                         wordsMatched = 2;
 902                     }
 903
 904                     // If we're already at the end of the range, we're done
 905                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 906                         goto foundBest;
 907                     }
 908
 909                     // See if any of the possible second words is followed by a third word
 910                     do {
 911                         // If we find a third word, stop right away
 912                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 913                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 914                             goto foundBest;
 915                         }
 916                     }
 917                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
 918                 }
 919             }
 920             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
 921 foundBest:
 922             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 923             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 924             wordsFound += 1;
 925         }
 926
 927         // We come here after having either found a word or not. We look ahead to the
 928         // next word. If it's not a dictionary word, we will combine it with the word we
 929         // just found (if there is one), but only if the preceding word does not exceed
 930         // the threshold.
 931         // The text iterator should now be positioned at the end of the word we found.
 932         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
 933             // if it is a dictionary word, do nothing. If it isn't, then if there is
 934             // no preceding word, or the non-word shares less than the minimum threshold
 935             // of characters with a dictionary word, then scan to resynchronize
 936             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 937                   && (cuWordLength == 0
 938                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
 939                 // Look for a plausible word boundary
 940                 int32_t remaining = rangeEnd - (current+cuWordLength);
 941                 UChar32 pc;
 942                 UChar32 uc;
 943                 int32_t chars = 0;
 944                 for (;;) {
 945                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 946                     pc = utext_next32(text);
 947                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 948                     chars += pcSize;
 949                     remaining -= pcSize;
 950                     if (remaining <= 0) {
 951                         break;
 952                     }
 953                     uc = utext_current32(text);
 954                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 955                         // Maybe. See if it's in the dictionary.
 956                         int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 957                         utext_setNativeIndex(text, current+cuWordLength+chars);
 958                         if (candidates > 0) {
 959                             break;
 960                         }
 961                     }
 962                 }
 963
 964                 // Bump the word count if there wasn't already one
 965                 if (cuWordLength <= 0) {
 966                     wordsFound += 1;
 967                 }
 968
 969                 // Update the length with the passed-over characters
 970                 cuWordLength += chars;
 971             }
 972             else {
 973                 // Back up to where we were for next iteration
 974                 utext_setNativeIndex(text, current+cuWordLength);
 975             }
 976         }
 977
 978         // Never stop before a combining mark.
 979         int32_t currPos;
 980         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 981             utext_next32(text);
 982             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 983         }
 984
 985         // Look ahead for possible suffixes if a dictionary word does not follow.
 986         // We do this in code rather than using a rule so that the heuristic
 987         // resynch continues to function. For example, one of the suffix characters
 988         // could be a typo in the middle of a word.
 989 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
 990 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 991 //                && fSuffixSet.contains(uc = utext_current32(text))) {
 992 //                if (uc == KHMER_PAIYANNOI) {
 993 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
 994 //                        // Skip over previous end and PAIYANNOI
 995 //                        utext_next32(text);
 996 //                        utext_next32(text);
 997 //                        wordLength += 1;            // Add PAIYANNOI to word
 998 //                        uc = utext_current32(text);     // Fetch next character
 999 //                    }
1000 //                    else {
1001 //                        // Restore prior position
1002 //                        utext_next32(text);
1003 //                    }
1004 //                }
1005 //                if (uc == KHMER_MAIYAMOK) {
1006 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1007 //                        // Skip over previous end and MAIYAMOK
1008 //                        utext_next32(text);
1009 //                        utext_next32(text);
1010 //                        wordLength += 1;            // Add MAIYAMOK to word
1011 //                    }
1012 //                    else {
1013 //                        // Restore prior position
1014 //                        utext_next32(text);
1015 //                    }
1016 //                }
1017 //            }
1018 //            else {
1019 //                utext_setNativeIndex(text, current+wordLength);
1020 //            }
1021 //        }
1022
1023         // Did we find a word on this iteration? If so, push it on the break stack
1024         if (cuWordLength > 0) {
1025             foundBreaks.push((current+cuWordLength), status);
1026         }
1027     }
1028
1029     // Don't return a break for the end of the dictionary range if there is one there.
1030     if (foundBreaks.peeki() >= rangeEnd) {
1031         (void) foundBreaks.popi();
1032         wordsFound -= 1;
1033     }
1034
1035     return wordsFound;
1036 }
1037
1038 #if !UCONFIG_NO_NORMALIZATION
1039 /*
1040  ******************************************************************
1041  * CjkBreakEngine
1042  */
1043 static const uint32_t kuint32max = 0xFFFFFFFF;
1044 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1045 : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1046     // Korean dictionary only includes Hangul syllables
1047     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1048     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1049     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1050     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1051     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1052
1053     if (U_SUCCESS(status)) {
1054         // handle Korean and Japanese/Chinese using different dictionaries
1055         if (type == kKorean) {
1056             setCharacters(fHangulWordSet);
1057         } else { //Chinese and Japanese
1058             UnicodeSet cjSet;
1059             cjSet.addAll(fHanWordSet);
1060             cjSet.addAll(fKatakanaWordSet);
1061             cjSet.addAll(fHiraganaWordSet);
1062             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1063             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1064             setCharacters(cjSet);
1065         }
1066     }
1067 }
1068
1069 CjkBreakEngine::~CjkBreakEngine(){
1070     delete fDictionary;
1071 }
1072
1073 // The katakanaCost values below are based on the length frequencies of all
1074 // katakana phrases in the dictionary
1075 static const int32_t kMaxKatakanaLength = 8;
1076 static const int32_t kMaxKatakanaGroupLength = 20;
1077 static const uint32_t maxSnlp = 255;
1078
1079 static inline uint32_t getKatakanaCost(int32_t wordLength){
1080     //TODO: fill array with actual values from dictionary!
1081     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1082                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1083     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1084 }
1085
1086 static inline bool isKatakana(UChar32 value) {
1087     return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1088             (value >= 0xFF66 && value <= 0xFF9f);
1089 }
1090
1091
1092 // Function for accessing internal utext flags.
1093 //   Replicates an internal UText function.
1094
1095 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1096     return (int32_t)1 << bitIndex;
1097 }
1098
1099
1100 /*
1101  * @param text A UText representing the text
1102  * @param rangeStart The start of the range of dictionary characters
1103  * @param rangeEnd The end of the range of dictionary characters
1104  * @param foundBreaks vector<int32> to receive the break positions
1105  * @return The number of breaks found
1106  */
1107 int32_t
1108 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1109         int32_t rangeStart,
1110         int32_t rangeEnd,
1111         UVector32 &foundBreaks ) const {
1112     if (rangeStart >= rangeEnd) {
1113         return 0;
1114     }
1115
1116     // UnicodeString version of input UText, NFKC normalized if necessary.
1117     UnicodeString inString;
1118
1119     // inputMap[inStringIndex] = corresponding native index from UText inText.
1120     // If NULL then mapping is 1:1
1121     LocalPointer<UVector32>     inputMap;
1122
1123     UErrorCode     status      = U_ZERO_ERROR;
1124
1125
1126     // if UText has the input string as one contiguous UTF-16 chunk
1127     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1128          inText->chunkNativeStart <= rangeStart &&
1129          inText->chunkNativeLimit >= rangeEnd   &&
1130          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1131
1132         // Input UText is in one contiguous UTF-16 chunk.
1133         // Use Read-only aliasing UnicodeString.
1134         inString.setTo(FALSE,
1135                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1136                        rangeEnd - rangeStart);
1137     } else {
1138         // Copy the text from the original inText (UText) to inString (UnicodeString).
1139         // Create a map from UnicodeString indices -> UText offsets.
1140         utext_setNativeIndex(inText, rangeStart);
1141         int32_t limit = rangeEnd;
1142         U_ASSERT(limit <= utext_nativeLength(inText));
1143         if (limit > utext_nativeLength(inText)) {
1144             limit = (int32_t)utext_nativeLength(inText);
1145         }
1146         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1147         if (U_FAILURE(status)) {
1148             return 0;
1149         }
1150         while (utext_getNativeIndex(inText) < limit) {
1151             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1152             UChar32 c = utext_next32(inText);
1153             U_ASSERT(c != U_SENTINEL);
1154             inString.append(c);
1155             while (inputMap->size() < inString.length()) {
1156                 inputMap->addElement(nativePosition, status);
1157             }
1158         }
1159         inputMap->addElement(limit, status);
1160     }
1161
1162
1163     if (!nfkcNorm2->isNormalized(inString, status)) {
1164         UnicodeString normalizedInput;
1165         //  normalizedMap[normalizedInput position] ==  original UText position.
1166         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1167         if (U_FAILURE(status)) {
1168             return 0;
1169         }
1170
1171         UnicodeString fragment;
1172         UnicodeString normalizedFragment;
1173         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1174             fragment.remove();
1175             int32_t fragmentStartI = srcI;
1176             UChar32 c = inString.char32At(srcI);
1177             for (;;) {
1178                 fragment.append(c);
1179                 srcI = inString.moveIndex32(srcI, 1);
1180                 if (srcI == inString.length()) {
1181                     break;
1182                 }
1183                 c = inString.char32At(srcI);
1184                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1185                     break;
1186                 }
1187             }
1188             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1189             normalizedInput.append(normalizedFragment);
1190
1191             // Map every position in the normalized chunk to the start of the chunk
1192             //   in the original input.
1193             int32_t fragmentOriginalStart = inputMap.isValid() ?
1194                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1195             while (normalizedMap->size() < normalizedInput.length()) {
1196                 normalizedMap->addElement(fragmentOriginalStart, status);
1197                 if (U_FAILURE(status)) {
1198                     break;
1199                 }
1200             }
1201         }
1202         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1203         int32_t nativeEnd = inputMap.isValid() ?
1204                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1205         normalizedMap->addElement(nativeEnd, status);
1206
1207         inputMap.moveFrom(normalizedMap);
1208         inString.moveFrom(normalizedInput);
1209     }
1210
1211     int32_t numCodePts = inString.countChar32();
1212     if (numCodePts != inString.length()) {
1213         // There are supplementary characters in the input.
1214         // The dictionary will produce boundary positions in terms of code point indexes,
1215         //   not in terms of code unit string indexes.
1216         // Use the inputMap mechanism to take care of this in addition to indexing differences
1217         //    from normalization and/or UTF-8 input.
1218         UBool hadExistingMap = inputMap.isValid();
1219         if (!hadExistingMap) {
1220             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1221             if (U_FAILURE(status)) {
1222                 return 0;
1223             }
1224         }
1225         int32_t cpIdx = 0;
1226         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1227             U_ASSERT(cuIdx >= cpIdx);
1228             if (hadExistingMap) {
1229                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1230             } else {
1231                 inputMap->addElement(cuIdx+rangeStart, status);
1232             }
1233             cpIdx++;
1234             if (cuIdx == inString.length()) {
1235                break;
1236             }
1237         }
1238     }
1239
1240     // bestSnlp[i] is the snlp of the best segmentation of the first i
1241     // code points in the range to be matched.
1242     UVector32 bestSnlp(numCodePts + 1, status);
1243     bestSnlp.addElement(0, status);
1244     for(int32_t i = 1; i <= numCodePts; i++) {
1245         bestSnlp.addElement(kuint32max, status);
1246     }
1247
1248
1249     // prev[i] is the index of the last CJK code point in the previous word in
1250     // the best segmentation of the first i characters.
1251     UVector32 prev(numCodePts + 1, status);
1252     for(int32_t i = 0; i <= numCodePts; i++){
1253         prev.addElement(-1, status);
1254     }
1255
1256     const int32_t maxWordSize = 20;
1257     UVector32 values(numCodePts, status);
1258     values.setSize(numCodePts);
1259     UVector32 lengths(numCodePts, status);
1260     lengths.setSize(numCodePts);
1261
1262     UText fu = UTEXT_INITIALIZER;
1263     utext_openUnicodeString(&fu, &inString, &status);
1264
1265     // Dynamic programming to find the best segmentation.
1266
1267     // In outer loop, i  is the code point index,
1268     //                ix is the corresponding string (code unit) index.
1269     //    They differ when the string contains supplementary characters.
1270     int32_t ix = 0;
1271     bool is_prev_katakana = false;
1272     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1273         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1274             continue;
1275         }
1276
1277         int32_t count;
1278         utext_setNativeIndex(&fu, ix);
1279         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1280                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1281                              // Note: lengths is filled with code point lengths
1282                              //       The NULL parameter is the ignored code unit lengths.
1283
1284         // if there are no single character matches found in the dictionary
1285         // starting with this character, treat character as a 1-character word
1286         // with the highest value possible, i.e. the least likely to occur.
1287         // Exclude Korean characters from this treatment, as they should be left
1288         // together by default.
1289         if ((count == 0 || lengths.elementAti(0) != 1) &&
1290                 !fHangulWordSet.contains(inString.char32At(ix))) {
1291             values.setElementAt(maxSnlp, count);   // 255
1292             lengths.setElementAt(1, count++);
1293         }
1294
1295         for (int32_t j = 0; j < count; j++) {
1296             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1297             int32_t ln_j_i = lengths.elementAti(j) + i;
1298             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1299                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1300                 prev.setElementAt(i, ln_j_i);
1301             }
1302         }
1303
1304         // In Japanese,
1305         // Katakana word in single character is pretty rare. So we apply
1306         // the following heuristic to Katakana: any continuous run of Katakana
1307         // characters is considered a candidate word with a default cost
1308         // specified in the katakanaCost table according to its length.
1309
1310         bool is_katakana = isKatakana(inString.char32At(ix));
1311         int32_t katakanaRunLength = 1;
1312         if (!is_prev_katakana && is_katakana) {
1313             int32_t j = inString.moveIndex32(ix, 1);
1314             // Find the end of the continuous run of Katakana characters
1315             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1316                     isKatakana(inString.char32At(j))) {
1317                 j = inString.moveIndex32(j, 1);
1318                 katakanaRunLength++;
1319             }
1320             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1321                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1322                 if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1323                     bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1324                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1325                 }
1326             }
1327         }
1328         is_prev_katakana = is_katakana;
1329     }
1330     utext_close(&fu);
1331
1332     // Start pushing the optimal offset index into t_boundary (t for tentative).
1333     // prev[numCodePts] is guaranteed to be meaningful.
1334     // We'll first push in the reverse order, i.e.,
1335     // t_boundary[0] = numCodePts, and afterwards do a swap.
1336     UVector32 t_boundary(numCodePts+1, status);
1337
1338     int32_t numBreaks = 0;
1339     // No segmentation found, set boundary to end of range
1340     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1341         t_boundary.addElement(numCodePts, status);
1342         numBreaks++;
1343     } else {
1344         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1345             t_boundary.addElement(i, status);
1346             numBreaks++;
1347         }
1348         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1349     }
1350
1351     // Add a break for the start of the dictionary range if there is not one
1352     // there already.
1353     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1354         t_boundary.addElement(0, status);
1355         numBreaks++;
1356     }
1357
1358     // Now that we're done, convert positions in t_boundary[] (indices in
1359     // the normalized input string) back to indices in the original input UText
1360     // while reversing t_boundary and pushing values to foundBreaks.
1361     int32_t prevCPPos = -1;
1362     int32_t prevUTextPos = -1;
1363     for (int32_t i = numBreaks-1; i >= 0; i--) {
1364         int32_t cpPos = t_boundary.elementAti(i);
1365         U_ASSERT(cpPos > prevCPPos);
1366         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1367         U_ASSERT(utextPos >= prevUTextPos);
1368         if (utextPos > prevUTextPos) {
1369             // Boundaries are added to foundBreaks output in ascending order.
1370             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1371             foundBreaks.push(utextPos, status);
1372         } else {
1373             // Normalization expanded the input text, the dictionary found a boundary
1374             // within the expansion, giving two boundaries with the same index in the
1375             // original text. Ignore the second. See ticket #12918.
1376             --numBreaks;
1377         }
1378         prevCPPos = cpPos;
1379         prevUTextPos = utextPos;
1380     }
1381     (void)prevCPPos; // suppress compiler warnings about unused variable
1382
1383     // inString goes out of scope
1384     // inputMap goes out of scope
1385     return numBreaks;
1386 }
1387 #endif
1388
1389 U_NAMESPACE_END
1390
1391 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1392