icuSources/common/dictbe.cpp

   1 /**
   2  *******************************************************************************
   3  * Copyright (C) 2006-2008, International Business Machines Corporation and others. *
   4  * All Rights Reserved.                                                        *
   5  *******************************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_BREAK_ITERATION
  11
  12 #include "brkeng.h"
  13 #include "dictbe.h"
  14 #include "unicode/uniset.h"
  15 #include "unicode/chariter.h"
  16 #include "unicode/ubrk.h"
  17 #include "uvector.h"
  18 #include "triedict.h"
  19
  20 U_NAMESPACE_BEGIN
  21
  22 /*
  23  ******************************************************************
  24  */
  25
  26 /*DictionaryBreakEngine::DictionaryBreakEngine() {
  27     fTypes = 0;
  28 }*/
  29
  30 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
  31     fTypes = breakTypes;
  32 }
  33
  34 DictionaryBreakEngine::~DictionaryBreakEngine() {
  35 }
  36
  37 UBool
  38 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
  39     return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
  40             && fSet.contains(c));
  41 }
  42
  43 int32_t
  44 DictionaryBreakEngine::findBreaks( UText *text,
  45                                  int32_t startPos,
  46                                  int32_t endPos,
  47                                  UBool reverse,
  48                                  int32_t breakType,
  49                                  UStack &foundBreaks ) const {
  50     int32_t result = 0;
  51
  52     // Find the span of characters included in the set.
  53     int32_t start = (int32_t)utext_getNativeIndex(text);
  54     int32_t current;
  55     int32_t rangeStart;
  56     int32_t rangeEnd;
  57     UChar32 c = utext_current32(text);
  58     if (reverse) {
  59         UBool   isDict = fSet.contains(c);
  60         while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
  61             c = utext_previous32(text);
  62             isDict = fSet.contains(c);
  63         }
  64         rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
  65         rangeEnd = start + 1;
  66     }
  67     else {
  68         while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
  69             utext_next32(text);         // TODO:  recast loop for postincrement
  70             c = utext_current32(text);
  71         }
  72         rangeStart = start;
  73         rangeEnd = current;
  74     }
  75     if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
  76         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
  77         utext_setNativeIndex(text, current);
  78     }
  79
  80     return result;
  81 }
  82
  83 void
  84 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
  85     fSet = set;
  86     // Compact for caching
  87     fSet.compact();
  88 }
  89
  90 /*void
  91 DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
  92     fTypes = breakTypes;
  93 }*/
  94
  95 /*
  96  ******************************************************************
  97  */
  98
  99
 100 // Helper class for improving readability of the Thai word break
 101 // algorithm. The implementation is completely inline.
 102
 103 // List size, limited by the maximum number of words in the dictionary
 104 // that form a nested sequence.
 105 #define POSSIBLE_WORD_LIST_MAX 20
 106
 107 class PossibleWord {
 108  private:
 109   // list of word candidate lengths, in increasing length order
 110   int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
 111   int       count;      // Count of candidates
 112   int32_t   prefix;     // The longest match with a dictionary word
 113   int32_t   offset;     // Offset in the text of these candidates
 114   int       mark;       // The preferred candidate's offset
 115   int       current;    // The candidate we're currently looking at
 116
 117  public:
 118   PossibleWord();
 119   ~PossibleWord();
 120
 121   // Fill the list of candidates if needed, select the longest, and return the number found
 122   int       candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd );
 123
 124   // Select the currently marked candidate, point after it in the text, and invalidate self
 125   int32_t   acceptMarked( UText *text );
 126
 127   // Back up from the current candidate to the next shorter one; return TRUE if that exists
 128   // and point the text after it
 129   UBool     backUp( UText *text );
 130
 131   // Return the longest prefix this candidate location shares with a dictionary word
 132   int32_t   longestPrefix();
 133
 134   // Mark the current candidate as the one we like
 135   void      markCurrent();
 136 };
 137
 138 inline
 139 PossibleWord::PossibleWord() {
 140     offset = -1;
 141 }
 142
 143 inline
 144 PossibleWord::~PossibleWord() {
 145 }
 146
 147 inline int
 148 PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
 149     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
 150     int32_t start = (int32_t)utext_getNativeIndex(text);
 151     if (start != offset) {
 152         offset = start;
 153         prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
 154         // Dictionary leaves text after longest prefix, not longest word. Back up.
 155         if (count <= 0) {
 156             utext_setNativeIndex(text, start);
 157         }
 158     }
 159     if (count > 0) {
 160         utext_setNativeIndex(text, start+lengths[count-1]);
 161     }
 162     current = count-1;
 163     mark = current;
 164     return count;
 165 }
 166
 167 inline int32_t
 168 PossibleWord::acceptMarked( UText *text ) {
 169     utext_setNativeIndex(text, offset + lengths[mark]);
 170     return lengths[mark];
 171 }
 172
 173 inline UBool
 174 PossibleWord::backUp( UText *text ) {
 175     if (current > 0) {
 176         utext_setNativeIndex(text, offset + lengths[--current]);
 177         return TRUE;
 178     }
 179     return FALSE;
 180 }
 181
 182 inline int32_t
 183 PossibleWord::longestPrefix() {
 184     return prefix;
 185 }
 186
 187 inline void
 188 PossibleWord::markCurrent() {
 189     mark = current;
 190 }
 191
 192 // How many words in a row are "good enough"?
 193 #define THAI_LOOKAHEAD 3
 194
 195 // Will not combine a non-word with a preceding dictionary word longer than this
 196 #define THAI_ROOT_COMBINE_THRESHOLD 3
 197
 198 // Will not combine a non-word that shares at least this much prefix with a
 199 // dictionary word, with a preceding word
 200 #define THAI_PREFIX_COMBINE_THRESHOLD 3
 201
 202 // Ellision character
 203 #define THAI_PAIYANNOI 0x0E2F
 204
 205 // Repeat character
 206 #define THAI_MAIYAMOK 0x0E46
 207
 208 // Minimum word size
 209 #define THAI_MIN_WORD 2
 210
 211 // Minimum number of characters for two words
 212 #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
 213
 214 ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
 215     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
 216       fDictionary(adoptDictionary)
 217 {
 218     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
 219     if (U_SUCCESS(status)) {
 220         setCharacters(fThaiWordSet);
 221     }
 222     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
 223     fMarkSet.add(0x0020);
 224     fEndWordSet = fThaiWordSet;
 225     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 226     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 227     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 228     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 229     fSuffixSet.add(THAI_PAIYANNOI);
 230     fSuffixSet.add(THAI_MAIYAMOK);
 231
 232     // Compact for caching.
 233     fMarkSet.compact();
 234     fEndWordSet.compact();
 235     fBeginWordSet.compact();
 236     fSuffixSet.compact();
 237 }
 238
 239 ThaiBreakEngine::~ThaiBreakEngine() {
 240     delete fDictionary;
 241 }
 242
 243 int32_t
 244 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
 245                                                 int32_t rangeStart,
 246                                                 int32_t rangeEnd,
 247                                                 UStack &foundBreaks ) const {
 248     if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
 249         return 0;       // Not enough characters for two words
 250     }
 251
 252     uint32_t wordsFound = 0;
 253     int32_t wordLength;
 254     int32_t current;
 255     UErrorCode status = U_ZERO_ERROR;
 256     PossibleWord words[THAI_LOOKAHEAD];
 257     UChar32 uc;
 258
 259     utext_setNativeIndex(text, rangeStart);
 260
 261     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 262         wordLength = 0;
 263
 264         // Look for candidate words at the current position
 265         int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 266
 267         // If we found exactly one, use that
 268         if (candidates == 1) {
 269             wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
 270             wordsFound += 1;
 271         }
 272
 273         // If there was more than one, see which one can take us forward the most words
 274         else if (candidates > 1) {
 275             // If we're already at the end of the range, we're done
 276             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 277                 goto foundBest;
 278             }
 279             do {
 280                 int wordsMatched = 1;
 281                 if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 282                     if (wordsMatched < 2) {
 283                         // Followed by another dictionary word; mark first word as a good candidate
 284                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
 285                         wordsMatched = 2;
 286                     }
 287
 288                     // If we're already at the end of the range, we're done
 289                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 290                         goto foundBest;
 291                     }
 292
 293                     // See if any of the possible second words is followed by a third word
 294                     do {
 295                         // If we find a third word, stop right away
 296                         if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 297                             words[wordsFound%THAI_LOOKAHEAD].markCurrent();
 298                             goto foundBest;
 299                         }
 300                     }
 301                     while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
 302                 }
 303             }
 304             while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
 305 foundBest:
 306             wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
 307             wordsFound += 1;
 308         }
 309
 310         // We come here after having either found a word or not. We look ahead to the
 311         // next word. If it's not a dictionary word, we will combine it withe the word we
 312         // just found (if there is one), but only if the preceding word does not exceed
 313         // the threshold.
 314         // The text iterator should now be positioned at the end of the word we found.
 315         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
 316             // if it is a dictionary word, do nothing. If it isn't, then if there is
 317             // no preceding word, or the non-word shares less than the minimum threshold
 318             // of characters with a dictionary word, then scan to resynchronize
 319             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 320                   && (wordLength == 0
 321                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
 322                 // Look for a plausible word boundary
 323                 //TODO: This section will need a rework for UText.
 324                 int32_t remaining = rangeEnd - (current+wordLength);
 325                 UChar32 pc = utext_current32(text);
 326                 int32_t chars = 0;
 327                 for (;;) {
 328                     utext_next32(text);
 329                     uc = utext_current32(text);
 330                     // TODO: Here we're counting on the fact that the SA languages are all
 331                     // in the BMP. This should get fixed with the UText rework.
 332                     chars += 1;
 333                     if (--remaining <= 0) {
 334                         break;
 335                     }
 336                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 337                         // Maybe. See if it's in the dictionary.
 338                         // NOTE: In the original Apple code, checked that the next
 339                         // two characters after uc were not 0x0E4C THANTHAKHAT before
 340                         // checking the dictionary. That is just a performance filter,
 341                         // but it's not clear it's faster than checking the trie.
 342                         int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 343                         utext_setNativeIndex(text, current+wordLength+chars);
 344                         if (candidates > 0) {
 345                             break;
 346                         }
 347                     }
 348                     pc = uc;
 349                 }
 350
 351                 // Bump the word count if there wasn't already one
 352                 if (wordLength <= 0) {
 353                     wordsFound += 1;
 354                 }
 355
 356                 // Update the length with the passed-over characters
 357                 wordLength += chars;
 358             }
 359             else {
 360                 // Back up to where we were for next iteration
 361                 utext_setNativeIndex(text, current+wordLength);
 362             }
 363         }
 364
 365         // Never stop before a combining mark.
 366         int32_t currPos;
 367         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 368             utext_next32(text);
 369             wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 370         }
 371
 372         // Look ahead for possible suffixes if a dictionary word does not follow.
 373         // We do this in code rather than using a rule so that the heuristic
 374         // resynch continues to function. For example, one of the suffix characters
 375         // could be a typo in the middle of a word.
 376         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
 377             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 378                 && fSuffixSet.contains(uc = utext_current32(text))) {
 379                 if (uc == THAI_PAIYANNOI) {
 380                     if (!fSuffixSet.contains(utext_previous32(text))) {
 381                         // Skip over previous end and PAIYANNOI
 382                         utext_next32(text);
 383                         utext_next32(text);
 384                         wordLength += 1;            // Add PAIYANNOI to word
 385                         uc = utext_current32(text);     // Fetch next character
 386                     }
 387                     else {
 388                         // Restore prior position
 389                         utext_next32(text);
 390                     }
 391                 }
 392                 if (uc == THAI_MAIYAMOK) {
 393                     if (utext_previous32(text) != THAI_MAIYAMOK) {
 394                         // Skip over previous end and MAIYAMOK
 395                         utext_next32(text);
 396                         utext_next32(text);
 397                         wordLength += 1;            // Add MAIYAMOK to word
 398                     }
 399                     else {
 400                         // Restore prior position
 401                         utext_next32(text);
 402                     }
 403                 }
 404             }
 405             else {
 406                 utext_setNativeIndex(text, current+wordLength);
 407             }
 408         }
 409
 410         // Did we find a word on this iteration? If so, push it on the break stack
 411         if (wordLength > 0) {
 412             foundBreaks.push((current+wordLength), status);
 413         }
 414     }
 415
 416     // Don't return a break for the end of the dictionary range if there is one there.
 417     if (foundBreaks.peeki() >= rangeEnd) {
 418         (void) foundBreaks.popi();
 419         wordsFound -= 1;
 420     }
 421
 422     return wordsFound;
 423 }
 424
 425 U_NAMESPACE_END
 426
 427 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */