icuSources/common/brkeng.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  ************************************************************************************
   5  * Copyright (C) 2006-2016, International Business Machines Corporation
   6  * and others. All Rights Reserved.
   7  ************************************************************************************
   8  */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_BREAK_ITERATION
  13
  14 #include "unicode/uchar.h"
  15 #include "unicode/uniset.h"
  16 #include "unicode/chariter.h"
  17 #include "unicode/ures.h"
  18 #include "unicode/udata.h"
  19 #include "unicode/putil.h"
  20 #include "unicode/ustring.h"
  21 #include "unicode/uscript.h"
  22 #include "unicode/ucharstrie.h"
  23 #include "unicode/bytestrie.h"
  24
  25 #include "brkeng.h"
  26 #include "cmemory.h"
  27 #include "dictbe.h"
  28 #include "charstr.h"
  29 #include "dictionarydata.h"
  30 #include "mutex.h"
  31 #include "uvector.h"
  32 #include "umutex.h"
  33 #include "uresimp.h"
  34 #include "ubrkimpl.h"
  35
  36 U_NAMESPACE_BEGIN
  37
  38 /*
  39  ******************************************************************
  40  */
  41
  42 LanguageBreakEngine::LanguageBreakEngine() {
  43 }
  44
  45 LanguageBreakEngine::~LanguageBreakEngine() {
  46 }
  47
  48 /*
  49  ******************************************************************
  50  */
  51
  52 LanguageBreakFactory::LanguageBreakFactory() {
  53 }
  54
  55 LanguageBreakFactory::~LanguageBreakFactory() {
  56 }
  57
  58 /*
  59  ******************************************************************
  60  */
  61
  62 UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
  63     (void)status;
  64 }
  65
  66 UnhandledEngine::~UnhandledEngine() {
  67     delete fHandled;
  68     fHandled = nullptr;
  69 }
  70
  71 UBool
  72 UnhandledEngine::handles(UChar32 c) const {
  73     return fHandled && fHandled->contains(c);
  74 }
  75
  76 int32_t
  77 UnhandledEngine::findBreaks( UText *text,
  78                              int32_t /* startPos */,
  79                              int32_t endPos,
  80                              UVector32 &/*foundBreaks*/ ) const {
  81     UChar32 c = utext_current32(text);
  82     while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
  83         utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
  84         c = utext_current32(text);
  85     }
  86     return 0;
  87 }
  88
  89 void
  90 UnhandledEngine::handleCharacter(UChar32 c) {
  91     if (fHandled == nullptr) {
  92         fHandled = new UnicodeSet();
  93         if (fHandled == nullptr) {
  94             return;
  95         }
  96     }
  97     if (!fHandled->contains(c)) {
  98         UErrorCode status = U_ZERO_ERROR;
  99         // Apply the entire script of the character.
 100         int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
 101         fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
 102     }
 103 }
 104
 105 /*
 106  ******************************************************************
 107  */
 108
 109 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
 110     fEngines = 0;
 111 }
 112
 113 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
 114     if (fEngines != 0) {
 115         delete fEngines;
 116     }
 117 }
 118
 119 U_NAMESPACE_END
 120 U_CDECL_BEGIN
 121 static void U_CALLCONV _deleteEngine(void *obj) {
 122     delete (const icu::LanguageBreakEngine *) obj;
 123 }
 124 U_CDECL_END
 125 U_NAMESPACE_BEGIN
 126
 127 const LanguageBreakEngine *
 128 ICULanguageBreakFactory::getEngineFor(UChar32 c) {
 129     const LanguageBreakEngine *lbe = NULL;
 130     UErrorCode  status = U_ZERO_ERROR;
 131
 132     static UMutex gBreakEngineMutex;
 133     Mutex m(&gBreakEngineMutex);
 134
 135     if (fEngines == NULL) {
 136         UStack  *engines = new UStack(_deleteEngine, NULL, status);
 137         if (U_FAILURE(status) || engines == NULL) {
 138             // Note: no way to return error code to caller.
 139             delete engines;
 140             return NULL;
 141         }
 142         fEngines = engines;
 143     } else {
 144         int32_t i = fEngines->size();
 145         while (--i >= 0) {
 146             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
 147             if (lbe != NULL && lbe->handles(c)) {
 148                 return lbe;
 149             }
 150         }
 151     }
 152
 153     // We didn't find an engine. Create one.
 154     lbe = loadEngineFor(c);
 155     if (lbe != NULL) {
 156         fEngines->push((void *)lbe, status);
 157     }
 158     return lbe;
 159 }
 160
 161 const LanguageBreakEngine *
 162 ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
 163     UErrorCode status = U_ZERO_ERROR;
 164     UScriptCode code = uscript_getScript(c, &status);
 165     if (U_SUCCESS(status)) {
 166         DictionaryMatcher *m = loadDictionaryMatcherFor(code);
 167         if (m != NULL) {
 168             const LanguageBreakEngine *engine = NULL;
 169             switch(code) {
 170             case USCRIPT_THAI:
 171                 engine = new ThaiBreakEngine(m, status);
 172                 break;
 173             case USCRIPT_LAO:
 174                 engine = new LaoBreakEngine(m, status);
 175                 break;
 176             case USCRIPT_MYANMAR:
 177                 engine = new BurmeseBreakEngine(m, status);
 178                 break;
 179             case USCRIPT_KHMER:
 180                 engine = new KhmerBreakEngine(m, status);
 181                 break;
 182
 183 #if !UCONFIG_NO_NORMALIZATION
 184                 // CJK not available w/o normalization
 185             case USCRIPT_HANGUL:
 186                 engine = new CjkBreakEngine(m, kKorean, status);
 187                 break;
 188
 189             // use same BreakEngine and dictionary for both Chinese and Japanese
 190             case USCRIPT_HIRAGANA:
 191             case USCRIPT_KATAKANA:
 192             case USCRIPT_HAN:
 193                 engine = new CjkBreakEngine(m, kChineseJapanese, status);
 194                 break;
 195 #if 0
 196             // TODO: Have to get some characters with script=common handled
 197             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
 198             // them to CjkBreakEngine does not work. The engine has to
 199             // special-case them.
 200             case USCRIPT_COMMON:
 201             {
 202                 UBlockCode block = ublock_getCode(code);
 203                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
 204                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
 205                 break;
 206             }
 207 #endif
 208 #endif
 209
 210             default:
 211                 break;
 212             }
 213             if (engine == NULL) {
 214                 delete m;
 215             }
 216             else if (U_FAILURE(status)) {
 217                 delete engine;
 218                 engine = NULL;
 219             }
 220             return engine;
 221         }
 222     }
 223     return NULL;
 224 }
 225
 226 DictionaryMatcher *
 227 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
 228     UErrorCode status = U_ZERO_ERROR;
 229     // open root from brkitr tree.
 230     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
 231     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
 232     int32_t dictnlength = 0;
 233     const UChar *dictfname =
 234         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
 235     if (U_FAILURE(status)) {
 236         ures_close(b);
 237         return NULL;
 238     }
 239     CharString dictnbuf;
 240     CharString ext;
 241     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
 242     if (extStart != NULL) {
 243         int32_t len = (int32_t)(extStart - dictfname);
 244         ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
 245         dictnlength = len;
 246     }
 247     dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
 248     ures_close(b);
 249
 250     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
 251     if (U_SUCCESS(status)) {
 252         // build trie
 253         const uint8_t *data = (const uint8_t *)udata_getMemory(file);
 254         const int32_t *indexes = (const int32_t *)data;
 255         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
 256         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
 257         DictionaryMatcher *m = NULL;
 258         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
 259             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
 260             const char *characters = (const char *)(data + offset);
 261             m = new BytesDictionaryMatcher(characters, transform, file);
 262         }
 263         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
 264             const UChar *characters = (const UChar *)(data + offset);
 265             m = new UCharsDictionaryMatcher(characters, file);
 266         }
 267         if (m == NULL) {
 268             // no matcher exists to take ownership - either we are an invalid
 269             // type or memory allocation failed
 270             udata_close(file);
 271         }
 272         return m;
 273     } else if (dictfname != NULL) {
 274         // we don't have a dictionary matcher.
 275         // returning NULL here will cause us to fail to find a dictionary break engine, as expected
 276         status = U_ZERO_ERROR;
 277         return NULL;
 278     }
 279     return NULL;
 280 }
 281
 282 U_NAMESPACE_END
 283
 284 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */