icuSources/common/brkeng.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  ************************************************************************************
   5  * Copyright (C) 2006-2016, International Business Machines Corporation
   6  * and others. All Rights Reserved.
   7  ************************************************************************************
   8  */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_BREAK_ITERATION
  13
  14 #include "unicode/uchar.h"
  15 #include "unicode/uniset.h"
  16 #include "unicode/chariter.h"
  17 #include "unicode/ures.h"
  18 #include "unicode/udata.h"
  19 #include "unicode/putil.h"
  20 #include "unicode/ustring.h"
  21 #include "unicode/uscript.h"
  22 #include "unicode/ucharstrie.h"
  23 #include "unicode/bytestrie.h"
  24
  25 #include "brkeng.h"
  26 #include "cmemory.h"
  27 #include "dictbe.h"
  28 #include "charstr.h"
  29 #include "dictionarydata.h"
  30 #include "mutex.h"
  31 #include "uvector.h"
  32 #include "umutex.h"
  33 #include "uresimp.h"
  34 #include "ubrkimpl.h"
  35
  36 U_NAMESPACE_BEGIN
  37
  38 /*
  39  ******************************************************************
  40  */
  41
  42 LanguageBreakEngine::LanguageBreakEngine() {
  43 }
  44
  45 LanguageBreakEngine::~LanguageBreakEngine() {
  46 }
  47
  48 /*
  49  ******************************************************************
  50  */
  51
  52 LanguageBreakFactory::LanguageBreakFactory() {
  53 }
  54
  55 LanguageBreakFactory::~LanguageBreakFactory() {
  56 }
  57
  58 /*
  59  ******************************************************************
  60  */
  61
  62 UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
  63     (void)status;
  64 }
  65
  66 UnhandledEngine::~UnhandledEngine() {
  67     delete fHandled;
  68     fHandled = nullptr;
  69 }
  70
  71 UBool
  72 UnhandledEngine::handles(UChar32 c) const {
  73     return fHandled && fHandled->contains(c);
  74 }
  75
  76 int32_t
  77 UnhandledEngine::findBreaks( UText *text,
  78                              int32_t /* startPos */,
  79                              int32_t endPos,
  80                              UVector32 &/*foundBreaks*/ ) const {
  81     UChar32 c = utext_current32(text);
  82     while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
  83         utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
  84         c = utext_current32(text);
  85     }
  86     return 0;
  87 }
  88
  89 void
  90 UnhandledEngine::handleCharacter(UChar32 c) {
  91     if (fHandled == nullptr) {
  92         fHandled = new UnicodeSet();
  93         if (fHandled == nullptr) {
  94             return;
  95         }
  96     }
  97     if (!fHandled->contains(c)) {
  98         UErrorCode status = U_ZERO_ERROR;
  99         // Apply the entire script of the character.
 100         int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
 101         fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
 102     }
 103 }
 104
 105 /*
 106  ******************************************************************
 107  */
 108
 109 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
 110     fEngines = 0;
 111 }
 112
 113 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
 114     if (fEngines != 0) {
 115         delete fEngines;
 116     }
 117 }
 118
 119 U_NAMESPACE_END
 120 U_CDECL_BEGIN
 121 static void U_CALLCONV _deleteEngine(void *obj) {
 122     delete (const icu::LanguageBreakEngine *) obj;
 123 }
 124 U_CDECL_END
 125 U_NAMESPACE_BEGIN
 126
 127 static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
 128
 129 const LanguageBreakEngine *
 130 ICULanguageBreakFactory::getEngineFor(UChar32 c) {
 131     const LanguageBreakEngine *lbe = NULL;
 132     UErrorCode  status = U_ZERO_ERROR;
 133
 134     Mutex m(&gBreakEngineMutex);
 135
 136     if (fEngines == NULL) {
 137         UStack  *engines = new UStack(_deleteEngine, NULL, status);
 138         if (U_FAILURE(status) || engines == NULL) {
 139             // Note: no way to return error code to caller.
 140             delete engines;
 141             return NULL;
 142         }
 143         fEngines = engines;
 144     } else {
 145         int32_t i = fEngines->size();
 146         while (--i >= 0) {
 147             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
 148             if (lbe != NULL && lbe->handles(c)) {
 149                 return lbe;
 150             }
 151         }
 152     }
 153
 154     // We didn't find an engine. Create one.
 155     lbe = loadEngineFor(c);
 156     if (lbe != NULL) {
 157         fEngines->push((void *)lbe, status);
 158     }
 159     return lbe;
 160 }
 161
 162 const LanguageBreakEngine *
 163 ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
 164     UErrorCode status = U_ZERO_ERROR;
 165     UScriptCode code = uscript_getScript(c, &status);
 166     if (U_SUCCESS(status)) {
 167         DictionaryMatcher *m = loadDictionaryMatcherFor(code);
 168         if (m != NULL) {
 169             const LanguageBreakEngine *engine = NULL;
 170             switch(code) {
 171             case USCRIPT_THAI:
 172                 engine = new ThaiBreakEngine(m, status);
 173                 break;
 174             case USCRIPT_LAO:
 175                 engine = new LaoBreakEngine(m, status);
 176                 break;
 177             case USCRIPT_MYANMAR:
 178                 engine = new BurmeseBreakEngine(m, status);
 179                 break;
 180             case USCRIPT_KHMER:
 181                 engine = new KhmerBreakEngine(m, status);
 182                 break;
 183
 184 #if !UCONFIG_NO_NORMALIZATION
 185                 // CJK not available w/o normalization
 186             case USCRIPT_HANGUL:
 187                 engine = new CjkBreakEngine(m, kKorean, status);
 188                 break;
 189
 190             // use same BreakEngine and dictionary for both Chinese and Japanese
 191             case USCRIPT_HIRAGANA:
 192             case USCRIPT_KATAKANA:
 193             case USCRIPT_HAN:
 194                 engine = new CjkBreakEngine(m, kChineseJapanese, status);
 195                 break;
 196 #if 0
 197             // TODO: Have to get some characters with script=common handled
 198             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
 199             // them to CjkBreakEngine does not work. The engine has to
 200             // special-case them.
 201             case USCRIPT_COMMON:
 202             {
 203                 UBlockCode block = ublock_getCode(code);
 204                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
 205                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
 206                 break;
 207             }
 208 #endif
 209 #endif
 210
 211             default:
 212                 break;
 213             }
 214             if (engine == NULL) {
 215                 delete m;
 216             }
 217             else if (U_FAILURE(status)) {
 218                 delete engine;
 219                 engine = NULL;
 220             }
 221             return engine;
 222         }
 223     }
 224     return NULL;
 225 }
 226
 227 DictionaryMatcher *
 228 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
 229     UErrorCode status = U_ZERO_ERROR;
 230     // open root from brkitr tree.
 231     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
 232     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
 233     int32_t dictnlength = 0;
 234     const UChar *dictfname =
 235         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
 236     if (U_FAILURE(status)) {
 237         ures_close(b);
 238         return NULL;
 239     }
 240     CharString dictnbuf;
 241     CharString ext;
 242     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
 243     if (extStart != NULL) {
 244         int32_t len = (int32_t)(extStart - dictfname);
 245         ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
 246         dictnlength = len;
 247     }
 248     dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
 249     ures_close(b);
 250
 251     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
 252     if (U_SUCCESS(status)) {
 253         // build trie
 254         const uint8_t *data = (const uint8_t *)udata_getMemory(file);
 255         const int32_t *indexes = (const int32_t *)data;
 256         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
 257         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
 258         DictionaryMatcher *m = NULL;
 259         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
 260             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
 261             const char *characters = (const char *)(data + offset);
 262             m = new BytesDictionaryMatcher(characters, transform, file);
 263         }
 264         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
 265             const UChar *characters = (const UChar *)(data + offset);
 266             m = new UCharsDictionaryMatcher(characters, file);
 267         }
 268         if (m == NULL) {
 269             // no matcher exists to take ownership - either we are an invalid
 270             // type or memory allocation failed
 271             udata_close(file);
 272         }
 273         return m;
 274     } else if (dictfname != NULL) {
 275         // we don't have a dictionary matcher.
 276         // returning NULL here will cause us to fail to find a dictionary break engine, as expected
 277         status = U_ZERO_ERROR;
 278         return NULL;
 279     }
 280     return NULL;
 281 }
 282
 283 U_NAMESPACE_END
 284
 285 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */