icuSources/common/brkeng.cpp

   1 /*
   2  ************************************************************************************
   3  * Copyright (C) 2006-2016, International Business Machines Corporation
   4  * and others. All Rights Reserved.
   5  ************************************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_BREAK_ITERATION
  11
  12 #include "brkeng.h"
  13 #include "cmemory.h"
  14 #include "dictbe.h"
  15 #include "unicode/uchar.h"
  16 #include "unicode/uniset.h"
  17 #include "unicode/chariter.h"
  18 #include "unicode/ures.h"
  19 #include "unicode/udata.h"
  20 #include "unicode/putil.h"
  21 #include "unicode/ustring.h"
  22 #include "unicode/uscript.h"
  23 #include "unicode/ucharstrie.h"
  24 #include "unicode/bytestrie.h"
  25 #include "charstr.h"
  26 #include "dictionarydata.h"
  27 #include "mutex.h"
  28 #include "uvector.h"
  29 #include "umutex.h"
  30 #include "uresimp.h"
  31 #include "ubrkimpl.h"
  32
  33 U_NAMESPACE_BEGIN
  34
  35 /*
  36  ******************************************************************
  37  */
  38
  39 LanguageBreakEngine::LanguageBreakEngine() {
  40 }
  41
  42 LanguageBreakEngine::~LanguageBreakEngine() {
  43 }
  44
  45 /*
  46  ******************************************************************
  47  */
  48
  49 LanguageBreakFactory::LanguageBreakFactory() {
  50 }
  51
  52 LanguageBreakFactory::~LanguageBreakFactory() {
  53 }
  54
  55 /*
  56  ******************************************************************
  57  */
  58
  59 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
  60     for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
  61         fHandled[i] = 0;
  62     }
  63 }
  64
  65 UnhandledEngine::~UnhandledEngine() {
  66     for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
  67         if (fHandled[i] != 0) {
  68             delete fHandled[i];
  69         }
  70     }
  71 }
  72
  73 UBool
  74 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
  75     return (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)
  76         && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
  77 }
  78
  79 int32_t
  80 UnhandledEngine::findBreaks( UText *text,
  81                                  int32_t startPos,
  82                                  int32_t endPos,
  83                                  UBool reverse,
  84                                  int32_t breakType,
  85                                  UStack &/*foundBreaks*/ ) const {
  86     if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
  87         UChar32 c = utext_current32(text);
  88         if (reverse) {
  89             while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
  90                 c = utext_previous32(text);
  91             }
  92         }
  93         else {
  94             while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
  95                 utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
  96                 c = utext_current32(text);
  97             }
  98         }
  99     }
 100     return 0;
 101 }
 102
 103 void
 104 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
 105     if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
 106         if (fHandled[breakType] == 0) {
 107             fHandled[breakType] = new UnicodeSet();
 108             if (fHandled[breakType] == 0) {
 109                 return;
 110             }
 111         }
 112         if (!fHandled[breakType]->contains(c)) {
 113             UErrorCode status = U_ZERO_ERROR;
 114             // Apply the entire script of the character.
 115             int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
 116             fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
 117         }
 118     }
 119 }
 120
 121 /*
 122  ******************************************************************
 123  */
 124
 125 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
 126     fEngines = 0;
 127 }
 128
 129 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
 130     if (fEngines != 0) {
 131         delete fEngines;
 132     }
 133 }
 134
 135 U_NAMESPACE_END
 136 U_CDECL_BEGIN
 137 static void U_CALLCONV _deleteEngine(void *obj) {
 138     delete (const icu::LanguageBreakEngine *) obj;
 139 }
 140 U_CDECL_END
 141 U_NAMESPACE_BEGIN
 142
 143 static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
 144
 145 const LanguageBreakEngine *
 146 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
 147     const LanguageBreakEngine *lbe = NULL;
 148     UErrorCode  status = U_ZERO_ERROR;
 149
 150     Mutex m(&gBreakEngineMutex);
 151
 152     if (fEngines == NULL) {
 153         UStack  *engines = new UStack(_deleteEngine, NULL, status);
 154         if (U_FAILURE(status) || engines == NULL) {
 155             // Note: no way to return error code to caller.
 156             delete engines;
 157             return NULL;
 158         }
 159         fEngines = engines;
 160     } else {
 161         int32_t i = fEngines->size();
 162         while (--i >= 0) {
 163             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
 164             if (lbe != NULL && lbe->handles(c, breakType)) {
 165                 return lbe;
 166             }
 167         }
 168     }
 169
 170     // We didn't find an engine. Create one.
 171     lbe = loadEngineFor(c, breakType);
 172     if (lbe != NULL) {
 173         fEngines->push((void *)lbe, status);
 174     }
 175     return lbe;
 176 }
 177
 178 const LanguageBreakEngine *
 179 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
 180     UErrorCode status = U_ZERO_ERROR;
 181     UScriptCode code = uscript_getScript(c, &status);
 182     if (U_SUCCESS(status)) {
 183         DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
 184         if (m != NULL) {
 185             const LanguageBreakEngine *engine = NULL;
 186             switch(code) {
 187             case USCRIPT_THAI:
 188                 engine = new ThaiBreakEngine(m, status);
 189                 break;
 190             case USCRIPT_LAO:
 191                 engine = new LaoBreakEngine(m, status);
 192                 break;
 193             case USCRIPT_MYANMAR:
 194                 engine = new BurmeseBreakEngine(m, status);
 195                 break;
 196             case USCRIPT_KHMER:
 197                 engine = new KhmerBreakEngine(m, status);
 198                 break;
 199
 200 #if !UCONFIG_NO_NORMALIZATION
 201                 // CJK not available w/o normalization
 202             case USCRIPT_HANGUL:
 203                 engine = new CjkBreakEngine(m, kKorean, status);
 204                 break;
 205
 206             // use same BreakEngine and dictionary for both Chinese and Japanese
 207             case USCRIPT_HIRAGANA:
 208             case USCRIPT_KATAKANA:
 209             case USCRIPT_HAN:
 210                 engine = new CjkBreakEngine(m, kChineseJapanese, status);
 211                 break;
 212 #if 0
 213             // TODO: Have to get some characters with script=common handled
 214             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
 215             // them to CjkBreakEngine does not work. The engine has to
 216             // special-case them.
 217             case USCRIPT_COMMON:
 218             {
 219                 UBlockCode block = ublock_getCode(code);
 220                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
 221                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
 222                 break;
 223             }
 224 #endif
 225 #endif
 226
 227             default:
 228                 break;
 229             }
 230             if (engine == NULL) {
 231                 delete m;
 232             }
 233             else if (U_FAILURE(status)) {
 234                 delete engine;
 235                 engine = NULL;
 236             }
 237             return engine;
 238         }
 239     }
 240     return NULL;
 241 }
 242
 243 DictionaryMatcher *
 244 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
 245     UErrorCode status = U_ZERO_ERROR;
 246     // open root from brkitr tree.
 247     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
 248     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
 249     int32_t dictnlength = 0;
 250     const UChar *dictfname =
 251         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
 252     if (U_FAILURE(status)) {
 253         ures_close(b);
 254         return NULL;
 255     }
 256     CharString dictnbuf;
 257     CharString ext;
 258     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
 259     if (extStart != NULL) {
 260         int32_t len = (int32_t)(extStart - dictfname);
 261         ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
 262         dictnlength = len;
 263     }
 264     dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
 265     ures_close(b);
 266
 267     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
 268     if (U_SUCCESS(status)) {
 269         // build trie
 270         const uint8_t *data = (const uint8_t *)udata_getMemory(file);
 271         const int32_t *indexes = (const int32_t *)data;
 272         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
 273         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
 274         DictionaryMatcher *m = NULL;
 275         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
 276             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
 277             const char *characters = (const char *)(data + offset);
 278             m = new BytesDictionaryMatcher(characters, transform, file);
 279         }
 280         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
 281             const UChar *characters = (const UChar *)(data + offset);
 282             m = new UCharsDictionaryMatcher(characters, file);
 283         }
 284         if (m == NULL) {
 285             // no matcher exists to take ownership - either we are an invalid
 286             // type or memory allocation failed
 287             udata_close(file);
 288         }
 289         return m;
 290     } else if (dictfname != NULL) {
 291         // we don't have a dictionary matcher.
 292         // returning NULL here will cause us to fail to find a dictionary break engine, as expected
 293         status = U_ZERO_ERROR;
 294         return NULL;
 295     }
 296     return NULL;
 297 }
 298
 299 U_NAMESPACE_END
 300
 301 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */