]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/brkeng.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ************************************************************************************
5 * Copyright (C) 2006-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ************************************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_BREAK_ITERATION
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
29 #include "dictionarydata.h"
39 ******************************************************************
// Constructor and destructor of LanguageBreakEngine, the type through which
// the script-specific engines created later in this file are handled.
// NOTE(review): the bodies are not visible in this listing.
42 LanguageBreakEngine::LanguageBreakEngine() {
45 LanguageBreakEngine::~LanguageBreakEngine() {
49 ******************************************************************
// Constructor and destructor of LanguageBreakFactory.
// NOTE(review): the bodies are not visible in this listing.
52 LanguageBreakFactory::LanguageBreakFactory() {
55 LanguageBreakFactory::~LanguageBreakFactory() {
59 ******************************************************************
// Constructs an UnhandledEngine whose handled-character set (fHandled) starts
// out null; the set is allocated later, on demand, by handleCharacter().
// The status parameter is accepted but not referenced in the visible code.
62 UnhandledEngine::UnhandledEngine(UErrorCode
&status
) : fHandled(nullptr) {
// Destructor. NOTE(review): body not visible in this listing; presumably it
// releases fHandled -- confirm against the full source.
66 UnhandledEngine::~UnhandledEngine() {
// Reports whether character c belongs to the set of characters this engine
// has taken on. Evaluates to false while fHandled is still null, because the
// null test short-circuits before contains() is called.
72 UnhandledEngine::handles(UChar32 c
) const {
73 return fHandled
&& fHandled
->contains(c
);
// Skips over a run of handled characters without recording any breaks.
// Starting from the current position of text, advances one code point at a
// time while the native index is below endPos and the current character is in
// fHandled. The startPos and foundBreaks parameters are unnamed (their names
// are commented out) and therefore unused.
// NOTE(review): fHandled is dereferenced in the loop condition without the
// null check that handles() performs -- confirm that callers always invoke
// handleCharacter() before findBreaks().
77 UnhandledEngine::findBreaks( UText
*text
,
78 int32_t /* startPos */,
80 UVector32
&/*foundBreaks*/ ) const {
// Prime the loop with the character at the current position.
81 UChar32 c
= utext_current32(text
);
82 while((int32_t)utext_getNativeIndex(text
) < endPos
&& fHandled
->contains(c
)) {
83 utext_next32(text
); // TODO: recast loop to work with post-increment operations.
84 c
= utext_current32(text
);
// Registers character c as handled by this engine. Allocates the fHandled
// UnicodeSet on first use; then, if c is not already in the set, applies the
// UCHAR_SCRIPT property value of c to fHandled so that the whole script of c
// is covered rather than just the single character.
// NOTE(review): what happens when the allocation fails, and what is done with
// the local status afterwards, is not visible in this listing.
90 UnhandledEngine::handleCharacter(UChar32 c
) {
91 if (fHandled
== nullptr) {
92 fHandled
= new UnicodeSet();
// Re-test after the allocation attempt.
93 if (fHandled
== nullptr) {
97 if (!fHandled
->contains(c
)) {
98 UErrorCode status
= U_ZERO_ERROR
;
99 // Apply the entire script of the character.
100 int32_t script
= u_getIntPropertyValue(c
, UCHAR_SCRIPT
);
101 fHandled
->applyIntPropertyValue(UCHAR_SCRIPT
, script
, status
);
106 ******************************************************************
// Constructor of ICULanguageBreakFactory. The UErrorCode parameter is unnamed
// (its name is commented out), so it is not used.
// NOTE(review): the body is not visible in this listing.
109 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode
&/*status*/) {
// Destructor. NOTE(review): body not visible in this listing.
113 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
// Element deleter handed to the UStack constructor in getEngineFor(): treats
// obj as a const LanguageBreakEngine* and deletes it.
121 static void U_CALLCONV
_deleteEngine(void *obj
) {
122 delete (const icu::LanguageBreakEngine
*) obj
;
// File-local mutex that getEngineFor() locks (through a scoped Mutex object)
// before it touches fEngines.
127 static UMutex gBreakEngineMutex
= U_MUTEX_INITIALIZER
;
// Returns a break engine for character c.
// With gBreakEngineMutex locked, the function creates the fEngines stack on
// first use, looks through the stacked engines for one whose handles(c) is
// true, and otherwise asks loadEngineFor(c) for an engine and pushes it onto
// fEngines.
// NOTE(review): the loop header, the return statements and the condition
// guarding the push are not visible in this listing.
129 const LanguageBreakEngine
*
130 ICULanguageBreakFactory::getEngineFor(UChar32 c
) {
131 const LanguageBreakEngine
*lbe
= NULL
;
132 UErrorCode status
= U_ZERO_ERROR
;
// Lock gBreakEngineMutex via the Mutex object m.
134 Mutex
m(&gBreakEngineMutex
);
// First call: create the stack; _deleteEngine is registered as its deleter.
136 if (fEngines
== NULL
) {
137 UStack
*engines
= new UStack(_deleteEngine
, NULL
, status
);
138 if (U_FAILURE(status
) || engines
== NULL
) {
139 // Note: no way to return error code to caller.
// Look through the existing engines, starting the index at the stack size.
145 int32_t i
= fEngines
->size();
147 lbe
= (const LanguageBreakEngine
*)(fEngines
->elementAt(i
));
148 if (lbe
!= NULL
&& lbe
->handles(c
)) {
154 // We didn't find an engine. Create one.
155 lbe
= loadEngineFor(c
);
157 fEngines
->push((void *)lbe
, status
);
// Builds a dictionary-based break engine for the script of character c.
// Looks up the script with uscript_getScript(); on success, loads the
// dictionary matcher for that script and constructs one of ThaiBreakEngine,
// LaoBreakEngine, BurmeseBreakEngine (USCRIPT_MYANMAR), KhmerBreakEngine or
// CjkBreakEngine (kKorean, or kChineseJapanese for Hiragana/Katakana). The
// CJK engines are compiled only when UCONFIG_NO_NORMALIZATION is off.
// NOTE(review): the switch statement, most case labels, and the handling of a
// null engine / failed status at the end are not visible in this listing.
162 const LanguageBreakEngine
*
163 ICULanguageBreakFactory::loadEngineFor(UChar32 c
) {
164 UErrorCode status
= U_ZERO_ERROR
;
165 UScriptCode code
= uscript_getScript(c
, &status
);
166 if (U_SUCCESS(status
)) {
// The matcher m is passed to whichever engine is constructed below.
167 DictionaryMatcher
*m
= loadDictionaryMatcherFor(code
);
169 const LanguageBreakEngine
*engine
= NULL
;
172 engine
= new ThaiBreakEngine(m
, status
);
175 engine
= new LaoBreakEngine(m
, status
);
177 case USCRIPT_MYANMAR
:
178 engine
= new BurmeseBreakEngine(m
, status
);
181 engine
= new KhmerBreakEngine(m
, status
);
184 #if !UCONFIG_NO_NORMALIZATION
185 // CJK not available w/o normalization
187 engine
= new CjkBreakEngine(m
, kKorean
, status
);
190 // use same BreakEngine and dictionary for both Chinese and Japanese
191 case USCRIPT_HIRAGANA
:
192 case USCRIPT_KATAKANA
:
194 engine
= new CjkBreakEngine(m
, kChineseJapanese
, status
);
197 // TODO: Have to get some characters with script=common handled
198 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
199 // them to CjkBreakEngine does not work. The engine has to
200 // special-case them.
// NOTE(review): the fragment below uses `dict`, which is not declared in the
// visible code, and passes the script code (not a character) to
// ublock_getCode(); presumably this section is compiled out -- confirm.
203 UBlockCode block
= ublock_getCode(code
);
204 if (block
== UBLOCK_HIRAGANA
|| block
== UBLOCK_KATAKANA
)
205 engine
= new CjkBreakEngine(dict
, kChineseJapanese
, status
);
// Post-construction checks: no engine was created, or construction failed.
214 if (engine
== NULL
) {
217 else if (U_FAILURE(status
)) {
// Loads the dictionary data for the given script and wraps it in a
// DictionaryMatcher.
// Steps visible here: open the root of the brkitr resource tree, fetch its
// "dictionaries" table, look up the dictionary file name under the script's
// short name, split off the extension after the last dot, open the data with
// udata_open(), then read the index array at the start of the data to find
// the trie offset and trie type and construct either a BytesDictionaryMatcher
// or a UCharsDictionaryMatcher. The opened UDataMemory (file) is passed to
// the matcher's constructor.
// NOTE(review): the return type, the declarations of ext/dictnbuf, the
// resource-bundle cleanup and the return statements are not visible in this
// listing.
228 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script
) {
229 UErrorCode status
= U_ZERO_ERROR
;
230 // open root from brkitr tree.
231 UResourceBundle
*b
= ures_open(U_ICUDATA_BRKITR
, "", &status
);
// b is passed as both the source and the fill-in bundle, and reassigned.
232 b
= ures_getByKeyWithFallback(b
, "dictionaries", b
, &status
);
233 int32_t dictnlength
= 0;
// Dictionary file name, keyed by the script's short name.
234 const UChar
*dictfname
=
235 ures_getStringByKeyWithFallback(b
, uscript_getShortName(script
), &dictnlength
, &status
);
236 if (U_FAILURE(status
)) {
242 const UChar
*extStart
= u_memrchr(dictfname
, 0x002e, dictnlength
); // last dot
243 if (extStart
!= NULL
) {
// len = number of characters before the dot; ext receives what follows it.
244 int32_t len
= (int32_t)(extStart
- dictfname
);
245 ext
.appendInvariantChars(UnicodeString(FALSE
, extStart
+ 1, dictnlength
- len
- 1), status
);
// Append dictnlength characters of dictfname to dictnbuf.
248 dictnbuf
.appendInvariantChars(UnicodeString(FALSE
, dictfname
, dictnlength
), status
);
// Open the data item using ext as the type and dictnbuf as the name.
251 UDataMemory
*file
= udata_open(U_ICUDATA_BRKITR
, ext
.data(), dictnbuf
.data(), &status
);
252 if (U_SUCCESS(status
)) {
// The data is read as an array of int32_t indexes from its first byte.
254 const uint8_t *data
= (const uint8_t *)udata_getMemory(file
);
255 const int32_t *indexes
= (const int32_t *)data
;
256 const int32_t offset
= indexes
[DictionaryData::IX_STRING_TRIE_OFFSET
];
257 const int32_t trieType
= indexes
[DictionaryData::IX_TRIE_TYPE
] & DictionaryData::TRIE_TYPE_MASK
;
258 DictionaryMatcher
*m
= NULL
;
// Byte trie: also needs the transform value from the index array.
259 if (trieType
== DictionaryData::TRIE_TYPE_BYTES
) {
260 const int32_t transform
= indexes
[DictionaryData::IX_TRANSFORM
];
261 const char *characters
= (const char *)(data
+ offset
);
262 m
= new BytesDictionaryMatcher(characters
, transform
, file
);
// UChar trie: the trie data at the offset is read as UChar units.
264 else if (trieType
== DictionaryData::TRIE_TYPE_UCHARS
) {
265 const UChar
*characters
= (const UChar
*)(data
+ offset
);
266 m
= new UCharsDictionaryMatcher(characters
, file
);
269 // no matcher exists to take ownership - either we are an invalid
270 // type or memory allocation failed
274 } else if (dictfname
!= NULL
) {
275 // we don't have a dictionary matcher.
276 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
277 status
= U_ZERO_ERROR
;
285 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */