]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/brkeng.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ************************************************************************************
5 * Copyright (C) 2006-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ************************************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_BREAK_ITERATION
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
29 #include "dictionarydata.h"
39 ******************************************************************
42 LanguageBreakEngine::LanguageBreakEngine() {
45 LanguageBreakEngine::~LanguageBreakEngine() {
49 ******************************************************************
52 LanguageBreakFactory::LanguageBreakFactory() {
55 LanguageBreakFactory::~LanguageBreakFactory() {
59 ******************************************************************
62 UnhandledEngine::UnhandledEngine(UErrorCode
&status
) : fHandled(nullptr) {
66 UnhandledEngine::~UnhandledEngine() {
72 UnhandledEngine::handles(UChar32 c
) const {
73 return fHandled
&& fHandled
->contains(c
);
77 UnhandledEngine::findBreaks( UText
*text
,
78 int32_t /* startPos */,
80 UVector32
&/*foundBreaks*/ ) const {
81 UChar32 c
= utext_current32(text
);
82 while((int32_t)utext_getNativeIndex(text
) < endPos
&& fHandled
->contains(c
)) {
83 utext_next32(text
); // TODO: recast loop to work with post-increment operations.
84 c
= utext_current32(text
);
90 UnhandledEngine::handleCharacter(UChar32 c
) {
91 if (fHandled
== nullptr) {
92 fHandled
= new UnicodeSet();
93 if (fHandled
== nullptr) {
97 if (!fHandled
->contains(c
)) {
98 UErrorCode status
= U_ZERO_ERROR
;
99 // Apply the entire script of the character.
100 int32_t script
= u_getIntPropertyValue(c
, UCHAR_SCRIPT
);
101 fHandled
->applyIntPropertyValue(UCHAR_SCRIPT
, script
, status
);
106 ******************************************************************
109 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode
&/*status*/) {
113 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
121 static void U_CALLCONV
_deleteEngine(void *obj
) {
122 delete (const icu::LanguageBreakEngine
*) obj
;
127 const LanguageBreakEngine
*
128 ICULanguageBreakFactory::getEngineFor(UChar32 c
) {
129 const LanguageBreakEngine
*lbe
= NULL
;
130 UErrorCode status
= U_ZERO_ERROR
;
132 static UMutex gBreakEngineMutex
;
133 Mutex
m(&gBreakEngineMutex
);
135 if (fEngines
== NULL
) {
136 UStack
*engines
= new UStack(_deleteEngine
, NULL
, status
);
137 if (U_FAILURE(status
) || engines
== NULL
) {
138 // Note: no way to return error code to caller.
144 int32_t i
= fEngines
->size();
146 lbe
= (const LanguageBreakEngine
*)(fEngines
->elementAt(i
));
147 if (lbe
!= NULL
&& lbe
->handles(c
)) {
153 // We didn't find an engine. Create one.
154 lbe
= loadEngineFor(c
);
156 fEngines
->push((void *)lbe
, status
);
161 const LanguageBreakEngine
*
162 ICULanguageBreakFactory::loadEngineFor(UChar32 c
) {
163 UErrorCode status
= U_ZERO_ERROR
;
164 UScriptCode code
= uscript_getScript(c
, &status
);
165 if (U_SUCCESS(status
)) {
166 DictionaryMatcher
*m
= loadDictionaryMatcherFor(code
);
168 const LanguageBreakEngine
*engine
= NULL
;
171 engine
= new ThaiBreakEngine(m
, status
);
174 engine
= new LaoBreakEngine(m
, status
);
176 case USCRIPT_MYANMAR
:
177 engine
= new BurmeseBreakEngine(m
, status
);
180 engine
= new KhmerBreakEngine(m
, status
);
183 #if !UCONFIG_NO_NORMALIZATION
184 // CJK not available w/o normalization
186 engine
= new CjkBreakEngine(m
, kKorean
, status
);
189 // use same BreakEngine and dictionary for both Chinese and Japanese
190 case USCRIPT_HIRAGANA
:
191 case USCRIPT_KATAKANA
:
193 engine
= new CjkBreakEngine(m
, kChineseJapanese
, status
);
196 // TODO: Have to get some characters with script=common handled
197 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
198 // them to CjkBreakEngine does not work. The engine has to
199 // special-case them.
202 UBlockCode block
= ublock_getCode(code
);
203 if (block
== UBLOCK_HIRAGANA
|| block
== UBLOCK_KATAKANA
)
204 engine
= new CjkBreakEngine(dict
, kChineseJapanese
, status
);
213 if (engine
== NULL
) {
216 else if (U_FAILURE(status
)) {
227 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script
) {
228 UErrorCode status
= U_ZERO_ERROR
;
229 // open root from brkitr tree.
230 UResourceBundle
*b
= ures_open(U_ICUDATA_BRKITR
, "", &status
);
231 b
= ures_getByKeyWithFallback(b
, "dictionaries", b
, &status
);
232 int32_t dictnlength
= 0;
233 const UChar
*dictfname
=
234 ures_getStringByKeyWithFallback(b
, uscript_getShortName(script
), &dictnlength
, &status
);
235 if (U_FAILURE(status
)) {
241 const UChar
*extStart
= u_memrchr(dictfname
, 0x002e, dictnlength
); // last dot
242 if (extStart
!= NULL
) {
243 int32_t len
= (int32_t)(extStart
- dictfname
);
244 ext
.appendInvariantChars(UnicodeString(FALSE
, extStart
+ 1, dictnlength
- len
- 1), status
);
247 dictnbuf
.appendInvariantChars(UnicodeString(FALSE
, dictfname
, dictnlength
), status
);
250 UDataMemory
*file
= udata_open(U_ICUDATA_BRKITR
, ext
.data(), dictnbuf
.data(), &status
);
251 if (U_SUCCESS(status
)) {
253 const uint8_t *data
= (const uint8_t *)udata_getMemory(file
);
254 const int32_t *indexes
= (const int32_t *)data
;
255 const int32_t offset
= indexes
[DictionaryData::IX_STRING_TRIE_OFFSET
];
256 const int32_t trieType
= indexes
[DictionaryData::IX_TRIE_TYPE
] & DictionaryData::TRIE_TYPE_MASK
;
257 DictionaryMatcher
*m
= NULL
;
258 if (trieType
== DictionaryData::TRIE_TYPE_BYTES
) {
259 const int32_t transform
= indexes
[DictionaryData::IX_TRANSFORM
];
260 const char *characters
= (const char *)(data
+ offset
);
261 m
= new BytesDictionaryMatcher(characters
, transform
, file
);
263 else if (trieType
== DictionaryData::TRIE_TYPE_UCHARS
) {
264 const UChar
*characters
= (const UChar
*)(data
+ offset
);
265 m
= new UCharsDictionaryMatcher(characters
, file
);
268 // no matcher exists to take ownership - either we are an invalid
269 // type or memory allocation failed
273 } else if (dictfname
!= NULL
) {
274 // we don't have a dictionary matcher.
275 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
276 status
= U_ZERO_ERROR
;
284 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */