]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/brkeng.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ************************************************************************************
5 * Copyright (C) 2006-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ************************************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_BREAK_ITERATION
17 #include "unicode/uchar.h"
18 #include "unicode/uniset.h"
19 #include "unicode/chariter.h"
20 #include "unicode/ures.h"
21 #include "unicode/udata.h"
22 #include "unicode/putil.h"
23 #include "unicode/ustring.h"
24 #include "unicode/uscript.h"
25 #include "unicode/ucharstrie.h"
26 #include "unicode/bytestrie.h"
28 #include "dictionarydata.h"
38 ******************************************************************
41 LanguageBreakEngine::LanguageBreakEngine() {
44 LanguageBreakEngine::~LanguageBreakEngine() {
48 ******************************************************************
51 LanguageBreakFactory::LanguageBreakFactory() {
54 LanguageBreakFactory::~LanguageBreakFactory() {
58 ******************************************************************
61 UnhandledEngine::UnhandledEngine(UErrorCode
&/*status*/) {
62 for (int32_t i
= 0; i
< UPRV_LENGTHOF(fHandled
); ++i
) {
67 UnhandledEngine::~UnhandledEngine() {
68 for (int32_t i
= 0; i
< UPRV_LENGTHOF(fHandled
); ++i
) {
69 if (fHandled
[i
] != 0) {
76 UnhandledEngine::handles(UChar32 c
, int32_t breakType
) const {
77 return (breakType
>= 0 && breakType
< UPRV_LENGTHOF(fHandled
)
78 && fHandled
[breakType
] != 0 && fHandled
[breakType
]->contains(c
));
82 UnhandledEngine::findBreaks( UText
*text
,
87 UStack
&/*foundBreaks*/ ) const {
88 if (breakType
>= 0 && breakType
< UPRV_LENGTHOF(fHandled
)) {
89 UChar32 c
= utext_current32(text
);
91 while((int32_t)utext_getNativeIndex(text
) > startPos
&& fHandled
[breakType
]->contains(c
)) {
92 c
= utext_previous32(text
);
96 while((int32_t)utext_getNativeIndex(text
) < endPos
&& fHandled
[breakType
]->contains(c
)) {
97 utext_next32(text
); // TODO: recast loop to work with post-increment operations.
98 c
= utext_current32(text
);
106 UnhandledEngine::handleCharacter(UChar32 c
, int32_t breakType
) {
107 if (breakType
>= 0 && breakType
< UPRV_LENGTHOF(fHandled
)) {
108 if (fHandled
[breakType
] == 0) {
109 fHandled
[breakType
] = new UnicodeSet();
110 if (fHandled
[breakType
] == 0) {
114 if (!fHandled
[breakType
]->contains(c
)) {
115 UErrorCode status
= U_ZERO_ERROR
;
116 // Apply the entire script of the character.
117 int32_t script
= u_getIntPropertyValue(c
, UCHAR_SCRIPT
);
118 fHandled
[breakType
]->applyIntPropertyValue(UCHAR_SCRIPT
, script
, status
);
124 ******************************************************************
127 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode
&/*status*/) {
131 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
139 static void U_CALLCONV
_deleteEngine(void *obj
) {
140 delete (const icu::LanguageBreakEngine
*) obj
;
145 static UMutex gBreakEngineMutex
= U_MUTEX_INITIALIZER
;
147 const LanguageBreakEngine
*
148 ICULanguageBreakFactory::getEngineFor(UChar32 c
, int32_t breakType
) {
149 const LanguageBreakEngine
*lbe
= NULL
;
150 UErrorCode status
= U_ZERO_ERROR
;
152 Mutex
m(&gBreakEngineMutex
);
154 if (fEngines
== NULL
) {
155 UStack
*engines
= new UStack(_deleteEngine
, NULL
, status
);
156 if (U_FAILURE(status
) || engines
== NULL
) {
157 // Note: no way to return error code to caller.
163 int32_t i
= fEngines
->size();
165 lbe
= (const LanguageBreakEngine
*)(fEngines
->elementAt(i
));
166 if (lbe
!= NULL
&& lbe
->handles(c
, breakType
)) {
172 // We didn't find an engine. Create one.
173 lbe
= loadEngineFor(c
, breakType
);
175 fEngines
->push((void *)lbe
, status
);
180 const LanguageBreakEngine
*
181 ICULanguageBreakFactory::loadEngineFor(UChar32 c
, int32_t breakType
) {
182 UErrorCode status
= U_ZERO_ERROR
;
183 UScriptCode code
= uscript_getScript(c
, &status
);
184 if (U_SUCCESS(status
)) {
185 DictionaryMatcher
*m
= loadDictionaryMatcherFor(code
, breakType
);
187 const LanguageBreakEngine
*engine
= NULL
;
190 engine
= new ThaiBreakEngine(m
, status
);
193 engine
= new LaoBreakEngine(m
, status
);
195 case USCRIPT_MYANMAR
:
196 engine
= new BurmeseBreakEngine(m
, status
);
199 engine
= new KhmerBreakEngine(m
, status
);
202 #if !UCONFIG_NO_NORMALIZATION
203 // CJK not available w/o normalization
205 engine
= new CjkBreakEngine(m
, kKorean
, status
);
208 // use same BreakEngine and dictionary for both Chinese and Japanese
209 case USCRIPT_HIRAGANA
:
210 case USCRIPT_KATAKANA
:
212 engine
= new CjkBreakEngine(m
, kChineseJapanese
, status
);
215 // TODO: Have to get some characters with script=common handled
216 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
217 // them to CjkBreakEngine does not work. The engine has to
218 // special-case them.
221 UBlockCode block
= ublock_getCode(code
);
222 if (block
== UBLOCK_HIRAGANA
|| block
== UBLOCK_KATAKANA
)
223 engine
= new CjkBreakEngine(dict
, kChineseJapanese
, status
);
232 if (engine
== NULL
) {
235 else if (U_FAILURE(status
)) {
246 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script
, int32_t /* brkType */) {
247 UErrorCode status
= U_ZERO_ERROR
;
248 // open root from brkitr tree.
249 UResourceBundle
*b
= ures_open(U_ICUDATA_BRKITR
, "", &status
);
250 b
= ures_getByKeyWithFallback(b
, "dictionaries", b
, &status
);
251 int32_t dictnlength
= 0;
252 const UChar
*dictfname
=
253 ures_getStringByKeyWithFallback(b
, uscript_getShortName(script
), &dictnlength
, &status
);
254 if (U_FAILURE(status
)) {
260 const UChar
*extStart
= u_memrchr(dictfname
, 0x002e, dictnlength
); // last dot
261 if (extStart
!= NULL
) {
262 int32_t len
= (int32_t)(extStart
- dictfname
);
263 ext
.appendInvariantChars(UnicodeString(FALSE
, extStart
+ 1, dictnlength
- len
- 1), status
);
266 dictnbuf
.appendInvariantChars(UnicodeString(FALSE
, dictfname
, dictnlength
), status
);
269 UDataMemory
*file
= udata_open(U_ICUDATA_BRKITR
, ext
.data(), dictnbuf
.data(), &status
);
270 if (U_SUCCESS(status
)) {
272 const uint8_t *data
= (const uint8_t *)udata_getMemory(file
);
273 const int32_t *indexes
= (const int32_t *)data
;
274 const int32_t offset
= indexes
[DictionaryData::IX_STRING_TRIE_OFFSET
];
275 const int32_t trieType
= indexes
[DictionaryData::IX_TRIE_TYPE
] & DictionaryData::TRIE_TYPE_MASK
;
276 DictionaryMatcher
*m
= NULL
;
277 if (trieType
== DictionaryData::TRIE_TYPE_BYTES
) {
278 const int32_t transform
= indexes
[DictionaryData::IX_TRANSFORM
];
279 const char *characters
= (const char *)(data
+ offset
);
280 m
= new BytesDictionaryMatcher(characters
, transform
, file
);
282 else if (trieType
== DictionaryData::TRIE_TYPE_UCHARS
) {
283 const UChar
*characters
= (const UChar
*)(data
+ offset
);
284 m
= new UCharsDictionaryMatcher(characters
, file
);
287 // no matcher exists to take ownership - either we are an invalid
288 // type or memory allocation failed
292 } else if (dictfname
!= NULL
) {
293 // we don't have a dictionary matcher.
294 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
295 status
= U_ZERO_ERROR
;
303 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */