]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/brkeng.cpp
2 ************************************************************************************
3 * Copyright (C) 2006-2016, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ************************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_BREAK_ITERATION
15 #include "unicode/uchar.h"
16 #include "unicode/uniset.h"
17 #include "unicode/chariter.h"
18 #include "unicode/ures.h"
19 #include "unicode/udata.h"
20 #include "unicode/putil.h"
21 #include "unicode/ustring.h"
22 #include "unicode/uscript.h"
23 #include "unicode/ucharstrie.h"
24 #include "unicode/bytestrie.h"
26 #include "dictionarydata.h"
36 ******************************************************************
39 LanguageBreakEngine::LanguageBreakEngine() {
42 LanguageBreakEngine::~LanguageBreakEngine() {
46 ******************************************************************
49 LanguageBreakFactory::LanguageBreakFactory() {
52 LanguageBreakFactory::~LanguageBreakFactory() {
56 ******************************************************************
59 UnhandledEngine::UnhandledEngine(UErrorCode
&/*status*/) {
60 for (int32_t i
= 0; i
< UPRV_LENGTHOF(fHandled
); ++i
) {
65 UnhandledEngine::~UnhandledEngine() {
66 for (int32_t i
= 0; i
< UPRV_LENGTHOF(fHandled
); ++i
) {
67 if (fHandled
[i
] != 0) {
74 UnhandledEngine::handles(UChar32 c
, int32_t breakType
) const {
75 return (breakType
>= 0 && breakType
< UPRV_LENGTHOF(fHandled
)
76 && fHandled
[breakType
] != 0 && fHandled
[breakType
]->contains(c
));
80 UnhandledEngine::findBreaks( UText
*text
,
85 UStack
&/*foundBreaks*/ ) const {
86 if (breakType
>= 0 && breakType
< UPRV_LENGTHOF(fHandled
)) {
87 UChar32 c
= utext_current32(text
);
89 while((int32_t)utext_getNativeIndex(text
) > startPos
&& fHandled
[breakType
]->contains(c
)) {
90 c
= utext_previous32(text
);
94 while((int32_t)utext_getNativeIndex(text
) < endPos
&& fHandled
[breakType
]->contains(c
)) {
95 utext_next32(text
); // TODO: recast loop to work with post-increment operations.
96 c
= utext_current32(text
);
104 UnhandledEngine::handleCharacter(UChar32 c
, int32_t breakType
) {
105 if (breakType
>= 0 && breakType
< UPRV_LENGTHOF(fHandled
)) {
106 if (fHandled
[breakType
] == 0) {
107 fHandled
[breakType
] = new UnicodeSet();
108 if (fHandled
[breakType
] == 0) {
112 if (!fHandled
[breakType
]->contains(c
)) {
113 UErrorCode status
= U_ZERO_ERROR
;
114 // Apply the entire script of the character.
115 int32_t script
= u_getIntPropertyValue(c
, UCHAR_SCRIPT
);
116 fHandled
[breakType
]->applyIntPropertyValue(UCHAR_SCRIPT
, script
, status
);
122 ******************************************************************
125 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode
&/*status*/) {
129 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
137 static void U_CALLCONV
_deleteEngine(void *obj
) {
138 delete (const icu::LanguageBreakEngine
*) obj
;
143 static UMutex gBreakEngineMutex
= U_MUTEX_INITIALIZER
;
145 const LanguageBreakEngine
*
146 ICULanguageBreakFactory::getEngineFor(UChar32 c
, int32_t breakType
) {
147 const LanguageBreakEngine
*lbe
= NULL
;
148 UErrorCode status
= U_ZERO_ERROR
;
150 Mutex
m(&gBreakEngineMutex
);
152 if (fEngines
== NULL
) {
153 UStack
*engines
= new UStack(_deleteEngine
, NULL
, status
);
154 if (U_FAILURE(status
) || engines
== NULL
) {
155 // Note: no way to return error code to caller.
161 int32_t i
= fEngines
->size();
163 lbe
= (const LanguageBreakEngine
*)(fEngines
->elementAt(i
));
164 if (lbe
!= NULL
&& lbe
->handles(c
, breakType
)) {
170 // We didn't find an engine. Create one.
171 lbe
= loadEngineFor(c
, breakType
);
173 fEngines
->push((void *)lbe
, status
);
178 const LanguageBreakEngine
*
179 ICULanguageBreakFactory::loadEngineFor(UChar32 c
, int32_t breakType
) {
180 UErrorCode status
= U_ZERO_ERROR
;
181 UScriptCode code
= uscript_getScript(c
, &status
);
182 if (U_SUCCESS(status
)) {
183 DictionaryMatcher
*m
= loadDictionaryMatcherFor(code
, breakType
);
185 const LanguageBreakEngine
*engine
= NULL
;
188 engine
= new ThaiBreakEngine(m
, status
);
191 engine
= new LaoBreakEngine(m
, status
);
193 case USCRIPT_MYANMAR
:
194 engine
= new BurmeseBreakEngine(m
, status
);
197 engine
= new KhmerBreakEngine(m
, status
);
200 #if !UCONFIG_NO_NORMALIZATION
201 // CJK not available w/o normalization
203 engine
= new CjkBreakEngine(m
, kKorean
, status
);
206 // use same BreakEngine and dictionary for both Chinese and Japanese
207 case USCRIPT_HIRAGANA
:
208 case USCRIPT_KATAKANA
:
210 engine
= new CjkBreakEngine(m
, kChineseJapanese
, status
);
213 // TODO: Have to get some characters with script=common handled
214 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
215 // them to CjkBreakEngine does not work. The engine has to
216 // special-case them.
219 UBlockCode block
= ublock_getCode(code
);
220 if (block
== UBLOCK_HIRAGANA
|| block
== UBLOCK_KATAKANA
)
221 engine
= new CjkBreakEngine(dict
, kChineseJapanese
, status
);
230 if (engine
== NULL
) {
233 else if (U_FAILURE(status
)) {
244 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script
, int32_t /* brkType */) {
245 UErrorCode status
= U_ZERO_ERROR
;
246 // open root from brkitr tree.
247 UResourceBundle
*b
= ures_open(U_ICUDATA_BRKITR
, "", &status
);
248 b
= ures_getByKeyWithFallback(b
, "dictionaries", b
, &status
);
249 int32_t dictnlength
= 0;
250 const UChar
*dictfname
=
251 ures_getStringByKeyWithFallback(b
, uscript_getShortName(script
), &dictnlength
, &status
);
252 if (U_FAILURE(status
)) {
258 const UChar
*extStart
= u_memrchr(dictfname
, 0x002e, dictnlength
); // last dot
259 if (extStart
!= NULL
) {
260 int32_t len
= (int32_t)(extStart
- dictfname
);
261 ext
.appendInvariantChars(UnicodeString(FALSE
, extStart
+ 1, dictnlength
- len
- 1), status
);
264 dictnbuf
.appendInvariantChars(UnicodeString(FALSE
, dictfname
, dictnlength
), status
);
267 UDataMemory
*file
= udata_open(U_ICUDATA_BRKITR
, ext
.data(), dictnbuf
.data(), &status
);
268 if (U_SUCCESS(status
)) {
270 const uint8_t *data
= (const uint8_t *)udata_getMemory(file
);
271 const int32_t *indexes
= (const int32_t *)data
;
272 const int32_t offset
= indexes
[DictionaryData::IX_STRING_TRIE_OFFSET
];
273 const int32_t trieType
= indexes
[DictionaryData::IX_TRIE_TYPE
] & DictionaryData::TRIE_TYPE_MASK
;
274 DictionaryMatcher
*m
= NULL
;
275 if (trieType
== DictionaryData::TRIE_TYPE_BYTES
) {
276 const int32_t transform
= indexes
[DictionaryData::IX_TRANSFORM
];
277 const char *characters
= (const char *)(data
+ offset
);
278 m
= new BytesDictionaryMatcher(characters
, transform
, file
);
280 else if (trieType
== DictionaryData::TRIE_TYPE_UCHARS
) {
281 const UChar
*characters
= (const UChar
*)(data
+ offset
);
282 m
= new UCharsDictionaryMatcher(characters
, file
);
285 // no matcher exists to take ownership - either we are an invalid
286 // type or memory allocation failed
290 } else if (dictfname
!= NULL
) {
291 // we don't have a dictionary matcher.
292 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
293 status
= U_ZERO_ERROR
;
301 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */