2 ************************************************************************************
3 * Copyright (C) 2006-2013, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ************************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_BREAK_ITERATION
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
25 #include "dictionarydata.h"
34 ******************************************************************
37 LanguageBreakEngine::LanguageBreakEngine() {
40 LanguageBreakEngine::~LanguageBreakEngine() {
44 ******************************************************************
47 LanguageBreakFactory::LanguageBreakFactory() {
50 LanguageBreakFactory::~LanguageBreakFactory() {
54 ******************************************************************
57 UnhandledEngine::UnhandledEngine(UErrorCode
&/*status*/) {
58 for (int32_t i
= 0; i
< (int32_t)(sizeof(fHandled
)/sizeof(fHandled
[0])); ++i
) {
63 UnhandledEngine::~UnhandledEngine() {
64 for (int32_t i
= 0; i
< (int32_t)(sizeof(fHandled
)/sizeof(fHandled
[0])); ++i
) {
65 if (fHandled
[i
] != 0) {
72 UnhandledEngine::handles(UChar32 c
, int32_t breakType
) const {
73 return (breakType
>= 0 && breakType
< (int32_t)(sizeof(fHandled
)/sizeof(fHandled
[0]))
74 && fHandled
[breakType
] != 0 && fHandled
[breakType
]->contains(c
));
78 UnhandledEngine::findBreaks( UText
*text
,
83 UStack
&/*foundBreaks*/ ) const {
84 if (breakType
>= 0 && breakType
< (int32_t)(sizeof(fHandled
)/sizeof(fHandled
[0]))) {
85 UChar32 c
= utext_current32(text
);
87 while((int32_t)utext_getNativeIndex(text
) > startPos
&& fHandled
[breakType
]->contains(c
)) {
88 c
= utext_previous32(text
);
92 while((int32_t)utext_getNativeIndex(text
) < endPos
&& fHandled
[breakType
]->contains(c
)) {
93 utext_next32(text
); // TODO: recast loop to work with post-increment operations.
94 c
= utext_current32(text
);
102 UnhandledEngine::handleCharacter(UChar32 c
, int32_t breakType
) {
103 if (breakType
>= 0 && breakType
< (int32_t)(sizeof(fHandled
)/sizeof(fHandled
[0]))) {
104 if (fHandled
[breakType
] == 0) {
105 fHandled
[breakType
] = new UnicodeSet();
106 if (fHandled
[breakType
] == 0) {
110 if (!fHandled
[breakType
]->contains(c
)) {
111 UErrorCode status
= U_ZERO_ERROR
;
112 // Apply the entire script of the character.
113 int32_t script
= u_getIntPropertyValue(c
, UCHAR_SCRIPT
);
114 fHandled
[breakType
]->applyIntPropertyValue(UCHAR_SCRIPT
, script
, status
);
120 ******************************************************************
123 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode
&/*status*/) {
127 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
135 static void U_CALLCONV
_deleteEngine(void *obj
) {
136 delete (const icu::LanguageBreakEngine
*) obj
;
141 const LanguageBreakEngine
*
142 ICULanguageBreakFactory::getEngineFor(UChar32 c
, int32_t breakType
) {
145 const LanguageBreakEngine
*lbe
= NULL
;
146 UErrorCode status
= U_ZERO_ERROR
;
148 // TODO: The global mutex should not be used.
149 // The global mutex should only be used for short periods.
150 // An ICULanguageBreakFactory-specific mutex should be used.
152 needsInit
= (UBool
)(fEngines
== NULL
);
154 i
= fEngines
->size();
156 lbe
= (const LanguageBreakEngine
*)(fEngines
->elementAt(i
));
157 if (lbe
!= NULL
&& lbe
->handles(c
, breakType
)) {
170 UStack
*engines
= new UStack(_deleteEngine
, NULL
, status
);
171 if (U_SUCCESS(status
) && engines
== NULL
) {
172 status
= U_MEMORY_ALLOCATION_ERROR
;
174 else if (U_FAILURE(status
)) {
180 if (fEngines
== NULL
) {
189 if (fEngines
== NULL
) {
193 // We didn't find an engine the first time through, or there was no
194 // stack. Create an engine.
195 const LanguageBreakEngine
*newlbe
= loadEngineFor(c
, breakType
);
197 // Now get the lock, and see if someone else has created it in the
200 i
= fEngines
->size();
202 lbe
= (const LanguageBreakEngine
*)(fEngines
->elementAt(i
));
203 if (lbe
!= NULL
&& lbe
->handles(c
, breakType
)) {
208 if (lbe
== NULL
&& newlbe
!= NULL
) {
209 fEngines
->push((void *)newlbe
, status
);
220 const LanguageBreakEngine
*
221 ICULanguageBreakFactory::loadEngineFor(UChar32 c
, int32_t breakType
) {
222 UErrorCode status
= U_ZERO_ERROR
;
223 UScriptCode code
= uscript_getScript(c
, &status
);
224 if (U_SUCCESS(status
)) {
225 DictionaryMatcher
*m
= loadDictionaryMatcherFor(code
, breakType
);
227 const LanguageBreakEngine
*engine
= NULL
;
230 engine
= new ThaiBreakEngine(m
, status
);
233 engine
= new LaoBreakEngine(m
, status
);
236 engine
= new KhmerBreakEngine(m
, status
);
239 #if !UCONFIG_NO_NORMALIZATION
240 // CJK not available w/o normalization
242 engine
= new CjkBreakEngine(m
, kKorean
, status
);
245 // use same BreakEngine and dictionary for both Chinese and Japanese
246 case USCRIPT_HIRAGANA
:
247 case USCRIPT_KATAKANA
:
249 engine
= new CjkBreakEngine(m
, kChineseJapanese
, status
);
252 // TODO: Have to get some characters with script=common handled
253 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
254 // them to CjkBreakEngine does not work. The engine has to
255 // special-case them.
258 UBlockCode block
= ublock_getCode(code
);
259 if (block
== UBLOCK_HIRAGANA
|| block
== UBLOCK_KATAKANA
)
260 engine
= new CjkBreakEngine(dict
, kChineseJapanese
, status
);
269 if (engine
== NULL
) {
272 else if (U_FAILURE(status
)) {
283 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script
, int32_t /* brkType */) {
284 UErrorCode status
= U_ZERO_ERROR
;
285 // open root from brkitr tree.
286 UResourceBundle
*b
= ures_open(U_ICUDATA_BRKITR
, "", &status
);
287 b
= ures_getByKeyWithFallback(b
, "dictionaries", b
, &status
);
288 int32_t dictnlength
= 0;
289 const UChar
*dictfname
=
290 ures_getStringByKeyWithFallback(b
, uscript_getShortName(script
), &dictnlength
, &status
);
291 if (U_FAILURE(status
)) {
297 const UChar
*extStart
= u_memrchr(dictfname
, 0x002e, dictnlength
); // last dot
298 if (extStart
!= NULL
) {
299 int32_t len
= (int32_t)(extStart
- dictfname
);
300 ext
.appendInvariantChars(UnicodeString(FALSE
, extStart
+ 1, dictnlength
- len
- 1), status
);
303 dictnbuf
.appendInvariantChars(UnicodeString(FALSE
, dictfname
, dictnlength
), status
);
306 UDataMemory
*file
= udata_open(U_ICUDATA_BRKITR
, ext
.data(), dictnbuf
.data(), &status
);
307 if (U_SUCCESS(status
)) {
309 const uint8_t *data
= (const uint8_t *)udata_getMemory(file
);
310 const int32_t *indexes
= (const int32_t *)data
;
311 const int32_t offset
= indexes
[DictionaryData::IX_STRING_TRIE_OFFSET
];
312 const int32_t trieType
= indexes
[DictionaryData::IX_TRIE_TYPE
] & DictionaryData::TRIE_TYPE_MASK
;
313 DictionaryMatcher
*m
= NULL
;
314 if (trieType
== DictionaryData::TRIE_TYPE_BYTES
) {
315 const int32_t transform
= indexes
[DictionaryData::IX_TRANSFORM
];
316 const char *characters
= (const char *)(data
+ offset
);
317 m
= new BytesDictionaryMatcher(characters
, transform
, file
);
319 else if (trieType
== DictionaryData::TRIE_TYPE_UCHARS
) {
320 const UChar
*characters
= (const UChar
*)(data
+ offset
);
321 m
= new UCharsDictionaryMatcher(characters
, file
);
324 // no matcher exists to take ownership - either the trie type is
325 // invalid or memory allocation failed
329 } else if (dictfname
!= NULL
) {
330 // we don't have a dictionary matcher.
331 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
332 status
= U_ZERO_ERROR
;
340 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */