]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/brkeng.cpp
ICU-531.30.tar.gz
[apple/icu.git] / icuSources / common / brkeng.cpp
1 /*
2 ************************************************************************************
3 * Copyright (C) 2006-2013, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ************************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_BREAK_ITERATION
11
12 #include "brkeng.h"
13 #include "dictbe.h"
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
24 #include "charstr.h"
25 #include "dictionarydata.h"
26 #include "uvector.h"
27 #include "umutex.h"
28 #include "uresimp.h"
29 #include "ubrkimpl.h"
30
31 U_NAMESPACE_BEGIN
32
33 /*
34 ******************************************************************
35 */
36
37 LanguageBreakEngine::LanguageBreakEngine() {
38 }
39
40 LanguageBreakEngine::~LanguageBreakEngine() {
41 }
42
43 /*
44 ******************************************************************
45 */
46
47 LanguageBreakFactory::LanguageBreakFactory() {
48 }
49
50 LanguageBreakFactory::~LanguageBreakFactory() {
51 }
52
53 /*
54 ******************************************************************
55 */
56
57 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
58 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
59 fHandled[i] = 0;
60 }
61 }
62
63 UnhandledEngine::~UnhandledEngine() {
64 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
65 if (fHandled[i] != 0) {
66 delete fHandled[i];
67 }
68 }
69 }
70
71 UBool
72 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
73 return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
74 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
75 }
76
77 int32_t
78 UnhandledEngine::findBreaks( UText *text,
79 int32_t startPos,
80 int32_t endPos,
81 UBool reverse,
82 int32_t breakType,
83 UStack &/*foundBreaks*/ ) const {
84 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
85 UChar32 c = utext_current32(text);
86 if (reverse) {
87 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
88 c = utext_previous32(text);
89 }
90 }
91 else {
92 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
93 utext_next32(text); // TODO: recast loop to work with post-increment operations.
94 c = utext_current32(text);
95 }
96 }
97 }
98 return 0;
99 }
100
101 void
102 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
103 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
104 if (fHandled[breakType] == 0) {
105 fHandled[breakType] = new UnicodeSet();
106 if (fHandled[breakType] == 0) {
107 return;
108 }
109 }
110 if (!fHandled[breakType]->contains(c)) {
111 UErrorCode status = U_ZERO_ERROR;
112 // Apply the entire script of the character.
113 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
114 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
115 }
116 }
117 }
118
119 /*
120 ******************************************************************
121 */
122
123 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
124 fEngines = 0;
125 }
126
127 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
128 if (fEngines != 0) {
129 delete fEngines;
130 }
131 }
132
133 U_NAMESPACE_END
134 U_CDECL_BEGIN
135 static void U_CALLCONV _deleteEngine(void *obj) {
136 delete (const icu::LanguageBreakEngine *) obj;
137 }
138 U_CDECL_END
139 U_NAMESPACE_BEGIN
140
141 const LanguageBreakEngine *
142 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
143 UBool needsInit;
144 int32_t i;
145 const LanguageBreakEngine *lbe = NULL;
146 UErrorCode status = U_ZERO_ERROR;
147
148 // TODO: The global mutex should not be used.
149 // The global mutex should only be used for short periods.
150 // A ICULanguageBreakFactory specific mutex should be used.
151 umtx_lock(NULL);
152 needsInit = (UBool)(fEngines == NULL);
153 if (!needsInit) {
154 i = fEngines->size();
155 while (--i >= 0) {
156 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
157 if (lbe != NULL && lbe->handles(c, breakType)) {
158 break;
159 }
160 lbe = NULL;
161 }
162 }
163 umtx_unlock(NULL);
164
165 if (lbe != NULL) {
166 return lbe;
167 }
168
169 if (needsInit) {
170 UStack *engines = new UStack(_deleteEngine, NULL, status);
171 if (U_SUCCESS(status) && engines == NULL) {
172 status = U_MEMORY_ALLOCATION_ERROR;
173 }
174 else if (U_FAILURE(status)) {
175 delete engines;
176 engines = NULL;
177 }
178 else {
179 umtx_lock(NULL);
180 if (fEngines == NULL) {
181 fEngines = engines;
182 engines = NULL;
183 }
184 umtx_unlock(NULL);
185 delete engines;
186 }
187 }
188
189 if (fEngines == NULL) {
190 return NULL;
191 }
192
193 // We didn't find an engine the first time through, or there was no
194 // stack. Create an engine.
195 const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
196
197 // Now get the lock, and see if someone else has created it in the
198 // meantime
199 umtx_lock(NULL);
200 i = fEngines->size();
201 while (--i >= 0) {
202 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
203 if (lbe != NULL && lbe->handles(c, breakType)) {
204 break;
205 }
206 lbe = NULL;
207 }
208 if (lbe == NULL && newlbe != NULL) {
209 fEngines->push((void *)newlbe, status);
210 lbe = newlbe;
211 newlbe = NULL;
212 }
213 umtx_unlock(NULL);
214
215 delete newlbe;
216
217 return lbe;
218 }
219
220 const LanguageBreakEngine *
221 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
222 UErrorCode status = U_ZERO_ERROR;
223 UScriptCode code = uscript_getScript(c, &status);
224 if (U_SUCCESS(status)) {
225 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
226 if (m != NULL) {
227 const LanguageBreakEngine *engine = NULL;
228 switch(code) {
229 case USCRIPT_THAI:
230 engine = new ThaiBreakEngine(m, status);
231 break;
232 case USCRIPT_LAO:
233 engine = new LaoBreakEngine(m, status);
234 break;
235 case USCRIPT_KHMER:
236 engine = new KhmerBreakEngine(m, status);
237 break;
238
239 #if !UCONFIG_NO_NORMALIZATION
240 // CJK not available w/o normalization
241 case USCRIPT_HANGUL:
242 engine = new CjkBreakEngine(m, kKorean, status);
243 break;
244
245 // use same BreakEngine and dictionary for both Chinese and Japanese
246 case USCRIPT_HIRAGANA:
247 case USCRIPT_KATAKANA:
248 case USCRIPT_HAN:
249 engine = new CjkBreakEngine(m, kChineseJapanese, status);
250 break;
251 #if 0
252 // TODO: Have to get some characters with script=common handled
253 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
254 // them to CjkBreakEngine does not work. The engine has to
255 // special-case them.
256 case USCRIPT_COMMON:
257 {
258 UBlockCode block = ublock_getCode(code);
259 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
260 engine = new CjkBreakEngine(dict, kChineseJapanese, status);
261 break;
262 }
263 #endif
264 #endif
265
266 default:
267 break;
268 }
269 if (engine == NULL) {
270 delete m;
271 }
272 else if (U_FAILURE(status)) {
273 delete engine;
274 engine = NULL;
275 }
276 return engine;
277 }
278 }
279 return NULL;
280 }
281
282 DictionaryMatcher *
283 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
284 UErrorCode status = U_ZERO_ERROR;
285 // open root from brkitr tree.
286 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
287 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
288 int32_t dictnlength = 0;
289 const UChar *dictfname =
290 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
291 if (U_FAILURE(status)) {
292 ures_close(b);
293 return NULL;
294 }
295 CharString dictnbuf;
296 CharString ext;
297 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
298 if (extStart != NULL) {
299 int32_t len = (int32_t)(extStart - dictfname);
300 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
301 dictnlength = len;
302 }
303 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
304 ures_close(b);
305
306 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
307 if (U_SUCCESS(status)) {
308 // build trie
309 const uint8_t *data = (const uint8_t *)udata_getMemory(file);
310 const int32_t *indexes = (const int32_t *)data;
311 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
312 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
313 DictionaryMatcher *m = NULL;
314 if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
315 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
316 const char *characters = (const char *)(data + offset);
317 m = new BytesDictionaryMatcher(characters, transform, file);
318 }
319 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
320 const UChar *characters = (const UChar *)(data + offset);
321 m = new UCharsDictionaryMatcher(characters, file);
322 }
323 if (m == NULL) {
324 // no matcher exists to take ownership - either we are an invalid
325 // type or memory allocation failed
326 udata_close(file);
327 }
328 return m;
329 } else if (dictfname != NULL) {
330 // we don't have a dictionary matcher.
331 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
332 status = U_ZERO_ERROR;
333 return NULL;
334 }
335 return NULL;
336 }
337
338 U_NAMESPACE_END
339
340 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */