]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
2ca993e8 | 3 | * Copyright (C) 1997-2016, International Business Machines Corporation and |
729e4ab9 | 4 | * others. All Rights Reserved. |
b75a7d8f A |
5 | ******************************************************************************* |
6 | * | |
57a6839d | 7 | * File brkiter.cpp |
b75a7d8f A |
8 | * |
9 | * Modification History: | |
10 | * | |
11 | * Date Name Description | |
12 | * 02/18/97 aliu Converted from OpenClass. Added DONE. | |
13 | * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods. | |
14 | ***************************************************************************************** | |
15 | */ | |
16 | ||
17 | // ***************************************************************************** | |
18 | // This file was generated from the java source file BreakIterator.java | |
19 | // ***************************************************************************** | |
20 | ||
21 | #include "unicode/utypes.h" | |
22 | ||
23 | #if !UCONFIG_NO_BREAK_ITERATION | |
24 | ||
73c04bcf | 25 | #include "unicode/rbbi.h" |
b75a7d8f A |
26 | #include "unicode/brkiter.h" |
27 | #include "unicode/udata.h" | |
374ca955 | 28 | #include "unicode/ures.h" |
73c04bcf | 29 | #include "unicode/ustring.h" |
2ca993e8 | 30 | #include "unicode/filteredbrk.h" |
374ca955 | 31 | #include "ucln_cmn.h" |
b75a7d8f | 32 | #include "cstring.h" |
46f4442e | 33 | #include "umutex.h" |
73c04bcf | 34 | #include "servloc.h" |
374ca955 A |
35 | #include "locbased.h" |
36 | #include "uresimp.h" | |
73c04bcf A |
37 | #include "uassert.h" |
38 | #include "ubrkimpl.h" | |
b331163b A |
39 | #include "charstr.h" |
40 | #include "unicode/filteredbrk.h" | |
b75a7d8f A |
41 | |
42 | // ***************************************************************************** | |
43 | // class BreakIterator | |
44 | // This class implements methods for finding the location of boundaries in text. | |
45 | // Instances of BreakIterator maintain a current position and scan over text | |
46 | // returning the index of characters where boundaries occur. | |
47 | // ***************************************************************************** | |
48 | ||
49 | U_NAMESPACE_BEGIN | |
50 | ||
b75a7d8f A |
51 | // ------------------------------------- |
52 | ||
b75a7d8f | 53 | BreakIterator* |
73c04bcf | 54 | BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status) |
b75a7d8f | 55 | { |
374ca955 | 56 | char fnbuff[256]; |
73c04bcf | 57 | char ext[4]={'\0'}; |
b331163b | 58 | CharString actualLocale; |
374ca955 A |
59 | int32_t size; |
60 | const UChar* brkfname = NULL; | |
73c04bcf A |
61 | UResourceBundle brkRulesStack; |
62 | UResourceBundle brkNameStack; | |
63 | UResourceBundle *brkRules = &brkRulesStack; | |
64 | UResourceBundle *brkName = &brkNameStack; | |
65 | RuleBasedBreakIterator *result = NULL; | |
46f4442e | 66 | |
b75a7d8f A |
67 | if (U_FAILURE(status)) |
68 | return NULL; | |
69 | ||
374ca955 A |
70 | ures_initStackObject(brkRules); |
71 | ures_initStackObject(brkName); | |
72 | ||
73 | // Get the locale | |
b331163b | 74 | UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status); |
374ca955 A |
75 | |
76 | // Get the "boundaries" array. | |
77 | if (U_SUCCESS(status)) { | |
78 | brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status); | |
79 | // Get the string object naming the rules file | |
80 | brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status); | |
81 | // Get the actual string | |
82 | brkfname = ures_getString(brkName, &size, &status); | |
73c04bcf A |
83 | U_ASSERT((size_t)size<sizeof(fnbuff)); |
84 | if ((size_t)size>=sizeof(fnbuff)) { | |
85 | size=0; | |
86 | if (U_SUCCESS(status)) { | |
87 | status = U_BUFFER_OVERFLOW_ERROR; | |
88 | } | |
89 | } | |
374ca955 A |
90 | |
91 | // Use the string if we found it | |
92 | if (U_SUCCESS(status) && brkfname) { | |
b331163b | 93 | actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status); |
46f4442e | 94 | |
73c04bcf A |
95 | UChar* extStart=u_strchr(brkfname, 0x002e); |
96 | int len = 0; | |
97 | if(extStart!=NULL){ | |
729e4ab9 | 98 | len = (int)(extStart-brkfname); |
73c04bcf A |
99 | u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff |
100 | u_UCharsToChars(brkfname, fnbuff, len); | |
101 | } | |
102 | fnbuff[len]=0; // nul terminate | |
374ca955 | 103 | } |
b75a7d8f A |
104 | } |
105 | ||
374ca955 A |
106 | ures_close(brkRules); |
107 | ures_close(brkName); | |
46f4442e | 108 | |
73c04bcf | 109 | UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status); |
b75a7d8f | 110 | if (U_FAILURE(status)) { |
374ca955 | 111 | ures_close(b); |
b75a7d8f A |
112 | return NULL; |
113 | } | |
b75a7d8f | 114 | |
73c04bcf A |
115 | // Create a RuleBasedBreakIterator |
116 | result = new RuleBasedBreakIterator(file, status); | |
374ca955 | 117 | |
73c04bcf | 118 | // If there is a result, set the valid locale and actual locale, and the kind |
374ca955 | 119 | if (U_SUCCESS(status) && result != NULL) { |
46f4442e | 120 | U_LOCALE_BASED(locBased, *(BreakIterator*)result); |
b331163b A |
121 | locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), |
122 | actualLocale.data()); | |
73c04bcf | 123 | result->setBreakType(kind); |
b75a7d8f | 124 | } |
374ca955 A |
125 | |
126 | ures_close(b); | |
46f4442e | 127 | |
374ca955 | 128 | if (U_FAILURE(status) && result != NULL) { // Sometimes redundant check, but simple |
b75a7d8f | 129 | delete result; |
374ca955 A |
130 | return NULL; |
131 | } | |
132 | ||
133 | if (result == NULL) { | |
134 | udata_close(file); | |
135 | if (U_SUCCESS(status)) { | |
136 | status = U_MEMORY_ALLOCATION_ERROR; | |
137 | } | |
b75a7d8f A |
138 | } |
139 | ||
140 | return result; | |
141 | } | |
142 | ||
374ca955 A |
143 | // Creates a break iterator for word breaks. |
144 | BreakIterator* U_EXPORT2 | |
145 | BreakIterator::createWordInstance(const Locale& key, UErrorCode& status) | |
146 | { | |
147 | return createInstance(key, UBRK_WORD, status); | |
148 | } | |
149 | ||
b75a7d8f A |
150 | // ------------------------------------- |
151 | ||
152 | // Creates a break iterator for line breaks. | |
374ca955 | 153 | BreakIterator* U_EXPORT2 |
b75a7d8f A |
154 | BreakIterator::createLineInstance(const Locale& key, UErrorCode& status) |
155 | { | |
374ca955 | 156 | return createInstance(key, UBRK_LINE, status); |
b75a7d8f A |
157 | } |
158 | ||
159 | // ------------------------------------- | |
160 | ||
161 | // Creates a break iterator for character breaks. | |
374ca955 | 162 | BreakIterator* U_EXPORT2 |
b75a7d8f A |
163 | BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status) |
164 | { | |
374ca955 | 165 | return createInstance(key, UBRK_CHARACTER, status); |
b75a7d8f A |
166 | } |
167 | ||
168 | // ------------------------------------- | |
169 | ||
170 | // Creates a break iterator for sentence breaks. | |
374ca955 | 171 | BreakIterator* U_EXPORT2 |
b75a7d8f A |
172 | BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status) |
173 | { | |
374ca955 | 174 | return createInstance(key, UBRK_SENTENCE, status); |
b75a7d8f A |
175 | } |
176 | ||
177 | // ------------------------------------- | |
178 | ||
179 | // Creates a break iterator for title casing breaks. | |
374ca955 | 180 | BreakIterator* U_EXPORT2 |
b75a7d8f A |
181 | BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status) |
182 | { | |
374ca955 | 183 | return createInstance(key, UBRK_TITLE, status); |
b75a7d8f A |
184 | } |
185 | ||
186 | // ------------------------------------- | |
187 | ||
188 | // Gets all the available locales that has localized text boundary data. | |
374ca955 | 189 | const Locale* U_EXPORT2 |
b75a7d8f A |
190 | BreakIterator::getAvailableLocales(int32_t& count) |
191 | { | |
192 | return Locale::getAvailableLocales(count); | |
193 | } | |
194 | ||
b75a7d8f A |
195 | // ------------------------------------------ |
196 | // | |
197 | // Default constructor and destructor | |
198 | // | |
199 | //------------------------------------------- | |
200 | ||
201 | BreakIterator::BreakIterator() | |
2ca993e8 | 202 | : fKeepAll(FALSE) |
b75a7d8f | 203 | { |
374ca955 | 204 | *validLocale = *actualLocale = 0; |
b75a7d8f A |
205 | } |
206 | ||
207 | BreakIterator::~BreakIterator() | |
208 | { | |
209 | } | |
210 | ||
211 | // ------------------------------------------ | |
212 | // | |
213 | // Registration | |
214 | // | |
215 | //------------------------------------------- | |
374ca955 | 216 | #if !UCONFIG_NO_SERVICE |
b75a7d8f | 217 | |
b75a7d8f A |
218 | // ------------------------------------- |
219 | ||
220 | class ICUBreakIteratorFactory : public ICUResourceBundleFactory { | |
4388f060 A |
221 | public: |
222 | virtual ~ICUBreakIteratorFactory(); | |
b75a7d8f | 223 | protected: |
374ca955 A |
224 | virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const { |
225 | return BreakIterator::makeInstance(loc, kind, status); | |
226 | } | |
b75a7d8f A |
227 | }; |
228 | ||
4388f060 A |
229 | ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {} |
230 | ||
b75a7d8f A |
231 | // ------------------------------------- |
232 | ||
233 | class ICUBreakIteratorService : public ICULocaleService { | |
234 | public: | |
374ca955 A |
235 | ICUBreakIteratorService() |
236 | : ICULocaleService(UNICODE_STRING("Break Iterator", 14)) | |
237 | { | |
238 | UErrorCode status = U_ZERO_ERROR; | |
239 | registerFactory(new ICUBreakIteratorFactory(), status); | |
240 | } | |
46f4442e | 241 | |
4388f060 A |
242 | virtual ~ICUBreakIteratorService(); |
243 | ||
374ca955 A |
244 | virtual UObject* cloneInstance(UObject* instance) const { |
245 | return ((BreakIterator*)instance)->clone(); | |
246 | } | |
46f4442e | 247 | |
374ca955 A |
248 | virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const { |
249 | LocaleKey& lkey = (LocaleKey&)key; | |
250 | int32_t kind = lkey.kind(); | |
251 | Locale loc; | |
252 | lkey.currentLocale(loc); | |
253 | return BreakIterator::makeInstance(loc, kind, status); | |
254 | } | |
46f4442e | 255 | |
374ca955 A |
256 | virtual UBool isDefault() const { |
257 | return countFactories() == 1; | |
258 | } | |
b75a7d8f A |
259 | }; |
260 | ||
4388f060 A |
261 | ICUBreakIteratorService::~ICUBreakIteratorService() {} |
262 | ||
b75a7d8f A |
263 | // ------------------------------------- |
264 | ||
374ca955 | 265 | // defined in ucln_cmn.h |
57a6839d | 266 | U_NAMESPACE_END |
374ca955 | 267 | |
57a6839d | 268 | static icu::UInitOnce gInitOnce; |
4388f060 | 269 | static icu::ICULocaleService* gService = NULL; |
46f4442e | 270 | |
57a6839d A |
271 | |
272 | ||
374ca955 | 273 | /** |
46f4442e | 274 | * Release all static memory held by breakiterator. |
374ca955 A |
275 | */ |
276 | U_CDECL_BEGIN | |
277 | static UBool U_CALLCONV breakiterator_cleanup(void) { | |
278 | #if !UCONFIG_NO_SERVICE | |
279 | if (gService) { | |
280 | delete gService; | |
281 | gService = NULL; | |
282 | } | |
57a6839d | 283 | gInitOnce.reset(); |
374ca955 A |
284 | #endif |
285 | return TRUE; | |
286 | } | |
287 | U_CDECL_END | |
288 | U_NAMESPACE_BEGIN | |
289 | ||
57a6839d A |
290 | static void U_CALLCONV |
291 | initService(void) { | |
292 | gService = new ICUBreakIteratorService(); | |
293 | ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup); | |
294 | } | |
295 | ||
46f4442e | 296 | static ICULocaleService* |
b75a7d8f A |
297 | getService(void) |
298 | { | |
57a6839d | 299 | umtx_initOnce(gInitOnce, &initService); |
b75a7d8f A |
300 | return gService; |
301 | } | |
302 | ||
57a6839d | 303 | |
b75a7d8f A |
304 | // ------------------------------------- |
305 | ||
46f4442e A |
306 | static inline UBool |
307 | hasService(void) | |
b75a7d8f | 308 | { |
57a6839d | 309 | return !gInitOnce.isReset() && getService() != NULL; |
b75a7d8f A |
310 | } |
311 | ||
312 | // ------------------------------------- | |
313 | ||
374ca955 | 314 | URegistryKey U_EXPORT2 |
46f4442e | 315 | BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status) |
b75a7d8f | 316 | { |
46f4442e A |
317 | ICULocaleService *service = getService(); |
318 | if (service == NULL) { | |
319 | status = U_MEMORY_ALLOCATION_ERROR; | |
320 | return NULL; | |
321 | } | |
322 | return service->registerInstance(toAdopt, locale, kind, status); | |
b75a7d8f A |
323 | } |
324 | ||
325 | // ------------------------------------- | |
326 | ||
374ca955 | 327 | UBool U_EXPORT2 |
46f4442e | 328 | BreakIterator::unregister(URegistryKey key, UErrorCode& status) |
b75a7d8f A |
329 | { |
330 | if (U_SUCCESS(status)) { | |
331 | if (hasService()) { | |
332 | return gService->unregister(key, status); | |
333 | } | |
46f4442e | 334 | status = U_MEMORY_ALLOCATION_ERROR; |
b75a7d8f A |
335 | } |
336 | return FALSE; | |
337 | } | |
338 | ||
339 | // ------------------------------------- | |
340 | ||
374ca955 | 341 | StringEnumeration* U_EXPORT2 |
b75a7d8f A |
342 | BreakIterator::getAvailableLocales(void) |
343 | { | |
46f4442e A |
344 | ICULocaleService *service = getService(); |
345 | if (service == NULL) { | |
346 | return NULL; | |
347 | } | |
348 | return service->getAvailableLocales(); | |
b75a7d8f | 349 | } |
374ca955 A |
350 | #endif /* UCONFIG_NO_SERVICE */ |
351 | ||
352 | // ------------------------------------- | |
353 | ||
354 | BreakIterator* | |
73c04bcf | 355 | BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status) |
374ca955 A |
356 | { |
357 | if (U_FAILURE(status)) { | |
358 | return NULL; | |
359 | } | |
46f4442e | 360 | |
374ca955 A |
361 | #if !UCONFIG_NO_SERVICE |
362 | if (hasService()) { | |
73c04bcf | 363 | Locale actualLoc(""); |
374ca955 A |
364 | BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status); |
365 | // TODO: The way the service code works in ICU 2.8 is that if | |
366 | // there is a real registered break iterator, the actualLoc | |
367 | // will be populated, but if the handleDefault path is taken | |
368 | // (because nothing is registered that can handle the | |
369 | // requested locale) then the actualLoc comes back empty. In | |
370 | // that case, the returned object already has its actual/valid | |
371 | // locale data populated (by makeInstance, which is what | |
372 | // handleDefault calls), so we don't touch it. YES, A COMMENT | |
373 | // THIS LONG is a sign of bad code -- so the action item is to | |
374 | // revisit this in ICU 3.0 and clean it up/fix it/remove it. | |
375 | if (U_SUCCESS(status) && (result != NULL) && *actualLoc.getName() != 0) { | |
376 | U_LOCALE_BASED(locBased, *result); | |
377 | locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName()); | |
378 | } | |
379 | return result; | |
380 | } | |
381 | else | |
382 | #endif | |
383 | { | |
384 | return makeInstance(loc, kind, status); | |
385 | } | |
386 | } | |
b75a7d8f A |
387 | |
388 | // ------------------------------------- | |
b331163b | 389 | enum { kKeyValueLenMax = 32 }; |
b75a7d8f | 390 | |
46f4442e | 391 | BreakIterator* |
b75a7d8f A |
392 | BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) |
393 | { | |
374ca955 A |
394 | |
395 | if (U_FAILURE(status)) { | |
396 | return NULL; | |
397 | } | |
b331163b | 398 | char lbType[kKeyValueLenMax]; |
374ca955 A |
399 | |
400 | BreakIterator *result = NULL; | |
b75a7d8f | 401 | switch (kind) { |
46f4442e | 402 | case UBRK_CHARACTER: |
73c04bcf | 403 | result = BreakIterator::buildInstance(loc, "grapheme", kind, status); |
374ca955 A |
404 | break; |
405 | case UBRK_WORD: | |
73c04bcf | 406 | result = BreakIterator::buildInstance(loc, "word", kind, status); |
374ca955 A |
407 | break; |
408 | case UBRK_LINE: | |
08b89b0a A |
409 | uprv_strcpy(lbType, "line"); |
410 | { | |
b331163b | 411 | char lbKeyValue[kKeyValueLenMax] = {0}; |
08b89b0a | 412 | UErrorCode kvStatus = U_ZERO_ERROR; |
b331163b A |
413 | int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus); |
414 | if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) { | |
08b89b0a A |
415 | uprv_strcat(lbType, "_"); |
416 | uprv_strcat(lbType, lbKeyValue); | |
417 | } | |
418 | } | |
419 | result = BreakIterator::buildInstance(loc, lbType, kind, status); | |
2ca993e8 A |
420 | if (U_SUCCESS(status) && result != NULL) { |
421 | char lwKeyValue[kKeyValueLenMax] = {0}; | |
422 | UErrorCode kvStatus = U_ZERO_ERROR; | |
423 | int32_t kLen = loc.getKeywordValue("lw", lwKeyValue, kKeyValueLenMax, kvStatus); | |
424 | result->setKeepAll(U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(lwKeyValue,"keepall")==0); | |
425 | } | |
374ca955 A |
426 | break; |
427 | case UBRK_SENTENCE: | |
73c04bcf | 428 | result = BreakIterator::buildInstance(loc, "sentence", kind, status); |
b331163b A |
429 | { |
430 | char ssKeyValue[kKeyValueLenMax] = {0}; | |
431 | UErrorCode kvStatus = U_ZERO_ERROR; | |
432 | int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus); | |
433 | if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) { | |
434 | FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus); | |
435 | if (U_SUCCESS(kvStatus)) { | |
436 | result = fbiBuilder->build(result, status); | |
437 | delete fbiBuilder; | |
438 | } | |
439 | } | |
440 | } | |
374ca955 A |
441 | break; |
442 | case UBRK_TITLE: | |
73c04bcf | 443 | result = BreakIterator::buildInstance(loc, "title", kind, status); |
374ca955 | 444 | break; |
b75a7d8f | 445 | default: |
374ca955 | 446 | status = U_ILLEGAL_ARGUMENT_ERROR; |
b75a7d8f | 447 | } |
b75a7d8f | 448 | |
374ca955 A |
449 | if (U_FAILURE(status)) { |
450 | return NULL; | |
451 | } | |
b75a7d8f | 452 | |
374ca955 A |
453 | return result; |
454 | } | |
b75a7d8f | 455 | |
46f4442e | 456 | Locale |
374ca955 A |
457 | BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const { |
458 | U_LOCALE_BASED(locBased, *this); | |
459 | return locBased.getLocale(type, status); | |
460 | } | |
461 | ||
462 | const char * | |
463 | BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const { | |
464 | U_LOCALE_BASED(locBased, *this); | |
465 | return locBased.getLocaleID(type, status); | |
b75a7d8f A |
466 | } |
467 | ||
57a6839d A |
468 | |
469 | // This implementation of getRuleStatus is a do-nothing stub, here to | |
470 | // provide a default implementation for any derived BreakIterator classes that | |
471 | // do not implement it themselves. | |
472 | int32_t BreakIterator::getRuleStatus() const { | |
473 | return 0; | |
474 | } | |
475 | ||
476 | // This implementation of getRuleStatusVec is a do-nothing stub, here to | |
477 | // provide a default implementation for any derived BreakIterator classes that | |
478 | // do not implement it themselves. | |
479 | int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) { | |
480 | if (U_FAILURE(status)) { | |
481 | return 0; | |
482 | } | |
483 | if (capacity < 1) { | |
484 | status = U_BUFFER_OVERFLOW_ERROR; | |
485 | return 1; | |
486 | } | |
487 | *fillInVec = 0; | |
488 | return 1; | |
489 | } | |
490 | ||
491 | BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) { | |
492 | U_LOCALE_BASED(locBased, (*this)); | |
493 | locBased.setLocaleIDs(valid, actual); | |
494 | } | |
495 | ||
374ca955 A |
496 | U_NAMESPACE_END |
497 | ||
b75a7d8f A |
498 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
499 | ||
500 | //eof |