]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/brkiter.cpp
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / brkiter.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 1997-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 *
9 * File brkiter.cpp
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 02/18/97 aliu Converted from OpenClass. Added DONE.
15 * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
16 *****************************************************************************************
17 */
18
19 // *****************************************************************************
20 // This file was generated from the java source file BreakIterator.java
21 // *****************************************************************************
22
23 #include "unicode/utypes.h"
24
25 #if !UCONFIG_NO_BREAK_ITERATION
26
27 #include "unicode/rbbi.h"
28 #include "unicode/brkiter.h"
29 #include "unicode/udata.h"
30 #include "unicode/ures.h"
31 #include "unicode/ustring.h"
32 #include "unicode/filteredbrk.h"
33 #include "ucln_cmn.h"
34 #include "cstring.h"
35 #include "umutex.h"
36 #include "servloc.h"
37 #include "locbased.h"
38 #include "uresimp.h"
39 #include "uassert.h"
40 #include "ubrkimpl.h"
41 #include "charstr.h"
42 #include "unicode/filteredbrk.h"
43
44 // *****************************************************************************
45 // class BreakIterator
46 // This class implements methods for finding the location of boundaries in text.
47 // Instances of BreakIterator maintain a current position and scan over text
48 // returning the index of characters where boundaries occur.
49 // *****************************************************************************
50
51 U_NAMESPACE_BEGIN
52
53 // -------------------------------------
54
55 BreakIterator*
56 BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status)
57 {
58 char fnbuff[256];
59 char ext[4]={'\0'};
60 CharString actualLocale;
61 int32_t size;
62 const UChar* brkfname = NULL;
63 UResourceBundle brkRulesStack;
64 UResourceBundle brkNameStack;
65 UResourceBundle *brkRules = &brkRulesStack;
66 UResourceBundle *brkName = &brkNameStack;
67 RuleBasedBreakIterator *result = NULL;
68
69 if (U_FAILURE(status))
70 return NULL;
71
72 ures_initStackObject(brkRules);
73 ures_initStackObject(brkName);
74
75 // Get the locale
76 UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status);
77
78 // Get the "boundaries" array.
79 if (U_SUCCESS(status)) {
80 brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);
81 // Get the string object naming the rules file
82 brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);
83 // Get the actual string
84 brkfname = ures_getString(brkName, &size, &status);
85 U_ASSERT((size_t)size<sizeof(fnbuff));
86 if ((size_t)size>=sizeof(fnbuff)) {
87 size=0;
88 if (U_SUCCESS(status)) {
89 status = U_BUFFER_OVERFLOW_ERROR;
90 }
91 }
92
93 // Use the string if we found it
94 if (U_SUCCESS(status) && brkfname) {
95 actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status);
96
97 UChar* extStart=u_strchr(brkfname, 0x002e);
98 int len = 0;
99 if(extStart!=NULL){
100 len = (int)(extStart-brkfname);
101 u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
102 u_UCharsToChars(brkfname, fnbuff, len);
103 }
104 fnbuff[len]=0; // nul terminate
105 }
106 }
107
108 ures_close(brkRules);
109 ures_close(brkName);
110
111 UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);
112 if (U_FAILURE(status)) {
113 ures_close(b);
114 return NULL;
115 }
116
117 // Create a RuleBasedBreakIterator
118 result = new RuleBasedBreakIterator(file, status);
119
120 // If there is a result, set the valid locale and actual locale, and the kind
121 if (U_SUCCESS(status) && result != NULL) {
122 U_LOCALE_BASED(locBased, *(BreakIterator*)result);
123 locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
124 actualLocale.data());
125 result->setBreakType(kind);
126 }
127
128 ures_close(b);
129
130 if (U_FAILURE(status) && result != NULL) { // Sometimes redundant check, but simple
131 delete result;
132 return NULL;
133 }
134
135 if (result == NULL) {
136 udata_close(file);
137 if (U_SUCCESS(status)) {
138 status = U_MEMORY_ALLOCATION_ERROR;
139 }
140 }
141
142 return result;
143 }
144
145 // Creates a break iterator for word breaks.
146 BreakIterator* U_EXPORT2
147 BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
148 {
149 return createInstance(key, UBRK_WORD, status);
150 }
151
152 // -------------------------------------
153
154 // Creates a break iterator for line breaks.
155 BreakIterator* U_EXPORT2
156 BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
157 {
158 return createInstance(key, UBRK_LINE, status);
159 }
160
161 // -------------------------------------
162
163 // Creates a break iterator for character breaks.
164 BreakIterator* U_EXPORT2
165 BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
166 {
167 return createInstance(key, UBRK_CHARACTER, status);
168 }
169
170 // -------------------------------------
171
172 // Creates a break iterator for sentence breaks.
173 BreakIterator* U_EXPORT2
174 BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
175 {
176 return createInstance(key, UBRK_SENTENCE, status);
177 }
178
179 // -------------------------------------
180
181 // Creates a break iterator for title casing breaks.
182 BreakIterator* U_EXPORT2
183 BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
184 {
185 return createInstance(key, UBRK_TITLE, status);
186 }
187
188 // -------------------------------------
189
190 // Gets all the available locales that has localized text boundary data.
191 const Locale* U_EXPORT2
192 BreakIterator::getAvailableLocales(int32_t& count)
193 {
194 return Locale::getAvailableLocales(count);
195 }
196
197 // ------------------------------------------
198 //
199 // Default constructor and destructor
200 //
201 //-------------------------------------------
202
203 BreakIterator::BreakIterator()
204 : fKeepAll(FALSE)
205 {
206 *validLocale = *actualLocale = 0;
207 }
208
209 BreakIterator::~BreakIterator()
210 {
211 }
212
213 // ------------------------------------------
214 //
215 // Registration
216 //
217 //-------------------------------------------
218 #if !UCONFIG_NO_SERVICE
219
220 // -------------------------------------
221
222 class ICUBreakIteratorFactory : public ICUResourceBundleFactory {
223 public:
224 virtual ~ICUBreakIteratorFactory();
225 protected:
226 virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const {
227 return BreakIterator::makeInstance(loc, kind, status);
228 }
229 };
230
231 ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {}
232
233 // -------------------------------------
234
235 class ICUBreakIteratorService : public ICULocaleService {
236 public:
237 ICUBreakIteratorService()
238 : ICULocaleService(UNICODE_STRING("Break Iterator", 14))
239 {
240 UErrorCode status = U_ZERO_ERROR;
241 registerFactory(new ICUBreakIteratorFactory(), status);
242 }
243
244 virtual ~ICUBreakIteratorService();
245
246 virtual UObject* cloneInstance(UObject* instance) const {
247 return ((BreakIterator*)instance)->clone();
248 }
249
250 virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const {
251 LocaleKey& lkey = (LocaleKey&)key;
252 int32_t kind = lkey.kind();
253 Locale loc;
254 lkey.currentLocale(loc);
255 return BreakIterator::makeInstance(loc, kind, status);
256 }
257
258 virtual UBool isDefault() const {
259 return countFactories() == 1;
260 }
261 };
262
263 ICUBreakIteratorService::~ICUBreakIteratorService() {}
264
265 // -------------------------------------
266
267 // defined in ucln_cmn.h
268 U_NAMESPACE_END
269
270 static icu::UInitOnce gInitOnce;
271 static icu::ICULocaleService* gService = NULL;
272
273
274
275 /**
276 * Release all static memory held by breakiterator.
277 */
278 U_CDECL_BEGIN
279 static UBool U_CALLCONV breakiterator_cleanup(void) {
280 #if !UCONFIG_NO_SERVICE
281 if (gService) {
282 delete gService;
283 gService = NULL;
284 }
285 gInitOnce.reset();
286 #endif
287 return TRUE;
288 }
289 U_CDECL_END
290 U_NAMESPACE_BEGIN
291
292 static void U_CALLCONV
293 initService(void) {
294 gService = new ICUBreakIteratorService();
295 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup);
296 }
297
298 static ICULocaleService*
299 getService(void)
300 {
301 umtx_initOnce(gInitOnce, &initService);
302 return gService;
303 }
304
305
306 // -------------------------------------
307
308 static inline UBool
309 hasService(void)
310 {
311 return !gInitOnce.isReset() && getService() != NULL;
312 }
313
314 // -------------------------------------
315
316 URegistryKey U_EXPORT2
317 BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status)
318 {
319 ICULocaleService *service = getService();
320 if (service == NULL) {
321 status = U_MEMORY_ALLOCATION_ERROR;
322 return NULL;
323 }
324 return service->registerInstance(toAdopt, locale, kind, status);
325 }
326
327 // -------------------------------------
328
329 UBool U_EXPORT2
330 BreakIterator::unregister(URegistryKey key, UErrorCode& status)
331 {
332 if (U_SUCCESS(status)) {
333 if (hasService()) {
334 return gService->unregister(key, status);
335 }
336 status = U_MEMORY_ALLOCATION_ERROR;
337 }
338 return FALSE;
339 }
340
341 // -------------------------------------
342
343 StringEnumeration* U_EXPORT2
344 BreakIterator::getAvailableLocales(void)
345 {
346 ICULocaleService *service = getService();
347 if (service == NULL) {
348 return NULL;
349 }
350 return service->getAvailableLocales();
351 }
352 #endif /* UCONFIG_NO_SERVICE */
353
354 // -------------------------------------
355
356 BreakIterator*
357 BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)
358 {
359 if (U_FAILURE(status)) {
360 return NULL;
361 }
362
363 #if !UCONFIG_NO_SERVICE
364 if (hasService()) {
365 Locale actualLoc("");
366 BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status);
367 // TODO: The way the service code works in ICU 2.8 is that if
368 // there is a real registered break iterator, the actualLoc
369 // will be populated, but if the handleDefault path is taken
370 // (because nothing is registered that can handle the
371 // requested locale) then the actualLoc comes back empty. In
372 // that case, the returned object already has its actual/valid
373 // locale data populated (by makeInstance, which is what
374 // handleDefault calls), so we don't touch it. YES, A COMMENT
375 // THIS LONG is a sign of bad code -- so the action item is to
376 // revisit this in ICU 3.0 and clean it up/fix it/remove it.
377 if (U_SUCCESS(status) && (result != NULL) && *actualLoc.getName() != 0) {
378 U_LOCALE_BASED(locBased, *result);
379 locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName());
380 }
381 return result;
382 }
383 else
384 #endif
385 {
386 return makeInstance(loc, kind, status);
387 }
388 }
389
390 // -------------------------------------
391 enum { kKeyValueLenMax = 32 };
392
393 BreakIterator*
394 BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
395 {
396
397 if (U_FAILURE(status)) {
398 return NULL;
399 }
400 char lbType[kKeyValueLenMax];
401
402 BreakIterator *result = NULL;
403 switch (kind) {
404 case UBRK_CHARACTER:
405 result = BreakIterator::buildInstance(loc, "grapheme", kind, status);
406 break;
407 case UBRK_WORD:
408 result = BreakIterator::buildInstance(loc, "word", kind, status);
409 break;
410 case UBRK_LINE:
411 uprv_strcpy(lbType, "line");
412 {
413 char lbKeyValue[kKeyValueLenMax] = {0};
414 UErrorCode kvStatus = U_ZERO_ERROR;
415 int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus);
416 if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) {
417 uprv_strcat(lbType, "_");
418 uprv_strcat(lbType, lbKeyValue);
419 }
420 }
421 result = BreakIterator::buildInstance(loc, lbType, kind, status);
422 if (U_SUCCESS(status) && result != NULL) {
423 char lwKeyValue[kKeyValueLenMax] = {0};
424 UErrorCode kvStatus = U_ZERO_ERROR;
425 int32_t kLen = loc.getKeywordValue("lw", lwKeyValue, kKeyValueLenMax, kvStatus);
426 result->setKeepAll(U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(lwKeyValue,"keepall")==0);
427 }
428 break;
429 case UBRK_SENTENCE:
430 result = BreakIterator::buildInstance(loc, "sentence", kind, status);
431 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
432 {
433 char ssKeyValue[kKeyValueLenMax] = {0};
434 UErrorCode kvStatus = U_ZERO_ERROR;
435 int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);
436 if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) {
437 FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);
438 if (U_SUCCESS(kvStatus)) {
439 result = fbiBuilder->build(result, status);
440 delete fbiBuilder;
441 }
442 }
443 }
444 #endif
445 break;
446 case UBRK_TITLE:
447 result = BreakIterator::buildInstance(loc, "title", kind, status);
448 break;
449 default:
450 status = U_ILLEGAL_ARGUMENT_ERROR;
451 }
452
453 if (U_FAILURE(status)) {
454 return NULL;
455 }
456
457 return result;
458 }
459
460 Locale
461 BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
462 U_LOCALE_BASED(locBased, *this);
463 return locBased.getLocale(type, status);
464 }
465
466 const char *
467 BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
468 U_LOCALE_BASED(locBased, *this);
469 return locBased.getLocaleID(type, status);
470 }
471
472
473 // This implementation of getRuleStatus is a do-nothing stub, here to
474 // provide a default implementation for any derived BreakIterator classes that
475 // do not implement it themselves.
476 int32_t BreakIterator::getRuleStatus() const {
477 return 0;
478 }
479
480 // This implementation of getRuleStatusVec is a do-nothing stub, here to
481 // provide a default implementation for any derived BreakIterator classes that
482 // do not implement it themselves.
483 int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
484 if (U_FAILURE(status)) {
485 return 0;
486 }
487 if (capacity < 1) {
488 status = U_BUFFER_OVERFLOW_ERROR;
489 return 1;
490 }
491 *fillInVec = 0;
492 return 1;
493 }
494
495 BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) {
496 U_LOCALE_BASED(locBased, (*this));
497 locBased.setLocaleIDs(valid, actual);
498 }
499
500 U_NAMESPACE_END
501
502 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
503
504 //eof