X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..ef6cf650f4a75c3f97de06b51fa104f2069b9ea2:/icuSources/common/brkiter.cpp?ds=inline diff --git a/icuSources/common/brkiter.cpp b/icuSources/common/brkiter.cpp index 8db038ff..065e6df9 100644 --- a/icuSources/common/brkiter.cpp +++ b/icuSources/common/brkiter.cpp @@ -1,10 +1,10 @@ /* ******************************************************************************* -* Copyright (C) 1997-2001, International Business Machines Corporation and * -* others. All Rights Reserved. * +* Copyright (C) 1997-2016, International Business Machines Corporation and +* others. All Rights Reserved. ******************************************************************************* * -* File TXTBDRY.CPP +* File brkiter.cpp * * Modification History: * @@ -22,13 +22,22 @@ #if !UCONFIG_NO_BREAK_ITERATION -#include "unicode/dbbi.h" +#include "unicode/rbbi.h" #include "unicode/brkiter.h" #include "unicode/udata.h" -#include "unicode/resbund.h" +#include "unicode/ures.h" +#include "unicode/ustring.h" +#include "unicode/filteredbrk.h" +#include "ucln_cmn.h" #include "cstring.h" -#include "mutex.h" -#include "iculserv.h" +#include "umutex.h" +#include "servloc.h" +#include "locbased.h" +#include "uresimp.h" +#include "uassert.h" +#include "ubrkimpl.h" +#include "charstr.h" +#include "unicode/filteredbrk.h" // ***************************************************************************** // class BreakIterator @@ -39,253 +48,150 @@ U_NAMESPACE_BEGIN -const int32_t BreakIterator::DONE = (int32_t)-1; - // ------------------------------------- -// Creates a break iterator for word breaks. -BreakIterator* -BreakIterator::createWordInstance(const Locale& key, UErrorCode& status) -{ - return createInstance(key, UBRK_WORD, status); -} - BreakIterator* -BreakIterator::makeWordInstance(const Locale& key, UErrorCode& status) +BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status) { - // WARNING: This routine is currently written specifically to handle only the - // default rules files and the alternate rules files for Thai. This function - // will have to be made fully general at some time in the future! - BreakIterator* result = NULL; - const char* filename = "word"; + char fnbuff[256]; + char ext[4]={'\0'}; + CharString actualLocale; + int32_t size; + const UChar* brkfname = NULL; + UResourceBundle brkRulesStack; + UResourceBundle brkNameStack; + UResourceBundle *brkRules = &brkRulesStack; + UResourceBundle *brkName = &brkNameStack; + RuleBasedBreakIterator *result = NULL; if (U_FAILURE(status)) return NULL; - if (!uprv_strcmp(key.getLanguage(), "th")) - { - filename = "word_th"; + ures_initStackObject(brkRules); + ures_initStackObject(brkName); + + // Get the locale + UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status); + + // Get the "boundaries" array. + if (U_SUCCESS(status)) { + brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status); + // Get the string object naming the rules file + brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status); + // Get the actual string + brkfname = ures_getString(brkName, &size, &status); + U_ASSERT((size_t)size=sizeof(fnbuff)) { + size=0; + if (U_SUCCESS(status)) { + status = U_BUFFER_OVERFLOW_ERROR; + } + } + + // Use the string if we found it + if (U_SUCCESS(status) && brkfname) { + actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status); + + UChar* extStart=u_strchr(brkfname, 0x002e); + int len = 0; + if(extStart!=NULL){ + len = (int)(extStart-brkfname); + u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff + u_UCharsToChars(brkfname, fnbuff, len); + } + fnbuff[len]=0; // nul terminate + } } - UDataMemory* file = udata_open(NULL, "brk", filename, &status); + ures_close(brkRules); + ures_close(brkName); + + UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status); if (U_FAILURE(status)) { + ures_close(b); return NULL; } - // The UDataMemory is adopted by the break iterator. - if(!uprv_strcmp(filename, "word_th")) { - filename = "thaidict.brk"; - result = new DictionaryBasedBreakIterator(file, filename, status); + // Create a RuleBasedBreakIterator + result = new RuleBasedBreakIterator(file, status); + + // If there is a result, set the valid locale and actual locale, and the kind + if (U_SUCCESS(status) && result != NULL) { + U_LOCALE_BASED(locBased, *(BreakIterator*)result); + locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), + actualLocale.data()); + result->setBreakType(kind); } - else { - result = new RuleBasedBreakIterator(file, status); + + ures_close(b); + + if (U_FAILURE(status) && result != NULL) { // Sometimes redundant check, but simple + delete result; + return NULL; } + if (result == NULL) { udata_close(file); - status = U_MEMORY_ALLOCATION_ERROR; - } - if (U_FAILURE(status)) { // Sometimes redundant check, but simple. - delete result; - result = NULL; + if (U_SUCCESS(status)) { + status = U_MEMORY_ALLOCATION_ERROR; + } } return result; } +// Creates a break iterator for word breaks. +BreakIterator* U_EXPORT2 +BreakIterator::createWordInstance(const Locale& key, UErrorCode& status) +{ + return createInstance(key, UBRK_WORD, status); +} + // ------------------------------------- // Creates a break iterator for line breaks. -BreakIterator* +BreakIterator* U_EXPORT2 BreakIterator::createLineInstance(const Locale& key, UErrorCode& status) { - return createInstance(key, UBRK_LINE, status); -} - -BreakIterator* -BreakIterator::makeLineInstance(const Locale& key, UErrorCode& status) -{ - // WARNING: This routine is currently written specifically to handle only the - // default rules files and the alternate rules files for Thai. This function - // will have to be made fully general at some time in the future! - BreakIterator* result = NULL; - const char* filename = "line"; - - if (U_FAILURE(status)) - return NULL; - - if (!uprv_strcmp(key.getLanguage(), "th")) - { - filename = "line_th"; - } - - UDataMemory* file = udata_open(NULL, "brk", filename, &status); - if (U_FAILURE(status)) { - return NULL; - } - // The UDataMemory is adopted by the break iterator. - - if (!uprv_strcmp(key.getLanguage(), "th")) { - filename = "thaidict.brk"; - result = new DictionaryBasedBreakIterator(file, filename, status); - } - else { - result = new RuleBasedBreakIterator(file, status); - } - if (result == NULL) { - udata_close(file); - status = U_MEMORY_ALLOCATION_ERROR; - } - if (U_FAILURE(status)) { // Sometimes redundant check, but simple. - delete result; - result = NULL; - } - return result; + return createInstance(key, UBRK_LINE, status); } // ------------------------------------- // Creates a break iterator for character breaks. -BreakIterator* +BreakIterator* U_EXPORT2 BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status) { - return createInstance(key, UBRK_CHARACTER, status); -} - -BreakIterator* -BreakIterator::makeCharacterInstance(const Locale& /* key */, UErrorCode& status) -{ - // WARNING: This routine is currently written specifically to handle only the - // default rules files and the alternate rules files for Thai. This function - // will have to be made fully general at some time in the future! - BreakIterator* result = NULL; - static const char filename[] = "char"; - - if (U_FAILURE(status)) - return NULL; - UDataMemory* file = udata_open(NULL, "brk", filename, &status); - if (U_FAILURE(status)) { - return NULL; - } - // The UDataMemory is adopted by the break iterator. - - result = new RuleBasedBreakIterator(file, status); - if (result == NULL) { - udata_close(file); - status = U_MEMORY_ALLOCATION_ERROR; - } - if (U_FAILURE(status)) { // Sometimes redundant check, but simple. - delete result; - result = NULL; - } - return result; + return createInstance(key, UBRK_CHARACTER, status); } // ------------------------------------- // Creates a break iterator for sentence breaks. -BreakIterator* +BreakIterator* U_EXPORT2 BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status) { - return createInstance(key, UBRK_SENTENCE, status); -} - -BreakIterator* -BreakIterator::makeSentenceInstance(const Locale& /*key */, UErrorCode& status) -{ - // WARNING: This routine is currently written specifically to handle only the - // default rules files and the alternate rules files for Thai. This function - // will have to be made fully general at some time in the future! - BreakIterator* result = NULL; - static const char filename[] = "sent"; - - if (U_FAILURE(status)) - return NULL; - UDataMemory* file = udata_open(NULL, "brk", filename, &status); - if (U_FAILURE(status)) { - return NULL; - } - // The UDataMemory is adopted by the break iterator. - - result = new RuleBasedBreakIterator(file, status); - if (result == NULL) { - udata_close(file); - status = U_MEMORY_ALLOCATION_ERROR; - } - if (U_FAILURE(status)) { // Sometimes redundant check, but simple. - delete result; - result = NULL; - } - - return result; + return createInstance(key, UBRK_SENTENCE, status); } // ------------------------------------- // Creates a break iterator for title casing breaks. -BreakIterator* +BreakIterator* U_EXPORT2 BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status) { - return createInstance(key, UBRK_TITLE, status); -} - -BreakIterator* -BreakIterator::makeTitleInstance(const Locale& /* key */, UErrorCode& status) -{ - // WARNING: This routine is currently written specifically to handle only the - // default rules files. This function will have to be made fully general - // at some time in the future! - BreakIterator* result = NULL; - static const char filename[] = "title"; - - if (U_FAILURE(status)) - return NULL; - UDataMemory* file = udata_open(NULL, "brk", filename, &status); - if (U_FAILURE(status)) { - return NULL; - } - // The UDataMemory is adopted by the break iterator. - - result = new RuleBasedBreakIterator(file, status); - if (result == NULL) { - udata_close(file); - status = U_MEMORY_ALLOCATION_ERROR; - } - if (U_FAILURE(status)) { // Sometimes redundant check, but simple. - delete result; - result = NULL; - } - - return result; + return createInstance(key, UBRK_TITLE, status); } // ------------------------------------- // Gets all the available locales that has localized text boundary data. -const Locale* +const Locale* U_EXPORT2 BreakIterator::getAvailableLocales(int32_t& count) { return Locale::getAvailableLocales(count); } -// ------------------------------------- -// Gets the objectLocale display name in the default locale language. -UnicodeString& -BreakIterator::getDisplayName(const Locale& objectLocale, - UnicodeString& name) -{ - return objectLocale.getDisplayName(name); -} - -// ------------------------------------- -// Gets the objectLocale display name in the displayLocale language. -UnicodeString& -BreakIterator::getDisplayName(const Locale& objectLocale, - const Locale& displayLocale, - UnicodeString& name) -{ - return objectLocale.getDisplayName(displayLocale, name); -} - // ------------------------------------------ // // Default constructor and destructor @@ -293,8 +199,9 @@ BreakIterator::getDisplayName(const Locale& objectLocale, //------------------------------------------- BreakIterator::BreakIterator() +: fKeepAll(FALSE) { - fBufferClone = FALSE; + *validLocale = *actualLocale = 0; } BreakIterator::~BreakIterator() @@ -306,156 +213,288 @@ BreakIterator::~BreakIterator() // Registration // //------------------------------------------- - -static ICULocaleService* gService = NULL; +#if !UCONFIG_NO_SERVICE // ------------------------------------- class ICUBreakIteratorFactory : public ICUResourceBundleFactory { +public: + virtual ~ICUBreakIteratorFactory(); protected: - virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* service, UErrorCode& status) const { - return BreakIterator::makeInstance(loc, kind, status); - } + virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const { + return BreakIterator::makeInstance(loc, kind, status); + } }; +ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {} + // ------------------------------------- class ICUBreakIteratorService : public ICULocaleService { public: - ICUBreakIteratorService() - : ICULocaleService("Break Iterator") - { - UErrorCode status = U_ZERO_ERROR; - registerFactory(new ICUBreakIteratorFactory(), status); - } - - virtual UObject* cloneInstance(UObject* instance) const { - return ((BreakIterator*)instance)->clone(); - } - - virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* actualID, UErrorCode& status) const { - LocaleKey& lkey = (LocaleKey&)key; - int32_t kind = lkey.kind(); - Locale loc; - lkey.currentLocale(loc); - return BreakIterator::makeInstance(loc, kind, status); - } - - virtual UBool isDefault() const { - return countFactories() == 1; - } + ICUBreakIteratorService() + : ICULocaleService(UNICODE_STRING("Break Iterator", 14)) + { + UErrorCode status = U_ZERO_ERROR; + registerFactory(new ICUBreakIteratorFactory(), status); + } + + virtual ~ICUBreakIteratorService(); + + virtual UObject* cloneInstance(UObject* instance) const { + return ((BreakIterator*)instance)->clone(); + } + + virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const { + LocaleKey& lkey = (LocaleKey&)key; + int32_t kind = lkey.kind(); + Locale loc; + lkey.currentLocale(loc); + return BreakIterator::makeInstance(loc, kind, status); + } + + virtual UBool isDefault() const { + return countFactories() == 1; + } }; +ICUBreakIteratorService::~ICUBreakIteratorService() {} + // ------------------------------------- -static ICULocaleService* -getService(void) -{ - UBool needsInit; - umtx_lock(NULL); - needsInit = (UBool)(gService == NULL); - umtx_unlock(NULL); - - if (needsInit) { - ICULocaleService *tService = new ICUBreakIteratorService(); - umtx_lock(NULL); - if (gService == NULL) { - gService = tService; - tService = NULL; - } - umtx_unlock(NULL); - delete tService; +// defined in ucln_cmn.h +U_NAMESPACE_END + +static icu::UInitOnce gInitOnce; +static icu::ICULocaleService* gService = NULL; + + + +/** + * Release all static memory held by breakiterator. + */ +U_CDECL_BEGIN +static UBool U_CALLCONV breakiterator_cleanup(void) { +#if !UCONFIG_NO_SERVICE + if (gService) { + delete gService; + gService = NULL; } - return gService; + gInitOnce.reset(); +#endif + return TRUE; } +U_CDECL_END +U_NAMESPACE_BEGIN -// ------------------------------------- +static void U_CALLCONV +initService(void) { + gService = new ICUBreakIteratorService(); + ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup); +} -static UBool -hasService(void) +static ICULocaleService* +getService(void) { - Mutex mutex; - return gService != NULL; + umtx_initOnce(gInitOnce, &initService); + return gService; } + // ------------------------------------- -BreakIterator* -BreakIterator::createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status) +static inline UBool +hasService(void) { - if (U_FAILURE(status)) { - return NULL; - } - - if (hasService()) { - return (BreakIterator*)gService->get(loc, kind, status); - } else { - return makeInstance(loc, kind, status); - } + return !gInitOnce.isReset() && getService() != NULL; } // ------------------------------------- -URegistryKey -BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status) +URegistryKey U_EXPORT2 +BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status) { - return getService()->registerInstance(toAdopt, locale, kind, status); + ICULocaleService *service = getService(); + if (service == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + return service->registerInstance(toAdopt, locale, kind, status); } // ------------------------------------- -UBool -BreakIterator::unregister(URegistryKey key, UErrorCode& status) +UBool U_EXPORT2 +BreakIterator::unregister(URegistryKey key, UErrorCode& status) { if (U_SUCCESS(status)) { if (hasService()) { return gService->unregister(key, status); } - status = U_ILLEGAL_ARGUMENT_ERROR; + status = U_MEMORY_ALLOCATION_ERROR; } return FALSE; } // ------------------------------------- -StringEnumeration* +StringEnumeration* U_EXPORT2 BreakIterator::getAvailableLocales(void) { - return getService()->getAvailableLocales(); + ICULocaleService *service = getService(); + if (service == NULL) { + return NULL; + } + return service->getAvailableLocales(); +} +#endif /* UCONFIG_NO_SERVICE */ + +// ------------------------------------- + +BreakIterator* +BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status) +{ + if (U_FAILURE(status)) { + return NULL; + } + +#if !UCONFIG_NO_SERVICE + if (hasService()) { + Locale actualLoc(""); + BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status); + // TODO: The way the service code works in ICU 2.8 is that if + // there is a real registered break iterator, the actualLoc + // will be populated, but if the handleDefault path is taken + // (because nothing is registered that can handle the + // requested locale) then the actualLoc comes back empty. In + // that case, the returned object already has its actual/valid + // locale data populated (by makeInstance, which is what + // handleDefault calls), so we don't touch it. YES, A COMMENT + // THIS LONG is a sign of bad code -- so the action item is to + // revisit this in ICU 3.0 and clean it up/fix it/remove it. + if (U_SUCCESS(status) && (result != NULL) && *actualLoc.getName() != 0) { + U_LOCALE_BASED(locBased, *result); + locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName()); + } + return result; + } + else +#endif + { + return makeInstance(loc, kind, status); + } } // ------------------------------------- +enum { kKeyValueLenMax = 32 }; -BreakIterator* +BreakIterator* BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) { + + if (U_FAILURE(status)) { + return NULL; + } + char lbType[kKeyValueLenMax]; + + BreakIterator *result = NULL; switch (kind) { - case UBRK_CHARACTER: return BreakIterator::makeCharacterInstance(loc, status); - case UBRK_WORD: return BreakIterator::makeWordInstance(loc, status); - case UBRK_LINE: return BreakIterator::makeLineInstance(loc, status); - case UBRK_SENTENCE: return BreakIterator::makeSentenceInstance(loc, status); - case UBRK_TITLE: return BreakIterator::makeTitleInstance(loc, status); + case UBRK_CHARACTER: + result = BreakIterator::buildInstance(loc, "grapheme", kind, status); + break; + case UBRK_WORD: + result = BreakIterator::buildInstance(loc, "word", kind, status); + break; + case UBRK_LINE: + uprv_strcpy(lbType, "line"); + { + char lbKeyValue[kKeyValueLenMax] = {0}; + UErrorCode kvStatus = U_ZERO_ERROR; + int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus); + if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) { + uprv_strcat(lbType, "_"); + uprv_strcat(lbType, lbKeyValue); + } + } + result = BreakIterator::buildInstance(loc, lbType, kind, status); + if (U_SUCCESS(status) && result != NULL) { + char lwKeyValue[kKeyValueLenMax] = {0}; + UErrorCode kvStatus = U_ZERO_ERROR; + int32_t kLen = loc.getKeywordValue("lw", lwKeyValue, kKeyValueLenMax, kvStatus); + result->setKeepAll(U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(lwKeyValue,"keepall")==0); + } + break; + case UBRK_SENTENCE: + result = BreakIterator::buildInstance(loc, "sentence", kind, status); + { + char ssKeyValue[kKeyValueLenMax] = {0}; + UErrorCode kvStatus = U_ZERO_ERROR; + int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus); + if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) { + FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus); + if (U_SUCCESS(kvStatus)) { + result = fbiBuilder->build(result, status); + delete fbiBuilder; + } + } + } + break; + case UBRK_TITLE: + result = BreakIterator::buildInstance(loc, "title", kind, status); + break; default: - status = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; + status = U_ILLEGAL_ARGUMENT_ERROR; + } + + if (U_FAILURE(status)) { + return NULL; } + + return result; } -U_NAMESPACE_END +Locale +BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const { + U_LOCALE_BASED(locBased, *this); + return locBased.getLocale(type, status); +} -// defined in ucln_cmn.h +const char * +BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const { + U_LOCALE_BASED(locBased, *this); + return locBased.getLocaleID(type, status); +} -/** - * Release all static memory held by breakiterator. - */ -U_CFUNC UBool breakiterator_cleanup(void) { - if (gService) { - delete gService; - gService = NULL; - } - return TRUE; + +// This implementation of getRuleStatus is a do-nothing stub, here to +// provide a default implementation for any derived BreakIterator classes that +// do not implement it themselves. +int32_t BreakIterator::getRuleStatus() const { + return 0; } +// This implementation of getRuleStatusVec is a do-nothing stub, here to +// provide a default implementation for any derived BreakIterator classes that +// do not implement it themselves. +int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) { + if (U_FAILURE(status)) { + return 0; + } + if (capacity < 1) { + status = U_BUFFER_OVERFLOW_ERROR; + return 1; + } + *fillInVec = 0; + return 1; +} + +BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) { + U_LOCALE_BASED(locBased, (*this)); + locBased.setLocaleIDs(valid, actual); +} + +U_NAMESPACE_END + #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ //eof