X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..6be67b064733ad8f9e904623c29984bb874c1e0c:/icuSources/common/uniset_props.cpp diff --git a/icuSources/common/uniset_props.cpp b/icuSources/common/uniset_props.cpp index 6f82dfb7..0a137408 100644 --- a/icuSources/common/uniset_props.cpp +++ b/icuSources/common/uniset_props.cpp @@ -1,12 +1,14 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * -* Copyright (C) 1999-2010, International Business Machines +* Copyright (C) 1999-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: uniset_props.cpp -* encoding: US-ASCII +* encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * @@ -47,8 +49,6 @@ U_NAMESPACE_USE -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - // initial storage. Must be >= 0 // *** same as in uniset.cpp ! *** #define START_EXTRA 16 @@ -74,7 +74,7 @@ U_NAMESPACE_USE //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" -static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" +//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ @@ -97,38 +97,15 @@ static const char ASSIGNED[] = "Assigned"; // [:^Cn:] U_CDECL_BEGIN static UBool U_CALLCONV uset_cleanup(); -U_CDECL_END - -// Not a TriStateSingletonWrapper because we think the UnicodeSet constructor -// can only fail with an out-of-memory error -// if we have a correct pattern and the properties data is hardcoded and always available. -class UnicodeSetSingleton : public SimpleSingletonWrapper { -public: - UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) : - SimpleSingletonWrapper(s), fPattern(pattern) {} - UnicodeSet *getInstance(UErrorCode &errorCode) { - return SimpleSingletonWrapper::getInstance(createInstance, fPattern, errorCode); - } -private: - static void *createInstance(const void *context, UErrorCode &errorCode) { - UnicodeString pattern((const char *)context, -1, US_INV); - UnicodeSet *set=new UnicodeSet(pattern, errorCode); - if(set==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; - } - set->freeze(); - ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); - return set; - } - const char *fPattern; +struct Inclusion { + UnicodeSet *fSet; + UInitOnce fInitOnce; }; +static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() -U_CDECL_BEGIN - -static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions() - -STATIC_SIMPLE_SINGLETON(uni32Singleton); +static UnicodeSet *uni32Singleton; +static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER; //---------------------------------------------------------------- // Inclusions list @@ -155,15 +132,16 @@ _set_addString(USet *set, const UChar *str, int32_t length) { * Cleanup function for UnicodeSet */ static UBool U_CALLCONV uset_cleanup(void) { - int32_t i; - - for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { - if (INCLUSIONS[i] != NULL) { - delete INCLUSIONS[i]; - INCLUSIONS[i] = NULL; - } + for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { + Inclusion &in = gInclusions[i]; + delete in.fSet; + in.fSet = NULL; + in.fInitOnce.reset(); } - UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance(); + + delete uni32Singleton; + uni32Singleton = NULL; + uni32InitOnce.reset(); return TRUE; } @@ -172,110 +150,131 @@ U_CDECL_END U_NAMESPACE_BEGIN /* -Reduce excessive reallocation, and make it easier to detect initialization -problems. +Reduce excessive reallocation, and make it easier to detect initialization problems. Usually you don't see smaller sets than this for Unicode 5.0. */ #define DEFAULT_INCLUSION_CAPACITY 3072 -const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { - UBool needInit; - UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit); - if (needInit) { - UnicodeSet* incl = new UnicodeSet(); - USetAdder sa = { - (USet *)incl, - _set_add, - _set_addRange, - _set_addString, - NULL, // don't need remove() - NULL // don't need removeRange() - }; - incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); - if (incl != NULL) { - switch(src) { - case UPROPS_SRC_CHAR: - uchar_addPropertyStarts(&sa, &status); - break; - case UPROPS_SRC_PROPSVEC: - upropsvec_addPropertyStarts(&sa, &status); - break; - case UPROPS_SRC_CHAR_AND_PROPSVEC: - uchar_addPropertyStarts(&sa, &status); - upropsvec_addPropertyStarts(&sa, &status); - break; +void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) { + // This function is invoked only via umtx_initOnce(). + // This function is a friend of class UnicodeSet. + + U_ASSERT(src >=0 && srcensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); + switch(src) { + case UPROPS_SRC_CHAR: + uchar_addPropertyStarts(&sa, &status); + break; + case UPROPS_SRC_PROPSVEC: + upropsvec_addPropertyStarts(&sa, &status); + break; + case UPROPS_SRC_CHAR_AND_PROPSVEC: + uchar_addPropertyStarts(&sa, &status); + upropsvec_addPropertyStarts(&sa, &status); + break; #if !UCONFIG_NO_NORMALIZATION - case UPROPS_SRC_CASE_AND_NORM: { - const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); - if(U_SUCCESS(status)) { - impl->addPropertyStarts(&sa, status); - } - ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); - break; - } - case UPROPS_SRC_NFC: { - const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); - if(U_SUCCESS(status)) { - impl->addPropertyStarts(&sa, status); - } - break; - } - case UPROPS_SRC_NFKC: { - const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); - if(U_SUCCESS(status)) { - impl->addPropertyStarts(&sa, status); - } - break; - } - case UPROPS_SRC_NFKC_CF: { - const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); - if(U_SUCCESS(status)) { - impl->addPropertyStarts(&sa, status); - } - break; - } - case UPROPS_SRC_NFC_CANON_ITER: { - const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); - if(U_SUCCESS(status)) { - impl->addCanonIterPropertyStarts(&sa, status); - } - break; - } -#endif - case UPROPS_SRC_CASE: - ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); - break; - case UPROPS_SRC_BIDI: - ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); - break; - default: - status = U_INTERNAL_PROGRAM_ERROR; - break; - } - if (U_SUCCESS(status)) { - // Compact for caching - incl->compact(); - umtx_lock(NULL); - if (INCLUSIONS[src] == NULL) { - INCLUSIONS[src] = incl; - incl = NULL; - ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); - } - umtx_unlock(NULL); - } - delete incl; - } else { - status = U_MEMORY_ALLOCATION_ERROR; + case UPROPS_SRC_CASE_AND_NORM: { + const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); + if(U_SUCCESS(status)) { + impl->addPropertyStarts(&sa, status); } + ucase_addPropertyStarts(&sa, &status); + break; } - return INCLUSIONS[src]; + case UPROPS_SRC_NFC: { + const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); + if(U_SUCCESS(status)) { + impl->addPropertyStarts(&sa, status); + } + break; + } + case UPROPS_SRC_NFKC: { + const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); + if(U_SUCCESS(status)) { + impl->addPropertyStarts(&sa, status); + } + break; + } + case UPROPS_SRC_NFKC_CF: { + const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); + if(U_SUCCESS(status)) { + impl->addPropertyStarts(&sa, status); + } + break; + } + case UPROPS_SRC_NFC_CANON_ITER: { + const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); + if(U_SUCCESS(status)) { + impl->addCanonIterPropertyStarts(&sa, status); + } + break; + } +#endif + case UPROPS_SRC_CASE: + ucase_addPropertyStarts(&sa, &status); + break; + case UPROPS_SRC_BIDI: + ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); + break; + default: + status = U_INTERNAL_PROGRAM_ERROR; + break; + } + + if (U_FAILURE(status)) { + delete incl; + incl = NULL; + return; + } + // Compact for caching + incl->compact(); + ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); } + + +const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { + U_ASSERT(src >=0 && srcfreeze(); + } + ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); +} + U_CFUNC UnicodeSet * uniset_getUnicode32Instance(UErrorCode &errorCode) { - return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode); + umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); + return uni32Singleton; } // helper functions for matching of pattern syntax pieces ------------------ *** @@ -331,65 +330,15 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), fFlags(0) -{ - if(U_SUCCESS(status)){ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - /* test for NULL */ - if(list == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - }else{ - allocateStrings(status); - applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); - } - } - _dbgct(this); -} - -/** - * Constructs a set from the given pattern, optionally ignoring - * white space. See the class description for the syntax of the - * pattern language. - * @param pattern a string specifying what characters are in the set - * @param options bitmask for options to apply to the pattern. - * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. - */ -UnicodeSet::UnicodeSet(const UnicodeString& pattern, - uint32_t options, - const SymbolTable* symbols, - UErrorCode& status) : - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - if(U_SUCCESS(status)){ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - /* test for NULL */ - if(list == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - }else{ - allocateStrings(status); - applyPattern(pattern, options, symbols, status); - } - } - _dbgct(this); -} - -UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, - uint32_t options, - const SymbolTable* symbols, - UErrorCode& status) : - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) { if(U_SUCCESS(status)){ list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); /* test for NULL */ if(list == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; + status = U_MEMORY_ALLOCATION_ERROR; }else{ allocateStrings(status); - applyPattern(pattern, pos, options, symbols, status); + applyPattern(pattern, status); } } _dbgct(this); @@ -399,80 +348,48 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, // Public API //---------------------------------------------------------------- -/** - * Modifies this set to represent the set specified by the given - * pattern, optionally ignoring white space. See the class - * description for the syntax of the pattern language. - * @param pattern a string specifying what characters are in the set - * @param ignoreSpaces if true, all spaces in the - * pattern are ignored. Spaces are those characters for which - * uprv_isRuleWhiteSpace() is true. - * Characters preceded by '\\' are escaped, losing any special - * meaning they otherwise have. Spaces may be included by - * escaping them. - * @exception IllegalArgumentException if the pattern - * contains a syntax error. - */ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, UErrorCode& status) { - return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); -} - - -/** - * Modifies this set to represent the set specified by the given - * pattern, optionally ignoring white space. See the class - * description for the syntax of the pattern language. - * @param pattern a string specifying what characters are in the set - * @param options bitmask for options to apply to the pattern. - * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. - */ -UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, - uint32_t options, - const SymbolTable* symbols, - UErrorCode& status) { - if (U_FAILURE(status) || isFrozen()) { - return *this; - } - + // Equivalent to + // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); + // but without dependency on closeOver(). ParsePosition pos(0); - applyPattern(pattern, pos, options, symbols, status); + applyPatternIgnoreSpace(pattern, pos, NULL, status); if (U_FAILURE(status)) return *this; int32_t i = pos.getIndex(); - - if (options & USET_IGNORE_SPACE) { - // Skip over trailing whitespace - ICU_Utility::skipWhitespace(pattern, i, TRUE); - } - + // Skip over trailing whitespace + ICU_Utility::skipWhitespace(pattern, i, TRUE); if (i != pattern.length()) { status = U_ILLEGAL_ARGUMENT_ERROR; } return *this; } -UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, - ParsePosition& pos, - uint32_t options, - const SymbolTable* symbols, - UErrorCode& status) { - if (U_FAILURE(status) || isFrozen()) { - return *this; +void +UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, + ParsePosition& pos, + const SymbolTable* symbols, + UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + if (isFrozen()) { + status = U_NO_WRITE_PERMISSION; + return; } // Need to build the pattern in a temporary string because // _applyPattern calls add() etc., which set pat to empty. UnicodeString rebuiltPat; RuleCharacterIterator chars(pattern, symbols, pos); - applyPattern(chars, symbols, rebuiltPat, options, status); - if (U_FAILURE(status)) return *this; + applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status); + if (U_FAILURE(status)) return; if (chars.inVariable()) { // syntaxError(chars, "Extra chars in variable value"); status = U_MALFORMED_SET; - return *this; + return; } setPattern(rebuiltPat); - return *this; } /** @@ -525,6 +442,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, const SymbolTable* symbols, UnicodeString& rebuiltPat, uint32_t options, + UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), UErrorCode& ec) { if (U_FAILURE(ec)) return; @@ -661,7 +579,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, } switch (setMode) { case 1: - nested->applyPattern(chars, symbols, patLocal, options, ec); + nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec); break; case 2: chars.skipIgnored(opts); @@ -740,7 +658,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, c = chars.next(opts, literal, ec); if (U_FAILURE(ec)) return; if (c == 0x5D /*']'*/ && !literal) { - patLocal.append(HYPHEN_RIGHT_BRACE); + patLocal.append(HYPHEN_RIGHT_BRACE, 2); mode = 2; continue; } @@ -893,10 +811,10 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, * patterns like /[^abc]/i work. */ if ((options & USET_CASE_INSENSITIVE) != 0) { - closeOver(USET_CASE_INSENSITIVE); + (this->*caseClosure)(USET_CASE_INSENSITIVE); } else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { - closeOver(USET_ADD_CASE_MAPPINGS); + (this->*caseClosure)(USET_ADD_CASE_MAPPINGS); } if (invert) { complement(); @@ -1118,17 +1036,13 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); return *this; } - break; case UCHAR_NAME: - case UCHAR_UNICODE_1_NAME: { // Must munge name, since u_charFromName() does not do // 'loose' matching. char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); - UCharNameChoice choice = (p == UCHAR_NAME) ? - U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME; - UChar32 ch = u_charFromName(choice, buf, &ec); + UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec); if (U_SUCCESS(ec)) { clear(); add(ch); @@ -1137,7 +1051,9 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, FAIL(ec); } } - break; + case UCHAR_UNICODE_1_NAME: + // ICU 49 deprecates the Unicode_1_Name property APIs. + FAIL(ec); case UCHAR_AGE: { // Must munge name, since u_versionFromString() does not do @@ -1149,7 +1065,6 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); return *this; } - break; case UCHAR_SCRIPT_EXTENSIONS: v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); if (v == UCHAR_INVALID_CODE) { @@ -1304,7 +1219,12 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, } // Look for the matching close delimiter, either :] or } - int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos); + int32_t close; + if (posix) { + close = pattern.indexOf(POSIX_CLOSE, 2, pos); + } else { + close = pattern.indexOf(CLOSE_BRACE, pos); + } if (close < 0) { // Syntax error; close delimiter missing FAIL(ec); @@ -1379,126 +1299,4 @@ void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, rebuiltPat.append(pattern, 0, pos.getIndex()); } -//---------------------------------------------------------------- -// Case folding API -//---------------------------------------------------------------- - -// add the result of a full case mapping to the set -// use str as a temporary string to avoid constructing one -static inline void -addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) { - if(result >= 0) { - if(result > UCASE_MAX_STRING_LENGTH) { - // add a single-code point case mapping - set.add(result); - } else { - // add a string case mapping from full with length result - str.setTo((UBool)FALSE, full, result); - set.add(str); - } - } - // result < 0: the code point mapped to itself, no need to add it - // see ucase.h -} - -UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { - if (isFrozen() || isBogus()) { - return *this; - } - if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { - const UCaseProps *csp = ucase_getSingleton(); - { - UnicodeSet foldSet(*this); - UnicodeString str; - USetAdder sa = { - foldSet.toUSet(), - _set_add, - _set_addRange, - _set_addString, - NULL, // don't need remove() - NULL // don't need removeRange() - }; - - // start with input set to guarantee inclusion - // USET_CASE: remove strings because the strings will actually be reduced (folded); - // therefore, start with no strings and add only those needed - if (attribute & USET_CASE_INSENSITIVE) { - foldSet.strings->removeAllElements(); - } - - int32_t n = getRangeCount(); - UChar32 result; - const UChar *full; - int32_t locCache = 0; - - for (int32_t i=0; isize() > 0) { - if (attribute & USET_CASE_INSENSITIVE) { - for (int32_t j=0; jsize(); ++j) { - str = *(const UnicodeString *) strings->elementAt(j); - str.foldCase(); - if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) { - foldSet.add(str); // does not map to code points: add the folded string itself - } - } - } else { - Locale root(""); -#if !UCONFIG_NO_BREAK_ITERATION - UErrorCode status = U_ZERO_ERROR; - BreakIterator *bi = BreakIterator::createWordInstance(root, status); - if (U_SUCCESS(status)) { -#endif - const UnicodeString *pStr; - - for (int32_t j=0; jsize(); ++j) { - pStr = (const UnicodeString *) strings->elementAt(j); - (str = *pStr).toLower(root); - foldSet.add(str); -#if !UCONFIG_NO_BREAK_ITERATION - (str = *pStr).toTitle(bi, root); - foldSet.add(str); -#endif - (str = *pStr).toUpper(root); - foldSet.add(str); - (str = *pStr).foldCase(); - foldSet.add(str); - } -#if !UCONFIG_NO_BREAK_ITERATION - } - delete bi; -#endif - } - } - *this = foldSet; - } - } - return *this; -} - U_NAMESPACE_END