X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/374ca955a76ecab1204ca8bfa63ff9238d998416..ef6cf650f4a75c3f97de06b51fa104f2069b9ea2:/icuSources/common/unormcmp.cpp diff --git a/icuSources/common/unormcmp.cpp b/icuSources/common/unormcmp.cpp index 106e8399..1b072c4c 100644 --- a/icuSources/common/unormcmp.cpp +++ b/icuSources/common/unormcmp.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2001-2004, International Business Machines +* Copyright (C) 2001-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -22,14 +22,15 @@ #if !UCONFIG_NO_NORMALIZATION -#include "unicode/ustring.h" #include "unicode/unorm.h" -#include "unicode/uniset.h" -#include "unormimp.h" -#include "ucase.h" +#include "unicode/ustring.h" #include "cmemory.h" +#include "normalizer2impl.h" +#include "ucase.h" +#include "uprops.h" +#include "ustr_imp.h" -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) +U_NAMESPACE_USE /* compare canonically equivalent ------------------------------------------- */ @@ -121,9 +122,6 @@ * Note that all of this is only a problem when case-folding _and_ * canonical equivalence come together. * (Comments in unorm_compare() are more up to date than this TODO.) - * - * This function could be moved to a different source file, at increased cost - * for calling the decomposition access function. */ /* stack element for previous-level source/decomposition pointers */ @@ -132,13 +130,20 @@ struct CmpEquivLevel { }; typedef struct CmpEquivLevel CmpEquivLevel; +/** + * Internal option for unorm_cmpEquivFold() for decomposing. + * If not set, just do strcasecmp(). 
+ */ +#define _COMPARE_EQUIV 0x80000 + /* internal function */ static int32_t unorm_cmpEquivFold(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode) { - UCaseProps *csp; + const Normalizer2Impl *nfcImpl; + const UCaseProps *csp; /* current-level start/limit - s1/s2 as current */ const UChar *start1, *start2, *limit1, *limit2; @@ -150,7 +155,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, /* stacks of previous-level start/current/limit */ CmpEquivLevel stack1[2], stack2[2]; - /* decomposition buffers for Hangul */ + /* buffers for algorithmic decompositions */ UChar decomp1[4], decomp2[4]; /* case folding buffers, only use current-level start/limit */ @@ -171,19 +176,19 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, */ /* normalization/properties data loaded? */ - if( ((options&_COMPARE_EQUIV)!=0 && !unorm_haveData(pErrorCode)) || - U_FAILURE(*pErrorCode) - ) { - return 0; + if((options&_COMPARE_EQUIV)!=0) { + nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode); + } else { + nfcImpl=NULL; } if((options&U_COMPARE_IGNORE_CASE)!=0) { - csp=ucase_getSingleton(pErrorCode); - if(U_FAILURE(*pErrorCode)) { - return 0; - } + csp=ucase_getSingleton(); } else { csp=NULL; } + if(U_FAILURE(*pErrorCode)) { + return 0; + } /* initialize */ start1=s1; @@ -226,10 +231,10 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, /* reached end of level buffer, pop one level */ do { --level1; - start1=stack1[level1].start; + start1=stack1[level1].start; /*Not uninitialized*/ } while(start1==NULL); - s1=stack1[level1].s; - limit1=stack1[level1].limit; + s1=stack1[level1].s; /*Not uninitialized*/ + limit1=stack1[level1].limit; /*Not uninitialized*/ } } @@ -249,10 +254,10 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, /* reached end of level buffer, pop one level */ do { --level2; - start2=stack2[level2].start; + start2=stack2[level2].start; /*Not uninitialized*/ } while(start2==NULL); - 
s2=stack2[level2].s; - limit2=stack2[level2].limit; + s2=stack2[level2].s; /*Not uninitialized*/ + limit2=stack2[level2].limit; /*Not uninitialized*/ } } @@ -402,7 +407,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, } if( level1<2 && (options&_COMPARE_EQUIV) && - 0!=(p=unorm_getCanonicalDecomposition((UChar32)cp1, decomp1, &length)) + 0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length)) ) { /* cp1 decomposes into p[length] */ if(U_IS_SURROGATE(c1)) { @@ -443,7 +448,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, } if( level2<2 && (options&_COMPARE_EQUIV) && - 0!=(p=unorm_getCanonicalDecomposition((UChar32)cp2, decomp2, &length)) + 0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length)) ) { /* cp2 decomposes into p[length] */ if(U_IS_SURROGATE(c2)) { @@ -527,20 +532,42 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, } } +static +UBool _normalize(const Normalizer2 *n2, const UChar *s, int32_t length, + UnicodeString &normalized, UErrorCode *pErrorCode) { + UnicodeString str(length<0, s, length); + + // check if s fulfill the conditions + int32_t spanQCYes=n2->spanQuickCheckYes(str, *pErrorCode); + if (U_FAILURE(*pErrorCode)) { + return FALSE; + } + /* + * ICU 2.4 had a further optimization: + * If both strings were not in FCD, then they were both NFD'ed, + * and the _COMPARE_EQUIV option was turned off. + * It is not entirely clear that this is valid with the current + * definition of the canonical caseless match. + * Therefore, ICU 2.6 removes that optimization. 
+ */ + if(spanQCYes<str.length()) { + UnicodeString unnormalized=str.tempSubString(spanQCYes); + normalized.setTo(FALSE, str.getBuffer(), spanQCYes); + n2->normalizeSecondAndAppend(normalized, unnormalized, *pErrorCode); + if (U_SUCCESS(*pErrorCode)) { + return TRUE; + } + } + return FALSE; +} + U_CAPI int32_t U_EXPORT2 unorm_compare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode) { - UChar fcd1[300], fcd2[300]; - UChar *d1, *d2; - const UnicodeSet *nx; - UNormalizationMode mode; - int32_t normOptions; - int32_t result; - /* argument checking */ - if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { + if(U_FAILURE(*pErrorCode)) { return 0; } if(s1==0 || length1<-1 || s2==0 || length2<-1) { @@ -548,22 +575,9 @@ unorm_compare(const UChar *s1, int32_t length1, return 0; } - if(!unorm_haveData(pErrorCode)) { - return 0; - } - if(!uprv_haveProperties(pErrorCode)) { - return 0; - } - - normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT); - nx=unorm_getNX(normOptions, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - return 0; - } - - d1=d2=0; + UnicodeString fcd1, fcd2; + int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT); options|=_COMPARE_EQUIV; - result=0; /* * UAX #21 Case Mappings, as fixed for Unicode version 4 @@ -586,103 +600,45 @@ unorm_compare(const UChar *s1, int32_t length1, * are first decomposed or not, so an FCD check - a check only for * canonical order - is not sufficient.
*/ - if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) { - mode=UNORM_NFD; - options&=~UNORM_INPUT_IS_FCD; - } else { - mode=UNORM_FCD; - } - - if(!(options&UNORM_INPUT_IS_FCD)) { - int32_t _len1, _len2; - UBool isFCD1, isFCD2; - - // check if s1 and/or s2 fulfill the FCD conditions - isFCD1= UNORM_YES==unorm_internalQuickCheck(s1, length1, mode, TRUE, nx, pErrorCode); - isFCD2= UNORM_YES==unorm_internalQuickCheck(s2, length2, mode, TRUE, nx, pErrorCode); - if(U_FAILURE(*pErrorCode)) { + if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) { + const Normalizer2 *n2; + if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) { + n2=Normalizer2::getNFDInstance(*pErrorCode); + } else { + n2=Normalizer2Factory::getFCDInstance(*pErrorCode); + } + if (U_FAILURE(*pErrorCode)) { return 0; } - /* - * ICU 2.4 had a further optimization: - * If both strings were not in FCD, then they were both NFD'ed, - * and the _COMPARE_EQUIV option was turned off. - * It is not entirely clear that this is valid with the current - * definition of the canonical caseless match. - * Therefore, ICU 2.6 removes that optimization. 
- */ - - if(!isFCD1) { - _len1=unorm_internalNormalizeWithNX(fcd1, LENGTHOF(fcd1), - s1, length1, - mode, normOptions, nx, - pErrorCode); - if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - s1=fcd1; - } else { - d1=(UChar *)uprv_malloc(_len1*U_SIZEOF_UCHAR); - if(d1==0) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - - *pErrorCode=U_ZERO_ERROR; - _len1=unorm_internalNormalizeWithNX(d1, _len1, - s1, length1, - mode, normOptions, nx, - pErrorCode); - if(U_FAILURE(*pErrorCode)) { - goto cleanup; - } - - s1=d1; + if(normOptions&UNORM_UNICODE_3_2) { + const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode); + FilteredNormalizer2 fn2(*n2, *uni32); + if(_normalize(&fn2, s1, length1, fcd1, pErrorCode)) { + s1=fcd1.getBuffer(); + length1=fcd1.length(); } - length1=_len1; - } - - if(!isFCD2) { - _len2=unorm_internalNormalizeWithNX(fcd2, LENGTHOF(fcd2), - s2, length2, - mode, normOptions, nx, - pErrorCode); - if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - s2=fcd2; - } else { - d2=(UChar *)uprv_malloc(_len2*U_SIZEOF_UCHAR); - if(d2==0) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - goto cleanup; - } - - *pErrorCode=U_ZERO_ERROR; - _len2=unorm_internalNormalizeWithNX(d2, _len2, - s2, length2, - mode, normOptions, nx, - pErrorCode); - if(U_FAILURE(*pErrorCode)) { - goto cleanup; - } - - s2=d2; + if(_normalize(&fn2, s2, length2, fcd2, pErrorCode)) { + s2=fcd2.getBuffer(); + length2=fcd2.length(); + } + } else { + if(_normalize(n2, s1, length1, fcd1, pErrorCode)) { + s1=fcd1.getBuffer(); + length1=fcd1.length(); + } + if(_normalize(n2, s2, length2, fcd2, pErrorCode)) { + s2=fcd2.getBuffer(); + length2=fcd2.length(); } - length2=_len2; } } if(U_SUCCESS(*pErrorCode)) { - result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode); - } - -cleanup: - if(d1!=0) { - uprv_free(d1); - } - if(d2!=0) { - uprv_free(d2); + return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode); + } else { + return 0; } - - return result; } #endif /* #if 
!UCONFIG_NO_NORMALIZATION */