icuSources/common/filterednormalizer2.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2009-2010, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  filterednormalizer2.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2009dec10
  14 *   created by: Markus W. Scherer
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_NORMALIZATION
  20
  21 #include "unicode/normalizer2.h"
  22 #include "unicode/uniset.h"
  23 #include "unicode/unistr.h"
  24 #include "unicode/unorm.h"
  25 #include "cpputils.h"
  26
  27 U_NAMESPACE_BEGIN
  28
  29 UnicodeString &
  30 FilteredNormalizer2::normalize(const UnicodeString &src,
  31                                UnicodeString &dest,
  32                                UErrorCode &errorCode) const {
  33     uprv_checkCanGetBuffer(src, errorCode);
  34     if(U_FAILURE(errorCode)) {
  35         dest.setToBogus();
  36         return dest;
  37     }
  38     if(&dest==&src) {
  39         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  40         return dest;
  41     }
  42     dest.remove();
  43     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
  44 }
  45
  46 // Internal: No argument checking, and appends to dest.
  47 // Pass as input spanCondition the one that is likely to yield a non-zero
  48 // span length at the start of src.
  49 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
  50 // USET_SPAN_SIMPLE should be passed in for the start of src
  51 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
  52 // an in-filter prefix.
  53 UnicodeString &
  54 FilteredNormalizer2::normalize(const UnicodeString &src,
  55                                UnicodeString &dest,
  56                                USetSpanCondition spanCondition,
  57                                UErrorCode &errorCode) const {
  58     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
  59     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
  60         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
  61         int32_t spanLength=spanLimit-prevSpanLimit;
  62         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
  63             if(spanLength!=0) {
  64                 dest.append(src, prevSpanLimit, spanLength);
  65             }
  66             spanCondition=USET_SPAN_SIMPLE;
  67         } else {
  68             if(spanLength!=0) {
  69                 // Not norm2.normalizeSecondAndAppend() because we do not want
  70                 // to modify the non-filter part of dest.
  71                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
  72                                             tempDest, errorCode));
  73                 if(U_FAILURE(errorCode)) {
  74                     break;
  75                 }
  76             }
  77             spanCondition=USET_SPAN_NOT_CONTAINED;
  78         }
  79         prevSpanLimit=spanLimit;
  80     }
  81     return dest;
  82 }
  83
  84 UnicodeString &
  85 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
  86                                               const UnicodeString &second,
  87                                               UErrorCode &errorCode) const {
  88     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
  89 }
  90
  91 UnicodeString &
  92 FilteredNormalizer2::append(UnicodeString &first,
  93                             const UnicodeString &second,
  94                             UErrorCode &errorCode) const {
  95     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
  96 }
  97
  98 UnicodeString &
  99 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
 100                                               const UnicodeString &second,
 101                                               UBool doNormalize,
 102                                               UErrorCode &errorCode) const {
 103     uprv_checkCanGetBuffer(first, errorCode);
 104     uprv_checkCanGetBuffer(second, errorCode);
 105     if(U_FAILURE(errorCode)) {
 106         return first;
 107     }
 108     if(&first==&second) {
 109         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 110         return first;
 111     }
 112     if(first.isEmpty()) {
 113         if(doNormalize) {
 114             return normalize(second, first, errorCode);
 115         } else {
 116             return first=second;
 117         }
 118     }
 119     // merge the in-filter suffix of the first string with the in-filter prefix of the second
 120     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
 121     if(prefixLimit!=0) {
 122         UnicodeString prefix(second.tempSubString(0, prefixLimit));
 123         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
 124         if(suffixStart==0) {
 125             if(doNormalize) {
 126                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
 127             } else {
 128                 norm2.append(first, prefix, errorCode);
 129             }
 130         } else {
 131             UnicodeString middle(first, suffixStart, INT32_MAX);
 132             if(doNormalize) {
 133                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
 134             } else {
 135                 norm2.append(middle, prefix, errorCode);
 136             }
 137             first.replace(suffixStart, INT32_MAX, middle);
 138         }
 139     }
 140     if(prefixLimit<second.length()) {
 141         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
 142         if(doNormalize) {
 143             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
 144         } else {
 145             first.append(rest);
 146         }
 147     }
 148     return first;
 149 }
 150
 151 UBool
 152 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
 153     return set.contains(c) && norm2.getDecomposition(c, decomposition);
 154 }
 155
 156 UBool
 157 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
 158     uprv_checkCanGetBuffer(s, errorCode);
 159     if(U_FAILURE(errorCode)) {
 160         return FALSE;
 161     }
 162     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 163     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 164         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 165         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 166             spanCondition=USET_SPAN_SIMPLE;
 167         } else {
 168             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
 169                 U_FAILURE(errorCode)
 170             ) {
 171                 return FALSE;
 172             }
 173             spanCondition=USET_SPAN_NOT_CONTAINED;
 174         }
 175         prevSpanLimit=spanLimit;
 176     }
 177     return TRUE;
 178 }
 179
 180 UNormalizationCheckResult
 181 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
 182     uprv_checkCanGetBuffer(s, errorCode);
 183     if(U_FAILURE(errorCode)) {
 184         return UNORM_MAYBE;
 185     }
 186     UNormalizationCheckResult result=UNORM_YES;
 187     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 188     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 189         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 190         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 191             spanCondition=USET_SPAN_SIMPLE;
 192         } else {
 193             UNormalizationCheckResult qcResult=
 194                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
 195             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
 196                 return qcResult;
 197             } else if(qcResult==UNORM_MAYBE) {
 198                 result=qcResult;
 199             }
 200             spanCondition=USET_SPAN_NOT_CONTAINED;
 201         }
 202         prevSpanLimit=spanLimit;
 203     }
 204     return result;
 205 }
 206
 207 int32_t
 208 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
 209     uprv_checkCanGetBuffer(s, errorCode);
 210     if(U_FAILURE(errorCode)) {
 211         return 0;
 212     }
 213     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 214     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 215         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 216         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 217             spanCondition=USET_SPAN_SIMPLE;
 218         } else {
 219             int32_t yesLimit=
 220                 prevSpanLimit+
 221                 norm2.spanQuickCheckYes(
 222                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
 223             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
 224                 return yesLimit;
 225             }
 226             spanCondition=USET_SPAN_NOT_CONTAINED;
 227         }
 228         prevSpanLimit=spanLimit;
 229     }
 230     return s.length();
 231 }
 232
 233 UBool
 234 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
 235     return !set.contains(c) || norm2.hasBoundaryBefore(c);
 236 }
 237
 238 UBool
 239 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
 240     return !set.contains(c) || norm2.hasBoundaryAfter(c);
 241 }
 242
 243 UBool
 244 FilteredNormalizer2::isInert(UChar32 c) const {
 245     return !set.contains(c) || norm2.isInert(c);
 246 }
 247
 248 U_NAMESPACE_END
 249
 250 // C API ------------------------------------------------------------------- ***
 251
 252 U_NAMESPACE_USE
 253
 254 U_DRAFT UNormalizer2 * U_EXPORT2
 255 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
 256     if(U_FAILURE(*pErrorCode)) {
 257         return NULL;
 258     }
 259     if(filterSet==NULL) {
 260         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 261         return NULL;
 262     }
 263     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
 264                                              *UnicodeSet::fromUSet(filterSet));
 265     if(fn2==NULL) {
 266         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 267     }
 268     return (UNormalizer2 *)fn2;
 269 }
 270
 271 #endif  // !UCONFIG_NO_NORMALIZATION