icuSources/common/filterednormalizer2.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2009-2012, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  filterednormalizer2.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2009dec10
  14 *   created by: Markus W. Scherer
  15 */
  16
  17 #include "unicode/utypes.h"
  18
  19 #if !UCONFIG_NO_NORMALIZATION
  20
  21 #include "unicode/normalizer2.h"
  22 #include "unicode/uniset.h"
  23 #include "unicode/unistr.h"
  24 #include "unicode/unorm.h"
  25 #include "cpputils.h"
  26
  27 U_NAMESPACE_BEGIN
  28
  29 FilteredNormalizer2::~FilteredNormalizer2() {}
  30
  31 UnicodeString &
  32 FilteredNormalizer2::normalize(const UnicodeString &src,
  33                                UnicodeString &dest,
  34                                UErrorCode &errorCode) const {
  35     uprv_checkCanGetBuffer(src, errorCode);
  36     if(U_FAILURE(errorCode)) {
  37         dest.setToBogus();
  38         return dest;
  39     }
  40     if(&dest==&src) {
  41         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  42         return dest;
  43     }
  44     dest.remove();
  45     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
  46 }
  47
  48 // Internal: No argument checking, and appends to dest.
  49 // Pass as input spanCondition the one that is likely to yield a non-zero
  50 // span length at the start of src.
  51 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
  52 // USET_SPAN_SIMPLE should be passed in for the start of src
  53 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
  54 // an in-filter prefix.
  55 UnicodeString &
  56 FilteredNormalizer2::normalize(const UnicodeString &src,
  57                                UnicodeString &dest,
  58                                USetSpanCondition spanCondition,
  59                                UErrorCode &errorCode) const {
  60     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
  61     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
  62         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
  63         int32_t spanLength=spanLimit-prevSpanLimit;
  64         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
  65             if(spanLength!=0) {
  66                 dest.append(src, prevSpanLimit, spanLength);
  67             }
  68             spanCondition=USET_SPAN_SIMPLE;
  69         } else {
  70             if(spanLength!=0) {
  71                 // Not norm2.normalizeSecondAndAppend() because we do not want
  72                 // to modify the non-filter part of dest.
  73                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
  74                                             tempDest, errorCode));
  75                 if(U_FAILURE(errorCode)) {
  76                     break;
  77                 }
  78             }
  79             spanCondition=USET_SPAN_NOT_CONTAINED;
  80         }
  81         prevSpanLimit=spanLimit;
  82     }
  83     return dest;
  84 }
  85
  86 UnicodeString &
  87 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
  88                                               const UnicodeString &second,
  89                                               UErrorCode &errorCode) const {
  90     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
  91 }
  92
  93 UnicodeString &
  94 FilteredNormalizer2::append(UnicodeString &first,
  95                             const UnicodeString &second,
  96                             UErrorCode &errorCode) const {
  97     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
  98 }
  99
 100 UnicodeString &
 101 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
 102                                               const UnicodeString &second,
 103                                               UBool doNormalize,
 104                                               UErrorCode &errorCode) const {
 105     uprv_checkCanGetBuffer(first, errorCode);
 106     uprv_checkCanGetBuffer(second, errorCode);
 107     if(U_FAILURE(errorCode)) {
 108         return first;
 109     }
 110     if(&first==&second) {
 111         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 112         return first;
 113     }
 114     if(first.isEmpty()) {
 115         if(doNormalize) {
 116             return normalize(second, first, errorCode);
 117         } else {
 118             return first=second;
 119         }
 120     }
 121     // merge the in-filter suffix of the first string with the in-filter prefix of the second
 122     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
 123     if(prefixLimit!=0) {
 124         UnicodeString prefix(second.tempSubString(0, prefixLimit));
 125         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
 126         if(suffixStart==0) {
 127             if(doNormalize) {
 128                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
 129             } else {
 130                 norm2.append(first, prefix, errorCode);
 131             }
 132         } else {
 133             UnicodeString middle(first, suffixStart, INT32_MAX);
 134             if(doNormalize) {
 135                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
 136             } else {
 137                 norm2.append(middle, prefix, errorCode);
 138             }
 139             first.replace(suffixStart, INT32_MAX, middle);
 140         }
 141     }
 142     if(prefixLimit<second.length()) {
 143         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
 144         if(doNormalize) {
 145             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
 146         } else {
 147             first.append(rest);
 148         }
 149     }
 150     return first;
 151 }
 152
 153 UBool
 154 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
 155     return set.contains(c) && norm2.getDecomposition(c, decomposition);
 156 }
 157
 158 UBool
 159 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
 160     return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
 161 }
 162
 163 UChar32
 164 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
 165     return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
 166 }
 167
 168 uint8_t
 169 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
 170     return set.contains(c) ? norm2.getCombiningClass(c) : 0;
 171 }
 172
 173 UBool
 174 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
 175     uprv_checkCanGetBuffer(s, errorCode);
 176     if(U_FAILURE(errorCode)) {
 177         return FALSE;
 178     }
 179     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 180     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 181         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 182         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 183             spanCondition=USET_SPAN_SIMPLE;
 184         } else {
 185             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
 186                 U_FAILURE(errorCode)
 187             ) {
 188                 return FALSE;
 189             }
 190             spanCondition=USET_SPAN_NOT_CONTAINED;
 191         }
 192         prevSpanLimit=spanLimit;
 193     }
 194     return TRUE;
 195 }
 196
 197 UNormalizationCheckResult
 198 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
 199     uprv_checkCanGetBuffer(s, errorCode);
 200     if(U_FAILURE(errorCode)) {
 201         return UNORM_MAYBE;
 202     }
 203     UNormalizationCheckResult result=UNORM_YES;
 204     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 205     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 206         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 207         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 208             spanCondition=USET_SPAN_SIMPLE;
 209         } else {
 210             UNormalizationCheckResult qcResult=
 211                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
 212             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
 213                 return qcResult;
 214             } else if(qcResult==UNORM_MAYBE) {
 215                 result=qcResult;
 216             }
 217             spanCondition=USET_SPAN_NOT_CONTAINED;
 218         }
 219         prevSpanLimit=spanLimit;
 220     }
 221     return result;
 222 }
 223
 224 int32_t
 225 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
 226     uprv_checkCanGetBuffer(s, errorCode);
 227     if(U_FAILURE(errorCode)) {
 228         return 0;
 229     }
 230     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 231     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 232         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 233         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 234             spanCondition=USET_SPAN_SIMPLE;
 235         } else {
 236             int32_t yesLimit=
 237                 prevSpanLimit+
 238                 norm2.spanQuickCheckYes(
 239                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
 240             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
 241                 return yesLimit;
 242             }
 243             spanCondition=USET_SPAN_NOT_CONTAINED;
 244         }
 245         prevSpanLimit=spanLimit;
 246     }
 247     return s.length();
 248 }
 249
 250 UBool
 251 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
 252     return !set.contains(c) || norm2.hasBoundaryBefore(c);
 253 }
 254
 255 UBool
 256 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
 257     return !set.contains(c) || norm2.hasBoundaryAfter(c);
 258 }
 259
 260 UBool
 261 FilteredNormalizer2::isInert(UChar32 c) const {
 262     return !set.contains(c) || norm2.isInert(c);
 263 }
 264
 265 U_NAMESPACE_END
 266
 267 // C API ------------------------------------------------------------------- ***
 268
 269 U_NAMESPACE_USE
 270
 271 U_CAPI UNormalizer2 * U_EXPORT2
 272 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
 273     if(U_FAILURE(*pErrorCode)) {
 274         return NULL;
 275     }
 276     if(filterSet==NULL) {
 277         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 278         return NULL;
 279     }
 280     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
 281                                              *UnicodeSet::fromUSet(filterSet));
 282     if(fn2==NULL) {
 283         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 284     }
 285     return (UNormalizer2 *)fn2;
 286 }
 287
 288 #endif  // !UCONFIG_NO_NORMALIZATION