icuSources/common/filterednormalizer2.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2009-2012, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  filterednormalizer2.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2009dec10
  16 *   created by: Markus W. Scherer
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_NORMALIZATION
  22
  23 #include "unicode/normalizer2.h"
  24 #include "unicode/uniset.h"
  25 #include "unicode/unistr.h"
  26 #include "unicode/unorm.h"
  27 #include "cpputils.h"
  28
  29 U_NAMESPACE_BEGIN
  30
  31 FilteredNormalizer2::~FilteredNormalizer2() {}
  32
  33 UnicodeString &
  34 FilteredNormalizer2::normalize(const UnicodeString &src,
  35                                UnicodeString &dest,
  36                                UErrorCode &errorCode) const {
  37     uprv_checkCanGetBuffer(src, errorCode);
  38     if(U_FAILURE(errorCode)) {
  39         dest.setToBogus();
  40         return dest;
  41     }
  42     if(&dest==&src) {
  43         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  44         return dest;
  45     }
  46     dest.remove();
  47     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
  48 }
  49
  50 // Internal: No argument checking, and appends to dest.
  51 // Pass as input spanCondition the one that is likely to yield a non-zero
  52 // span length at the start of src.
  53 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
  54 // USET_SPAN_SIMPLE should be passed in for the start of src
  55 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
  56 // an in-filter prefix.
  57 UnicodeString &
  58 FilteredNormalizer2::normalize(const UnicodeString &src,
  59                                UnicodeString &dest,
  60                                USetSpanCondition spanCondition,
  61                                UErrorCode &errorCode) const {
  62     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
  63     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
  64         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
  65         int32_t spanLength=spanLimit-prevSpanLimit;
  66         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
  67             if(spanLength!=0) {
  68                 dest.append(src, prevSpanLimit, spanLength);
  69             }
  70             spanCondition=USET_SPAN_SIMPLE;
  71         } else {
  72             if(spanLength!=0) {
  73                 // Not norm2.normalizeSecondAndAppend() because we do not want
  74                 // to modify the non-filter part of dest.
  75                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
  76                                             tempDest, errorCode));
  77                 if(U_FAILURE(errorCode)) {
  78                     break;
  79                 }
  80             }
  81             spanCondition=USET_SPAN_NOT_CONTAINED;
  82         }
  83         prevSpanLimit=spanLimit;
  84     }
  85     return dest;
  86 }
  87
  88 UnicodeString &
  89 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
  90                                               const UnicodeString &second,
  91                                               UErrorCode &errorCode) const {
  92     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
  93 }
  94
  95 UnicodeString &
  96 FilteredNormalizer2::append(UnicodeString &first,
  97                             const UnicodeString &second,
  98                             UErrorCode &errorCode) const {
  99     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
 100 }
 101
 102 UnicodeString &
 103 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
 104                                               const UnicodeString &second,
 105                                               UBool doNormalize,
 106                                               UErrorCode &errorCode) const {
 107     uprv_checkCanGetBuffer(first, errorCode);
 108     uprv_checkCanGetBuffer(second, errorCode);
 109     if(U_FAILURE(errorCode)) {
 110         return first;
 111     }
 112     if(&first==&second) {
 113         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 114         return first;
 115     }
 116     if(first.isEmpty()) {
 117         if(doNormalize) {
 118             return normalize(second, first, errorCode);
 119         } else {
 120             return first=second;
 121         }
 122     }
 123     // merge the in-filter suffix of the first string with the in-filter prefix of the second
 124     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
 125     if(prefixLimit!=0) {
 126         UnicodeString prefix(second.tempSubString(0, prefixLimit));
 127         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
 128         if(suffixStart==0) {
 129             if(doNormalize) {
 130                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
 131             } else {
 132                 norm2.append(first, prefix, errorCode);
 133             }
 134         } else {
 135             UnicodeString middle(first, suffixStart, INT32_MAX);
 136             if(doNormalize) {
 137                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
 138             } else {
 139                 norm2.append(middle, prefix, errorCode);
 140             }
 141             first.replace(suffixStart, INT32_MAX, middle);
 142         }
 143     }
 144     if(prefixLimit<second.length()) {
 145         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
 146         if(doNormalize) {
 147             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
 148         } else {
 149             first.append(rest);
 150         }
 151     }
 152     return first;
 153 }
 154
 155 UBool
 156 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
 157     return set.contains(c) && norm2.getDecomposition(c, decomposition);
 158 }
 159
 160 UBool
 161 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
 162     return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
 163 }
 164
 165 UChar32
 166 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
 167     return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
 168 }
 169
 170 uint8_t
 171 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
 172     return set.contains(c) ? norm2.getCombiningClass(c) : 0;
 173 }
 174
 175 UBool
 176 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
 177     uprv_checkCanGetBuffer(s, errorCode);
 178     if(U_FAILURE(errorCode)) {
 179         return FALSE;
 180     }
 181     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 182     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 183         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 184         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 185             spanCondition=USET_SPAN_SIMPLE;
 186         } else {
 187             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
 188                 U_FAILURE(errorCode)
 189             ) {
 190                 return FALSE;
 191             }
 192             spanCondition=USET_SPAN_NOT_CONTAINED;
 193         }
 194         prevSpanLimit=spanLimit;
 195     }
 196     return TRUE;
 197 }
 198
 199 UNormalizationCheckResult
 200 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
 201     uprv_checkCanGetBuffer(s, errorCode);
 202     if(U_FAILURE(errorCode)) {
 203         return UNORM_MAYBE;
 204     }
 205     UNormalizationCheckResult result=UNORM_YES;
 206     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 207     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 208         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 209         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 210             spanCondition=USET_SPAN_SIMPLE;
 211         } else {
 212             UNormalizationCheckResult qcResult=
 213                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
 214             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
 215                 return qcResult;
 216             } else if(qcResult==UNORM_MAYBE) {
 217                 result=qcResult;
 218             }
 219             spanCondition=USET_SPAN_NOT_CONTAINED;
 220         }
 221         prevSpanLimit=spanLimit;
 222     }
 223     return result;
 224 }
 225
 226 int32_t
 227 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
 228     uprv_checkCanGetBuffer(s, errorCode);
 229     if(U_FAILURE(errorCode)) {
 230         return 0;
 231     }
 232     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 233     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 234         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 235         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 236             spanCondition=USET_SPAN_SIMPLE;
 237         } else {
 238             int32_t yesLimit=
 239                 prevSpanLimit+
 240                 norm2.spanQuickCheckYes(
 241                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
 242             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
 243                 return yesLimit;
 244             }
 245             spanCondition=USET_SPAN_NOT_CONTAINED;
 246         }
 247         prevSpanLimit=spanLimit;
 248     }
 249     return s.length();
 250 }
 251
 252 UBool
 253 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
 254     return !set.contains(c) || norm2.hasBoundaryBefore(c);
 255 }
 256
 257 UBool
 258 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
 259     return !set.contains(c) || norm2.hasBoundaryAfter(c);
 260 }
 261
 262 UBool
 263 FilteredNormalizer2::isInert(UChar32 c) const {
 264     return !set.contains(c) || norm2.isInert(c);
 265 }
 266
 267 U_NAMESPACE_END
 268
 269 // C API ------------------------------------------------------------------- ***
 270
 271 U_NAMESPACE_USE
 272
 273 U_CAPI UNormalizer2 * U_EXPORT2
 274 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
 275     if(U_FAILURE(*pErrorCode)) {
 276         return NULL;
 277     }
 278     if(filterSet==NULL) {
 279         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 280         return NULL;
 281     }
 282     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
 283                                              *UnicodeSet::fromUSet(filterSet));
 284     if(fn2==NULL) {
 285         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 286     }
 287     return (UNormalizer2 *)fn2;
 288 }
 289
 290 #endif  // !UCONFIG_NO_NORMALIZATION