icuSources/common/filterednormalizer2.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2009-2012, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  filterednormalizer2.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2009dec10
  16 *   created by: Markus W. Scherer
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_NORMALIZATION
  22
  23 #include "unicode/edits.h"
  24 #include "unicode/normalizer2.h"
  25 #include "unicode/stringoptions.h"
  26 #include "unicode/uniset.h"
  27 #include "unicode/unistr.h"
  28 #include "unicode/unorm.h"
  29 #include "cpputils.h"
  30
  31 U_NAMESPACE_BEGIN
  32
  33 FilteredNormalizer2::~FilteredNormalizer2() {}
  34
  35 UnicodeString &
  36 FilteredNormalizer2::normalize(const UnicodeString &src,
  37                                UnicodeString &dest,
  38                                UErrorCode &errorCode) const {
  39     uprv_checkCanGetBuffer(src, errorCode);
  40     if(U_FAILURE(errorCode)) {
  41         dest.setToBogus();
  42         return dest;
  43     }
  44     if(&dest==&src) {
  45         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  46         return dest;
  47     }
  48     dest.remove();
  49     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
  50 }
  51
  52 // Internal: No argument checking, and appends to dest.
  53 // Pass as input spanCondition the one that is likely to yield a non-zero
  54 // span length at the start of src.
  55 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
  56 // USET_SPAN_SIMPLE should be passed in for the start of src
  57 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
  58 // an in-filter prefix.
  59 UnicodeString &
  60 FilteredNormalizer2::normalize(const UnicodeString &src,
  61                                UnicodeString &dest,
  62                                USetSpanCondition spanCondition,
  63                                UErrorCode &errorCode) const {
  64     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
  65     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
  66         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
  67         int32_t spanLength=spanLimit-prevSpanLimit;
  68         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
  69             if(spanLength!=0) {
  70                 dest.append(src, prevSpanLimit, spanLength);
  71             }
  72             spanCondition=USET_SPAN_SIMPLE;
  73         } else {
  74             if(spanLength!=0) {
  75                 // Not norm2.normalizeSecondAndAppend() because we do not want
  76                 // to modify the non-filter part of dest.
  77                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
  78                                             tempDest, errorCode));
  79                 if(U_FAILURE(errorCode)) {
  80                     break;
  81                 }
  82             }
  83             spanCondition=USET_SPAN_NOT_CONTAINED;
  84         }
  85         prevSpanLimit=spanLimit;
  86     }
  87     return dest;
  88 }
  89
  90 void
  91 FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
  92                                    Edits *edits, UErrorCode &errorCode) const {
  93     if (U_FAILURE(errorCode)) {
  94         return;
  95     }
  96     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
  97         edits->reset();
  98     }
  99     options |= U_EDITS_NO_RESET;  // Do not reset for each span.
 100     normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode);
 101 }
 102
 103 void
 104 FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length,
 105                                    ByteSink &sink, Edits *edits,
 106                                    USetSpanCondition spanCondition,
 107                                    UErrorCode &errorCode) const {
 108     while (length > 0) {
 109         int32_t spanLength = set.spanUTF8(src, length, spanCondition);
 110         if (spanCondition == USET_SPAN_NOT_CONTAINED) {
 111             if (spanLength != 0) {
 112                 if (edits != nullptr) {
 113                     edits->addUnchanged(spanLength);
 114                 }
 115                 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
 116                     sink.Append(src, spanLength);
 117                 }
 118             }
 119             spanCondition = USET_SPAN_SIMPLE;
 120         } else {
 121             if (spanLength != 0) {
 122                 // Not norm2.normalizeSecondAndAppend() because we do not want
 123                 // to modify the non-filter part of dest.
 124                 norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode);
 125                 if (U_FAILURE(errorCode)) {
 126                     break;
 127                 }
 128             }
 129             spanCondition = USET_SPAN_NOT_CONTAINED;
 130         }
 131         src += spanLength;
 132         length -= spanLength;
 133     }
 134 }
 135
 136 UnicodeString &
 137 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
 138                                               const UnicodeString &second,
 139                                               UErrorCode &errorCode) const {
 140     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
 141 }
 142
 143 UnicodeString &
 144 FilteredNormalizer2::append(UnicodeString &first,
 145                             const UnicodeString &second,
 146                             UErrorCode &errorCode) const {
 147     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
 148 }
 149
 150 UnicodeString &
 151 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
 152                                               const UnicodeString &second,
 153                                               UBool doNormalize,
 154                                               UErrorCode &errorCode) const {
 155     uprv_checkCanGetBuffer(first, errorCode);
 156     uprv_checkCanGetBuffer(second, errorCode);
 157     if(U_FAILURE(errorCode)) {
 158         return first;
 159     }
 160     if(&first==&second) {
 161         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 162         return first;
 163     }
 164     if(first.isEmpty()) {
 165         if(doNormalize) {
 166             return normalize(second, first, errorCode);
 167         } else {
 168             return first=second;
 169         }
 170     }
 171     // merge the in-filter suffix of the first string with the in-filter prefix of the second
 172     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
 173     if(prefixLimit!=0) {
 174         UnicodeString prefix(second.tempSubString(0, prefixLimit));
 175         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
 176         if(suffixStart==0) {
 177             if(doNormalize) {
 178                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
 179             } else {
 180                 norm2.append(first, prefix, errorCode);
 181             }
 182         } else {
 183             UnicodeString middle(first, suffixStart, INT32_MAX);
 184             if(doNormalize) {
 185                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
 186             } else {
 187                 norm2.append(middle, prefix, errorCode);
 188             }
 189             first.replace(suffixStart, INT32_MAX, middle);
 190         }
 191     }
 192     if(prefixLimit<second.length()) {
 193         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
 194         if(doNormalize) {
 195             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
 196         } else {
 197             first.append(rest);
 198         }
 199     }
 200     return first;
 201 }
 202
 203 UBool
 204 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
 205     return set.contains(c) && norm2.getDecomposition(c, decomposition);
 206 }
 207
 208 UBool
 209 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
 210     return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
 211 }
 212
 213 UChar32
 214 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
 215     return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
 216 }
 217
 218 uint8_t
 219 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
 220     return set.contains(c) ? norm2.getCombiningClass(c) : 0;
 221 }
 222
 223 UBool
 224 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
 225     uprv_checkCanGetBuffer(s, errorCode);
 226     if(U_FAILURE(errorCode)) {
 227         return FALSE;
 228     }
 229     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 230     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 231         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 232         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 233             spanCondition=USET_SPAN_SIMPLE;
 234         } else {
 235             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
 236                 U_FAILURE(errorCode)
 237             ) {
 238                 return FALSE;
 239             }
 240             spanCondition=USET_SPAN_NOT_CONTAINED;
 241         }
 242         prevSpanLimit=spanLimit;
 243     }
 244     return TRUE;
 245 }
 246
 247 UBool
 248 FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
 249     if(U_FAILURE(errorCode)) {
 250         return FALSE;
 251     }
 252     const char *s = sp.data();
 253     int32_t length = sp.length();
 254     USetSpanCondition spanCondition = USET_SPAN_SIMPLE;
 255     while (length > 0) {
 256         int32_t spanLength = set.spanUTF8(s, length, spanCondition);
 257         if (spanCondition == USET_SPAN_NOT_CONTAINED) {
 258             spanCondition = USET_SPAN_SIMPLE;
 259         } else {
 260             if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) ||
 261                     U_FAILURE(errorCode)) {
 262                 return FALSE;
 263             }
 264             spanCondition = USET_SPAN_NOT_CONTAINED;
 265         }
 266         s += spanLength;
 267         length -= spanLength;
 268     }
 269     return TRUE;
 270 }
 271
 272 UNormalizationCheckResult
 273 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
 274     uprv_checkCanGetBuffer(s, errorCode);
 275     if(U_FAILURE(errorCode)) {
 276         return UNORM_MAYBE;
 277     }
 278     UNormalizationCheckResult result=UNORM_YES;
 279     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 280     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 281         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 282         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 283             spanCondition=USET_SPAN_SIMPLE;
 284         } else {
 285             UNormalizationCheckResult qcResult=
 286                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
 287             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
 288                 return qcResult;
 289             } else if(qcResult==UNORM_MAYBE) {
 290                 result=qcResult;
 291             }
 292             spanCondition=USET_SPAN_NOT_CONTAINED;
 293         }
 294         prevSpanLimit=spanLimit;
 295     }
 296     return result;
 297 }
 298
 299 int32_t
 300 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
 301     uprv_checkCanGetBuffer(s, errorCode);
 302     if(U_FAILURE(errorCode)) {
 303         return 0;
 304     }
 305     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
 306     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
 307         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
 308         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
 309             spanCondition=USET_SPAN_SIMPLE;
 310         } else {
 311             int32_t yesLimit=
 312                 prevSpanLimit+
 313                 norm2.spanQuickCheckYes(
 314                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
 315             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
 316                 return yesLimit;
 317             }
 318             spanCondition=USET_SPAN_NOT_CONTAINED;
 319         }
 320         prevSpanLimit=spanLimit;
 321     }
 322     return s.length();
 323 }
 324
 325 UBool
 326 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
 327     return !set.contains(c) || norm2.hasBoundaryBefore(c);
 328 }
 329
 330 UBool
 331 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
 332     return !set.contains(c) || norm2.hasBoundaryAfter(c);
 333 }
 334
 335 UBool
 336 FilteredNormalizer2::isInert(UChar32 c) const {
 337     return !set.contains(c) || norm2.isInert(c);
 338 }
 339
 340 U_NAMESPACE_END
 341
 342 // C API ------------------------------------------------------------------- ***
 343
 344 U_NAMESPACE_USE
 345
 346 U_CAPI UNormalizer2 * U_EXPORT2
 347 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
 348     if(U_FAILURE(*pErrorCode)) {
 349         return NULL;
 350     }
 351     if(filterSet==NULL) {
 352         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 353         return NULL;
 354     }
 355     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
 356                                              *UnicodeSet::fromUSet(filterSet));
 357     if(fn2==NULL) {
 358         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
 359     }
 360     return (UNormalizer2 *)fn2;
 361 }
 362
 363 #endif  // !UCONFIG_NO_NORMALIZATION