// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2009-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  filterednormalizer2.cpp
*   tab size:   8 (not used)
*
*   created on: 2009dec10
*   created by: Markus W. Scherer
*/
  19 #include "unicode/utypes.h" 
  21 #if !UCONFIG_NO_NORMALIZATION 
  23 #include "unicode/edits.h" 
  24 #include "unicode/normalizer2.h" 
  25 #include "unicode/stringoptions.h" 
  26 #include "unicode/uniset.h" 
  27 #include "unicode/unistr.h" 
  28 #include "unicode/unorm.h" 
  33 FilteredNormalizer2::~FilteredNormalizer2() {} 
  36 FilteredNormalizer2::normalize(const UnicodeString 
&src
, 
  38                                UErrorCode 
&errorCode
) const { 
  39     uprv_checkCanGetBuffer(src
, errorCode
); 
  40     if(U_FAILURE(errorCode
)) { 
  45         errorCode
=U_ILLEGAL_ARGUMENT_ERROR
; 
  49     return normalize(src
, dest
, USET_SPAN_SIMPLE
, errorCode
); 
  52 // Internal: No argument checking, and appends to dest. 
  53 // Pass as input spanCondition the one that is likely to yield a non-zero 
  54 // span length at the start of src. 
  55 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, 
  56 // USET_SPAN_SIMPLE should be passed in for the start of src 
  57 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after 
  58 // an in-filter prefix. 
  60 FilteredNormalizer2::normalize(const UnicodeString 
&src
, 
  62                                USetSpanCondition spanCondition
, 
  63                                UErrorCode 
&errorCode
) const { 
  64     UnicodeString tempDest
;  // Don't throw away destination buffer between iterations. 
  65     for(int32_t prevSpanLimit
=0; prevSpanLimit
<src
.length();) { 
  66         int32_t spanLimit
=set
.span(src
, prevSpanLimit
, spanCondition
); 
  67         int32_t spanLength
=spanLimit
-prevSpanLimit
; 
  68         if(spanCondition
==USET_SPAN_NOT_CONTAINED
) { 
  70                 dest
.append(src
, prevSpanLimit
, spanLength
); 
  72             spanCondition
=USET_SPAN_SIMPLE
; 
  75                 // Not norm2.normalizeSecondAndAppend() because we do not want 
  76                 // to modify the non-filter part of dest. 
  77                 dest
.append(norm2
.normalize(src
.tempSubStringBetween(prevSpanLimit
, spanLimit
), 
  78                                             tempDest
, errorCode
)); 
  79                 if(U_FAILURE(errorCode
)) { 
  83             spanCondition
=USET_SPAN_NOT_CONTAINED
; 
  85         prevSpanLimit
=spanLimit
; 
  91 FilteredNormalizer2::normalizeUTF8(uint32_t options
, StringPiece src
, ByteSink 
&sink
, 
  92                                    Edits 
*edits
, UErrorCode 
&errorCode
) const { 
  93     if (U_FAILURE(errorCode
)) { 
  96     if (edits 
!= nullptr && (options 
& U_EDITS_NO_RESET
) == 0) { 
  99     options 
|= U_EDITS_NO_RESET
;  // Do not reset for each span. 
 100     normalizeUTF8(options
, src
.data(), src
.length(), sink
, edits
, USET_SPAN_SIMPLE
, errorCode
); 
 104 FilteredNormalizer2::normalizeUTF8(uint32_t options
, const char *src
, int32_t length
, 
 105                                    ByteSink 
&sink
, Edits 
*edits
, 
 106                                    USetSpanCondition spanCondition
, 
 107                                    UErrorCode 
&errorCode
) const { 
 109         int32_t spanLength 
= set
.spanUTF8(src
, length
, spanCondition
); 
 110         if (spanCondition 
== USET_SPAN_NOT_CONTAINED
) { 
 111             if (spanLength 
!= 0) { 
 112                 if (edits 
!= nullptr) { 
 113                     edits
->addUnchanged(spanLength
); 
 115                 if ((options 
& U_OMIT_UNCHANGED_TEXT
) == 0) { 
 116                     sink
.Append(src
, spanLength
); 
 119             spanCondition 
= USET_SPAN_SIMPLE
; 
 121             if (spanLength 
!= 0) { 
 122                 // Not norm2.normalizeSecondAndAppend() because we do not want 
 123                 // to modify the non-filter part of dest. 
 124                 norm2
.normalizeUTF8(options
, StringPiece(src
, spanLength
), sink
, edits
, errorCode
); 
 125                 if (U_FAILURE(errorCode
)) { 
 129             spanCondition 
= USET_SPAN_NOT_CONTAINED
; 
 132         length 
-= spanLength
; 
 137 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString 
&first
, 
 138                                               const UnicodeString 
&second
, 
 139                                               UErrorCode 
&errorCode
) const { 
 140     return normalizeSecondAndAppend(first
, second
, TRUE
, errorCode
); 
 144 FilteredNormalizer2::append(UnicodeString 
&first
, 
 145                             const UnicodeString 
&second
, 
 146                             UErrorCode 
&errorCode
) const { 
 147     return normalizeSecondAndAppend(first
, second
, FALSE
, errorCode
); 
 151 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString 
&first
, 
 152                                               const UnicodeString 
&second
, 
 154                                               UErrorCode 
&errorCode
) const { 
 155     uprv_checkCanGetBuffer(first
, errorCode
); 
 156     uprv_checkCanGetBuffer(second
, errorCode
); 
 157     if(U_FAILURE(errorCode
)) { 
 160     if(&first
==&second
) { 
 161         errorCode
=U_ILLEGAL_ARGUMENT_ERROR
; 
 164     if(first
.isEmpty()) { 
 166             return normalize(second
, first
, errorCode
); 
 171     // merge the in-filter suffix of the first string with the in-filter prefix of the second 
 172     int32_t prefixLimit
=set
.span(second
, 0, USET_SPAN_SIMPLE
); 
 174         UnicodeString 
prefix(second
.tempSubString(0, prefixLimit
)); 
 175         int32_t suffixStart
=set
.spanBack(first
, INT32_MAX
, USET_SPAN_SIMPLE
); 
 178                 norm2
.normalizeSecondAndAppend(first
, prefix
, errorCode
); 
 180                 norm2
.append(first
, prefix
, errorCode
); 
 183             UnicodeString 
middle(first
, suffixStart
, INT32_MAX
); 
 185                 norm2
.normalizeSecondAndAppend(middle
, prefix
, errorCode
); 
 187                 norm2
.append(middle
, prefix
, errorCode
); 
 189             first
.replace(suffixStart
, INT32_MAX
, middle
); 
 192     if(prefixLimit
<second
.length()) { 
 193         UnicodeString 
rest(second
.tempSubString(prefixLimit
, INT32_MAX
)); 
 195             normalize(rest
, first
, USET_SPAN_NOT_CONTAINED
, errorCode
); 
 204 FilteredNormalizer2::getDecomposition(UChar32 c
, UnicodeString 
&decomposition
) const { 
 205     return set
.contains(c
) && norm2
.getDecomposition(c
, decomposition
); 
 209 FilteredNormalizer2::getRawDecomposition(UChar32 c
, UnicodeString 
&decomposition
) const { 
 210     return set
.contains(c
) && norm2
.getRawDecomposition(c
, decomposition
); 
 214 FilteredNormalizer2::composePair(UChar32 a
, UChar32 b
) const { 
 215     return (set
.contains(a
) && set
.contains(b
)) ? norm2
.composePair(a
, b
) : U_SENTINEL
; 
 219 FilteredNormalizer2::getCombiningClass(UChar32 c
) const { 
 220     return set
.contains(c
) ? norm2
.getCombiningClass(c
) : 0; 
 224 FilteredNormalizer2::isNormalized(const UnicodeString 
&s
, UErrorCode 
&errorCode
) const { 
 225     uprv_checkCanGetBuffer(s
, errorCode
); 
 226     if(U_FAILURE(errorCode
)) { 
 229     USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
; 
 230     for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) { 
 231         int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
); 
 232         if(spanCondition
==USET_SPAN_NOT_CONTAINED
) { 
 233             spanCondition
=USET_SPAN_SIMPLE
; 
 235             if( !norm2
.isNormalized(s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
) || 
 240             spanCondition
=USET_SPAN_NOT_CONTAINED
; 
 242         prevSpanLimit
=spanLimit
; 
 248 FilteredNormalizer2::isNormalizedUTF8(StringPiece sp
, UErrorCode 
&errorCode
) const { 
 249     if(U_FAILURE(errorCode
)) { 
 252     const char *s 
= sp
.data(); 
 253     int32_t length 
= sp
.length(); 
 254     USetSpanCondition spanCondition 
= USET_SPAN_SIMPLE
; 
 256         int32_t spanLength 
= set
.spanUTF8(s
, length
, spanCondition
); 
 257         if (spanCondition 
== USET_SPAN_NOT_CONTAINED
) { 
 258             spanCondition 
= USET_SPAN_SIMPLE
; 
 260             if (!norm2
.isNormalizedUTF8(StringPiece(s
, spanLength
), errorCode
) || 
 261                     U_FAILURE(errorCode
)) { 
 264             spanCondition 
= USET_SPAN_NOT_CONTAINED
; 
 267         length 
-= spanLength
; 
 272 UNormalizationCheckResult
 
 273 FilteredNormalizer2::quickCheck(const UnicodeString 
&s
, UErrorCode 
&errorCode
) const { 
 274     uprv_checkCanGetBuffer(s
, errorCode
); 
 275     if(U_FAILURE(errorCode
)) { 
 278     UNormalizationCheckResult result
=UNORM_YES
; 
 279     USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
; 
 280     for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) { 
 281         int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
); 
 282         if(spanCondition
==USET_SPAN_NOT_CONTAINED
) { 
 283             spanCondition
=USET_SPAN_SIMPLE
; 
 285             UNormalizationCheckResult qcResult
= 
 286                 norm2
.quickCheck(s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
); 
 287             if(U_FAILURE(errorCode
) || qcResult
==UNORM_NO
) { 
 289             } else if(qcResult
==UNORM_MAYBE
) { 
 292             spanCondition
=USET_SPAN_NOT_CONTAINED
; 
 294         prevSpanLimit
=spanLimit
; 
 300 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString 
&s
, UErrorCode 
&errorCode
) const { 
 301     uprv_checkCanGetBuffer(s
, errorCode
); 
 302     if(U_FAILURE(errorCode
)) { 
 305     USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
; 
 306     for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) { 
 307         int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
); 
 308         if(spanCondition
==USET_SPAN_NOT_CONTAINED
) { 
 309             spanCondition
=USET_SPAN_SIMPLE
; 
 313                 norm2
.spanQuickCheckYes( 
 314                     s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
); 
 315             if(U_FAILURE(errorCode
) || yesLimit
<spanLimit
) { 
 318             spanCondition
=USET_SPAN_NOT_CONTAINED
; 
 320         prevSpanLimit
=spanLimit
; 
 326 FilteredNormalizer2::hasBoundaryBefore(UChar32 c
) const { 
 327     return !set
.contains(c
) || norm2
.hasBoundaryBefore(c
); 
 331 FilteredNormalizer2::hasBoundaryAfter(UChar32 c
) const { 
 332     return !set
.contains(c
) || norm2
.hasBoundaryAfter(c
); 
 336 FilteredNormalizer2::isInert(UChar32 c
) const { 
 337     return !set
.contains(c
) || norm2
.isInert(c
); 
// C API ------------------------------------------------------------------- ***
 346 U_CAPI UNormalizer2 
* U_EXPORT2
 
 347 unorm2_openFiltered(const UNormalizer2 
*norm2
, const USet 
*filterSet
, UErrorCode 
*pErrorCode
) { 
 348     if(U_FAILURE(*pErrorCode
)) { 
 351     if(filterSet
==NULL
) { 
 352         *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
; 
 355     Normalizer2 
*fn2
=new FilteredNormalizer2(*(Normalizer2 
*)norm2
, 
 356                                              *UnicodeSet::fromUSet(filterSet
)); 
 358         *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
; 
 360     return (UNormalizer2 
*)fn2
; 
 363 #endif  // !UCONFIG_NO_NORMALIZATION