// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2009-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  filterednormalizer2.cpp
*   tab size:   8 (not used)
*
*   created on: 2009dec10
*   created by: Markus W. Scherer
*/
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_NORMALIZATION
23 #include "unicode/normalizer2.h"
24 #include "unicode/uniset.h"
25 #include "unicode/unistr.h"
26 #include "unicode/unorm.h"
31 FilteredNormalizer2::~FilteredNormalizer2() {}
34 FilteredNormalizer2::normalize(const UnicodeString
&src
,
36 UErrorCode
&errorCode
) const {
37 uprv_checkCanGetBuffer(src
, errorCode
);
38 if(U_FAILURE(errorCode
)) {
43 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
47 return normalize(src
, dest
, USET_SPAN_SIMPLE
, errorCode
);
50 // Internal: No argument checking, and appends to dest.
51 // Pass as input spanCondition the one that is likely to yield a non-zero
52 // span length at the start of src.
53 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
54 // USET_SPAN_SIMPLE should be passed in for the start of src
55 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
56 // an in-filter prefix.
58 FilteredNormalizer2::normalize(const UnicodeString
&src
,
60 USetSpanCondition spanCondition
,
61 UErrorCode
&errorCode
) const {
62 UnicodeString tempDest
; // Don't throw away destination buffer between iterations.
63 for(int32_t prevSpanLimit
=0; prevSpanLimit
<src
.length();) {
64 int32_t spanLimit
=set
.span(src
, prevSpanLimit
, spanCondition
);
65 int32_t spanLength
=spanLimit
-prevSpanLimit
;
66 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
68 dest
.append(src
, prevSpanLimit
, spanLength
);
70 spanCondition
=USET_SPAN_SIMPLE
;
73 // Not norm2.normalizeSecondAndAppend() because we do not want
74 // to modify the non-filter part of dest.
75 dest
.append(norm2
.normalize(src
.tempSubStringBetween(prevSpanLimit
, spanLimit
),
76 tempDest
, errorCode
));
77 if(U_FAILURE(errorCode
)) {
81 spanCondition
=USET_SPAN_NOT_CONTAINED
;
83 prevSpanLimit
=spanLimit
;
89 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString
&first
,
90 const UnicodeString
&second
,
91 UErrorCode
&errorCode
) const {
92 return normalizeSecondAndAppend(first
, second
, TRUE
, errorCode
);
96 FilteredNormalizer2::append(UnicodeString
&first
,
97 const UnicodeString
&second
,
98 UErrorCode
&errorCode
) const {
99 return normalizeSecondAndAppend(first
, second
, FALSE
, errorCode
);
103 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString
&first
,
104 const UnicodeString
&second
,
106 UErrorCode
&errorCode
) const {
107 uprv_checkCanGetBuffer(first
, errorCode
);
108 uprv_checkCanGetBuffer(second
, errorCode
);
109 if(U_FAILURE(errorCode
)) {
112 if(&first
==&second
) {
113 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
116 if(first
.isEmpty()) {
118 return normalize(second
, first
, errorCode
);
123 // merge the in-filter suffix of the first string with the in-filter prefix of the second
124 int32_t prefixLimit
=set
.span(second
, 0, USET_SPAN_SIMPLE
);
126 UnicodeString
prefix(second
.tempSubString(0, prefixLimit
));
127 int32_t suffixStart
=set
.spanBack(first
, INT32_MAX
, USET_SPAN_SIMPLE
);
130 norm2
.normalizeSecondAndAppend(first
, prefix
, errorCode
);
132 norm2
.append(first
, prefix
, errorCode
);
135 UnicodeString
middle(first
, suffixStart
, INT32_MAX
);
137 norm2
.normalizeSecondAndAppend(middle
, prefix
, errorCode
);
139 norm2
.append(middle
, prefix
, errorCode
);
141 first
.replace(suffixStart
, INT32_MAX
, middle
);
144 if(prefixLimit
<second
.length()) {
145 UnicodeString
rest(second
.tempSubString(prefixLimit
, INT32_MAX
));
147 normalize(rest
, first
, USET_SPAN_NOT_CONTAINED
, errorCode
);
156 FilteredNormalizer2::getDecomposition(UChar32 c
, UnicodeString
&decomposition
) const {
157 return set
.contains(c
) && norm2
.getDecomposition(c
, decomposition
);
161 FilteredNormalizer2::getRawDecomposition(UChar32 c
, UnicodeString
&decomposition
) const {
162 return set
.contains(c
) && norm2
.getRawDecomposition(c
, decomposition
);
166 FilteredNormalizer2::composePair(UChar32 a
, UChar32 b
) const {
167 return (set
.contains(a
) && set
.contains(b
)) ? norm2
.composePair(a
, b
) : U_SENTINEL
;
171 FilteredNormalizer2::getCombiningClass(UChar32 c
) const {
172 return set
.contains(c
) ? norm2
.getCombiningClass(c
) : 0;
176 FilteredNormalizer2::isNormalized(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
177 uprv_checkCanGetBuffer(s
, errorCode
);
178 if(U_FAILURE(errorCode
)) {
181 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
182 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
183 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
184 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
185 spanCondition
=USET_SPAN_SIMPLE
;
187 if( !norm2
.isNormalized(s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
) ||
192 spanCondition
=USET_SPAN_NOT_CONTAINED
;
194 prevSpanLimit
=spanLimit
;
199 UNormalizationCheckResult
200 FilteredNormalizer2::quickCheck(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
201 uprv_checkCanGetBuffer(s
, errorCode
);
202 if(U_FAILURE(errorCode
)) {
205 UNormalizationCheckResult result
=UNORM_YES
;
206 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
207 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
208 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
209 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
210 spanCondition
=USET_SPAN_SIMPLE
;
212 UNormalizationCheckResult qcResult
=
213 norm2
.quickCheck(s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
);
214 if(U_FAILURE(errorCode
) || qcResult
==UNORM_NO
) {
216 } else if(qcResult
==UNORM_MAYBE
) {
219 spanCondition
=USET_SPAN_NOT_CONTAINED
;
221 prevSpanLimit
=spanLimit
;
227 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
228 uprv_checkCanGetBuffer(s
, errorCode
);
229 if(U_FAILURE(errorCode
)) {
232 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
233 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
234 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
235 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
236 spanCondition
=USET_SPAN_SIMPLE
;
240 norm2
.spanQuickCheckYes(
241 s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
);
242 if(U_FAILURE(errorCode
) || yesLimit
<spanLimit
) {
245 spanCondition
=USET_SPAN_NOT_CONTAINED
;
247 prevSpanLimit
=spanLimit
;
253 FilteredNormalizer2::hasBoundaryBefore(UChar32 c
) const {
254 return !set
.contains(c
) || norm2
.hasBoundaryBefore(c
);
258 FilteredNormalizer2::hasBoundaryAfter(UChar32 c
) const {
259 return !set
.contains(c
) || norm2
.hasBoundaryAfter(c
);
263 FilteredNormalizer2::isInert(UChar32 c
) const {
264 return !set
.contains(c
) || norm2
.isInert(c
);
// C API ------------------------------------------------------------------- ***
273 U_CAPI UNormalizer2
* U_EXPORT2
274 unorm2_openFiltered(const UNormalizer2
*norm2
, const USet
*filterSet
, UErrorCode
*pErrorCode
) {
275 if(U_FAILURE(*pErrorCode
)) {
278 if(filterSet
==NULL
) {
279 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
282 Normalizer2
*fn2
=new FilteredNormalizer2(*(Normalizer2
*)norm2
,
283 *UnicodeSet::fromUSet(filterSet
));
285 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
287 return (UNormalizer2
*)fn2
;
290 #endif // !UCONFIG_NO_NORMALIZATION