1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: filterednormalizer2.cpp
12 * tab size: 8 (not used)
15 * created on: 2009dec10
16 * created by: Markus W. Scherer
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_NORMALIZATION
23 #include "unicode/edits.h"
24 #include "unicode/normalizer2.h"
25 #include "unicode/stringoptions.h"
26 #include "unicode/uniset.h"
27 #include "unicode/unistr.h"
28 #include "unicode/unorm.h"
// Out-of-line destructor definition; the body is empty.
33 FilteredNormalizer2::~FilteredNormalizer2() {}
// Public, argument-checking normalize(): validates src, then hands the work
// to the span-based normalize() overload, starting with USET_SPAN_SIMPLE.
// errorCode is both an input (a failure is tested right after the buffer
// check) and an output (U_ILLEGAL_ARGUMENT_ERROR may be stored into it).
// NOTE(review): this listing has gaps -- the dest parameter, the bodies of
// the early-exit branches and the closing braces are not shown. Confirm
// against the complete file before relying on the details.
36 FilteredNormalizer2::normalize(const UnicodeString
&src
,
38 UErrorCode
&errorCode
) const {
// Checks src via uprv_checkCanGetBuffer(); the outcome is reported through errorCode.
39 uprv_checkCanGetBuffer(src
, errorCode
);
40 if(U_FAILURE(errorCode
)) {
// NOTE(review): the condition guarding this assignment is not visible here;
// presumably it rejects dest aliasing src (compare the &first==&second check
// in normalizeSecondAndAppend) -- TODO confirm.
45 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// Delegate to the span-based overload; its result is returned unchanged.
49 return normalize(src
, dest
, USET_SPAN_SIMPLE
, errorCode
);
52 // Internal: No argument checking, and appends to dest.
53 // Pass as input spanCondition the one that is likely to yield a non-zero
54 // span length at the start of src.
55 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
56 // USET_SPAN_SIMPLE should be passed in for the start of src
57 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
58 // an in-filter prefix.
// Span-based worker: walks src from index 0 in runs delimited by set.span()
// and appends to dest.
//  - A run found with USET_SPAN_NOT_CONTAINED is appended to dest as-is and
//    the span condition is switched to USET_SPAN_SIMPLE.
//  - Otherwise the run is passed through norm2.normalize() and the result is
//    appended; the span condition is then switched to USET_SPAN_NOT_CONTAINED.
// spanCondition is the condition used for the first run; errorCode receives
// any failure from norm2.normalize().
// NOTE(review): the else keyword, the handling inside the U_FAILURE branch,
// the final return and the closing braces are missing from this listing.
60 FilteredNormalizer2::normalize(const UnicodeString
&src
,
62 USetSpanCondition spanCondition
,
63 UErrorCode
&errorCode
) const {
64 UnicodeString tempDest
; // Don't throw away destination buffer between iterations.
// No increment expression: prevSpanLimit is advanced at the bottom of the body.
65 for(int32_t prevSpanLimit
=0; prevSpanLimit
<src
.length();) {
// End index of the run that starts at prevSpanLimit under the current condition.
66 int32_t spanLimit
=set
.span(src
, prevSpanLimit
, spanCondition
);
67 int32_t spanLength
=spanLimit
-prevSpanLimit
;
68 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
// Copy the run straight from src, without normalizing it.
70 dest
.append(src
, prevSpanLimit
, spanLength
);
72 spanCondition
=USET_SPAN_SIMPLE
;
75 // Not norm2.normalizeSecondAndAppend() because we do not want
76 // to modify the non-filter part of dest.
// tempDest receives the normalized run; dest.append() then copies it over.
77 dest
.append(norm2
.normalize(src
.tempSubStringBetween(prevSpanLimit
, spanLimit
),
78 tempDest
, errorCode
));
79 if(U_FAILURE(errorCode
)) {
83 spanCondition
=USET_SPAN_NOT_CONTAINED
;
// Continue with the next run.
85 prevSpanLimit
=spanLimit
;
// Public UTF-8 normalize: forwards src (as pointer + length), sink and edits
// to the span-based normalizeUTF8() overload, starting with USET_SPAN_SIMPLE.
// Before forwarding, U_EDITS_NO_RESET is OR-ed into options.
// NOTE(review): the bodies of the two leading if-statements are missing from
// this listing; the second one presumably resets edits when the caller did
// not pass U_EDITS_NO_RESET -- TODO confirm.
91 FilteredNormalizer2::normalizeUTF8(uint32_t options
, StringPiece src
, ByteSink
&sink
,
92 Edits
*edits
, UErrorCode
&errorCode
) const {
93 if (U_FAILURE(errorCode
)) {
// Only taken when an Edits object was supplied and U_EDITS_NO_RESET is not set.
96 if (edits
!= nullptr && (options
& U_EDITS_NO_RESET
) == 0) {
99 options
|= U_EDITS_NO_RESET
; // Do not reset for each span.
100 normalizeUTF8(options
, src
.data(), src
.length(), sink
, edits
, USET_SPAN_SIMPLE
, errorCode
);
// Span-based UTF-8 worker operating on a (src, length) byte range.
//  - A non-empty run found with USET_SPAN_NOT_CONTAINED is recorded in edits
//    (when edits is non-null) via addUnchanged() and, unless
//    U_OMIT_UNCHANGED_TEXT is set in options, appended to sink as-is; the
//    span condition then becomes USET_SPAN_SIMPLE.
//  - Otherwise a non-empty run is passed to norm2.normalizeUTF8(), which
//    writes to sink/edits; the span condition then becomes
//    USET_SPAN_NOT_CONTAINED.
// After each run, length is reduced by the run's length.
// NOTE(review): the enclosing loop header, the else keyword, the advance of
// src and the closing braces are missing from this listing -- TODO confirm.
104 FilteredNormalizer2::normalizeUTF8(uint32_t options
, const char *src
, int32_t length
,
105 ByteSink
&sink
, Edits
*edits
,
106 USetSpanCondition spanCondition
,
107 UErrorCode
&errorCode
) const {
// Length in bytes of the leading run of src under the current condition.
109 int32_t spanLength
= set
.spanUTF8(src
, length
, spanCondition
);
110 if (spanCondition
== USET_SPAN_NOT_CONTAINED
) {
111 if (spanLength
!= 0) {
112 if (edits
!= nullptr) {
113 edits
->addUnchanged(spanLength
);
// Copy the run through only when unchanged text is not being omitted.
115 if ((options
& U_OMIT_UNCHANGED_TEXT
) == 0) {
116 sink
.Append(src
, spanLength
);
119 spanCondition
= USET_SPAN_SIMPLE
;
121 if (spanLength
!= 0) {
122 // Not norm2.normalizeSecondAndAppend() because we do not want
123 // to modify the non-filter part of dest.
124 norm2
.normalizeUTF8(options
, StringPiece(src
, spanLength
), sink
, edits
, errorCode
);
125 if (U_FAILURE(errorCode
)) {
129 spanCondition
= USET_SPAN_NOT_CONTAINED
;
132 length
-= spanLength
;
// Public normalizeSecondAndAppend(): forwards first, second and errorCode to
// the four-argument overload with TRUE as the third argument, and returns
// that overload's result.
137 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString
&first
,
138 const UnicodeString
&second
,
139 UErrorCode
&errorCode
) const {
140 return normalizeSecondAndAppend(first
, second
, TRUE
, errorCode
);
// Public append(): same forwarding as the three-argument
// normalizeSecondAndAppend(), but passes FALSE as the third argument of the
// four-argument overload; that overload's result is returned.
144 FilteredNormalizer2::append(UnicodeString
&first
,
145 const UnicodeString
&second
,
146 UErrorCode
&errorCode
) const {
147 return normalizeSecondAndAppend(first
, second
, FALSE
, errorCode
);
// Worker behind normalizeSecondAndAppend() and append().
// Visible steps:
//  1. Check both strings with uprv_checkCanGetBuffer(); test errorCode.
//  2. Set U_ILLEGAL_ARGUMENT_ERROR when first and second are the same object.
//  3. If first is empty, return normalize(second, first, errorCode).
//  4. Otherwise take the leading run of second selected by
//     set.span(..., USET_SPAN_SIMPLE) as "prefix" and locate the trailing run
//     of first with set.spanBack(..., USET_SPAN_SIMPLE); merge the two via
//     norm2.normalizeSecondAndAppend() or norm2.append(), either directly on
//     first or on a copy ("middle") that is then spliced back into first.
//  5. If second extends past the prefix, append the remainder through the
//     span-based normalize(), starting with USET_SPAN_NOT_CONTAINED.
// NOTE(review): the third parameter (the flag callers pass as TRUE/FALSE),
// the conditions choosing between the norm2 calls and between first/middle,
// the early returns and the final return are missing from this listing.
151 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString
&first
,
152 const UnicodeString
&second
,
154 UErrorCode
&errorCode
) const {
155 uprv_checkCanGetBuffer(first
, errorCode
);
156 uprv_checkCanGetBuffer(second
, errorCode
);
157 if(U_FAILURE(errorCode
)) {
// The same object passed for both strings is reported as an illegal argument.
160 if(&first
==&second
) {
161 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
164 if(first
.isEmpty()) {
166 return normalize(second
, first
, errorCode
);
171 // merge the in-filter suffix of the first string with the in-filter prefix of the second
172 int32_t prefixLimit
=set
.span(second
, 0, USET_SPAN_SIMPLE
);
174 UnicodeString
prefix(second
.tempSubString(0, prefixLimit
));
// Start index of the trailing run of first (spanBack over the whole string).
175 int32_t suffixStart
=set
.spanBack(first
, INT32_MAX
, USET_SPAN_SIMPLE
);
// NOTE(review): the branch selecting between the next two calls is not
// visible; presumably the hidden flag parameter picks one -- TODO confirm.
178 norm2
.normalizeSecondAndAppend(first
, prefix
, errorCode
);
180 norm2
.append(first
, prefix
, errorCode
);
// Work on a copy of first's tail so that only that tail is rewritten.
183 UnicodeString
middle(first
, suffixStart
, INT32_MAX
);
185 norm2
.normalizeSecondAndAppend(middle
, prefix
, errorCode
);
187 norm2
.append(middle
, prefix
, errorCode
);
// Replace first's tail (from suffixStart to the end) with the merged text.
189 first
.replace(suffixStart
, INT32_MAX
, middle
);
192 if(prefixLimit
<second
.length()) {
193 UnicodeString
rest(second
.tempSubString(prefixLimit
, INT32_MAX
));
195 normalize(rest
, first
, USET_SPAN_NOT_CONTAINED
, errorCode
);
// Returns the result of norm2.getDecomposition(c, decomposition) when the
// filter set contains c. For a code point outside the set the expression
// short-circuits: the result is false and norm2 is not consulted.
204 FilteredNormalizer2::getDecomposition(UChar32 c
, UnicodeString
&decomposition
) const {
205 return set
.contains(c
) && norm2
.getDecomposition(c
, decomposition
);
// Returns the result of norm2.getRawDecomposition(c, decomposition) when the
// filter set contains c. For a code point outside the set the expression
// short-circuits: the result is false and norm2 is not consulted.
209 FilteredNormalizer2::getRawDecomposition(UChar32 c
, UnicodeString
&decomposition
) const {
210 return set
.contains(c
) && norm2
.getRawDecomposition(c
, decomposition
);
// Returns norm2.composePair(a, b) only when the filter set contains both a
// and b; otherwise returns U_SENTINEL without calling norm2.
214 FilteredNormalizer2::composePair(UChar32 a
, UChar32 b
) const {
215 return (set
.contains(a
) && set
.contains(b
)) ? norm2
.composePair(a
, b
) : U_SENTINEL
;
// Returns norm2.getCombiningClass(c) when the filter set contains c, and 0
// for any code point outside the set.
219 FilteredNormalizer2::getCombiningClass(UChar32 c
) const {
220 return set
.contains(c
) ? norm2
.getCombiningClass(c
) : 0;
// Checks s run by run. Runs are delimited with set.span(), starting with
// USET_SPAN_SIMPLE and switching condition after each run. A run found with
// USET_SPAN_NOT_CONTAINED only flips the condition back; any other run is
// tested with norm2.isNormalized().
// NOTE(review): the else keyword, the second half of the condition after
// "||", all return statements and the closing braces are missing from this
// listing, so the returned values cannot be confirmed from here.
224 FilteredNormalizer2::isNormalized(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
225 uprv_checkCanGetBuffer(s
, errorCode
);
226 if(U_FAILURE(errorCode
)) {
229 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
// No increment expression: prevSpanLimit is advanced at the bottom of the body.
230 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
231 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
232 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
233 spanCondition
=USET_SPAN_SIMPLE
;
// Test only the current run, viewed as a temporary substring of s.
235 if( !norm2
.isNormalized(s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
) ||
240 spanCondition
=USET_SPAN_NOT_CONTAINED
;
242 prevSpanLimit
=spanLimit
;
// UTF-8 counterpart of isNormalized(): works on the bytes of sp via a
// (pointer, length) pair. Runs are delimited with set.spanUTF8(), starting
// with USET_SPAN_SIMPLE. A run found with USET_SPAN_NOT_CONTAINED only flips
// the condition back; any other run is tested with norm2.isNormalizedUTF8(),
// and a false result or a failed errorCode enters the early-exit branch.
// After each run, length is reduced by the run's length.
// NOTE(review): the loop header, the else keyword, the advance of s, the
// return statements and the closing braces are missing from this listing.
248 FilteredNormalizer2::isNormalizedUTF8(StringPiece sp
, UErrorCode
&errorCode
) const {
249 if(U_FAILURE(errorCode
)) {
252 const char *s
= sp
.data();
253 int32_t length
= sp
.length();
254 USetSpanCondition spanCondition
= USET_SPAN_SIMPLE
;
// Length in bytes of the leading run of s under the current condition.
256 int32_t spanLength
= set
.spanUTF8(s
, length
, spanCondition
);
257 if (spanCondition
== USET_SPAN_NOT_CONTAINED
) {
258 spanCondition
= USET_SPAN_SIMPLE
;
260 if (!norm2
.isNormalizedUTF8(StringPiece(s
, spanLength
), errorCode
) ||
261 U_FAILURE(errorCode
)) {
264 spanCondition
= USET_SPAN_NOT_CONTAINED
;
267 length
-= spanLength
;
// Quick check of s, run by run. result starts as UNORM_YES. Runs are
// delimited with set.span(), starting with USET_SPAN_SIMPLE. A run found
// with USET_SPAN_NOT_CONTAINED only flips the condition back; any other run
// is passed to norm2.quickCheck(). A failed errorCode or a UNORM_NO result
// enters one branch, and UNORM_MAYBE enters a separate branch.
// NOTE(review): the bodies of those branches (presumably an early return and
// an update of result), the else keyword and the final return are missing
// from this listing -- TODO confirm.
272 UNormalizationCheckResult
273 FilteredNormalizer2::quickCheck(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
274 uprv_checkCanGetBuffer(s
, errorCode
);
275 if(U_FAILURE(errorCode
)) {
278 UNormalizationCheckResult result
=UNORM_YES
;
279 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
// No increment expression: prevSpanLimit is advanced at the bottom of the body.
280 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
281 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
282 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
283 spanCondition
=USET_SPAN_SIMPLE
;
// Quick-check only the current run, viewed as a temporary substring of s.
285 UNormalizationCheckResult qcResult
=
286 norm2
.quickCheck(s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
);
287 if(U_FAILURE(errorCode
) || qcResult
==UNORM_NO
) {
289 } else if(qcResult
==UNORM_MAYBE
) {
292 spanCondition
=USET_SPAN_NOT_CONTAINED
;
294 prevSpanLimit
=spanLimit
;
// Scans s run by run. Runs are delimited with set.span(), starting with
// USET_SPAN_SIMPLE. A run found with USET_SPAN_NOT_CONTAINED only flips the
// condition back; any other run is passed to norm2.spanQuickCheckYes(). A
// failed errorCode, or a yesLimit smaller than spanLimit, enters the
// early-exit branch.
// NOTE(review): the declaration of yesLimit (presumably prevSpanLimit plus
// the value returned by norm2.spanQuickCheckYes()), the else keyword, the
// return statements and the closing braces are missing from this listing.
300 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
301 uprv_checkCanGetBuffer(s
, errorCode
);
302 if(U_FAILURE(errorCode
)) {
305 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
// No increment expression: prevSpanLimit is advanced at the bottom of the body.
306 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
307 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
308 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
309 spanCondition
=USET_SPAN_SIMPLE
;
// The call below sees only the current run, viewed as a temporary substring of s.
313 norm2
.spanQuickCheckYes(
314 s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
);
315 if(U_FAILURE(errorCode
) || yesLimit
<spanLimit
) {
318 spanCondition
=USET_SPAN_NOT_CONTAINED
;
320 prevSpanLimit
=spanLimit
;
// True for every code point outside the filter set; for a code point inside
// the set the answer is norm2.hasBoundaryBefore(c).
326 FilteredNormalizer2::hasBoundaryBefore(UChar32 c
) const {
327 return !set
.contains(c
) || norm2
.hasBoundaryBefore(c
);
// True for every code point outside the filter set; for a code point inside
// the set the answer is norm2.hasBoundaryAfter(c).
331 FilteredNormalizer2::hasBoundaryAfter(UChar32 c
) const {
332 return !set
.contains(c
) || norm2
.hasBoundaryAfter(c
);
// True for every code point outside the filter set; for a code point inside
// the set the answer is norm2.isInert(c).
336 FilteredNormalizer2::isInert(UChar32 c
) const {
337 return !set
.contains(c
) || norm2
.isInert(c
);
342 // C API ------------------------------------------------------------------- ***
// C API entry point: allocates a FilteredNormalizer2 from norm2 and
// filterSet with new, and returns it cast to UNormalizer2 *.
//  - *pErrorCode is tested for a prior failure first.
//  - A NULL filterSet sets *pErrorCode to U_ILLEGAL_ARGUMENT_ERROR.
//  - U_MEMORY_ALLOCATION_ERROR is stored in *pErrorCode on a path whose
//    guard is not visible here (presumably fn2==NULL -- TODO confirm).
// NOTE(review): the early returns and the closing braces are missing from
// this listing. norm2 itself is dereferenced without a visible NULL check.
346 U_CAPI UNormalizer2
* U_EXPORT2
347 unorm2_openFiltered(const UNormalizer2
*norm2
, const USet
*filterSet
, UErrorCode
*pErrorCode
) {
348 if(U_FAILURE(*pErrorCode
)) {
351 if(filterSet
==NULL
) {
352 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// The C handles are converted to their C++ counterparts: a cast for norm2,
// UnicodeSet::fromUSet() for filterSet.
355 Normalizer2
*fn2
=new FilteredNormalizer2(*(Normalizer2
*)norm2
,
356 *UnicodeSet::fromUSet(filterSet
));
358 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
360 return (UNormalizer2
*)fn2
;
363 #endif // !UCONFIG_NO_NORMALIZATION