2 *******************************************************************************
4 * Copyright (C) 2009-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: filterednormalizer2.cpp
10 * tab size: 8 (not used)
13 * created on: 2009dec10
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_NORMALIZATION
21 #include "unicode/normalizer2.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 #include "unicode/unorm.h"
// Public normalize() overload: checks src, then delegates to the normalize()
// overload that takes a USetSpanCondition, passing USET_SPAN_SIMPLE as the
// starting condition, and returns that call's result.
// NOTE(review): this listing is incomplete -- the embedded numbering skips
// 31, 35-38, 40-42 and 44-45. `dest` is used below but is never declared in
// the visible text, and the error-branch bodies and all closing braces are
// missing. Confirm every detail against the complete file.
30 FilteredNormalizer2::normalize(const UnicodeString
&src
,
32 UErrorCode
&errorCode
) const {
// Presumably records a failure in errorCode when src cannot be read --
// verify against the helper's definition, which is not part of this file.
33 uprv_checkCanGetBuffer(src
, errorCode
);
// The body of this failure branch is not visible in this listing.
34 if(U_FAILURE(errorCode
)) {
// The condition that guards this assignment is not visible in this listing.
39 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// Hand the actual work to the span-condition overload, starting with
// USET_SPAN_SIMPLE.
43 return normalize(src
, dest
, USET_SPAN_SIMPLE
, errorCode
);
46 // Internal: No argument checking, and appends to dest.
47 // Pass as input spanCondition the one that is likely to yield a non-zero
48 // span length at the start of src.
49 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
50 // USET_SPAN_SIMPLE should be passed in for the start of src
51 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
52 // an in-filter prefix.
// Walks src one span at a time and appends to dest. Each iteration asks the
// filter set for the end of the next span under the current spanCondition and
// then advances prevSpanLimit to that position.
//   - When spanCondition is USET_SPAN_NOT_CONTAINED, the span
//     [prevSpanLimit, spanLimit) of src is appended to dest as-is and the
//     condition switches to USET_SPAN_SIMPLE.
//   - The remaining visible statements normalize the span through norm2 into
//     tempDest, append that result to dest, and switch the condition to
//     USET_SPAN_NOT_CONTAINED (presumably this is the `else` branch -- the
//     `else` itself is not visible here).
// NOTE(review): this listing is incomplete -- the embedded numbering skips
// 55, 63, 65, 67-68, 74-76, 78 and 80-84. `dest` is never declared in the
// visible text; any span-length guards, the body of the failure check, the
// final return and all closing braces are missing. Confirm against the
// complete file.
54 FilteredNormalizer2::normalize(const UnicodeString
&src
,
56 USetSpanCondition spanCondition
,
57 UErrorCode
&errorCode
) const {
58 UnicodeString tempDest
; // Don't throw away destination buffer between iterations.
// The loop header has no increment; prevSpanLimit is advanced by the
// assignment at the bottom of the visible body.
59 for(int32_t prevSpanLimit
=0; prevSpanLimit
<src
.length();) {
// End of the span that starts at prevSpanLimit under the current condition.
60 int32_t spanLimit
=set
.span(src
, prevSpanLimit
, spanCondition
);
61 int32_t spanLength
=spanLimit
-prevSpanLimit
;
62 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
// Copy this stretch of src through without normalizing it.
64 dest
.append(src
, prevSpanLimit
, spanLength
);
66 spanCondition
=USET_SPAN_SIMPLE
;
69 // Not norm2.normalizeSecondAndAppend() because we do not want
70 // to modify the non-filter part of dest.
// norm2.normalize() writes into tempDest; its return value is what gets
// appended to dest.
71 dest
.append(norm2
.normalize(src
.tempSubStringBetween(prevSpanLimit
, spanLimit
),
72 tempDest
, errorCode
));
// The body of this failure branch is not visible in this listing.
73 if(U_FAILURE(errorCode
)) {
77 spanCondition
=USET_SPAN_NOT_CONTAINED
;
// Continue with the text that follows the span just handled.
79 prevSpanLimit
=spanLimit
;
// Public overload: forwards first, second and errorCode to the overload of
// normalizeSecondAndAppend() that takes an additional third argument, passing
// TRUE for it, and returns that call's result.
// NOTE(review): presumably TRUE selects "normalize second before appending"
// (append() passes FALSE to the same overload) -- the parameter's declaration
// is not visible in this listing, so confirm against the complete file. The
// return type line and the closing brace are also missing here.
85 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString
&first
,
86 const UnicodeString
&second
,
87 UErrorCode
&errorCode
) const {
88 return normalizeSecondAndAppend(first
, second
, TRUE
, errorCode
);
// Forwards first, second and errorCode to the normalizeSecondAndAppend()
// overload that takes an additional third argument, passing FALSE for it, and
// returns that call's result.
// NOTE(review): presumably FALSE selects "append without normalizing second"
// (the public normalizeSecondAndAppend() passes TRUE to the same overload) --
// the parameter's declaration is not visible in this listing, so confirm
// against the complete file. The return type line and the closing brace are
// also missing here.
92 FilteredNormalizer2::append(UnicodeString
&first
,
93 const UnicodeString
&second
,
94 UErrorCode
&errorCode
) const {
95 return normalizeSecondAndAppend(first
, second
, FALSE
, errorCode
);
// Shared implementation behind the public normalizeSecondAndAppend() and
// append(), which call it with TRUE and FALSE respectively as a third
// argument.
// Visible steps:
//   1. Check both strings, and set U_ILLEGAL_ARGUMENT_ERROR when first and
//      second are the same object.
//   2. When first is empty, the visible statement returns
//      normalize(second, first, errorCode).
//   3. Find prefixLimit (end of the span at the start of second) and
//      suffixStart (start of the span at the end of first), both under
//      USET_SPAN_SIMPLE, and combine the two pieces through norm2.
//   4. When second continues past prefixLimit, process the remainder with
//      normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode).
// NOTE(review): this listing is incomplete -- the embedded numbering skips
// 101, 106-107, 110-111, 113, 115-118, 121, 124-125, 127, 129-130, 132, 134,
// 136, 138-139, 142 and 144-151. The third parameter's declaration, every
// condition that selects between the paired norm2 calls below, the bodies of
// the error branches, the returns and all closing braces are missing.
// Confirm against the complete file.
99 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString
&first
,
100 const UnicodeString
&second
,
102 UErrorCode
&errorCode
) const {
// Presumably these record a failure in errorCode when a string cannot be
// read -- verify against the helper's definition.
103 uprv_checkCanGetBuffer(first
, errorCode
);
104 uprv_checkCanGetBuffer(second
, errorCode
);
// The body of this failure branch is not visible in this listing.
105 if(U_FAILURE(errorCode
)) {
// Source and destination must be distinct objects.
108 if(&first
==&second
) {
109 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
112 if(first
.isEmpty()) {
// Nothing in first to merge with: write the normalized second into first.
// Any alternative path inside this branch is not visible in this listing.
114 return normalize(second
, first
, errorCode
);
119 // merge the in-filter suffix of the first string with the in-filter prefix of the second
120 int32_t prefixLimit
=set
.span(second
, 0, USET_SPAN_SIMPLE
);
122 UnicodeString
prefix(second
.tempSubString(0, prefixLimit
));
123 int32_t suffixStart
=set
.spanBack(first
, INT32_MAX
, USET_SPAN_SIMPLE
);
// Two norm2 calls on (first, prefix) appear back to back; the branching that
// selects between them is not visible in this listing.
126 norm2
.normalizeSecondAndAppend(first
, prefix
, errorCode
);
128 norm2
.append(first
, prefix
, errorCode
);
// Copy the tail of first (from suffixStart) into a separate string so that
// norm2 operates on that tail only; the result is written back with
// first.replace() below.
131 UnicodeString
middle(first
, suffixStart
, INT32_MAX
);
// As above, the condition choosing between these two calls is not visible.
133 norm2
.normalizeSecondAndAppend(middle
, prefix
, errorCode
);
135 norm2
.append(middle
, prefix
, errorCode
);
137 first
.replace(suffixStart
, INT32_MAX
, middle
);
// Anything in second after the prefix still has to be added to first.
140 if(prefixLimit
<second
.length()) {
141 UnicodeString
rest(second
.tempSubString(prefixLimit
, INT32_MAX
));
// rest begins right after the prefix span, so the span-condition overload is
// started with USET_SPAN_NOT_CONTAINED.
143 normalize(rest
, first
, USET_SPAN_NOT_CONTAINED
, errorCode
);
// Returns set.contains(c) && norm2.getDecomposition(c, decomposition).
// Because && short-circuits, norm2 is only consulted (and decomposition only
// handed to it) when c is in the filter set.
// NOTE(review): the return type line and the closing brace are not visible in
// this listing (the embedded numbering skips them).
152 FilteredNormalizer2::getDecomposition(UChar32 c
, UnicodeString
&decomposition
) const {
153 return set
.contains(c
) && norm2
.getDecomposition(c
, decomposition
);
// Checks s span by span, starting with USET_SPAN_SIMPLE and alternating the
// span condition on each iteration.
//   - When the current condition is USET_SPAN_NOT_CONTAINED, the span is not
//     examined; the condition simply switches to USET_SPAN_SIMPLE.
//   - The remaining visible statements ask norm2.isNormalized() about the span
//     and switch the condition to USET_SPAN_NOT_CONTAINED (presumably this is
//     the `else` branch -- the `else` itself is not visible here).
// NOTE(review): this listing is incomplete -- the embedded numbering skips
// 160-161, 167, 169-172, 174 and 176-179. The condition that starts with
// `!norm2.isNormalized(...) ||` is cut off, and the branch bodies, all return
// statements and the closing braces are missing. Confirm against the complete
// file.
157 FilteredNormalizer2::isNormalized(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
// Presumably records a failure in errorCode when s cannot be read -- verify
// against the helper's definition.
158 uprv_checkCanGetBuffer(s
, errorCode
);
// The body of this failure branch is not visible in this listing.
159 if(U_FAILURE(errorCode
)) {
162 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
// The loop header has no increment; prevSpanLimit is advanced by the
// assignment at the bottom of the visible body.
163 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
164 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
165 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
166 spanCondition
=USET_SPAN_SIMPLE
;
// Second operand of this || and the branch body are not visible here.
168 if( !norm2
.isNormalized(s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
) ||
173 spanCondition
=USET_SPAN_NOT_CONTAINED
;
175 prevSpanLimit
=spanLimit
;
// Runs norm2.quickCheck() over s span by span, starting with
// USET_SPAN_SIMPLE and alternating the span condition on each iteration.
// `result` starts as UNORM_YES.
//   - When the current condition is USET_SPAN_NOT_CONTAINED, the span is not
//     examined; the condition simply switches to USET_SPAN_SIMPLE.
//   - The remaining visible statements quick-check the span, test the outcome
//     for failure/UNORM_NO and for UNORM_MAYBE, and switch the condition to
//     USET_SPAN_NOT_CONTAINED (presumably this is the `else` branch -- the
//     `else` itself is not visible here).
// NOTE(review): this listing is incomplete -- the embedded numbering skips
// 184-185, 192, 196, 198-199, 201 and 203-207. What happens inside the
// failure/UNORM_NO and UNORM_MAYBE branches, the final return and all closing
// braces are missing. Confirm against the complete file.
180 UNormalizationCheckResult
181 FilteredNormalizer2::quickCheck(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
// Presumably records a failure in errorCode when s cannot be read -- verify
// against the helper's definition.
182 uprv_checkCanGetBuffer(s
, errorCode
);
// The body of this failure branch is not visible in this listing.
183 if(U_FAILURE(errorCode
)) {
186 UNormalizationCheckResult result
=UNORM_YES
;
187 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
// The loop header has no increment; prevSpanLimit is advanced by the
// assignment at the bottom of the visible body.
188 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
189 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
190 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
191 spanCondition
=USET_SPAN_SIMPLE
;
193 UNormalizationCheckResult qcResult
=
194 norm2
.quickCheck(s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
);
// The bodies of the two branches below are not visible in this listing.
195 if(U_FAILURE(errorCode
) || qcResult
==UNORM_NO
) {
197 } else if(qcResult
==UNORM_MAYBE
) {
200 spanCondition
=USET_SPAN_NOT_CONTAINED
;
202 prevSpanLimit
=spanLimit
;
// Scans s span by span, starting with USET_SPAN_SIMPLE and alternating the
// span condition on each iteration.
//   - When the current condition is USET_SPAN_NOT_CONTAINED, the span is not
//     examined; the condition simply switches to USET_SPAN_SIMPLE.
//   - The remaining visible statements call norm2.spanQuickCheckYes() on the
//     span, test for failure or `yesLimit<spanLimit`, and switch the condition
//     to USET_SPAN_NOT_CONTAINED (presumably this is the `else` branch -- the
//     `else` itself is not visible here).
// NOTE(review): this listing is incomplete -- the embedded numbering skips
// 211-212, 218-220, 224-225, 227 and 229-233. The declaration of `yesLimit`
// (and how the norm2.spanQuickCheckYes() result feeds into it), the branch
// bodies, all return statements, the return type line and the closing braces
// are missing. Confirm against the complete file.
208 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
// Presumably records a failure in errorCode when s cannot be read -- verify
// against the helper's definition.
209 uprv_checkCanGetBuffer(s
, errorCode
);
// The body of this failure branch is not visible in this listing.
210 if(U_FAILURE(errorCode
)) {
213 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
// The loop header has no increment; prevSpanLimit is advanced by the
// assignment at the bottom of the visible body.
214 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
215 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
216 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
217 spanCondition
=USET_SPAN_SIMPLE
;
// The start of the statement that uses this call's result is not visible.
221 norm2
.spanQuickCheckYes(
222 s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
);
// The body of this branch is not visible in this listing.
223 if(U_FAILURE(errorCode
) || yesLimit
<spanLimit
) {
226 spanCondition
=USET_SPAN_NOT_CONTAINED
;
228 prevSpanLimit
=spanLimit
;
// Returns !set.contains(c) || norm2.hasBoundaryBefore(c): the result is true
// for any c outside the filter set, and norm2 is only consulted for code
// points inside it (|| short-circuits).
// NOTE(review): the return type line and the closing brace are not visible in
// this listing (the embedded numbering skips them).
234 FilteredNormalizer2::hasBoundaryBefore(UChar32 c
) const {
235 return !set
.contains(c
) || norm2
.hasBoundaryBefore(c
);
// Returns !set.contains(c) || norm2.hasBoundaryAfter(c): the result is true
// for any c outside the filter set, and norm2 is only consulted for code
// points inside it (|| short-circuits).
// NOTE(review): the return type line and the closing brace are not visible in
// this listing (the embedded numbering skips them).
239 FilteredNormalizer2::hasBoundaryAfter(UChar32 c
) const {
240 return !set
.contains(c
) || norm2
.hasBoundaryAfter(c
);
// Returns !set.contains(c) || norm2.isInert(c): the result is true for any c
// outside the filter set, and norm2 is only consulted for code points inside
// it (|| short-circuits).
// NOTE(review): the return type line and the closing brace are not visible in
// this listing (the embedded numbering skips them).
244 FilteredNormalizer2::isInert(UChar32 c
) const {
245 return !set
.contains(c
) || norm2
.isInert(c
);
250 // C API ------------------------------------------------------------------- ***
// C API wrapper: allocates a FilteredNormalizer2 built from norm2 and
// filterSet and returns it cast to UNormalizer2*.
// Visible steps: test *pErrorCode for an existing failure; set
// U_ILLEGAL_ARGUMENT_ERROR when filterSet is NULL; allocate the object with
// `new`; set U_MEMORY_ALLOCATION_ERROR; return the pointer.
// NOTE(review): this listing is incomplete -- the embedded numbering skips
// 257-258, 261-262, 265, 267 and 269-270. The bodies/returns of the two early
// checks, the condition guarding the U_MEMORY_ALLOCATION_ERROR assignment
// (presumably a null check on fn2) and all closing braces are missing.
// Confirm against the complete file.
254 U_DRAFT UNormalizer2
* U_EXPORT2
255 unorm2_openFiltered(const UNormalizer2
*norm2
, const USet
*filterSet
, UErrorCode
*pErrorCode
) {
// The body of this failure branch is not visible in this listing.
256 if(U_FAILURE(*pErrorCode
)) {
259 if(filterSet
==NULL
) {
260 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
// norm2 is dereferenced here without a visible NULL check; the C handle is
// cast to the C++ Normalizer2 type and filterSet is converted with
// UnicodeSet::fromUSet().
263 Normalizer2
*fn2
=new FilteredNormalizer2(*(Normalizer2
*)norm2
,
264 *UnicodeSet::fromUSet(filterSet
));
// The condition that guards this assignment is not visible in this listing.
266 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
268 return (UNormalizer2
*)fn2
;
271 #endif // !UCONFIG_NO_NORMALIZATION