/*
*******************************************************************************
*   Copyright (C) 2009-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  filterednormalizer2.cpp
*   tab size:   8 (not used)
*
*   created on: 2009dec10
*   created by: Markus W. Scherer
*/
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_NORMALIZATION
21 #include "unicode/normalizer2.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 #include "unicode/unorm.h"
29 FilteredNormalizer2::~FilteredNormalizer2() {}
32 FilteredNormalizer2::normalize(const UnicodeString
&src
,
34 UErrorCode
&errorCode
) const {
35 uprv_checkCanGetBuffer(src
, errorCode
);
36 if(U_FAILURE(errorCode
)) {
41 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
45 return normalize(src
, dest
, USET_SPAN_SIMPLE
, errorCode
);
48 // Internal: No argument checking, and appends to dest.
49 // Pass as input spanCondition the one that is likely to yield a non-zero
50 // span length at the start of src.
51 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
52 // USET_SPAN_SIMPLE should be passed in for the start of src
53 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
54 // an in-filter prefix.
56 FilteredNormalizer2::normalize(const UnicodeString
&src
,
58 USetSpanCondition spanCondition
,
59 UErrorCode
&errorCode
) const {
60 UnicodeString tempDest
; // Don't throw away destination buffer between iterations.
61 for(int32_t prevSpanLimit
=0; prevSpanLimit
<src
.length();) {
62 int32_t spanLimit
=set
.span(src
, prevSpanLimit
, spanCondition
);
63 int32_t spanLength
=spanLimit
-prevSpanLimit
;
64 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
66 dest
.append(src
, prevSpanLimit
, spanLength
);
68 spanCondition
=USET_SPAN_SIMPLE
;
71 // Not norm2.normalizeSecondAndAppend() because we do not want
72 // to modify the non-filter part of dest.
73 dest
.append(norm2
.normalize(src
.tempSubStringBetween(prevSpanLimit
, spanLimit
),
74 tempDest
, errorCode
));
75 if(U_FAILURE(errorCode
)) {
79 spanCondition
=USET_SPAN_NOT_CONTAINED
;
81 prevSpanLimit
=spanLimit
;
87 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString
&first
,
88 const UnicodeString
&second
,
89 UErrorCode
&errorCode
) const {
90 return normalizeSecondAndAppend(first
, second
, TRUE
, errorCode
);
94 FilteredNormalizer2::append(UnicodeString
&first
,
95 const UnicodeString
&second
,
96 UErrorCode
&errorCode
) const {
97 return normalizeSecondAndAppend(first
, second
, FALSE
, errorCode
);
101 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString
&first
,
102 const UnicodeString
&second
,
104 UErrorCode
&errorCode
) const {
105 uprv_checkCanGetBuffer(first
, errorCode
);
106 uprv_checkCanGetBuffer(second
, errorCode
);
107 if(U_FAILURE(errorCode
)) {
110 if(&first
==&second
) {
111 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
114 if(first
.isEmpty()) {
116 return normalize(second
, first
, errorCode
);
121 // merge the in-filter suffix of the first string with the in-filter prefix of the second
122 int32_t prefixLimit
=set
.span(second
, 0, USET_SPAN_SIMPLE
);
124 UnicodeString
prefix(second
.tempSubString(0, prefixLimit
));
125 int32_t suffixStart
=set
.spanBack(first
, INT32_MAX
, USET_SPAN_SIMPLE
);
128 norm2
.normalizeSecondAndAppend(first
, prefix
, errorCode
);
130 norm2
.append(first
, prefix
, errorCode
);
133 UnicodeString
middle(first
, suffixStart
, INT32_MAX
);
135 norm2
.normalizeSecondAndAppend(middle
, prefix
, errorCode
);
137 norm2
.append(middle
, prefix
, errorCode
);
139 first
.replace(suffixStart
, INT32_MAX
, middle
);
142 if(prefixLimit
<second
.length()) {
143 UnicodeString
rest(second
.tempSubString(prefixLimit
, INT32_MAX
));
145 normalize(rest
, first
, USET_SPAN_NOT_CONTAINED
, errorCode
);
154 FilteredNormalizer2::getDecomposition(UChar32 c
, UnicodeString
&decomposition
) const {
155 return set
.contains(c
) && norm2
.getDecomposition(c
, decomposition
);
159 FilteredNormalizer2::getRawDecomposition(UChar32 c
, UnicodeString
&decomposition
) const {
160 return set
.contains(c
) && norm2
.getRawDecomposition(c
, decomposition
);
164 FilteredNormalizer2::composePair(UChar32 a
, UChar32 b
) const {
165 return (set
.contains(a
) && set
.contains(b
)) ? norm2
.composePair(a
, b
) : U_SENTINEL
;
169 FilteredNormalizer2::getCombiningClass(UChar32 c
) const {
170 return set
.contains(c
) ? norm2
.getCombiningClass(c
) : 0;
174 FilteredNormalizer2::isNormalized(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
175 uprv_checkCanGetBuffer(s
, errorCode
);
176 if(U_FAILURE(errorCode
)) {
179 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
180 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
181 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
182 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
183 spanCondition
=USET_SPAN_SIMPLE
;
185 if( !norm2
.isNormalized(s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
) ||
190 spanCondition
=USET_SPAN_NOT_CONTAINED
;
192 prevSpanLimit
=spanLimit
;
197 UNormalizationCheckResult
198 FilteredNormalizer2::quickCheck(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
199 uprv_checkCanGetBuffer(s
, errorCode
);
200 if(U_FAILURE(errorCode
)) {
203 UNormalizationCheckResult result
=UNORM_YES
;
204 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
205 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
206 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
207 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
208 spanCondition
=USET_SPAN_SIMPLE
;
210 UNormalizationCheckResult qcResult
=
211 norm2
.quickCheck(s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
);
212 if(U_FAILURE(errorCode
) || qcResult
==UNORM_NO
) {
214 } else if(qcResult
==UNORM_MAYBE
) {
217 spanCondition
=USET_SPAN_NOT_CONTAINED
;
219 prevSpanLimit
=spanLimit
;
225 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&errorCode
) const {
226 uprv_checkCanGetBuffer(s
, errorCode
);
227 if(U_FAILURE(errorCode
)) {
230 USetSpanCondition spanCondition
=USET_SPAN_SIMPLE
;
231 for(int32_t prevSpanLimit
=0; prevSpanLimit
<s
.length();) {
232 int32_t spanLimit
=set
.span(s
, prevSpanLimit
, spanCondition
);
233 if(spanCondition
==USET_SPAN_NOT_CONTAINED
) {
234 spanCondition
=USET_SPAN_SIMPLE
;
238 norm2
.spanQuickCheckYes(
239 s
.tempSubStringBetween(prevSpanLimit
, spanLimit
), errorCode
);
240 if(U_FAILURE(errorCode
) || yesLimit
<spanLimit
) {
243 spanCondition
=USET_SPAN_NOT_CONTAINED
;
245 prevSpanLimit
=spanLimit
;
251 FilteredNormalizer2::hasBoundaryBefore(UChar32 c
) const {
252 return !set
.contains(c
) || norm2
.hasBoundaryBefore(c
);
256 FilteredNormalizer2::hasBoundaryAfter(UChar32 c
) const {
257 return !set
.contains(c
) || norm2
.hasBoundaryAfter(c
);
261 FilteredNormalizer2::isInert(UChar32 c
) const {
262 return !set
.contains(c
) || norm2
.isInert(c
);
// C API ------------------------------------------------------------------- ***
271 U_CAPI UNormalizer2
* U_EXPORT2
272 unorm2_openFiltered(const UNormalizer2
*norm2
, const USet
*filterSet
, UErrorCode
*pErrorCode
) {
273 if(U_FAILURE(*pErrorCode
)) {
276 if(filterSet
==NULL
) {
277 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
280 Normalizer2
*fn2
=new FilteredNormalizer2(*(Normalizer2
*)norm2
,
281 *UnicodeSet::fromUSet(filterSet
));
283 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
285 return (UNormalizer2
*)fn2
;
288 #endif // !UCONFIG_NO_NORMALIZATION