ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / common / filterednormalizer2.cpp
CommitLineData
729e4ab9
A
/*
*******************************************************************************
*
*   Copyright (C) 2009-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  filterednormalizer2.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009dec10
*   created by: Markus W. Scherer
*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_NORMALIZATION
20
21#include "unicode/normalizer2.h"
22#include "unicode/uniset.h"
23#include "unicode/unistr.h"
24#include "unicode/unorm.h"
25#include "cpputils.h"
26
27U_NAMESPACE_BEGIN
28
4388f060
A
29FilteredNormalizer2::~FilteredNormalizer2() {}
30
729e4ab9
A
31UnicodeString &
32FilteredNormalizer2::normalize(const UnicodeString &src,
33 UnicodeString &dest,
34 UErrorCode &errorCode) const {
35 uprv_checkCanGetBuffer(src, errorCode);
36 if(U_FAILURE(errorCode)) {
37 dest.setToBogus();
38 return dest;
39 }
40 if(&dest==&src) {
41 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
42 return dest;
43 }
44 dest.remove();
45 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
46}
47
48// Internal: No argument checking, and appends to dest.
49// Pass as input spanCondition the one that is likely to yield a non-zero
50// span length at the start of src.
51// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
52// USET_SPAN_SIMPLE should be passed in for the start of src
53// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
54// an in-filter prefix.
55UnicodeString &
56FilteredNormalizer2::normalize(const UnicodeString &src,
57 UnicodeString &dest,
58 USetSpanCondition spanCondition,
59 UErrorCode &errorCode) const {
60 UnicodeString tempDest; // Don't throw away destination buffer between iterations.
61 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
62 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
63 int32_t spanLength=spanLimit-prevSpanLimit;
64 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
65 if(spanLength!=0) {
66 dest.append(src, prevSpanLimit, spanLength);
67 }
68 spanCondition=USET_SPAN_SIMPLE;
69 } else {
70 if(spanLength!=0) {
71 // Not norm2.normalizeSecondAndAppend() because we do not want
72 // to modify the non-filter part of dest.
73 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
74 tempDest, errorCode));
75 if(U_FAILURE(errorCode)) {
76 break;
77 }
78 }
79 spanCondition=USET_SPAN_NOT_CONTAINED;
80 }
81 prevSpanLimit=spanLimit;
82 }
83 return dest;
84}
85
86UnicodeString &
87FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
88 const UnicodeString &second,
89 UErrorCode &errorCode) const {
90 return normalizeSecondAndAppend(first, second, TRUE, errorCode);
91}
92
93UnicodeString &
94FilteredNormalizer2::append(UnicodeString &first,
95 const UnicodeString &second,
96 UErrorCode &errorCode) const {
97 return normalizeSecondAndAppend(first, second, FALSE, errorCode);
98}
99
100UnicodeString &
101FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
102 const UnicodeString &second,
103 UBool doNormalize,
104 UErrorCode &errorCode) const {
105 uprv_checkCanGetBuffer(first, errorCode);
106 uprv_checkCanGetBuffer(second, errorCode);
107 if(U_FAILURE(errorCode)) {
108 return first;
109 }
110 if(&first==&second) {
111 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
112 return first;
113 }
114 if(first.isEmpty()) {
115 if(doNormalize) {
116 return normalize(second, first, errorCode);
117 } else {
118 return first=second;
119 }
120 }
121 // merge the in-filter suffix of the first string with the in-filter prefix of the second
122 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
123 if(prefixLimit!=0) {
124 UnicodeString prefix(second.tempSubString(0, prefixLimit));
125 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
126 if(suffixStart==0) {
127 if(doNormalize) {
128 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
129 } else {
130 norm2.append(first, prefix, errorCode);
131 }
132 } else {
133 UnicodeString middle(first, suffixStart, INT32_MAX);
134 if(doNormalize) {
135 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
136 } else {
137 norm2.append(middle, prefix, errorCode);
138 }
139 first.replace(suffixStart, INT32_MAX, middle);
140 }
141 }
142 if(prefixLimit<second.length()) {
143 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
144 if(doNormalize) {
145 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
146 } else {
147 first.append(rest);
148 }
149 }
150 return first;
151}
152
153UBool
154FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
155 return set.contains(c) && norm2.getDecomposition(c, decomposition);
156}
157
4388f060
A
158UBool
159FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
160 return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
161}
162
163UChar32
164FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
165 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
166}
167
168uint8_t
169FilteredNormalizer2::getCombiningClass(UChar32 c) const {
170 return set.contains(c) ? norm2.getCombiningClass(c) : 0;
171}
172
729e4ab9
A
173UBool
174FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
175 uprv_checkCanGetBuffer(s, errorCode);
176 if(U_FAILURE(errorCode)) {
177 return FALSE;
178 }
179 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
180 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
181 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
182 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
183 spanCondition=USET_SPAN_SIMPLE;
184 } else {
185 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
186 U_FAILURE(errorCode)
187 ) {
188 return FALSE;
189 }
190 spanCondition=USET_SPAN_NOT_CONTAINED;
191 }
192 prevSpanLimit=spanLimit;
193 }
194 return TRUE;
195}
196
197UNormalizationCheckResult
198FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
199 uprv_checkCanGetBuffer(s, errorCode);
200 if(U_FAILURE(errorCode)) {
201 return UNORM_MAYBE;
202 }
203 UNormalizationCheckResult result=UNORM_YES;
204 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
205 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
206 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
207 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
208 spanCondition=USET_SPAN_SIMPLE;
209 } else {
210 UNormalizationCheckResult qcResult=
211 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
212 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
213 return qcResult;
214 } else if(qcResult==UNORM_MAYBE) {
215 result=qcResult;
216 }
217 spanCondition=USET_SPAN_NOT_CONTAINED;
218 }
219 prevSpanLimit=spanLimit;
220 }
221 return result;
222}
223
224int32_t
225FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
226 uprv_checkCanGetBuffer(s, errorCode);
227 if(U_FAILURE(errorCode)) {
228 return 0;
229 }
230 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
231 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
232 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
233 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
234 spanCondition=USET_SPAN_SIMPLE;
235 } else {
236 int32_t yesLimit=
237 prevSpanLimit+
238 norm2.spanQuickCheckYes(
239 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
240 if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
241 return yesLimit;
242 }
243 spanCondition=USET_SPAN_NOT_CONTAINED;
244 }
245 prevSpanLimit=spanLimit;
246 }
247 return s.length();
248}
249
250UBool
251FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
252 return !set.contains(c) || norm2.hasBoundaryBefore(c);
253}
254
255UBool
256FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
257 return !set.contains(c) || norm2.hasBoundaryAfter(c);
258}
259
260UBool
261FilteredNormalizer2::isInert(UChar32 c) const {
262 return !set.contains(c) || norm2.isInert(c);
263}
264
265U_NAMESPACE_END
266
267// C API ------------------------------------------------------------------- ***
268
269U_NAMESPACE_USE
270
51004dcb 271U_CAPI UNormalizer2 * U_EXPORT2
729e4ab9
A
272unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
273 if(U_FAILURE(*pErrorCode)) {
274 return NULL;
275 }
276 if(filterSet==NULL) {
277 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
278 return NULL;
279 }
280 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
281 *UnicodeSet::fromUSet(filterSet));
282 if(fn2==NULL) {
283 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
284 }
285 return (UNormalizer2 *)fn2;
286}
287
288#endif // !UCONFIG_NO_NORMALIZATION