]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/filterednormalizer2.cpp
ICU-64232.0.1.tar.gz
[apple/icu.git] / icuSources / common / filterednormalizer2.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: filterednormalizer2.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009dec10
16 * created by: Markus W. Scherer
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_NORMALIZATION
22
23 #include "unicode/edits.h"
24 #include "unicode/normalizer2.h"
25 #include "unicode/stringoptions.h"
26 #include "unicode/uniset.h"
27 #include "unicode/unistr.h"
28 #include "unicode/unorm.h"
29 #include "cpputils.h"
30
31 U_NAMESPACE_BEGIN
32
33 FilteredNormalizer2::~FilteredNormalizer2() {}
34
35 UnicodeString &
36 FilteredNormalizer2::normalize(const UnicodeString &src,
37 UnicodeString &dest,
38 UErrorCode &errorCode) const {
39 uprv_checkCanGetBuffer(src, errorCode);
40 if(U_FAILURE(errorCode)) {
41 dest.setToBogus();
42 return dest;
43 }
44 if(&dest==&src) {
45 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
46 return dest;
47 }
48 dest.remove();
49 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
50 }
51
52 // Internal: No argument checking, and appends to dest.
53 // Pass as input spanCondition the one that is likely to yield a non-zero
54 // span length at the start of src.
55 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
56 // USET_SPAN_SIMPLE should be passed in for the start of src
57 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
58 // an in-filter prefix.
59 UnicodeString &
60 FilteredNormalizer2::normalize(const UnicodeString &src,
61 UnicodeString &dest,
62 USetSpanCondition spanCondition,
63 UErrorCode &errorCode) const {
64 UnicodeString tempDest; // Don't throw away destination buffer between iterations.
65 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
66 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
67 int32_t spanLength=spanLimit-prevSpanLimit;
68 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
69 if(spanLength!=0) {
70 dest.append(src, prevSpanLimit, spanLength);
71 }
72 spanCondition=USET_SPAN_SIMPLE;
73 } else {
74 if(spanLength!=0) {
75 // Not norm2.normalizeSecondAndAppend() because we do not want
76 // to modify the non-filter part of dest.
77 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
78 tempDest, errorCode));
79 if(U_FAILURE(errorCode)) {
80 break;
81 }
82 }
83 spanCondition=USET_SPAN_NOT_CONTAINED;
84 }
85 prevSpanLimit=spanLimit;
86 }
87 return dest;
88 }
89
90 void
91 FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
92 Edits *edits, UErrorCode &errorCode) const {
93 if (U_FAILURE(errorCode)) {
94 return;
95 }
96 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
97 edits->reset();
98 }
99 options |= U_EDITS_NO_RESET; // Do not reset for each span.
100 normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode);
101 }
102
103 void
104 FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length,
105 ByteSink &sink, Edits *edits,
106 USetSpanCondition spanCondition,
107 UErrorCode &errorCode) const {
108 while (length > 0) {
109 int32_t spanLength = set.spanUTF8(src, length, spanCondition);
110 if (spanCondition == USET_SPAN_NOT_CONTAINED) {
111 if (spanLength != 0) {
112 if (edits != nullptr) {
113 edits->addUnchanged(spanLength);
114 }
115 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
116 sink.Append(src, spanLength);
117 }
118 }
119 spanCondition = USET_SPAN_SIMPLE;
120 } else {
121 if (spanLength != 0) {
122 // Not norm2.normalizeSecondAndAppend() because we do not want
123 // to modify the non-filter part of dest.
124 norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode);
125 if (U_FAILURE(errorCode)) {
126 break;
127 }
128 }
129 spanCondition = USET_SPAN_NOT_CONTAINED;
130 }
131 src += spanLength;
132 length -= spanLength;
133 }
134 }
135
136 UnicodeString &
137 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
138 const UnicodeString &second,
139 UErrorCode &errorCode) const {
140 return normalizeSecondAndAppend(first, second, TRUE, errorCode);
141 }
142
143 UnicodeString &
144 FilteredNormalizer2::append(UnicodeString &first,
145 const UnicodeString &second,
146 UErrorCode &errorCode) const {
147 return normalizeSecondAndAppend(first, second, FALSE, errorCode);
148 }
149
150 UnicodeString &
151 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
152 const UnicodeString &second,
153 UBool doNormalize,
154 UErrorCode &errorCode) const {
155 uprv_checkCanGetBuffer(first, errorCode);
156 uprv_checkCanGetBuffer(second, errorCode);
157 if(U_FAILURE(errorCode)) {
158 return first;
159 }
160 if(&first==&second) {
161 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
162 return first;
163 }
164 if(first.isEmpty()) {
165 if(doNormalize) {
166 return normalize(second, first, errorCode);
167 } else {
168 return first=second;
169 }
170 }
171 // merge the in-filter suffix of the first string with the in-filter prefix of the second
172 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
173 if(prefixLimit!=0) {
174 UnicodeString prefix(second.tempSubString(0, prefixLimit));
175 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
176 if(suffixStart==0) {
177 if(doNormalize) {
178 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
179 } else {
180 norm2.append(first, prefix, errorCode);
181 }
182 } else {
183 UnicodeString middle(first, suffixStart, INT32_MAX);
184 if(doNormalize) {
185 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
186 } else {
187 norm2.append(middle, prefix, errorCode);
188 }
189 first.replace(suffixStart, INT32_MAX, middle);
190 }
191 }
192 if(prefixLimit<second.length()) {
193 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
194 if(doNormalize) {
195 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
196 } else {
197 first.append(rest);
198 }
199 }
200 return first;
201 }
202
203 UBool
204 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
205 return set.contains(c) && norm2.getDecomposition(c, decomposition);
206 }
207
208 UBool
209 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
210 return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
211 }
212
213 UChar32
214 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
215 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
216 }
217
218 uint8_t
219 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
220 return set.contains(c) ? norm2.getCombiningClass(c) : 0;
221 }
222
223 UBool
224 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
225 uprv_checkCanGetBuffer(s, errorCode);
226 if(U_FAILURE(errorCode)) {
227 return FALSE;
228 }
229 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
230 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
231 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
232 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
233 spanCondition=USET_SPAN_SIMPLE;
234 } else {
235 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
236 U_FAILURE(errorCode)
237 ) {
238 return FALSE;
239 }
240 spanCondition=USET_SPAN_NOT_CONTAINED;
241 }
242 prevSpanLimit=spanLimit;
243 }
244 return TRUE;
245 }
246
247 UBool
248 FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
249 if(U_FAILURE(errorCode)) {
250 return FALSE;
251 }
252 const char *s = sp.data();
253 int32_t length = sp.length();
254 USetSpanCondition spanCondition = USET_SPAN_SIMPLE;
255 while (length > 0) {
256 int32_t spanLength = set.spanUTF8(s, length, spanCondition);
257 if (spanCondition == USET_SPAN_NOT_CONTAINED) {
258 spanCondition = USET_SPAN_SIMPLE;
259 } else {
260 if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) ||
261 U_FAILURE(errorCode)) {
262 return FALSE;
263 }
264 spanCondition = USET_SPAN_NOT_CONTAINED;
265 }
266 s += spanLength;
267 length -= spanLength;
268 }
269 return TRUE;
270 }
271
272 UNormalizationCheckResult
273 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
274 uprv_checkCanGetBuffer(s, errorCode);
275 if(U_FAILURE(errorCode)) {
276 return UNORM_MAYBE;
277 }
278 UNormalizationCheckResult result=UNORM_YES;
279 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
280 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
281 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
282 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
283 spanCondition=USET_SPAN_SIMPLE;
284 } else {
285 UNormalizationCheckResult qcResult=
286 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
287 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
288 return qcResult;
289 } else if(qcResult==UNORM_MAYBE) {
290 result=qcResult;
291 }
292 spanCondition=USET_SPAN_NOT_CONTAINED;
293 }
294 prevSpanLimit=spanLimit;
295 }
296 return result;
297 }
298
299 int32_t
300 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
301 uprv_checkCanGetBuffer(s, errorCode);
302 if(U_FAILURE(errorCode)) {
303 return 0;
304 }
305 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
306 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
307 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
308 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
309 spanCondition=USET_SPAN_SIMPLE;
310 } else {
311 int32_t yesLimit=
312 prevSpanLimit+
313 norm2.spanQuickCheckYes(
314 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
315 if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
316 return yesLimit;
317 }
318 spanCondition=USET_SPAN_NOT_CONTAINED;
319 }
320 prevSpanLimit=spanLimit;
321 }
322 return s.length();
323 }
324
325 UBool
326 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
327 return !set.contains(c) || norm2.hasBoundaryBefore(c);
328 }
329
330 UBool
331 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
332 return !set.contains(c) || norm2.hasBoundaryAfter(c);
333 }
334
335 UBool
336 FilteredNormalizer2::isInert(UChar32 c) const {
337 return !set.contains(c) || norm2.isInert(c);
338 }
339
340 U_NAMESPACE_END
341
342 // C API ------------------------------------------------------------------- ***
343
344 U_NAMESPACE_USE
345
346 U_CAPI UNormalizer2 * U_EXPORT2
347 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
348 if(U_FAILURE(*pErrorCode)) {
349 return NULL;
350 }
351 if(filterSet==NULL) {
352 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
353 return NULL;
354 }
355 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
356 *UnicodeSet::fromUSet(filterSet));
357 if(fn2==NULL) {
358 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
359 }
360 return (UNormalizer2 *)fn2;
361 }
362
363 #endif // !UCONFIG_NO_NORMALIZATION