]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/filterednormalizer2.cpp
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / filterednormalizer2.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
729e4ab9
A
3/*
4*******************************************************************************
5*
51004dcb 6* Copyright (C) 2009-2012, International Business Machines
729e4ab9
A
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: filterednormalizer2.cpp
f3c0d7a5 11* encoding: UTF-8
729e4ab9
A
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009dec10
16* created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_NORMALIZATION
22
23#include "unicode/normalizer2.h"
24#include "unicode/uniset.h"
25#include "unicode/unistr.h"
26#include "unicode/unorm.h"
27#include "cpputils.h"
28
29U_NAMESPACE_BEGIN
30
4388f060
A
31FilteredNormalizer2::~FilteredNormalizer2() {}
32
729e4ab9
A
33UnicodeString &
34FilteredNormalizer2::normalize(const UnicodeString &src,
35 UnicodeString &dest,
36 UErrorCode &errorCode) const {
37 uprv_checkCanGetBuffer(src, errorCode);
38 if(U_FAILURE(errorCode)) {
39 dest.setToBogus();
40 return dest;
41 }
42 if(&dest==&src) {
43 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
44 return dest;
45 }
46 dest.remove();
47 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
48}
49
50// Internal: No argument checking, and appends to dest.
51// Pass as input spanCondition the one that is likely to yield a non-zero
52// span length at the start of src.
53// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
54// USET_SPAN_SIMPLE should be passed in for the start of src
55// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
56// an in-filter prefix.
57UnicodeString &
58FilteredNormalizer2::normalize(const UnicodeString &src,
59 UnicodeString &dest,
60 USetSpanCondition spanCondition,
61 UErrorCode &errorCode) const {
62 UnicodeString tempDest; // Don't throw away destination buffer between iterations.
63 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
64 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
65 int32_t spanLength=spanLimit-prevSpanLimit;
66 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
67 if(spanLength!=0) {
68 dest.append(src, prevSpanLimit, spanLength);
69 }
70 spanCondition=USET_SPAN_SIMPLE;
71 } else {
72 if(spanLength!=0) {
73 // Not norm2.normalizeSecondAndAppend() because we do not want
74 // to modify the non-filter part of dest.
75 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
76 tempDest, errorCode));
77 if(U_FAILURE(errorCode)) {
78 break;
79 }
80 }
81 spanCondition=USET_SPAN_NOT_CONTAINED;
82 }
83 prevSpanLimit=spanLimit;
84 }
85 return dest;
86}
87
88UnicodeString &
89FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
90 const UnicodeString &second,
91 UErrorCode &errorCode) const {
92 return normalizeSecondAndAppend(first, second, TRUE, errorCode);
93}
94
95UnicodeString &
96FilteredNormalizer2::append(UnicodeString &first,
97 const UnicodeString &second,
98 UErrorCode &errorCode) const {
99 return normalizeSecondAndAppend(first, second, FALSE, errorCode);
100}
101
102UnicodeString &
103FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
104 const UnicodeString &second,
105 UBool doNormalize,
106 UErrorCode &errorCode) const {
107 uprv_checkCanGetBuffer(first, errorCode);
108 uprv_checkCanGetBuffer(second, errorCode);
109 if(U_FAILURE(errorCode)) {
110 return first;
111 }
112 if(&first==&second) {
113 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
114 return first;
115 }
116 if(first.isEmpty()) {
117 if(doNormalize) {
118 return normalize(second, first, errorCode);
119 } else {
120 return first=second;
121 }
122 }
123 // merge the in-filter suffix of the first string with the in-filter prefix of the second
124 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
125 if(prefixLimit!=0) {
126 UnicodeString prefix(second.tempSubString(0, prefixLimit));
127 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
128 if(suffixStart==0) {
129 if(doNormalize) {
130 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
131 } else {
132 norm2.append(first, prefix, errorCode);
133 }
134 } else {
135 UnicodeString middle(first, suffixStart, INT32_MAX);
136 if(doNormalize) {
137 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
138 } else {
139 norm2.append(middle, prefix, errorCode);
140 }
141 first.replace(suffixStart, INT32_MAX, middle);
142 }
143 }
144 if(prefixLimit<second.length()) {
145 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
146 if(doNormalize) {
147 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
148 } else {
149 first.append(rest);
150 }
151 }
152 return first;
153}
154
155UBool
156FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
157 return set.contains(c) && norm2.getDecomposition(c, decomposition);
158}
159
4388f060
A
160UBool
161FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
162 return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
163}
164
165UChar32
166FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
167 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
168}
169
170uint8_t
171FilteredNormalizer2::getCombiningClass(UChar32 c) const {
172 return set.contains(c) ? norm2.getCombiningClass(c) : 0;
173}
174
729e4ab9
A
175UBool
176FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
177 uprv_checkCanGetBuffer(s, errorCode);
178 if(U_FAILURE(errorCode)) {
179 return FALSE;
180 }
181 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
182 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
183 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
184 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
185 spanCondition=USET_SPAN_SIMPLE;
186 } else {
187 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
188 U_FAILURE(errorCode)
189 ) {
190 return FALSE;
191 }
192 spanCondition=USET_SPAN_NOT_CONTAINED;
193 }
194 prevSpanLimit=spanLimit;
195 }
196 return TRUE;
197}
198
199UNormalizationCheckResult
200FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
201 uprv_checkCanGetBuffer(s, errorCode);
202 if(U_FAILURE(errorCode)) {
203 return UNORM_MAYBE;
204 }
205 UNormalizationCheckResult result=UNORM_YES;
206 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
207 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
208 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
209 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
210 spanCondition=USET_SPAN_SIMPLE;
211 } else {
212 UNormalizationCheckResult qcResult=
213 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
214 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
215 return qcResult;
216 } else if(qcResult==UNORM_MAYBE) {
217 result=qcResult;
218 }
219 spanCondition=USET_SPAN_NOT_CONTAINED;
220 }
221 prevSpanLimit=spanLimit;
222 }
223 return result;
224}
225
226int32_t
227FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
228 uprv_checkCanGetBuffer(s, errorCode);
229 if(U_FAILURE(errorCode)) {
230 return 0;
231 }
232 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
233 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
234 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
235 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
236 spanCondition=USET_SPAN_SIMPLE;
237 } else {
238 int32_t yesLimit=
239 prevSpanLimit+
240 norm2.spanQuickCheckYes(
241 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
242 if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
243 return yesLimit;
244 }
245 spanCondition=USET_SPAN_NOT_CONTAINED;
246 }
247 prevSpanLimit=spanLimit;
248 }
249 return s.length();
250}
251
252UBool
253FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
254 return !set.contains(c) || norm2.hasBoundaryBefore(c);
255}
256
257UBool
258FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
259 return !set.contains(c) || norm2.hasBoundaryAfter(c);
260}
261
262UBool
263FilteredNormalizer2::isInert(UChar32 c) const {
264 return !set.contains(c) || norm2.isInert(c);
265}
266
267U_NAMESPACE_END
268
269// C API ------------------------------------------------------------------- ***
270
271U_NAMESPACE_USE
272
51004dcb 273U_CAPI UNormalizer2 * U_EXPORT2
729e4ab9
A
274unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
275 if(U_FAILURE(*pErrorCode)) {
276 return NULL;
277 }
278 if(filterSet==NULL) {
279 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
280 return NULL;
281 }
282 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
283 *UnicodeSet::fromUSet(filterSet));
284 if(fn2==NULL) {
285 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
286 }
287 return (UNormalizer2 *)fn2;
288}
289
290#endif // !UCONFIG_NO_NORMALIZATION