]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2009-2010, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: filterednormalizer2.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2009dec10 | |
14 | * created by: Markus W. Scherer | |
15 | */ | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | ||
19 | #if !UCONFIG_NO_NORMALIZATION | |
20 | ||
21 | #include "unicode/normalizer2.h" | |
22 | #include "unicode/uniset.h" | |
23 | #include "unicode/unistr.h" | |
24 | #include "unicode/unorm.h" | |
25 | #include "cpputils.h" | |
26 | ||
27 | U_NAMESPACE_BEGIN | |
28 | ||
29 | UnicodeString & | |
30 | FilteredNormalizer2::normalize(const UnicodeString &src, | |
31 | UnicodeString &dest, | |
32 | UErrorCode &errorCode) const { | |
33 | uprv_checkCanGetBuffer(src, errorCode); | |
34 | if(U_FAILURE(errorCode)) { | |
35 | dest.setToBogus(); | |
36 | return dest; | |
37 | } | |
38 | if(&dest==&src) { | |
39 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
40 | return dest; | |
41 | } | |
42 | dest.remove(); | |
43 | return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); | |
44 | } | |
45 | ||
46 | // Internal: No argument checking, and appends to dest. | |
47 | // Pass as input spanCondition the one that is likely to yield a non-zero | |
48 | // span length at the start of src. | |
49 | // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, | |
50 | // USET_SPAN_SIMPLE should be passed in for the start of src | |
51 | // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after | |
52 | // an in-filter prefix. | |
53 | UnicodeString & | |
54 | FilteredNormalizer2::normalize(const UnicodeString &src, | |
55 | UnicodeString &dest, | |
56 | USetSpanCondition spanCondition, | |
57 | UErrorCode &errorCode) const { | |
58 | UnicodeString tempDest; // Don't throw away destination buffer between iterations. | |
59 | for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) { | |
60 | int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition); | |
61 | int32_t spanLength=spanLimit-prevSpanLimit; | |
62 | if(spanCondition==USET_SPAN_NOT_CONTAINED) { | |
63 | if(spanLength!=0) { | |
64 | dest.append(src, prevSpanLimit, spanLength); | |
65 | } | |
66 | spanCondition=USET_SPAN_SIMPLE; | |
67 | } else { | |
68 | if(spanLength!=0) { | |
69 | // Not norm2.normalizeSecondAndAppend() because we do not want | |
70 | // to modify the non-filter part of dest. | |
71 | dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit), | |
72 | tempDest, errorCode)); | |
73 | if(U_FAILURE(errorCode)) { | |
74 | break; | |
75 | } | |
76 | } | |
77 | spanCondition=USET_SPAN_NOT_CONTAINED; | |
78 | } | |
79 | prevSpanLimit=spanLimit; | |
80 | } | |
81 | return dest; | |
82 | } | |
83 | ||
84 | UnicodeString & | |
85 | FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, | |
86 | const UnicodeString &second, | |
87 | UErrorCode &errorCode) const { | |
88 | return normalizeSecondAndAppend(first, second, TRUE, errorCode); | |
89 | } | |
90 | ||
91 | UnicodeString & | |
92 | FilteredNormalizer2::append(UnicodeString &first, | |
93 | const UnicodeString &second, | |
94 | UErrorCode &errorCode) const { | |
95 | return normalizeSecondAndAppend(first, second, FALSE, errorCode); | |
96 | } | |
97 | ||
98 | UnicodeString & | |
99 | FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, | |
100 | const UnicodeString &second, | |
101 | UBool doNormalize, | |
102 | UErrorCode &errorCode) const { | |
103 | uprv_checkCanGetBuffer(first, errorCode); | |
104 | uprv_checkCanGetBuffer(second, errorCode); | |
105 | if(U_FAILURE(errorCode)) { | |
106 | return first; | |
107 | } | |
108 | if(&first==&second) { | |
109 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
110 | return first; | |
111 | } | |
112 | if(first.isEmpty()) { | |
113 | if(doNormalize) { | |
114 | return normalize(second, first, errorCode); | |
115 | } else { | |
116 | return first=second; | |
117 | } | |
118 | } | |
119 | // merge the in-filter suffix of the first string with the in-filter prefix of the second | |
120 | int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); | |
121 | if(prefixLimit!=0) { | |
122 | UnicodeString prefix(second.tempSubString(0, prefixLimit)); | |
123 | int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); | |
124 | if(suffixStart==0) { | |
125 | if(doNormalize) { | |
126 | norm2.normalizeSecondAndAppend(first, prefix, errorCode); | |
127 | } else { | |
128 | norm2.append(first, prefix, errorCode); | |
129 | } | |
130 | } else { | |
131 | UnicodeString middle(first, suffixStart, INT32_MAX); | |
132 | if(doNormalize) { | |
133 | norm2.normalizeSecondAndAppend(middle, prefix, errorCode); | |
134 | } else { | |
135 | norm2.append(middle, prefix, errorCode); | |
136 | } | |
137 | first.replace(suffixStart, INT32_MAX, middle); | |
138 | } | |
139 | } | |
140 | if(prefixLimit<second.length()) { | |
141 | UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX)); | |
142 | if(doNormalize) { | |
143 | normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode); | |
144 | } else { | |
145 | first.append(rest); | |
146 | } | |
147 | } | |
148 | return first; | |
149 | } | |
150 | ||
151 | UBool | |
152 | FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const { | |
153 | return set.contains(c) && norm2.getDecomposition(c, decomposition); | |
154 | } | |
155 | ||
156 | UBool | |
157 | FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { | |
158 | uprv_checkCanGetBuffer(s, errorCode); | |
159 | if(U_FAILURE(errorCode)) { | |
160 | return FALSE; | |
161 | } | |
162 | USetSpanCondition spanCondition=USET_SPAN_SIMPLE; | |
163 | for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { | |
164 | int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); | |
165 | if(spanCondition==USET_SPAN_NOT_CONTAINED) { | |
166 | spanCondition=USET_SPAN_SIMPLE; | |
167 | } else { | |
168 | if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) || | |
169 | U_FAILURE(errorCode) | |
170 | ) { | |
171 | return FALSE; | |
172 | } | |
173 | spanCondition=USET_SPAN_NOT_CONTAINED; | |
174 | } | |
175 | prevSpanLimit=spanLimit; | |
176 | } | |
177 | return TRUE; | |
178 | } | |
179 | ||
180 | UNormalizationCheckResult | |
181 | FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { | |
182 | uprv_checkCanGetBuffer(s, errorCode); | |
183 | if(U_FAILURE(errorCode)) { | |
184 | return UNORM_MAYBE; | |
185 | } | |
186 | UNormalizationCheckResult result=UNORM_YES; | |
187 | USetSpanCondition spanCondition=USET_SPAN_SIMPLE; | |
188 | for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { | |
189 | int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); | |
190 | if(spanCondition==USET_SPAN_NOT_CONTAINED) { | |
191 | spanCondition=USET_SPAN_SIMPLE; | |
192 | } else { | |
193 | UNormalizationCheckResult qcResult= | |
194 | norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); | |
195 | if(U_FAILURE(errorCode) || qcResult==UNORM_NO) { | |
196 | return qcResult; | |
197 | } else if(qcResult==UNORM_MAYBE) { | |
198 | result=qcResult; | |
199 | } | |
200 | spanCondition=USET_SPAN_NOT_CONTAINED; | |
201 | } | |
202 | prevSpanLimit=spanLimit; | |
203 | } | |
204 | return result; | |
205 | } | |
206 | ||
207 | int32_t | |
208 | FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const { | |
209 | uprv_checkCanGetBuffer(s, errorCode); | |
210 | if(U_FAILURE(errorCode)) { | |
211 | return 0; | |
212 | } | |
213 | USetSpanCondition spanCondition=USET_SPAN_SIMPLE; | |
214 | for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { | |
215 | int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); | |
216 | if(spanCondition==USET_SPAN_NOT_CONTAINED) { | |
217 | spanCondition=USET_SPAN_SIMPLE; | |
218 | } else { | |
219 | int32_t yesLimit= | |
220 | prevSpanLimit+ | |
221 | norm2.spanQuickCheckYes( | |
222 | s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); | |
223 | if(U_FAILURE(errorCode) || yesLimit<spanLimit) { | |
224 | return yesLimit; | |
225 | } | |
226 | spanCondition=USET_SPAN_NOT_CONTAINED; | |
227 | } | |
228 | prevSpanLimit=spanLimit; | |
229 | } | |
230 | return s.length(); | |
231 | } | |
232 | ||
233 | UBool | |
234 | FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const { | |
235 | return !set.contains(c) || norm2.hasBoundaryBefore(c); | |
236 | } | |
237 | ||
238 | UBool | |
239 | FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const { | |
240 | return !set.contains(c) || norm2.hasBoundaryAfter(c); | |
241 | } | |
242 | ||
243 | UBool | |
244 | FilteredNormalizer2::isInert(UChar32 c) const { | |
245 | return !set.contains(c) || norm2.isInert(c); | |
246 | } | |
247 | ||
248 | U_NAMESPACE_END | |
249 | ||
250 | // C API ------------------------------------------------------------------- *** | |
251 | ||
252 | U_NAMESPACE_USE | |
253 | ||
254 | U_DRAFT UNormalizer2 * U_EXPORT2 | |
255 | unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) { | |
256 | if(U_FAILURE(*pErrorCode)) { | |
257 | return NULL; | |
258 | } | |
259 | if(filterSet==NULL) { | |
260 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
261 | return NULL; | |
262 | } | |
263 | Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2, | |
264 | *UnicodeSet::fromUSet(filterSet)); | |
265 | if(fn2==NULL) { | |
266 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
267 | } | |
268 | return (UNormalizer2 *)fn2; | |
269 | } | |
270 | ||
271 | #endif // !UCONFIG_NO_NORMALIZATION |