]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ****************************************************************************** | |
5 | * Copyright (c) 1996-2014, International Business Machines | |
6 | * Corporation and others. All Rights Reserved. | |
7 | ****************************************************************************** | |
8 | * File unorm.cpp | |
9 | * | |
10 | * Created by: Vladimir Weinstein 12052000 | |
11 | * | |
12 | * Modification history : | |
13 | * | |
14 | * Date Name Description | |
15 | * 02/01/01 synwee Added normalization quickcheck enum and method. | |
16 | * 02/12/01 synwee Commented out quickcheck until api has been approved | |
17 | * Added private method for doing FCD checks | |
18 | * 02/23/01 synwee Modified quickcheck and checkFCE to run through | |
19 | * string for codepoints < 0x300 for the normalization | |
20 | * mode NFC. | |
21 | * 05/25/01+ Markus Scherer total rewrite, implement all normalization here | |
22 | * instead of just wrappers around normlzr.cpp, | |
23 | * load unorm.dat, support Unicode 3.1 with | |
24 | * supplementary code points, etc. | |
25 | * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code | |
26 | */ | |
27 | ||
28 | #include "unicode/utypes.h" | |
29 | ||
30 | #if !UCONFIG_NO_NORMALIZATION | |
31 | ||
32 | #include "unicode/udata.h" | |
33 | #include "unicode/ustring.h" | |
34 | #include "unicode/uiter.h" | |
35 | #include "unicode/unorm.h" | |
36 | #include "unicode/unorm2.h" | |
37 | #include "normalizer2impl.h" | |
38 | #include "unormimp.h" | |
39 | #include "uprops.h" | |
40 | #include "ustr_imp.h" | |
41 | ||
42 | U_NAMESPACE_USE | |
43 | ||
44 | /* quick check functions ---------------------------------------------------- */ | |
45 | ||
46 | U_CAPI UNormalizationCheckResult U_EXPORT2 | |
47 | unorm_quickCheck(const UChar *src, | |
48 | int32_t srcLength, | |
49 | UNormalizationMode mode, | |
50 | UErrorCode *pErrorCode) { | |
51 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
52 | return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
53 | } | |
54 | ||
55 | U_CAPI UNormalizationCheckResult U_EXPORT2 | |
56 | unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, | |
57 | UNormalizationMode mode, int32_t options, | |
58 | UErrorCode *pErrorCode) { | |
59 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
60 | if(options&UNORM_UNICODE_3_2) { | |
61 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); | |
62 | return unorm2_quickCheck( | |
63 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), | |
64 | src, srcLength, pErrorCode); | |
65 | } else { | |
66 | return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
67 | } | |
68 | } | |
69 | ||
70 | U_CAPI UBool U_EXPORT2 | |
71 | unorm_isNormalized(const UChar *src, int32_t srcLength, | |
72 | UNormalizationMode mode, | |
73 | UErrorCode *pErrorCode) { | |
74 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
75 | return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
76 | } | |
77 | ||
78 | U_CAPI UBool U_EXPORT2 | |
79 | unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, | |
80 | UNormalizationMode mode, int32_t options, | |
81 | UErrorCode *pErrorCode) { | |
82 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
83 | if(options&UNORM_UNICODE_3_2) { | |
84 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); | |
85 | return unorm2_isNormalized( | |
86 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), | |
87 | src, srcLength, pErrorCode); | |
88 | } else { | |
89 | return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
90 | } | |
91 | } | |
92 | ||
93 | /* normalize() API ---------------------------------------------------------- */ | |
94 | ||
95 | /** Public API for normalizing. */ | |
96 | U_CAPI int32_t U_EXPORT2 | |
97 | unorm_normalize(const UChar *src, int32_t srcLength, | |
98 | UNormalizationMode mode, int32_t options, | |
99 | UChar *dest, int32_t destCapacity, | |
100 | UErrorCode *pErrorCode) { | |
101 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
102 | if(options&UNORM_UNICODE_3_2) { | |
103 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); | |
104 | return unorm2_normalize( | |
105 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), | |
106 | src, srcLength, dest, destCapacity, pErrorCode); | |
107 | } else { | |
108 | return unorm2_normalize((const UNormalizer2 *)n2, | |
109 | src, srcLength, dest, destCapacity, pErrorCode); | |
110 | } | |
111 | } | |
112 | ||
113 | ||
114 | /* iteration functions ------------------------------------------------------ */ | |
115 | ||
116 | static int32_t | |
117 | _iterate(UCharIterator *src, UBool forward, | |
118 | UChar *dest, int32_t destCapacity, | |
119 | const Normalizer2 *n2, | |
120 | UBool doNormalize, UBool *pNeededToNormalize, | |
121 | UErrorCode *pErrorCode) { | |
122 | if(U_FAILURE(*pErrorCode)) { | |
123 | return 0; | |
124 | } | |
125 | if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) { | |
126 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
127 | return 0; | |
128 | } | |
129 | ||
130 | if(pNeededToNormalize!=NULL) { | |
131 | *pNeededToNormalize=FALSE; | |
132 | } | |
133 | if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { | |
134 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); | |
135 | } | |
136 | ||
137 | UnicodeString buffer; | |
138 | UChar32 c; | |
139 | if(forward) { | |
140 | /* get one character and ignore its properties */ | |
141 | buffer.append(uiter_next32(src)); | |
142 | /* get all following characters until we see a boundary */ | |
143 | while((c=uiter_next32(src))>=0) { | |
144 | if(n2->hasBoundaryBefore(c)) { | |
145 | /* back out the latest movement to stop at the boundary */ | |
146 | src->move(src, -U16_LENGTH(c), UITER_CURRENT); | |
147 | break; | |
148 | } else { | |
149 | buffer.append(c); | |
150 | } | |
151 | } | |
152 | } else { | |
153 | while((c=uiter_previous32(src))>=0) { | |
154 | /* always write this character to the front of the buffer */ | |
155 | buffer.insert(0, c); | |
156 | /* stop if this just-copied character is a boundary */ | |
157 | if(n2->hasBoundaryBefore(c)) { | |
158 | break; | |
159 | } | |
160 | } | |
161 | } | |
162 | ||
163 | UnicodeString destString(dest, 0, destCapacity); | |
164 | if(buffer.length()>0 && doNormalize) { | |
165 | n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); | |
166 | if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { | |
167 | *pNeededToNormalize= destString!=buffer; | |
168 | } | |
169 | return destString.length(); | |
170 | } else { | |
171 | /* just copy the source characters */ | |
172 | return buffer.extract(dest, destCapacity, *pErrorCode); | |
173 | } | |
174 | } | |
175 | ||
176 | static int32_t | |
177 | unorm_iterate(UCharIterator *src, UBool forward, | |
178 | UChar *dest, int32_t destCapacity, | |
179 | UNormalizationMode mode, int32_t options, | |
180 | UBool doNormalize, UBool *pNeededToNormalize, | |
181 | UErrorCode *pErrorCode) { | |
182 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
183 | if(options&UNORM_UNICODE_3_2) { | |
184 | const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); | |
185 | if(U_FAILURE(*pErrorCode)) { | |
186 | return 0; | |
187 | } | |
188 | FilteredNormalizer2 fn2(*n2, *uni32); | |
189 | return _iterate(src, forward, dest, destCapacity, | |
190 | &fn2, doNormalize, pNeededToNormalize, pErrorCode); | |
191 | } | |
192 | return _iterate(src, forward, dest, destCapacity, | |
193 | n2, doNormalize, pNeededToNormalize, pErrorCode); | |
194 | } | |
195 | ||
196 | U_CAPI int32_t U_EXPORT2 | |
197 | unorm_previous(UCharIterator *src, | |
198 | UChar *dest, int32_t destCapacity, | |
199 | UNormalizationMode mode, int32_t options, | |
200 | UBool doNormalize, UBool *pNeededToNormalize, | |
201 | UErrorCode *pErrorCode) { | |
202 | return unorm_iterate(src, FALSE, | |
203 | dest, destCapacity, | |
204 | mode, options, | |
205 | doNormalize, pNeededToNormalize, | |
206 | pErrorCode); | |
207 | } | |
208 | ||
209 | U_CAPI int32_t U_EXPORT2 | |
210 | unorm_next(UCharIterator *src, | |
211 | UChar *dest, int32_t destCapacity, | |
212 | UNormalizationMode mode, int32_t options, | |
213 | UBool doNormalize, UBool *pNeededToNormalize, | |
214 | UErrorCode *pErrorCode) { | |
215 | return unorm_iterate(src, TRUE, | |
216 | dest, destCapacity, | |
217 | mode, options, | |
218 | doNormalize, pNeededToNormalize, | |
219 | pErrorCode); | |
220 | } | |
221 | ||
222 | /* Concatenation of normalized strings -------------------------------------- */ | |
223 | ||
224 | static int32_t | |
225 | _concatenate(const UChar *left, int32_t leftLength, | |
226 | const UChar *right, int32_t rightLength, | |
227 | UChar *dest, int32_t destCapacity, | |
228 | const Normalizer2 *n2, | |
229 | UErrorCode *pErrorCode) { | |
230 | if(U_FAILURE(*pErrorCode)) { | |
231 | return 0; | |
232 | } | |
233 | if(destCapacity<0 || (dest==NULL && destCapacity>0) || | |
234 | left==NULL || leftLength<-1 || right==NULL || rightLength<-1) { | |
235 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
236 | return 0; | |
237 | } | |
238 | ||
239 | /* check for overlapping right and destination */ | |
240 | if( dest!=NULL && | |
241 | ((right>=dest && right<(dest+destCapacity)) || | |
242 | (rightLength>0 && dest>=right && dest<(right+rightLength))) | |
243 | ) { | |
244 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
245 | return 0; | |
246 | } | |
247 | ||
248 | /* allow left==dest */ | |
249 | UnicodeString destString; | |
250 | if(left==dest) { | |
251 | destString.setTo(dest, leftLength, destCapacity); | |
252 | } else { | |
253 | destString.setTo(dest, 0, destCapacity); | |
254 | destString.append(left, leftLength); | |
255 | } | |
256 | return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). | |
257 | extract(dest, destCapacity, *pErrorCode); | |
258 | } | |
259 | ||
260 | U_CAPI int32_t U_EXPORT2 | |
261 | unorm_concatenate(const UChar *left, int32_t leftLength, | |
262 | const UChar *right, int32_t rightLength, | |
263 | UChar *dest, int32_t destCapacity, | |
264 | UNormalizationMode mode, int32_t options, | |
265 | UErrorCode *pErrorCode) { | |
266 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
267 | if(options&UNORM_UNICODE_3_2) { | |
268 | const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); | |
269 | if(U_FAILURE(*pErrorCode)) { | |
270 | return 0; | |
271 | } | |
272 | FilteredNormalizer2 fn2(*n2, *uni32); | |
273 | return _concatenate(left, leftLength, right, rightLength, | |
274 | dest, destCapacity, &fn2, pErrorCode); | |
275 | } | |
276 | return _concatenate(left, leftLength, right, rightLength, | |
277 | dest, destCapacity, n2, pErrorCode); | |
278 | } | |
279 | ||
280 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |