]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ****************************************************************************** | |
b331163b | 5 | * Copyright (c) 1996-2014, International Business Machines |
b75a7d8f A |
6 | * Corporation and others. All Rights Reserved. |
7 | ****************************************************************************** | |
8 | * File unorm.cpp | |
9 | * | |
10 | * Created by: Vladimir Weinstein 12052000 | |
11 | * | |
12 | * Modification history : | |
13 | * | |
14 | * Date Name Description | |
15 | * 02/01/01 synwee Added normalization quickcheck enum and method. | |
16 | * 02/12/01 synwee Commented out quickcheck util api has been approved | |
17 | * Added private method for doing FCD checks | |
18 | * 02/23/01 synwee Modified quickcheck and checkFCE to run through | |
19 | * string for codepoints < 0x300 for the normalization | |
20 | * mode NFC. | |
21 | * 05/25/01+ Markus Scherer total rewrite, implement all normalization here | |
22 | * instead of just wrappers around normlzr.cpp, | |
23 | * load unorm.dat, support Unicode 3.1 with | |
24 | * supplementary code points, etc. | |
729e4ab9 | 25 | * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code |
b75a7d8f A |
26 | */ |
27 | ||
28 | #include "unicode/utypes.h" | |
29 | ||
b75a7d8f A |
30 | #if !UCONFIG_NO_NORMALIZATION |
31 | ||
32 | #include "unicode/udata.h" | |
374ca955 | 33 | #include "unicode/ustring.h" |
b75a7d8f | 34 | #include "unicode/uiter.h" |
b75a7d8f | 35 | #include "unicode/unorm.h" |
729e4ab9 A |
36 | #include "unicode/unorm2.h" |
37 | #include "normalizer2impl.h" | |
374ca955 | 38 | #include "unormimp.h" |
729e4ab9 A |
39 | #include "uprops.h" |
40 | #include "ustr_imp.h" | |
b75a7d8f | 41 | |
729e4ab9 | 42 | U_NAMESPACE_USE |
b75a7d8f | 43 | |
729e4ab9 | 44 | /* quick check functions ---------------------------------------------------- */ |
374ca955 A |
45 | |
46 | U_CAPI UNormalizationCheckResult U_EXPORT2 | |
47 | unorm_quickCheck(const UChar *src, | |
48 | int32_t srcLength, | |
49 | UNormalizationMode mode, | |
50 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
51 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
52 | return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
b75a7d8f A |
53 | } |
54 | ||
374ca955 A |
55 | U_CAPI UNormalizationCheckResult U_EXPORT2 |
56 | unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, | |
57 | UNormalizationMode mode, int32_t options, | |
58 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
59 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
60 | if(options&UNORM_UNICODE_3_2) { | |
61 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); | |
62 | return unorm2_quickCheck( | |
63 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), | |
64 | src, srcLength, pErrorCode); | |
65 | } else { | |
66 | return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
67 | } | |
374ca955 | 68 | } |
b75a7d8f | 69 | |
374ca955 A |
70 | U_CAPI UBool U_EXPORT2 |
71 | unorm_isNormalized(const UChar *src, int32_t srcLength, | |
72 | UNormalizationMode mode, | |
73 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
74 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
75 | return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
374ca955 | 76 | } |
b75a7d8f | 77 | |
374ca955 A |
78 | U_CAPI UBool U_EXPORT2 |
79 | unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, | |
80 | UNormalizationMode mode, int32_t options, | |
81 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
82 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
83 | if(options&UNORM_UNICODE_3_2) { | |
84 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); | |
85 | return unorm2_isNormalized( | |
86 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), | |
87 | src, srcLength, pErrorCode); | |
88 | } else { | |
89 | return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
b75a7d8f | 90 | } |
b75a7d8f A |
91 | } |
92 | ||
729e4ab9 | 93 | /* normalize() API ---------------------------------------------------------- */ |
b75a7d8f A |
94 | |
95 | /** Public API for normalizing. */ | |
96 | U_CAPI int32_t U_EXPORT2 | |
97 | unorm_normalize(const UChar *src, int32_t srcLength, | |
98 | UNormalizationMode mode, int32_t options, | |
99 | UChar *dest, int32_t destCapacity, | |
100 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
101 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
102 | if(options&UNORM_UNICODE_3_2) { | |
103 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); | |
104 | return unorm2_normalize( | |
105 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), | |
106 | src, srcLength, dest, destCapacity, pErrorCode); | |
107 | } else { | |
108 | return unorm2_normalize((const UNormalizer2 *)n2, | |
109 | src, srcLength, dest, destCapacity, pErrorCode); | |
b75a7d8f | 110 | } |
b75a7d8f A |
111 | } |
112 | ||
113 | ||
114 | /* iteration functions ------------------------------------------------------ */ | |
115 | ||
729e4ab9 | 116 | static int32_t |
4388f060 | 117 | _iterate(UCharIterator *src, UBool forward, |
729e4ab9 | 118 | UChar *dest, int32_t destCapacity, |
4388f060 | 119 | const Normalizer2 *n2, |
729e4ab9 A |
120 | UBool doNormalize, UBool *pNeededToNormalize, |
121 | UErrorCode *pErrorCode) { | |
729e4ab9 | 122 | if(U_FAILURE(*pErrorCode)) { |
b75a7d8f A |
123 | return 0; |
124 | } | |
4388f060 | 125 | if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) { |
b75a7d8f A |
126 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
127 | return 0; | |
128 | } | |
129 | ||
b75a7d8f A |
130 | if(pNeededToNormalize!=NULL) { |
131 | *pNeededToNormalize=FALSE; | |
132 | } | |
729e4ab9 A |
133 | if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { |
134 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); | |
b75a7d8f A |
135 | } |
136 | ||
729e4ab9 A |
137 | UnicodeString buffer; |
138 | UChar32 c; | |
139 | if(forward) { | |
140 | /* get one character and ignore its properties */ | |
141 | buffer.append(uiter_next32(src)); | |
142 | /* get all following characters until we see a boundary */ | |
143 | while((c=uiter_next32(src))>=0) { | |
144 | if(n2->hasBoundaryBefore(c)) { | |
145 | /* back out the latest movement to stop at the boundary */ | |
146 | src->move(src, -U16_LENGTH(c), UITER_CURRENT); | |
147 | break; | |
148 | } else { | |
149 | buffer.append(c); | |
b75a7d8f | 150 | } |
b75a7d8f A |
151 | } |
152 | } else { | |
729e4ab9 A |
153 | while((c=uiter_previous32(src))>=0) { |
154 | /* always write this character to the front of the buffer */ | |
155 | buffer.insert(0, c); | |
156 | /* stop if this just-copied character is a boundary */ | |
157 | if(n2->hasBoundaryBefore(c)) { | |
158 | break; | |
b75a7d8f | 159 | } |
b75a7d8f A |
160 | } |
161 | } | |
162 | ||
729e4ab9 A |
163 | UnicodeString destString(dest, 0, destCapacity); |
164 | if(buffer.length()>0 && doNormalize) { | |
165 | n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); | |
166 | if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { | |
167 | *pNeededToNormalize= destString!=buffer; | |
b75a7d8f | 168 | } |
729e4ab9 A |
169 | return destString.length(); |
170 | } else { | |
171 | /* just copy the source characters */ | |
172 | return buffer.extract(dest, destCapacity, *pErrorCode); | |
b75a7d8f | 173 | } |
729e4ab9 | 174 | } |
b75a7d8f | 175 | |
4388f060 A |
176 | static int32_t |
177 | unorm_iterate(UCharIterator *src, UBool forward, | |
178 | UChar *dest, int32_t destCapacity, | |
179 | UNormalizationMode mode, int32_t options, | |
180 | UBool doNormalize, UBool *pNeededToNormalize, | |
181 | UErrorCode *pErrorCode) { | |
182 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
183 | if(options&UNORM_UNICODE_3_2) { | |
184 | const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); | |
185 | if(U_FAILURE(*pErrorCode)) { | |
186 | return 0; | |
187 | } | |
188 | FilteredNormalizer2 fn2(*n2, *uni32); | |
189 | return _iterate(src, forward, dest, destCapacity, | |
190 | &fn2, doNormalize, pNeededToNormalize, pErrorCode); | |
191 | } | |
192 | return _iterate(src, forward, dest, destCapacity, | |
193 | n2, doNormalize, pNeededToNormalize, pErrorCode); | |
194 | } | |
195 | ||
729e4ab9 A |
196 | U_CAPI int32_t U_EXPORT2 |
197 | unorm_previous(UCharIterator *src, | |
198 | UChar *dest, int32_t destCapacity, | |
199 | UNormalizationMode mode, int32_t options, | |
200 | UBool doNormalize, UBool *pNeededToNormalize, | |
201 | UErrorCode *pErrorCode) { | |
202 | return unorm_iterate(src, FALSE, | |
203 | dest, destCapacity, | |
204 | mode, options, | |
205 | doNormalize, pNeededToNormalize, | |
206 | pErrorCode); | |
b75a7d8f A |
207 | } |
208 | ||
209 | U_CAPI int32_t U_EXPORT2 | |
210 | unorm_next(UCharIterator *src, | |
211 | UChar *dest, int32_t destCapacity, | |
212 | UNormalizationMode mode, int32_t options, | |
213 | UBool doNormalize, UBool *pNeededToNormalize, | |
214 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
215 | return unorm_iterate(src, TRUE, |
216 | dest, destCapacity, | |
217 | mode, options, | |
218 | doNormalize, pNeededToNormalize, | |
219 | pErrorCode); | |
b75a7d8f A |
220 | } |
221 | ||
b75a7d8f A |
222 | /* Concatenation of normalized strings -------------------------------------- */ |
223 | ||
4388f060 A |
224 | static int32_t |
225 | _concatenate(const UChar *left, int32_t leftLength, | |
b75a7d8f A |
226 | const UChar *right, int32_t rightLength, |
227 | UChar *dest, int32_t destCapacity, | |
4388f060 | 228 | const Normalizer2 *n2, |
b75a7d8f | 229 | UErrorCode *pErrorCode) { |
729e4ab9 | 230 | if(U_FAILURE(*pErrorCode)) { |
b75a7d8f A |
231 | return 0; |
232 | } | |
4388f060 A |
233 | if(destCapacity<0 || (dest==NULL && destCapacity>0) || |
234 | left==NULL || leftLength<-1 || right==NULL || rightLength<-1) { | |
b75a7d8f A |
235 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
236 | return 0; | |
237 | } | |
238 | ||
239 | /* check for overlapping right and destination */ | |
240 | if( dest!=NULL && | |
241 | ((right>=dest && right<(dest+destCapacity)) || | |
242 | (rightLength>0 && dest>=right && dest<(right+rightLength))) | |
243 | ) { | |
244 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
245 | return 0; | |
246 | } | |
247 | ||
248 | /* allow left==dest */ | |
729e4ab9 A |
249 | UnicodeString destString; |
250 | if(left==dest) { | |
251 | destString.setTo(dest, leftLength, destCapacity); | |
b75a7d8f | 252 | } else { |
729e4ab9 A |
253 | destString.setTo(dest, 0, destCapacity); |
254 | destString.append(left, leftLength); | |
b75a7d8f | 255 | } |
729e4ab9 A |
256 | return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). |
257 | extract(dest, destCapacity, *pErrorCode); | |
b75a7d8f A |
258 | } |
259 | ||
4388f060 A |
260 | U_CAPI int32_t U_EXPORT2 |
261 | unorm_concatenate(const UChar *left, int32_t leftLength, | |
262 | const UChar *right, int32_t rightLength, | |
263 | UChar *dest, int32_t destCapacity, | |
264 | UNormalizationMode mode, int32_t options, | |
265 | UErrorCode *pErrorCode) { | |
266 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
267 | if(options&UNORM_UNICODE_3_2) { | |
268 | const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); | |
269 | if(U_FAILURE(*pErrorCode)) { | |
270 | return 0; | |
271 | } | |
272 | FilteredNormalizer2 fn2(*n2, *uni32); | |
273 | return _concatenate(left, leftLength, right, rightLength, | |
274 | dest, destCapacity, &fn2, pErrorCode); | |
275 | } | |
276 | return _concatenate(left, leftLength, right, rightLength, | |
277 | dest, destCapacity, n2, pErrorCode); | |
278 | } | |
279 | ||
b75a7d8f | 280 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |