]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
b331163b | 3 | * Copyright (c) 1996-2014, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ****************************************************************************** | |
6 | * File unorm.cpp | |
7 | * | |
8 | * Created by: Vladimir Weinstein 12052000 | |
9 | * | |
10 | * Modification history : | |
11 | * | |
12 | * Date Name Description | |
13 | * 02/01/01 synwee Added normalization quickcheck enum and method. | |
14 | * 02/12/01 synwee Commented out quickcheck until api has been approved | |
15 | * Added private method for doing FCD checks | |
16 | * 02/23/01 synwee Modified quickcheck and checkFCD to run through | |
17 | * string for codepoints < 0x300 for the normalization | |
18 | * mode NFC. | |
19 | * 05/25/01+ Markus Scherer total rewrite, implement all normalization here | |
20 | * instead of just wrappers around normlzr.cpp, | |
21 | * load unorm.dat, support Unicode 3.1 with | |
22 | * supplementary code points, etc. | |
729e4ab9 | 23 | * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code |
b75a7d8f A |
24 | */ |
25 | ||
26 | #include "unicode/utypes.h" | |
27 | ||
b75a7d8f A |
28 | #if !UCONFIG_NO_NORMALIZATION |
29 | ||
30 | #include "unicode/udata.h" | |
374ca955 | 31 | #include "unicode/ustring.h" |
b75a7d8f | 32 | #include "unicode/uiter.h" |
b75a7d8f | 33 | #include "unicode/unorm.h" |
729e4ab9 A |
34 | #include "unicode/unorm2.h" |
35 | #include "normalizer2impl.h" | |
374ca955 | 36 | #include "unormimp.h" |
729e4ab9 A |
37 | #include "uprops.h" |
38 | #include "ustr_imp.h" | |
b75a7d8f | 39 | |
729e4ab9 | 40 | U_NAMESPACE_USE |
b75a7d8f | 41 | |
729e4ab9 | 42 | /* quick check functions ---------------------------------------------------- */ |
374ca955 A |
43 | |
44 | U_CAPI UNormalizationCheckResult U_EXPORT2 | |
45 | unorm_quickCheck(const UChar *src, | |
46 | int32_t srcLength, | |
47 | UNormalizationMode mode, | |
48 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
49 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
50 | return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
b75a7d8f A |
51 | } |
52 | ||
374ca955 A |
53 | U_CAPI UNormalizationCheckResult U_EXPORT2 |
54 | unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, | |
55 | UNormalizationMode mode, int32_t options, | |
56 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
57 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
58 | if(options&UNORM_UNICODE_3_2) { | |
59 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); | |
60 | return unorm2_quickCheck( | |
61 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), | |
62 | src, srcLength, pErrorCode); | |
63 | } else { | |
64 | return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
65 | } | |
374ca955 | 66 | } |
b75a7d8f | 67 | |
374ca955 A |
68 | U_CAPI UBool U_EXPORT2 |
69 | unorm_isNormalized(const UChar *src, int32_t srcLength, | |
70 | UNormalizationMode mode, | |
71 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
72 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
73 | return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
374ca955 | 74 | } |
b75a7d8f | 75 | |
374ca955 A |
76 | U_CAPI UBool U_EXPORT2 |
77 | unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength, | |
78 | UNormalizationMode mode, int32_t options, | |
79 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
80 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
81 | if(options&UNORM_UNICODE_3_2) { | |
82 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); | |
83 | return unorm2_isNormalized( | |
84 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), | |
85 | src, srcLength, pErrorCode); | |
86 | } else { | |
87 | return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode); | |
b75a7d8f | 88 | } |
b75a7d8f A |
89 | } |
90 | ||
729e4ab9 | 91 | /* normalize() API ---------------------------------------------------------- */ |
b75a7d8f A |
92 | |
93 | /** Public API for normalizing. */ | |
94 | U_CAPI int32_t U_EXPORT2 | |
95 | unorm_normalize(const UChar *src, int32_t srcLength, | |
96 | UNormalizationMode mode, int32_t options, | |
97 | UChar *dest, int32_t destCapacity, | |
98 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
99 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); |
100 | if(options&UNORM_UNICODE_3_2) { | |
101 | FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode)); | |
102 | return unorm2_normalize( | |
103 | reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)), | |
104 | src, srcLength, dest, destCapacity, pErrorCode); | |
105 | } else { | |
106 | return unorm2_normalize((const UNormalizer2 *)n2, | |
107 | src, srcLength, dest, destCapacity, pErrorCode); | |
b75a7d8f | 108 | } |
b75a7d8f A |
109 | } |
110 | ||
111 | ||
112 | /* iteration functions ------------------------------------------------------ */ | |
113 | ||
729e4ab9 | 114 | static int32_t |
4388f060 | 115 | _iterate(UCharIterator *src, UBool forward, |
729e4ab9 | 116 | UChar *dest, int32_t destCapacity, |
4388f060 | 117 | const Normalizer2 *n2, |
729e4ab9 A |
118 | UBool doNormalize, UBool *pNeededToNormalize, |
119 | UErrorCode *pErrorCode) { | |
729e4ab9 | 120 | if(U_FAILURE(*pErrorCode)) { |
b75a7d8f A |
121 | return 0; |
122 | } | |
4388f060 | 123 | if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) { |
b75a7d8f A |
124 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
125 | return 0; | |
126 | } | |
127 | ||
b75a7d8f A |
128 | if(pNeededToNormalize!=NULL) { |
129 | *pNeededToNormalize=FALSE; | |
130 | } | |
729e4ab9 A |
131 | if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) { |
132 | return u_terminateUChars(dest, destCapacity, 0, pErrorCode); | |
b75a7d8f A |
133 | } |
134 | ||
729e4ab9 A |
135 | UnicodeString buffer; |
136 | UChar32 c; | |
137 | if(forward) { | |
138 | /* get one character and ignore its properties */ | |
139 | buffer.append(uiter_next32(src)); | |
140 | /* get all following characters until we see a boundary */ | |
141 | while((c=uiter_next32(src))>=0) { | |
142 | if(n2->hasBoundaryBefore(c)) { | |
143 | /* back out the latest movement to stop at the boundary */ | |
144 | src->move(src, -U16_LENGTH(c), UITER_CURRENT); | |
145 | break; | |
146 | } else { | |
147 | buffer.append(c); | |
b75a7d8f | 148 | } |
b75a7d8f A |
149 | } |
150 | } else { | |
729e4ab9 A |
151 | while((c=uiter_previous32(src))>=0) { |
152 | /* always write this character to the front of the buffer */ | |
153 | buffer.insert(0, c); | |
154 | /* stop if this just-copied character is a boundary */ | |
155 | if(n2->hasBoundaryBefore(c)) { | |
156 | break; | |
b75a7d8f | 157 | } |
b75a7d8f A |
158 | } |
159 | } | |
160 | ||
729e4ab9 A |
161 | UnicodeString destString(dest, 0, destCapacity); |
162 | if(buffer.length()>0 && doNormalize) { | |
163 | n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode); | |
164 | if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) { | |
165 | *pNeededToNormalize= destString!=buffer; | |
b75a7d8f | 166 | } |
729e4ab9 A |
167 | return destString.length(); |
168 | } else { | |
169 | /* just copy the source characters */ | |
170 | return buffer.extract(dest, destCapacity, *pErrorCode); | |
b75a7d8f | 171 | } |
729e4ab9 | 172 | } |
b75a7d8f | 173 | |
4388f060 A |
174 | static int32_t |
175 | unorm_iterate(UCharIterator *src, UBool forward, | |
176 | UChar *dest, int32_t destCapacity, | |
177 | UNormalizationMode mode, int32_t options, | |
178 | UBool doNormalize, UBool *pNeededToNormalize, | |
179 | UErrorCode *pErrorCode) { | |
180 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
181 | if(options&UNORM_UNICODE_3_2) { | |
182 | const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); | |
183 | if(U_FAILURE(*pErrorCode)) { | |
184 | return 0; | |
185 | } | |
186 | FilteredNormalizer2 fn2(*n2, *uni32); | |
187 | return _iterate(src, forward, dest, destCapacity, | |
188 | &fn2, doNormalize, pNeededToNormalize, pErrorCode); | |
189 | } | |
190 | return _iterate(src, forward, dest, destCapacity, | |
191 | n2, doNormalize, pNeededToNormalize, pErrorCode); | |
192 | } | |
193 | ||
729e4ab9 A |
194 | U_CAPI int32_t U_EXPORT2 |
195 | unorm_previous(UCharIterator *src, | |
196 | UChar *dest, int32_t destCapacity, | |
197 | UNormalizationMode mode, int32_t options, | |
198 | UBool doNormalize, UBool *pNeededToNormalize, | |
199 | UErrorCode *pErrorCode) { | |
200 | return unorm_iterate(src, FALSE, | |
201 | dest, destCapacity, | |
202 | mode, options, | |
203 | doNormalize, pNeededToNormalize, | |
204 | pErrorCode); | |
b75a7d8f A |
205 | } |
206 | ||
207 | U_CAPI int32_t U_EXPORT2 | |
208 | unorm_next(UCharIterator *src, | |
209 | UChar *dest, int32_t destCapacity, | |
210 | UNormalizationMode mode, int32_t options, | |
211 | UBool doNormalize, UBool *pNeededToNormalize, | |
212 | UErrorCode *pErrorCode) { | |
729e4ab9 A |
213 | return unorm_iterate(src, TRUE, |
214 | dest, destCapacity, | |
215 | mode, options, | |
216 | doNormalize, pNeededToNormalize, | |
217 | pErrorCode); | |
b75a7d8f A |
218 | } |
219 | ||
b75a7d8f A |
220 | /* Concatenation of normalized strings -------------------------------------- */ |
221 | ||
4388f060 A |
222 | static int32_t |
223 | _concatenate(const UChar *left, int32_t leftLength, | |
b75a7d8f A |
224 | const UChar *right, int32_t rightLength, |
225 | UChar *dest, int32_t destCapacity, | |
4388f060 | 226 | const Normalizer2 *n2, |
b75a7d8f | 227 | UErrorCode *pErrorCode) { |
729e4ab9 | 228 | if(U_FAILURE(*pErrorCode)) { |
b75a7d8f A |
229 | return 0; |
230 | } | |
4388f060 A |
231 | if(destCapacity<0 || (dest==NULL && destCapacity>0) || |
232 | left==NULL || leftLength<-1 || right==NULL || rightLength<-1) { | |
b75a7d8f A |
233 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
234 | return 0; | |
235 | } | |
236 | ||
237 | /* check for overlapping right and destination */ | |
238 | if( dest!=NULL && | |
239 | ((right>=dest && right<(dest+destCapacity)) || | |
240 | (rightLength>0 && dest>=right && dest<(right+rightLength))) | |
241 | ) { | |
242 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
243 | return 0; | |
244 | } | |
245 | ||
246 | /* allow left==dest */ | |
729e4ab9 A |
247 | UnicodeString destString; |
248 | if(left==dest) { | |
249 | destString.setTo(dest, leftLength, destCapacity); | |
b75a7d8f | 250 | } else { |
729e4ab9 A |
251 | destString.setTo(dest, 0, destCapacity); |
252 | destString.append(left, leftLength); | |
b75a7d8f | 253 | } |
729e4ab9 A |
254 | return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode). |
255 | extract(dest, destCapacity, *pErrorCode); | |
b75a7d8f A |
256 | } |
257 | ||
4388f060 A |
258 | U_CAPI int32_t U_EXPORT2 |
259 | unorm_concatenate(const UChar *left, int32_t leftLength, | |
260 | const UChar *right, int32_t rightLength, | |
261 | UChar *dest, int32_t destCapacity, | |
262 | UNormalizationMode mode, int32_t options, | |
263 | UErrorCode *pErrorCode) { | |
264 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode); | |
265 | if(options&UNORM_UNICODE_3_2) { | |
266 | const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode); | |
267 | if(U_FAILURE(*pErrorCode)) { | |
268 | return 0; | |
269 | } | |
270 | FilteredNormalizer2 fn2(*n2, *uni32); | |
271 | return _concatenate(left, leftLength, right, rightLength, | |
272 | dest, destCapacity, &fn2, pErrorCode); | |
273 | } | |
274 | return _concatenate(left, leftLength, right, rightLength, | |
275 | dest, destCapacity, n2, pErrorCode); | |
276 | } | |
277 | ||
b75a7d8f | 278 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |