2 ******************************************************************************
3 * Copyright (c) 1996-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
8 * Created by: Vladimir Weinstein 12052000
10 * Modification history :
12 * Date Name Description
13 * 02/01/01 synwee Added normalization quickcheck enum and method.
14 * 02/12/01 synwee Commented out quickcheck util api has been approved
15 * Added private method for doing FCD checks
16 * 02/23/01 synwee Modified quickcheck and checkFCE to run through
17 * string for codepoints < 0x300 for the normalization
19 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here
20 * instead of just wrappers around normlzr.cpp,
21 * load unorm.dat, support Unicode 3.1 with
22 * supplementary code points, etc.
23 * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code
26 #include "unicode/utypes.h"
28 #if !UCONFIG_NO_NORMALIZATION
30 #include "unicode/udata.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/unorm.h"
34 #include "unicode/unorm2.h"
35 #include "normalizer2impl.h"
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot be torn apart
 * by a neighbouring operator that binds tighter than a cast (e.g. sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
44 /* quick check functions ---------------------------------------------------- */
46 U_CAPI UNormalizationCheckResult U_EXPORT2
47 unorm_quickCheck(const UChar
*src
,
49 UNormalizationMode mode
,
50 UErrorCode
*pErrorCode
) {
51 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, *pErrorCode
);
52 return unorm2_quickCheck((const UNormalizer2
*)n2
, src
, srcLength
, pErrorCode
);
55 U_CAPI UNormalizationCheckResult U_EXPORT2
56 unorm_quickCheckWithOptions(const UChar
*src
, int32_t srcLength
,
57 UNormalizationMode mode
, int32_t options
,
58 UErrorCode
*pErrorCode
) {
59 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, *pErrorCode
);
60 if(options
&UNORM_UNICODE_3_2
) {
61 FilteredNormalizer2
fn2(*n2
, *uniset_getUnicode32Instance(*pErrorCode
));
62 return unorm2_quickCheck(
63 reinterpret_cast<const UNormalizer2
*>(static_cast<Normalizer2
*>(&fn2
)),
64 src
, srcLength
, pErrorCode
);
66 return unorm2_quickCheck((const UNormalizer2
*)n2
, src
, srcLength
, pErrorCode
);
70 U_CAPI UBool U_EXPORT2
71 unorm_isNormalized(const UChar
*src
, int32_t srcLength
,
72 UNormalizationMode mode
,
73 UErrorCode
*pErrorCode
) {
74 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, *pErrorCode
);
75 return unorm2_isNormalized((const UNormalizer2
*)n2
, src
, srcLength
, pErrorCode
);
78 U_CAPI UBool U_EXPORT2
79 unorm_isNormalizedWithOptions(const UChar
*src
, int32_t srcLength
,
80 UNormalizationMode mode
, int32_t options
,
81 UErrorCode
*pErrorCode
) {
82 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, *pErrorCode
);
83 if(options
&UNORM_UNICODE_3_2
) {
84 FilteredNormalizer2
fn2(*n2
, *uniset_getUnicode32Instance(*pErrorCode
));
85 return unorm2_isNormalized(
86 reinterpret_cast<const UNormalizer2
*>(static_cast<Normalizer2
*>(&fn2
)),
87 src
, srcLength
, pErrorCode
);
89 return unorm2_isNormalized((const UNormalizer2
*)n2
, src
, srcLength
, pErrorCode
);
93 /* normalize() API ---------------------------------------------------------- */
95 /** Public API for normalizing. */
96 U_CAPI
int32_t U_EXPORT2
97 unorm_normalize(const UChar
*src
, int32_t srcLength
,
98 UNormalizationMode mode
, int32_t options
,
99 UChar
*dest
, int32_t destCapacity
,
100 UErrorCode
*pErrorCode
) {
101 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, *pErrorCode
);
102 if(options
&UNORM_UNICODE_3_2
) {
103 FilteredNormalizer2
fn2(*n2
, *uniset_getUnicode32Instance(*pErrorCode
));
104 return unorm2_normalize(
105 reinterpret_cast<const UNormalizer2
*>(static_cast<Normalizer2
*>(&fn2
)),
106 src
, srcLength
, dest
, destCapacity
, pErrorCode
);
108 return unorm2_normalize((const UNormalizer2
*)n2
,
109 src
, srcLength
, dest
, destCapacity
, pErrorCode
);
114 /* iteration functions ------------------------------------------------------ */
117 unorm_iterate(UCharIterator
*src
, UBool forward
,
118 UChar
*dest
, int32_t destCapacity
,
119 UNormalizationMode mode
, int32_t options
,
120 UBool doNormalize
, UBool
*pNeededToNormalize
,
121 UErrorCode
*pErrorCode
) {
122 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, *pErrorCode
);
123 const UnicodeSet
*uni32
;
124 if(options
&UNORM_UNICODE_3_2
) {
125 uni32
=uniset_getUnicode32Instance(*pErrorCode
);
127 uni32
=NULL
; // unused
129 FilteredNormalizer2
fn2(*n2
, *uni32
);
130 if(options
&UNORM_UNICODE_3_2
) {
133 if(U_FAILURE(*pErrorCode
)) {
136 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
139 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
143 if(pNeededToNormalize
!=NULL
) {
144 *pNeededToNormalize
=FALSE
;
146 if(!(forward
? src
->hasNext(src
) : src
->hasPrevious(src
))) {
147 return u_terminateUChars(dest
, destCapacity
, 0, pErrorCode
);
150 UnicodeString buffer
;
153 /* get one character and ignore its properties */
154 buffer
.append(uiter_next32(src
));
155 /* get all following characters until we see a boundary */
156 while((c
=uiter_next32(src
))>=0) {
157 if(n2
->hasBoundaryBefore(c
)) {
158 /* back out the latest movement to stop at the boundary */
159 src
->move(src
, -U16_LENGTH(c
), UITER_CURRENT
);
166 while((c
=uiter_previous32(src
))>=0) {
167 /* always write this character to the front of the buffer */
169 /* stop if this just-copied character is a boundary */
170 if(n2
->hasBoundaryBefore(c
)) {
176 UnicodeString
destString(dest
, 0, destCapacity
);
177 if(buffer
.length()>0 && doNormalize
) {
178 n2
->normalize(buffer
, destString
, *pErrorCode
).extract(dest
, destCapacity
, *pErrorCode
);
179 if(pNeededToNormalize
!=NULL
&& U_SUCCESS(*pErrorCode
)) {
180 *pNeededToNormalize
= destString
!=buffer
;
182 return destString
.length();
184 /* just copy the source characters */
185 return buffer
.extract(dest
, destCapacity
, *pErrorCode
);
189 U_CAPI
int32_t U_EXPORT2
190 unorm_previous(UCharIterator
*src
,
191 UChar
*dest
, int32_t destCapacity
,
192 UNormalizationMode mode
, int32_t options
,
193 UBool doNormalize
, UBool
*pNeededToNormalize
,
194 UErrorCode
*pErrorCode
) {
195 return unorm_iterate(src
, FALSE
,
198 doNormalize
, pNeededToNormalize
,
202 U_CAPI
int32_t U_EXPORT2
203 unorm_next(UCharIterator
*src
,
204 UChar
*dest
, int32_t destCapacity
,
205 UNormalizationMode mode
, int32_t options
,
206 UBool doNormalize
, UBool
*pNeededToNormalize
,
207 UErrorCode
*pErrorCode
) {
208 return unorm_iterate(src
, TRUE
,
211 doNormalize
, pNeededToNormalize
,
215 /* Concatenation of normalized strings -------------------------------------- */
217 U_CAPI
int32_t U_EXPORT2
218 unorm_concatenate(const UChar
*left
, int32_t leftLength
,
219 const UChar
*right
, int32_t rightLength
,
220 UChar
*dest
, int32_t destCapacity
,
221 UNormalizationMode mode
, int32_t options
,
222 UErrorCode
*pErrorCode
) {
223 const Normalizer2
*n2
=Normalizer2Factory::getInstance(mode
, *pErrorCode
);
224 const UnicodeSet
*uni32
;
225 if(options
&UNORM_UNICODE_3_2
) {
226 uni32
=uniset_getUnicode32Instance(*pErrorCode
);
228 uni32
=NULL
; // unused
230 FilteredNormalizer2
fn2(*n2
, *uni32
);
231 if(options
&UNORM_UNICODE_3_2
) {
234 if(U_FAILURE(*pErrorCode
)) {
237 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
238 left
==NULL
|| leftLength
<-1 ||
239 right
==NULL
|| rightLength
<-1
241 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
245 /* check for overlapping right and destination */
247 ((right
>=dest
&& right
<(dest
+destCapacity
)) ||
248 (rightLength
>0 && dest
>=right
&& dest
<(right
+rightLength
)))
250 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
254 /* allow left==dest */
255 UnicodeString destString
;
257 destString
.setTo(dest
, leftLength
, destCapacity
);
259 destString
.setTo(dest
, 0, destCapacity
);
260 destString
.append(left
, leftLength
);
262 return n2
->append(destString
, UnicodeString(rightLength
<0, right
, rightLength
), *pErrorCode
).
263 extract(dest
, destCapacity
, *pErrorCode
);
266 #endif /* #if !UCONFIG_NO_NORMALIZATION */