1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: normalizer2.cpp
12 * tab size: 8 (not used)
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_NORMALIZATION
23 #include "unicode/edits.h"
24 #include "unicode/normalizer2.h"
25 #include "unicode/stringoptions.h"
26 #include "unicode/unistr.h"
27 #include "unicode/unorm.h"
30 #include "norm2allmodes.h"
31 #include "normalizer2impl.h"
35 using icu::Normalizer2Impl
;
37 #if NORM2_HARDCODE_NFC_DATA
38 // NFC/NFD data machine-generated by gennorm2 --csource
39 #define INCLUDED_FROM_NORMALIZER2_CPP
40 #include "norm2_nfc_data.h"
45 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
47 Normalizer2::~Normalizer2() {}
50 Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src
, ByteSink
&sink
,
51 Edits
*edits
, UErrorCode
&errorCode
) const {
52 if (U_FAILURE(errorCode
)) {
55 if (edits
!= nullptr) {
56 errorCode
= U_UNSUPPORTED_ERROR
;
59 UnicodeString src16
= UnicodeString::fromUTF8(src
);
60 normalize(src16
, errorCode
).toUTF8(sink
);
64 Normalizer2::getRawDecomposition(UChar32
, UnicodeString
&) const {
69 Normalizer2::composePair(UChar32
, UChar32
) const {
74 Normalizer2::getCombiningClass(UChar32
/*c*/) const {
79 Normalizer2::isNormalizedUTF8(StringPiece s
, UErrorCode
&errorCode
) const {
80 return U_SUCCESS(errorCode
) && isNormalized(UnicodeString::fromUTF8(s
), errorCode
);
83 // Normalizer2 implementation for the old UNORM_NONE.
84 class NoopNormalizer2
: public Normalizer2
{
85 virtual ~NoopNormalizer2();
87 virtual UnicodeString
&
88 normalize(const UnicodeString
&src
,
90 UErrorCode
&errorCode
) const U_OVERRIDE
{
91 if(U_SUCCESS(errorCode
)) {
95 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
101 normalizeUTF8(uint32_t options
, StringPiece src
, ByteSink
&sink
,
102 Edits
*edits
, UErrorCode
&errorCode
) const U_OVERRIDE
{
103 if(U_SUCCESS(errorCode
)) {
104 if (edits
!= nullptr) {
105 if ((options
& U_EDITS_NO_RESET
) == 0) {
108 edits
->addUnchanged(src
.length());
110 if ((options
& U_OMIT_UNCHANGED_TEXT
) == 0) {
111 sink
.Append(src
.data(), src
.length());
117 virtual UnicodeString
&
118 normalizeSecondAndAppend(UnicodeString
&first
,
119 const UnicodeString
&second
,
120 UErrorCode
&errorCode
) const U_OVERRIDE
{
121 if(U_SUCCESS(errorCode
)) {
122 if(&first
!=&second
) {
123 first
.append(second
);
125 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
130 virtual UnicodeString
&
131 append(UnicodeString
&first
,
132 const UnicodeString
&second
,
133 UErrorCode
&errorCode
) const U_OVERRIDE
{
134 if(U_SUCCESS(errorCode
)) {
135 if(&first
!=&second
) {
136 first
.append(second
);
138 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
144 getDecomposition(UChar32
, UnicodeString
&) const U_OVERRIDE
{
147 // No need to U_OVERRIDE the default getRawDecomposition().
149 isNormalized(const UnicodeString
&, UErrorCode
&errorCode
) const U_OVERRIDE
{
150 return U_SUCCESS(errorCode
);
153 isNormalizedUTF8(StringPiece
, UErrorCode
&errorCode
) const U_OVERRIDE
{
154 return U_SUCCESS(errorCode
);
156 virtual UNormalizationCheckResult
157 quickCheck(const UnicodeString
&, UErrorCode
&) const U_OVERRIDE
{
161 spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&) const U_OVERRIDE
{
164 virtual UBool
hasBoundaryBefore(UChar32
) const U_OVERRIDE
{ return TRUE
; }
165 virtual UBool
hasBoundaryAfter(UChar32
) const U_OVERRIDE
{ return TRUE
; }
166 virtual UBool
isInert(UChar32
) const U_OVERRIDE
{ return TRUE
; }
169 NoopNormalizer2::~NoopNormalizer2() {}
171 Normalizer2WithImpl::~Normalizer2WithImpl() {}
173 DecomposeNormalizer2::~DecomposeNormalizer2() {}
175 ComposeNormalizer2::~ComposeNormalizer2() {}
177 FCDNormalizer2::~FCDNormalizer2() {}
179 // instance cache ---------------------------------------------------------- ***
182 static UBool U_CALLCONV
uprv_normalizer2_cleanup();
185 static Normalizer2
*noopSingleton
;
186 static icu::UInitOnce noopInitOnce
= U_INITONCE_INITIALIZER
;
188 static void U_CALLCONV
initNoopSingleton(UErrorCode
&errorCode
) {
189 if(U_FAILURE(errorCode
)) {
192 noopSingleton
=new NoopNormalizer2
;
193 if(noopSingleton
==NULL
) {
194 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
197 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2
, uprv_normalizer2_cleanup
);
200 const Normalizer2
*Normalizer2Factory::getNoopInstance(UErrorCode
&errorCode
) {
201 if(U_FAILURE(errorCode
)) { return NULL
; }
202 umtx_initOnce(noopInitOnce
, &initNoopSingleton
, errorCode
);
203 return noopSingleton
;
206 const Normalizer2Impl
*
207 Normalizer2Factory::getImpl(const Normalizer2
*norm2
) {
208 return &((Normalizer2WithImpl
*)norm2
)->impl
;
211 Norm2AllModes::~Norm2AllModes() {
216 Norm2AllModes::createInstance(Normalizer2Impl
*impl
, UErrorCode
&errorCode
) {
217 if(U_FAILURE(errorCode
)) {
221 Norm2AllModes
*allModes
=new Norm2AllModes(impl
);
223 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
230 #if NORM2_HARDCODE_NFC_DATA
232 Norm2AllModes::createNFCInstance(UErrorCode
&errorCode
) {
233 if(U_FAILURE(errorCode
)) {
236 Normalizer2Impl
*impl
=new Normalizer2Impl
;
238 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
241 impl
->init(norm2_nfc_data_indexes
, &norm2_nfc_data_trie
,
242 norm2_nfc_data_extraData
, norm2_nfc_data_smallFCD
);
243 return createInstance(impl
, errorCode
);
246 static Norm2AllModes
*nfcSingleton
;
248 static icu::UInitOnce nfcInitOnce
= U_INITONCE_INITIALIZER
;
250 static void U_CALLCONV
initNFCSingleton(UErrorCode
&errorCode
) {
251 nfcSingleton
=Norm2AllModes::createNFCInstance(errorCode
);
252 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2
, uprv_normalizer2_cleanup
);
255 const Norm2AllModes
*
256 Norm2AllModes::getNFCInstance(UErrorCode
&errorCode
) {
257 if(U_FAILURE(errorCode
)) { return NULL
; }
258 umtx_initOnce(nfcInitOnce
, &initNFCSingleton
, errorCode
);
263 Normalizer2::getNFCInstance(UErrorCode
&errorCode
) {
264 const Norm2AllModes
*allModes
=Norm2AllModes::getNFCInstance(errorCode
);
265 return allModes
!=NULL
? &allModes
->comp
: NULL
;
269 Normalizer2::getNFDInstance(UErrorCode
&errorCode
) {
270 const Norm2AllModes
*allModes
=Norm2AllModes::getNFCInstance(errorCode
);
271 return allModes
!=NULL
? &allModes
->decomp
: NULL
;
274 const Normalizer2
*Normalizer2Factory::getFCDInstance(UErrorCode
&errorCode
) {
275 const Norm2AllModes
*allModes
=Norm2AllModes::getNFCInstance(errorCode
);
276 return allModes
!=NULL
? &allModes
->fcd
: NULL
;
279 const Normalizer2
*Normalizer2Factory::getFCCInstance(UErrorCode
&errorCode
) {
280 const Norm2AllModes
*allModes
=Norm2AllModes::getNFCInstance(errorCode
);
281 return allModes
!=NULL
? &allModes
->fcc
: NULL
;
284 const Normalizer2Impl
*
285 Normalizer2Factory::getNFCImpl(UErrorCode
&errorCode
) {
286 const Norm2AllModes
*allModes
=Norm2AllModes::getNFCInstance(errorCode
);
287 return allModes
!=NULL
? allModes
->impl
: NULL
;
289 #endif // NORM2_HARDCODE_NFC_DATA
293 static UBool U_CALLCONV
uprv_normalizer2_cleanup() {
294 delete noopSingleton
;
295 noopSingleton
= NULL
;
296 noopInitOnce
.reset();
297 #if NORM2_HARDCODE_NFC_DATA
309 // C API ------------------------------------------------------------------- ***
313 U_CAPI
const UNormalizer2
* U_EXPORT2
314 unorm2_getNFCInstance(UErrorCode
*pErrorCode
) {
315 return (const UNormalizer2
*)Normalizer2::getNFCInstance(*pErrorCode
);
318 U_CAPI
const UNormalizer2
* U_EXPORT2
319 unorm2_getNFDInstance(UErrorCode
*pErrorCode
) {
320 return (const UNormalizer2
*)Normalizer2::getNFDInstance(*pErrorCode
);
323 U_CAPI
void U_EXPORT2
324 unorm2_close(UNormalizer2
*norm2
) {
325 delete (Normalizer2
*)norm2
;
328 U_CAPI
int32_t U_EXPORT2
329 unorm2_normalize(const UNormalizer2
*norm2
,
330 const UChar
*src
, int32_t length
,
331 UChar
*dest
, int32_t capacity
,
332 UErrorCode
*pErrorCode
) {
333 if(U_FAILURE(*pErrorCode
)) {
336 if( (src
==NULL
? length
!=0 : length
<-1) ||
337 (dest
==NULL
? capacity
!=0 : capacity
<0) ||
338 (src
==dest
&& src
!=NULL
)
340 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
343 UnicodeString
destString(dest
, 0, capacity
);
344 // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
346 const Normalizer2
*n2
=(const Normalizer2
*)norm2
;
347 const Normalizer2WithImpl
*n2wi
=dynamic_cast<const Normalizer2WithImpl
*>(n2
);
349 // Avoid duplicate argument checking and support NUL-terminated src.
350 ReorderingBuffer
buffer(n2wi
->impl
, destString
);
351 if(buffer
.init(length
, *pErrorCode
)) {
352 n2wi
->normalize(src
, length
>=0 ? src
+length
: NULL
, buffer
, *pErrorCode
);
355 UnicodeString
srcString(length
<0, src
, length
);
356 n2
->normalize(srcString
, destString
, *pErrorCode
);
359 return destString
.extract(dest
, capacity
, *pErrorCode
);
363 normalizeSecondAndAppend(const UNormalizer2
*norm2
,
364 UChar
*first
, int32_t firstLength
, int32_t firstCapacity
,
365 const UChar
*second
, int32_t secondLength
,
367 UErrorCode
*pErrorCode
) {
368 if(U_FAILURE(*pErrorCode
)) {
371 if( (second
==NULL
? secondLength
!=0 : secondLength
<-1) ||
372 (first
==NULL
? (firstCapacity
!=0 || firstLength
!=0) :
373 (firstCapacity
<0 || firstLength
<-1)) ||
374 (first
==second
&& first
!=NULL
)
376 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
379 UnicodeString
firstString(first
, firstLength
, firstCapacity
);
380 firstLength
=firstString
.length(); // In case it was -1.
381 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
382 if(secondLength
!=0) {
383 const Normalizer2
*n2
=(const Normalizer2
*)norm2
;
384 const Normalizer2WithImpl
*n2wi
=dynamic_cast<const Normalizer2WithImpl
*>(n2
);
386 // Avoid duplicate argument checking and support NUL-terminated src.
387 UnicodeString safeMiddle
;
389 ReorderingBuffer
buffer(n2wi
->impl
, firstString
);
390 if(buffer
.init(firstLength
+secondLength
+1, *pErrorCode
)) { // destCapacity>=-1
391 n2wi
->normalizeAndAppend(second
, secondLength
>=0 ? second
+secondLength
: NULL
,
392 doNormalize
, safeMiddle
, buffer
, *pErrorCode
);
394 } // The ReorderingBuffer destructor finalizes firstString.
395 if(U_FAILURE(*pErrorCode
) || firstString
.length()>firstCapacity
) {
396 // Restore the modified suffix of the first string.
397 // This does not restore first[] array contents between firstLength and firstCapacity.
398 // (That might be uninitialized memory, as far as we know.)
399 if(first
!=NULL
) { /* don't dereference NULL */
400 safeMiddle
.extract(0, 0x7fffffff, first
+firstLength
-safeMiddle
.length());
401 if(firstLength
<firstCapacity
) {
402 first
[firstLength
]=0; // NUL-terminate in case it was originally.
407 UnicodeString
secondString(secondLength
<0, second
, secondLength
);
409 n2
->normalizeSecondAndAppend(firstString
, secondString
, *pErrorCode
);
411 n2
->append(firstString
, secondString
, *pErrorCode
);
415 return firstString
.extract(first
, firstCapacity
, *pErrorCode
);
418 U_CAPI
int32_t U_EXPORT2
419 unorm2_normalizeSecondAndAppend(const UNormalizer2
*norm2
,
420 UChar
*first
, int32_t firstLength
, int32_t firstCapacity
,
421 const UChar
*second
, int32_t secondLength
,
422 UErrorCode
*pErrorCode
) {
423 return normalizeSecondAndAppend(norm2
,
424 first
, firstLength
, firstCapacity
,
425 second
, secondLength
,
429 U_CAPI
int32_t U_EXPORT2
430 unorm2_append(const UNormalizer2
*norm2
,
431 UChar
*first
, int32_t firstLength
, int32_t firstCapacity
,
432 const UChar
*second
, int32_t secondLength
,
433 UErrorCode
*pErrorCode
) {
434 return normalizeSecondAndAppend(norm2
,
435 first
, firstLength
, firstCapacity
,
436 second
, secondLength
,
440 U_CAPI
int32_t U_EXPORT2
441 unorm2_getDecomposition(const UNormalizer2
*norm2
,
442 UChar32 c
, UChar
*decomposition
, int32_t capacity
,
443 UErrorCode
*pErrorCode
) {
444 if(U_FAILURE(*pErrorCode
)) {
447 if(decomposition
==NULL
? capacity
!=0 : capacity
<0) {
448 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
451 UnicodeString
destString(decomposition
, 0, capacity
);
452 if(reinterpret_cast<const Normalizer2
*>(norm2
)->getDecomposition(c
, destString
)) {
453 return destString
.extract(decomposition
, capacity
, *pErrorCode
);
459 U_CAPI
int32_t U_EXPORT2
460 unorm2_getRawDecomposition(const UNormalizer2
*norm2
,
461 UChar32 c
, UChar
*decomposition
, int32_t capacity
,
462 UErrorCode
*pErrorCode
) {
463 if(U_FAILURE(*pErrorCode
)) {
466 if(decomposition
==NULL
? capacity
!=0 : capacity
<0) {
467 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
470 UnicodeString
destString(decomposition
, 0, capacity
);
471 if(reinterpret_cast<const Normalizer2
*>(norm2
)->getRawDecomposition(c
, destString
)) {
472 return destString
.extract(decomposition
, capacity
, *pErrorCode
);
478 U_CAPI UChar32 U_EXPORT2
479 unorm2_composePair(const UNormalizer2
*norm2
, UChar32 a
, UChar32 b
) {
480 return reinterpret_cast<const Normalizer2
*>(norm2
)->composePair(a
, b
);
483 U_CAPI
uint8_t U_EXPORT2
484 unorm2_getCombiningClass(const UNormalizer2
*norm2
, UChar32 c
) {
485 return reinterpret_cast<const Normalizer2
*>(norm2
)->getCombiningClass(c
);
488 U_CAPI UBool U_EXPORT2
489 unorm2_isNormalized(const UNormalizer2
*norm2
,
490 const UChar
*s
, int32_t length
,
491 UErrorCode
*pErrorCode
) {
492 if(U_FAILURE(*pErrorCode
)) {
495 if((s
==NULL
&& length
!=0) || length
<-1) {
496 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
499 UnicodeString
sString(length
<0, s
, length
);
500 return ((const Normalizer2
*)norm2
)->isNormalized(sString
, *pErrorCode
);
503 U_CAPI UNormalizationCheckResult U_EXPORT2
504 unorm2_quickCheck(const UNormalizer2
*norm2
,
505 const UChar
*s
, int32_t length
,
506 UErrorCode
*pErrorCode
) {
507 if(U_FAILURE(*pErrorCode
)) {
510 if((s
==NULL
&& length
!=0) || length
<-1) {
511 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
514 UnicodeString
sString(length
<0, s
, length
);
515 return ((const Normalizer2
*)norm2
)->quickCheck(sString
, *pErrorCode
);
518 U_CAPI
int32_t U_EXPORT2
519 unorm2_spanQuickCheckYes(const UNormalizer2
*norm2
,
520 const UChar
*s
, int32_t length
,
521 UErrorCode
*pErrorCode
) {
522 if(U_FAILURE(*pErrorCode
)) {
525 if((s
==NULL
&& length
!=0) || length
<-1) {
526 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
529 UnicodeString
sString(length
<0, s
, length
);
530 return ((const Normalizer2
*)norm2
)->spanQuickCheckYes(sString
, *pErrorCode
);
533 U_CAPI UBool U_EXPORT2
534 unorm2_hasBoundaryBefore(const UNormalizer2
*norm2
, UChar32 c
) {
535 return ((const Normalizer2
*)norm2
)->hasBoundaryBefore(c
);
538 U_CAPI UBool U_EXPORT2
539 unorm2_hasBoundaryAfter(const UNormalizer2
*norm2
, UChar32 c
) {
540 return ((const Normalizer2
*)norm2
)->hasBoundaryAfter(c
);
543 U_CAPI UBool U_EXPORT2
544 unorm2_isInert(const UNormalizer2
*norm2
, UChar32 c
) {
545 return ((const Normalizer2
*)norm2
)->isInert(c
);
548 // Some properties APIs ---------------------------------------------------- ***
550 U_CAPI
uint8_t U_EXPORT2
551 u_getCombiningClass(UChar32 c
) {
552 UErrorCode errorCode
=U_ZERO_ERROR
;
553 const Normalizer2
*nfd
=Normalizer2::getNFDInstance(errorCode
);
554 if(U_SUCCESS(errorCode
)) {
555 return nfd
->getCombiningClass(c
);
562 unorm_getFCD16(UChar32 c
) {
563 UErrorCode errorCode
=U_ZERO_ERROR
;
564 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFCImpl(errorCode
);
565 if(U_SUCCESS(errorCode
)) {
566 return impl
->getFCD16(c
);
572 #endif // !UCONFIG_NO_NORMALIZATION