// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 2009-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  normalizer2.cpp
*   tab size:   8 (not used)
*   created on: 2009nov22
*   created by: Markus W. Scherer
*/
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_NORMALIZATION
23 #include "unicode/edits.h"
24 #include "unicode/normalizer2.h"
25 #include "unicode/stringoptions.h"
26 #include "unicode/unistr.h"
27 #include "unicode/unorm.h"
30 #include "norm2allmodes.h"
31 #include "normalizer2impl.h"
35 using icu::Normalizer2Impl
;
37 // NFC/NFD data machine-generated by gennorm2 --csource
38 #define INCLUDED_FROM_NORMALIZER2_CPP
39 #include "norm2_nfc_data.h"
43 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
45 Normalizer2::~Normalizer2() {}
48 Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src
, ByteSink
&sink
,
49 Edits
*edits
, UErrorCode
&errorCode
) const {
50 if (U_FAILURE(errorCode
)) {
53 if (edits
!= nullptr) {
54 errorCode
= U_UNSUPPORTED_ERROR
;
57 UnicodeString src16
= UnicodeString::fromUTF8(src
);
58 normalize(src16
, errorCode
).toUTF8(sink
);
62 Normalizer2::getRawDecomposition(UChar32
, UnicodeString
&) const {
67 Normalizer2::composePair(UChar32
, UChar32
) const {
72 Normalizer2::getCombiningClass(UChar32
/*c*/) const {
77 Normalizer2::isNormalizedUTF8(StringPiece s
, UErrorCode
&errorCode
) const {
78 return U_SUCCESS(errorCode
) && isNormalized(UnicodeString::fromUTF8(s
), errorCode
);
81 // Normalizer2 implementation for the old UNORM_NONE.
82 class NoopNormalizer2
: public Normalizer2
{
83 virtual ~NoopNormalizer2();
85 virtual UnicodeString
&
86 normalize(const UnicodeString
&src
,
88 UErrorCode
&errorCode
) const U_OVERRIDE
{
89 if(U_SUCCESS(errorCode
)) {
93 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
99 normalizeUTF8(uint32_t options
, StringPiece src
, ByteSink
&sink
,
100 Edits
*edits
, UErrorCode
&errorCode
) const U_OVERRIDE
{
101 if(U_SUCCESS(errorCode
)) {
102 if (edits
!= nullptr) {
103 if ((options
& U_EDITS_NO_RESET
) == 0) {
106 edits
->addUnchanged(src
.length());
108 if ((options
& U_OMIT_UNCHANGED_TEXT
) == 0) {
109 sink
.Append(src
.data(), src
.length());
115 virtual UnicodeString
&
116 normalizeSecondAndAppend(UnicodeString
&first
,
117 const UnicodeString
&second
,
118 UErrorCode
&errorCode
) const U_OVERRIDE
{
119 if(U_SUCCESS(errorCode
)) {
120 if(&first
!=&second
) {
121 first
.append(second
);
123 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
128 virtual UnicodeString
&
129 append(UnicodeString
&first
,
130 const UnicodeString
&second
,
131 UErrorCode
&errorCode
) const U_OVERRIDE
{
132 if(U_SUCCESS(errorCode
)) {
133 if(&first
!=&second
) {
134 first
.append(second
);
136 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
142 getDecomposition(UChar32
, UnicodeString
&) const U_OVERRIDE
{
145 // No need to U_OVERRIDE the default getRawDecomposition().
147 isNormalized(const UnicodeString
&, UErrorCode
&errorCode
) const U_OVERRIDE
{
148 return U_SUCCESS(errorCode
);
151 isNormalizedUTF8(StringPiece
, UErrorCode
&errorCode
) const U_OVERRIDE
{
152 return U_SUCCESS(errorCode
);
154 virtual UNormalizationCheckResult
155 quickCheck(const UnicodeString
&, UErrorCode
&) const U_OVERRIDE
{
159 spanQuickCheckYes(const UnicodeString
&s
, UErrorCode
&) const U_OVERRIDE
{
162 virtual UBool
hasBoundaryBefore(UChar32
) const U_OVERRIDE
{ return TRUE
; }
163 virtual UBool
hasBoundaryAfter(UChar32
) const U_OVERRIDE
{ return TRUE
; }
164 virtual UBool
isInert(UChar32
) const U_OVERRIDE
{ return TRUE
; }
167 NoopNormalizer2::~NoopNormalizer2() {}
169 Normalizer2WithImpl::~Normalizer2WithImpl() {}
171 DecomposeNormalizer2::~DecomposeNormalizer2() {}
173 ComposeNormalizer2::~ComposeNormalizer2() {}
175 FCDNormalizer2::~FCDNormalizer2() {}
177 // instance cache ---------------------------------------------------------- ***
179 Norm2AllModes::~Norm2AllModes() {
184 Norm2AllModes::createInstance(Normalizer2Impl
*impl
, UErrorCode
&errorCode
) {
185 if(U_FAILURE(errorCode
)) {
189 Norm2AllModes
*allModes
=new Norm2AllModes(impl
);
191 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
199 Norm2AllModes::createNFCInstance(UErrorCode
&errorCode
) {
200 if(U_FAILURE(errorCode
)) {
203 Normalizer2Impl
*impl
=new Normalizer2Impl
;
205 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
208 impl
->init(norm2_nfc_data_indexes
, &norm2_nfc_data_trie
,
209 norm2_nfc_data_extraData
, norm2_nfc_data_smallFCD
);
210 return createInstance(impl
, errorCode
);
214 static UBool U_CALLCONV
uprv_normalizer2_cleanup();
217 static Norm2AllModes
*nfcSingleton
;
218 static Normalizer2
*noopSingleton
;
220 static icu::UInitOnce nfcInitOnce
= U_INITONCE_INITIALIZER
;
221 static icu::UInitOnce noopInitOnce
= U_INITONCE_INITIALIZER
;
223 // UInitOnce singleton initialization functions
224 static void U_CALLCONV
initNFCSingleton(UErrorCode
&errorCode
) {
225 nfcSingleton
=Norm2AllModes::createNFCInstance(errorCode
);
226 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2
, uprv_normalizer2_cleanup
);
229 static void U_CALLCONV
initNoopSingleton(UErrorCode
&errorCode
) {
230 if(U_FAILURE(errorCode
)) {
233 noopSingleton
=new NoopNormalizer2
;
234 if(noopSingleton
==NULL
) {
235 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
238 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2
, uprv_normalizer2_cleanup
);
243 static UBool U_CALLCONV
uprv_normalizer2_cleanup() {
246 delete noopSingleton
;
247 noopSingleton
= NULL
;
249 noopInitOnce
.reset();
255 const Norm2AllModes
*
256 Norm2AllModes::getNFCInstance(UErrorCode
&errorCode
) {
257 if(U_FAILURE(errorCode
)) { return NULL
; }
258 umtx_initOnce(nfcInitOnce
, &initNFCSingleton
, errorCode
);
263 Normalizer2::getNFCInstance(UErrorCode
&errorCode
) {
264 const Norm2AllModes
*allModes
=Norm2AllModes::getNFCInstance(errorCode
);
265 return allModes
!=NULL
? &allModes
->comp
: NULL
;
269 Normalizer2::getNFDInstance(UErrorCode
&errorCode
) {
270 const Norm2AllModes
*allModes
=Norm2AllModes::getNFCInstance(errorCode
);
271 return allModes
!=NULL
? &allModes
->decomp
: NULL
;
274 const Normalizer2
*Normalizer2Factory::getFCDInstance(UErrorCode
&errorCode
) {
275 const Norm2AllModes
*allModes
=Norm2AllModes::getNFCInstance(errorCode
);
276 return allModes
!=NULL
? &allModes
->fcd
: NULL
;
279 const Normalizer2
*Normalizer2Factory::getFCCInstance(UErrorCode
&errorCode
) {
280 const Norm2AllModes
*allModes
=Norm2AllModes::getNFCInstance(errorCode
);
281 return allModes
!=NULL
? &allModes
->fcc
: NULL
;
284 const Normalizer2
*Normalizer2Factory::getNoopInstance(UErrorCode
&errorCode
) {
285 if(U_FAILURE(errorCode
)) { return NULL
; }
286 umtx_initOnce(noopInitOnce
, &initNoopSingleton
, errorCode
);
287 return noopSingleton
;
290 const Normalizer2Impl
*
291 Normalizer2Factory::getNFCImpl(UErrorCode
&errorCode
) {
292 const Norm2AllModes
*allModes
=Norm2AllModes::getNFCInstance(errorCode
);
293 return allModes
!=NULL
? allModes
->impl
: NULL
;
296 const Normalizer2Impl
*
297 Normalizer2Factory::getImpl(const Normalizer2
*norm2
) {
298 return &((Normalizer2WithImpl
*)norm2
)->impl
;
303 // C API ------------------------------------------------------------------- ***
307 U_CAPI
const UNormalizer2
* U_EXPORT2
308 unorm2_getNFCInstance(UErrorCode
*pErrorCode
) {
309 return (const UNormalizer2
*)Normalizer2::getNFCInstance(*pErrorCode
);
312 U_CAPI
const UNormalizer2
* U_EXPORT2
313 unorm2_getNFDInstance(UErrorCode
*pErrorCode
) {
314 return (const UNormalizer2
*)Normalizer2::getNFDInstance(*pErrorCode
);
317 U_CAPI
void U_EXPORT2
318 unorm2_close(UNormalizer2
*norm2
) {
319 delete (Normalizer2
*)norm2
;
322 U_CAPI
int32_t U_EXPORT2
323 unorm2_normalize(const UNormalizer2
*norm2
,
324 const UChar
*src
, int32_t length
,
325 UChar
*dest
, int32_t capacity
,
326 UErrorCode
*pErrorCode
) {
327 if(U_FAILURE(*pErrorCode
)) {
330 if( (src
==NULL
? length
!=0 : length
<-1) ||
331 (dest
==NULL
? capacity
!=0 : capacity
<0) ||
332 (src
==dest
&& src
!=NULL
)
334 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
337 UnicodeString
destString(dest
, 0, capacity
);
338 // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
340 const Normalizer2
*n2
=(const Normalizer2
*)norm2
;
341 const Normalizer2WithImpl
*n2wi
=dynamic_cast<const Normalizer2WithImpl
*>(n2
);
343 // Avoid duplicate argument checking and support NUL-terminated src.
344 ReorderingBuffer
buffer(n2wi
->impl
, destString
);
345 if(buffer
.init(length
, *pErrorCode
)) {
346 n2wi
->normalize(src
, length
>=0 ? src
+length
: NULL
, buffer
, *pErrorCode
);
349 UnicodeString
srcString(length
<0, src
, length
);
350 n2
->normalize(srcString
, destString
, *pErrorCode
);
353 return destString
.extract(dest
, capacity
, *pErrorCode
);
357 normalizeSecondAndAppend(const UNormalizer2
*norm2
,
358 UChar
*first
, int32_t firstLength
, int32_t firstCapacity
,
359 const UChar
*second
, int32_t secondLength
,
361 UErrorCode
*pErrorCode
) {
362 if(U_FAILURE(*pErrorCode
)) {
365 if( (second
==NULL
? secondLength
!=0 : secondLength
<-1) ||
366 (first
==NULL
? (firstCapacity
!=0 || firstLength
!=0) :
367 (firstCapacity
<0 || firstLength
<-1)) ||
368 (first
==second
&& first
!=NULL
)
370 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
373 UnicodeString
firstString(first
, firstLength
, firstCapacity
);
374 firstLength
=firstString
.length(); // In case it was -1.
375 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
376 if(secondLength
!=0) {
377 const Normalizer2
*n2
=(const Normalizer2
*)norm2
;
378 const Normalizer2WithImpl
*n2wi
=dynamic_cast<const Normalizer2WithImpl
*>(n2
);
380 // Avoid duplicate argument checking and support NUL-terminated src.
381 UnicodeString safeMiddle
;
383 ReorderingBuffer
buffer(n2wi
->impl
, firstString
);
384 if(buffer
.init(firstLength
+secondLength
+1, *pErrorCode
)) { // destCapacity>=-1
385 n2wi
->normalizeAndAppend(second
, secondLength
>=0 ? second
+secondLength
: NULL
,
386 doNormalize
, safeMiddle
, buffer
, *pErrorCode
);
388 } // The ReorderingBuffer destructor finalizes firstString.
389 if(U_FAILURE(*pErrorCode
) || firstString
.length()>firstCapacity
) {
390 // Restore the modified suffix of the first string.
391 // This does not restore first[] array contents between firstLength and firstCapacity.
392 // (That might be uninitialized memory, as far as we know.)
393 if(first
!=NULL
) { /* don't dereference NULL */
394 safeMiddle
.extract(0, 0x7fffffff, first
+firstLength
-safeMiddle
.length());
395 if(firstLength
<firstCapacity
) {
396 first
[firstLength
]=0; // NUL-terminate in case it was originally.
401 UnicodeString
secondString(secondLength
<0, second
, secondLength
);
403 n2
->normalizeSecondAndAppend(firstString
, secondString
, *pErrorCode
);
405 n2
->append(firstString
, secondString
, *pErrorCode
);
409 return firstString
.extract(first
, firstCapacity
, *pErrorCode
);
412 U_CAPI
int32_t U_EXPORT2
413 unorm2_normalizeSecondAndAppend(const UNormalizer2
*norm2
,
414 UChar
*first
, int32_t firstLength
, int32_t firstCapacity
,
415 const UChar
*second
, int32_t secondLength
,
416 UErrorCode
*pErrorCode
) {
417 return normalizeSecondAndAppend(norm2
,
418 first
, firstLength
, firstCapacity
,
419 second
, secondLength
,
423 U_CAPI
int32_t U_EXPORT2
424 unorm2_append(const UNormalizer2
*norm2
,
425 UChar
*first
, int32_t firstLength
, int32_t firstCapacity
,
426 const UChar
*second
, int32_t secondLength
,
427 UErrorCode
*pErrorCode
) {
428 return normalizeSecondAndAppend(norm2
,
429 first
, firstLength
, firstCapacity
,
430 second
, secondLength
,
434 U_CAPI
int32_t U_EXPORT2
435 unorm2_getDecomposition(const UNormalizer2
*norm2
,
436 UChar32 c
, UChar
*decomposition
, int32_t capacity
,
437 UErrorCode
*pErrorCode
) {
438 if(U_FAILURE(*pErrorCode
)) {
441 if(decomposition
==NULL
? capacity
!=0 : capacity
<0) {
442 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
445 UnicodeString
destString(decomposition
, 0, capacity
);
446 if(reinterpret_cast<const Normalizer2
*>(norm2
)->getDecomposition(c
, destString
)) {
447 return destString
.extract(decomposition
, capacity
, *pErrorCode
);
453 U_CAPI
int32_t U_EXPORT2
454 unorm2_getRawDecomposition(const UNormalizer2
*norm2
,
455 UChar32 c
, UChar
*decomposition
, int32_t capacity
,
456 UErrorCode
*pErrorCode
) {
457 if(U_FAILURE(*pErrorCode
)) {
460 if(decomposition
==NULL
? capacity
!=0 : capacity
<0) {
461 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
464 UnicodeString
destString(decomposition
, 0, capacity
);
465 if(reinterpret_cast<const Normalizer2
*>(norm2
)->getRawDecomposition(c
, destString
)) {
466 return destString
.extract(decomposition
, capacity
, *pErrorCode
);
472 U_CAPI UChar32 U_EXPORT2
473 unorm2_composePair(const UNormalizer2
*norm2
, UChar32 a
, UChar32 b
) {
474 return reinterpret_cast<const Normalizer2
*>(norm2
)->composePair(a
, b
);
477 U_CAPI
uint8_t U_EXPORT2
478 unorm2_getCombiningClass(const UNormalizer2
*norm2
, UChar32 c
) {
479 return reinterpret_cast<const Normalizer2
*>(norm2
)->getCombiningClass(c
);
482 U_CAPI UBool U_EXPORT2
483 unorm2_isNormalized(const UNormalizer2
*norm2
,
484 const UChar
*s
, int32_t length
,
485 UErrorCode
*pErrorCode
) {
486 if(U_FAILURE(*pErrorCode
)) {
489 if((s
==NULL
&& length
!=0) || length
<-1) {
490 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
493 UnicodeString
sString(length
<0, s
, length
);
494 return ((const Normalizer2
*)norm2
)->isNormalized(sString
, *pErrorCode
);
497 U_CAPI UNormalizationCheckResult U_EXPORT2
498 unorm2_quickCheck(const UNormalizer2
*norm2
,
499 const UChar
*s
, int32_t length
,
500 UErrorCode
*pErrorCode
) {
501 if(U_FAILURE(*pErrorCode
)) {
504 if((s
==NULL
&& length
!=0) || length
<-1) {
505 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
508 UnicodeString
sString(length
<0, s
, length
);
509 return ((const Normalizer2
*)norm2
)->quickCheck(sString
, *pErrorCode
);
512 U_CAPI
int32_t U_EXPORT2
513 unorm2_spanQuickCheckYes(const UNormalizer2
*norm2
,
514 const UChar
*s
, int32_t length
,
515 UErrorCode
*pErrorCode
) {
516 if(U_FAILURE(*pErrorCode
)) {
519 if((s
==NULL
&& length
!=0) || length
<-1) {
520 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
523 UnicodeString
sString(length
<0, s
, length
);
524 return ((const Normalizer2
*)norm2
)->spanQuickCheckYes(sString
, *pErrorCode
);
527 U_CAPI UBool U_EXPORT2
528 unorm2_hasBoundaryBefore(const UNormalizer2
*norm2
, UChar32 c
) {
529 return ((const Normalizer2
*)norm2
)->hasBoundaryBefore(c
);
532 U_CAPI UBool U_EXPORT2
533 unorm2_hasBoundaryAfter(const UNormalizer2
*norm2
, UChar32 c
) {
534 return ((const Normalizer2
*)norm2
)->hasBoundaryAfter(c
);
537 U_CAPI UBool U_EXPORT2
538 unorm2_isInert(const UNormalizer2
*norm2
, UChar32 c
) {
539 return ((const Normalizer2
*)norm2
)->isInert(c
);
542 // Some properties APIs ---------------------------------------------------- ***
544 U_CAPI
uint8_t U_EXPORT2
545 u_getCombiningClass(UChar32 c
) {
546 UErrorCode errorCode
=U_ZERO_ERROR
;
547 const Normalizer2
*nfd
=Normalizer2::getNFDInstance(errorCode
);
548 if(U_SUCCESS(errorCode
)) {
549 return nfd
->getCombiningClass(c
);
556 unorm_getFCD16(UChar32 c
) {
557 UErrorCode errorCode
=U_ZERO_ERROR
;
558 const Normalizer2Impl
*impl
=Normalizer2Factory::getNFCImpl(errorCode
);
559 if(U_SUCCESS(errorCode
)) {
560 return impl
->getFCD16(c
);
566 #endif // !UCONFIG_NO_NORMALIZATION