]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/common/normalizer2.cpp
ICU-62141.0.1.tar.gz
[apple/icu.git] / icuSources / common / normalizer2.cpp
... / ...
CommitLineData
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2016, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: normalizer2.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov22
16* created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_NORMALIZATION
22
23#include "unicode/edits.h"
24#include "unicode/normalizer2.h"
25#include "unicode/stringoptions.h"
26#include "unicode/unistr.h"
27#include "unicode/unorm.h"
28#include "cstring.h"
29#include "mutex.h"
30#include "norm2allmodes.h"
31#include "normalizer2impl.h"
32#include "uassert.h"
33#include "ucln_cmn.h"
34
35using icu::Normalizer2Impl;
36
37// NFC/NFD data machine-generated by gennorm2 --csource
38#define INCLUDED_FROM_NORMALIZER2_CPP
39#include "norm2_nfc_data.h"
40
41U_NAMESPACE_BEGIN
42
43// Public API dispatch via Normalizer2 subclasses -------------------------- ***
44
45Normalizer2::~Normalizer2() {}
46
47void
48Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
49 Edits *edits, UErrorCode &errorCode) const {
50 if (U_FAILURE(errorCode)) {
51 return;
52 }
53 if (edits != nullptr) {
54 errorCode = U_UNSUPPORTED_ERROR;
55 return;
56 }
57 UnicodeString src16 = UnicodeString::fromUTF8(src);
58 normalize(src16, errorCode).toUTF8(sink);
59}
60
61UBool
62Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
63 return FALSE;
64}
65
66UChar32
67Normalizer2::composePair(UChar32, UChar32) const {
68 return U_SENTINEL;
69}
70
71uint8_t
72Normalizer2::getCombiningClass(UChar32 /*c*/) const {
73 return 0;
74}
75
76UBool
77Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
78 return U_SUCCESS(errorCode) && isNormalized(UnicodeString::fromUTF8(s), errorCode);
79}
80
81// Normalizer2 implementation for the old UNORM_NONE.
82class NoopNormalizer2 : public Normalizer2 {
83 virtual ~NoopNormalizer2();
84
85 virtual UnicodeString &
86 normalize(const UnicodeString &src,
87 UnicodeString &dest,
88 UErrorCode &errorCode) const U_OVERRIDE {
89 if(U_SUCCESS(errorCode)) {
90 if(&dest!=&src) {
91 dest=src;
92 } else {
93 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
94 }
95 }
96 return dest;
97 }
98 virtual void
99 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
100 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE {
101 if(U_SUCCESS(errorCode)) {
102 if (edits != nullptr) {
103 if ((options & U_EDITS_NO_RESET) == 0) {
104 edits->reset();
105 }
106 edits->addUnchanged(src.length());
107 }
108 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
109 sink.Append(src.data(), src.length());
110 }
111 sink.Flush();
112 }
113 }
114
115 virtual UnicodeString &
116 normalizeSecondAndAppend(UnicodeString &first,
117 const UnicodeString &second,
118 UErrorCode &errorCode) const U_OVERRIDE {
119 if(U_SUCCESS(errorCode)) {
120 if(&first!=&second) {
121 first.append(second);
122 } else {
123 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
124 }
125 }
126 return first;
127 }
128 virtual UnicodeString &
129 append(UnicodeString &first,
130 const UnicodeString &second,
131 UErrorCode &errorCode) const U_OVERRIDE {
132 if(U_SUCCESS(errorCode)) {
133 if(&first!=&second) {
134 first.append(second);
135 } else {
136 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
137 }
138 }
139 return first;
140 }
141 virtual UBool
142 getDecomposition(UChar32, UnicodeString &) const U_OVERRIDE {
143 return FALSE;
144 }
145 // No need to U_OVERRIDE the default getRawDecomposition().
146 virtual UBool
147 isNormalized(const UnicodeString &, UErrorCode &errorCode) const U_OVERRIDE {
148 return U_SUCCESS(errorCode);
149 }
150 virtual UBool
151 isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const U_OVERRIDE {
152 return U_SUCCESS(errorCode);
153 }
154 virtual UNormalizationCheckResult
155 quickCheck(const UnicodeString &, UErrorCode &) const U_OVERRIDE {
156 return UNORM_YES;
157 }
158 virtual int32_t
159 spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const U_OVERRIDE {
160 return s.length();
161 }
162 virtual UBool hasBoundaryBefore(UChar32) const U_OVERRIDE { return TRUE; }
163 virtual UBool hasBoundaryAfter(UChar32) const U_OVERRIDE { return TRUE; }
164 virtual UBool isInert(UChar32) const U_OVERRIDE { return TRUE; }
165};
166
167NoopNormalizer2::~NoopNormalizer2() {}
168
169Normalizer2WithImpl::~Normalizer2WithImpl() {}
170
171DecomposeNormalizer2::~DecomposeNormalizer2() {}
172
173ComposeNormalizer2::~ComposeNormalizer2() {}
174
175FCDNormalizer2::~FCDNormalizer2() {}
176
177// instance cache ---------------------------------------------------------- ***
178
179Norm2AllModes::~Norm2AllModes() {
180 delete impl;
181}
182
183Norm2AllModes *
184Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
185 if(U_FAILURE(errorCode)) {
186 delete impl;
187 return NULL;
188 }
189 Norm2AllModes *allModes=new Norm2AllModes(impl);
190 if(allModes==NULL) {
191 errorCode=U_MEMORY_ALLOCATION_ERROR;
192 delete impl;
193 return NULL;
194 }
195 return allModes;
196}
197
198Norm2AllModes *
199Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
200 if(U_FAILURE(errorCode)) {
201 return NULL;
202 }
203 Normalizer2Impl *impl=new Normalizer2Impl;
204 if(impl==NULL) {
205 errorCode=U_MEMORY_ALLOCATION_ERROR;
206 return NULL;
207 }
208 impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
209 norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
210 return createInstance(impl, errorCode);
211}
212
213U_CDECL_BEGIN
214static UBool U_CALLCONV uprv_normalizer2_cleanup();
215U_CDECL_END
216
217static Norm2AllModes *nfcSingleton;
218static Normalizer2 *noopSingleton;
219
220static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
221static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
222
223// UInitOnce singleton initialization functions
224static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
225 nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
226 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
227}
228
229static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
230 if(U_FAILURE(errorCode)) {
231 return;
232 }
233 noopSingleton=new NoopNormalizer2;
234 if(noopSingleton==NULL) {
235 errorCode=U_MEMORY_ALLOCATION_ERROR;
236 return;
237 }
238 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
239}
240
241U_CDECL_BEGIN
242
243static UBool U_CALLCONV uprv_normalizer2_cleanup() {
244 delete nfcSingleton;
245 nfcSingleton = NULL;
246 delete noopSingleton;
247 noopSingleton = NULL;
248 nfcInitOnce.reset();
249 noopInitOnce.reset();
250 return TRUE;
251}
252
253U_CDECL_END
254
255const Norm2AllModes *
256Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
257 if(U_FAILURE(errorCode)) { return NULL; }
258 umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
259 return nfcSingleton;
260}
261
262const Normalizer2 *
263Normalizer2::getNFCInstance(UErrorCode &errorCode) {
264 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
265 return allModes!=NULL ? &allModes->comp : NULL;
266}
267
268const Normalizer2 *
269Normalizer2::getNFDInstance(UErrorCode &errorCode) {
270 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
271 return allModes!=NULL ? &allModes->decomp : NULL;
272}
273
274const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
275 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
276 return allModes!=NULL ? &allModes->fcd : NULL;
277}
278
279const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
280 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
281 return allModes!=NULL ? &allModes->fcc : NULL;
282}
283
284const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
285 if(U_FAILURE(errorCode)) { return NULL; }
286 umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
287 return noopSingleton;
288}
289
290const Normalizer2Impl *
291Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
292 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
293 return allModes!=NULL ? allModes->impl : NULL;
294}
295
296const Normalizer2Impl *
297Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
298 return &((Normalizer2WithImpl *)norm2)->impl;
299}
300
301U_NAMESPACE_END
302
303// C API ------------------------------------------------------------------- ***
304
305U_NAMESPACE_USE
306
307U_CAPI const UNormalizer2 * U_EXPORT2
308unorm2_getNFCInstance(UErrorCode *pErrorCode) {
309 return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
310}
311
312U_CAPI const UNormalizer2 * U_EXPORT2
313unorm2_getNFDInstance(UErrorCode *pErrorCode) {
314 return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
315}
316
317U_CAPI void U_EXPORT2
318unorm2_close(UNormalizer2 *norm2) {
319 delete (Normalizer2 *)norm2;
320}
321
322U_CAPI int32_t U_EXPORT2
323unorm2_normalize(const UNormalizer2 *norm2,
324 const UChar *src, int32_t length,
325 UChar *dest, int32_t capacity,
326 UErrorCode *pErrorCode) {
327 if(U_FAILURE(*pErrorCode)) {
328 return 0;
329 }
330 if( (src==NULL ? length!=0 : length<-1) ||
331 (dest==NULL ? capacity!=0 : capacity<0) ||
332 (src==dest && src!=NULL)
333 ) {
334 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
335 return 0;
336 }
337 UnicodeString destString(dest, 0, capacity);
338 // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
339 if(length!=0) {
340 const Normalizer2 *n2=(const Normalizer2 *)norm2;
341 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
342 if(n2wi!=NULL) {
343 // Avoid duplicate argument checking and support NUL-terminated src.
344 ReorderingBuffer buffer(n2wi->impl, destString);
345 if(buffer.init(length, *pErrorCode)) {
346 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
347 }
348 } else {
349 UnicodeString srcString(length<0, src, length);
350 n2->normalize(srcString, destString, *pErrorCode);
351 }
352 }
353 return destString.extract(dest, capacity, *pErrorCode);
354}
355
356static int32_t
357normalizeSecondAndAppend(const UNormalizer2 *norm2,
358 UChar *first, int32_t firstLength, int32_t firstCapacity,
359 const UChar *second, int32_t secondLength,
360 UBool doNormalize,
361 UErrorCode *pErrorCode) {
362 if(U_FAILURE(*pErrorCode)) {
363 return 0;
364 }
365 if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
366 (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
367 (firstCapacity<0 || firstLength<-1)) ||
368 (first==second && first!=NULL)
369 ) {
370 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
371 return 0;
372 }
373 UnicodeString firstString(first, firstLength, firstCapacity);
374 firstLength=firstString.length(); // In case it was -1.
375 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
376 if(secondLength!=0) {
377 const Normalizer2 *n2=(const Normalizer2 *)norm2;
378 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
379 if(n2wi!=NULL) {
380 // Avoid duplicate argument checking and support NUL-terminated src.
381 UnicodeString safeMiddle;
382 {
383 ReorderingBuffer buffer(n2wi->impl, firstString);
384 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
385 n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
386 doNormalize, safeMiddle, buffer, *pErrorCode);
387 }
388 } // The ReorderingBuffer destructor finalizes firstString.
389 if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
390 // Restore the modified suffix of the first string.
391 // This does not restore first[] array contents between firstLength and firstCapacity.
392 // (That might be uninitialized memory, as far as we know.)
393 if(first!=NULL) { /* don't dereference NULL */
394 safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
395 if(firstLength<firstCapacity) {
396 first[firstLength]=0; // NUL-terminate in case it was originally.
397 }
398 }
399 }
400 } else {
401 UnicodeString secondString(secondLength<0, second, secondLength);
402 if(doNormalize) {
403 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
404 } else {
405 n2->append(firstString, secondString, *pErrorCode);
406 }
407 }
408 }
409 return firstString.extract(first, firstCapacity, *pErrorCode);
410}
411
412U_CAPI int32_t U_EXPORT2
413unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
414 UChar *first, int32_t firstLength, int32_t firstCapacity,
415 const UChar *second, int32_t secondLength,
416 UErrorCode *pErrorCode) {
417 return normalizeSecondAndAppend(norm2,
418 first, firstLength, firstCapacity,
419 second, secondLength,
420 TRUE, pErrorCode);
421}
422
423U_CAPI int32_t U_EXPORT2
424unorm2_append(const UNormalizer2 *norm2,
425 UChar *first, int32_t firstLength, int32_t firstCapacity,
426 const UChar *second, int32_t secondLength,
427 UErrorCode *pErrorCode) {
428 return normalizeSecondAndAppend(norm2,
429 first, firstLength, firstCapacity,
430 second, secondLength,
431 FALSE, pErrorCode);
432}
433
434U_CAPI int32_t U_EXPORT2
435unorm2_getDecomposition(const UNormalizer2 *norm2,
436 UChar32 c, UChar *decomposition, int32_t capacity,
437 UErrorCode *pErrorCode) {
438 if(U_FAILURE(*pErrorCode)) {
439 return 0;
440 }
441 if(decomposition==NULL ? capacity!=0 : capacity<0) {
442 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
443 return 0;
444 }
445 UnicodeString destString(decomposition, 0, capacity);
446 if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
447 return destString.extract(decomposition, capacity, *pErrorCode);
448 } else {
449 return -1;
450 }
451}
452
453U_CAPI int32_t U_EXPORT2
454unorm2_getRawDecomposition(const UNormalizer2 *norm2,
455 UChar32 c, UChar *decomposition, int32_t capacity,
456 UErrorCode *pErrorCode) {
457 if(U_FAILURE(*pErrorCode)) {
458 return 0;
459 }
460 if(decomposition==NULL ? capacity!=0 : capacity<0) {
461 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
462 return 0;
463 }
464 UnicodeString destString(decomposition, 0, capacity);
465 if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
466 return destString.extract(decomposition, capacity, *pErrorCode);
467 } else {
468 return -1;
469 }
470}
471
472U_CAPI UChar32 U_EXPORT2
473unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
474 return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
475}
476
477U_CAPI uint8_t U_EXPORT2
478unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
479 return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
480}
481
482U_CAPI UBool U_EXPORT2
483unorm2_isNormalized(const UNormalizer2 *norm2,
484 const UChar *s, int32_t length,
485 UErrorCode *pErrorCode) {
486 if(U_FAILURE(*pErrorCode)) {
487 return 0;
488 }
489 if((s==NULL && length!=0) || length<-1) {
490 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
491 return 0;
492 }
493 UnicodeString sString(length<0, s, length);
494 return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
495}
496
497U_CAPI UNormalizationCheckResult U_EXPORT2
498unorm2_quickCheck(const UNormalizer2 *norm2,
499 const UChar *s, int32_t length,
500 UErrorCode *pErrorCode) {
501 if(U_FAILURE(*pErrorCode)) {
502 return UNORM_NO;
503 }
504 if((s==NULL && length!=0) || length<-1) {
505 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
506 return UNORM_NO;
507 }
508 UnicodeString sString(length<0, s, length);
509 return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
510}
511
512U_CAPI int32_t U_EXPORT2
513unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
514 const UChar *s, int32_t length,
515 UErrorCode *pErrorCode) {
516 if(U_FAILURE(*pErrorCode)) {
517 return 0;
518 }
519 if((s==NULL && length!=0) || length<-1) {
520 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
521 return 0;
522 }
523 UnicodeString sString(length<0, s, length);
524 return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
525}
526
527U_CAPI UBool U_EXPORT2
528unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
529 return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
530}
531
532U_CAPI UBool U_EXPORT2
533unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
534 return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
535}
536
537U_CAPI UBool U_EXPORT2
538unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
539 return ((const Normalizer2 *)norm2)->isInert(c);
540}
541
542// Some properties APIs ---------------------------------------------------- ***
543
544U_CAPI uint8_t U_EXPORT2
545u_getCombiningClass(UChar32 c) {
546 UErrorCode errorCode=U_ZERO_ERROR;
547 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
548 if(U_SUCCESS(errorCode)) {
549 return nfd->getCombiningClass(c);
550 } else {
551 return 0;
552 }
553}
554
555U_CFUNC uint16_t
556unorm_getFCD16(UChar32 c) {
557 UErrorCode errorCode=U_ZERO_ERROR;
558 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
559 if(U_SUCCESS(errorCode)) {
560 return impl->getFCD16(c);
561 } else {
562 return 0;
563 }
564}
565
566#endif // !UCONFIG_NO_NORMALIZATION