]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/normalizer2.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / normalizer2.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_NORMALIZATION
22
23 #include "unicode/edits.h"
24 #include "unicode/normalizer2.h"
25 #include "unicode/stringoptions.h"
26 #include "unicode/unistr.h"
27 #include "unicode/unorm.h"
28 #include "cstring.h"
29 #include "mutex.h"
30 #include "norm2allmodes.h"
31 #include "normalizer2impl.h"
32 #include "uassert.h"
33 #include "ucln_cmn.h"
34
35 using icu::Normalizer2Impl;
36
37 #if NORM2_HARDCODE_NFC_DATA
38 // NFC/NFD data machine-generated by gennorm2 --csource
39 #define INCLUDED_FROM_NORMALIZER2_CPP
40 #include "norm2_nfc_data.h"
41 #endif
42
43 U_NAMESPACE_BEGIN
44
45 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
46
47 Normalizer2::~Normalizer2() {}
48
49 void
50 Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
51 Edits *edits, UErrorCode &errorCode) const {
52 if (U_FAILURE(errorCode)) {
53 return;
54 }
55 if (edits != nullptr) {
56 errorCode = U_UNSUPPORTED_ERROR;
57 return;
58 }
59 UnicodeString src16 = UnicodeString::fromUTF8(src);
60 normalize(src16, errorCode).toUTF8(sink);
61 }
62
63 UBool
64 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
65 return FALSE;
66 }
67
68 UChar32
69 Normalizer2::composePair(UChar32, UChar32) const {
70 return U_SENTINEL;
71 }
72
73 uint8_t
74 Normalizer2::getCombiningClass(UChar32 /*c*/) const {
75 return 0;
76 }
77
78 UBool
79 Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
80 return U_SUCCESS(errorCode) && isNormalized(UnicodeString::fromUTF8(s), errorCode);
81 }
82
83 // Normalizer2 implementation for the old UNORM_NONE.
84 class NoopNormalizer2 : public Normalizer2 {
85 virtual ~NoopNormalizer2();
86
87 virtual UnicodeString &
88 normalize(const UnicodeString &src,
89 UnicodeString &dest,
90 UErrorCode &errorCode) const U_OVERRIDE {
91 if(U_SUCCESS(errorCode)) {
92 if(&dest!=&src) {
93 dest=src;
94 } else {
95 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
96 }
97 }
98 return dest;
99 }
100 virtual void
101 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
102 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE {
103 if(U_SUCCESS(errorCode)) {
104 if (edits != nullptr) {
105 if ((options & U_EDITS_NO_RESET) == 0) {
106 edits->reset();
107 }
108 edits->addUnchanged(src.length());
109 }
110 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
111 sink.Append(src.data(), src.length());
112 }
113 sink.Flush();
114 }
115 }
116
117 virtual UnicodeString &
118 normalizeSecondAndAppend(UnicodeString &first,
119 const UnicodeString &second,
120 UErrorCode &errorCode) const U_OVERRIDE {
121 if(U_SUCCESS(errorCode)) {
122 if(&first!=&second) {
123 first.append(second);
124 } else {
125 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
126 }
127 }
128 return first;
129 }
130 virtual UnicodeString &
131 append(UnicodeString &first,
132 const UnicodeString &second,
133 UErrorCode &errorCode) const U_OVERRIDE {
134 if(U_SUCCESS(errorCode)) {
135 if(&first!=&second) {
136 first.append(second);
137 } else {
138 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
139 }
140 }
141 return first;
142 }
143 virtual UBool
144 getDecomposition(UChar32, UnicodeString &) const U_OVERRIDE {
145 return FALSE;
146 }
147 // No need to U_OVERRIDE the default getRawDecomposition().
148 virtual UBool
149 isNormalized(const UnicodeString &, UErrorCode &errorCode) const U_OVERRIDE {
150 return U_SUCCESS(errorCode);
151 }
152 virtual UBool
153 isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const U_OVERRIDE {
154 return U_SUCCESS(errorCode);
155 }
156 virtual UNormalizationCheckResult
157 quickCheck(const UnicodeString &, UErrorCode &) const U_OVERRIDE {
158 return UNORM_YES;
159 }
160 virtual int32_t
161 spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const U_OVERRIDE {
162 return s.length();
163 }
164 virtual UBool hasBoundaryBefore(UChar32) const U_OVERRIDE { return TRUE; }
165 virtual UBool hasBoundaryAfter(UChar32) const U_OVERRIDE { return TRUE; }
166 virtual UBool isInert(UChar32) const U_OVERRIDE { return TRUE; }
167 };
168
169 NoopNormalizer2::~NoopNormalizer2() {}
170
171 Normalizer2WithImpl::~Normalizer2WithImpl() {}
172
173 DecomposeNormalizer2::~DecomposeNormalizer2() {}
174
175 ComposeNormalizer2::~ComposeNormalizer2() {}
176
177 FCDNormalizer2::~FCDNormalizer2() {}
178
179 // instance cache ---------------------------------------------------------- ***
180
181 U_CDECL_BEGIN
182 static UBool U_CALLCONV uprv_normalizer2_cleanup();
183 U_CDECL_END
184
185 static Normalizer2 *noopSingleton;
186 static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
187
188 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
189 if(U_FAILURE(errorCode)) {
190 return;
191 }
192 noopSingleton=new NoopNormalizer2;
193 if(noopSingleton==NULL) {
194 errorCode=U_MEMORY_ALLOCATION_ERROR;
195 return;
196 }
197 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
198 }
199
200 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
201 if(U_FAILURE(errorCode)) { return NULL; }
202 umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
203 return noopSingleton;
204 }
205
206 const Normalizer2Impl *
207 Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
208 return &((Normalizer2WithImpl *)norm2)->impl;
209 }
210
211 Norm2AllModes::~Norm2AllModes() {
212 delete impl;
213 }
214
215 Norm2AllModes *
216 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
217 if(U_FAILURE(errorCode)) {
218 delete impl;
219 return NULL;
220 }
221 Norm2AllModes *allModes=new Norm2AllModes(impl);
222 if(allModes==NULL) {
223 errorCode=U_MEMORY_ALLOCATION_ERROR;
224 delete impl;
225 return NULL;
226 }
227 return allModes;
228 }
229
230 #if NORM2_HARDCODE_NFC_DATA
231 Norm2AllModes *
232 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
233 if(U_FAILURE(errorCode)) {
234 return NULL;
235 }
236 Normalizer2Impl *impl=new Normalizer2Impl;
237 if(impl==NULL) {
238 errorCode=U_MEMORY_ALLOCATION_ERROR;
239 return NULL;
240 }
241 impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
242 norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
243 return createInstance(impl, errorCode);
244 }
245
246 static Norm2AllModes *nfcSingleton;
247
248 static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
249
250 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
251 nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
252 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
253 }
254
255 const Norm2AllModes *
256 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
257 if(U_FAILURE(errorCode)) { return NULL; }
258 umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
259 return nfcSingleton;
260 }
261
262 const Normalizer2 *
263 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
264 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
265 return allModes!=NULL ? &allModes->comp : NULL;
266 }
267
268 const Normalizer2 *
269 Normalizer2::getNFDInstance(UErrorCode &errorCode) {
270 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
271 return allModes!=NULL ? &allModes->decomp : NULL;
272 }
273
274 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
275 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
276 return allModes!=NULL ? &allModes->fcd : NULL;
277 }
278
279 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
280 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
281 return allModes!=NULL ? &allModes->fcc : NULL;
282 }
283
284 const Normalizer2Impl *
285 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
286 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
287 return allModes!=NULL ? allModes->impl : NULL;
288 }
289 #endif // NORM2_HARDCODE_NFC_DATA
290
291 U_CDECL_BEGIN
292
293 static UBool U_CALLCONV uprv_normalizer2_cleanup() {
294 delete noopSingleton;
295 noopSingleton = NULL;
296 noopInitOnce.reset();
297 #if NORM2_HARDCODE_NFC_DATA
298 delete nfcSingleton;
299 nfcSingleton = NULL;
300 nfcInitOnce.reset();
301 #endif
302 return TRUE;
303 }
304
305 U_CDECL_END
306
307 U_NAMESPACE_END
308
309 // C API ------------------------------------------------------------------- ***
310
311 U_NAMESPACE_USE
312
313 U_CAPI const UNormalizer2 * U_EXPORT2
314 unorm2_getNFCInstance(UErrorCode *pErrorCode) {
315 return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
316 }
317
318 U_CAPI const UNormalizer2 * U_EXPORT2
319 unorm2_getNFDInstance(UErrorCode *pErrorCode) {
320 return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
321 }
322
323 U_CAPI void U_EXPORT2
324 unorm2_close(UNormalizer2 *norm2) {
325 delete (Normalizer2 *)norm2;
326 }
327
328 U_CAPI int32_t U_EXPORT2
329 unorm2_normalize(const UNormalizer2 *norm2,
330 const UChar *src, int32_t length,
331 UChar *dest, int32_t capacity,
332 UErrorCode *pErrorCode) {
333 if(U_FAILURE(*pErrorCode)) {
334 return 0;
335 }
336 if( (src==NULL ? length!=0 : length<-1) ||
337 (dest==NULL ? capacity!=0 : capacity<0) ||
338 (src==dest && src!=NULL)
339 ) {
340 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
341 return 0;
342 }
343 UnicodeString destString(dest, 0, capacity);
344 // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
345 if(length!=0) {
346 const Normalizer2 *n2=(const Normalizer2 *)norm2;
347 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
348 if(n2wi!=NULL) {
349 // Avoid duplicate argument checking and support NUL-terminated src.
350 ReorderingBuffer buffer(n2wi->impl, destString);
351 if(buffer.init(length, *pErrorCode)) {
352 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
353 }
354 } else {
355 UnicodeString srcString(length<0, src, length);
356 n2->normalize(srcString, destString, *pErrorCode);
357 }
358 }
359 return destString.extract(dest, capacity, *pErrorCode);
360 }
361
362 static int32_t
363 normalizeSecondAndAppend(const UNormalizer2 *norm2,
364 UChar *first, int32_t firstLength, int32_t firstCapacity,
365 const UChar *second, int32_t secondLength,
366 UBool doNormalize,
367 UErrorCode *pErrorCode) {
368 if(U_FAILURE(*pErrorCode)) {
369 return 0;
370 }
371 if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
372 (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
373 (firstCapacity<0 || firstLength<-1)) ||
374 (first==second && first!=NULL)
375 ) {
376 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
377 return 0;
378 }
379 UnicodeString firstString(first, firstLength, firstCapacity);
380 firstLength=firstString.length(); // In case it was -1.
381 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
382 if(secondLength!=0) {
383 const Normalizer2 *n2=(const Normalizer2 *)norm2;
384 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
385 if(n2wi!=NULL) {
386 // Avoid duplicate argument checking and support NUL-terminated src.
387 UnicodeString safeMiddle;
388 {
389 ReorderingBuffer buffer(n2wi->impl, firstString);
390 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
391 n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
392 doNormalize, safeMiddle, buffer, *pErrorCode);
393 }
394 } // The ReorderingBuffer destructor finalizes firstString.
395 if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
396 // Restore the modified suffix of the first string.
397 // This does not restore first[] array contents between firstLength and firstCapacity.
398 // (That might be uninitialized memory, as far as we know.)
399 if(first!=NULL) { /* don't dereference NULL */
400 safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
401 if(firstLength<firstCapacity) {
402 first[firstLength]=0; // NUL-terminate in case it was originally.
403 }
404 }
405 }
406 } else {
407 UnicodeString secondString(secondLength<0, second, secondLength);
408 if(doNormalize) {
409 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
410 } else {
411 n2->append(firstString, secondString, *pErrorCode);
412 }
413 }
414 }
415 return firstString.extract(first, firstCapacity, *pErrorCode);
416 }
417
418 U_CAPI int32_t U_EXPORT2
419 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
420 UChar *first, int32_t firstLength, int32_t firstCapacity,
421 const UChar *second, int32_t secondLength,
422 UErrorCode *pErrorCode) {
423 return normalizeSecondAndAppend(norm2,
424 first, firstLength, firstCapacity,
425 second, secondLength,
426 TRUE, pErrorCode);
427 }
428
429 U_CAPI int32_t U_EXPORT2
430 unorm2_append(const UNormalizer2 *norm2,
431 UChar *first, int32_t firstLength, int32_t firstCapacity,
432 const UChar *second, int32_t secondLength,
433 UErrorCode *pErrorCode) {
434 return normalizeSecondAndAppend(norm2,
435 first, firstLength, firstCapacity,
436 second, secondLength,
437 FALSE, pErrorCode);
438 }
439
440 U_CAPI int32_t U_EXPORT2
441 unorm2_getDecomposition(const UNormalizer2 *norm2,
442 UChar32 c, UChar *decomposition, int32_t capacity,
443 UErrorCode *pErrorCode) {
444 if(U_FAILURE(*pErrorCode)) {
445 return 0;
446 }
447 if(decomposition==NULL ? capacity!=0 : capacity<0) {
448 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
449 return 0;
450 }
451 UnicodeString destString(decomposition, 0, capacity);
452 if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
453 return destString.extract(decomposition, capacity, *pErrorCode);
454 } else {
455 return -1;
456 }
457 }
458
459 U_CAPI int32_t U_EXPORT2
460 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
461 UChar32 c, UChar *decomposition, int32_t capacity,
462 UErrorCode *pErrorCode) {
463 if(U_FAILURE(*pErrorCode)) {
464 return 0;
465 }
466 if(decomposition==NULL ? capacity!=0 : capacity<0) {
467 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
468 return 0;
469 }
470 UnicodeString destString(decomposition, 0, capacity);
471 if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
472 return destString.extract(decomposition, capacity, *pErrorCode);
473 } else {
474 return -1;
475 }
476 }
477
478 U_CAPI UChar32 U_EXPORT2
479 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
480 return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
481 }
482
483 U_CAPI uint8_t U_EXPORT2
484 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
485 return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
486 }
487
488 U_CAPI UBool U_EXPORT2
489 unorm2_isNormalized(const UNormalizer2 *norm2,
490 const UChar *s, int32_t length,
491 UErrorCode *pErrorCode) {
492 if(U_FAILURE(*pErrorCode)) {
493 return 0;
494 }
495 if((s==NULL && length!=0) || length<-1) {
496 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
497 return 0;
498 }
499 UnicodeString sString(length<0, s, length);
500 return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
501 }
502
503 U_CAPI UNormalizationCheckResult U_EXPORT2
504 unorm2_quickCheck(const UNormalizer2 *norm2,
505 const UChar *s, int32_t length,
506 UErrorCode *pErrorCode) {
507 if(U_FAILURE(*pErrorCode)) {
508 return UNORM_NO;
509 }
510 if((s==NULL && length!=0) || length<-1) {
511 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
512 return UNORM_NO;
513 }
514 UnicodeString sString(length<0, s, length);
515 return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
516 }
517
518 U_CAPI int32_t U_EXPORT2
519 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
520 const UChar *s, int32_t length,
521 UErrorCode *pErrorCode) {
522 if(U_FAILURE(*pErrorCode)) {
523 return 0;
524 }
525 if((s==NULL && length!=0) || length<-1) {
526 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
527 return 0;
528 }
529 UnicodeString sString(length<0, s, length);
530 return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
531 }
532
533 U_CAPI UBool U_EXPORT2
534 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
535 return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
536 }
537
538 U_CAPI UBool U_EXPORT2
539 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
540 return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
541 }
542
543 U_CAPI UBool U_EXPORT2
544 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
545 return ((const Normalizer2 *)norm2)->isInert(c);
546 }
547
548 // Some properties APIs ---------------------------------------------------- ***
549
550 U_CAPI uint8_t U_EXPORT2
551 u_getCombiningClass(UChar32 c) {
552 UErrorCode errorCode=U_ZERO_ERROR;
553 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
554 if(U_SUCCESS(errorCode)) {
555 return nfd->getCombiningClass(c);
556 } else {
557 return 0;
558 }
559 }
560
561 U_CFUNC uint16_t
562 unorm_getFCD16(UChar32 c) {
563 UErrorCode errorCode=U_ZERO_ERROR;
564 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
565 if(U_SUCCESS(errorCode)) {
566 return impl->getFCD16(c);
567 } else {
568 return 0;
569 }
570 }
571
572 #endif // !UCONFIG_NO_NORMALIZATION