]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/normalizer2.cpp
ICU-62141.0.1.tar.gz
[apple/icu.git] / icuSources / common / normalizer2.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_NORMALIZATION
22
23 #include "unicode/edits.h"
24 #include "unicode/normalizer2.h"
25 #include "unicode/stringoptions.h"
26 #include "unicode/unistr.h"
27 #include "unicode/unorm.h"
28 #include "cstring.h"
29 #include "mutex.h"
30 #include "norm2allmodes.h"
31 #include "normalizer2impl.h"
32 #include "uassert.h"
33 #include "ucln_cmn.h"
34
35 using icu::Normalizer2Impl;
36
37 // NFC/NFD data machine-generated by gennorm2 --csource
38 #define INCLUDED_FROM_NORMALIZER2_CPP
39 #include "norm2_nfc_data.h"
40
41 U_NAMESPACE_BEGIN
42
43 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
44
45 Normalizer2::~Normalizer2() {}
46
47 void
48 Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
49 Edits *edits, UErrorCode &errorCode) const {
50 if (U_FAILURE(errorCode)) {
51 return;
52 }
53 if (edits != nullptr) {
54 errorCode = U_UNSUPPORTED_ERROR;
55 return;
56 }
57 UnicodeString src16 = UnicodeString::fromUTF8(src);
58 normalize(src16, errorCode).toUTF8(sink);
59 }
60
61 UBool
62 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
63 return FALSE;
64 }
65
66 UChar32
67 Normalizer2::composePair(UChar32, UChar32) const {
68 return U_SENTINEL;
69 }
70
71 uint8_t
72 Normalizer2::getCombiningClass(UChar32 /*c*/) const {
73 return 0;
74 }
75
76 UBool
77 Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
78 return U_SUCCESS(errorCode) && isNormalized(UnicodeString::fromUTF8(s), errorCode);
79 }
80
81 // Normalizer2 implementation for the old UNORM_NONE.
82 class NoopNormalizer2 : public Normalizer2 {
83 virtual ~NoopNormalizer2();
84
85 virtual UnicodeString &
86 normalize(const UnicodeString &src,
87 UnicodeString &dest,
88 UErrorCode &errorCode) const U_OVERRIDE {
89 if(U_SUCCESS(errorCode)) {
90 if(&dest!=&src) {
91 dest=src;
92 } else {
93 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
94 }
95 }
96 return dest;
97 }
98 virtual void
99 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
100 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE {
101 if(U_SUCCESS(errorCode)) {
102 if (edits != nullptr) {
103 if ((options & U_EDITS_NO_RESET) == 0) {
104 edits->reset();
105 }
106 edits->addUnchanged(src.length());
107 }
108 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
109 sink.Append(src.data(), src.length());
110 }
111 sink.Flush();
112 }
113 }
114
115 virtual UnicodeString &
116 normalizeSecondAndAppend(UnicodeString &first,
117 const UnicodeString &second,
118 UErrorCode &errorCode) const U_OVERRIDE {
119 if(U_SUCCESS(errorCode)) {
120 if(&first!=&second) {
121 first.append(second);
122 } else {
123 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
124 }
125 }
126 return first;
127 }
128 virtual UnicodeString &
129 append(UnicodeString &first,
130 const UnicodeString &second,
131 UErrorCode &errorCode) const U_OVERRIDE {
132 if(U_SUCCESS(errorCode)) {
133 if(&first!=&second) {
134 first.append(second);
135 } else {
136 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
137 }
138 }
139 return first;
140 }
141 virtual UBool
142 getDecomposition(UChar32, UnicodeString &) const U_OVERRIDE {
143 return FALSE;
144 }
145 // No need to U_OVERRIDE the default getRawDecomposition().
146 virtual UBool
147 isNormalized(const UnicodeString &, UErrorCode &errorCode) const U_OVERRIDE {
148 return U_SUCCESS(errorCode);
149 }
150 virtual UBool
151 isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const U_OVERRIDE {
152 return U_SUCCESS(errorCode);
153 }
154 virtual UNormalizationCheckResult
155 quickCheck(const UnicodeString &, UErrorCode &) const U_OVERRIDE {
156 return UNORM_YES;
157 }
158 virtual int32_t
159 spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const U_OVERRIDE {
160 return s.length();
161 }
162 virtual UBool hasBoundaryBefore(UChar32) const U_OVERRIDE { return TRUE; }
163 virtual UBool hasBoundaryAfter(UChar32) const U_OVERRIDE { return TRUE; }
164 virtual UBool isInert(UChar32) const U_OVERRIDE { return TRUE; }
165 };
166
167 NoopNormalizer2::~NoopNormalizer2() {}
168
169 Normalizer2WithImpl::~Normalizer2WithImpl() {}
170
171 DecomposeNormalizer2::~DecomposeNormalizer2() {}
172
173 ComposeNormalizer2::~ComposeNormalizer2() {}
174
175 FCDNormalizer2::~FCDNormalizer2() {}
176
177 // instance cache ---------------------------------------------------------- ***
178
179 Norm2AllModes::~Norm2AllModes() {
180 delete impl;
181 }
182
183 Norm2AllModes *
184 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
185 if(U_FAILURE(errorCode)) {
186 delete impl;
187 return NULL;
188 }
189 Norm2AllModes *allModes=new Norm2AllModes(impl);
190 if(allModes==NULL) {
191 errorCode=U_MEMORY_ALLOCATION_ERROR;
192 delete impl;
193 return NULL;
194 }
195 return allModes;
196 }
197
198 Norm2AllModes *
199 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
200 if(U_FAILURE(errorCode)) {
201 return NULL;
202 }
203 Normalizer2Impl *impl=new Normalizer2Impl;
204 if(impl==NULL) {
205 errorCode=U_MEMORY_ALLOCATION_ERROR;
206 return NULL;
207 }
208 impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
209 norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
210 return createInstance(impl, errorCode);
211 }
212
213 U_CDECL_BEGIN
214 static UBool U_CALLCONV uprv_normalizer2_cleanup();
215 U_CDECL_END
216
217 static Norm2AllModes *nfcSingleton;
218 static Normalizer2 *noopSingleton;
219
220 static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
221 static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
222
223 // UInitOnce singleton initialization functions
224 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
225 nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
226 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
227 }
228
229 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
230 if(U_FAILURE(errorCode)) {
231 return;
232 }
233 noopSingleton=new NoopNormalizer2;
234 if(noopSingleton==NULL) {
235 errorCode=U_MEMORY_ALLOCATION_ERROR;
236 return;
237 }
238 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
239 }
240
241 U_CDECL_BEGIN
242
243 static UBool U_CALLCONV uprv_normalizer2_cleanup() {
244 delete nfcSingleton;
245 nfcSingleton = NULL;
246 delete noopSingleton;
247 noopSingleton = NULL;
248 nfcInitOnce.reset();
249 noopInitOnce.reset();
250 return TRUE;
251 }
252
253 U_CDECL_END
254
255 const Norm2AllModes *
256 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
257 if(U_FAILURE(errorCode)) { return NULL; }
258 umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
259 return nfcSingleton;
260 }
261
262 const Normalizer2 *
263 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
264 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
265 return allModes!=NULL ? &allModes->comp : NULL;
266 }
267
268 const Normalizer2 *
269 Normalizer2::getNFDInstance(UErrorCode &errorCode) {
270 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
271 return allModes!=NULL ? &allModes->decomp : NULL;
272 }
273
274 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
275 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
276 return allModes!=NULL ? &allModes->fcd : NULL;
277 }
278
279 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
280 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
281 return allModes!=NULL ? &allModes->fcc : NULL;
282 }
283
284 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
285 if(U_FAILURE(errorCode)) { return NULL; }
286 umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
287 return noopSingleton;
288 }
289
290 const Normalizer2Impl *
291 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
292 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
293 return allModes!=NULL ? allModes->impl : NULL;
294 }
295
296 const Normalizer2Impl *
297 Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
298 return &((Normalizer2WithImpl *)norm2)->impl;
299 }
300
301 U_NAMESPACE_END
302
303 // C API ------------------------------------------------------------------- ***
304
305 U_NAMESPACE_USE
306
307 U_CAPI const UNormalizer2 * U_EXPORT2
308 unorm2_getNFCInstance(UErrorCode *pErrorCode) {
309 return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
310 }
311
312 U_CAPI const UNormalizer2 * U_EXPORT2
313 unorm2_getNFDInstance(UErrorCode *pErrorCode) {
314 return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
315 }
316
317 U_CAPI void U_EXPORT2
318 unorm2_close(UNormalizer2 *norm2) {
319 delete (Normalizer2 *)norm2;
320 }
321
322 U_CAPI int32_t U_EXPORT2
323 unorm2_normalize(const UNormalizer2 *norm2,
324 const UChar *src, int32_t length,
325 UChar *dest, int32_t capacity,
326 UErrorCode *pErrorCode) {
327 if(U_FAILURE(*pErrorCode)) {
328 return 0;
329 }
330 if( (src==NULL ? length!=0 : length<-1) ||
331 (dest==NULL ? capacity!=0 : capacity<0) ||
332 (src==dest && src!=NULL)
333 ) {
334 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
335 return 0;
336 }
337 UnicodeString destString(dest, 0, capacity);
338 // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
339 if(length!=0) {
340 const Normalizer2 *n2=(const Normalizer2 *)norm2;
341 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
342 if(n2wi!=NULL) {
343 // Avoid duplicate argument checking and support NUL-terminated src.
344 ReorderingBuffer buffer(n2wi->impl, destString);
345 if(buffer.init(length, *pErrorCode)) {
346 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
347 }
348 } else {
349 UnicodeString srcString(length<0, src, length);
350 n2->normalize(srcString, destString, *pErrorCode);
351 }
352 }
353 return destString.extract(dest, capacity, *pErrorCode);
354 }
355
356 static int32_t
357 normalizeSecondAndAppend(const UNormalizer2 *norm2,
358 UChar *first, int32_t firstLength, int32_t firstCapacity,
359 const UChar *second, int32_t secondLength,
360 UBool doNormalize,
361 UErrorCode *pErrorCode) {
362 if(U_FAILURE(*pErrorCode)) {
363 return 0;
364 }
365 if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
366 (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
367 (firstCapacity<0 || firstLength<-1)) ||
368 (first==second && first!=NULL)
369 ) {
370 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
371 return 0;
372 }
373 UnicodeString firstString(first, firstLength, firstCapacity);
374 firstLength=firstString.length(); // In case it was -1.
375 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
376 if(secondLength!=0) {
377 const Normalizer2 *n2=(const Normalizer2 *)norm2;
378 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
379 if(n2wi!=NULL) {
380 // Avoid duplicate argument checking and support NUL-terminated src.
381 UnicodeString safeMiddle;
382 {
383 ReorderingBuffer buffer(n2wi->impl, firstString);
384 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
385 n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
386 doNormalize, safeMiddle, buffer, *pErrorCode);
387 }
388 } // The ReorderingBuffer destructor finalizes firstString.
389 if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
390 // Restore the modified suffix of the first string.
391 // This does not restore first[] array contents between firstLength and firstCapacity.
392 // (That might be uninitialized memory, as far as we know.)
393 if(first!=NULL) { /* don't dereference NULL */
394 safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
395 if(firstLength<firstCapacity) {
396 first[firstLength]=0; // NUL-terminate in case it was originally.
397 }
398 }
399 }
400 } else {
401 UnicodeString secondString(secondLength<0, second, secondLength);
402 if(doNormalize) {
403 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
404 } else {
405 n2->append(firstString, secondString, *pErrorCode);
406 }
407 }
408 }
409 return firstString.extract(first, firstCapacity, *pErrorCode);
410 }
411
412 U_CAPI int32_t U_EXPORT2
413 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
414 UChar *first, int32_t firstLength, int32_t firstCapacity,
415 const UChar *second, int32_t secondLength,
416 UErrorCode *pErrorCode) {
417 return normalizeSecondAndAppend(norm2,
418 first, firstLength, firstCapacity,
419 second, secondLength,
420 TRUE, pErrorCode);
421 }
422
423 U_CAPI int32_t U_EXPORT2
424 unorm2_append(const UNormalizer2 *norm2,
425 UChar *first, int32_t firstLength, int32_t firstCapacity,
426 const UChar *second, int32_t secondLength,
427 UErrorCode *pErrorCode) {
428 return normalizeSecondAndAppend(norm2,
429 first, firstLength, firstCapacity,
430 second, secondLength,
431 FALSE, pErrorCode);
432 }
433
434 U_CAPI int32_t U_EXPORT2
435 unorm2_getDecomposition(const UNormalizer2 *norm2,
436 UChar32 c, UChar *decomposition, int32_t capacity,
437 UErrorCode *pErrorCode) {
438 if(U_FAILURE(*pErrorCode)) {
439 return 0;
440 }
441 if(decomposition==NULL ? capacity!=0 : capacity<0) {
442 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
443 return 0;
444 }
445 UnicodeString destString(decomposition, 0, capacity);
446 if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
447 return destString.extract(decomposition, capacity, *pErrorCode);
448 } else {
449 return -1;
450 }
451 }
452
453 U_CAPI int32_t U_EXPORT2
454 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
455 UChar32 c, UChar *decomposition, int32_t capacity,
456 UErrorCode *pErrorCode) {
457 if(U_FAILURE(*pErrorCode)) {
458 return 0;
459 }
460 if(decomposition==NULL ? capacity!=0 : capacity<0) {
461 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
462 return 0;
463 }
464 UnicodeString destString(decomposition, 0, capacity);
465 if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
466 return destString.extract(decomposition, capacity, *pErrorCode);
467 } else {
468 return -1;
469 }
470 }
471
472 U_CAPI UChar32 U_EXPORT2
473 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
474 return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
475 }
476
477 U_CAPI uint8_t U_EXPORT2
478 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
479 return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
480 }
481
482 U_CAPI UBool U_EXPORT2
483 unorm2_isNormalized(const UNormalizer2 *norm2,
484 const UChar *s, int32_t length,
485 UErrorCode *pErrorCode) {
486 if(U_FAILURE(*pErrorCode)) {
487 return 0;
488 }
489 if((s==NULL && length!=0) || length<-1) {
490 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
491 return 0;
492 }
493 UnicodeString sString(length<0, s, length);
494 return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
495 }
496
497 U_CAPI UNormalizationCheckResult U_EXPORT2
498 unorm2_quickCheck(const UNormalizer2 *norm2,
499 const UChar *s, int32_t length,
500 UErrorCode *pErrorCode) {
501 if(U_FAILURE(*pErrorCode)) {
502 return UNORM_NO;
503 }
504 if((s==NULL && length!=0) || length<-1) {
505 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
506 return UNORM_NO;
507 }
508 UnicodeString sString(length<0, s, length);
509 return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
510 }
511
512 U_CAPI int32_t U_EXPORT2
513 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
514 const UChar *s, int32_t length,
515 UErrorCode *pErrorCode) {
516 if(U_FAILURE(*pErrorCode)) {
517 return 0;
518 }
519 if((s==NULL && length!=0) || length<-1) {
520 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
521 return 0;
522 }
523 UnicodeString sString(length<0, s, length);
524 return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
525 }
526
527 U_CAPI UBool U_EXPORT2
528 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
529 return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
530 }
531
532 U_CAPI UBool U_EXPORT2
533 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
534 return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
535 }
536
537 U_CAPI UBool U_EXPORT2
538 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
539 return ((const Normalizer2 *)norm2)->isInert(c);
540 }
541
542 // Some properties APIs ---------------------------------------------------- ***
543
544 U_CAPI uint8_t U_EXPORT2
545 u_getCombiningClass(UChar32 c) {
546 UErrorCode errorCode=U_ZERO_ERROR;
547 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
548 if(U_SUCCESS(errorCode)) {
549 return nfd->getCombiningClass(c);
550 } else {
551 return 0;
552 }
553 }
554
555 U_CFUNC uint16_t
556 unorm_getFCD16(UChar32 c) {
557 UErrorCode errorCode=U_ZERO_ERROR;
558 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
559 if(U_SUCCESS(errorCode)) {
560 return impl->getFCD16(c);
561 } else {
562 return 0;
563 }
564 }
565
566 #endif // !UCONFIG_NO_NORMALIZATION