]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/normalizer2.cpp
ICU-57165.0.1.tar.gz
[apple/icu.git] / icuSources / common / normalizer2.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: normalizer2.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009nov22
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_NORMALIZATION
20
21 #include "unicode/normalizer2.h"
22 #include "unicode/unistr.h"
23 #include "unicode/unorm.h"
24 #include "cstring.h"
25 #include "mutex.h"
26 #include "norm2allmodes.h"
27 #include "normalizer2impl.h"
28 #include "uassert.h"
29 #include "ucln_cmn.h"
30
31 using icu::Normalizer2Impl;
32
33 // NFC/NFD data machine-generated by gennorm2 --csource
34 #define INCLUDED_FROM_NORMALIZER2_CPP
35 #include "norm2_nfc_data.h"
36
37 U_NAMESPACE_BEGIN
38
39 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
40
41 Normalizer2::~Normalizer2() {}
42
43 UBool
44 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
45 return FALSE;
46 }
47
48 UChar32
49 Normalizer2::composePair(UChar32, UChar32) const {
50 return U_SENTINEL;
51 }
52
53 uint8_t
54 Normalizer2::getCombiningClass(UChar32 /*c*/) const {
55 return 0;
56 }
57
58 // Normalizer2 implementation for the old UNORM_NONE.
59 class NoopNormalizer2 : public Normalizer2 {
60 virtual ~NoopNormalizer2();
61
62 virtual UnicodeString &
63 normalize(const UnicodeString &src,
64 UnicodeString &dest,
65 UErrorCode &errorCode) const {
66 if(U_SUCCESS(errorCode)) {
67 if(&dest!=&src) {
68 dest=src;
69 } else {
70 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
71 }
72 }
73 return dest;
74 }
75 virtual UnicodeString &
76 normalizeSecondAndAppend(UnicodeString &first,
77 const UnicodeString &second,
78 UErrorCode &errorCode) const {
79 if(U_SUCCESS(errorCode)) {
80 if(&first!=&second) {
81 first.append(second);
82 } else {
83 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
84 }
85 }
86 return first;
87 }
88 virtual UnicodeString &
89 append(UnicodeString &first,
90 const UnicodeString &second,
91 UErrorCode &errorCode) const {
92 if(U_SUCCESS(errorCode)) {
93 if(&first!=&second) {
94 first.append(second);
95 } else {
96 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
97 }
98 }
99 return first;
100 }
101 virtual UBool
102 getDecomposition(UChar32, UnicodeString &) const {
103 return FALSE;
104 }
105 // No need to override the default getRawDecomposition().
106 virtual UBool
107 isNormalized(const UnicodeString &, UErrorCode &) const {
108 return TRUE;
109 }
110 virtual UNormalizationCheckResult
111 quickCheck(const UnicodeString &, UErrorCode &) const {
112 return UNORM_YES;
113 }
114 virtual int32_t
115 spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const {
116 return s.length();
117 }
118 virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; }
119 virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; }
120 virtual UBool isInert(UChar32) const { return TRUE; }
121 };
122
123 NoopNormalizer2::~NoopNormalizer2() {}
124
125 Normalizer2WithImpl::~Normalizer2WithImpl() {}
126
127 DecomposeNormalizer2::~DecomposeNormalizer2() {}
128
129 ComposeNormalizer2::~ComposeNormalizer2() {}
130
131 FCDNormalizer2::~FCDNormalizer2() {}
132
133 // instance cache ---------------------------------------------------------- ***
134
135 Norm2AllModes::~Norm2AllModes() {
136 delete impl;
137 }
138
139 Norm2AllModes *
140 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
141 if(U_FAILURE(errorCode)) {
142 delete impl;
143 return NULL;
144 }
145 Norm2AllModes *allModes=new Norm2AllModes(impl);
146 if(allModes==NULL) {
147 errorCode=U_MEMORY_ALLOCATION_ERROR;
148 delete impl;
149 return NULL;
150 }
151 return allModes;
152 }
153
154 Norm2AllModes *
155 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
156 if(U_FAILURE(errorCode)) {
157 return NULL;
158 }
159 Normalizer2Impl *impl=new Normalizer2Impl;
160 if(impl==NULL) {
161 errorCode=U_MEMORY_ALLOCATION_ERROR;
162 return NULL;
163 }
164 impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
165 norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
166 return createInstance(impl, errorCode);
167 }
168
169 U_CDECL_BEGIN
170 static UBool U_CALLCONV uprv_normalizer2_cleanup();
171 U_CDECL_END
172
173 static Norm2AllModes *nfcSingleton;
174 static Normalizer2 *noopSingleton;
175
176 static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
177 static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
178
179 // UInitOnce singleton initialization functions
180 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
181 nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
182 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
183 }
184
185 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
186 if(U_FAILURE(errorCode)) {
187 return;
188 }
189 noopSingleton=new NoopNormalizer2;
190 if(noopSingleton==NULL) {
191 errorCode=U_MEMORY_ALLOCATION_ERROR;
192 return;
193 }
194 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
195 }
196
197 U_CDECL_BEGIN
198
199 static UBool U_CALLCONV uprv_normalizer2_cleanup() {
200 delete nfcSingleton;
201 nfcSingleton = NULL;
202 delete noopSingleton;
203 noopSingleton = NULL;
204 nfcInitOnce.reset();
205 noopInitOnce.reset();
206 return TRUE;
207 }
208
209 U_CDECL_END
210
211 const Norm2AllModes *
212 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
213 if(U_FAILURE(errorCode)) { return NULL; }
214 umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
215 return nfcSingleton;
216 }
217
218 const Normalizer2 *
219 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
220 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
221 return allModes!=NULL ? &allModes->comp : NULL;
222 }
223
224 const Normalizer2 *
225 Normalizer2::getNFDInstance(UErrorCode &errorCode) {
226 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
227 return allModes!=NULL ? &allModes->decomp : NULL;
228 }
229
230 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
231 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
232 return allModes!=NULL ? &allModes->fcd : NULL;
233 }
234
235 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
236 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
237 return allModes!=NULL ? &allModes->fcc : NULL;
238 }
239
240 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
241 if(U_FAILURE(errorCode)) { return NULL; }
242 umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
243 return noopSingleton;
244 }
245
246 const Normalizer2Impl *
247 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
248 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
249 return allModes!=NULL ? allModes->impl : NULL;
250 }
251
252 const Normalizer2Impl *
253 Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
254 return &((Normalizer2WithImpl *)norm2)->impl;
255 }
256
257 U_NAMESPACE_END
258
259 // C API ------------------------------------------------------------------- ***
260
261 U_NAMESPACE_USE
262
263 U_CAPI const UNormalizer2 * U_EXPORT2
264 unorm2_getNFCInstance(UErrorCode *pErrorCode) {
265 return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
266 }
267
268 U_CAPI const UNormalizer2 * U_EXPORT2
269 unorm2_getNFDInstance(UErrorCode *pErrorCode) {
270 return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
271 }
272
273 U_CAPI void U_EXPORT2
274 unorm2_close(UNormalizer2 *norm2) {
275 delete (Normalizer2 *)norm2;
276 }
277
278 U_CAPI int32_t U_EXPORT2
279 unorm2_normalize(const UNormalizer2 *norm2,
280 const UChar *src, int32_t length,
281 UChar *dest, int32_t capacity,
282 UErrorCode *pErrorCode) {
283 if(U_FAILURE(*pErrorCode)) {
284 return 0;
285 }
286 if( (src==NULL ? length!=0 : length<-1) ||
287 (dest==NULL ? capacity!=0 : capacity<0) ||
288 (src==dest && src!=NULL)
289 ) {
290 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
291 return 0;
292 }
293 UnicodeString destString(dest, 0, capacity);
294 // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
295 if(length!=0) {
296 const Normalizer2 *n2=(const Normalizer2 *)norm2;
297 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
298 if(n2wi!=NULL) {
299 // Avoid duplicate argument checking and support NUL-terminated src.
300 ReorderingBuffer buffer(n2wi->impl, destString);
301 if(buffer.init(length, *pErrorCode)) {
302 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
303 }
304 } else {
305 UnicodeString srcString(length<0, src, length);
306 n2->normalize(srcString, destString, *pErrorCode);
307 }
308 }
309 return destString.extract(dest, capacity, *pErrorCode);
310 }
311
312 static int32_t
313 normalizeSecondAndAppend(const UNormalizer2 *norm2,
314 UChar *first, int32_t firstLength, int32_t firstCapacity,
315 const UChar *second, int32_t secondLength,
316 UBool doNormalize,
317 UErrorCode *pErrorCode) {
318 if(U_FAILURE(*pErrorCode)) {
319 return 0;
320 }
321 if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
322 (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
323 (firstCapacity<0 || firstLength<-1)) ||
324 (first==second && first!=NULL)
325 ) {
326 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
327 return 0;
328 }
329 UnicodeString firstString(first, firstLength, firstCapacity);
330 firstLength=firstString.length(); // In case it was -1.
331 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
332 if(secondLength!=0) {
333 const Normalizer2 *n2=(const Normalizer2 *)norm2;
334 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
335 if(n2wi!=NULL) {
336 // Avoid duplicate argument checking and support NUL-terminated src.
337 UnicodeString safeMiddle;
338 {
339 ReorderingBuffer buffer(n2wi->impl, firstString);
340 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
341 n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
342 doNormalize, safeMiddle, buffer, *pErrorCode);
343 }
344 } // The ReorderingBuffer destructor finalizes firstString.
345 if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
346 // Restore the modified suffix of the first string.
347 // This does not restore first[] array contents between firstLength and firstCapacity.
348 // (That might be uninitialized memory, as far as we know.)
349 if(first!=NULL) { /* don't dereference NULL */
350 safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
351 if(firstLength<firstCapacity) {
352 first[firstLength]=0; // NUL-terminate in case it was originally.
353 }
354 }
355 }
356 } else {
357 UnicodeString secondString(secondLength<0, second, secondLength);
358 if(doNormalize) {
359 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
360 } else {
361 n2->append(firstString, secondString, *pErrorCode);
362 }
363 }
364 }
365 return firstString.extract(first, firstCapacity, *pErrorCode);
366 }
367
368 U_CAPI int32_t U_EXPORT2
369 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
370 UChar *first, int32_t firstLength, int32_t firstCapacity,
371 const UChar *second, int32_t secondLength,
372 UErrorCode *pErrorCode) {
373 return normalizeSecondAndAppend(norm2,
374 first, firstLength, firstCapacity,
375 second, secondLength,
376 TRUE, pErrorCode);
377 }
378
379 U_CAPI int32_t U_EXPORT2
380 unorm2_append(const UNormalizer2 *norm2,
381 UChar *first, int32_t firstLength, int32_t firstCapacity,
382 const UChar *second, int32_t secondLength,
383 UErrorCode *pErrorCode) {
384 return normalizeSecondAndAppend(norm2,
385 first, firstLength, firstCapacity,
386 second, secondLength,
387 FALSE, pErrorCode);
388 }
389
390 U_CAPI int32_t U_EXPORT2
391 unorm2_getDecomposition(const UNormalizer2 *norm2,
392 UChar32 c, UChar *decomposition, int32_t capacity,
393 UErrorCode *pErrorCode) {
394 if(U_FAILURE(*pErrorCode)) {
395 return 0;
396 }
397 if(decomposition==NULL ? capacity!=0 : capacity<0) {
398 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
399 return 0;
400 }
401 UnicodeString destString(decomposition, 0, capacity);
402 if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
403 return destString.extract(decomposition, capacity, *pErrorCode);
404 } else {
405 return -1;
406 }
407 }
408
409 U_CAPI int32_t U_EXPORT2
410 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
411 UChar32 c, UChar *decomposition, int32_t capacity,
412 UErrorCode *pErrorCode) {
413 if(U_FAILURE(*pErrorCode)) {
414 return 0;
415 }
416 if(decomposition==NULL ? capacity!=0 : capacity<0) {
417 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
418 return 0;
419 }
420 UnicodeString destString(decomposition, 0, capacity);
421 if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
422 return destString.extract(decomposition, capacity, *pErrorCode);
423 } else {
424 return -1;
425 }
426 }
427
428 U_CAPI UChar32 U_EXPORT2
429 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
430 return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
431 }
432
433 U_CAPI uint8_t U_EXPORT2
434 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
435 return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
436 }
437
438 U_CAPI UBool U_EXPORT2
439 unorm2_isNormalized(const UNormalizer2 *norm2,
440 const UChar *s, int32_t length,
441 UErrorCode *pErrorCode) {
442 if(U_FAILURE(*pErrorCode)) {
443 return 0;
444 }
445 if((s==NULL && length!=0) || length<-1) {
446 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
447 return 0;
448 }
449 UnicodeString sString(length<0, s, length);
450 return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
451 }
452
453 U_CAPI UNormalizationCheckResult U_EXPORT2
454 unorm2_quickCheck(const UNormalizer2 *norm2,
455 const UChar *s, int32_t length,
456 UErrorCode *pErrorCode) {
457 if(U_FAILURE(*pErrorCode)) {
458 return UNORM_NO;
459 }
460 if((s==NULL && length!=0) || length<-1) {
461 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
462 return UNORM_NO;
463 }
464 UnicodeString sString(length<0, s, length);
465 return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
466 }
467
468 U_CAPI int32_t U_EXPORT2
469 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
470 const UChar *s, int32_t length,
471 UErrorCode *pErrorCode) {
472 if(U_FAILURE(*pErrorCode)) {
473 return 0;
474 }
475 if((s==NULL && length!=0) || length<-1) {
476 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
477 return 0;
478 }
479 UnicodeString sString(length<0, s, length);
480 return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
481 }
482
483 U_CAPI UBool U_EXPORT2
484 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
485 return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
486 }
487
488 U_CAPI UBool U_EXPORT2
489 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
490 return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
491 }
492
493 U_CAPI UBool U_EXPORT2
494 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
495 return ((const Normalizer2 *)norm2)->isInert(c);
496 }
497
498 // Some properties APIs ---------------------------------------------------- ***
499
500 U_CAPI uint8_t U_EXPORT2
501 u_getCombiningClass(UChar32 c) {
502 UErrorCode errorCode=U_ZERO_ERROR;
503 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
504 if(U_SUCCESS(errorCode)) {
505 return nfd->getCombiningClass(c);
506 } else {
507 return 0;
508 }
509 }
510
511 U_CFUNC uint16_t
512 unorm_getFCD16(UChar32 c) {
513 UErrorCode errorCode=U_ZERO_ERROR;
514 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
515 if(U_SUCCESS(errorCode)) {
516 return impl->getFCD16(c);
517 } else {
518 return 0;
519 }
520 }
521
522 #endif // !UCONFIG_NO_NORMALIZATION