2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
23 /* CFUnicodeDecomposition.c
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Aki Inoue
29 #include <CoreFoundation/CFBase.h>
30 #include <CoreFoundation/CFCharacterSet.h>
31 #include <CoreFoundation/CFUniChar.h>
32 #include <CoreFoundation/CFUnicodeDecomposition.h>
33 #include "CFInternal.h"
34 #include "CFUniCharPriv.h"
36 // Canonical Decomposition
37 static UTF32Char
*__CFUniCharDecompositionTable
= NULL
;
38 static uint32_t __CFUniCharDecompositionTableLength
= 0;
39 static UTF32Char
*__CFUniCharMultipleDecompositionTable
= NULL
;
41 static const uint8_t *__CFUniCharDecomposableBitmapForBMP
= NULL
;
42 static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP
= NULL
;
43 static const uint8_t *__CFUniCharNonBaseBitmapForBMP
= NULL
;
45 static CFSpinLock_t __CFUniCharDecompositionTableLock
= 0;
47 static const uint8_t **__CFUniCharCombiningPriorityTable
= NULL
;
48 static uint8_t __CFUniCharCombiningPriorityTableNumPlane
= 0;
50 static void __CFUniCharLoadDecompositionTable(void) {
52 __CFSpinLock(&__CFUniCharDecompositionTableLock
);
54 if (NULL
== __CFUniCharDecompositionTable
) {
55 const void *bytes
= CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping
);
58 __CFSpinUnlock(&__CFUniCharDecompositionTableLock
);
62 __CFUniCharDecompositionTableLength
= *(((uint32_t *)bytes
)++);
63 __CFUniCharDecompositionTable
= (UTF32Char
*)bytes
;
64 __CFUniCharMultipleDecompositionTable
= (UTF32Char
*)((intptr_t)bytes
+ __CFUniCharDecompositionTableLength
);
66 __CFUniCharDecompositionTableLength
/= (sizeof(uint32_t) * 2);
67 __CFUniCharDecomposableBitmapForBMP
= CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet
, 0);
68 __CFUniCharHFSPlusDecomposableBitmapForBMP
= CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet
, 0);
69 __CFUniCharNonBaseBitmapForBMP
= CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet
, 0);
72 __CFSpinUnlock(&__CFUniCharDecompositionTableLock
);
75 static CFSpinLock_t __CFUniCharCompatibilityDecompositionTableLock
= 0;
76 static UTF32Char
*__CFUniCharCompatibilityDecompositionTable
= NULL
;
77 static uint32_t __CFUniCharCompatibilityDecompositionTableLength
= 0;
78 static UTF32Char
*__CFUniCharCompatibilityMultipleDecompositionTable
= NULL
;
80 static void __CFUniCharLoadCompatibilityDecompositionTable(void) {
82 __CFSpinLock(&__CFUniCharCompatibilityDecompositionTableLock
);
84 if (NULL
== __CFUniCharCompatibilityDecompositionTable
) {
85 const void *bytes
= CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping
);
88 __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock
);
92 __CFUniCharCompatibilityDecompositionTableLength
= *(((uint32_t *)bytes
)++);
93 __CFUniCharCompatibilityDecompositionTable
= (UTF32Char
*)bytes
;
94 __CFUniCharCompatibilityMultipleDecompositionTable
= (UTF32Char
*)((intptr_t)bytes
+ __CFUniCharCompatibilityDecompositionTableLength
);
96 __CFUniCharCompatibilityDecompositionTableLength
/= (sizeof(uint32_t) * 2);
99 __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock
);
102 CF_INLINE
bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character
, bool isHFSPlus
) {
103 return CFUniCharIsMemberOfBitmap(character
, (character
< 0x10000 ? (isHFSPlus
? __CFUniCharHFSPlusDecomposableBitmapForBMP
: __CFUniCharDecomposableBitmapForBMP
) : CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet
, ((character
>> 16) & 0xFF))));
106 CF_INLINE
bool __CFUniCharIsNonBaseCharacter(UTF32Char character
) {
107 return CFUniCharIsMemberOfBitmap(character
, (character
< 0x10000 ? __CFUniCharNonBaseBitmapForBMP
: CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet
, ((character
>> 16) & 0xFF))));
113 } __CFUniCharDecomposeMappings
;
115 static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings
*theTable
, uint32_t numElem
, UTF32Char character
) {
116 const __CFUniCharDecomposeMappings
*p
, *q
, *divider
;
118 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
)) {
124 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
125 if (character
< divider
->_key
) { q
= divider
- 1; }
126 else if (character
> divider
->_key
) { p
= divider
+ 1; }
127 else { return divider
->_value
; }
/*
 * Looks up the combining property of `character` using the per-plane data
 * cached in __CFUniCharCombiningPriorityTable. NULL is passed as the plane
 * data when the character's plane index is not below
 * __CFUniCharCombiningPriorityTableNumPlane.
 * Note: `character` is expanded more than once, so pass an expression
 * without side effects.
 */
#define __CFUniCharGetCombiningPropertyForCharacter(character) \
    CFUniCharGetCombiningPropertyForCharacter(character, \
        (((character) >> 16) < __CFUniCharCombiningPriorityTableNumPlane \
            ? __CFUniCharCombiningPriorityTable[(character) >> 16] \
            : NULL))
134 static void __CFUniCharPrioritySort(UTF32Char
*characters
, uint32_t length
) {
136 UTF32Char
*ch1
, *ch2
;
138 UTF32Char
*end
= characters
+ length
;
140 if (NULL
== __CFUniCharCombiningPriorityTable
) {
141 __CFSpinLock(&__CFUniCharDecompositionTableLock
);
142 if (NULL
== __CFUniCharCombiningPriorityTable
) {
143 uint32_t numPlanes
= CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty
);
146 __CFUniCharCombiningPriorityTable
= (const uint8_t **)CFAllocatorAllocate(NULL
, sizeof(uint8_t *) * numPlanes
, 0);
147 for (idx
= 0;idx
< numPlanes
;idx
++) __CFUniCharCombiningPriorityTable
[idx
] = CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty
, idx
);
148 __CFUniCharCombiningPriorityTableNumPlane
= numPlanes
;
150 __CFSpinUnlock(&__CFUniCharDecompositionTableLock
);
153 if (length
< 2) return;
157 ch1
= characters
; ch2
= characters
+ 1;
158 p2
= __CFUniCharGetCombiningPropertyForCharacter(*ch1
);
160 p1
= p2
; p2
= __CFUniCharGetCombiningPropertyForCharacter(*ch2
);
162 UTF32Char tmp
= *ch1
; *ch1
= *ch2
; *ch2
= tmp
;
170 static uint32_t __CFUniCharRecursivelyDecomposeCharacter(UTF32Char character
, UTF32Char
*convertedChars
, uint32_t maxBufferLength
) {
171 uint32_t value
= __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings
*)__CFUniCharDecompositionTable
, __CFUniCharDecompositionTableLength
, character
);
172 uint32_t length
= CFUniCharConvertFlagToCount(value
);
173 UTF32Char firstChar
= value
& 0xFFFFFF;
174 UTF32Char
*mappings
= (length
> 1 ? __CFUniCharMultipleDecompositionTable
+ firstChar
: &firstChar
);
175 uint32_t usedLength
= 0;
177 if (maxBufferLength
< length
) return 0;
179 if (value
& kCFUniCharRecursiveDecompositionFlag
) {
180 usedLength
= __CFUniCharRecursivelyDecomposeCharacter(*mappings
, convertedChars
, maxBufferLength
- length
);
182 --length
; // Decrement for the first char
183 if (!usedLength
|| usedLength
+ length
> maxBufferLength
) return 0;
185 convertedChars
+= usedLength
;
188 usedLength
+= length
;
190 while (length
--) *(convertedChars
++) = *(mappings
++);
/* Constants for arithmetic decomposition of precomposed Hangul syllables. */
#define HANGUL_SBASE 0xAC00 /* first precomposed syllable */
#define HANGUL_LBASE 0x1100 /* first leading consonant */
#define HANGUL_VBASE 0x1161 /* first vowel */
#define HANGUL_TBASE 0x11A7 /* one before the first trailing consonant */
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
#define HANGUL_SCOUNT (HANGUL_LCOUNT * HANGUL_NCOUNT) /* 11172 */
205 uint32_t CFUniCharDecomposeCharacter(UTF32Char character
, UTF32Char
*convertedChars
, uint32_t maxBufferLength
) {
206 if (NULL
== __CFUniCharDecompositionTable
) __CFUniCharLoadDecompositionTable();
207 if (character
>= HANGUL_SBASE
&& character
<= (HANGUL_SBASE
+ HANGUL_SCOUNT
)) {
210 character
-= HANGUL_SBASE
;
212 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
214 if (maxBufferLength
< length
) return 0;
216 *(convertedChars
++) = character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
217 *(convertedChars
++) = (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
218 if (length
> 2) *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
221 return __CFUniCharRecursivelyDecomposeCharacter(character
, convertedChars
, maxBufferLength
);
// Capacity, in UTF32Char entries, of the on-stack scratch buffer used by
// CFUniCharDecompose; also the step by which its heap replacement grows.
225 #define MAX_BUFFER_LENGTH (32)
/*
 * Decomposes the UTF-16 text at `src` into `dst`.
 *
 *   src, length       input text; `length` presumably counts UTF16Char
 *                     units - confirm against callers
 *   consumedLength    optional (NULL-checked); receives
 *                     originalLength - length on exit
 *   dst, maxLength    output buffer and its capacity; several paths test
 *                     `maxLength` for non-zero before writing, so 0
 *                     presumably means "measure only" - TODO confirm
 *   filledLength      optional (NULL-checked); receives usedLength on exit
 *   needToReorder     when true, following non-base characters are
 *                     gathered and passed to __CFUniCharPrioritySort
 *   dstFormat         kCFUniCharUTF8Format, kCFUniCharUTF16Format or
 *                     kCFUniCharUTF32Format
 *   isHFSPlus         forwarded to __CFUniCharIsDecomposableCharacterWithFlag
 *
 * Every exit path frees the scratch buffer if it was moved to the heap
 * (decompBuffer != buffer) and then stores the two optional counters.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably the early exits return false and the final one true - confirm.
 */
226 bool CFUniCharDecompose(const UTF16Char
*src
, uint32_t length
, uint32_t *consumedLength
, void *dst
, uint32_t maxLength
, uint32_t *filledLength
, bool needToReorder
, uint32_t dstFormat
, bool isHFSPlus
) {
227 uint32_t usedLength
= 0;
228 uint32_t originalLength
= length
;
229 UTF32Char buffer
[MAX_BUFFER_LENGTH
];
// decompBuffer starts out as the stack buffer and is replaced by a heap
// block when more room is needed.
230 UTF32Char
*decompBuffer
= buffer
;
231 uint32_t decompBufferLen
= MAX_BUFFER_LENGTH
;
232 UTF32Char currentChar
;
234 bool isDecomp
= false;
235 bool isNonBase
= false;
237 if (NULL
== __CFUniCharDecompositionTable
) __CFUniCharLoadDecompositionTable();
240 currentChar
= *(src
++);
// Code units below 0x80 are stored unchanged in the destination format.
243 if (currentChar
< 0x80) {
245 if (usedLength
< maxLength
) {
247 case kCFUniCharUTF8Format
: *(((uint8_t *)dst
)++) = currentChar
; break;
248 case kCFUniCharUTF16Format
: *(((UTF16Char
*)dst
)++) = currentChar
; break;
249 case kCFUniCharUTF32Format
: *(((UTF32Char
*)dst
)++) = currentChar
; break;
// Exit path: the consumed count excludes the current character (the extra - 1).
252 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
253 if (consumedLength
) *consumedLength
= originalLength
- length
- 1;
254 if (filledLength
) *filledLength
= usedLength
;
// Combine a high/low surrogate pair into one UTF-32 code point.
262 if (CFUniCharIsSurrogateHighCharacter(currentChar
) && (length
> 0) && CFUniCharIsSurrogateLowCharacter(*src
)) {
263 currentChar
= CFUniCharGetLongCharacterForSurrogatePair(currentChar
, *(src
++));
267 isDecomp
= __CFUniCharIsDecomposableCharacterWithFlag(currentChar
, isHFSPlus
);
268 isNonBase
= (needToReorder
&& __CFUniCharIsNonBaseCharacter(currentChar
));
270 if (!isDecomp
|| isNonBase
) {
273 idx
= CFUniCharDecomposeCharacter(currentChar
, decompBuffer
, MAX_BUFFER_LENGTH
);
276 *decompBuffer
= currentChar
;
// NOTE(review): `(length + 1) > 0` is true for every uint32_t value except
// UINT32_MAX, so it does not by itself show that src[1] is inside the
// input. Likely intended `length > 1` - confirm.
280 if (CFUniCharIsSurrogateHighCharacter(*src
) && ((length
+ 1) > 0) && CFUniCharIsSurrogateLowCharacter(*(src
+ 1))) {
281 currentChar
= CFUniCharGetLongCharacterForSurrogatePair(*src
, *(src
+ 1));
285 if (__CFUniCharIsNonBaseCharacter(currentChar
)) {
286 if (currentChar
> 0xFFFF) { // Non-BMP
// Grow the scratch buffer by MAX_BUFFER_LENGTH entries when it is about
// to fill up, copying the old contents across.
// NOTE(review): the CFAllocatorAllocate result is used without a NULL check.
293 if ((idx
+ 1) >= decompBufferLen
) {
294 UTF32Char
*newBuffer
;
296 decompBufferLen
+= MAX_BUFFER_LENGTH
;
297 newBuffer
= (UTF32Char
*)CFAllocatorAllocate(NULL
, sizeof(UTF32Char
) * decompBufferLen
, 0);
298 memmove(newBuffer
, decompBuffer
, (decompBufferLen
- MAX_BUFFER_LENGTH
) * sizeof(UTF32Char
));
299 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
300 decompBuffer
= newBuffer
;
// NOTE(review): the remaining capacity is computed from MAX_BUFFER_LENGTH
// although the buffer may have grown to decompBufferLen; once idx exceeds
// MAX_BUFFER_LENGTH the unsigned subtraction wraps. Presumably should be
// decompBufferLen - idx - confirm.
303 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar
, isHFSPlus
)) { // Vietnamese accent, etc.
304 idx
+= CFUniCharDecomposeCharacter(currentChar
, decompBuffer
+ idx
, MAX_BUFFER_LENGTH
- idx
);
306 decompBuffer
[idx
++] = currentChar
;
313 if (idx
> 1) { // Need to reorder
314 __CFUniCharPrioritySort(decompBuffer
, idx
);
316 if (!CFUniCharFillDestinationBuffer(decompBuffer
, idx
, &dst
, maxLength
, &usedLength
, dstFormat
)) {
317 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
318 if (consumedLength
) *consumedLength
= originalLength
- length
;
319 if (filledLength
) *filledLength
= usedLength
;
// UTF-32 output is written directly; other formats go through
// CFUniCharFillDestinationBuffer.
323 if (dstFormat
== kCFUniCharUTF32Format
) {
326 if (usedLength
> maxLength
) {
327 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
328 if (consumedLength
) *consumedLength
= originalLength
- length
;
329 if (filledLength
) *filledLength
= usedLength
;
332 *(((UTF32Char
*)dst
)++) = currentChar
;
// NOTE(review): "¤tChar" below is a mis-encoded "&currentChar" (the
// "&curren" prefix was turned into the currency sign); restore it before
// compiling.
335 if (!CFUniCharFillDestinationBuffer(¤tChar
, 1, &dst
, maxLength
, &usedLength
, dstFormat
)) {
336 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
337 if (consumedLength
) *consumedLength
= originalLength
- length
;
338 if (filledLength
) *filledLength
= usedLength
;
// With UTF-32 output and a real destination, decompose straight into
// `dst` instead of going through the scratch buffer.
344 if (dstFormat
== kCFUniCharUTF32Format
&& maxLength
) {
345 idx
= CFUniCharDecomposeCharacter(currentChar
, dst
, maxLength
- usedLength
);
348 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
349 if (consumedLength
) *consumedLength
= originalLength
- length
;
350 if (filledLength
) *filledLength
= usedLength
;
352 } else if (needToReorder
&& (idx
> 1)) { // Need to reorder
353 bool moreCombiningMarks
= false;
354 ++((UTF32Char
*)dst
); --idx
; ++usedLength
; // Skip the base
// NOTE(review): same `(length + 1) > 0` guard as above - confirm that
// src[1] is really available.
357 if (CFUniCharIsSurrogateHighCharacter(*src
) && ((length
+ 1) > 0) && CFUniCharIsSurrogateLowCharacter(*(src
+ 1))) {
358 currentChar
= CFUniCharGetLongCharacterForSurrogatePair(*src
, *(src
+ 1));
362 if (__CFUniCharIsNonBaseCharacter(currentChar
)) {
363 if (currentChar
> 0xFFFF) { // Non-BMP
370 if ((idx
+ usedLength
+ 1) >= maxLength
) {
371 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
372 if (consumedLength
) *consumedLength
= originalLength
- length
;
373 if (filledLength
) *filledLength
= usedLength
;
376 ((UTF32Char
*)dst
)[idx
++] = currentChar
;
377 moreCombiningMarks
= true;
// Sort only if further marks were appended after the decomposition.
382 if (moreCombiningMarks
) __CFUniCharPrioritySort(((UTF32Char
*)dst
), idx
);
386 ((UTF32Char
*)dst
) += idx
;
// Otherwise decompose into the scratch buffer first.
388 idx
= CFUniCharDecomposeCharacter(currentChar
, decompBuffer
, decompBufferLen
);
390 if (maxLength
&& idx
+ usedLength
> maxLength
) {
391 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
392 if (consumedLength
) *consumedLength
= originalLength
- length
;
393 if (filledLength
) *filledLength
= usedLength
;
395 } else if (needToReorder
&& (idx
> 1)) { // Need to reorder
396 bool moreCombiningMarks
= false;
// NOTE(review): same `(length + 1) > 0` guard as above - confirm that
// src[1] is really available.
399 if (CFUniCharIsSurrogateHighCharacter(*src
) && ((length
+ 1) > 0) && CFUniCharIsSurrogateLowCharacter(*(src
+ 1))) {
400 currentChar
= CFUniCharGetLongCharacterForSurrogatePair(*src
, *(src
+ 1));
404 if (__CFUniCharIsNonBaseCharacter(currentChar
)) {
405 if (currentChar
> 0xFFFF) { // Non-BMP
// Grow the scratch buffer by MAX_BUFFER_LENGTH entries when it is about
// to fill up.
// NOTE(review): the CFAllocatorAllocate result is used without a NULL check.
412 if ((idx
+ 1) >= decompBufferLen
) {
413 UTF32Char
*newBuffer
;
415 decompBufferLen
+= MAX_BUFFER_LENGTH
;
416 newBuffer
= (UTF32Char
*)CFAllocatorAllocate(NULL
, sizeof(UTF32Char
) * decompBufferLen
, 0);
417 memmove(newBuffer
, decompBuffer
, (decompBufferLen
- MAX_BUFFER_LENGTH
) * sizeof(UTF32Char
));
418 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
419 decompBuffer
= newBuffer
;
421 decompBuffer
[idx
++] = currentChar
;
422 moreCombiningMarks
= true;
// The first entry is left out of the sort.
427 if (moreCombiningMarks
) __CFUniCharPrioritySort(decompBuffer
+ 1, idx
- 1);
429 if (!CFUniCharFillDestinationBuffer(decompBuffer
, idx
, &dst
, maxLength
, &usedLength
, dstFormat
)) {
430 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
431 if (consumedLength
) *consumedLength
= originalLength
- length
;
432 if (filledLength
) *filledLength
= usedLength
;
// Final exit: release the heap scratch buffer, if any, and report the counts.
438 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(NULL
, decompBuffer
);
440 if (consumedLength
) *consumedLength
= originalLength
- length
;
441 if (filledLength
) *filledLength
= usedLength
;
446 #define MAX_COMP_DECOMP_LEN (32)
448 static uint32_t __CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character
, UTF32Char
*convertedChars
) {
449 uint32_t value
= __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings
*)__CFUniCharCompatibilityDecompositionTable
, __CFUniCharCompatibilityDecompositionTableLength
, character
);
450 uint32_t length
= CFUniCharConvertFlagToCount(value
);
451 UTF32Char firstChar
= value
& 0xFFFFFF;
452 const UTF32Char
*mappings
= (length
> 1 ? __CFUniCharCompatibilityMultipleDecompositionTable
+ firstChar
: &firstChar
);
453 uint32_t usedLength
= length
;
454 UTF32Char currentChar
;
455 uint32_t currentLength
;
457 while (length
-- > 0) {
458 currentChar
= *(mappings
++);
459 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar
, false)) {
460 currentLength
= __CFUniCharRecursivelyDecomposeCharacter(currentChar
, convertedChars
, MAX_COMP_DECOMP_LEN
- length
);
461 convertedChars
+= currentLength
;
462 usedLength
+= (currentLength
- 1);
463 } else if (CFUniCharIsMemberOf(currentChar
, kCFUniCharCompatibilityDecomposableCharacterSet
)) {
464 currentLength
= __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar
, convertedChars
);
465 convertedChars
+= currentLength
;
466 usedLength
+= (currentLength
- 1);
468 *(convertedChars
++) = currentChar
;
475 CF_INLINE
void __CFUniCharMoveBufferFromEnd(UTF32Char
*convertedChars
, uint32_t length
, uint32_t delta
) {
476 const UTF32Char
*limit
= convertedChars
;
479 convertedChars
+= length
;
480 dstP
= convertedChars
+ delta
;
482 while (convertedChars
> limit
) *(--dstP
) = *(--convertedChars
);
485 __private_extern__
uint32_t CFUniCharCompatibilityDecompose(UTF32Char
*convertedChars
, uint32_t length
, uint32_t maxBufferLength
) {
486 UTF32Char currentChar
;
487 UTF32Char buffer
[MAX_COMP_DECOMP_LEN
];
488 const UTF32Char
*bufferP
;
489 const UTF32Char
*limit
= convertedChars
+ length
;
490 uint32_t filledLength
;
492 if (NULL
== __CFUniCharCompatibilityDecompositionTable
) __CFUniCharLoadCompatibilityDecompositionTable();
494 while (convertedChars
< limit
) {
495 currentChar
= *convertedChars
;
497 if (CFUniCharIsMemberOf(currentChar
, kCFUniCharCompatibilityDecomposableCharacterSet
)) {
498 filledLength
= __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar
, buffer
);
500 if (filledLength
+ length
- 1 > maxBufferLength
) return 0;
502 if (filledLength
> 1) __CFUniCharMoveBufferFromEnd(convertedChars
+ 1, limit
- convertedChars
- 1, filledLength
- 1);
505 length
+= (filledLength
- 1);
506 while (filledLength
-- > 0) *(convertedChars
++) = *(bufferP
++);
515 CF_EXPORT
void CFUniCharPrioritySort(UTF32Char
*characters
, uint32_t length
) {
516 __CFUniCharPrioritySort(characters
, length
);