2 * Copyright (c) 2008 Apple Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
23 /* CFUnicodeDecomposition.c
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Aki Inoue
29 #include <CoreFoundation/CFBase.h>
30 #include <CoreFoundation/CFCharacterSet.h>
31 #include <CoreFoundation/CFUniChar.h>
32 #include <CoreFoundation/CFUnicodeDecomposition.h>
33 #include "CFInternal.h"
34 #include "CFUniCharPriv.h"
36 // Canonical Decomposition
/*
 * File-scope state for canonical decomposition. Everything here starts out
 * empty and is filled in by __CFUniCharLoadDecompositionTable().
 */
// Main canonical mapping table; searched through __CFUniCharGetMappedValue().
37 static UTF32Char
*__CFUniCharDecompositionTable
= NULL
;
// Read from the mapping data as a byte size, then divided down to an entry count by the loader.
38 static uint32_t __CFUniCharDecompositionTableLength
= 0;
// Multi-character expansions; the loader points this just past the main table.
39 static UTF32Char
*__CFUniCharMultipleDecompositionTable
= NULL
;
// Plane 0 bitmap for kCFUniCharCanonicalDecomposableCharacterSet.
41 static const uint8_t *__CFUniCharDecomposableBitmapForBMP
= NULL
;
// Plane 0 bitmap for kCFUniCharHFSPlusDecomposableCharacterSet.
42 static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP
= NULL
;
// Taken by __CFUniCharLoadDecompositionTable() while it fills in the variables of this group.
44 static CFSpinLock_t __CFUniCharDecompositionTableLock
= CFSpinLockInit
;
// Combining property data, one pointer per plane, indexed by (character >> 16).
46 static const uint8_t **__CFUniCharCombiningPriorityTable
= NULL
;
// Number of valid entries in __CFUniCharCombiningPriorityTable.
47 static uint8_t __CFUniCharCombiningPriorityTableNumPlane
= 0;
/*
 * Fills in the canonical decomposition globals: the mapping tables obtained
 * from CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping), the two
 * plane 0 decomposable bitmaps, and the per-plane combining property table.
 * The work is done while holding __CFUniCharDecompositionTableLock and only
 * when __CFUniCharDecompositionTable is still NULL.
 *
 * NOTE(review): several lines of this function are not visible in this view
 * (the condition guarding the first unlock, the declaration of `idx`, the
 * closing braces) -- confirm against the full file.
 * NOTE(review): __CFUniCharDecompositionTable is assigned before the bitmaps
 * and the priority table, and callers test that pointer without taking the
 * lock; a concurrent caller could presumably observe a non-NULL table while
 * the remaining globals are still unset -- confirm.
 */
49 static void __CFUniCharLoadDecompositionTable(void) {
51 __CFSpinLock(&__CFUniCharDecompositionTableLock
);
// Load only once: a non-NULL table means the globals were already populated.
53 if (NULL
== __CFUniCharDecompositionTable
) {
54 const uint32_t *bytes
= (uint32_t *)CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping
);
// NOTE(review): presumably the early-exit path for missing mapping data; the guarding condition is not visible here.
57 __CFSpinUnlock(&__CFUniCharDecompositionTableLock
);
// The first word of the data is the size, in bytes, of the main table that follows it.
61 __CFUniCharDecompositionTableLength
= *(bytes
++);
62 __CFUniCharDecompositionTable
= (UTF32Char
*)bytes
;
// The multi-character table starts right after the main table (byte offset, hence the intptr_t arithmetic).
63 __CFUniCharMultipleDecompositionTable
= (UTF32Char
*)((intptr_t)bytes
+ __CFUniCharDecompositionTableLength
);
// Convert the byte size into an entry count; each entry is two uint32_t wide.
65 __CFUniCharDecompositionTableLength
/= (sizeof(uint32_t) * 2);
66 __CFUniCharDecomposableBitmapForBMP
= CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet
, 0);
67 __CFUniCharHFSPlusDecomposableBitmapForBMP
= CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet
, 0);
// Cache the combining property data pointer of every available plane.
71 __CFUniCharCombiningPriorityTableNumPlane
= CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty
);
// NOTE(review): the allocation result is written through below without a NULL check.
72 __CFUniCharCombiningPriorityTable
= (const uint8_t **)CFAllocatorAllocate(kCFAllocatorSystemDefault
, sizeof(uint8_t *) * __CFUniCharCombiningPriorityTableNumPlane
, 0);
73 for (idx
= 0;idx
< __CFUniCharCombiningPriorityTableNumPlane
;idx
++) __CFUniCharCombiningPriorityTable
[idx
] = (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty
, idx
);
76 __CFSpinUnlock(&__CFUniCharDecompositionTableLock
);
/*
 * File-scope state for compatibility decomposition, filled in by
 * __CFUniCharLoadCompatibilityDecompositionTable().
 */
// Taken by the loader while it fills in the three variables below.
79 static CFSpinLock_t __CFUniCharCompatibilityDecompositionTableLock
= CFSpinLockInit
;
// Main compatibility mapping table; searched through __CFUniCharGetMappedValue().
80 static UTF32Char
*__CFUniCharCompatibilityDecompositionTable
= NULL
;
// Read from the mapping data as a byte size, then divided down to an entry count by the loader.
81 static uint32_t __CFUniCharCompatibilityDecompositionTableLength
= 0;
// Multi-character expansions; the loader points this just past the main table.
82 static UTF32Char
*__CFUniCharCompatibilityMultipleDecompositionTable
= NULL
;
/*
 * Fills in the compatibility decomposition globals from the data returned by
 * CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping). The work is
 * done while holding __CFUniCharCompatibilityDecompositionTableLock and only
 * when __CFUniCharCompatibilityDecompositionTable is still NULL.
 *
 * NOTE(review): some lines of this function are not visible in this view
 * (the condition guarding the first unlock, the closing braces) -- confirm
 * against the full file.
 */
84 static void __CFUniCharLoadCompatibilityDecompositionTable(void) {
86 __CFSpinLock(&__CFUniCharCompatibilityDecompositionTableLock
);
// Load only once: a non-NULL table means the globals were already populated.
88 if (NULL
== __CFUniCharCompatibilityDecompositionTable
) {
89 const uint32_t *bytes
= (uint32_t *)CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping
);
// NOTE(review): presumably the early-exit path for missing mapping data; the guarding condition is not visible here.
92 __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock
);
// The first word of the data is the size, in bytes, of the main table that follows it.
96 __CFUniCharCompatibilityDecompositionTableLength
= *(bytes
++);
97 __CFUniCharCompatibilityDecompositionTable
= (UTF32Char
*)bytes
;
// The multi-character table starts right after the main table (byte offset, hence the intptr_t arithmetic).
98 __CFUniCharCompatibilityMultipleDecompositionTable
= (UTF32Char
*)((intptr_t)bytes
+ __CFUniCharCompatibilityDecompositionTableLength
);
// Convert the byte size into an entry count; each entry is two uint32_t wide.
100 __CFUniCharCompatibilityDecompositionTableLength
/= (sizeof(uint32_t) * 2);
103 __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock
);
/*
 * Tests `character` against a decomposable-character bitmap.
 * Characters below 0x10000 use one of the cached plane 0 bitmaps: the HFS+
 * one when `isHFSPlus` is true, the canonical one otherwise. Characters at or
 * above 0x10000 always use the canonical bitmap of their plane, fetched with
 * CFUniCharGetBitmapPtrForPlane(); `isHFSPlus` has no effect for them.
 * The cached bitmaps are set by __CFUniCharLoadDecompositionTable().
 */
106 CF_INLINE
bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character
, bool isHFSPlus
) {
107 return CFUniCharIsMemberOfBitmap(character
, (character
< 0x10000 ? (isHFSPlus
? __CFUniCharHFSPlusDecomposableBitmapForBMP
: __CFUniCharDecomposableBitmapForBMP
) : CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet
, ((character
>> 16) & 0xFF))));
/*
 * Returns the combining property of `character` by calling
 * CFUniCharGetCombiningPropertyForCharacter() with the cached property data of
 * the character's plane (character >> 16). NULL is passed instead when the
 * plane index is not below __CFUniCharCombiningPriorityTableNumPlane.
 */
110 CF_INLINE
uint8_t __CFUniCharGetCombiningPropertyForCharacter(UTF32Char character
) { return CFUniCharGetCombiningPropertyForCharacter(character
, (((character
) >> 16) < __CFUniCharCombiningPriorityTableNumPlane
? __CFUniCharCombiningPriorityTable
[(character
) >> 16] : NULL
)); }
/*
 * Returns true when the combining property of `character` is non-zero,
 * false when it is 0.
 */
112 CF_INLINE
bool __CFUniCharIsNonBaseCharacter(UTF32Char character
) { return ((0 == __CFUniCharGetCombiningPropertyForCharacter(character
)) ? false : true); } // for normalization purposes, a non-base character is one with a non-zero combining class
/*
 * Entry type of the decomposition mapping tables; __CFUniCharGetMappedValue()
 * reads its `_key` and `_value` members.
 * NOTE(review): the beginning of this declaration is not visible in this view.
 */
117 } __CFUniCharDecomposeMappings
;
/*
 * Looks up `character` in `theTable`, an array of `numElem` entries, and
 * returns the `_value` of the entry whose `_key` equals `character`.
 *
 * The visible code first compares `character` with the first and last keys,
 * then repeatedly halves the [p, q] range around `divider`, so it relies on
 * the entries being ordered by ascending `_key`.
 *
 * NOTE(review): the result for an out-of-range or unmatched character, the
 * initialisation of p/q and the loop header are not visible in this view.
 * NOTE(review): theTable[numElem-1] is read unconditionally, so numElem == 0
 * or a NULL table (mapping data never loaded) would be an invalid access --
 * confirm callers guarantee a loaded, non-empty table.
 */
119 static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings
*theTable
, uint32_t numElem
, UTF32Char character
) {
120 const __CFUniCharDecomposeMappings
*p
, *q
, *divider
;
// Quick rejection: the character lies outside the key range covered by the table.
122 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
)) {
// Probe the middle entry of the remaining range.
128 divider
= p
+ ((q
- p
) >> 1); /* midpoint of [p, q] */
129 if (character
< divider
->_key
) { q
= divider
- 1; }
130 else if (character
> divider
->_key
) { p
= divider
+ 1; }
131 else { return divider
->_value
; }
/*
 * Reorders `length` characters in place according to their combining
 * property. Leading characters whose property is 0 are skipped; if more than
 * one character remains, adjacent characters (ch1, ch2) are examined with
 * their properties (p1, p2) and exchanged.
 *
 * NOTE(review): the loop structure, the declarations of p1/p2 and the
 * condition that guards the exchange are not visible in this view;
 * presumably a pair is swapped when p1 > p2 -- confirm against the full file.
 */
136 static void __CFUniCharPrioritySort(UTF32Char
*characters
, CFIndex length
) {
137 UTF32Char
*end
= characters
+ length
;
// Skip the leading run of characters with combining property 0.
139 while ((characters
< end
) && (0 == __CFUniCharGetCombiningPropertyForCharacter(*characters
))) ++characters
;
// Nothing to reorder unless at least two characters are left.
141 if ((end
- characters
) > 1) {
143 UTF32Char
*ch1
, *ch2
;
// Start a pass at the first remaining pair.
148 ch1
= characters
; ch2
= characters
+ 1;
149 p2
= __CFUniCharGetCombiningPropertyForCharacter(*ch1
);
// Slide the window: the previous right-hand property becomes the left-hand one.
151 p1
= p2
; p2
= __CFUniCharGetCombiningPropertyForCharacter(*ch2
);
// Exchange the adjacent pair.
153 UTF32Char tmp
= *ch1
; *ch1
= *ch2
; *ch2
= tmp
;
/*
 * Writes the canonical decomposition of `character` into `convertedChars`,
 * which has room for `maxBufferLength` characters.
 *
 * The mapped value from the canonical table encodes the number of resulting
 * characters (CFUniCharConvertFlagToCount) and, in its low 24 bits, either
 * the single resulting character or an offset into
 * __CFUniCharMultipleDecompositionTable. When
 * kCFUniCharRecursiveDecompositionFlag is set, the first mapped character is
 * decomposed recursively into the output before the remaining ones are
 * appended.
 *
 * Returns 0 when the output does not fit.
 * NOTE(review): the final return is not visible in this view; presumably it
 * returns usedLength, the number of characters written -- confirm.
 */
162 static CFIndex
__CFUniCharRecursivelyDecomposeCharacter(UTF32Char character
, UTF32Char
*convertedChars
, CFIndex maxBufferLength
) {
163 uint32_t value
= __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings
*)__CFUniCharDecompositionTable
, __CFUniCharDecompositionTableLength
, character
);
164 CFIndex length
= CFUniCharConvertFlagToCount(value
);
165 UTF32Char firstChar
= value
& 0xFFFFFF;
// One result: the low 24 bits are the character itself. Several: they are an offset into the multi-character table.
166 UTF32Char
*mappings
= (length
> 1 ? __CFUniCharMultipleDecompositionTable
+ firstChar
: &firstChar
);
167 CFIndex usedLength
= 0;
169 if (maxBufferLength
< length
) return 0;
171 if (value
& kCFUniCharRecursiveDecompositionFlag
) {
// Expand the first mapped character in place, leaving room for the rest of this mapping.
172 usedLength
= __CFUniCharRecursivelyDecomposeCharacter(*mappings
, convertedChars
, maxBufferLength
- length
);
174 --length
; // the first mapped character was replaced by its own expansion
// Fail if the nested expansion failed or the remaining characters no longer fit.
175 if (!usedLength
|| usedLength
+ length
> maxBufferLength
) return 0;
177 convertedChars
+= usedLength
;
180 usedLength
+= length
;
// Append the remaining mapped characters.
182 while (length
--) *(convertedChars
++) = *(mappings
++);
/*
 * Constants for the arithmetic Hangul decomposition performed in
 * CFUniCharDecomposeCharacter(): the first precomposed syllable (SBASE), the
 * bases added to the computed first/second/third component indexes
 * (LBASE/VBASE/TBASE) and the corresponding counts. NCOUNT is the number of
 * syllables sharing one first component (VCOUNT * TCOUNT).
 */
187 #define HANGUL_SBASE 0xAC00
188 #define HANGUL_LBASE 0x1100
189 #define HANGUL_VBASE 0x1161
190 #define HANGUL_TBASE 0x11A7
191 #define HANGUL_SCOUNT 11172
192 #define HANGUL_LCOUNT 19
193 #define HANGUL_VCOUNT 21
194 #define HANGUL_TCOUNT 28
195 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
/*
 * Decomposes a single character into `convertedChars`, which has room for
 * `maxBufferLength` characters. The canonical tables are loaded on first use.
 *
 * Precomposed Hangul syllables are decomposed arithmetically into two or
 * three components (three when the syllable index is not a multiple of
 * HANGUL_TCOUNT); 0 is returned when they do not fit. Every other character
 * is handed to __CFUniCharRecursivelyDecomposeCharacter().
 *
 * The syllable block is the half-open range
 * [HANGUL_SBASE, HANGUL_SBASE + HANGUL_SCOUNT). The upper bound must be
 * exclusive: HANGUL_SBASE + HANGUL_SCOUNT (U+D7A4) is not a syllable, and
 * treating it as one yields a first-component index of HANGUL_LCOUNT, i.e.
 * a bogus two-character result.
 *
 * NOTE(review): the declaration of `length`, the return of the Hangul branch
 * and the closing braces are not visible in this view.
 */
197 CFIndex
CFUniCharDecomposeCharacter(UTF32Char character
, UTF32Char
*convertedChars
, CFIndex maxBufferLength
) {
198 if (NULL
== __CFUniCharDecompositionTable
) __CFUniCharLoadDecompositionTable();
// Hangul syllable range test; upper bound is exclusive.
199 if (character
>= HANGUL_SBASE
&& character
< (HANGUL_SBASE
+ HANGUL_SCOUNT
)) {
// Work with the zero-based syllable index from here on.
202 character
-= HANGUL_SBASE
;
// A zero remainder means the syllable has no third component.
204 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
206 if (maxBufferLength
< length
) return 0;
208 *(convertedChars
++) = character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
209 *(convertedChars
++) = (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
210 if (length
> 2) *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
// Everything else is table driven.
213 return __CFUniCharRecursivelyDecomposeCharacter(character
, convertedChars
, maxBufferLength
);
/*
 * Flushes one pending segment: sorts `buffer` with __CFUniCharPrioritySort()
 * when it holds more than one character, then hands it to
 * CFUniCharFillDestinationBuffer() together with the destination cursor
 * (`dst`), its capacity (`dstLength`), the running fill count
 * (`filledLength`) and the output format. Returns that call's result.
 */
217 CF_INLINE
bool __CFProcessReorderBuffer(UTF32Char
*buffer
, CFIndex length
, void **dst
, CFIndex dstLength
, CFIndex
*filledLength
, uint32_t dstFormat
) {
218 if (length
> 1) __CFUniCharPrioritySort(buffer
, length
);
219 return CFUniCharFillDestinationBuffer(buffer
, length
, dst
, dstLength
, filledLength
, dstFormat
);
// Size of the on-stack segment buffer, and the increment by which it grows.
222 #define MAX_BUFFER_LENGTH (32)
/*
 * Decomposes the UTF-16 text `src` (`length` units) into `dst`.
 *
 * src            - input UTF-16 units.
 * length         - number of input units.
 * consumedLength - if non-NULL, receives the number of input units consumed.
 * dst            - output buffer, written in `dstFormat`.
 * maxLength      - capacity passed to the output routines.
 * filledLength   - if non-NULL, receives the amount of output produced.
 * needToReorder  - when true, characters with a non-zero combining property
 *                  are collected with their preceding segment and the segment
 *                  is sorted before being written.
 * dstFormat      - one of kCFUniCharUTF8Format / kCFUniCharUTF16Format /
 *                  kCFUniCharUTF32Format.
 * isHFSPlus      - selects the HFS+ decomposable set for BMP characters.
 *
 * Returns true when all input was consumed, false otherwise.
 *
 * Decomposed characters are gathered in `decompBuffer` (initially the stack
 * array `buffer`); `segmentLength` counts the input units belonging to the
 * pending segment, and `length` is only reduced once a segment has been
 * written out successfully.
 *
 * NOTE(review): a number of lines (switch header, several else branches and
 * closing braces) are not visible in this view.
 */
223 bool CFUniCharDecompose(const UTF16Char
*src
, CFIndex length
, CFIndex
*consumedLength
, void *dst
, CFIndex maxLength
, CFIndex
*filledLength
, bool needToReorder
, uint32_t dstFormat
, bool isHFSPlus
) {
224 CFIndex usedLength
= 0;
225 CFIndex originalLength
= length
;
226 UTF32Char buffer
[MAX_BUFFER_LENGTH
];
227 UTF32Char
*decompBuffer
= buffer
;
228 CFIndex decompBufferSize
= MAX_BUFFER_LENGTH
;
229 CFIndex decompBufferLen
= 0;
230 CFIndex segmentLength
= 0;
231 UTF32Char currentChar
;
233 if (NULL
== __CFUniCharDecompositionTable
) __CFUniCharLoadDecompositionTable();
// Loop while there are input units not yet accounted for by the pending segment.
235 while ((length
- segmentLength
) > 0) {
236 currentChar
= *(src
++);
// ASCII fast path.
238 if (currentChar
< 0x80) {
// Write out the pending segment first, and commit the input it covered.
239 if (decompBufferLen
> 0) {
240 if (!__CFProcessReorderBuffer(decompBuffer
, decompBufferLen
, &dst
, maxLength
, &usedLength
, dstFormat
)) break;
241 length
-= segmentLength
;
// Stop when the destination has no room left.
247 if (usedLength
>= maxLength
) break;
// Store the ASCII character directly in the requested format and advance dst by one unit.
249 case kCFUniCharUTF8Format
: *(uint8_t *)dst
= currentChar
; dst
= (uint8_t *)dst
+ sizeof(uint8_t); break;
250 case kCFUniCharUTF16Format
: *(UTF16Char
*)dst
= currentChar
; dst
= (uint8_t *)dst
+ sizeof(UTF16Char
); break;
251 case kCFUniCharUTF32Format
: *(UTF32Char
*)dst
= currentChar
; dst
= (uint8_t *)dst
+ sizeof(UTF32Char
); break;
258 if (CFUniCharIsSurrogateLowCharacter(currentChar
)) { // Stray surrogate
// An unpaired surrogate stops the conversion unless the output is UTF-16.
259 if (dstFormat
!= kCFUniCharUTF16Format
) break;
260 } else if (CFUniCharIsSurrogateHighCharacter(currentChar
)) {
// Combine a high surrogate with the following low surrogate when one is available.
261 if (((length
- segmentLength
) > 1) && CFUniCharIsSurrogateLowCharacter(*src
)) {
262 currentChar
= CFUniCharGetLongCharacterForSurrogatePair(currentChar
, *(src
++));
264 if (dstFormat
!= kCFUniCharUTF16Format
) break;
// Combining character: it joins the pending segment so the segment can be sorted as a whole.
268 if (needToReorder
&& __CFUniCharIsNonBaseCharacter(currentChar
)) {
// Grow the segment buffer by MAX_BUFFER_LENGTH when it is about to fill up.
269 if ((decompBufferLen
+ 1) >= decompBufferSize
) {
270 UTF32Char
*newBuffer
;
272 decompBufferSize
+= MAX_BUFFER_LENGTH
;
// NOTE(review): the allocation result is passed to memmove without a NULL check.
273 newBuffer
= (UTF32Char
*)CFAllocatorAllocate(kCFAllocatorSystemDefault
, sizeof(UTF32Char
) * decompBufferSize
, 0);
// Copy the old contents (old capacity = new capacity - MAX_BUFFER_LENGTH).
274 memmove(newBuffer
, decompBuffer
, (decompBufferSize
- MAX_BUFFER_LENGTH
) * sizeof(UTF32Char
));
// Only a previously heap-allocated buffer is released; the initial one lives on the stack.
275 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(kCFAllocatorSystemDefault
, decompBuffer
);
276 decompBuffer
= newBuffer
;
279 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar
, isHFSPlus
)) { // Vietnamese accent, etc.
// NOTE(review): CFUniCharDecomposeCharacter returns 0 when the remaining room is too small, in which case nothing is appended here -- confirm that cannot happen.
280 decompBufferLen
+= CFUniCharDecomposeCharacter(currentChar
, decompBuffer
+ decompBufferLen
, decompBufferSize
- decompBufferLen
);
282 decompBuffer
[decompBufferLen
++] = currentChar
;
// Any other character starts a new segment: write out the pending one and commit its input.
285 if (decompBufferLen
> 0) {
286 if (!__CFProcessReorderBuffer(decompBuffer
, decompBufferLen
, &dst
, maxLength
, &usedLength
, dstFormat
)) break;
287 length
-= segmentLength
;
// Restart the segment buffer with this character (decomposed if applicable).
291 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar
, isHFSPlus
)) {
292 decompBufferLen
= CFUniCharDecomposeCharacter(currentChar
, decompBuffer
, MAX_BUFFER_LENGTH
);
295 *decompBuffer
= currentChar
;
// Without reordering, or with a single character, the result can be written out immediately.
298 if (!needToReorder
|| (decompBufferLen
== 1)) {
299 if (!CFUniCharFillDestinationBuffer(decompBuffer
, decompBufferLen
, &dst
, maxLength
, &usedLength
, dstFormat
)) break;
// A character above 0xFFFF came from a surrogate pair, i.e. two input units.
300 length
-= ((currentChar
> 0xFFFF) ? 2 : 1);
// Account for the input units added to the pending segment.
306 segmentLength
+= ((currentChar
> 0xFFFF) ? 2 : 1);
// Write out whatever is still pending; its input only counts as consumed if that succeeds.
310 if ((decompBufferLen
> 0) && __CFProcessReorderBuffer(decompBuffer
, decompBufferLen
, &dst
, maxLength
, &usedLength
, dstFormat
)) length
-= segmentLength
;
312 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(kCFAllocatorSystemDefault
, decompBuffer
);
314 if (consumedLength
) *consumedLength
= originalLength
- length
;
315 if (filledLength
) *filledLength
= usedLength
;
// Success means no input is left over.
317 return ((length
> 0) ? false : true);
// Size of the scratch buffer used by CFUniCharCompatibilityDecompose() for one character's expansion.
320 #define MAX_COMP_DECOMP_LEN (32)
/*
 * Writes the compatibility decomposition of `character` into
 * `convertedChars`.
 *
 * The mapped value from the compatibility table encodes the number of
 * resulting characters (CFUniCharConvertFlagToCount) and, in its low 24 bits,
 * either the single resulting character or an offset into
 * __CFUniCharCompatibilityMultipleDecompositionTable. Each resulting
 * character is then expanded further: canonically when it is in the canonical
 * decomposable set, by recursion when it is in
 * kCFUniCharCompatibilityDecomposableCharacterSet, and copied unchanged
 * otherwise. `usedLength` tracks the total number of characters produced.
 *
 * NOTE(review): the function takes no capacity for `convertedChars`; the
 * visible caller passes a MAX_COMP_DECOMP_LEN-sized array -- confirm the
 * expansion can never exceed that.
 * NOTE(review): the final return is not visible in this view; presumably it
 * returns usedLength.
 */
322 static CFIndex
__CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character
, UTF32Char
*convertedChars
) {
323 uint32_t value
= __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings
*)__CFUniCharCompatibilityDecompositionTable
, __CFUniCharCompatibilityDecompositionTableLength
, character
);
324 CFIndex length
= CFUniCharConvertFlagToCount(value
);
325 UTF32Char firstChar
= value
& 0xFFFFFF;
// One result: the low 24 bits are the character itself. Several: they are an offset into the multi-character table.
326 const UTF32Char
*mappings
= (length
> 1 ? __CFUniCharCompatibilityMultipleDecompositionTable
+ firstChar
: &firstChar
);
327 CFIndex usedLength
= length
;
328 UTF32Char currentChar
;
329 CFIndex currentLength
;
// Expand each mapped character in turn.
331 while (length
-- > 0) {
332 currentChar
= *(mappings
++);
// Canonically decomposable: expand through the canonical tables.
333 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar
, false)) {
334 currentLength
= __CFUniCharRecursivelyDecomposeCharacter(currentChar
, convertedChars
, MAX_COMP_DECOMP_LEN
- length
);
335 convertedChars
+= currentLength
;
// The mapped character was already counted once in usedLength.
336 usedLength
+= (currentLength
- 1);
// Compatibility decomposable: recurse.
337 } else if (CFUniCharIsMemberOf(currentChar
, kCFUniCharCompatibilityDecomposableCharacterSet
)) {
338 currentLength
= __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar
, convertedChars
);
339 convertedChars
+= currentLength
;
340 usedLength
+= (currentLength
- 1);
// Not decomposable: copy as is.
342 *(convertedChars
++) = currentChar
;
/*
 * Moves the `length` characters starting at `convertedChars` up by `delta`
 * positions. The copy runs from the last character down to the first, so a
 * source and destination that overlap with delta > 0 are handled correctly.
 *
 * NOTE(review): the declaration of `dstP` is not visible in this view.
 */
349 CF_INLINE
void __CFUniCharMoveBufferFromEnd1(UTF32Char
*convertedChars
, CFIndex length
, CFIndex delta
) {
350 const UTF32Char
*limit
= convertedChars
;
// Position both cursors one past the end of their ranges.
353 convertedChars
+= length
;
354 dstP
= convertedChars
+ delta
;
// Copy backwards until the start of the source range is reached.
356 while (convertedChars
> limit
) *(--dstP
) = *(--convertedChars
);
/*
 * Applies compatibility decomposition in place to the `length` characters in
 * `convertedChars`, a buffer with room for `maxBufferLength` characters. The
 * compatibility tables are loaded on first use.
 *
 * Each character in kCFUniCharCompatibilityDecomposableCharacterSet is
 * expanded into the local scratch `buffer`; the rest of the text is shifted
 * up to make room and the expansion is copied in its place. Returns 0 as
 * soon as an expansion would make the text longer than `maxBufferLength`.
 *
 * NOTE(review): the assignment of `bufferP`, the update of `limit` after the
 * text grows, the handling of non-decomposable characters and the final
 * return are not visible in this view; presumably the new length is returned.
 */
359 __private_extern__ CFIndex
CFUniCharCompatibilityDecompose(UTF32Char
*convertedChars
, CFIndex length
, CFIndex maxBufferLength
) {
360 UTF32Char currentChar
;
361 UTF32Char buffer
[MAX_COMP_DECOMP_LEN
];
362 const UTF32Char
*bufferP
;
363 const UTF32Char
*limit
= convertedChars
+ length
;
364 CFIndex filledLength
;
366 if (NULL
== __CFUniCharCompatibilityDecompositionTable
) __CFUniCharLoadCompatibilityDecompositionTable();
368 while (convertedChars
< limit
) {
369 currentChar
= *convertedChars
;
371 if (CFUniCharIsMemberOf(currentChar
, kCFUniCharCompatibilityDecomposableCharacterSet
)) {
// Expand the character into the scratch buffer first.
372 filledLength
= __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar
, buffer
);
// The expansion replaces one character, so the text grows by filledLength - 1.
374 if (filledLength
+ length
- 1 > maxBufferLength
) return 0;
// Shift the text that follows this character up to open a gap for the expansion.
376 if (filledLength
> 1) __CFUniCharMoveBufferFromEnd1(convertedChars
+ 1, limit
- convertedChars
- 1, filledLength
- 1);
379 length
+= (filledLength
- 1);
// Copy the expansion over the original character and into the gap.
380 while (filledLength
-- > 0) *(convertedChars
++) = *(bufferP
++);
/*
 * Exported entry point that forwards to the file-local
 * __CFUniCharPrioritySort() to reorder `length` characters in place by
 * combining property.
 */
389 CF_EXPORT
void CFUniCharPrioritySort(UTF32Char
*characters
, CFIndex length
) {
390 __CFUniCharPrioritySort(characters
, length
);
393 #undef MAX_BUFFER_LENGTH
394 #undef MAX_COMP_DECOMP_LEN