2 * Copyright (c) 2015 Apple Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
/*	CFUnicodeDecomposition.c
	Copyright (c) 1999-2014, Apple Inc. All rights reserved.
	Responsibility: Aki Inoue
*/
30 #include <CoreFoundation/CFBase.h>
31 #include <CoreFoundation/CFCharacterSet.h>
32 #include <CoreFoundation/CFUniChar.h>
33 #include <CoreFoundation/CFUnicodeDecomposition.h>
34 #include "CFInternal.h"
35 #include "CFUniCharPriv.h"
37 // Canonical Decomposition
38 static UTF32Char
*__CFUniCharDecompositionTable
= NULL
;
39 static uint32_t __CFUniCharDecompositionTableLength
= 0;
40 static UTF32Char
*__CFUniCharMultipleDecompositionTable
= NULL
;
42 static const uint8_t *__CFUniCharDecomposableBitmapForBMP
= NULL
;
43 static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP
= NULL
;
45 static CFLock_t __CFUniCharDecompositionTableLock
= CFLockInit
;
47 static const uint8_t **__CFUniCharCombiningPriorityTable
= NULL
;
48 static uint8_t __CFUniCharCombiningPriorityTableNumPlane
= 0;
50 static void __CFUniCharLoadDecompositionTable(void) {
52 __CFLock(&__CFUniCharDecompositionTableLock
);
54 if (NULL
== __CFUniCharDecompositionTable
) {
55 const uint32_t *bytes
= (uint32_t *)CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping
);
58 __CFUnlock(&__CFUniCharDecompositionTableLock
);
62 __CFUniCharDecompositionTableLength
= *(bytes
++);
63 __CFUniCharDecompositionTable
= (UTF32Char
*)bytes
;
64 __CFUniCharMultipleDecompositionTable
= (UTF32Char
*)((intptr_t)bytes
+ __CFUniCharDecompositionTableLength
);
66 __CFUniCharDecompositionTableLength
/= (sizeof(uint32_t) * 2);
67 __CFUniCharDecomposableBitmapForBMP
= CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet
, 0);
68 __CFUniCharHFSPlusDecomposableBitmapForBMP
= CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet
, 0);
72 __CFUniCharCombiningPriorityTableNumPlane
= CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty
);
73 __CFUniCharCombiningPriorityTable
= (const uint8_t **)CFAllocatorAllocate(kCFAllocatorSystemDefault
, sizeof(uint8_t *) * __CFUniCharCombiningPriorityTableNumPlane
, 0);
74 for (idx
= 0;idx
< __CFUniCharCombiningPriorityTableNumPlane
;idx
++) __CFUniCharCombiningPriorityTable
[idx
] = (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty
, idx
);
77 __CFUnlock(&__CFUniCharDecompositionTableLock
);
80 static CFLock_t __CFUniCharCompatibilityDecompositionTableLock
= CFLockInit
;
81 static UTF32Char
*__CFUniCharCompatibilityDecompositionTable
= NULL
;
82 static uint32_t __CFUniCharCompatibilityDecompositionTableLength
= 0;
83 static UTF32Char
*__CFUniCharCompatibilityMultipleDecompositionTable
= NULL
;
85 static void __CFUniCharLoadCompatibilityDecompositionTable(void) {
87 __CFLock(&__CFUniCharCompatibilityDecompositionTableLock
);
89 if (NULL
== __CFUniCharCompatibilityDecompositionTable
) {
90 const uint32_t *bytes
= (uint32_t *)CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping
);
93 __CFUnlock(&__CFUniCharCompatibilityDecompositionTableLock
);
97 __CFUniCharCompatibilityDecompositionTableLength
= *(bytes
++);
98 __CFUniCharCompatibilityDecompositionTable
= (UTF32Char
*)bytes
;
99 __CFUniCharCompatibilityMultipleDecompositionTable
= (UTF32Char
*)((intptr_t)bytes
+ __CFUniCharCompatibilityDecompositionTableLength
);
101 __CFUniCharCompatibilityDecompositionTableLength
/= (sizeof(uint32_t) * 2);
104 __CFUnlock(&__CFUniCharCompatibilityDecompositionTableLock
);
107 CF_INLINE
bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character
, bool isHFSPlus
) {
108 return CFUniCharIsMemberOfBitmap(character
, (character
< 0x10000 ? (isHFSPlus
? __CFUniCharHFSPlusDecomposableBitmapForBMP
: __CFUniCharDecomposableBitmapForBMP
) : CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet
, ((character
>> 16) & 0xFF))));
111 CF_INLINE
uint8_t __CFUniCharGetCombiningPropertyForCharacter(UTF32Char character
) { return CFUniCharGetCombiningPropertyForCharacter(character
, (((character
) >> 16) < __CFUniCharCombiningPriorityTableNumPlane
? __CFUniCharCombiningPriorityTable
[(character
) >> 16] : NULL
)); }
113 CF_INLINE
bool __CFUniCharIsNonBaseCharacter(UTF32Char character
) { return ((0 == __CFUniCharGetCombiningPropertyForCharacter(character
)) ? false : true); } // the notion of non-base in normalization is characters with non-0 combining class
118 } __CFUniCharDecomposeMappings
;
120 static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings
*theTable
, uint32_t numElem
, UTF32Char character
) {
121 const __CFUniCharDecomposeMappings
*p
, *q
, *divider
;
123 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
)) {
129 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
130 if (character
< divider
->_key
) { q
= divider
- 1; }
131 else if (character
> divider
->_key
) { p
= divider
+ 1; }
132 else { return divider
->_value
; }
137 static void __CFUniCharPrioritySort(UTF32Char
*characters
, CFIndex length
) {
138 UTF32Char
*end
= characters
+ length
;
140 while ((characters
< end
) && (0 == __CFUniCharGetCombiningPropertyForCharacter(*characters
))) ++characters
;
142 if ((end
- characters
) > 1) {
144 UTF32Char
*ch1
, *ch2
;
149 ch1
= characters
; ch2
= characters
+ 1;
150 p2
= __CFUniCharGetCombiningPropertyForCharacter(*ch1
);
152 p1
= p2
; p2
= __CFUniCharGetCombiningPropertyForCharacter(*ch2
);
154 UTF32Char tmp
= *ch1
; *ch1
= *ch2
; *ch2
= tmp
;
163 static CFIndex
__CFUniCharRecursivelyDecomposeCharacter(UTF32Char character
, UTF32Char
*convertedChars
, CFIndex maxBufferLength
) {
164 uint32_t value
= __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings
*)__CFUniCharDecompositionTable
, __CFUniCharDecompositionTableLength
, character
);
165 CFIndex length
= CFUniCharConvertFlagToCount(value
);
166 UTF32Char firstChar
= value
& 0xFFFFFF;
167 UTF32Char
*mappings
= (length
> 1 ? __CFUniCharMultipleDecompositionTable
+ firstChar
: &firstChar
);
168 CFIndex usedLength
= 0;
170 if (maxBufferLength
< length
) return 0;
172 if (value
& kCFUniCharRecursiveDecompositionFlag
) {
173 usedLength
= __CFUniCharRecursivelyDecomposeCharacter(*mappings
, convertedChars
, maxBufferLength
- length
);
175 --length
; // Decrement for the first char
176 if (!usedLength
|| usedLength
+ length
> maxBufferLength
) return 0;
178 convertedChars
+= usedLength
;
181 usedLength
+= length
;
183 while (length
--) *(convertedChars
++) = *(mappings
++);
188 #define HANGUL_SBASE 0xAC00
189 #define HANGUL_LBASE 0x1100
190 #define HANGUL_VBASE 0x1161
191 #define HANGUL_TBASE 0x11A7
192 #define HANGUL_SCOUNT 11172
193 #define HANGUL_LCOUNT 19
194 #define HANGUL_VCOUNT 21
195 #define HANGUL_TCOUNT 28
196 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
198 CFIndex
CFUniCharDecomposeCharacter(UTF32Char character
, UTF32Char
*convertedChars
, CFIndex maxBufferLength
) {
199 if (NULL
== __CFUniCharDecompositionTable
) __CFUniCharLoadDecompositionTable();
200 if (character
>= HANGUL_SBASE
&& character
<= (HANGUL_SBASE
+ HANGUL_SCOUNT
)) {
203 character
-= HANGUL_SBASE
;
205 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
207 if (maxBufferLength
< length
) return 0;
209 *(convertedChars
++) = character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
210 *(convertedChars
++) = (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
211 if (length
> 2) *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
214 return __CFUniCharRecursivelyDecomposeCharacter(character
, convertedChars
, maxBufferLength
);
218 CF_INLINE
bool __CFProcessReorderBuffer(UTF32Char
*buffer
, CFIndex length
, void **dst
, CFIndex dstLength
, CFIndex
*filledLength
, uint32_t dstFormat
) {
219 if (length
> 1) __CFUniCharPrioritySort(buffer
, length
);
220 return CFUniCharFillDestinationBuffer(buffer
, length
, dst
, dstLength
, filledLength
, dstFormat
);
223 #define MAX_BUFFER_LENGTH (32)
/*
 * Decomposes the UTF-16 text at `src` (`length` code units) and writes the
 * result to `dst` in the encoding selected by `dstFormat`.
 *
 * consumedLength  optional; receives originalLength - length, i.e. the number
 *                 of source units that were accounted as processed
 * maxLength       output capacity; forwarded to the fill helpers
 *                 (presumably counted in units of dstFormat - TODO confirm)
 * filledLength    optional; receives usedLength, the running output count
 * needToReorder   when true, non-base characters are collected in
 *                 decompBuffer and sorted before being written
 * isHFSPlus       forwarded to __CFUniCharIsDecomposableCharacterWithFlag
 *
 * Returns true when the remaining `length` is no longer positive.
 */
224 bool CFUniCharDecompose(const UTF16Char
*src
, CFIndex length
, CFIndex
*consumedLength
, void *dst
, CFIndex maxLength
, CFIndex
*filledLength
, bool needToReorder
, uint32_t dstFormat
, bool isHFSPlus
) {
225 CFIndex usedLength
= 0;
226 CFIndex originalLength
= length
;
/* Pending sequence storage: starts on the stack, may be replaced by a heap block below. */
227 UTF32Char buffer
[MAX_BUFFER_LENGTH
];
228 UTF32Char
*decompBuffer
= buffer
;
229 CFIndex decompBufferSize
= MAX_BUFFER_LENGTH
;
230 CFIndex decompBufferLen
= 0;
/* Source units gathered into decompBuffer that have not yet been subtracted from length. */
231 CFIndex segmentLength
= 0;
232 UTF32Char currentChar
;
/* Load the canonical tables on first use. */
234 if (NULL
== __CFUniCharDecompositionTable
) __CFUniCharLoadDecompositionTable();
236 while ((length
- segmentLength
) > 0) {
237 currentChar
= *(src
++);
/* Values below 0x80: flush whatever is pending, then copy the unit to dst as is. */
239 if (currentChar
< 0x80) {
240 if (decompBufferLen
> 0) {
241 if (!__CFProcessReorderBuffer(decompBuffer
, decompBufferLen
, &dst
, maxLength
, &usedLength
, dstFormat
)) break;
242 length
-= segmentLength
;
/* Output is full: stop converting. */
248 if (usedLength
>= maxLength
) break;
250 case kCFUniCharUTF8Format
: *(uint8_t *)dst
= currentChar
; dst
= (uint8_t *)dst
+ sizeof(uint8_t); break;
251 case kCFUniCharUTF16Format
: *(UTF16Char
*)dst
= currentChar
; dst
= (uint8_t *)dst
+ sizeof(UTF16Char
); break;
252 case kCFUniCharUTF32Format
: *(UTF32Char
*)dst
= currentChar
; dst
= (uint8_t *)dst
+ sizeof(UTF32Char
); break;
/* Unpaired surrogates are only tolerated for UTF-16 output; for any other format conversion stops here. */
259 if (CFUniCharIsSurrogateLowCharacter(currentChar
)) { // Stray surrogate
260 if (dstFormat
!= kCFUniCharUTF16Format
) break;
261 } else if (CFUniCharIsSurrogateHighCharacter(currentChar
)) {
/* A high surrogate followed by a low one is combined into a single character, consuming the second unit too. */
262 if (((length
- segmentLength
) > 1) && CFUniCharIsSurrogateLowCharacter(*src
)) {
263 currentChar
= CFUniCharGetLongCharacterForSurrogatePair(currentChar
, *(src
++));
265 if (dstFormat
!= kCFUniCharUTF16Format
) break;
/* Non-base character while reordering: append it (or its decomposition) to the pending sequence. */
269 if (needToReorder
&& __CFUniCharIsNonBaseCharacter(currentChar
)) {
/* Fewer than two free slots left: move the pending sequence to a heap block that is MAX_BUFFER_LENGTH entries larger. */
/* NOTE(review): the allocation result is passed to memmove without a NULL check. */
270 if ((decompBufferLen
+ 1) >= decompBufferSize
) {
271 UTF32Char
*newBuffer
;
273 decompBufferSize
+= MAX_BUFFER_LENGTH
;
274 newBuffer
= (UTF32Char
*)CFAllocatorAllocate(kCFAllocatorSystemDefault
, sizeof(UTF32Char
) * decompBufferSize
, 0);
275 memmove(newBuffer
, decompBuffer
, (decompBufferSize
- MAX_BUFFER_LENGTH
) * sizeof(UTF32Char
));
/* Only a previously heap-allocated block is released; the stack array is not. */
276 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(kCFAllocatorSystemDefault
, decompBuffer
);
277 decompBuffer
= newBuffer
;
/* NOTE(review): a 0 result from CFUniCharDecomposeCharacter leaves decompBufferLen unchanged, so nothing is appended for this character. */
280 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar
, isHFSPlus
)) { // Vietnamese accent, etc.
281 decompBufferLen
+= CFUniCharDecomposeCharacter(currentChar
, decompBuffer
+ decompBufferLen
, decompBufferSize
- decompBufferLen
);
283 decompBuffer
[decompBufferLen
++] = currentChar
;
/* Otherwise: write out the pending sequence before starting over with this character. */
286 if (decompBufferLen
> 0) {
287 if (!__CFProcessReorderBuffer(decompBuffer
, decompBufferLen
, &dst
, maxLength
, &usedLength
, dstFormat
)) break;
288 length
-= segmentLength
;
/* The new sequence is written from the start of decompBuffer, with MAX_BUFFER_LENGTH given as its capacity. */
292 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar
, isHFSPlus
)) {
293 decompBufferLen
= CFUniCharDecomposeCharacter(currentChar
, decompBuffer
, MAX_BUFFER_LENGTH
);
296 *decompBuffer
= currentChar
;
/* No reordering wanted, or a single character: write it out right away and account for 1 or 2 source units. */
299 if (!needToReorder
|| (decompBufferLen
== 1)) {
300 if (!CFUniCharFillDestinationBuffer(decompBuffer
, decompBufferLen
, &dst
, maxLength
, &usedLength
, dstFormat
)) break;
301 length
-= ((currentChar
> 0xFFFF) ? 2 : 1);
/* The character stays pending: remember how many source units it occupies (2 above 0xFFFF). */
307 segmentLength
+= ((currentChar
> 0xFFFF) ? 2 : 1);
/* Write out what is still pending; its source units count as consumed only if that succeeds. */
311 if ((decompBufferLen
> 0) && __CFProcessReorderBuffer(decompBuffer
, decompBufferLen
, &dst
, maxLength
, &usedLength
, dstFormat
)) length
-= segmentLength
;
/* Release the heap block if the pending buffer was grown. */
313 if (decompBuffer
!= buffer
) CFAllocatorDeallocate(kCFAllocatorSystemDefault
, decompBuffer
);
315 if (consumedLength
) *consumedLength
= originalLength
- length
;
316 if (filledLength
) *filledLength
= usedLength
;
318 return ((length
> 0) ? false : true);
321 #define MAX_COMP_DECOMP_LEN (32)
323 static CFIndex
__CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character
, UTF32Char
*convertedChars
) {
324 uint32_t value
= __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings
*)__CFUniCharCompatibilityDecompositionTable
, __CFUniCharCompatibilityDecompositionTableLength
, character
);
325 CFIndex length
= CFUniCharConvertFlagToCount(value
);
326 UTF32Char firstChar
= value
& 0xFFFFFF;
327 const UTF32Char
*mappings
= (length
> 1 ? __CFUniCharCompatibilityMultipleDecompositionTable
+ firstChar
: &firstChar
);
328 CFIndex usedLength
= length
;
329 UTF32Char currentChar
;
330 CFIndex currentLength
;
332 while (length
-- > 0) {
333 currentChar
= *(mappings
++);
334 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar
, false)) {
335 currentLength
= __CFUniCharRecursivelyDecomposeCharacter(currentChar
, convertedChars
, MAX_COMP_DECOMP_LEN
- length
);
336 convertedChars
+= currentLength
;
337 usedLength
+= (currentLength
- 1);
338 } else if (CFUniCharIsMemberOf(currentChar
, kCFUniCharCompatibilityDecomposableCharacterSet
)) {
339 currentLength
= __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar
, convertedChars
);
340 convertedChars
+= currentLength
;
341 usedLength
+= (currentLength
- 1);
343 *(convertedChars
++) = currentChar
;
350 CF_INLINE
void __CFUniCharMoveBufferFromEnd1(UTF32Char
*convertedChars
, CFIndex length
, CFIndex delta
) {
351 const UTF32Char
*limit
= convertedChars
;
354 convertedChars
+= length
;
355 dstP
= convertedChars
+ delta
;
357 while (convertedChars
> limit
) *(--dstP
) = *(--convertedChars
);
360 CF_PRIVATE CFIndex
CFUniCharCompatibilityDecompose(UTF32Char
*convertedChars
, CFIndex length
, CFIndex maxBufferLength
) {
361 UTF32Char currentChar
;
362 UTF32Char buffer
[MAX_COMP_DECOMP_LEN
];
363 const UTF32Char
*bufferP
;
364 const UTF32Char
*limit
= convertedChars
+ length
;
365 CFIndex filledLength
;
367 if (NULL
== __CFUniCharCompatibilityDecompositionTable
) __CFUniCharLoadCompatibilityDecompositionTable();
369 while (convertedChars
< limit
) {
370 currentChar
= *convertedChars
;
372 if (CFUniCharIsMemberOf(currentChar
, kCFUniCharCompatibilityDecomposableCharacterSet
)) {
373 filledLength
= __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar
, buffer
);
375 if (filledLength
+ length
- 1 > maxBufferLength
) return 0;
377 if (filledLength
> 1) __CFUniCharMoveBufferFromEnd1(convertedChars
+ 1, limit
- convertedChars
- 1, filledLength
- 1);
380 length
+= (filledLength
- 1);
381 while (filledLength
-- > 0) *(convertedChars
++) = *(bufferP
++);
390 CF_EXPORT
void CFUniCharPrioritySort(UTF32Char
*characters
, CFIndex length
) {
391 __CFUniCharPrioritySort(characters
, length
);
394 #undef MAX_BUFFER_LENGTH
395 #undef MAX_COMP_DECOMP_LEN