]> git.saurik.com Git - apple/cf.git/blob - CFUnicodeDecomposition.c
CF-744.tar.gz
[apple/cf.git] / CFUnicodeDecomposition.c
1 /*
2 * Copyright (c) 2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /* CFUnicodeDecomposition.c
25 Copyright (c) 1999-2012, Apple Inc. All rights reserved.
26 Responsibility: Aki Inoue
27 */
28
29 #include <string.h>
30 #include <CoreFoundation/CFBase.h>
31 #include <CoreFoundation/CFCharacterSet.h>
32 #include <CoreFoundation/CFUniChar.h>
33 #include <CoreFoundation/CFUnicodeDecomposition.h>
34 #include "CFInternal.h"
35 #include "CFUniCharPriv.h"
36
37 // Canonical Decomposition
38 static UTF32Char *__CFUniCharDecompositionTable = NULL;
39 static uint32_t __CFUniCharDecompositionTableLength = 0;
40 static UTF32Char *__CFUniCharMultipleDecompositionTable = NULL;
41
42 static const uint8_t *__CFUniCharDecomposableBitmapForBMP = NULL;
43 static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP = NULL;
44
45 static CFSpinLock_t __CFUniCharDecompositionTableLock = CFSpinLockInit;
46
47 static const uint8_t **__CFUniCharCombiningPriorityTable = NULL;
48 static uint8_t __CFUniCharCombiningPriorityTableNumPlane = 0;
49
50 static void __CFUniCharLoadDecompositionTable(void) {
51
52 __CFSpinLock(&__CFUniCharDecompositionTableLock);
53
54 if (NULL == __CFUniCharDecompositionTable) {
55 const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping);
56
57 if (NULL == bytes) {
58 __CFSpinUnlock(&__CFUniCharDecompositionTableLock);
59 return;
60 }
61
62 __CFUniCharDecompositionTableLength = *(bytes++);
63 __CFUniCharDecompositionTable = (UTF32Char *)bytes;
64 __CFUniCharMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharDecompositionTableLength);
65
66 __CFUniCharDecompositionTableLength /= (sizeof(uint32_t) * 2);
67 __CFUniCharDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, 0);
68 __CFUniCharHFSPlusDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet, 0);
69
70 CFIndex idx;
71
72 __CFUniCharCombiningPriorityTableNumPlane = CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty);
73 __CFUniCharCombiningPriorityTable = (const uint8_t **)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(uint8_t *) * __CFUniCharCombiningPriorityTableNumPlane, 0);
74 for (idx = 0;idx < __CFUniCharCombiningPriorityTableNumPlane;idx++) __CFUniCharCombiningPriorityTable[idx] = (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, idx);
75 }
76
77 __CFSpinUnlock(&__CFUniCharDecompositionTableLock);
78 }
79
80 static CFSpinLock_t __CFUniCharCompatibilityDecompositionTableLock = CFSpinLockInit;
81 static UTF32Char *__CFUniCharCompatibilityDecompositionTable = NULL;
82 static uint32_t __CFUniCharCompatibilityDecompositionTableLength = 0;
83 static UTF32Char *__CFUniCharCompatibilityMultipleDecompositionTable = NULL;
84
85 static void __CFUniCharLoadCompatibilityDecompositionTable(void) {
86
87 __CFSpinLock(&__CFUniCharCompatibilityDecompositionTableLock);
88
89 if (NULL == __CFUniCharCompatibilityDecompositionTable) {
90 const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping);
91
92 if (NULL == bytes) {
93 __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock);
94 return;
95 }
96
97 __CFUniCharCompatibilityDecompositionTableLength = *(bytes++);
98 __CFUniCharCompatibilityDecompositionTable = (UTF32Char *)bytes;
99 __CFUniCharCompatibilityMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharCompatibilityDecompositionTableLength);
100
101 __CFUniCharCompatibilityDecompositionTableLength /= (sizeof(uint32_t) * 2);
102 }
103
104 __CFSpinUnlock(&__CFUniCharCompatibilityDecompositionTableLock);
105 }
106
107 CF_INLINE bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character, bool isHFSPlus) {
108 return CFUniCharIsMemberOfBitmap(character, (character < 0x10000 ? (isHFSPlus ? __CFUniCharHFSPlusDecomposableBitmapForBMP : __CFUniCharDecomposableBitmapForBMP) : CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, ((character >> 16) & 0xFF))));
109 }
110
111 CF_INLINE uint8_t __CFUniCharGetCombiningPropertyForCharacter(UTF32Char character) { return CFUniCharGetCombiningPropertyForCharacter(character, (((character) >> 16) < __CFUniCharCombiningPriorityTableNumPlane ? __CFUniCharCombiningPriorityTable[(character) >> 16] : NULL)); }
112
113 CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) { return ((0 == __CFUniCharGetCombiningPropertyForCharacter(character)) ? false : true); } // the notion of non-base in normalization is characters with non-0 combining class
114
115 typedef struct {
116 uint32_t _key;
117 uint32_t _value;
118 } __CFUniCharDecomposeMappings;
119
120 static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings *theTable, uint32_t numElem, UTF32Char character) {
121 const __CFUniCharDecomposeMappings *p, *q, *divider;
122
123 if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) {
124 return 0;
125 }
126 p = theTable;
127 q = p + (numElem-1);
128 while (p <= q) {
129 divider = p + ((q - p) >> 1); /* divide by 2 */
130 if (character < divider->_key) { q = divider - 1; }
131 else if (character > divider->_key) { p = divider + 1; }
132 else { return divider->_value; }
133 }
134 return 0;
135 }
136
137 static void __CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) {
138 UTF32Char *end = characters + length;
139
140 while ((characters < end) && (0 == __CFUniCharGetCombiningPropertyForCharacter(*characters))) ++characters;
141
142 if ((end - characters) > 1) {
143 uint32_t p1, p2;
144 UTF32Char *ch1, *ch2;
145 bool changes = true;
146
147 do {
148 changes = false;
149 ch1 = characters; ch2 = characters + 1;
150 p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch1);
151 while (ch2 < end) {
152 p1 = p2; p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch2);
153 if (p1 > p2) {
154 UTF32Char tmp = *ch1; *ch1 = *ch2; *ch2 = tmp;
155 changes = true;
156 }
157 ++ch1; ++ch2;
158 }
159 } while (changes);
160 }
161 }
162
163 static CFIndex __CFUniCharRecursivelyDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) {
164 uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharDecompositionTable, __CFUniCharDecompositionTableLength, character);
165 CFIndex length = CFUniCharConvertFlagToCount(value);
166 UTF32Char firstChar = value & 0xFFFFFF;
167 UTF32Char *mappings = (length > 1 ? __CFUniCharMultipleDecompositionTable + firstChar : &firstChar);
168 CFIndex usedLength = 0;
169
170 if (maxBufferLength < length) return 0;
171
172 if (value & kCFUniCharRecursiveDecompositionFlag) {
173 usedLength = __CFUniCharRecursivelyDecomposeCharacter(*mappings, convertedChars, maxBufferLength - length);
174
175 --length; // Decrement for the first char
176 if (!usedLength || usedLength + length > maxBufferLength) return 0;
177 ++mappings;
178 convertedChars += usedLength;
179 }
180
181 usedLength += length;
182
183 while (length--) *(convertedChars++) = *(mappings++);
184
185 return usedLength;
186 }
187
188 #define HANGUL_SBASE 0xAC00
189 #define HANGUL_LBASE 0x1100
190 #define HANGUL_VBASE 0x1161
191 #define HANGUL_TBASE 0x11A7
192 #define HANGUL_SCOUNT 11172
193 #define HANGUL_LCOUNT 19
194 #define HANGUL_VCOUNT 21
195 #define HANGUL_TCOUNT 28
196 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
197
198 CFIndex CFUniCharDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) {
199 if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable();
200 if (character >= HANGUL_SBASE && character <= (HANGUL_SBASE + HANGUL_SCOUNT)) {
201 CFIndex length;
202
203 character -= HANGUL_SBASE;
204
205 length = (character % HANGUL_TCOUNT ? 3 : 2);
206
207 if (maxBufferLength < length) return 0;
208
209 *(convertedChars++) = character / HANGUL_NCOUNT + HANGUL_LBASE;
210 *(convertedChars++) = (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
211 if (length > 2) *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
212 return length;
213 } else {
214 return __CFUniCharRecursivelyDecomposeCharacter(character, convertedChars, maxBufferLength);
215 }
216 }
217
218 CF_INLINE bool __CFProcessReorderBuffer(UTF32Char *buffer, CFIndex length, void **dst, CFIndex dstLength, CFIndex *filledLength, uint32_t dstFormat) {
219 if (length > 1) __CFUniCharPrioritySort(buffer, length);
220 return CFUniCharFillDestinationBuffer(buffer, length, dst, dstLength, filledLength, dstFormat);
221 }
222
223 #define MAX_BUFFER_LENGTH (32)
224 bool CFUniCharDecompose(const UTF16Char *src, CFIndex length, CFIndex *consumedLength, void *dst, CFIndex maxLength, CFIndex *filledLength, bool needToReorder, uint32_t dstFormat, bool isHFSPlus) {
225 CFIndex usedLength = 0;
226 CFIndex originalLength = length;
227 UTF32Char buffer[MAX_BUFFER_LENGTH];
228 UTF32Char *decompBuffer = buffer;
229 CFIndex decompBufferSize = MAX_BUFFER_LENGTH;
230 CFIndex decompBufferLen = 0;
231 CFIndex segmentLength = 0;
232 UTF32Char currentChar;
233
234 if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable();
235
236 while ((length - segmentLength) > 0) {
237 currentChar = *(src++);
238
239 if (currentChar < 0x80) {
240 if (decompBufferLen > 0) {
241 if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break;
242 length -= segmentLength;
243 segmentLength = 0;
244 decompBufferLen = 0;
245 }
246
247 if (maxLength > 0) {
248 if (usedLength >= maxLength) break;
249 switch (dstFormat) {
250 case kCFUniCharUTF8Format: *(uint8_t *)dst = currentChar; dst = (uint8_t *)dst + sizeof(uint8_t); break;
251 case kCFUniCharUTF16Format: *(UTF16Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF16Char); break;
252 case kCFUniCharUTF32Format: *(UTF32Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF32Char); break;
253 }
254 }
255
256 --length;
257 ++usedLength;
258 } else {
259 if (CFUniCharIsSurrogateLowCharacter(currentChar)) { // Stray surrogagte
260 if (dstFormat != kCFUniCharUTF16Format) break;
261 } else if (CFUniCharIsSurrogateHighCharacter(currentChar)) {
262 if (((length - segmentLength) > 1) && CFUniCharIsSurrogateLowCharacter(*src)) {
263 currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(src++));
264 } else {
265 if (dstFormat != kCFUniCharUTF16Format) break;
266 }
267 }
268
269 if (needToReorder && __CFUniCharIsNonBaseCharacter(currentChar)) {
270 if ((decompBufferLen + 1) >= decompBufferSize) {
271 UTF32Char *newBuffer;
272
273 decompBufferSize += MAX_BUFFER_LENGTH;
274 newBuffer = (UTF32Char *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF32Char) * decompBufferSize, 0);
275 memmove(newBuffer, decompBuffer, (decompBufferSize - MAX_BUFFER_LENGTH) * sizeof(UTF32Char));
276 if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer);
277 decompBuffer = newBuffer;
278 }
279
280 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { // Vietnamese accent, etc.
281 decompBufferLen += CFUniCharDecomposeCharacter(currentChar, decompBuffer + decompBufferLen, decompBufferSize - decompBufferLen);
282 } else {
283 decompBuffer[decompBufferLen++] = currentChar;
284 }
285 } else {
286 if (decompBufferLen > 0) {
287 if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break;
288 length -= segmentLength;
289 segmentLength = 0;
290 }
291
292 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) {
293 decompBufferLen = CFUniCharDecomposeCharacter(currentChar, decompBuffer, MAX_BUFFER_LENGTH);
294 } else {
295 decompBufferLen = 1;
296 *decompBuffer = currentChar;
297 }
298
299 if (!needToReorder || (decompBufferLen == 1)) {
300 if (!CFUniCharFillDestinationBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break;
301 length -= ((currentChar > 0xFFFF) ? 2 : 1);
302 decompBufferLen = 0;
303 continue;
304 }
305 }
306
307 segmentLength += ((currentChar > 0xFFFF) ? 2 : 1);
308 }
309 }
310
311 if ((decompBufferLen > 0) && __CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) length -= segmentLength;
312
313 if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer);
314
315 if (consumedLength) *consumedLength = originalLength - length;
316 if (filledLength) *filledLength = usedLength;
317
318 return ((length > 0) ? false : true);
319 }
320
321 #define MAX_COMP_DECOMP_LEN (32)
322
323 static CFIndex __CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars) {
324 uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharCompatibilityDecompositionTable, __CFUniCharCompatibilityDecompositionTableLength, character);
325 CFIndex length = CFUniCharConvertFlagToCount(value);
326 UTF32Char firstChar = value & 0xFFFFFF;
327 const UTF32Char *mappings = (length > 1 ? __CFUniCharCompatibilityMultipleDecompositionTable + firstChar : &firstChar);
328 CFIndex usedLength = length;
329 UTF32Char currentChar;
330 CFIndex currentLength;
331
332 while (length-- > 0) {
333 currentChar = *(mappings++);
334 if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, false)) {
335 currentLength = __CFUniCharRecursivelyDecomposeCharacter(currentChar, convertedChars, MAX_COMP_DECOMP_LEN - length);
336 convertedChars += currentLength;
337 usedLength += (currentLength - 1);
338 } else if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) {
339 currentLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, convertedChars);
340 convertedChars += currentLength;
341 usedLength += (currentLength - 1);
342 } else {
343 *(convertedChars++) = currentChar;
344 }
345 }
346
347 return usedLength;
348 }
349
350 CF_INLINE void __CFUniCharMoveBufferFromEnd1(UTF32Char *convertedChars, CFIndex length, CFIndex delta) {
351 const UTF32Char *limit = convertedChars;
352 UTF32Char *dstP;
353
354 convertedChars += length;
355 dstP = convertedChars + delta;
356
357 while (convertedChars > limit) *(--dstP) = *(--convertedChars);
358 }
359
360 __private_extern__ CFIndex CFUniCharCompatibilityDecompose(UTF32Char *convertedChars, CFIndex length, CFIndex maxBufferLength) {
361 UTF32Char currentChar;
362 UTF32Char buffer[MAX_COMP_DECOMP_LEN];
363 const UTF32Char *bufferP;
364 const UTF32Char *limit = convertedChars + length;
365 CFIndex filledLength;
366
367 if (NULL == __CFUniCharCompatibilityDecompositionTable) __CFUniCharLoadCompatibilityDecompositionTable();
368
369 while (convertedChars < limit) {
370 currentChar = *convertedChars;
371
372 if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) {
373 filledLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, buffer);
374
375 if (filledLength + length - 1 > maxBufferLength) return 0;
376
377 if (filledLength > 1) __CFUniCharMoveBufferFromEnd1(convertedChars + 1, limit - convertedChars - 1, filledLength - 1);
378
379 bufferP = buffer;
380 length += (filledLength - 1);
381 while (filledLength-- > 0) *(convertedChars++) = *(bufferP++);
382 } else {
383 ++convertedChars;
384 }
385 }
386
387 return length;
388 }
389
390 CF_EXPORT void CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) {
391 __CFUniCharPrioritySort(characters, length);
392 }
393
394 #undef MAX_BUFFER_LENGTH
395 #undef MAX_COMP_DECOMP_LEN
396 #undef HANGUL_SBASE
397 #undef HANGUL_LBASE
398 #undef HANGUL_VBASE
399 #undef HANGUL_TBASE
400 #undef HANGUL_SCOUNT
401 #undef HANGUL_LCOUNT
402 #undef HANGUL_VCOUNT
403 #undef HANGUL_TCOUNT
404 #undef HANGUL_NCOUNT
405