]> git.saurik.com Git - apple/cf.git/blob - StringEncodings.subproj/CFUnicodePrecomposition.c
CF-299.tar.gz
[apple/cf.git] / StringEncodings.subproj / CFUnicodePrecomposition.c
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /* CFUnicodePrecomposition.c
26 Copyright 1999-2002, Apple, Inc. All rights reserved.
27 Responsibility: Aki Inoue
28 */
29
30 #if !defined(KERNEL)
31 #define KERNEL 0
32 #endif
33
34 #include <string.h>
35 #if KERNEL
36 #include "CFUnicodePrecomposition.h"
37 #include "CFUniCharPrecompData.h"
38 #else KERNEL
39 #include <CoreFoundation/CFBase.h>
40 #include <CoreFoundation/CFCharacterSet.h>
41 #include "CFUniChar.h"
42 #include "CFUnicodePrecomposition.h"
43 #include "CFInternal.h"
44 #include "CFUniCharPriv.h"
45 #endif KERNEL
46
47 // Canonical Precomposition
48 #if KERNEL
// Number of entries in __CFUniCharPrecompSourceTable; each entry occupies two uint32_t
// (it is searched as an array of __CFUniCharPrecomposeMappings by CFUniCharPrecomposeCharacter).
static const uint32_t __CFUniCharPrecompositionTableLength = (sizeof(__CFUniCharPrecompSourceTable) / (sizeof(uint32_t) * 2));
// Per-plane index pages read by __CFUniCharGetCombiningPriority; an entry may be NULL.
CF_EXPORT uint8_t **CFUniCharCombiningPriorityTable;
// Per-plane blocks of 256 combining-priority bytes, selected through the index page above.
CF_EXPORT uint8_t **CFUniCharCombiningPriorityExtraTable;
// Number of planes covered by the two combining-priority tables.
CF_EXPORT uint8_t CFUniCharNumberOfPlanesForCombiningPriority;
53
54 CF_EXPORT uint8_t __CFUniCharGetCombiningPriority(UTF32Char character) {
55 if (character < (CFUniCharNumberOfPlanesForCombiningPriority << 16)) {
56 uint32_t plane = character >> 16;
57 const uint8_t *bitmap = CFUniCharCombiningPriorityTable[plane];
58
59 if (bitmap) {
60 uint8_t value = bitmap[(character >> 8) & 0xFF];
61
62 if (value) {
63 bitmap = CFUniCharCombiningPriorityExtraTable[plane] + ((value - 1) * 256);
64 return bitmap[character % 256];
65 }
66 }
67 }
68 return 0;
69 }
70
// Per-plane non-base character bitmaps read by __CFUniCharIsNonBaseCharacter.
CF_EXPORT uint8_t **CFUniCharNonBaseBitmap;
// Number of planes covered by CFUniCharNonBaseBitmap.
CF_EXPORT uint8_t CFUniCharNumberOfPlanesForNonBaseBitmap;
73
74 CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) {
75 if (character < (CFUniCharNumberOfPlanesForNonBaseBitmap << 16)) {
76 const uint8_t *bitmap = CFUniCharNonBaseBitmap[character >> 16];
77 uint8_t value = bitmap[(character >> 8) & 0xFF];
78
79 if (value == 0xFF) {
80 return true;
81 } else if (value) {
82 bitmap = bitmap + ((value - 1) * 32) + 256;
83 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? true : false);
84 }
85 }
86 return false;
87 }
88
89 #else KERNEL
// Table searched by combining character; NULL until __CFUniCharLoadPrecompositionTable succeeds,
// and tested (without the lock) by callers to decide whether loading is still needed.
static UTF32Char *__CFUniCharPrecompSourceTable = NULL;
// Number of entries in __CFUniCharPrecompSourceTable (first word of the mapping data).
static uint32_t __CFUniCharPrecompositionTableLength = 0;
// Destination mappings with 16-bit keys/values; located right after the source table.
static uint16_t *__CFUniCharBMPPrecompDestinationTable = NULL;
// Destination mappings with 32-bit keys/values; located right after the BMP destination table.
static uint32_t *__CFUniCharNonBMPPrecompDestinationTable = NULL;

// Plane 0 non-base character bitmap, cached by the loader.
static const uint8_t *__CFUniCharNonBaseBitmapForBMP_P = NULL; // Adding _P so the symbol name is different from the one in CFUnicodeDecomposition.c
// Plane 0 combining-class property data, cached by the loader.
static const uint8_t *__CFUniCharCombiningClassForBMP = NULL;

// Held by __CFUniCharLoadPrecompositionTable while the pointers above are initialized.
static CFSpinLock_t __CFUniCharPrecompositionTableLock = 0;
99
100 static void __CFUniCharLoadPrecompositionTable(void) {
101
102 __CFSpinLock(&__CFUniCharPrecompositionTableLock);
103
104 if (NULL == __CFUniCharPrecompSourceTable) {
105 const void *bytes = CFUniCharGetMappingData(kCFUniCharCanonicalPrecompMapping);
106 uint32_t bmpMappingLength;
107
108 if (NULL == bytes) {
109 __CFSpinUnlock(&__CFUniCharPrecompositionTableLock);
110 return;
111 }
112
113 __CFUniCharPrecompositionTableLength = *(((uint32_t *)bytes)++);
114 bmpMappingLength = *(((uint32_t *)bytes)++);
115 __CFUniCharPrecompSourceTable = (UTF32Char *)bytes;
116 __CFUniCharBMPPrecompDestinationTable = (uint16_t *)((intptr_t)bytes + (__CFUniCharPrecompositionTableLength * sizeof(UTF32Char) * 2));
117 __CFUniCharNonBMPPrecompDestinationTable = (uint32_t *)(((intptr_t)__CFUniCharBMPPrecompDestinationTable) + bmpMappingLength);
118
119 __CFUniCharNonBaseBitmapForBMP_P = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
120 __CFUniCharCombiningClassForBMP = CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, 0);
121 }
122
123 __CFSpinUnlock(&__CFUniCharPrecompositionTableLock);
124 }
125
126 // Adding _P so the symbol name is different from the one in CFUnicodeDecomposition.c
127 #define __CFUniCharIsNonBaseCharacter __CFUniCharIsNonBaseCharacter_P
128 CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) {
129 return CFUniCharIsMemberOfBitmap(character, (character < 0x10000 ? __CFUniCharNonBaseBitmapForBMP_P : CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, ((character >> 16) & 0xFF))));
130 }
131 #endif KERNEL
132
// One entry of a BMP destination table, searched by __CFUniCharGetMappedBMPValue.
typedef struct {
    UTF16Char _key;   // character the lookup compares against (the base character in CFUniCharPrecomposeCharacter)
    UTF16Char _value; // value returned when _key matches
} __CFUniCharPrecomposeBMPMappings;
137
138 static UTF16Char __CFUniCharGetMappedBMPValue(const __CFUniCharPrecomposeBMPMappings *theTable, uint32_t numElem, UTF16Char character) {
139 const __CFUniCharPrecomposeBMPMappings *p, *q, *divider;
140
141 if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) {
142 return 0;
143 }
144 p = theTable;
145 q = p + (numElem-1);
146 while (p <= q) {
147 divider = p + ((q - p) >> 1); /* divide by 2 */
148 if (character < divider->_key) { q = divider - 1; }
149 else if (character > divider->_key) { p = divider + 1; }
150 else { return divider->_value; }
151 }
152 return 0;
153 }
154
// One entry of a table searched by __CFUniCharGetMappedValue_P (used for both the
// source table and the non-BMP destination table in CFUniCharPrecomposeCharacter).
typedef struct {
    UTF32Char _key;  // character the lookup compares against
    uint32_t _value; // value returned when _key matches
} __CFUniCharPrecomposeMappings;
159
160 static uint32_t __CFUniCharGetMappedValue_P(const __CFUniCharPrecomposeMappings *theTable, uint32_t numElem, UTF32Char character) {
161 const __CFUniCharPrecomposeMappings *p, *q, *divider;
162
163 if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) {
164 return 0;
165 }
166 p = theTable;
167 q = p + (numElem-1);
168 while (p <= q) {
169 divider = p + ((q - p) >> 1); /* divide by 2 */
170 if (character < divider->_key) { q = divider - 1; }
171 else if (character > divider->_key) { p = divider + 1; }
172 else { return divider->_value; }
173 }
174 return 0;
175 }
176
177 #if !KERNEL
178 __private_extern__
179 #endif !KERNEL
180 UTF32Char CFUniCharPrecomposeCharacter(UTF32Char base, UTF32Char combining) {
181 uint32_t value;
182
183 #if !KERNEL
184 if (NULL == __CFUniCharPrecompSourceTable) __CFUniCharLoadPrecompositionTable();
185 #endif !KERNEL
186
187 if (!(value = __CFUniCharGetMappedValue_P((const __CFUniCharPrecomposeMappings *)__CFUniCharPrecompSourceTable, __CFUniCharPrecompositionTableLength, combining))) return 0xFFFD;
188
189 #if !KERNEL
190 // We don't have precomposition in non-BMP
191 if (value & kCFUniCharNonBmpFlag) {
192 value = __CFUniCharGetMappedValue_P((const __CFUniCharPrecomposeMappings *)((uint32_t *)__CFUniCharNonBMPPrecompDestinationTable + (value & 0xFFFF)), (value >> 16) & 0x7FFF, base);
193 } else {
194 #endif !KERNEL
195 value = __CFUniCharGetMappedBMPValue((const __CFUniCharPrecomposeBMPMappings *)((uint32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)), (value >> 16), base);
196 #if !KERNEL
197 }
198 #endif !KERNEL
199 return (value ? value : 0xFFFD);
200 }
201
// Constants for the arithmetic composition of Hangul syllables from conjoining jamo.
enum {
    HANGUL_SBASE = 0xAC00,  // first precomposed syllable
    HANGUL_LBASE = 0x1100,  // first leading consonant
    HANGUL_VBASE = 0x1161,  // first vowel
    HANGUL_TBASE = 0x11A7,  // one before the first trailing consonant (index 0 = no trailing consonant)
    HANGUL_SCOUNT = 11172,  // number of precomposed syllables
    HANGUL_LCOUNT = 19,     // number of leading consonants
    HANGUL_VCOUNT = 21,     // number of vowels
    HANGUL_TCOUNT = 28,     // number of trailing consonant slots
    HANGUL_NCOUNT = (HANGUL_VCOUNT * HANGUL_TCOUNT) // syllables per leading consonant
};
211
212 CF_INLINE void __CFUniCharMoveBufferFromEnd(UTF16Char *convertedChars, uint32_t length, uint32_t delta) {
213 const UTF16Char *limit = convertedChars;
214 UTF16Char *dstP;
215
216 convertedChars += length;
217 dstP = convertedChars + delta;
218
219 while (convertedChars > limit) *(--dstP) = *(--convertedChars);
220 }
221
222 bool CFUniCharPrecompose(const UTF16Char *characters, uint32_t length, uint32_t *consumedLength, UTF16Char *precomposed, uint32_t maxLength, uint32_t *filledLength) {
223 UTF32Char currentChar = 0, lastChar = 0, precomposedChar = 0xFFFD;
224 uint32_t originalLength = length, usedLength = 0;
225 UTF16Char *currentBase = precomposed;
226 uint8_t currentClass, lastClass = 0;
227 bool currentBaseIsBMP = true;
228 bool isPrecomposed;
229
230 #if !KERNEL
231 if (NULL == __CFUniCharPrecompSourceTable) __CFUniCharLoadPrecompositionTable();
232 #endif !KERNEL
233
234 while (length > 0) {
235 currentChar = *(characters++);
236 --length;
237
238 if (CFUniCharIsSurrogateHighCharacter(currentChar) && (length > 0) && CFUniCharIsSurrogateLowCharacter(*characters)) {
239 currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(characters++));
240 --length;
241 }
242
243 if (lastChar && __CFUniCharIsNonBaseCharacter(currentChar)) {
244 isPrecomposed = (precomposedChar == 0xFFFD ? false : true);
245 if (isPrecomposed) lastChar = precomposedChar;
246
247 #if KERNEL
248 currentClass = __CFUniCharGetCombiningPriority(currentChar);
249 #else KERNEL
250 currentClass = (currentChar > 0xFFFF ? CFUniCharGetUnicodeProperty(currentChar, kCFUniCharCombiningProperty) : CFUniCharGetCombiningPropertyForCharacter(currentChar, __CFUniCharCombiningClassForBMP));
251 #endif KERNEL
252
253 if ((lastClass == 0) || (currentClass != lastClass)) {
254 if ((precomposedChar = CFUniCharPrecomposeCharacter(lastChar, currentChar)) == 0xFFFD) {
255 if (isPrecomposed) precomposedChar = lastChar;
256 lastClass = currentClass;
257 } else {
258 lastClass = 0;
259 continue;
260 }
261 }
262 if (currentChar > 0xFFFF) { // Non-BMP
263 usedLength += 2;
264 if (usedLength > maxLength) break;
265 currentChar -= 0x10000;
266 *(precomposed++) = (UTF16Char)((currentChar >> 10) + 0xD800UL);
267 *(precomposed++) = (UTF16Char)((currentChar & 0x3FF) + 0xDC00UL);
268 } else {
269 ++usedLength;
270 if (usedLength > maxLength) break;
271 *(precomposed++) = (UTF16Char)currentChar;
272 }
273 } else {
274 if ((currentChar >= HANGUL_LBASE) && (currentChar < (HANGUL_LBASE + 0xFF))) { // Hangul Jamo
275 int8_t lIndex = currentChar - HANGUL_LBASE;
276
277 if ((length > 0) && (0 <= lIndex) && (lIndex <= HANGUL_LCOUNT)) {
278 int16_t vIndex = *characters - HANGUL_VBASE;
279
280 if ((vIndex >= 0) && (vIndex <= HANGUL_VCOUNT)) {
281 int16_t tIndex = 0;
282
283 ++characters; --length;
284
285 if (length > 0) {
286 tIndex = *characters - HANGUL_TBASE;
287 if ((tIndex < 0) || (tIndex > HANGUL_TCOUNT)) {
288 tIndex = 0;
289 } else {
290 ++characters; --length;
291 }
292 }
293 currentChar = (lIndex * HANGUL_VCOUNT + vIndex) * HANGUL_TCOUNT + tIndex + HANGUL_SBASE;
294 }
295 }
296 }
297
298 if (precomposedChar != 0xFFFD) {
299 if (currentBaseIsBMP) { // Non-BMP
300 if (lastChar > 0xFFFF) { // Last char was Non-BMP
301 --usedLength;
302 memmove(currentBase + 1, currentBase + 2, (precomposed - (currentBase + 2)) * sizeof(UTF16Char));
303 }
304 *(currentBase) = (UTF16Char)precomposedChar;
305 } else {
306 if (lastChar < 0x10000) { // Last char was BMP
307 ++usedLength;
308 if (usedLength > maxLength) break;
309 __CFUniCharMoveBufferFromEnd(currentBase + 1, precomposed - (currentBase + 1), 1);
310 }
311 precomposedChar -= 0x10000;
312 *currentBase = (UTF16Char)((precomposedChar >> 10) + 0xD800UL);
313 *(currentBase + 1) = (UTF16Char)((precomposedChar & 0x3FF) + 0xDC00UL);
314 }
315 precomposedChar = 0xFFFD;
316 }
317 currentBase = precomposed;
318
319 lastChar = currentChar;
320 lastClass = 0;
321
322 if (currentChar > 0xFFFF) { // Non-BMP
323 usedLength += 2;
324 if (usedLength > maxLength) break;
325 currentChar -= 0x10000;
326 *(precomposed++) = (UTF16Char)((currentChar >> 10) + 0xD800UL);
327 *(precomposed++) = (UTF16Char)((currentChar & 0x3FF) + 0xDC00UL);
328 currentBaseIsBMP = false;
329 } else {
330 ++usedLength;
331 if (usedLength > maxLength) break;
332 *(precomposed++) = (UTF16Char)currentChar;
333 currentBaseIsBMP = true;
334 }
335 }
336 }
337
338 if (precomposedChar != 0xFFFD) {
339 if (currentChar > 0xFFFF) { // Non-BMP
340 if (lastChar < 0x10000) { // Last char was BMP
341 ++usedLength;
342 if (usedLength > maxLength) {
343 if (consumedLength) *consumedLength = originalLength - length;
344 if (filledLength) *filledLength = usedLength;
345 return false;
346 }
347 __CFUniCharMoveBufferFromEnd(currentBase + 1, precomposed - (currentBase + 1), 1);
348 }
349 precomposedChar -= 0x10000;
350 *currentBase = (UTF16Char)((precomposedChar >> 10) + 0xD800UL);
351 *(currentBase + 1) = (UTF16Char)((precomposedChar & 0x3FF) + 0xDC00UL);
352 } else {
353 if (lastChar > 0xFFFF) { // Last char was Non-BMP
354 --usedLength;
355 memmove(currentBase + 1, currentBase + 2, (precomposed - (currentBase + 2)) * sizeof(UTF16Char));
356 }
357 *(currentBase) = (UTF16Char)precomposedChar;
358 }
359 }
360
361 if (consumedLength) *consumedLength = originalLength - length;
362 if (filledLength) *filledLength = usedLength;
363
364 return true;
365 }
366