]> git.saurik.com Git - apple/cf.git/blob - CFICUConverters.c
CF-550.tar.gz
[apple/cf.git] / CFICUConverters.c
1 /*
2 * Copyright (c) 2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /*
24 * CFICUConverters.c
25 * CoreFoundation
26 *
27 * Created by Aki Inoue on 07/12/04.
28 * Copyright 2007-2009, Apple Inc. All rights reserved.
29 *
30 */
31
32 #include "CFStringEncodingDatabase.h"
33 #include "CFStringEncodingConverterPriv.h"
34 #include "CFICUConverters.h"
35 #include <CoreFoundation/CFStringEncodingExt.h>
36 #include <unicode/ucnv.h>
37 #include <unicode/uversion.h>
38 #include "CFInternal.h"
39 #include <stdio.h>
40
41 #if DEPLOYMENT_TARGET_WINDOWS
42 #define strncasecmp_l(a, b, c, d) _strnicmp(a, b, c)
43 #define snprintf _snprintf
44 #endif
45
46 // Thread data support
47 typedef struct {
48 uint8_t _numSlots;
49 uint8_t _nextSlot;
50 UConverter **_converters;
51 } __CFICUThreadData;
52
53 static void __CFICUThreadDataDestructor(void *context) {
54 __CFICUThreadData * data = (__CFICUThreadData *)context;
55
56 if (NULL != data->_converters) { // scan to make sure deallocation
57 UConverter **converter = data->_converters;
58 UConverter **limit = converter + data->_numSlots;
59
60 while (converter < limit) {
61 if (NULL != converter) ucnv_close(*converter);
62 ++converter;
63 }
64 CFAllocatorDeallocate(NULL, data->_converters);
65 }
66
67 CFAllocatorDeallocate(NULL, data);
68 }
69
70 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
71 #import <pthread.h>
72
73 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
74 __CFICUThreadData * data;
75
76 pthread_key_init_np(__CFTSDKeyICUConverter, __CFICUThreadDataDestructor);
77 data = (__CFICUThreadData *)pthread_getspecific(__CFTSDKeyICUConverter);
78
79 if (NULL == data) {
80 data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
81 memset(data, 0, sizeof(__CFICUThreadData));
82 pthread_setspecific(__CFTSDKeyICUConverter, (const void *)data);
83 }
84
85 return data;
86 }
87 #elif DEPLOYMENT_TARGET_WINDOWS
88 __private_extern__ void __CFStringEncodingICUThreadDataCleaner(void *context) { __CFICUThreadDataDestructor(context); }
89
90 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
91 __CFThreadSpecificData *threadData = __CFGetThreadSpecificData_inline();
92
93 if (NULL == threadData->_icuThreadData) {
94 threadData->_icuThreadData = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
95 memset(threadData->_icuThreadData, 0, sizeof(__CFICUThreadData));
96 }
97
98 return (__CFICUThreadData *)threadData->_icuThreadData;
99 }
100 #else
101 #error Need implementation for thread data
102 #endif
103
104 __private_extern__ const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
105 #define STACK_BUFFER_SIZE (60)
106 char buffer[STACK_BUFFER_SIZE];
107 const char *result = NULL;
108 UErrorCode errorCode = U_ZERO_ERROR;
109 uint32_t codepage = 0;
110
111 if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";
112
113 if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows
114
115 if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result;
116
117 if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);
118
119 return result;
120 #undef STACK_BUFFER_SIZE
121 }
122
123 __private_extern__ CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
124 uint32_t codepage;
125 UErrorCode errorCode = U_ZERO_ERROR;
126
127 if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), NULL, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage);
128
129 if (0 != ucnv_countAliases(icuName, &errorCode)) {
130 CFStringEncoding encoding;
131 const char *name;
132
133 // Try WINDOWS platform
134 name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);
135
136 if (NULL != name) {
137 if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), NULL, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage);
138
139 if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
140 }
141
142 // Try JAVA platform
143 name = ucnv_getStandardName(icuName, "JAVA", &errorCode);
144 if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
145
146 // Try MIME platform
147 name = ucnv_getStandardName(icuName, "MIME", &errorCode);
148 if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
149 }
150
151 return kCFStringEncodingInvalidId;
152 }
153
154 CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
155 UConverter *converter;
156 UErrorCode errorCode = U_ZERO_ERROR;
157 uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
158
159 if (0 != streamID) { // this is a part of streaming previously created
160 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
161
162 --streamID; // map to array index
163
164 if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID];
165 }
166
167 converter = ucnv_open(icuName, &errorCode);
168
169 if (NULL != converter) {
170 char lossyByte = CFStringEncodingMaskToLossyByte(flags);
171
172 if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?';
173
174 if (0 ==lossyByte) {
175 if (toUnicode) {
176 ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
177 } else {
178 ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
179 }
180 } else {
181 ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode);
182 }
183 }
184
185 return converter;
186 }
187
188 #define ICU_CONVERTER_SLOT_INCREMENT (10)
189 #define ICU_CONVERTER_MAX_SLOT (255)
190
191 static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
192 uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
193
194 if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
195 if (0 == streamID) {
196 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
197
198 if (NULL == data->_converters) {
199 data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
200 memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
201 data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
202 data->_nextSlot = 0;
203 } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one
204 CFIndex index;
205
206 for (index = 0;index < data->_numSlots;index++) {
207 if (NULL == data->_converters[index]) {
208 data->_nextSlot = index;
209 break;
210 }
211 }
212
213 if (index >= data->_numSlots) { // we're full
214 UConverter **newConverters;
215 CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;
216
217 if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
218 CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
219 ucnv_close(converter);
220 return 0;
221 }
222
223 newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
224 memset(newConverters, 0, sizeof(UConverter *) * newSize);
225 memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
226 CFAllocatorDeallocate(NULL, data->_converters);
227 data->_converters = newConverters;
228 data->_nextSlot = data->_numSlots;
229 data->_numSlots = newSize;
230 }
231 }
232
233 data->_converters[data->_nextSlot] = converter;
234 streamID = data->_nextSlot + 1;
235
236 // now find next slot
237 ++data->_nextSlot;
238
239 if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
240 data->_nextSlot = 0;
241
242 while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
243 }
244 }
245
246 return CFStringEncodingStreamIDToMask(streamID);
247 }
248
249 if (0 != streamID) {
250 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
251
252 --streamID; // map to array index
253
254 if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
255 data->_converters[streamID] = NULL;
256 if (data->_nextSlot > streamID) data->_nextSlot = streamID;
257 }
258 }
259
260 ucnv_close(converter);
261
262 return 0;
263 }
264
// Element count of the on-stack scratch buffers used by the conversion
// routines below when the caller supplies no output buffer.
#define MAX_BUFFER_SIZE (1000)

// The workarounds enabled below were checked against ICU 4.0 and earlier.
// Building against anything newer emits a reminder to re-verify them.
#if !((U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 0)))
#warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
#endif

// rdar://problem/6024743: the source pointer is left after the illegal input.
#define HAS_ICU_BUG_6024743 (1)
// rdar://problem/6025527: the invalid-chars length counts a trailing '\0'.
#define HAS_ICU_BUG_6025527 (1)
272
273 __private_extern__ CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
274 UConverter *converter;
275 UErrorCode errorCode = U_ZERO_ERROR;
276 const UTF16Char *source = characters;
277 const UTF16Char *sourceLimit = source + numChars;
278 char *destination = (char *)bytes;
279 const char *destinationLimit = destination + maxByteLen;
280 bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
281 CFIndex status;
282
283 if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;
284
285 if (0 == maxByteLen) {
286 char buffer[MAX_BUFFER_SIZE];
287 CFIndex totalLength = 0;
288
289 while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
290 destination = buffer;
291 destinationLimit = destination + MAX_BUFFER_SIZE;
292
293 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
294
295 totalLength += (destination - buffer);
296
297 if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
298 }
299
300 if (NULL != usedByteLen) *usedByteLen = totalLength;
301 } else {
302 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
303
304 if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
305 }
306
307 status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
308
309 if (NULL != usedCharLen) {
310 #if HAS_ICU_BUG_6024743
311 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
312 if (kCFStringEncodingInvalidInputStream == status) {
313 #define MAX_ERROR_BUFFER_LEN (32)
314 UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
315 int8_t errorLength = MAX_ERROR_BUFFER_LEN;
316 #undef MAX_ERROR_BUFFER_LEN
317
318 errorCode = U_ZERO_ERROR;
319
320 ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);
321
322 if (U_ZERO_ERROR == errorCode) {
323 source -= errorLength;
324 } else {
325 // Gah, something is terribly wrong. Reset everything
326 source = characters; // 0 length
327 if (NULL != usedByteLen) *usedByteLen = 0;
328 }
329 }
330 #endif
331 *usedCharLen = source - characters;
332 }
333
334 status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
335
336 return status;
337 }
338
339 __private_extern__ CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
340 UConverter *converter;
341 UErrorCode errorCode = U_ZERO_ERROR;
342 const char *source = (const char *)bytes;
343 const char *sourceLimit = source + numBytes;
344 UTF16Char *destination = characters;
345 const UTF16Char *destinationLimit = destination + maxCharLen;
346 bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
347 CFIndex status;
348
349 if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;
350
351 if (0 == maxCharLen) {
352 UTF16Char buffer[MAX_BUFFER_SIZE];
353 CFIndex totalLength = 0;
354
355 while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
356 destination = buffer;
357 destinationLimit = destination + MAX_BUFFER_SIZE;
358
359 ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
360
361 totalLength += (destination - buffer);
362
363 if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
364 }
365
366 if (NULL != usedCharLen) *usedCharLen = totalLength;
367 } else {
368 ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
369
370 if (NULL != usedCharLen) *usedCharLen = destination - characters;
371 }
372
373 status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
374
375 if (NULL != usedByteLen) {
376 #if HAS_ICU_BUG_6024743
377 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
378 if (kCFStringEncodingInvalidInputStream == status) {
379 #define MAX_ERROR_BUFFER_LEN (32)
380 char errorBuffer[MAX_ERROR_BUFFER_LEN];
381 int8_t errorLength = MAX_ERROR_BUFFER_LEN;
382 #undef MAX_ERROR_BUFFER_LEN
383
384 errorCode = U_ZERO_ERROR;
385
386 ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
387
388 if (U_ZERO_ERROR == errorCode) {
389 #if HAS_ICU_BUG_6025527
390 // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
391 if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
392 #endif
393 source -= errorLength;
394 } else {
395 // Gah, something is terribly wrong. Reset everything
396 source = (const char *)bytes; // 0 length
397 if (NULL != usedCharLen) *usedCharLen = 0;
398 }
399 }
400 #endif
401
402 *usedByteLen = source - (const char *)bytes;
403 }
404
405 status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
406
407 return status;
408 }
409
410 __private_extern__ CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
411 CFIndex usedCharLen;
412 return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0);
413 }
414
415 __private_extern__ CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
416 CFIndex usedByteLen;
417 return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0);
418 }
419
420 __private_extern__ CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
421 CFIndex count = ucnv_countAvailable();
422 CFIndex numEncodings = 0;
423 CFStringEncoding *encodings;
424 CFStringEncoding encoding;
425 CFIndex index;
426
427 if (0 == count) return NULL;
428
429 encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0);
430
431 for (index = 0;index < count;index++) {
432 encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index));
433
434 if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
435 }
436
437 if (0 == numEncodings) {
438 CFAllocatorDeallocate(allocator, encodings);
439 encodings = NULL;
440 }
441
442 *numberOfIndex = numEncodings;
443
444 return encodings;
445 }