]> git.saurik.com Git - apple/cf.git/blob - CFICUConverters.c
CF-550.13.tar.gz
[apple/cf.git] / CFICUConverters.c
1 /*
2 * Copyright (c) 2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /*
25 * CFICUConverters.c
26 * CoreFoundation
27 *
28 * Created by Aki Inoue on 07/12/04.
29 * Copyright 2007-2009, Apple Inc. All rights reserved.
30 *
31 */
32
33 #include "CFStringEncodingDatabase.h"
34 #include "CFStringEncodingConverterPriv.h"
35 #include "CFICUConverters.h"
36 #include <CoreFoundation/CFStringEncodingExt.h>
37 #include <unicode/ucnv.h>
38 #include <unicode/uversion.h>
39 #include "CFInternal.h"
40 #include <stdio.h>
41
42 #if DEPLOYMENT_TARGET_WINDOWS
43 #define strncasecmp_l(a, b, c, d) _strnicmp(a, b, c)
44 #define snprintf _snprintf
45 #endif
46
47 // Thread data support
48 typedef struct {
49 uint8_t _numSlots;
50 uint8_t _nextSlot;
51 UConverter **_converters;
52 } __CFICUThreadData;
53
54 static void __CFICUThreadDataDestructor(void *context) {
55 __CFICUThreadData * data = (__CFICUThreadData *)context;
56
57 if (NULL != data->_converters) { // scan to make sure deallocation
58 UConverter **converter = data->_converters;
59 UConverter **limit = converter + data->_numSlots;
60
61 while (converter < limit) {
62 if (NULL != converter) ucnv_close(*converter);
63 ++converter;
64 }
65 CFAllocatorDeallocate(NULL, data->_converters);
66 }
67
68 CFAllocatorDeallocate(NULL, data);
69 }
70
71 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
72 #import <pthread.h>
73
74 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
75 __CFICUThreadData * data;
76
77 pthread_key_init_np(__CFTSDKeyICUConverter, __CFICUThreadDataDestructor);
78 data = (__CFICUThreadData *)pthread_getspecific(__CFTSDKeyICUConverter);
79
80 if (NULL == data) {
81 data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
82 memset(data, 0, sizeof(__CFICUThreadData));
83 pthread_setspecific(__CFTSDKeyICUConverter, (const void *)data);
84 }
85
86 return data;
87 }
88 #elif DEPLOYMENT_TARGET_WINDOWS
89 __private_extern__ void __CFStringEncodingICUThreadDataCleaner(void *context) { __CFICUThreadDataDestructor(context); }
90
91 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
92 __CFThreadSpecificData *threadData = __CFGetThreadSpecificData_inline();
93
94 if (NULL == threadData->_icuThreadData) {
95 threadData->_icuThreadData = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
96 memset(threadData->_icuThreadData, 0, sizeof(__CFICUThreadData));
97 }
98
99 return (__CFICUThreadData *)threadData->_icuThreadData;
100 }
101 #else
102 #error Need implementation for thread data
103 #endif
104
105 __private_extern__ const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
106 #define STACK_BUFFER_SIZE (60)
107 char buffer[STACK_BUFFER_SIZE];
108 const char *result = NULL;
109 UErrorCode errorCode = U_ZERO_ERROR;
110 uint32_t codepage = 0;
111
112 if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";
113
114 if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows
115
116 if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result;
117
118 if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);
119
120 return result;
121 #undef STACK_BUFFER_SIZE
122 }
123
124 __private_extern__ CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
125 uint32_t codepage;
126 UErrorCode errorCode = U_ZERO_ERROR;
127
128 if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), NULL, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage);
129
130 if (0 != ucnv_countAliases(icuName, &errorCode)) {
131 CFStringEncoding encoding;
132 const char *name;
133
134 // Try WINDOWS platform
135 name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);
136
137 if (NULL != name) {
138 if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), NULL, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage);
139
140 if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
141 }
142
143 // Try JAVA platform
144 name = ucnv_getStandardName(icuName, "JAVA", &errorCode);
145 if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
146
147 // Try MIME platform
148 name = ucnv_getStandardName(icuName, "MIME", &errorCode);
149 if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
150 }
151
152 return kCFStringEncodingInvalidId;
153 }
154
155 CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
156 UConverter *converter;
157 UErrorCode errorCode = U_ZERO_ERROR;
158 uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
159
160 if (0 != streamID) { // this is a part of streaming previously created
161 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
162
163 --streamID; // map to array index
164
165 if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID];
166 }
167
168 converter = ucnv_open(icuName, &errorCode);
169
170 if (NULL != converter) {
171 char lossyByte = CFStringEncodingMaskToLossyByte(flags);
172
173 if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?';
174
175 if (0 ==lossyByte) {
176 if (toUnicode) {
177 ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
178 } else {
179 ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
180 }
181 } else {
182 ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode);
183 }
184 }
185
186 return converter;
187 }
188
189 #define ICU_CONVERTER_SLOT_INCREMENT (10)
190 #define ICU_CONVERTER_MAX_SLOT (255)
191
192 static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
193 uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
194
195 if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
196 if (0 == streamID) {
197 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
198
199 if (NULL == data->_converters) {
200 data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
201 memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
202 data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
203 data->_nextSlot = 0;
204 } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one
205 CFIndex index;
206
207 for (index = 0;index < data->_numSlots;index++) {
208 if (NULL == data->_converters[index]) {
209 data->_nextSlot = index;
210 break;
211 }
212 }
213
214 if (index >= data->_numSlots) { // we're full
215 UConverter **newConverters;
216 CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;
217
218 if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
219 CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
220 ucnv_close(converter);
221 return 0;
222 }
223
224 newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
225 memset(newConverters, 0, sizeof(UConverter *) * newSize);
226 memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
227 CFAllocatorDeallocate(NULL, data->_converters);
228 data->_converters = newConverters;
229 data->_nextSlot = data->_numSlots;
230 data->_numSlots = newSize;
231 }
232 }
233
234 data->_converters[data->_nextSlot] = converter;
235 streamID = data->_nextSlot + 1;
236
237 // now find next slot
238 ++data->_nextSlot;
239
240 if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
241 data->_nextSlot = 0;
242
243 while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
244 }
245 }
246
247 return CFStringEncodingStreamIDToMask(streamID);
248 }
249
250 if (0 != streamID) {
251 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
252
253 --streamID; // map to array index
254
255 if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
256 data->_converters[streamID] = NULL;
257 if (data->_nextSlot > streamID) data->_nextSlot = streamID;
258 }
259 }
260
261 ucnv_close(converter);
262
263 return 0;
264 }
265
266 #define MAX_BUFFER_SIZE (1000)
267
268 #if (U_ICU_VERSION_MAJOR_NUM > 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM > 0))
269 #warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
270 #endif
271 #define HAS_ICU_BUG_6024743 (1)
272 #define HAS_ICU_BUG_6025527 (1)
273
274 __private_extern__ CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
275 UConverter *converter;
276 UErrorCode errorCode = U_ZERO_ERROR;
277 const UTF16Char *source = characters;
278 const UTF16Char *sourceLimit = source + numChars;
279 char *destination = (char *)bytes;
280 const char *destinationLimit = destination + maxByteLen;
281 bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
282 CFIndex status;
283
284 if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;
285
286 if (0 == maxByteLen) {
287 char buffer[MAX_BUFFER_SIZE];
288 CFIndex totalLength = 0;
289
290 while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
291 destination = buffer;
292 destinationLimit = destination + MAX_BUFFER_SIZE;
293
294 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
295
296 totalLength += (destination - buffer);
297
298 if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
299 }
300
301 if (NULL != usedByteLen) *usedByteLen = totalLength;
302 } else {
303 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
304
305 if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
306 }
307
308 status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
309
310 if (NULL != usedCharLen) {
311 #if HAS_ICU_BUG_6024743
312 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
313 if (kCFStringEncodingInvalidInputStream == status) {
314 #define MAX_ERROR_BUFFER_LEN (32)
315 UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
316 int8_t errorLength = MAX_ERROR_BUFFER_LEN;
317 #undef MAX_ERROR_BUFFER_LEN
318
319 errorCode = U_ZERO_ERROR;
320
321 ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);
322
323 if (U_ZERO_ERROR == errorCode) {
324 source -= errorLength;
325 } else {
326 // Gah, something is terribly wrong. Reset everything
327 source = characters; // 0 length
328 if (NULL != usedByteLen) *usedByteLen = 0;
329 }
330 }
331 #endif
332 *usedCharLen = source - characters;
333 }
334
335 status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
336
337 return status;
338 }
339
340 __private_extern__ CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
341 UConverter *converter;
342 UErrorCode errorCode = U_ZERO_ERROR;
343 const char *source = (const char *)bytes;
344 const char *sourceLimit = source + numBytes;
345 UTF16Char *destination = characters;
346 const UTF16Char *destinationLimit = destination + maxCharLen;
347 bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
348 CFIndex status;
349
350 if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;
351
352 if (0 == maxCharLen) {
353 UTF16Char buffer[MAX_BUFFER_SIZE];
354 CFIndex totalLength = 0;
355
356 while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
357 destination = buffer;
358 destinationLimit = destination + MAX_BUFFER_SIZE;
359
360 ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
361
362 totalLength += (destination - buffer);
363
364 if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
365 }
366
367 if (NULL != usedCharLen) *usedCharLen = totalLength;
368 } else {
369 ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
370
371 if (NULL != usedCharLen) *usedCharLen = destination - characters;
372 }
373
374 status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
375
376 if (NULL != usedByteLen) {
377 #if HAS_ICU_BUG_6024743
378 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
379 if (kCFStringEncodingInvalidInputStream == status) {
380 #define MAX_ERROR_BUFFER_LEN (32)
381 char errorBuffer[MAX_ERROR_BUFFER_LEN];
382 int8_t errorLength = MAX_ERROR_BUFFER_LEN;
383 #undef MAX_ERROR_BUFFER_LEN
384
385 errorCode = U_ZERO_ERROR;
386
387 ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
388
389 if (U_ZERO_ERROR == errorCode) {
390 #if HAS_ICU_BUG_6025527
391 // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
392 if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
393 #endif
394 source -= errorLength;
395 } else {
396 // Gah, something is terribly wrong. Reset everything
397 source = (const char *)bytes; // 0 length
398 if (NULL != usedCharLen) *usedCharLen = 0;
399 }
400 }
401 #endif
402
403 *usedByteLen = source - (const char *)bytes;
404 }
405
406 status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
407
408 return status;
409 }
410
411 __private_extern__ CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
412 CFIndex usedCharLen;
413 return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0);
414 }
415
416 __private_extern__ CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
417 CFIndex usedByteLen;
418 return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0);
419 }
420
421 __private_extern__ CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
422 CFIndex count = ucnv_countAvailable();
423 CFIndex numEncodings = 0;
424 CFStringEncoding *encodings;
425 CFStringEncoding encoding;
426 CFIndex index;
427
428 if (0 == count) return NULL;
429
430 encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0);
431
432 for (index = 0;index < count;index++) {
433 encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index));
434
435 if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
436 }
437
438 if (0 == numEncodings) {
439 CFAllocatorDeallocate(allocator, encodings);
440 encodings = NULL;
441 }
442
443 *numberOfIndex = numEncodings;
444
445 return encodings;
446 }