]> git.saurik.com Git - apple/cf.git/blob - CFICUConverters.c
CF-635.15.tar.gz
[apple/cf.git] / CFICUConverters.c
1 /*
2 * Copyright (c) 2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /* CFICUConverters.c
25 Copyright (c) 2004-2011, Apple Inc. All rights reserved.
26 Responsibility: Aki Inoue
27 */
28
29 #include "CFStringEncodingDatabase.h"
30 #include "CFStringEncodingConverterPriv.h"
31 #include "CFICUConverters.h"
32 #include <CoreFoundation/CFStringEncodingExt.h>
33 #include <CoreFoundation/CFUniChar.h>
34 #include <unicode/ucnv.h>
35 #include <unicode/uversion.h>
36 #include "CFInternal.h"
37 #include <stdio.h>
38
39 // Thread data support
40 typedef struct {
41 uint8_t _numSlots;
42 uint8_t _nextSlot;
43 UConverter **_converters;
44 } __CFICUThreadData;
45
46 static void __CFICUThreadDataDestructor(void *context) {
47 __CFICUThreadData * data = (__CFICUThreadData *)context;
48
49 if (NULL != data->_converters) { // scan to make sure deallocation
50 UConverter **converter = data->_converters;
51 UConverter **limit = converter + data->_numSlots;
52
53 while (converter < limit) {
54 if (NULL != converter) ucnv_close(*converter);
55 ++converter;
56 }
57 CFAllocatorDeallocate(NULL, data->_converters);
58 }
59
60 CFAllocatorDeallocate(NULL, data);
61 }
62
63 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
64 __CFICUThreadData * data;
65
66 data = (__CFICUThreadData *)_CFGetTSD(__CFTSDKeyICUConverter);
67
68 if (NULL == data) {
69 data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
70 memset(data, 0, sizeof(__CFICUThreadData));
71 _CFSetTSD(__CFTSDKeyICUConverter, (void *)data, __CFICUThreadDataDestructor);
72 }
73
74 return data;
75 }
76
77 __private_extern__ const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
78 #define STACK_BUFFER_SIZE (60)
79 char buffer[STACK_BUFFER_SIZE];
80 const char *result = NULL;
81 UErrorCode errorCode = U_ZERO_ERROR;
82 uint32_t codepage = 0;
83
84 if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";
85
86 if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows
87
88 if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result;
89
90 if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);
91
92 return result;
93 #undef STACK_BUFFER_SIZE
94 }
95
96 __private_extern__ CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
97 uint32_t codepage;
98 UErrorCode errorCode = U_ZERO_ERROR;
99
100 if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), NULL, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage);
101
102 if (0 != ucnv_countAliases(icuName, &errorCode)) {
103 CFStringEncoding encoding;
104 const char *name;
105
106 // Try WINDOWS platform
107 name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);
108
109 if (NULL != name) {
110 if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), NULL, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage);
111
112 if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
113 }
114
115 // Try JAVA platform
116 name = ucnv_getStandardName(icuName, "JAVA", &errorCode);
117 if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
118
119 // Try MIME platform
120 name = ucnv_getStandardName(icuName, "MIME", &errorCode);
121 if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
122 }
123
124 return kCFStringEncodingInvalidId;
125 }
126
127 CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
128 UConverter *converter;
129 UErrorCode errorCode = U_ZERO_ERROR;
130 uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
131
132 if (0 != streamID) { // this is a part of streaming previously created
133 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
134
135 --streamID; // map to array index
136
137 if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID];
138 }
139
140 converter = ucnv_open(icuName, &errorCode);
141
142 if (NULL != converter) {
143 char lossyByte = CFStringEncodingMaskToLossyByte(flags);
144
145 if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?';
146
147 if (0 ==lossyByte) {
148 if (toUnicode) {
149 ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
150 } else {
151 ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
152 }
153 } else {
154 ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode);
155 }
156 }
157
158 return converter;
159 }
160
161 #define ICU_CONVERTER_SLOT_INCREMENT (10)
162 #define ICU_CONVERTER_MAX_SLOT (255)
163
164 static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
165 uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
166
167 if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
168 if (0 == streamID) {
169 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
170
171 if (NULL == data->_converters) {
172 data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
173 memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
174 data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
175 data->_nextSlot = 0;
176 } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one
177 CFIndex index;
178
179 for (index = 0;index < data->_numSlots;index++) {
180 if (NULL == data->_converters[index]) {
181 data->_nextSlot = index;
182 break;
183 }
184 }
185
186 if (index >= data->_numSlots) { // we're full
187 UConverter **newConverters;
188 CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;
189
190 if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
191 CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
192 ucnv_close(converter);
193 return 0;
194 }
195
196 newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
197 memset(newConverters, 0, sizeof(UConverter *) * newSize);
198 memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
199 CFAllocatorDeallocate(NULL, data->_converters);
200 data->_converters = newConverters;
201 data->_nextSlot = data->_numSlots;
202 data->_numSlots = newSize;
203 }
204 }
205
206 data->_converters[data->_nextSlot] = converter;
207 streamID = data->_nextSlot + 1;
208
209 // now find next slot
210 ++data->_nextSlot;
211
212 if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
213 data->_nextSlot = 0;
214
215 while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
216 }
217 }
218
219 return CFStringEncodingStreamIDToMask(streamID);
220 }
221
222 if (0 != streamID) {
223 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
224
225 --streamID; // map to array index
226
227 if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
228 data->_converters[streamID] = NULL;
229 if (data->_nextSlot > streamID) data->_nextSlot = streamID;
230 }
231 }
232
233 ucnv_close(converter);
234
235 return 0;
236 }
237
238 #define MAX_BUFFER_SIZE (1000)
239
240 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
241 #if (U_ICU_VERSION_MAJOR_NUM > 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM > 6))
242 #warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
243 #endif
244 #endif
245 #define HAS_ICU_BUG_6024743 (1)
246 #define HAS_ICU_BUG_6025527 (1)
247
248 __private_extern__ CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
249 UConverter *converter;
250 UErrorCode errorCode = U_ZERO_ERROR;
251 const UTF16Char *source = characters;
252 const UTF16Char *sourceLimit = source + numChars;
253 char *destination = (char *)bytes;
254 const char *destinationLimit = destination + maxByteLen;
255 bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
256 CFIndex status;
257
258 if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;
259
260 if (0 == maxByteLen) {
261 char buffer[MAX_BUFFER_SIZE];
262 CFIndex totalLength = 0;
263
264 while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
265 destination = buffer;
266 destinationLimit = destination + MAX_BUFFER_SIZE;
267
268 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
269
270 totalLength += (destination - buffer);
271
272 if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
273 }
274
275 if (NULL != usedByteLen) *usedByteLen = totalLength;
276 } else {
277 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
278
279 #if HAS_ICU_BUG_6024743
280 /* Another critical ICU design issue. Similar to conversion error, source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding <rdar://problem/7183045> Intrestingly, this issue doesn't apply to ucnv_toUnicode. The asynmmetric nature makes this more dangerous */
281 if (U_BUFFER_OVERFLOW_ERROR == errorCode) {
282 const uint8_t *bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
283 const uint8_t *nonBase;
284 UTF32Char character;
285
286 do {
287 // Since the output buffer is filled, we can assume no invalid chars (including stray surrogates)
288 do {
289 sourceLimit = (source - 1);
290 character = *sourceLimit;
291 nonBase = bitmap;
292
293 if (CFUniCharIsSurrogateLowCharacter(character)) {
294 --sourceLimit;
295 character = CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit, character);
296 nonBase = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, (character >> 16) & 0x000F);
297 character &= 0xFFFF;
298 }
299 } while ((sourceLimit > characters) && CFUniCharIsMemberOfBitmap(character, nonBase));
300
301 if (sourceLimit > characters) {
302 source = characters;
303 destination = (char *)bytes;
304 errorCode = U_ZERO_ERROR;
305
306 ucnv_resetFromUnicode(converter);
307
308 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
309 }
310 } while (U_BUFFER_OVERFLOW_ERROR == errorCode);
311
312 errorCode = U_BUFFER_OVERFLOW_ERROR;
313 }
314 #endif
315 if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
316 }
317
318 status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
319
320 if (NULL != usedCharLen) {
321 #if HAS_ICU_BUG_6024743
322 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
323 if (kCFStringEncodingInvalidInputStream == status) {
324 #define MAX_ERROR_BUFFER_LEN (32)
325 UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
326 int8_t errorLength = MAX_ERROR_BUFFER_LEN;
327 #undef MAX_ERROR_BUFFER_LEN
328
329 errorCode = U_ZERO_ERROR;
330
331 ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);
332
333 if (U_ZERO_ERROR == errorCode) {
334 source -= errorLength;
335 } else {
336 // Gah, something is terribly wrong. Reset everything
337 source = characters; // 0 length
338 if (NULL != usedByteLen) *usedByteLen = 0;
339 }
340 }
341 #endif
342 *usedCharLen = source - characters;
343 }
344
345 status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
346
347 return status;
348 }
349
350 __private_extern__ CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
351 UConverter *converter;
352 UErrorCode errorCode = U_ZERO_ERROR;
353 const char *source = (const char *)bytes;
354 const char *sourceLimit = source + numBytes;
355 UTF16Char *destination = characters;
356 const UTF16Char *destinationLimit = destination + maxCharLen;
357 bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
358 CFIndex status;
359
360 if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;
361
362 if (0 == maxCharLen) {
363 UTF16Char buffer[MAX_BUFFER_SIZE];
364 CFIndex totalLength = 0;
365
366 while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
367 destination = buffer;
368 destinationLimit = destination + MAX_BUFFER_SIZE;
369
370 ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
371
372 totalLength += (destination - buffer);
373
374 if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
375 }
376
377 if (NULL != usedCharLen) *usedCharLen = totalLength;
378 } else {
379 ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
380
381 if (NULL != usedCharLen) *usedCharLen = destination - characters;
382 }
383
384 status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
385
386 if (NULL != usedByteLen) {
387 #if HAS_ICU_BUG_6024743
388 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
389 if (kCFStringEncodingInvalidInputStream == status) {
390 #define MAX_ERROR_BUFFER_LEN (32)
391 char errorBuffer[MAX_ERROR_BUFFER_LEN];
392 int8_t errorLength = MAX_ERROR_BUFFER_LEN;
393 #undef MAX_ERROR_BUFFER_LEN
394
395 errorCode = U_ZERO_ERROR;
396
397 ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
398
399 if (U_ZERO_ERROR == errorCode) {
400 #if HAS_ICU_BUG_6025527
401 // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
402 if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
403 #endif
404 source -= errorLength;
405 } else {
406 // Gah, something is terribly wrong. Reset everything
407 source = (const char *)bytes; // 0 length
408 if (NULL != usedCharLen) *usedCharLen = 0;
409 }
410 }
411 #endif
412
413 *usedByteLen = source - (const char *)bytes;
414 }
415
416 status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
417
418 return status;
419 }
420
421 __private_extern__ CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
422 CFIndex usedCharLen;
423 return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0);
424 }
425
426 __private_extern__ CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
427 CFIndex usedByteLen;
428 return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0);
429 }
430
431 __private_extern__ CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
432 CFIndex count = ucnv_countAvailable();
433 CFIndex numEncodings = 0;
434 CFStringEncoding *encodings;
435 CFStringEncoding encoding;
436 CFIndex index;
437
438 if (0 == count) return NULL;
439
440 encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0);
441
442 for (index = 0;index < count;index++) {
443 encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index));
444
445 if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
446 }
447
448 if (0 == numEncodings) {
449 CFAllocatorDeallocate(allocator, encodings);
450 encodings = NULL;
451 }
452
453 *numberOfIndex = numEncodings;
454
455 return encodings;
456 }