]> git.saurik.com Git - apple/cf.git/blob - CFICUConverters.c
CF-1152.14.tar.gz
[apple/cf.git] / CFICUConverters.c
1 /*
2 * Copyright (c) 2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24 /* CFICUConverters.c
25 Copyright (c) 2004-2014, Apple Inc. All rights reserved.
26 Responsibility: Aki Inoue
27 */
28
29 #include "CFStringEncodingDatabase.h"
30 #include "CFStringEncodingConverterPriv.h"
31 #include "CFICUConverters.h"
32 #include <CoreFoundation/CFStringEncodingExt.h>
33 #include <CoreFoundation/CFUniChar.h>
34 #include <unicode/ucnv.h>
35 #include <unicode/uversion.h>
36 #include "CFInternal.h"
37 #include <stdio.h>
38
39 // Thread data support
40 typedef struct {
41 uint8_t _numSlots;
42 uint8_t _nextSlot;
43 UConverter **_converters;
44 } __CFICUThreadData;
45
46 static void __CFICUThreadDataDestructor(void *context) {
47 __CFICUThreadData * data = (__CFICUThreadData *)context;
48
49 if (NULL != data->_converters) { // scan to make sure deallocation
50 UConverter **converter = data->_converters;
51 UConverter **limit = converter + data->_numSlots;
52
53 while (converter < limit) {
54 if (NULL != converter) ucnv_close(*converter);
55 ++converter;
56 }
57 CFAllocatorDeallocate(NULL, data->_converters);
58 }
59
60 CFAllocatorDeallocate(NULL, data);
61 }
62
63 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
64 __CFICUThreadData * data;
65
66 data = (__CFICUThreadData *)_CFGetTSD(__CFTSDKeyICUConverter);
67
68 if (NULL == data) {
69 data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
70 memset(data, 0, sizeof(__CFICUThreadData));
71 _CFSetTSD(__CFTSDKeyICUConverter, (void *)data, __CFICUThreadDataDestructor);
72 }
73
74 return data;
75 }
76
77 CF_PRIVATE const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
78 #define STACK_BUFFER_SIZE (60)
79 char buffer[STACK_BUFFER_SIZE];
80 const char *result = NULL;
81 UErrorCode errorCode = U_ZERO_ERROR;
82 uint32_t codepage = 0;
83
84 if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";
85
86 if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows
87
88 if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result;
89
90 if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);
91
92 return result;
93 #undef STACK_BUFFER_SIZE
94 }
95
96 CF_PRIVATE CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
97 uint32_t codepage;
98 char *endPtr;
99 UErrorCode errorCode = U_ZERO_ERROR;
100
101 if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage);
102
103 if (0 != ucnv_countAliases(icuName, &errorCode)) {
104 CFStringEncoding encoding;
105 const char *name;
106
107 // Try WINDOWS platform
108 name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);
109
110 if (NULL != name) {
111 if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage);
112
113 if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
114 }
115
116 // Try JAVA platform
117 name = ucnv_getStandardName(icuName, "JAVA", &errorCode);
118 if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
119
120 // Try MIME platform
121 name = ucnv_getStandardName(icuName, "MIME", &errorCode);
122 if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
123 }
124
125 return kCFStringEncodingInvalidId;
126 }
127
128 CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
129 UConverter *converter;
130 UErrorCode errorCode = U_ZERO_ERROR;
131 uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
132
133 if (0 != streamID) { // this is a part of streaming previously created
134 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
135
136 --streamID; // map to array index
137
138 if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID];
139 }
140
141 converter = ucnv_open(icuName, &errorCode);
142
143 if (NULL != converter) {
144 char lossyByte = CFStringEncodingMaskToLossyByte(flags);
145
146 if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?';
147
148 if (0 ==lossyByte) {
149 if (toUnicode) {
150 ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
151 } else {
152 ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
153 }
154 } else {
155 ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode);
156 }
157 }
158
159 return converter;
160 }
161
162 #define ICU_CONVERTER_SLOT_INCREMENT (10)
163 #define ICU_CONVERTER_MAX_SLOT (255)
164
165 static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
166 uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
167
168 if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
169 if (0 == streamID) {
170 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
171
172 if (NULL == data->_converters) {
173 data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
174 memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
175 data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
176 data->_nextSlot = 0;
177 } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one
178 CFIndex index;
179
180 for (index = 0;index < data->_numSlots;index++) {
181 if (NULL == data->_converters[index]) {
182 data->_nextSlot = index;
183 break;
184 }
185 }
186
187 if (index >= data->_numSlots) { // we're full
188 UConverter **newConverters;
189 CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;
190
191 if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
192 CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
193 ucnv_close(converter);
194 return 0;
195 }
196
197 newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
198 memset(newConverters, 0, sizeof(UConverter *) * newSize);
199 memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
200 CFAllocatorDeallocate(NULL, data->_converters);
201 data->_converters = newConverters;
202 data->_nextSlot = data->_numSlots;
203 data->_numSlots = newSize;
204 }
205 }
206
207 data->_converters[data->_nextSlot] = converter;
208 streamID = data->_nextSlot + 1;
209
210 // now find next slot
211 ++data->_nextSlot;
212
213 if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
214 data->_nextSlot = 0;
215
216 while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
217 }
218 }
219
220 return CFStringEncodingStreamIDToMask(streamID);
221 }
222
223 if (0 != streamID) {
224 __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
225
226 --streamID; // map to array index
227
228 if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
229 data->_converters[streamID] = NULL;
230 if (data->_nextSlot > streamID) data->_nextSlot = streamID;
231 }
232 }
233
234 ucnv_close(converter);
235
236 return 0;
237 }
238
239 #define MAX_BUFFER_SIZE (1000)
240
241 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
242 #if 0
243 // we're no longer doing this check. Revive when the status in the bug changed.
244 #if (U_ICU_VERSION_MAJOR_NUM > 49)
245 #warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
246 #endif
247 #endif
248 #endif
249 #define HAS_ICU_BUG_6024743 (1)
250 #define HAS_ICU_BUG_6025527 (1)
251
252 CF_PRIVATE CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
253 UConverter *converter;
254 UErrorCode errorCode = U_ZERO_ERROR;
255 const UTF16Char *source = characters;
256 const UTF16Char *sourceLimit = source + numChars;
257 char *destination = (char *)bytes;
258 const char *destinationLimit = destination + maxByteLen;
259 bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
260 CFIndex status;
261
262 if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;
263
264 if (0 == maxByteLen) {
265 char buffer[MAX_BUFFER_SIZE];
266 CFIndex totalLength = 0;
267
268 while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
269 destination = buffer;
270 destinationLimit = destination + MAX_BUFFER_SIZE;
271
272 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
273
274 totalLength += (destination - buffer);
275
276 if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
277 }
278
279 if (NULL != usedByteLen) *usedByteLen = totalLength;
280 } else {
281 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
282
283 #if HAS_ICU_BUG_6024743
284 /* Another critical ICU design issue. Similar to conversion error, source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding <rdar://problem/7183045> Intrestingly, this issue doesn't apply to ucnv_toUnicode. The asynmmetric nature makes this more dangerous */
285 if (U_BUFFER_OVERFLOW_ERROR == errorCode) {
286 const uint8_t *bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
287 const uint8_t *nonBase;
288 UTF32Char character;
289
290 do {
291 // Since the output buffer is filled, we can assume no invalid chars (including stray surrogates)
292 do {
293 sourceLimit = (source - 1);
294 character = *sourceLimit;
295 nonBase = bitmap;
296
297 if (CFUniCharIsSurrogateLowCharacter(character)) {
298 --sourceLimit;
299 character = CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit, character);
300 nonBase = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, (character >> 16) & 0x000F);
301 character &= 0xFFFF;
302 }
303 } while ((sourceLimit > characters) && CFUniCharIsMemberOfBitmap(character, nonBase));
304
305 if (sourceLimit > characters) {
306 source = characters;
307 destination = (char *)bytes;
308 errorCode = U_ZERO_ERROR;
309
310 ucnv_resetFromUnicode(converter);
311
312 ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
313 }
314 } while (U_BUFFER_OVERFLOW_ERROR == errorCode);
315
316 errorCode = U_BUFFER_OVERFLOW_ERROR;
317 }
318 #endif
319 if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
320 }
321
322 status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
323
324 if (NULL != usedCharLen) {
325 #if HAS_ICU_BUG_6024743
326 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
327 if (kCFStringEncodingInvalidInputStream == status) {
328 #define MAX_ERROR_BUFFER_LEN (32)
329 UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
330 int8_t errorLength = MAX_ERROR_BUFFER_LEN;
331 #undef MAX_ERROR_BUFFER_LEN
332
333 errorCode = U_ZERO_ERROR;
334
335 ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);
336
337 if (U_ZERO_ERROR == errorCode) {
338 source -= errorLength;
339 } else {
340 // Gah, something is terribly wrong. Reset everything
341 source = characters; // 0 length
342 if (NULL != usedByteLen) *usedByteLen = 0;
343 }
344 }
345 #endif
346 *usedCharLen = source - characters;
347 }
348
349 status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
350
351 return status;
352 }
353
354 CF_PRIVATE CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
355 UConverter *converter;
356 UErrorCode errorCode = U_ZERO_ERROR;
357 const char *source = (const char *)bytes;
358 const char *sourceLimit = source + numBytes;
359 UTF16Char *destination = characters;
360 const UTF16Char *destinationLimit = destination + maxCharLen;
361 bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
362 CFIndex status;
363
364 if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;
365
366 if (0 == maxCharLen) {
367 UTF16Char buffer[MAX_BUFFER_SIZE];
368 CFIndex totalLength = 0;
369
370 while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
371 destination = buffer;
372 destinationLimit = destination + MAX_BUFFER_SIZE;
373
374 ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
375
376 totalLength += (destination - buffer);
377
378 if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
379 }
380
381 if (NULL != usedCharLen) *usedCharLen = totalLength;
382 } else {
383 ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
384
385 if (NULL != usedCharLen) *usedCharLen = destination - characters;
386 }
387
388 status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
389
390 if (NULL != usedByteLen) {
391 #if HAS_ICU_BUG_6024743
392 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
393 if (kCFStringEncodingInvalidInputStream == status) {
394 #define MAX_ERROR_BUFFER_LEN (32)
395 char errorBuffer[MAX_ERROR_BUFFER_LEN];
396 int8_t errorLength = MAX_ERROR_BUFFER_LEN;
397 #undef MAX_ERROR_BUFFER_LEN
398
399 errorCode = U_ZERO_ERROR;
400
401 ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
402
403 if (U_ZERO_ERROR == errorCode) {
404 #if HAS_ICU_BUG_6025527
405 // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
406 if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
407 #endif
408 source -= errorLength;
409 } else {
410 // Gah, something is terribly wrong. Reset everything
411 source = (const char *)bytes; // 0 length
412 if (NULL != usedCharLen) *usedCharLen = 0;
413 }
414 }
415 #endif
416
417 *usedByteLen = source - (const char *)bytes;
418 }
419
420 status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
421
422 return status;
423 }
424
425 CF_PRIVATE CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
426 CFIndex usedCharLen;
427 return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0);
428 }
429
430 CF_PRIVATE CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
431 CFIndex usedByteLen;
432 return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0);
433 }
434
435 CF_PRIVATE CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
436 CFIndex count = ucnv_countAvailable();
437 CFIndex numEncodings = 0;
438 CFStringEncoding *encodings;
439 CFStringEncoding encoding;
440 CFIndex index;
441
442 if (0 == count) return NULL;
443
444 encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0);
445
446 for (index = 0;index < count;index++) {
447 encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index));
448
449 if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
450 }
451
452 if (0 == numEncodings) {
453 CFAllocatorDeallocate(allocator, encodings);
454 encodings = NULL;
455 }
456
457 *numberOfIndex = numEncodings;
458
459 return encodings;
460 }