2 * Copyright (c) 2011 Apple Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
25 Copyright (c) 2004-2011, Apple Inc. All rights reserved.
26 Responsibility: Aki Inoue
29 #include "CFStringEncodingDatabase.h"
30 #include "CFStringEncodingConverterPriv.h"
31 #include "CFICUConverters.h"
32 #include <CoreFoundation/CFStringEncodingExt.h>
33 #include <CoreFoundation/CFUniChar.h>
34 #include <unicode/ucnv.h>
35 #include <unicode/uversion.h>
36 #include "CFInternal.h"
39 // Thread data support
/* Table of converter pointers kept per thread; it is allocated lazily and
 * indexed by (stream ID - 1) by the create/release helpers in this file. */
43 UConverter
**_converters
;
46 static void __CFICUThreadDataDestructor(void *context
) {
47 __CFICUThreadData
* data
= (__CFICUThreadData
*)context
;
49 if (NULL
!= data
->_converters
) { // scan to make sure deallocation
50 UConverter
**converter
= data
->_converters
;
51 UConverter
**limit
= converter
+ data
->_numSlots
;
53 while (converter
< limit
) {
54 if (NULL
!= converter
) ucnv_close(*converter
);
57 CFAllocatorDeallocate(NULL
, data
->_converters
);
60 CFAllocatorDeallocate(NULL
, data
);
63 CF_INLINE __CFICUThreadData
*__CFStringEncodingICUGetThreadData() {
64 __CFICUThreadData
* data
;
66 data
= (__CFICUThreadData
*)_CFGetTSD(__CFTSDKeyICUConverter
);
69 data
= (__CFICUThreadData
*)CFAllocatorAllocate(NULL
, sizeof(__CFICUThreadData
), 0);
70 memset(data
, 0, sizeof(__CFICUThreadData
));
71 _CFSetTSD(__CFTSDKeyICUConverter
, (void *)data
, __CFICUThreadDataDestructor
);
77 __private_extern__
const char *__CFStringEncodingGetICUName(CFStringEncoding encoding
) {
78 #define STACK_BUFFER_SIZE (60)
79 char buffer
[STACK_BUFFER_SIZE
];
80 const char *result
= NULL
;
81 UErrorCode errorCode
= U_ZERO_ERROR
;
82 uint32_t codepage
= 0;
84 if (kCFStringEncodingUTF7_IMAP
== encoding
) return "IMAP-mailbox-name";
86 if (kCFStringEncodingUnicode
!= (encoding
& 0x0F00)) codepage
= __CFStringEncodingGetWindowsCodePage(encoding
); // we don't use codepage for UTF to avoid little endian weirdness of Windows
88 if ((0 != codepage
) && (snprintf(buffer
, STACK_BUFFER_SIZE
, "windows-%d", codepage
) < STACK_BUFFER_SIZE
) && (NULL
!= (result
= ucnv_getAlias(buffer
, 0, &errorCode
)))) return result
;
90 if (__CFStringEncodingGetCanonicalName(encoding
, buffer
, STACK_BUFFER_SIZE
)) result
= ucnv_getAlias(buffer
, 0, &errorCode
);
93 #undef STACK_BUFFER_SIZE
96 __private_extern__ CFStringEncoding
__CFStringEncodingGetFromICUName(const char *icuName
) {
98 UErrorCode errorCode
= U_ZERO_ERROR
;
100 if ((0 == strncasecmp_l(icuName
, "windows-", strlen("windows-"), NULL
)) && (0 != (codepage
= strtol(icuName
+ strlen("windows-"), NULL
, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage
);
102 if (0 != ucnv_countAliases(icuName
, &errorCode
)) {
103 CFStringEncoding encoding
;
106 // Try WINDOWS platform
107 name
= ucnv_getStandardName(icuName
, "WINDOWS", &errorCode
);
110 if ((0 == strncasecmp_l(name
, "windows-", strlen("windows-"), NULL
)) && (0 != (codepage
= strtol(name
+ strlen("windows-"), NULL
, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage
);
112 if (strncasecmp_l(icuName
, name
, strlen(name
), NULL
) && (kCFStringEncodingInvalidId
!= (encoding
= __CFStringEncodingGetFromCanonicalName(name
)))) return encoding
;
116 name
= ucnv_getStandardName(icuName
, "JAVA", &errorCode
);
117 if ((NULL
!= name
) && strncasecmp_l(icuName
, name
, strlen(name
), NULL
) && (kCFStringEncodingInvalidId
!= (encoding
= __CFStringEncodingGetFromCanonicalName(name
)))) return encoding
;
120 name
= ucnv_getStandardName(icuName
, "MIME", &errorCode
);
121 if ((NULL
!= name
) && strncasecmp_l(icuName
, name
, strlen(name
), NULL
) && (kCFStringEncodingInvalidId
!= (encoding
= __CFStringEncodingGetFromCanonicalName(name
)))) return encoding
;
124 return kCFStringEncodingInvalidId
;
127 CF_INLINE UConverter
*__CFStringEncodingConverterCreateICUConverter(const char *icuName
, uint32_t flags
, bool toUnicode
) {
128 UConverter
*converter
;
129 UErrorCode errorCode
= U_ZERO_ERROR
;
130 uint8_t streamID
= CFStringEncodingStreamIDFromMask(flags
);
132 if (0 != streamID
) { // this is a part of streaming previously created
133 __CFICUThreadData
*data
= __CFStringEncodingICUGetThreadData();
135 --streamID
; // map to array index
137 if ((streamID
< data
->_numSlots
) && (NULL
!= data
->_converters
[streamID
])) return data
->_converters
[streamID
];
140 converter
= ucnv_open(icuName
, &errorCode
);
142 if (NULL
!= converter
) {
143 char lossyByte
= CFStringEncodingMaskToLossyByte(flags
);
145 if ((0 == lossyByte
) && (0 != (flags
& kCFStringEncodingAllowLossyConversion
))) lossyByte
= '?';
149 ucnv_setToUCallBack(converter
, &UCNV_TO_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &errorCode
);
151 ucnv_setFromUCallBack(converter
, &UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &errorCode
);
154 ucnv_setSubstChars(converter
, &lossyByte
, 1, &errorCode
);
/* Number of slots the per-thread converter table starts with and grows by. */
161 #define ICU_CONVERTER_SLOT_INCREMENT (10)
/* Largest slot count the table may reach; a grow request beyond it is logged and refused. */
162 #define ICU_CONVERTER_MAX_SLOT (255)
/*
 * Disposes of, or parks, the converter used by one conversion call.
 *
 * converter: the converter that was used.
 * flags:     the caller's conversion flags (may carry a stream ID).
 * status:    the conversion status computed by the caller.
 *
 * The visible code stores the converter in the calling thread's slot table and
 * returns the matching stream-ID mask, or clears its slot and closes it with
 * ucnv_close().
 */
164 static CFIndex
__CFStringEncodingConverterReleaseICUConverter(UConverter
*converter
, uint32_t flags
, CFIndex status
) {
165 uint8_t streamID
= CFStringEncodingStreamIDFromMask(flags
);
/* Streaming case: the input was valid and either more input is expected
 * (partial input) or the output buffer ran out while partial output is allowed. */
167 if ((kCFStringEncodingInvalidInputStream
!= status
) && ((0 != (flags
& kCFStringEncodingPartialInput
)) || ((kCFStringEncodingInsufficientOutputBufferLength
== status
) && (0 != (flags
& kCFStringEncodingPartialOutput
))))) {
169 __CFICUThreadData
*data
= __CFStringEncodingICUGetThreadData();
/* First use on this thread: create a zeroed table of ICU_CONVERTER_SLOT_INCREMENT slots. */
171 if (NULL
== data
->_converters
) {
172 data
->_converters
= (UConverter
**)CFAllocatorAllocate(NULL
, sizeof(UConverter
*) * ICU_CONVERTER_SLOT_INCREMENT
, 0);
173 memset(data
->_converters
, 0, sizeof(UConverter
*) * ICU_CONVERTER_SLOT_INCREMENT
);
174 data
->_numSlots
= ICU_CONVERTER_SLOT_INCREMENT
;
176 } else if ((data
->_nextSlot
>= data
->_numSlots
) || (NULL
!= data
->_converters
[data
->_nextSlot
])) { // Need to find one
/* Linear scan for an empty slot. */
179 for (index
= 0;index
< data
->_numSlots
;index
++) {
180 if (NULL
== data
->_converters
[index
]) {
181 data
->_nextSlot
= index
;
186 if (index
>= data
->_numSlots
) { // we're full
187 UConverter
**newConverters
;
188 CFIndex newSize
= data
->_numSlots
+ ICU_CONVERTER_SLOT_INCREMENT
;
/* Refuse to grow beyond ICU_CONVERTER_MAX_SLOT: log and close the converter instead. */
190 if (newSize
> ICU_CONVERTER_MAX_SLOT
) { // something is terribly wrong
191 CFLog(kCFLogLevelError
, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
192 ucnv_close(converter
);
/* Grow: allocate a larger zeroed table, copy the old slots over, free the old table. */
196 newConverters
= (UConverter
**)CFAllocatorAllocate(NULL
, sizeof(UConverter
*) * newSize
, 0);
197 memset(newConverters
, 0, sizeof(UConverter
*) * newSize
);
198 memcpy(newConverters
, data
->_converters
, sizeof(UConverter
*) * data
->_numSlots
);
199 CFAllocatorDeallocate(NULL
, data
->_converters
);
200 data
->_converters
= newConverters
;
201 data
->_nextSlot
= data
->_numSlots
;
202 data
->_numSlots
= newSize
;
/* Park the converter; stream IDs are slot index + 1 so that 0 can mean "no stream". */
206 data
->_converters
[data
->_nextSlot
] = converter
;
207 streamID
= data
->_nextSlot
+ 1;
209 // now find next slot
212 if ((data
->_nextSlot
>= data
->_numSlots
) || (NULL
!= data
->_converters
[data
->_nextSlot
])) {
215 while ((data
->_nextSlot
< data
->_numSlots
) && (NULL
!= data
->_converters
[data
->_nextSlot
])) ++data
->_nextSlot
;
219 return CFStringEncodingStreamIDToMask(streamID
);
/* Non-streaming case: forget the converter's slot (if it had one) and close it. */
223 __CFICUThreadData
*data
= __CFStringEncodingICUGetThreadData();
225 --streamID
; // map to array index
227 if ((streamID
< data
->_numSlots
) && (converter
== data
->_converters
[streamID
])) {
228 data
->_converters
[streamID
] = NULL
;
/* Keep _nextSlot pointing at the lowest slot known to be free. */
229 if (data
->_nextSlot
> streamID
) data
->_nextSlot
= streamID
;
233 ucnv_close(converter
);
/* Element count of the on-stack scratch buffers used when a caller passes a
 * zero-length output buffer and only the converted length is wanted. */
238 #define MAX_BUFFER_SIZE (1000)
/* Warn at build time when the ICU headers are newer than 4.6. */
240 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
241 #if (U_ICU_VERSION_MAJOR_NUM > 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM > 6))
242 #warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
/* Switches for the ICU workarounds guarded by #if in the conversion functions of this file. */
245 #define HAS_ICU_BUG_6024743 (1)
246 #define HAS_ICU_BUG_6025527 (1)
/*
 * Converts UTF-16 text to bytes with the ICU converter named icuName.
 *
 * icuName:     ICU converter name.
 * flags:       conversion flags (partial input/output, lossy byte, stream ID).
 * characters:  UTF-16 input; numChars is its length in UTF-16 units.
 * usedCharLen: optional out; number of input units consumed.
 * bytes:       output buffer; maxByteLen is its capacity. A capacity of 0
 *              selects the length-only path, which writes to a scratch buffer.
 * usedByteLen: optional out; number of bytes produced (or that would be produced).
 *
 * Returns kCFStringEncodingConverterUnavailable when no converter can be
 * obtained. Otherwise the status is derived from the ICU error code and ORed
 * with the value returned by __CFStringEncodingConverterReleaseICUConverter().
 */
248 __private_extern__ CFIndex
__CFStringEncodingICUToBytes(const char *icuName
, uint32_t flags
, const UniChar
*characters
, CFIndex numChars
, CFIndex
*usedCharLen
, uint8_t *bytes
, CFIndex maxByteLen
, CFIndex
*usedByteLen
) {
249 UConverter
*converter
;
250 UErrorCode errorCode
= U_ZERO_ERROR
;
251 const UTF16Char
*source
= characters
;
252 const UTF16Char
*sourceLimit
= source
+ numChars
;
253 char *destination
= (char *)bytes
;
254 const char *destinationLimit
= destination
+ maxByteLen
;
/* Flush the converter unless the caller says more input will follow. */
255 bool flush
= ((0 == (flags
& kCFStringEncodingPartialInput
)) ? true : false);
258 if (NULL
== (converter
= __CFStringEncodingConverterCreateICUConverter(icuName
, flags
, false))) return kCFStringEncodingConverterUnavailable
;
/* Length-only path: convert repeatedly into a scratch buffer and total the bytes produced. */
260 if (0 == maxByteLen
) {
261 char buffer
[MAX_BUFFER_SIZE
];
262 CFIndex totalLength
= 0;
264 while ((source
< sourceLimit
) && (U_ZERO_ERROR
== errorCode
)) {
265 destination
= buffer
;
266 destinationLimit
= destination
+ MAX_BUFFER_SIZE
;
268 ucnv_fromUnicode(converter
, &destination
, destinationLimit
, (const UChar
**)&source
, (const UChar
*)sourceLimit
, NULL
, flush
, &errorCode
);
270 totalLength
+= (destination
- buffer
);
/* A full scratch buffer is expected here; clear the overflow so the loop continues. */
272 if (U_BUFFER_OVERFLOW_ERROR
== errorCode
) errorCode
= U_ZERO_ERROR
;
275 if (NULL
!= usedByteLen
) *usedByteLen
= totalLength
;
/* Normal path: convert straight into the caller's buffer. */
277 ucnv_fromUnicode(converter
, &destination
, destinationLimit
, (const UChar
**)&source
, (const UChar
*)sourceLimit
, NULL
, flush
, &errorCode
);
279 #if HAS_ICU_BUG_6024743
280 /* Another critical ICU design issue. Similar to conversion error, source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding <rdar://problem/7183045> Interestingly, this issue doesn't apply to ucnv_toUnicode. The asymmetric nature makes this more dangerous */
281 if (U_BUFFER_OVERFLOW_ERROR
== errorCode
) {
282 const uint8_t *bitmap
= CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet
, 0);
283 const uint8_t *nonBase
;
287 // Since the output buffer is filled, we can assume no invalid chars (including stray surrogates)
/* Pull the input limit back by one character. */
289 sourceLimit
= (source
- 1);
290 character
= *sourceLimit
;
/* A low surrogate is combined into a full code point and the bitmap for its plane is used. */
293 if (CFUniCharIsSurrogateLowCharacter(character
)) {
295 character
= CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit
, character
);
296 nonBase
= CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet
, (character
>> 16) & 0x000F);
/* Keep backing up while the character is in the non-base set. */
299 } while ((sourceLimit
> characters
) && CFUniCharIsMemberOfBitmap(character
, nonBase
));
/* Retry the conversion with the shortened input after resetting the converter state. */
301 if (sourceLimit
> characters
) {
303 destination
= (char *)bytes
;
304 errorCode
= U_ZERO_ERROR
;
306 ucnv_resetFromUnicode(converter
);
308 ucnv_fromUnicode(converter
, &destination
, destinationLimit
, (const UChar
**)&source
, (const UChar
*)sourceLimit
, NULL
, flush
, &errorCode
);
310 } while (U_BUFFER_OVERFLOW_ERROR
== errorCode
);
/* The caller must still see the overflow even though the retry fit. */
312 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
315 if (NULL
!= usedByteLen
) *usedByteLen
= destination
- (const char *)bytes
;
/* Map the ICU error code to a status: success, insufficient output buffer, or invalid input. */
318 status
= ((U_ZERO_ERROR
== errorCode
) ? kCFStringEncodingConversionSuccess
: ((U_BUFFER_OVERFLOW_ERROR
== errorCode
) ? kCFStringEncodingInsufficientOutputBufferLength
: kCFStringEncodingInvalidInputStream
));
320 if (NULL
!= usedCharLen
) {
321 #if HAS_ICU_BUG_6024743
322 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatibility issues */
323 if (kCFStringEncodingInvalidInputStream
== status
) {
324 #define MAX_ERROR_BUFFER_LEN (32)
325 UTF16Char errorBuffer
[MAX_ERROR_BUFFER_LEN
];
326 int8_t errorLength
= MAX_ERROR_BUFFER_LEN
;
327 #undef MAX_ERROR_BUFFER_LEN
329 errorCode
= U_ZERO_ERROR
;
/* Ask ICU how many UTF-16 units made up the offending input so they can be un-consumed. */
331 ucnv_getInvalidUChars(converter
, (UChar
*)errorBuffer
, &errorLength
, &errorCode
);
333 if (U_ZERO_ERROR
== errorCode
) {
334 source
-= errorLength
;
336 // Gah, something is terribly wrong. Reset everything
337 source
= characters
; // 0 length
338 if (NULL
!= usedByteLen
) *usedByteLen
= 0;
342 *usedCharLen
= source
- characters
;
/* Hand the converter back; any stream-ID mask returned is ORed into the status. */
345 status
|= __CFStringEncodingConverterReleaseICUConverter(converter
, flags
, status
);
/*
 * Converts bytes to UTF-16 text with the ICU converter named icuName.
 *
 * icuName:     ICU converter name.
 * flags:       conversion flags (partial input/output, lossy byte, stream ID).
 * bytes:       input bytes; numBytes is their count.
 * usedByteLen: optional out; number of input bytes consumed.
 * characters:  output buffer; maxCharLen is its capacity in UTF-16 units. A
 *              capacity of 0 selects the length-only path, which writes to a
 *              scratch buffer.
 * usedCharLen: optional out; number of UTF-16 units produced (or that would be produced).
 *
 * Returns kCFStringEncodingConverterUnavailable when no converter can be
 * obtained. Otherwise the status is derived from the ICU error code and ORed
 * with the value returned by __CFStringEncodingConverterReleaseICUConverter().
 */
350 __private_extern__ CFIndex
__CFStringEncodingICUToUnicode(const char *icuName
, uint32_t flags
, const uint8_t *bytes
, CFIndex numBytes
, CFIndex
*usedByteLen
, UniChar
*characters
, CFIndex maxCharLen
, CFIndex
*usedCharLen
) {
351 UConverter
*converter
;
352 UErrorCode errorCode
= U_ZERO_ERROR
;
353 const char *source
= (const char *)bytes
;
354 const char *sourceLimit
= source
+ numBytes
;
355 UTF16Char
*destination
= characters
;
356 const UTF16Char
*destinationLimit
= destination
+ maxCharLen
;
/* Flush the converter unless the caller says more input will follow. */
357 bool flush
= ((0 == (flags
& kCFStringEncodingPartialInput
)) ? true : false);
360 if (NULL
== (converter
= __CFStringEncodingConverterCreateICUConverter(icuName
, flags
, true))) return kCFStringEncodingConverterUnavailable
;
/* Length-only path: convert repeatedly into a scratch buffer and total the units produced. */
362 if (0 == maxCharLen
) {
363 UTF16Char buffer
[MAX_BUFFER_SIZE
];
364 CFIndex totalLength
= 0;
366 while ((source
< sourceLimit
) && (U_ZERO_ERROR
== errorCode
)) {
367 destination
= buffer
;
368 destinationLimit
= destination
+ MAX_BUFFER_SIZE
;
370 ucnv_toUnicode(converter
, (UChar
**)&destination
, (const UChar
*)destinationLimit
, &source
, sourceLimit
, NULL
, flush
, &errorCode
);
372 totalLength
+= (destination
- buffer
);
/* A full scratch buffer is expected here; clear the overflow so the loop continues. */
374 if (U_BUFFER_OVERFLOW_ERROR
== errorCode
) errorCode
= U_ZERO_ERROR
;
377 if (NULL
!= usedCharLen
) *usedCharLen
= totalLength
;
/* Normal path: convert straight into the caller's buffer. */
379 ucnv_toUnicode(converter
, (UChar
**)&destination
, (const UChar
*)destinationLimit
, &source
, sourceLimit
, NULL
, flush
, &errorCode
);
381 if (NULL
!= usedCharLen
) *usedCharLen
= destination
- characters
;
/* Map the ICU error code to a status: success, insufficient output buffer, or invalid input. */
384 status
= ((U_ZERO_ERROR
== errorCode
) ? kCFStringEncodingConversionSuccess
: ((U_BUFFER_OVERFLOW_ERROR
== errorCode
) ? kCFStringEncodingInsufficientOutputBufferLength
: kCFStringEncodingInvalidInputStream
));
386 if (NULL
!= usedByteLen
) {
387 #if HAS_ICU_BUG_6024743
388 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatibility issues */
389 if (kCFStringEncodingInvalidInputStream
== status
) {
390 #define MAX_ERROR_BUFFER_LEN (32)
391 char errorBuffer
[MAX_ERROR_BUFFER_LEN
];
392 int8_t errorLength
= MAX_ERROR_BUFFER_LEN
;
393 #undef MAX_ERROR_BUFFER_LEN
395 errorCode
= U_ZERO_ERROR
;
/* Ask ICU for the offending input bytes so they can be un-consumed. */
397 ucnv_getInvalidChars(converter
, errorBuffer
, &errorLength
, &errorCode
);
399 if (U_ZERO_ERROR
== errorCode
) {
400 #if HAS_ICU_BUG_6025527
401 // Another ICU oddness here. ucnv_getInvalidChars() writes the '\0' terminator, and errorLength includes the extra byte.
402 if ((errorLength
> 0) && ('\0' == errorBuffer
[errorLength
- 1])) --errorLength
;
404 source
-= errorLength
;
406 // Gah, something is terribly wrong. Reset everything
407 source
= (const char *)bytes
; // 0 length
408 if (NULL
!= usedCharLen
) *usedCharLen
= 0;
413 *usedByteLen
= source
- (const char *)bytes
;
/* Hand the converter back; any stream-ID mask returned is ORed into the status. */
416 status
|= __CFStringEncodingConverterReleaseICUConverter(converter
, flags
, status
);
421 __private_extern__ CFIndex
__CFStringEncodingICUCharLength(const char *icuName
, uint32_t flags
, const uint8_t *bytes
, CFIndex numBytes
) {
423 return (__CFStringEncodingICUToUnicode(icuName
, flags
, bytes
, numBytes
, NULL
, NULL
, 0, &usedCharLen
) == kCFStringEncodingConversionSuccess
? usedCharLen
: 0);
426 __private_extern__ CFIndex
__CFStringEncodingICUByteLength(const char *icuName
, uint32_t flags
, const UniChar
*characters
, CFIndex numChars
) {
428 return (__CFStringEncodingICUToBytes(icuName
, flags
, characters
, numChars
, NULL
, NULL
, 0, &usedByteLen
) == kCFStringEncodingConversionSuccess
? usedByteLen
: 0);
431 __private_extern__ CFStringEncoding
*__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator
, CFIndex
*numberOfIndex
) {
432 CFIndex count
= ucnv_countAvailable();
433 CFIndex numEncodings
= 0;
434 CFStringEncoding
*encodings
;
435 CFStringEncoding encoding
;
438 if (0 == count
) return NULL
;
440 encodings
= (CFStringEncoding
*)CFAllocatorAllocate(NULL
, sizeof(CFStringEncoding
) * count
, 0);
442 for (index
= 0;index
< count
;index
++) {
443 encoding
= __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index
));
445 if (kCFStringEncodingInvalidId
!= encoding
) encodings
[numEncodings
++] = encoding
;
448 if (0 == numEncodings
) {
449 CFAllocatorDeallocate(allocator
, encodings
);
453 *numberOfIndex
= numEncodings
;