2 * Copyright (c) 2015 Apple Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
25 Copyright (c) 2004-2014, Apple Inc. All rights reserved.
26 Responsibility: Aki Inoue
29 #include "CFStringEncodingDatabase.h"
30 #include "CFStringEncodingConverterPriv.h"
31 #include "CFICUConverters.h"
32 #include <CoreFoundation/CFStringEncodingExt.h>
33 #include <CoreFoundation/CFUniChar.h>
34 #include <unicode/ucnv.h>
35 #include <unicode/uversion.h>
36 #include "CFInternal.h"
39 // Thread data support
43 UConverter
**_converters
;
46 static void __CFICUThreadDataDestructor(void *context
) {
47 __CFICUThreadData
* data
= (__CFICUThreadData
*)context
;
49 if (NULL
!= data
->_converters
) { // scan to make sure deallocation
50 UConverter
**converter
= data
->_converters
;
51 UConverter
**limit
= converter
+ data
->_numSlots
;
53 while (converter
< limit
) {
54 if (NULL
!= converter
) ucnv_close(*converter
);
57 CFAllocatorDeallocate(NULL
, data
->_converters
);
60 CFAllocatorDeallocate(NULL
, data
);
63 CF_INLINE __CFICUThreadData
*__CFStringEncodingICUGetThreadData() {
64 __CFICUThreadData
* data
;
66 data
= (__CFICUThreadData
*)_CFGetTSD(__CFTSDKeyICUConverter
);
69 data
= (__CFICUThreadData
*)CFAllocatorAllocate(NULL
, sizeof(__CFICUThreadData
), 0);
70 memset(data
, 0, sizeof(__CFICUThreadData
));
71 _CFSetTSD(__CFTSDKeyICUConverter
, (void *)data
, __CFICUThreadDataDestructor
);
77 CF_PRIVATE
const char *__CFStringEncodingGetICUName(CFStringEncoding encoding
) {
78 #define STACK_BUFFER_SIZE (60)
79 char buffer
[STACK_BUFFER_SIZE
];
80 const char *result
= NULL
;
81 UErrorCode errorCode
= U_ZERO_ERROR
;
82 uint32_t codepage
= 0;
84 if (kCFStringEncodingUTF7_IMAP
== encoding
) return "IMAP-mailbox-name";
86 if (kCFStringEncodingUnicode
!= (encoding
& 0x0F00)) codepage
= __CFStringEncodingGetWindowsCodePage(encoding
); // we don't use codepage for UTF to avoid little endian weirdness of Windows
88 if ((0 != codepage
) && (snprintf(buffer
, STACK_BUFFER_SIZE
, "windows-%d", codepage
) < STACK_BUFFER_SIZE
) && (NULL
!= (result
= ucnv_getAlias(buffer
, 0, &errorCode
)))) return result
;
90 if (__CFStringEncodingGetCanonicalName(encoding
, buffer
, STACK_BUFFER_SIZE
)) result
= ucnv_getAlias(buffer
, 0, &errorCode
);
93 #undef STACK_BUFFER_SIZE
96 CF_PRIVATE CFStringEncoding
__CFStringEncodingGetFromICUName(const char *icuName
) {
99 UErrorCode errorCode
= U_ZERO_ERROR
;
101 if ((0 == strncasecmp_l(icuName
, "windows-", strlen("windows-"), NULL
)) && (0 != (codepage
= strtol(icuName
+ strlen("windows-"), &endPtr
, 10))) && (*endPtr
== '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage
);
103 if (0 != ucnv_countAliases(icuName
, &errorCode
)) {
104 CFStringEncoding encoding
;
107 // Try WINDOWS platform
108 name
= ucnv_getStandardName(icuName
, "WINDOWS", &errorCode
);
111 if ((0 == strncasecmp_l(name
, "windows-", strlen("windows-"), NULL
)) && (0 != (codepage
= strtol(name
+ strlen("windows-"), &endPtr
, 10))) && (*endPtr
== '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage
);
113 if (strncasecmp_l(icuName
, name
, strlen(name
), NULL
) && (kCFStringEncodingInvalidId
!= (encoding
= __CFStringEncodingGetFromCanonicalName(name
)))) return encoding
;
117 name
= ucnv_getStandardName(icuName
, "JAVA", &errorCode
);
118 if ((NULL
!= name
) && strncasecmp_l(icuName
, name
, strlen(name
), NULL
) && (kCFStringEncodingInvalidId
!= (encoding
= __CFStringEncodingGetFromCanonicalName(name
)))) return encoding
;
121 name
= ucnv_getStandardName(icuName
, "MIME", &errorCode
);
122 if ((NULL
!= name
) && strncasecmp_l(icuName
, name
, strlen(name
), NULL
) && (kCFStringEncodingInvalidId
!= (encoding
= __CFStringEncodingGetFromCanonicalName(name
)))) return encoding
;
125 return kCFStringEncodingInvalidId
;
/*
 * Obtains the ICU converter to use for one conversion call.
 *
 * icuName   - converter name handed to ucnv_open().
 * flags     - conversion flags; a stream ID and a lossy byte are extracted
 *             from them with CFStringEncodingStreamIDFromMask() and
 *             CFStringEncodingMaskToLossyByte().
 * toUnicode - direction of the conversion the caller is about to perform.
 *
 * When flags carry a non-zero stream ID and the calling thread's record holds
 * a converter in the matching slot, that cached converter is returned.
 *
 * NOTE(review): the conditionals that choose between the two STOP callbacks
 * and ucnv_setSubstChars(), the closing braces and the final return are not
 * visible in this excerpt. Presumably toUnicode selects the callback
 * direction and a non-zero lossyByte selects substitution - confirm against
 * the complete file.
 */
128 CF_INLINE UConverter
*__CFStringEncodingConverterCreateICUConverter(const char *icuName
, uint32_t flags
, bool toUnicode
) {
129 UConverter
*converter
;
130 UErrorCode errorCode
= U_ZERO_ERROR
;
131 uint8_t streamID
= CFStringEncodingStreamIDFromMask(flags
);
133 if (0 != streamID
) { // this is a part of streaming previously created
134 __CFICUThreadData
*data
= __CFStringEncodingICUGetThreadData();
136 --streamID
; // map to array index
// Reuse the cached converter only if the slot exists and is occupied.
138 if ((streamID
< data
->_numSlots
) && (NULL
!= data
->_converters
[streamID
])) return data
->_converters
[streamID
];
// No cached converter was returned: open a new one by name.
141 converter
= ucnv_open(icuName
, &errorCode
);
143 if (NULL
!= converter
) {
144 char lossyByte
= CFStringEncodingMaskToLossyByte(flags
);
// Lossy conversion requested without an explicit byte: fall back to '?'.
146 if ((0 == lossyByte
) && (0 != (flags
& kCFStringEncodingAllowLossyConversion
))) lossyByte
= '?';
150 ucnv_setToUCallBack(converter
, &UCNV_TO_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &errorCode
);
152 ucnv_setFromUCallBack(converter
, &UCNV_FROM_U_CALLBACK_STOP
, NULL
, NULL
, NULL
, &errorCode
);
// The one-byte lossyByte is installed as the converter's substitution character.
155 ucnv_setSubstChars(converter
, &lossyByte
, 1, &errorCode
);
// The per-thread slot array starts with, and grows by, ICU_CONVERTER_SLOT_INCREMENT entries; growth past ICU_CONVERTER_MAX_SLOT entries is refused.
162 #define ICU_CONVERTER_SLOT_INCREMENT (10)
163 #define ICU_CONVERTER_MAX_SLOT (255)
/*
 * Hands a converter back after a conversion call.
 *
 * converter - the converter used by the call.
 * flags     - the call's conversion flags (stream ID, partial input/output).
 * status    - the CF conversion status the call produced.
 *
 * If the status is not kCFStringEncodingInvalidInputStream and either the
 * caller flagged partial input, or the output buffer ran out while partial
 * output was flagged, the converter is stored in the calling thread's slot
 * array and the value of CFStringEncodingStreamIDToMask(streamID) is
 * returned, where streamID is the slot index plus one.
 *
 * The later part of the function clears the slot that holds this converter
 * (when the stream ID maps to it) and closes the converter.
 *
 * NOTE(review): several structural lines (the declaration of index, loop
 * exits, closing braces, the guards around the two halves and the return
 * statements other than the one shown) are not visible in this excerpt.
 */
165 static CFIndex
__CFStringEncodingConverterReleaseICUConverter(UConverter
*converter
, uint32_t flags
, CFIndex status
) {
166 uint8_t streamID
= CFStringEncodingStreamIDFromMask(flags
);
// Keep the converter alive only for a streaming conversion that can continue.
168 if ((kCFStringEncodingInvalidInputStream
!= status
) && ((0 != (flags
& kCFStringEncodingPartialInput
)) || ((kCFStringEncodingInsufficientOutputBufferLength
== status
) && (0 != (flags
& kCFStringEncodingPartialOutput
))))) {
170 __CFICUThreadData
*data
= __CFStringEncodingICUGetThreadData();
// First converter cached on this thread: create a zero-filled slot array.
172 if (NULL
== data
->_converters
) {
173 data
->_converters
= (UConverter
**)CFAllocatorAllocate(NULL
, sizeof(UConverter
*) * ICU_CONVERTER_SLOT_INCREMENT
, 0);
174 memset(data
->_converters
, 0, sizeof(UConverter
*) * ICU_CONVERTER_SLOT_INCREMENT
);
175 data
->_numSlots
= ICU_CONVERTER_SLOT_INCREMENT
;
// _nextSlot is out of range or already occupied: scan for an empty slot.
177 } else if ((data
->_nextSlot
>= data
->_numSlots
) || (NULL
!= data
->_converters
[data
->_nextSlot
])) { // Need to find one
180 for (index
= 0;index
< data
->_numSlots
;index
++) {
181 if (NULL
== data
->_converters
[index
]) {
182 data
->_nextSlot
= index
;
187 if (index
>= data
->_numSlots
) { // we're full
188 UConverter
**newConverters
;
189 CFIndex newSize
= data
->_numSlots
+ ICU_CONVERTER_SLOT_INCREMENT
;
// Refuse to grow past the cap: log the condition and close the converter.
191 if (newSize
> ICU_CONVERTER_MAX_SLOT
) { // something is terribly wrong
192 CFLog(kCFLogLevelError
, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
193 ucnv_close(converter
);
// Grow: allocate a larger zero-filled array, copy the old slots over, free the old array.
197 newConverters
= (UConverter
**)CFAllocatorAllocate(NULL
, sizeof(UConverter
*) * newSize
, 0);
198 memset(newConverters
, 0, sizeof(UConverter
*) * newSize
);
199 memcpy(newConverters
, data
->_converters
, sizeof(UConverter
*) * data
->_numSlots
);
200 CFAllocatorDeallocate(NULL
, data
->_converters
);
201 data
->_converters
= newConverters
;
// The first slot of the newly added region is the next free one.
202 data
->_nextSlot
= data
->_numSlots
;
203 data
->_numSlots
= newSize
;
// Park the converter; stream IDs are slot index + 1 so that 0 can mean "none".
207 data
->_converters
[data
->_nextSlot
] = converter
;
208 streamID
= data
->_nextSlot
+ 1;
210 // now find next slot
213 if ((data
->_nextSlot
>= data
->_numSlots
) || (NULL
!= data
->_converters
[data
->_nextSlot
])) {
// Advance _nextSlot past occupied slots.
216 while ((data
->_nextSlot
< data
->_numSlots
) && (NULL
!= data
->_converters
[data
->_nextSlot
])) ++data
->_nextSlot
;
220 return CFStringEncodingStreamIDToMask(streamID
);
// From here on the converter is being retired.
224 __CFICUThreadData
*data
= __CFStringEncodingICUGetThreadData();
226 --streamID
; // map to array index
// Empty the slot only if it really holds this converter.
228 if ((streamID
< data
->_numSlots
) && (converter
== data
->_converters
[streamID
])) {
229 data
->_converters
[streamID
] = NULL
;
// Pull _nextSlot back when the freed slot lies before it.
230 if (data
->_nextSlot
> streamID
) data
->_nextSlot
= streamID
;
234 ucnv_close(converter
);
// Element count of the on-stack scratch buffers that __CFStringEncodingICUToBytes and __CFStringEncodingICUToUnicode convert into when the caller supplies a zero-length output buffer.
239 #define MAX_BUFFER_SIZE (1000)
241 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
243 // We no longer perform this check. Revive it when the status of the bug changes.
244 #if (U_ICU_VERSION_MAJOR_NUM > 49)
245 #warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
// These switches enable the "#if HAS_ICU_BUG_..." workarounds in the conversion routines of this file.
249 #define HAS_ICU_BUG_6024743 (1)
250 #define HAS_ICU_BUG_6025527 (1)
/*
 * Converts UniChars to bytes in the encoding named icuName using ICU.
 *
 * icuName     - ICU converter name.
 * flags       - CF conversion flags; kCFStringEncodingPartialInput turns off
 *               the flush argument passed to ucnv_fromUnicode().
 * characters  - input UniChars; numChars of them.
 * usedCharLen - if non-NULL, receives the number of input UniChars consumed.
 * bytes       - output buffer of maxByteLen bytes. When maxByteLen is 0 the
 *               input is converted through a stack scratch buffer and only
 *               the total byte count is reported.
 * usedByteLen - if non-NULL, receives the number of bytes produced.
 *
 * Returns kCFStringEncodingConverterUnavailable when no converter can be
 * obtained. Otherwise the ICU error code is mapped to
 * kCFStringEncodingConversionSuccess,
 * kCFStringEncodingInsufficientOutputBufferLength or
 * kCFStringEncodingInvalidInputStream, and the value returned by
 * __CFStringEncodingConverterReleaseICUConverter() is OR-ed into it.
 *
 * NOTE(review): several structural lines (declarations of status and
 * character, do-loop heads, else branches, #endif lines, closing braces and
 * the final return) are not visible in this excerpt.
 */
252 CF_PRIVATE CFIndex
__CFStringEncodingICUToBytes(const char *icuName
, uint32_t flags
, const UniChar
*characters
, CFIndex numChars
, CFIndex
*usedCharLen
, uint8_t *bytes
, CFIndex maxByteLen
, CFIndex
*usedByteLen
) {
253 UConverter
*converter
;
254 UErrorCode errorCode
= U_ZERO_ERROR
;
255 const UTF16Char
*source
= characters
;
256 const UTF16Char
*sourceLimit
= source
+ numChars
;
257 char *destination
= (char *)bytes
;
258 const char *destinationLimit
= destination
+ maxByteLen
;
// flush is handed to ucnv_fromUnicode(); it is true unless the caller flagged partial input.
259 bool flush
= ((0 == (flags
& kCFStringEncodingPartialInput
)) ? true : false);
262 if (NULL
== (converter
= __CFStringEncodingConverterCreateICUConverter(icuName
, flags
, false))) return kCFStringEncodingConverterUnavailable
;
// No output buffer: convert chunk by chunk into a scratch buffer just to add up the byte count.
264 if (0 == maxByteLen
) {
265 char buffer
[MAX_BUFFER_SIZE
];
266 CFIndex totalLength
= 0;
268 while ((source
< sourceLimit
) && (U_ZERO_ERROR
== errorCode
)) {
269 destination
= buffer
;
270 destinationLimit
= destination
+ MAX_BUFFER_SIZE
;
272 ucnv_fromUnicode(converter
, &destination
, destinationLimit
, (const UChar
**)&source
, (const UChar
*)sourceLimit
, NULL
, flush
, &errorCode
);
274 totalLength
+= (destination
- buffer
);
// A full scratch buffer is expected here; clear the error so the loop continues.
276 if (U_BUFFER_OVERFLOW_ERROR
== errorCode
) errorCode
= U_ZERO_ERROR
;
279 if (NULL
!= usedByteLen
) *usedByteLen
= totalLength
;
// Conversion straight into the caller's buffer.
281 ucnv_fromUnicode(converter
, &destination
, destinationLimit
, (const UChar
**)&source
, (const UChar
*)sourceLimit
, NULL
, flush
, &errorCode
);
283 #if HAS_ICU_BUG_6024743
284 /* Another critical ICU design issue. Similar to conversion error, source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding <rdar://problem/7183045> Interestingly, this issue doesn't apply to ucnv_toUnicode. The asymmetric nature makes this more dangerous */
285 if (U_BUFFER_OVERFLOW_ERROR
== errorCode
) {
286 const uint8_t *bitmap
= CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet
, 0);
287 const uint8_t *nonBase
;
291 // Since the output buffer is filled, we can assume no invalid chars (including stray surrogates)
// Step the input limit back from the reported source position.
293 sourceLimit
= (source
- 1);
294 character
= *sourceLimit
;
// A low surrogate is combined into a full code point, and the non-base bitmap of that code point's plane is used.
297 if (CFUniCharIsSurrogateLowCharacter(character
)) {
299 character
= CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit
, character
);
300 nonBase
= CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet
, (character
>> 16) & 0x000F);
// Keep backing up while the character at the limit is a non-base character.
303 } while ((sourceLimit
> characters
) && CFUniCharIsMemberOfBitmap(character
, nonBase
));
305 if (sourceLimit
> characters
) {
// Reconvert from the start of the caller's buffer with the shortened input.
307 destination
= (char *)bytes
;
308 errorCode
= U_ZERO_ERROR
;
310 ucnv_resetFromUnicode(converter
);
312 ucnv_fromUnicode(converter
, &destination
, destinationLimit
, (const UChar
**)&source
, (const UChar
*)sourceLimit
, NULL
, flush
, &errorCode
);
314 } while (U_BUFFER_OVERFLOW_ERROR
== errorCode
);
// The call as a whole still ran out of output space; report it that way.
316 errorCode
= U_BUFFER_OVERFLOW_ERROR
;
319 if (NULL
!= usedByteLen
) *usedByteLen
= destination
- (const char *)bytes
;
// Map the ICU error code onto the CF conversion status.
322 status
= ((U_ZERO_ERROR
== errorCode
) ? kCFStringEncodingConversionSuccess
: ((U_BUFFER_OVERFLOW_ERROR
== errorCode
) ? kCFStringEncodingInsufficientOutputBufferLength
: kCFStringEncodingInvalidInputStream
));
324 if (NULL
!= usedCharLen
) {
325 #if HAS_ICU_BUG_6024743
326 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatibility issues */
327 if (kCFStringEncodingInvalidInputStream
== status
) {
328 #define MAX_ERROR_BUFFER_LEN (32)
329 UTF16Char errorBuffer
[MAX_ERROR_BUFFER_LEN
];
330 int8_t errorLength
= MAX_ERROR_BUFFER_LEN
;
331 #undef MAX_ERROR_BUFFER_LEN
333 errorCode
= U_ZERO_ERROR
;
// Ask ICU for the offending UChars so source can be moved back in front of them.
335 ucnv_getInvalidUChars(converter
, (UChar
*)errorBuffer
, &errorLength
, &errorCode
);
337 if (U_ZERO_ERROR
== errorCode
) {
338 source
-= errorLength
;
340 // Gah, something is terribly wrong. Reset everything
341 source
= characters
; // 0 length
342 if (NULL
!= usedByteLen
) *usedByteLen
= 0;
346 *usedCharLen
= source
- characters
;
// Hand the converter back; whatever the release routine returns is merged into the status.
349 status
|= __CFStringEncodingConverterReleaseICUConverter(converter
, flags
, status
);
/*
 * Converts bytes in the encoding named icuName to UniChars using ICU.
 *
 * icuName     - ICU converter name.
 * flags       - CF conversion flags; kCFStringEncodingPartialInput turns off
 *               the flush argument passed to ucnv_toUnicode().
 * bytes       - input bytes; numBytes of them.
 * usedByteLen - if non-NULL, receives the number of input bytes consumed.
 * characters  - output buffer of maxCharLen UniChars. When maxCharLen is 0
 *               the input is converted through a stack scratch buffer and
 *               only the total UniChar count is reported.
 * usedCharLen - if non-NULL, receives the number of UniChars produced.
 *
 * Returns kCFStringEncodingConverterUnavailable when no converter can be
 * obtained. Otherwise the ICU error code is mapped to
 * kCFStringEncodingConversionSuccess,
 * kCFStringEncodingInsufficientOutputBufferLength or
 * kCFStringEncodingInvalidInputStream, and the value returned by
 * __CFStringEncodingConverterReleaseICUConverter() is OR-ed into it.
 *
 * NOTE(review): several structural lines (the declaration of status, else
 * branches, #endif lines, closing braces and the final return) are not
 * visible in this excerpt.
 */
354 CF_PRIVATE CFIndex
__CFStringEncodingICUToUnicode(const char *icuName
, uint32_t flags
, const uint8_t *bytes
, CFIndex numBytes
, CFIndex
*usedByteLen
, UniChar
*characters
, CFIndex maxCharLen
, CFIndex
*usedCharLen
) {
355 UConverter
*converter
;
356 UErrorCode errorCode
= U_ZERO_ERROR
;
357 const char *source
= (const char *)bytes
;
358 const char *sourceLimit
= source
+ numBytes
;
359 UTF16Char
*destination
= characters
;
360 const UTF16Char
*destinationLimit
= destination
+ maxCharLen
;
// flush is handed to ucnv_toUnicode(); it is true unless the caller flagged partial input.
361 bool flush
= ((0 == (flags
& kCFStringEncodingPartialInput
)) ? true : false);
364 if (NULL
== (converter
= __CFStringEncodingConverterCreateICUConverter(icuName
, flags
, true))) return kCFStringEncodingConverterUnavailable
;
// No output buffer: convert chunk by chunk into a scratch buffer just to add up the UniChar count.
366 if (0 == maxCharLen
) {
367 UTF16Char buffer
[MAX_BUFFER_SIZE
];
368 CFIndex totalLength
= 0;
370 while ((source
< sourceLimit
) && (U_ZERO_ERROR
== errorCode
)) {
371 destination
= buffer
;
372 destinationLimit
= destination
+ MAX_BUFFER_SIZE
;
374 ucnv_toUnicode(converter
, (UChar
**)&destination
, (const UChar
*)destinationLimit
, &source
, sourceLimit
, NULL
, flush
, &errorCode
);
376 totalLength
+= (destination
- buffer
);
// A full scratch buffer is expected here; clear the error so the loop continues.
378 if (U_BUFFER_OVERFLOW_ERROR
== errorCode
) errorCode
= U_ZERO_ERROR
;
381 if (NULL
!= usedCharLen
) *usedCharLen
= totalLength
;
// Conversion straight into the caller's buffer.
383 ucnv_toUnicode(converter
, (UChar
**)&destination
, (const UChar
*)destinationLimit
, &source
, sourceLimit
, NULL
, flush
, &errorCode
);
385 if (NULL
!= usedCharLen
) *usedCharLen
= destination
- characters
;
// Map the ICU error code onto the CF conversion status.
388 status
= ((U_ZERO_ERROR
== errorCode
) ? kCFStringEncodingConversionSuccess
: ((U_BUFFER_OVERFLOW_ERROR
== errorCode
) ? kCFStringEncodingInsufficientOutputBufferLength
: kCFStringEncodingInvalidInputStream
));
390 if (NULL
!= usedByteLen
) {
391 #if HAS_ICU_BUG_6024743
392 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatibility issues */
393 if (kCFStringEncodingInvalidInputStream
== status
) {
394 #define MAX_ERROR_BUFFER_LEN (32)
395 char errorBuffer
[MAX_ERROR_BUFFER_LEN
];
396 int8_t errorLength
= MAX_ERROR_BUFFER_LEN
;
397 #undef MAX_ERROR_BUFFER_LEN
399 errorCode
= U_ZERO_ERROR
;
// Ask ICU for the offending bytes so source can be moved back in front of them.
401 ucnv_getInvalidChars(converter
, errorBuffer
, &errorLength
, &errorCode
);
403 if (U_ZERO_ERROR
== errorCode
) {
404 #if HAS_ICU_BUG_6025527
405 // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
// NOTE(review): the call made above is ucnv_getInvalidChars(); the function name in the comment above looks like a slip - confirm.
406 if ((errorLength
> 0) && ('\0' == errorBuffer
[errorLength
- 1])) --errorLength
;
408 source
-= errorLength
;
410 // Gah, something is terribly wrong. Reset everything
411 source
= (const char *)bytes
; // 0 length
412 if (NULL
!= usedCharLen
) *usedCharLen
= 0;
417 *usedByteLen
= source
- (const char *)bytes
;
// Hand the converter back; whatever the release routine returns is merged into the status.
420 status
|= __CFStringEncodingConverterReleaseICUConverter(converter
, flags
, status
);
425 CF_PRIVATE CFIndex
__CFStringEncodingICUCharLength(const char *icuName
, uint32_t flags
, const uint8_t *bytes
, CFIndex numBytes
) {
427 return (__CFStringEncodingICUToUnicode(icuName
, flags
, bytes
, numBytes
, NULL
, NULL
, 0, &usedCharLen
) == kCFStringEncodingConversionSuccess
? usedCharLen
: 0);
430 CF_PRIVATE CFIndex
__CFStringEncodingICUByteLength(const char *icuName
, uint32_t flags
, const UniChar
*characters
, CFIndex numChars
) {
432 return (__CFStringEncodingICUToBytes(icuName
, flags
, characters
, numChars
, NULL
, NULL
, 0, &usedByteLen
) == kCFStringEncodingConversionSuccess
? usedByteLen
: 0);
435 CF_PRIVATE CFStringEncoding
*__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator
, CFIndex
*numberOfIndex
) {
436 CFIndex count
= ucnv_countAvailable();
437 CFIndex numEncodings
= 0;
438 CFStringEncoding
*encodings
;
439 CFStringEncoding encoding
;
442 if (0 == count
) return NULL
;
444 encodings
= (CFStringEncoding
*)CFAllocatorAllocate(NULL
, sizeof(CFStringEncoding
) * count
, 0);
446 for (index
= 0;index
< count
;index
++) {
447 encoding
= __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index
));
449 if (kCFStringEncodingInvalidId
!= encoding
) encodings
[numEncodings
++] = encoding
;
452 if (0 == numEncodings
) {
453 CFAllocatorDeallocate(allocator
, encodings
);
457 *numberOfIndex
= numEncodings
;