CF-550.tar.gz

[apple/cf.git] / CFICUConverters.c
diff --git a/CFICUConverters.c b/CFICUConverters.c

new file mode 100644 (file)

index 0000000..2698c3b
--- /dev/null
+++ b/CFICUConverters.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*
+ *  CFICUConverters.c
+ *  CoreFoundation
+ *
+ *  Created by Aki Inoue on 07/12/04.
+ *  Copyright 2007-2009, Apple Inc. All rights reserved.
+ *
+ */
+
+#include "CFStringEncodingDatabase.h"
+#include "CFStringEncodingConverterPriv.h"
+#include "CFICUConverters.h"
+#include <CoreFoundation/CFStringEncodingExt.h>
+#include <unicode/ucnv.h>
+#include <unicode/uversion.h>
+#include "CFInternal.h"
+#include <stdio.h>
+
+#if DEPLOYMENT_TARGET_WINDOWS
+#define strncasecmp_l(a, b, c, d) _strnicmp(a, b, c)
+#define snprintf _snprintf
+#endif
+
+// Thread data support: per-thread table of ICU converters kept open between streaming conversion calls
+typedef struct {
+    uint8_t _numSlots; // number of entries currently allocated in _converters
+    uint8_t _nextSlot; // index tried first when storing a converter; re-scanned if that slot is occupied
+    UConverter **_converters; // slot array (NULL until first use); a NULL entry is a free slot
+} __CFICUThreadData;
+
+/*
+ * Destructor for a thread's __CFICUThreadData record.
+ *
+ * Closes every converter that is still parked in the slot table, then frees the
+ * table and the record itself.  `context` is the __CFICUThreadData pointer; a NULL
+ * context is ignored.
+ */
+static void __CFICUThreadDataDestructor(void *context) {
+    __CFICUThreadData * data = (__CFICUThreadData *)context;
+
+    if (NULL == data) return; // no thread data was ever created
+
+    if (NULL != data->_converters) { // scan to make sure deallocation
+        UConverter **converter = data->_converters;
+        UConverter **limit = converter + data->_numSlots;
+
+        while (converter < limit) {
+            // Test the slot's content; the slot address itself is never NULL inside the table
+            if (NULL != *converter) ucnv_close(*converter);
+            ++converter;
+        }
+        CFAllocatorDeallocate(NULL, data->_converters);
+    }
+
+    CFAllocatorDeallocate(NULL, data);
+}
+
+#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
+#import <pthread.h>
+
+/*
+ * Returns the calling thread's ICU converter record, creating a zero-filled one
+ * (and registering it under __CFTSDKeyICUConverter) on first use.
+ */
+CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
+    __CFICUThreadData *threadData;
+
+    // Register the destructor for the key before the key is read
+    pthread_key_init_np(__CFTSDKeyICUConverter, __CFICUThreadDataDestructor);
+
+    threadData = (__CFICUThreadData *)pthread_getspecific(__CFTSDKeyICUConverter);
+    if (NULL != threadData) return threadData;
+
+    // First call on this thread: allocate an empty record and remember it
+    threadData = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
+    memset(threadData, 0, sizeof(__CFICUThreadData));
+    pthread_setspecific(__CFTSDKeyICUConverter, (const void *)threadData);
+
+    return threadData;
+}
+#elif DEPLOYMENT_TARGET_WINDOWS
+/* Exported cleanup entry point: hands the thread's ICU record to the common destructor. */
+__private_extern__ void __CFStringEncodingICUThreadDataCleaner(void *context) {
+    __CFICUThreadDataDestructor(context);
+}
+
+/*
+ * Returns the calling thread's ICU converter record, which lives in the
+ * _icuThreadData field of the thread-specific data; a zero-filled record is
+ * created on first use.
+ */
+CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
+    __CFThreadSpecificData *tsd = __CFGetThreadSpecificData_inline();
+    __CFICUThreadData *icuData = (__CFICUThreadData *)tsd->_icuThreadData;
+
+    if (NULL != icuData) return icuData;
+
+    // First call on this thread: allocate an empty record and hang it off the thread data
+    icuData = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
+    tsd->_icuThreadData = icuData;
+    memset(icuData, 0, sizeof(__CFICUThreadData));
+
+    return icuData;
+}
+#else
+#error Need implementation for thread data
+#endif
+
+/*
+ * Maps a CFStringEncoding to the ICU converter name for it.
+ *
+ * Lookup order:
+ *   1. kCFStringEncodingUTF7_IMAP maps to the fixed name "IMAP-mailbox-name".
+ *   2. For non-Unicode encodings, the alias "windows-<codepage>" is tried.
+ *   3. Otherwise the encoding's canonical name is looked up as an ICU alias.
+ *
+ * Returns the ICU name (the string comes from the literal above or from
+ * ucnv_getAlias(); it is not allocated here), or NULL if ICU knows no matching alias.
+ */
+__private_extern__ const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
+#define STACK_BUFFER_SIZE (60)
+    char buffer[STACK_BUFFER_SIZE];
+    const char *result = NULL;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    uint32_t codepage = 0;
+
+    if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";
+
+    if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows
+
+    if (0 != codepage) {
+        // codepage is unsigned, so format it as such; a non-positive result means a formatting
+        // error (or truncation with _snprintf), a result >= the buffer size means truncation
+        int length = snprintf(buffer, STACK_BUFFER_SIZE, "windows-%u", (unsigned int)codepage);
+
+        if ((length > 0) && (length < STACK_BUFFER_SIZE)) {
+            result = ucnv_getAlias(buffer, 0, &errorCode);
+            if (NULL != result) return result;
+        }
+    }
+
+    // ICU functions return immediately when handed a failing error code, so start the second lookup clean
+    errorCode = U_ZERO_ERROR;
+
+    if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);
+
+    return result;
+#undef STACK_BUFFER_SIZE
+}
+
+/*
+ * Maps an ICU converter name back to a CFStringEncoding.
+ *
+ * A name of the form "windows-<n>" (case-insensitive, n != 0) is resolved through the
+ * Windows code page table.  Otherwise, if ICU knows the name, its WINDOWS, JAVA and MIME
+ * standard names are tried in that order: a WINDOWS name may again resolve through the
+ * code page table, and any standard name that differs from icuName is looked up as a
+ * canonical name.  Returns kCFStringEncodingInvalidId when nothing matches.
+ */
+__private_extern__ CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
+    static const char windowsPrefix[] = "windows-";
+    static const char *const otherStandards[] = { "JAVA", "MIME" };
+    const size_t windowsPrefixLength = sizeof(windowsPrefix) - 1;
+    UErrorCode icuStatus = U_ZERO_ERROR;
+    uint32_t windowsCodePage;
+    const char *standardName;
+    CFStringEncoding candidate;
+    CFIndex standardIdx;
+
+    // Direct "windows-<n>" name
+    if (0 == strncasecmp_l(icuName, windowsPrefix, windowsPrefixLength, NULL)) {
+        windowsCodePage = strtol(icuName + windowsPrefixLength, NULL, 10);
+        if (0 != windowsCodePage) return __CFStringEncodingGetFromWindowsCodePage(windowsCodePage);
+    }
+
+    // Names ICU has no aliases for cannot be resolved any further
+    if (0 == ucnv_countAliases(icuName, &icuStatus)) return kCFStringEncodingInvalidId;
+
+    // Try WINDOWS platform
+    standardName = ucnv_getStandardName(icuName, "WINDOWS", &icuStatus);
+    if (NULL != standardName) {
+        if (0 == strncasecmp_l(standardName, windowsPrefix, windowsPrefixLength, NULL)) {
+            windowsCodePage = strtol(standardName + windowsPrefixLength, NULL, 10);
+            if (0 != windowsCodePage) return __CFStringEncodingGetFromWindowsCodePage(windowsCodePage);
+        }
+
+        if (0 != strncasecmp_l(icuName, standardName, strlen(standardName), NULL)) {
+            candidate = __CFStringEncodingGetFromCanonicalName(standardName);
+            if (kCFStringEncodingInvalidId != candidate) return candidate;
+        }
+    }
+
+    // Try JAVA platform, then MIME platform
+    for (standardIdx = 0; standardIdx < (CFIndex)(sizeof(otherStandards) / sizeof(otherStandards[0])); standardIdx++) {
+        standardName = ucnv_getStandardName(icuName, otherStandards[standardIdx], &icuStatus);
+        if (NULL == standardName) continue;
+        if (0 == strncasecmp_l(icuName, standardName, strlen(standardName), NULL)) continue;
+
+        candidate = __CFStringEncodingGetFromCanonicalName(standardName);
+        if (kCFStringEncodingInvalidId != candidate) return candidate;
+    }
+
+    return kCFStringEncodingInvalidId;
+}
+
+/*
+ * Obtains the ICU converter to use for one conversion call.
+ *
+ * If `flags` carries a stream ID and the calling thread has a converter parked in the
+ * matching slot, that converter is returned as is.  Otherwise a new converter for
+ * `icuName` is opened and configured: with a lossy byte (explicit in `flags`, or '?' when
+ * kCFStringEncodingAllowLossyConversion is set) it becomes the substitution character;
+ * without one, the STOP callback is installed for the direction selected by `toUnicode`.
+ * Returns NULL if ucnv_open() fails.
+ */
+CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
+    UErrorCode icuStatus = U_ZERO_ERROR;
+    uint8_t slotNumber = CFStringEncodingStreamIDFromMask(flags);
+    UConverter *opened;
+    char substitute;
+
+    if (slotNumber > 0) { // continuation of an earlier streaming call
+        __CFICUThreadData *threadData = __CFStringEncodingICUGetThreadData();
+        uint8_t slotIndex = slotNumber - 1; // stream IDs are slot indexes + 1
+
+        if (slotIndex < threadData->_numSlots) {
+            UConverter *parked = threadData->_converters[slotIndex];
+
+            if (NULL != parked) return parked;
+        }
+    }
+
+    opened = ucnv_open(icuName, &icuStatus);
+    if (NULL == opened) return NULL;
+
+    substitute = CFStringEncodingMaskToLossyByte(flags);
+    if ((0 == substitute) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) substitute = '?';
+
+    if (0 != substitute) {
+        ucnv_setSubstChars(opened, &substitute, 1, &icuStatus);
+    } else if (toUnicode) {
+        ucnv_setToUCallBack(opened, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &icuStatus);
+    } else {
+        ucnv_setFromUCallBack(opened, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &icuStatus);
+    }
+
+    return opened;
+}
+
+#define ICU_CONVERTER_SLOT_INCREMENT (10) // slots the per-thread table starts with and grows by
+#define ICU_CONVERTER_MAX_SLOT (255) // largest table size the code below will grow to
+/*
+ * Ends one conversion call: either parks `converter` for a follow-up streaming call or closes it.
+ *
+ * The converter is kept when `status` is not kCFStringEncodingInvalidInputStream and either
+ * kCFStringEncodingPartialInput is set, or the output buffer was too small and
+ * kCFStringEncodingPartialOutput is set.  If `flags` carries no stream ID yet, the converter is
+ * stored in a free slot of the calling thread's table (allocating or growing the table as needed)
+ * and the slot index + 1 becomes the stream ID.  The return value is then
+ * CFStringEncodingStreamIDToMask(streamID).
+ *
+ * In every other case the slot named by the stream ID is cleared (if it holds `converter`), the
+ * converter is closed and 0 is returned.  0 is also returned, after logging and closing the
+ * converter, when the table would have to grow beyond ICU_CONVERTER_MAX_SLOT.
+ *
+ * NOTE(review): when the converter is kept and `flags` already carries a stream ID, nothing is
+ * stored here; if the create step had to open a fresh converter because that slot was empty, the
+ * converter looks like it is neither parked nor closed - confirm against callers.
+ */
+static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
+    uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
+    // Keep-alive case: more input is expected, or output ran out and partial output is allowed
+    if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
+        if (0 == streamID) { // first call of a stream: a slot has to be assigned
+            __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
+
+            if (NULL == data->_converters) { // first streaming conversion on this thread
+                data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
+                memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
+                data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
+                data->_nextSlot = 0;
+            } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // _nextSlot is not usable; need to find a free one
+                CFIndex index;
+
+                for (index = 0;index < data->_numSlots;index++) {
+                    if (NULL == data->_converters[index]) {
+                        data->_nextSlot = index;
+                        break;
+                    }
+                }
+
+                if (index >= data->_numSlots) { // we're full
+                    UConverter **newConverters;
+                    CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;
+
+                    if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
+                        CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
+                        ucnv_close(converter);
+                        return 0;
+                    }
+                    // Grow: copy the old entries into a larger zero-filled table; the first new entry is the free slot
+                    newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
+                    memset(newConverters, 0, sizeof(UConverter *) * newSize);
+                    memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
+                    CFAllocatorDeallocate(NULL, data->_converters);
+                    data->_converters = newConverters;
+                    data->_nextSlot = data->_numSlots;
+                    data->_numSlots = newSize;
+                }
+            }
+
+            data->_converters[data->_nextSlot] = converter;
+            streamID = data->_nextSlot + 1; // stream IDs are slot indexes + 1, so 0 stays "no stream"
+
+            // now find next slot
+            ++data->_nextSlot;
+
+            if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
+                data->_nextSlot = 0;
+                // linear scan from the start; ends at _numSlots when every slot is occupied
+                while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
+            }
+        }
+
+        return CFStringEncodingStreamIDToMask(streamID);
+    }
+    // Not kept: forget the slot (if any) and close the converter
+    if (0 != streamID) {
+        __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
+
+        --streamID; // map to array index
+
+        if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
+            data->_converters[streamID] = NULL;
+            if (data->_nextSlot > streamID) data->_nextSlot = streamID; // prefer the lowest free slot next time
+        }
+    }
+
+    ucnv_close(converter);
+
+    return 0;
+}
+
+#define MAX_BUFFER_SIZE (1000) // element count of the on-stack scratch buffer used when only the output length is wanted
+// The workaround switches below are always on; the #warning flags ICU versions newer than 4.0
+#if (U_ICU_VERSION_MAJOR_NUM > 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM > 0))
+#warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
+#endif
+#define HAS_ICU_BUG_6024743 (1) // enables rewinding the source pointer over the invalid input after a failed conversion
+#define HAS_ICU_BUG_6025527 (1) // enables dropping a trailing '\0' from the length reported by ucnv_getInvalidChars()
+
+__private_extern__ CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
+    UConverter *converter;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    const UTF16Char *source = characters;
+    const UTF16Char *sourceLimit = source + numChars;
+    char *destination = (char *)bytes;
+    const char *destinationLimit = destination + maxByteLen;
+    bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
+    CFIndex status;
+
+    if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;
+
+    if (0 == maxByteLen) {
+        char buffer[MAX_BUFFER_SIZE];
+        CFIndex totalLength = 0;
+
+        while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
+            destination = buffer;
+            destinationLimit = destination + MAX_BUFFER_SIZE;
+
+            ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
+
+            totalLength += (destination - buffer);
+
+            if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
+        }
+
+        if (NULL != usedByteLen) *usedByteLen = totalLength;
+    } else {
+        ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
+        
+        if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
+    }
+
+    status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
+
+    if (NULL != usedCharLen) {
+#if HAS_ICU_BUG_6024743
+/* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
+       if (kCFStringEncodingInvalidInputStream == status) {
+#define MAX_ERROR_BUFFER_LEN (32)
+           UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
+           int8_t errorLength = MAX_ERROR_BUFFER_LEN;
+#undef MAX_ERROR_BUFFER_LEN
+
+           errorCode = U_ZERO_ERROR;
+
+           ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);
+
+           if (U_ZERO_ERROR == errorCode) {
+               source -= errorLength;
+           } else {
+               // Gah, something is terribly wrong. Reset everything
+               source = characters; // 0 length
+               if (NULL != usedByteLen) *usedByteLen = 0;
+           }
+       }
+#endif
+       *usedCharLen = source - characters;
+    }
+
+    status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
+
+    return status;
+}
+
+__private_extern__ CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
+    UConverter *converter;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    const char *source = (const char *)bytes;
+    const char *sourceLimit = source + numBytes;
+    UTF16Char *destination = characters;
+    const UTF16Char *destinationLimit = destination + maxCharLen;
+    bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
+    CFIndex status;
+
+    if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;
+
+    if (0 == maxCharLen) {
+        UTF16Char buffer[MAX_BUFFER_SIZE];
+        CFIndex totalLength = 0;
+        
+        while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
+            destination = buffer;
+            destinationLimit = destination + MAX_BUFFER_SIZE;
+            
+            ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
+            
+            totalLength += (destination - buffer);
+            
+            if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
+        }
+        
+        if (NULL != usedCharLen) *usedCharLen = totalLength;
+    } else {
+        ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
+
+        if (NULL != usedCharLen) *usedCharLen = destination - characters;
+    }
+
+    status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
+
+    if (NULL != usedByteLen) {
+#if HAS_ICU_BUG_6024743
+       /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
+       if (kCFStringEncodingInvalidInputStream == status) {
+#define MAX_ERROR_BUFFER_LEN (32)
+           char errorBuffer[MAX_ERROR_BUFFER_LEN];
+           int8_t errorLength = MAX_ERROR_BUFFER_LEN;
+#undef MAX_ERROR_BUFFER_LEN
+
+           errorCode = U_ZERO_ERROR;
+           
+           ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
+           
+           if (U_ZERO_ERROR == errorCode) {
+#if HAS_ICU_BUG_6025527
+                // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
+                if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
+#endif
+               source -= errorLength;
+           } else {
+               // Gah, something is terribly wrong. Reset everything
+               source = (const char *)bytes; // 0 length
+               if (NULL != usedCharLen) *usedCharLen = 0;
+           }
+       }
+#endif
+
+       *usedByteLen = source - (const char *)bytes;
+    }
+    
+    status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
+
+    return status;
+}
+
+/*
+ * Returns the number of UTF-16 units that `bytes` converts to, computed with a length-only
+ * call to __CFStringEncodingICUToUnicode(); returns 0 unless that call reports exactly
+ * kCFStringEncodingConversionSuccess.
+ */
+__private_extern__ CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
+    CFIndex charCount;
+    CFIndex outcome = __CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &charCount);
+
+    if (kCFStringEncodingConversionSuccess != outcome) return 0;
+
+    return charCount;
+}
+
+/*
+ * Returns the number of bytes that `characters` converts to, computed with a length-only
+ * call to __CFStringEncodingICUToBytes(); returns 0 unless that call reports exactly
+ * kCFStringEncodingConversionSuccess.
+ */
+__private_extern__ CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
+    CFIndex byteCount;
+    CFIndex outcome = __CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &byteCount);
+
+    if (kCFStringEncodingConversionSuccess != outcome) return 0;
+
+    return byteCount;
+}
+
+/*
+ * Builds the list of CFStringEncodings that map to a converter ICU reports as available.
+ *
+ * allocator     - allocator used for the returned array (NULL selects the default allocator);
+ *                 the caller releases the array with the same allocator.
+ * numberOfIndex - receives the number of entries in the returned array; it is set to 0
+ *                 whenever NULL is returned.
+ *
+ * Returns the array, or NULL when ICU reports no converters, none of them maps to a
+ * CFStringEncoding, or the allocation fails.
+ */
+__private_extern__ CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
+    CFIndex count = ucnv_countAvailable();
+    CFIndex numEncodings = 0;
+    CFStringEncoding *encodings;
+    CFStringEncoding encoding;
+    CFIndex index;
+
+    // Give the out-parameter a defined value on every early return
+    *numberOfIndex = 0;
+
+    if (count <= 0) return NULL;
+
+    // Allocate from the caller's allocator so that it matches the deallocation below and the caller's release
+    encodings = (CFStringEncoding *)CFAllocatorAllocate(allocator, sizeof(CFStringEncoding) * count, 0);
+
+    if (NULL == encodings) return NULL;
+
+    for (index = 0;index < count;index++) {
+        encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index));
+
+        // Converters without a CFStringEncoding counterpart are skipped
+        if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
+    }
+
+    if (0 == numEncodings) {
+        CFAllocatorDeallocate(allocator, encodings);
+        encodings = NULL;
+    }
+
+    *numberOfIndex = numEncodings;
+
+    return encodings;
+}