CFICUConverters.c

   1 /*
   2  * Copyright (c) 2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 /*
  24  *  CFICUConverters.c
  25  *  CoreFoundation
  26  *
  27  *  Created by Aki Inoue on 07/12/04.
  28  *  Copyright 2007-2009, Apple Inc. All rights reserved.
  29  *
  30  */
  31
  32 #include "CFStringEncodingDatabase.h"
  33 #include "CFStringEncodingConverterPriv.h"
  34 #include "CFICUConverters.h"
  35 #include <CoreFoundation/CFStringEncodingExt.h>
  36 #include <unicode/ucnv.h>
  37 #include <unicode/uversion.h>
  38 #include "CFInternal.h"
  39 #include <stdio.h>
  40
#if DEPLOYMENT_TARGET_WINDOWS
// Map the names used in this file onto _strnicmp/_snprintf on Windows.
// The fourth (locale) argument of strncasecmp_l is discarded by the mapping.
#define strncasecmp_l(a, b, c, d) _strnicmp(a, b, c)
#define snprintf _snprintf
#endif
  45
// Thread data support
// Per-thread table of ICU converters that are kept open between the calls of a streaming (partial) conversion.
typedef struct {
    uint8_t _numSlots; // number of entries currently allocated in _converters
    uint8_t _nextSlot; // index tried first when a converter needs to be stored
    UConverter **_converters; // slot array; a NULL entry is a free slot, and the array itself stays NULL until first needed
} __CFICUThreadData;
  52
  53 static void __CFICUThreadDataDestructor(void *context) {
  54     __CFICUThreadData * data = (__CFICUThreadData *)context;
  55
  56     if (NULL != data->_converters) { // scan to make sure deallocation
  57         UConverter **converter = data->_converters;
  58         UConverter **limit = converter + data->_numSlots;
  59
  60         while (converter < limit) {
  61             if (NULL != converter) ucnv_close(*converter);
  62             ++converter;
  63         }
  64         CFAllocatorDeallocate(NULL, data->_converters);
  65     }
  66
  67     CFAllocatorDeallocate(NULL, data);
  68 }
  69
  70 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
  71 #import <pthread.h>
  72
  73 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
  74     __CFICUThreadData * data;
  75
  76     pthread_key_init_np(__CFTSDKeyICUConverter, __CFICUThreadDataDestructor);
  77     data = (__CFICUThreadData *)pthread_getspecific(__CFTSDKeyICUConverter);
  78
  79     if (NULL == data) {
  80         data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
  81         memset(data, 0, sizeof(__CFICUThreadData));
  82         pthread_setspecific(__CFTSDKeyICUConverter, (const void *)data);
  83     }
  84
  85     return data;
  86 }
  87 #elif DEPLOYMENT_TARGET_WINDOWS
  88 __private_extern__ void __CFStringEncodingICUThreadDataCleaner(void *context) { __CFICUThreadDataDestructor(context); }
  89
  90 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
  91     __CFThreadSpecificData *threadData = __CFGetThreadSpecificData_inline();
  92
  93     if (NULL == threadData->_icuThreadData) {
  94         threadData->_icuThreadData = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
  95         memset(threadData->_icuThreadData, 0, sizeof(__CFICUThreadData));
  96     }
  97
  98     return (__CFICUThreadData *)threadData->_icuThreadData;
  99 }
 100 #else
 101 #error Need implementation for thread data
 102 #endif
 103
 104 __private_extern__ const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
 105 #define STACK_BUFFER_SIZE (60)
 106     char buffer[STACK_BUFFER_SIZE];
 107     const char *result = NULL;
 108     UErrorCode errorCode = U_ZERO_ERROR;
 109     uint32_t codepage = 0;
 110
 111     if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";
 112
 113     if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows
 114
 115     if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result;
 116
 117     if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);
 118
 119     return result;
 120 #undef STACK_BUFFER_SIZE
 121 }
 122
 123 __private_extern__ CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
 124     uint32_t codepage;
 125     UErrorCode errorCode = U_ZERO_ERROR;
 126
 127     if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), NULL, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage);
 128
 129     if (0 != ucnv_countAliases(icuName, &errorCode)) {
 130         CFStringEncoding encoding;
 131         const char *name;
 132
 133         // Try WINDOWS platform
 134         name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);
 135
 136         if (NULL != name) {
 137             if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), NULL, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage);
 138
 139             if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
 140         }
 141
 142         // Try JAVA platform
 143         name = ucnv_getStandardName(icuName, "JAVA", &errorCode);
 144         if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
 145
 146         // Try MIME platform
 147         name = ucnv_getStandardName(icuName, "MIME", &errorCode);
 148         if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
 149     }
 150
 151     return kCFStringEncodingInvalidId;
 152 }
 153
 154 CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
 155     UConverter *converter;
 156     UErrorCode errorCode = U_ZERO_ERROR;
 157     uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
 158
 159     if (0 != streamID) { // this is a part of streaming previously created
 160         __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
 161
 162         --streamID; // map to array index
 163
 164         if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID];
 165     }
 166
 167     converter = ucnv_open(icuName, &errorCode);
 168
 169     if (NULL != converter) {
 170         char lossyByte = CFStringEncodingMaskToLossyByte(flags);
 171
 172         if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?';
 173
 174         if (0 ==lossyByte) {
 175             if (toUnicode) {
 176                 ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
 177             } else {
 178                 ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
 179             }
 180         } else {
 181             ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode);
 182         }
 183     }
 184
 185     return converter;
 186 }
 187
 188 #define ICU_CONVERTER_SLOT_INCREMENT (10)
 189 #define ICU_CONVERTER_MAX_SLOT (255)
 190
 191 static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
 192     uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
 193
 194     if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
 195         if (0 == streamID) {
 196             __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
 197
 198             if (NULL == data->_converters) {
 199                 data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
 200                 memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
 201                 data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
 202                 data->_nextSlot = 0;
 203             } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one
 204                 CFIndex index;
 205
 206                 for (index = 0;index < data->_numSlots;index++) {
 207                     if (NULL == data->_converters[index]) {
 208                         data->_nextSlot = index;
 209                         break;
 210                     }
 211                 }
 212
 213                 if (index >= data->_numSlots) { // we're full
 214                     UConverter **newConverters;
 215                     CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;
 216
 217                     if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
 218                         CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
 219                         ucnv_close(converter);
 220                         return 0;
 221                     }
 222
 223                     newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
 224                     memset(newConverters, 0, sizeof(UConverter *) * newSize);
 225                     memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
 226                     CFAllocatorDeallocate(NULL, data->_converters);
 227                     data->_converters = newConverters;
 228                     data->_nextSlot = data->_numSlots;
 229                     data->_numSlots = newSize;
 230                 }
 231             }
 232
 233             data->_converters[data->_nextSlot] = converter;
 234             streamID = data->_nextSlot + 1;
 235
 236             // now find next slot
 237             ++data->_nextSlot;
 238
 239             if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
 240                 data->_nextSlot = 0;
 241
 242                 while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
 243             }
 244         }
 245
 246         return CFStringEncodingStreamIDToMask(streamID);
 247     }
 248
 249     if (0 != streamID) {
 250         __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
 251
 252         --streamID; // map to array index
 253
 254         if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
 255             data->_converters[streamID] = NULL;
 256             if (data->_nextSlot > streamID) data->_nextSlot = streamID;
 257         }
 258     }
 259
 260     ucnv_close(converter);
 261
 262     return 0;
 263 }
 264
// Element count of the on-stack scratch buffers used by the conversion functions when the caller supplies no output buffer
#define MAX_BUFFER_SIZE (1000)

// Warn at build time when compiling against an ICU newer than 4.0, so the workarounds switched on below get re-checked
#if (U_ICU_VERSION_MAJOR_NUM > 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM > 0))
#warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
#endif
// Enables rewinding the source pointer by the length of the invalid input after a failed conversion
#define HAS_ICU_BUG_6024743 (1)
// Enables discounting a trailing '\0' from the invalid-input length in the to-Unicode direction
#define HAS_ICU_BUG_6025527 (1)
 272
 273 __private_extern__ CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 274     UConverter *converter;
 275     UErrorCode errorCode = U_ZERO_ERROR;
 276     const UTF16Char *source = characters;
 277     const UTF16Char *sourceLimit = source + numChars;
 278     char *destination = (char *)bytes;
 279     const char *destinationLimit = destination + maxByteLen;
 280     bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
 281     CFIndex status;
 282
 283     if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;
 284
 285     if (0 == maxByteLen) {
 286         char buffer[MAX_BUFFER_SIZE];
 287         CFIndex totalLength = 0;
 288
 289         while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
 290             destination = buffer;
 291             destinationLimit = destination + MAX_BUFFER_SIZE;
 292
 293             ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
 294
 295             totalLength += (destination - buffer);
 296
 297             if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
 298         }
 299
 300         if (NULL != usedByteLen) *usedByteLen = totalLength;
 301     } else {
 302         ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
 303
 304         if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
 305     }
 306
 307     status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
 308
 309     if (NULL != usedCharLen) {
 310 #if HAS_ICU_BUG_6024743
 311 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
 312         if (kCFStringEncodingInvalidInputStream == status) {
 313 #define MAX_ERROR_BUFFER_LEN (32)
 314             UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
 315             int8_t errorLength = MAX_ERROR_BUFFER_LEN;
 316 #undef MAX_ERROR_BUFFER_LEN
 317
 318             errorCode = U_ZERO_ERROR;
 319
 320             ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);
 321
 322             if (U_ZERO_ERROR == errorCode) {
 323                 source -= errorLength;
 324             } else {
 325                 // Gah, something is terribly wrong. Reset everything
 326                 source = characters; // 0 length
 327                 if (NULL != usedByteLen) *usedByteLen = 0;
 328             }
 329         }
 330 #endif
 331         *usedCharLen = source - characters;
 332     }
 333
 334     status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
 335
 336     return status;
 337 }
 338
 339 __private_extern__ CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 340     UConverter *converter;
 341     UErrorCode errorCode = U_ZERO_ERROR;
 342     const char *source = (const char *)bytes;
 343     const char *sourceLimit = source + numBytes;
 344     UTF16Char *destination = characters;
 345     const UTF16Char *destinationLimit = destination + maxCharLen;
 346     bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
 347     CFIndex status;
 348
 349     if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;
 350
 351     if (0 == maxCharLen) {
 352         UTF16Char buffer[MAX_BUFFER_SIZE];
 353         CFIndex totalLength = 0;
 354
 355         while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
 356             destination = buffer;
 357             destinationLimit = destination + MAX_BUFFER_SIZE;
 358
 359             ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
 360
 361             totalLength += (destination - buffer);
 362
 363             if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
 364         }
 365
 366         if (NULL != usedCharLen) *usedCharLen = totalLength;
 367     } else {
 368         ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
 369
 370         if (NULL != usedCharLen) *usedCharLen = destination - characters;
 371     }
 372
 373     status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
 374
 375     if (NULL != usedByteLen) {
 376 #if HAS_ICU_BUG_6024743
 377         /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
 378         if (kCFStringEncodingInvalidInputStream == status) {
 379 #define MAX_ERROR_BUFFER_LEN (32)
 380             char errorBuffer[MAX_ERROR_BUFFER_LEN];
 381             int8_t errorLength = MAX_ERROR_BUFFER_LEN;
 382 #undef MAX_ERROR_BUFFER_LEN
 383
 384             errorCode = U_ZERO_ERROR;
 385
 386             ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
 387
 388             if (U_ZERO_ERROR == errorCode) {
 389 #if HAS_ICU_BUG_6025527
 390                 // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
 391                 if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
 392 #endif
 393                 source -= errorLength;
 394             } else {
 395                 // Gah, something is terribly wrong. Reset everything
 396                 source = (const char *)bytes; // 0 length
 397                 if (NULL != usedCharLen) *usedCharLen = 0;
 398             }
 399         }
 400 #endif
 401
 402         *usedByteLen = source - (const char *)bytes;
 403     }
 404
 405     status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
 406
 407     return status;
 408 }
 409
 410 __private_extern__ CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
 411     CFIndex usedCharLen;
 412     return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0);
 413 }
 414
 415 __private_extern__ CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
 416     CFIndex usedByteLen;
 417     return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0);
 418 }
 419
 420 __private_extern__ CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
 421     CFIndex count = ucnv_countAvailable();
 422     CFIndex numEncodings = 0;
 423     CFStringEncoding *encodings;
 424     CFStringEncoding encoding;
 425     CFIndex index;
 426
 427     if (0 == count) return NULL;
 428
 429     encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0);
 430
 431     for (index = 0;index < count;index++) {
 432         encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index));
 433
 434         if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
 435     }
 436
 437     if (0 == numEncodings) {
 438         CFAllocatorDeallocate(allocator, encodings);
 439         encodings = NULL;
 440     }
 441
 442     *numberOfIndex = numEncodings;
 443
 444     return encodings;
 445 }