CFICUConverters.c

   1 /*
   2  * Copyright (c) 2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24 /*
  25  *  CFICUConverters.c
  26  *  CoreFoundation
  27  *
  28  *  Created by Aki Inoue on 07/12/04.
  29  *  Copyright 2007-2009, Apple Inc. All rights reserved.
  30  *
  31  */
  32
  33 #include "CFStringEncodingDatabase.h"
  34 #include "CFStringEncodingConverterPriv.h"
  35 #include "CFICUConverters.h"
  36 #include <CoreFoundation/CFStringEncodingExt.h>
  37 #include <unicode/ucnv.h>
  38 #include <unicode/uversion.h>
  39 #include "CFInternal.h"
  40 #include <stdio.h>
  41
  #if DEPLOYMENT_TARGET_WINDOWS
  // On Windows targets strncasecmp_l is mapped to _strnicmp (the locale
  // argument d is discarded) and snprintf is mapped to _snprintf.
  #define strncasecmp_l(a, b, c, d) _strnicmp((a), (b), (c))
  #define snprintf _snprintf
  #endif
  46
  47 // Thread data support
  48 typedef struct {
  49     uint8_t _numSlots;
  50     uint8_t _nextSlot;
  51     UConverter **_converters;
  52 } __CFICUThreadData;
  53
  54 static void __CFICUThreadDataDestructor(void *context) {
  55     __CFICUThreadData * data = (__CFICUThreadData *)context;
  56
  57     if (NULL != data->_converters) { // scan to make sure deallocation
  58         UConverter **converter = data->_converters;
  59         UConverter **limit = converter + data->_numSlots;
  60
  61         while (converter < limit) {
  62             if (NULL != converter) ucnv_close(*converter);
  63             ++converter;
  64         }
  65         CFAllocatorDeallocate(NULL, data->_converters);
  66     }
  67
  68     CFAllocatorDeallocate(NULL, data);
  69 }
  70
  71 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
  72 #import <pthread.h>
  73
  74 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
  75     __CFICUThreadData * data;
  76
  77     pthread_key_init_np(__CFTSDKeyICUConverter, __CFICUThreadDataDestructor);
  78     data = (__CFICUThreadData *)pthread_getspecific(__CFTSDKeyICUConverter);
  79
  80     if (NULL == data) {
  81         data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
  82         memset(data, 0, sizeof(__CFICUThreadData));
  83         pthread_setspecific(__CFTSDKeyICUConverter, (const void *)data);
  84     }
  85
  86     return data;
  87 }
  88 #elif DEPLOYMENT_TARGET_WINDOWS
  89 __private_extern__ void __CFStringEncodingICUThreadDataCleaner(void *context) { __CFICUThreadDataDestructor(context); }
  90
  91 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
  92     __CFThreadSpecificData *threadData = __CFGetThreadSpecificData_inline();
  93
  94     if (NULL == threadData->_icuThreadData) {
  95         threadData->_icuThreadData = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
  96         memset(threadData->_icuThreadData, 0, sizeof(__CFICUThreadData));
  97     }
  98
  99     return (__CFICUThreadData *)threadData->_icuThreadData;
 100 }
 101 #else
 102 #error Need implementation for thread data
 103 #endif
 104
 105 __private_extern__ const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
 106 #define STACK_BUFFER_SIZE (60)
 107     char buffer[STACK_BUFFER_SIZE];
 108     const char *result = NULL;
 109     UErrorCode errorCode = U_ZERO_ERROR;
 110     uint32_t codepage = 0;
 111
 112     if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";
 113
 114     if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows
 115
 116     if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result;
 117
 118     if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);
 119
 120     return result;
 121 #undef STACK_BUFFER_SIZE
 122 }
 123
 124 __private_extern__ CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
 125     uint32_t codepage;
 126     UErrorCode errorCode = U_ZERO_ERROR;
 127
 128     if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), NULL, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage);
 129
 130     if (0 != ucnv_countAliases(icuName, &errorCode)) {
 131         CFStringEncoding encoding;
 132         const char *name;
 133
 134         // Try WINDOWS platform
 135         name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);
 136
 137         if (NULL != name) {
 138             if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), NULL, 10)))) return __CFStringEncodingGetFromWindowsCodePage(codepage);
 139
 140             if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
 141         }
 142
 143         // Try JAVA platform
 144         name = ucnv_getStandardName(icuName, "JAVA", &errorCode);
 145         if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
 146
 147         // Try MIME platform
 148         name = ucnv_getStandardName(icuName, "MIME", &errorCode);
 149         if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
 150     }
 151
 152     return kCFStringEncodingInvalidId;
 153 }
 154
 155 CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
 156     UConverter *converter;
 157     UErrorCode errorCode = U_ZERO_ERROR;
 158     uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
 159
 160     if (0 != streamID) { // this is a part of streaming previously created
 161         __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
 162
 163         --streamID; // map to array index
 164
 165         if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID];
 166     }
 167
 168     converter = ucnv_open(icuName, &errorCode);
 169
 170     if (NULL != converter) {
 171         char lossyByte = CFStringEncodingMaskToLossyByte(flags);
 172
 173         if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?';
 174
 175         if (0 ==lossyByte) {
 176             if (toUnicode) {
 177                 ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
 178             } else {
 179                 ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
 180             }
 181         } else {
 182             ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode);
 183         }
 184     }
 185
 186     return converter;
 187 }
 188
 189 #define ICU_CONVERTER_SLOT_INCREMENT (10)
 190 #define ICU_CONVERTER_MAX_SLOT (255)
 191
 192 static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
 193     uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
 194
 195     if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
 196         if (0 == streamID) {
 197             __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
 198
 199             if (NULL == data->_converters) {
 200                 data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
 201                 memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
 202                 data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
 203                 data->_nextSlot = 0;
 204             } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one
 205                 CFIndex index;
 206
 207                 for (index = 0;index < data->_numSlots;index++) {
 208                     if (NULL == data->_converters[index]) {
 209                         data->_nextSlot = index;
 210                         break;
 211                     }
 212                 }
 213
 214                 if (index >= data->_numSlots) { // we're full
 215                     UConverter **newConverters;
 216                     CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;
 217
 218                     if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
 219                         CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
 220                         ucnv_close(converter);
 221                         return 0;
 222                     }
 223
 224                     newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
 225                     memset(newConverters, 0, sizeof(UConverter *) * newSize);
 226                     memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
 227                     CFAllocatorDeallocate(NULL, data->_converters);
 228                     data->_converters = newConverters;
 229                     data->_nextSlot = data->_numSlots;
 230                     data->_numSlots = newSize;
 231                 }
 232             }
 233
 234             data->_converters[data->_nextSlot] = converter;
 235             streamID = data->_nextSlot + 1;
 236
 237             // now find next slot
 238             ++data->_nextSlot;
 239
 240             if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
 241                 data->_nextSlot = 0;
 242
 243                 while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
 244             }
 245         }
 246
 247         return CFStringEncodingStreamIDToMask(streamID);
 248     }
 249
 250     if (0 != streamID) {
 251         __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
 252
 253         --streamID; // map to array index
 254
 255         if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
 256             data->_converters[streamID] = NULL;
 257             if (data->_nextSlot > streamID) data->_nextSlot = streamID;
 258         }
 259     }
 260
 261     ucnv_close(converter);
 262
 263     return 0;
 264 }
 265
 // Element count of the scratch buffer used when only the converted length is wanted.
 #define MAX_BUFFER_SIZE (1000)

 // The workarounds below were checked against ICU versions up to 4.0; warn on anything newer.
 #if U_ICU_VERSION_MAJOR_NUM > 4
 #warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
 #elif (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM > 0)
 #warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
 #endif

 // Switches for the ICU behaviour workarounds applied after a failed conversion.
 #define HAS_ICU_BUG_6024743 (1)
 #define HAS_ICU_BUG_6025527 (1)
 273
 274 __private_extern__ CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 275     UConverter *converter;
 276     UErrorCode errorCode = U_ZERO_ERROR;
 277     const UTF16Char *source = characters;
 278     const UTF16Char *sourceLimit = source + numChars;
 279     char *destination = (char *)bytes;
 280     const char *destinationLimit = destination + maxByteLen;
 281     bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
 282     CFIndex status;
 283
 284     if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;
 285
 286     if (0 == maxByteLen) {
 287         char buffer[MAX_BUFFER_SIZE];
 288         CFIndex totalLength = 0;
 289
 290         while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
 291             destination = buffer;
 292             destinationLimit = destination + MAX_BUFFER_SIZE;
 293
 294             ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
 295
 296             totalLength += (destination - buffer);
 297
 298             if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
 299         }
 300
 301         if (NULL != usedByteLen) *usedByteLen = totalLength;
 302     } else {
 303         ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
 304
 305         if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
 306     }
 307
 308     status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
 309
 310     if (NULL != usedCharLen) {
 311 #if HAS_ICU_BUG_6024743
 312 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
 313         if (kCFStringEncodingInvalidInputStream == status) {
 314 #define MAX_ERROR_BUFFER_LEN (32)
 315             UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
 316             int8_t errorLength = MAX_ERROR_BUFFER_LEN;
 317 #undef MAX_ERROR_BUFFER_LEN
 318
 319             errorCode = U_ZERO_ERROR;
 320
 321             ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);
 322
 323             if (U_ZERO_ERROR == errorCode) {
 324                 source -= errorLength;
 325             } else {
 326                 // Gah, something is terribly wrong. Reset everything
 327                 source = characters; // 0 length
 328                 if (NULL != usedByteLen) *usedByteLen = 0;
 329             }
 330         }
 331 #endif
 332         *usedCharLen = source - characters;
 333     }
 334
 335     status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
 336
 337     return status;
 338 }
 339
 340 __private_extern__ CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 341     UConverter *converter;
 342     UErrorCode errorCode = U_ZERO_ERROR;
 343     const char *source = (const char *)bytes;
 344     const char *sourceLimit = source + numBytes;
 345     UTF16Char *destination = characters;
 346     const UTF16Char *destinationLimit = destination + maxCharLen;
 347     bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
 348     CFIndex status;
 349
 350     if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;
 351
 352     if (0 == maxCharLen) {
 353         UTF16Char buffer[MAX_BUFFER_SIZE];
 354         CFIndex totalLength = 0;
 355
 356         while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
 357             destination = buffer;
 358             destinationLimit = destination + MAX_BUFFER_SIZE;
 359
 360             ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
 361
 362             totalLength += (destination - buffer);
 363
 364             if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
 365         }
 366
 367         if (NULL != usedCharLen) *usedCharLen = totalLength;
 368     } else {
 369         ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
 370
 371         if (NULL != usedCharLen) *usedCharLen = destination - characters;
 372     }
 373
 374     status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
 375
 376     if (NULL != usedByteLen) {
 377 #if HAS_ICU_BUG_6024743
 378         /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
 379         if (kCFStringEncodingInvalidInputStream == status) {
 380 #define MAX_ERROR_BUFFER_LEN (32)
 381             char errorBuffer[MAX_ERROR_BUFFER_LEN];
 382             int8_t errorLength = MAX_ERROR_BUFFER_LEN;
 383 #undef MAX_ERROR_BUFFER_LEN
 384
 385             errorCode = U_ZERO_ERROR;
 386
 387             ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
 388
 389             if (U_ZERO_ERROR == errorCode) {
 390 #if HAS_ICU_BUG_6025527
 391                 // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
 392                 if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
 393 #endif
 394                 source -= errorLength;
 395             } else {
 396                 // Gah, something is terribly wrong. Reset everything
 397                 source = (const char *)bytes; // 0 length
 398                 if (NULL != usedCharLen) *usedCharLen = 0;
 399             }
 400         }
 401 #endif
 402
 403         *usedByteLen = source - (const char *)bytes;
 404     }
 405
 406     status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
 407
 408     return status;
 409 }
 410
 411 __private_extern__ CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
 412     CFIndex usedCharLen;
 413     return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0);
 414 }
 415
 416 __private_extern__ CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
 417     CFIndex usedByteLen;
 418     return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0);
 419 }
 420
 421 __private_extern__ CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
 422     CFIndex count = ucnv_countAvailable();
 423     CFIndex numEncodings = 0;
 424     CFStringEncoding *encodings;
 425     CFStringEncoding encoding;
 426     CFIndex index;
 427
 428     if (0 == count) return NULL;
 429
 430     encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0);
 431
 432     for (index = 0;index < count;index++) {
 433         encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index));
 434
 435         if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
 436     }
 437
 438     if (0 == numEncodings) {
 439         CFAllocatorDeallocate(allocator, encodings);
 440         encodings = NULL;
 441     }
 442
 443     *numberOfIndex = numEncodings;
 444
 445     return encodings;
 446 }