CFICUConverters.c

   1 /*
   2  * Copyright (c) 2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24 /*      CFICUConverters.c
  25         Copyright (c) 2004-2014, Apple Inc. All rights reserved.
  26         Responsibility: Aki Inoue
  27 */
  28
  29 #include "CFStringEncodingDatabase.h"
  30 #include "CFStringEncodingConverterPriv.h"
  31 #include "CFICUConverters.h"
  32 #include <CoreFoundation/CFStringEncodingExt.h>
  33 #include <CoreFoundation/CFUniChar.h>
  34 #include <unicode/ucnv.h>
  35 #include <unicode/uversion.h>
  36 #include "CFInternal.h"
  37 #include <stdio.h>
  38
  39 // Thread data support
  40 typedef struct {
  41     uint8_t _numSlots;
  42     uint8_t _nextSlot;
  43     UConverter **_converters;
  44 } __CFICUThreadData;
  45
  46 static void __CFICUThreadDataDestructor(void *context) {
  47     __CFICUThreadData * data = (__CFICUThreadData *)context;
  48
  49     if (NULL != data->_converters) { // scan to make sure deallocation
  50         UConverter **converter = data->_converters;
  51         UConverter **limit = converter + data->_numSlots;
  52
  53         while (converter < limit) {
  54             if (NULL != converter) ucnv_close(*converter);
  55             ++converter;
  56         }
  57         CFAllocatorDeallocate(NULL, data->_converters);
  58     }
  59
  60     CFAllocatorDeallocate(NULL, data);
  61 }
  62
  63 CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
  64     __CFICUThreadData * data;
  65
  66     data = (__CFICUThreadData *)_CFGetTSD(__CFTSDKeyICUConverter);
  67
  68     if (NULL == data) {
  69         data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
  70         memset(data, 0, sizeof(__CFICUThreadData));
  71         _CFSetTSD(__CFTSDKeyICUConverter, (void *)data, __CFICUThreadDataDestructor);
  72     }
  73
  74     return data;
  75 }
  76
  77 CF_PRIVATE const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
  78 #define STACK_BUFFER_SIZE (60)
  79     char buffer[STACK_BUFFER_SIZE];
  80     const char *result = NULL;
  81     UErrorCode errorCode = U_ZERO_ERROR;
  82     uint32_t codepage = 0;
  83
  84     if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";
  85
  86     if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows
  87
  88     if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result;
  89
  90     if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);
  91
  92     return result;
  93 #undef STACK_BUFFER_SIZE
  94 }
  95
  96 CF_PRIVATE CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
  97     uint32_t codepage;
  98     char *endPtr;
  99     UErrorCode errorCode = U_ZERO_ERROR;
 100
 101     if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage);
 102
 103     if (0 != ucnv_countAliases(icuName, &errorCode)) {
 104         CFStringEncoding encoding;
 105         const char *name;
 106
 107         // Try WINDOWS platform
 108         name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);
 109
 110         if (NULL != name) {
 111             if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage);
 112
 113             if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
 114         }
 115
 116         // Try JAVA platform
 117         name = ucnv_getStandardName(icuName, "JAVA", &errorCode);
 118         if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
 119
 120         // Try MIME platform
 121         name = ucnv_getStandardName(icuName, "MIME", &errorCode);
 122         if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
 123     }
 124
 125     return kCFStringEncodingInvalidId;
 126 }
 127
 128 CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
 129     UConverter *converter;
 130     UErrorCode errorCode = U_ZERO_ERROR;
 131     uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
 132
 133     if (0 != streamID) { // this is a part of streaming previously created
 134         __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
 135
 136         --streamID; // map to array index
 137
 138         if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID];
 139     }
 140
 141     converter = ucnv_open(icuName, &errorCode);
 142
 143     if (NULL != converter) {
 144         char lossyByte = CFStringEncodingMaskToLossyByte(flags);
 145
 146         if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?';
 147
 148         if (0 ==lossyByte) {
 149             if (toUnicode) {
 150                 ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
 151             } else {
 152                 ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
 153             }
 154         } else {
 155             ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode);
 156         }
 157     }
 158
 159     return converter;
 160 }
 161
 162 #define ICU_CONVERTER_SLOT_INCREMENT (10)
 163 #define ICU_CONVERTER_MAX_SLOT (255)
 164
 165 static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
 166     uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
 167
 168     if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
 169         if (0 == streamID) {
 170             __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
 171
 172             if (NULL == data->_converters) {
 173                 data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
 174                 memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
 175                 data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
 176                 data->_nextSlot = 0;
 177             } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one
 178                 CFIndex index;
 179
 180                 for (index = 0;index < data->_numSlots;index++) {
 181                     if (NULL == data->_converters[index]) {
 182                         data->_nextSlot = index;
 183                         break;
 184                     }
 185                 }
 186
 187                 if (index >= data->_numSlots) { // we're full
 188                     UConverter **newConverters;
 189                     CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;
 190
 191                     if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
 192                         CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
 193                         ucnv_close(converter);
 194                         return 0;
 195                     }
 196
 197                     newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
 198                     memset(newConverters, 0, sizeof(UConverter *) * newSize);
 199                     memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
 200                     CFAllocatorDeallocate(NULL, data->_converters);
 201                     data->_converters = newConverters;
 202                     data->_nextSlot = data->_numSlots;
 203                     data->_numSlots = newSize;
 204                 }
 205             }
 206
 207             data->_converters[data->_nextSlot] = converter;
 208             streamID = data->_nextSlot + 1;
 209
 210             // now find next slot
 211             ++data->_nextSlot;
 212
 213             if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
 214                 data->_nextSlot = 0;
 215
 216                 while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
 217             }
 218         }
 219
 220         return CFStringEncodingStreamIDToMask(streamID);
 221     }
 222
 223     if (0 != streamID) {
 224         __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
 225
 226         --streamID; // map to array index
 227
 228         if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
 229             data->_converters[streamID] = NULL;
 230             if (data->_nextSlot > streamID) data->_nextSlot = streamID;
 231         }
 232     }
 233
 234     ucnv_close(converter);
 235
 236     return 0;
 237 }
 238
 239 #define MAX_BUFFER_SIZE (1000)
 240
 241 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
 242 #if 0
 243 // We no longer perform this check. Revive it if the status of the bug changes.
 244 #if (U_ICU_VERSION_MAJOR_NUM > 49)
 245 #warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
 246 #endif
 247 #endif
 248 #endif
 249 #define HAS_ICU_BUG_6024743 (1)
 250 #define HAS_ICU_BUG_6025527 (1)
 251
 252 CF_PRIVATE CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 253     UConverter *converter;
 254     UErrorCode errorCode = U_ZERO_ERROR;
 255     const UTF16Char *source = characters;
 256     const UTF16Char *sourceLimit = source + numChars;
 257     char *destination = (char *)bytes;
 258     const char *destinationLimit = destination + maxByteLen;
 259     bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
 260     CFIndex status;
 261
 262     if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;
 263
 264     if (0 == maxByteLen) {
 265         char buffer[MAX_BUFFER_SIZE];
 266         CFIndex totalLength = 0;
 267
 268         while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
 269             destination = buffer;
 270             destinationLimit = destination + MAX_BUFFER_SIZE;
 271
 272             ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
 273
 274             totalLength += (destination - buffer);
 275
 276             if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
 277         }
 278
 279         if (NULL != usedByteLen) *usedByteLen = totalLength;
 280     } else {
 281         ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
 282
 283 #if HAS_ICU_BUG_6024743
 284 /* Another critical ICU design issue. Similar to conversion error, source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding <rdar://problem/7183045> Intrestingly, this issue doesn't apply to ucnv_toUnicode. The asynmmetric nature makes this more dangerous */
 285         if (U_BUFFER_OVERFLOW_ERROR == errorCode) {
 286             const uint8_t *bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
 287             const uint8_t *nonBase;
 288             UTF32Char character;
 289
 290             do {
 291                 // Since the output buffer is filled, we can assume no invalid chars (including stray surrogates)
 292                 do {
 293                     sourceLimit = (source - 1);
 294                     character = *sourceLimit;
 295                     nonBase = bitmap;
 296
 297                     if (CFUniCharIsSurrogateLowCharacter(character)) {
 298                         --sourceLimit;
 299                         character = CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit, character);
 300                         nonBase = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, (character >> 16) & 0x000F);
 301                         character &= 0xFFFF;
 302                     }
 303                 } while ((sourceLimit > characters) && CFUniCharIsMemberOfBitmap(character, nonBase));
 304
 305                 if (sourceLimit > characters) {
 306                     source = characters;
 307                     destination = (char *)bytes;
 308                     errorCode = U_ZERO_ERROR;
 309
 310                     ucnv_resetFromUnicode(converter);
 311
 312                     ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
 313                 }
 314             } while (U_BUFFER_OVERFLOW_ERROR == errorCode);
 315
 316             errorCode = U_BUFFER_OVERFLOW_ERROR;
 317         }
 318 #endif
 319         if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
 320     }
 321
 322     status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
 323
 324     if (NULL != usedCharLen) {
 325 #if HAS_ICU_BUG_6024743
 326 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
 327         if (kCFStringEncodingInvalidInputStream == status) {
 328 #define MAX_ERROR_BUFFER_LEN (32)
 329             UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
 330             int8_t errorLength = MAX_ERROR_BUFFER_LEN;
 331 #undef MAX_ERROR_BUFFER_LEN
 332
 333             errorCode = U_ZERO_ERROR;
 334
 335             ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);
 336
 337             if (U_ZERO_ERROR == errorCode) {
 338                 source -= errorLength;
 339             } else {
 340                 // Gah, something is terribly wrong. Reset everything
 341                 source = characters; // 0 length
 342                 if (NULL != usedByteLen) *usedByteLen = 0;
 343             }
 344         }
 345 #endif
 346         *usedCharLen = source - characters;
 347     }
 348
 349     status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
 350
 351     return status;
 352 }
 353
 354 CF_PRIVATE CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 355     UConverter *converter;
 356     UErrorCode errorCode = U_ZERO_ERROR;
 357     const char *source = (const char *)bytes;
 358     const char *sourceLimit = source + numBytes;
 359     UTF16Char *destination = characters;
 360     const UTF16Char *destinationLimit = destination + maxCharLen;
 361     bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
 362     CFIndex status;
 363
 364     if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;
 365
 366     if (0 == maxCharLen) {
 367         UTF16Char buffer[MAX_BUFFER_SIZE];
 368         CFIndex totalLength = 0;
 369
 370         while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
 371             destination = buffer;
 372             destinationLimit = destination + MAX_BUFFER_SIZE;
 373
 374             ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
 375
 376             totalLength += (destination - buffer);
 377
 378             if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
 379         }
 380
 381         if (NULL != usedCharLen) *usedCharLen = totalLength;
 382     } else {
 383         ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
 384
 385         if (NULL != usedCharLen) *usedCharLen = destination - characters;
 386     }
 387
 388     status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
 389
 390     if (NULL != usedByteLen) {
 391 #if HAS_ICU_BUG_6024743
 392         /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
 393         if (kCFStringEncodingInvalidInputStream == status) {
 394 #define MAX_ERROR_BUFFER_LEN (32)
 395             char errorBuffer[MAX_ERROR_BUFFER_LEN];
 396             int8_t errorLength = MAX_ERROR_BUFFER_LEN;
 397 #undef MAX_ERROR_BUFFER_LEN
 398
 399             errorCode = U_ZERO_ERROR;
 400
 401             ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
 402
 403             if (U_ZERO_ERROR == errorCode) {
 404 #if HAS_ICU_BUG_6025527
 405                 // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
 406                 if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
 407 #endif
 408                 source -= errorLength;
 409             } else {
 410                 // Gah, something is terribly wrong. Reset everything
 411                 source = (const char *)bytes; // 0 length
 412                 if (NULL != usedCharLen) *usedCharLen = 0;
 413             }
 414         }
 415 #endif
 416
 417         *usedByteLen = source - (const char *)bytes;
 418     }
 419
 420     status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
 421
 422     return status;
 423 }
 424
 425 CF_PRIVATE CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
 426     CFIndex usedCharLen;
 427     return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0);
 428 }
 429
 430 CF_PRIVATE CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
 431     CFIndex usedByteLen;
 432     return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0);
 433 }
 434
 435 CF_PRIVATE CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
 436     CFIndex count = ucnv_countAvailable();
 437     CFIndex numEncodings = 0;
 438     CFStringEncoding *encodings;
 439     CFStringEncoding encoding;
 440     CFIndex index;
 441
 442     if (0 == count) return NULL;
 443
 444     encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0);
 445
 446     for (index = 0;index < count;index++) {
 447         encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index));
 448
 449         if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
 450     }
 451
 452     if (0 == numEncodings) {
 453         CFAllocatorDeallocate(allocator, encodings);
 454         encodings = NULL;
 455     }
 456
 457     *numberOfIndex = numEncodings;
 458
 459     return encodings;
 460 }