CFStringEncodings.c

   1 /*
   2  * Copyright (c) 2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 /*      CFStringEncodings.c
  24         Copyright 1999-2002, Apple, Inc. All rights reserved.
  25         Responsibility: Aki Inoue
  26 */
  27
  28 #include "CFInternal.h"
  29 #include <CoreFoundation/CFString.h>
  30 #include <CoreFoundation/CFByteOrder.h>
  31 #include "CFPriv.h"
  32 #include <string.h>
  33 #include "CFStringEncodingConverterExt.h"
  34 #include "CFUniChar.h"
  35 #include "CFUnicodeDecomposition.h"
  36
  37 static UInt32 __CFWantsToUseASCIICompatibleConversion = (UInt32)-1;
  38 CF_INLINE UInt32 __CFGetASCIICompatibleFlag(void) {
  39     if (__CFWantsToUseASCIICompatibleConversion == (UInt32)-1) {
  40         __CFWantsToUseASCIICompatibleConversion = false;
  41     }
  42     return (__CFWantsToUseASCIICompatibleConversion ? kCFStringEncodingASCIICompatibleConversion : 0);
  43 }
  44
  45 void _CFStringEncodingSetForceASCIICompatibility(Boolean flag) {
  46     __CFWantsToUseASCIICompatibleConversion = (flag ? (UInt32)true : (UInt32)false);
  47 }
  48
  49 Boolean (*__CFCharToUniCharFunc)(UInt32 flags, uint8_t ch, UniChar *unicodeChar) = NULL;
  50
  51 // To avoid early initialization issues, we just initialize this here
  52 // This should not be const as it is changed
  53 UniChar __CFCharToUniCharTable[256] = {
  54   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
  55  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
  56  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
  57  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
  58  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
  59  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
  60  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
  61 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
  62 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
  63 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
  64 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
  65 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
  66 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
  67 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
  68 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
  69 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
  70 };
  71
  72 void __CFSetCharToUniCharFunc(Boolean (*func)(UInt32 flags, UInt8 ch, UniChar *unicodeChar)) {
  73     if (__CFCharToUniCharFunc != func) {
  74         int ch;
  75         __CFCharToUniCharFunc = func;
  76         if (func) {
  77             for (ch = 128; ch < 256; ch++) {
  78                 UniChar uch;
  79                 __CFCharToUniCharTable[ch] = (__CFCharToUniCharFunc(0, ch, &uch) ? uch : 0xFFFD);
  80             }
  81         } else {        // If we have no __CFCharToUniCharFunc, assume 128..255 return the value as-is
  82             for (ch = 128; ch < 256; ch++) __CFCharToUniCharTable[ch] = ch;
  83         }
  84     }
  85 }
  86
  87 __private_extern__ void __CFStrConvertBytesToUnicode(const uint8_t *bytes, UniChar *buffer, CFIndex numChars) {
  88     CFIndex idx;
  89     for (idx = 0; idx < numChars; idx++) buffer[idx] = __CFCharToUniCharTable[bytes[idx]];
  90 }
  91
  92
  93 /* The minimum length the output buffers should be in the above functions
  94 */
#define kCFCharConversionBufferLength 512


/* Capacity of the localBuffer member, counted in bytes (MAX_LOCAL_CHARS) or in UniChars
   (MAX_LOCAL_UNICHARS). Both macros expand to an expression that names `buffer`, so they can
   only be used where a variable called `buffer` pointing at the struct is in scope. */
#define MAX_LOCAL_CHARS         (sizeof(buffer->localBuffer) / sizeof(uint8_t))
#define MAX_LOCAL_UNICHARS      (sizeof(buffer->localBuffer) / sizeof(UniChar))
 100
/* Convert a byte stream to ASCII (7-bit!) or Unicode, using a CFVarWidthCharBuffer struct on the stack. A false return indicates that an error occurred during the conversion. If shouldFreeChars is true, the caller needs to free the returned buffer, which is either ASCII or Unicode (as indicated by isASCII).
9/18/98 __CFStringDecodeByteStream now avoids allocating a buffer if buffer->chars is not NULL
Added useClientsMemoryPtr; if it is not NULL, and the provided memory can be used as is, the value it points to is set to true
__CFStringDecodeByteStream2() is kept around for any internal clients who might be using it; it should be deprecated
!!! converterFlags is only used for the UTF8 converter at this point
*/
 107 Boolean __CFStringDecodeByteStream2(const uint8_t *bytes, UInt32 len, CFStringEncoding encoding, Boolean alwaysUnicode, CFVarWidthCharBuffer *buffer, Boolean *useClientsMemoryPtr) {
 108     return __CFStringDecodeByteStream3(bytes, len, encoding, alwaysUnicode, buffer, useClientsMemoryPtr, 0);
 109 }
 110
 111 enum {
 112     __NSNonLossyErrorMode = -1,
 113     __NSNonLossyASCIIMode = 0,
 114     __NSNonLossyBackslashMode = 1,
 115     __NSNonLossyHexInitialMode = __NSNonLossyBackslashMode + 1,
 116     __NSNonLossyHexFinalMode = __NSNonLossyHexInitialMode + 4,
 117     __NSNonLossyOctalInitialMode = __NSNonLossyHexFinalMode + 1,
 118     __NSNonLossyOctalFinalMode = __NSNonLossyHexFinalMode + 3
 119 };
 120
 121 Boolean __CFStringDecodeByteStream3(const uint8_t *bytes, CFIndex len, CFStringEncoding encoding, Boolean alwaysUnicode, CFVarWidthCharBuffer *buffer, Boolean *useClientsMemoryPtr, UInt32 converterFlags) {
 122
 123     if (useClientsMemoryPtr) *useClientsMemoryPtr = false;
 124
 125     buffer->isASCII = !alwaysUnicode;
 126     buffer->shouldFreeChars = false;
 127     buffer->numChars = 0;
 128
 129     if (0 == len) return true;
 130
 131     buffer->allocator = (buffer->allocator ? buffer->allocator : __CFGetDefaultAllocator());
 132
 133     if ((encoding == kCFStringEncodingUTF16) || (encoding == kCFStringEncodingUTF16BE) || (encoding == kCFStringEncodingUTF16LE)) { // UTF-16
 134         const UTF16Char *src = (const UTF16Char *)bytes;
 135         const UTF16Char *limit = (const UTF16Char *)(bytes + len);
 136         bool swap = false;
 137
 138         if (kCFStringEncodingUTF16 == encoding) {
 139             UTF16Char bom = ((*src == 0xFFFE) || (*src == 0xFEFF) ? *(src++) : 0);
 140
 141 #if __CF_BIG_ENDIAN__
 142             if (bom == 0xFFFE) swap = true;
 143 #else
 144             if (bom != 0xFEFF) swap = true;
 145 #endif
 146             if (bom) useClientsMemoryPtr = NULL;
 147         } else {
 148 #if __CF_BIG_ENDIAN__
 149             if (kCFStringEncodingUTF16LE == encoding) swap = true;
 150 #else
 151             if (kCFStringEncodingUTF16BE == encoding) swap = true;
 152 #endif
 153         }
 154
 155         buffer->numChars = limit - src;
 156
 157         if (useClientsMemoryPtr && !swap) { // If the caller is ready to deal with no-copy situation, and the situation is possible, indicate it...
 158             *useClientsMemoryPtr = true;
 159             buffer->chars.unicode = (UniChar *)src;
 160             buffer->isASCII = false;
 161         } else {
 162             if (buffer->isASCII) {      // Let's see if we can reduce the Unicode down to ASCII...
 163                 const UTF16Char *characters = src;
 164                 UTF16Char mask = (swap ? 0x80FF : 0xFF80);
 165
 166                 while (characters < limit) {
 167                     if (*(characters++) & mask) {
 168                         buffer->isASCII = false;
 169                         break;
 170                     }
 171                 }
 172             }
 173
 174             if (buffer->isASCII) {
 175                 uint8_t *dst;
 176                 if (NULL == buffer->chars.ascii) { // we never reallocate when buffer is supplied
 177                     if (buffer->numChars > MAX_LOCAL_CHARS) {
 178                         buffer->chars.ascii = (UInt8 *)CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(uint8_t)), 0);
 179                         buffer->shouldFreeChars = true;
 180                     } else {
 181                         buffer->chars.ascii = (uint8_t *)buffer->localBuffer;
 182                     }
 183                 }
 184                 dst = buffer->chars.ascii;
 185
 186                 if (swap) {
 187                     while (src < limit) *(dst++) = (*(src++) >> 8);
 188                 } else {
 189                     while (src < limit) *(dst++) = (uint8_t)*(src++);
 190                 }
 191             } else {
 192                 UTF16Char *dst;
 193
 194                 if (NULL == buffer->chars.unicode) { // we never reallocate when buffer is supplied
 195                     if (buffer->numChars > MAX_LOCAL_UNICHARS) {
 196                         buffer->chars.unicode = (UniChar *)CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(UTF16Char)), 0);
 197                         buffer->shouldFreeChars = true;
 198                     } else {
 199                         buffer->chars.unicode = (UTF16Char *)buffer->localBuffer;
 200                     }
 201                 }
 202                 dst = buffer->chars.unicode;
 203
 204                 if (swap) {
 205                     while (src < limit) *(dst++) = CFSwapInt16(*(src++));
 206                 } else {
 207                     memmove(dst, src, buffer->numChars * sizeof(UTF16Char));
 208                 }
 209             }
 210         }
 211     } else if ((encoding == kCFStringEncodingUTF32) || (encoding == kCFStringEncodingUTF32BE) || (encoding == kCFStringEncodingUTF32LE)) {
 212         const UTF32Char *src = (const UTF32Char *)bytes;
 213         const UTF32Char *limit = (const UTF32Char *)(bytes + len);
 214         bool swap = false;
 215         static bool strictUTF32 = (bool)-1;
 216
 217         if ((bool)-1 == strictUTF32) strictUTF32 = (_CFExecutableLinkedOnOrAfter(CFSystemVersionLeopard) != 0);
 218
 219         if (kCFStringEncodingUTF32 == encoding) {
 220             UTF32Char bom = ((*src == 0xFFFE0000) || (*src == 0x0000FEFF) ? *(src++) : 0);
 221
 222 #if __CF_BIG_ENDIAN__
 223             if (bom == 0xFFFE0000) swap = true;
 224 #else
 225             if (bom != 0x0000FEFF) swap = true;
 226 #endif
 227         } else {
 228 #if __CF_BIG_ENDIAN__
 229             if (kCFStringEncodingUTF32LE == encoding) swap = true;
 230 #else
 231             if (kCFStringEncodingUTF32BE == encoding) swap = true;
 232 #endif
 233         }
 234
 235         buffer->numChars = limit - src;
 236
 237         {
 238             // Let's see if we have non-ASCII or non-BMP
 239             const UTF32Char *characters = src;
 240             UTF32Char asciiMask = (swap ? 0x80FFFFFF : 0xFFFFFF80);
 241             UTF32Char bmpMask = (swap ? 0x0000FFFF : 0xFFFF0000);
 242
 243             while (characters < limit) {
 244                 if (*characters & asciiMask) {
 245                     buffer->isASCII = false;
 246                     if (*characters & bmpMask) {
 247                         if (strictUTF32 && ((swap ? (UTF32Char)CFSwapInt32(*characters) : *characters) > 0x10FFFF)) return false; // outside of Unicode Scaler Value
 248                         ++(buffer->numChars);
 249                     }
 250                 }
 251                 ++characters;
 252             }
 253         }
 254
 255         if (buffer->isASCII) {
 256             uint8_t *dst;
 257             if (NULL == buffer->chars.ascii) { // we never reallocate when buffer is supplied
 258                 if (buffer->numChars > MAX_LOCAL_CHARS) {
 259                     buffer->chars.ascii = (UInt8 *)CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(uint8_t)), 0);
 260                     buffer->shouldFreeChars = true;
 261                 } else {
 262                     buffer->chars.ascii = (uint8_t *)buffer->localBuffer;
 263                 }
 264             }
 265             dst = buffer->chars.ascii;
 266
 267             if (swap) {
 268                 while (src < limit) *(dst++) = (*(src++) >> 24);
 269             } else {
 270                 while (src < limit) *(dst++) = *(src++);
 271             }
 272         } else {
 273             if (NULL == buffer->chars.unicode) { // we never reallocate when buffer is supplied
 274                 if (buffer->numChars > MAX_LOCAL_UNICHARS) {
 275                     buffer->chars.unicode = (UniChar *)CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(UTF16Char)), 0);
 276                     buffer->shouldFreeChars = true;
 277                 } else {
 278                     buffer->chars.unicode = (UTF16Char *)buffer->localBuffer;
 279                 }
 280             }
 281             return (CFUniCharFromUTF32(src, limit - src, buffer->chars.unicode, (strictUTF32 ? false : true), __CF_BIG_ENDIAN__ ? !swap : swap) ? TRUE : FALSE);
 282         }
 283     } else {
 284         CFIndex idx;
 285         const uint8_t *chars = (const uint8_t *)bytes;
 286         const uint8_t *end = chars + len;
 287
 288         switch (encoding) {
 289         case kCFStringEncodingNonLossyASCII: {
 290             UTF16Char currentValue = 0;
 291             uint8_t character;
 292             int8_t mode = __NSNonLossyASCIIMode;
 293
 294             buffer->isASCII = false;
 295             buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
 296             buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : (UniChar *)CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
 297             buffer->numChars = 0;
 298
 299             while (chars < end) {
 300                 character = (*chars++);
 301
 302                 switch (mode) {
 303                 case __NSNonLossyASCIIMode:
 304                     if (character == '\\') {
 305                         mode = __NSNonLossyBackslashMode;
 306                     } else if (character < 0x80) {
 307                         currentValue = character;
 308                     } else {
 309                         mode = __NSNonLossyErrorMode;
 310                     }
 311                     break;
 312
 313                 case __NSNonLossyBackslashMode:
 314                     if ((character == 'U') || (character == 'u')) {
 315                         mode = __NSNonLossyHexInitialMode;
 316                         currentValue = 0;
 317                     } else if ((character >= '0') && (character <= '9')) {
 318                         mode = __NSNonLossyOctalInitialMode;
 319                         currentValue = character - '0';
 320                     } else if (character == '\\') {
 321                         mode = __NSNonLossyASCIIMode;
 322                         currentValue = character;
 323                     } else {
 324                         mode = __NSNonLossyErrorMode;
 325                     }
 326                     break;
 327
 328                 default:
 329                     if (mode < __NSNonLossyHexFinalMode) {
 330                         if ((character >= '0') && (character <= '9')) {
 331                             currentValue = (currentValue << 4) | (character - '0');
 332                             if (++mode == __NSNonLossyHexFinalMode) mode = __NSNonLossyASCIIMode;
 333                         } else {
 334                             if (character >= 'a') character -= ('a' - 'A');
 335                             if ((character >= 'A') && (character <= 'F')) {
 336                                 currentValue = (currentValue << 4) | ((character - 'A') + 10);
 337                                 if (++mode == __NSNonLossyHexFinalMode) mode = __NSNonLossyASCIIMode;
 338                             } else {
 339                                 mode = __NSNonLossyErrorMode;
 340                             }
 341                         }
 342                     } else {
 343                         if ((character >= '0') && (character <= '9')) {
 344                             currentValue = (currentValue << 3) | (character - '0');
 345                             if (++mode == __NSNonLossyOctalFinalMode) mode = __NSNonLossyASCIIMode;
 346                         } else {
 347                             mode = __NSNonLossyErrorMode;
 348                         }
 349                     }
 350                     break;
 351                 }
 352
 353                 if (mode == __NSNonLossyASCIIMode) {
 354                     buffer->chars.unicode[buffer->numChars++] = currentValue;
 355                 } else if (mode == __NSNonLossyErrorMode) {
 356                     return false;
 357                 }
 358             }
 359             return (mode == __NSNonLossyASCIIMode);
 360         }
 361
 362         case kCFStringEncodingUTF8:
 363             if ((len >= 3) && (chars[0] == 0xef) && (chars[1] == 0xbb) && (chars[2] == 0xbf)) { // If UTF8 BOM, skip
 364                 chars += 3;
 365                 len -= 3;
 366                 if (0 == len) return true;
 367             }
 368             if (buffer->isASCII) {
 369                 for (idx = 0; idx < len; idx++) {
 370                     if (128 <= chars[idx]) {
 371                         buffer->isASCII = false;
 372                         break;
 373                     }
 374                 }
 375             }
 376             if (buffer->isASCII) {
 377                 buffer->numChars = len;
 378                 buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
 379                 buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : (UInt8 *)CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
 380                 memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
 381             } else {
 382                 CFIndex numDone;
 383                 static CFStringEncodingToUnicodeProc __CFFromUTF8 = NULL;
 384
 385                 if (!__CFFromUTF8) {
 386                     const CFStringEncodingConverter *converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
 387                     __CFFromUTF8 = (CFStringEncodingToUnicodeProc)converter->toUnicode;
 388                 }
 389
 390                 buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
 391                 buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : (UniChar *)CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
 392                 buffer->numChars = 0;
 393                 while (chars < end) {
 394                     numDone = 0;
 395                     chars += __CFFromUTF8(converterFlags, chars, end - chars, &(buffer->chars.unicode[buffer->numChars]), len - buffer->numChars, &numDone);
 396
 397                     if (0 == numDone) {
 398                         if (buffer->shouldFreeChars) CFAllocatorDeallocate(buffer->allocator, buffer->chars.unicode);
 399                         buffer->isASCII = !alwaysUnicode;
 400                         buffer->shouldFreeChars = false;
 401                         buffer->chars.ascii = NULL;
 402                         buffer->numChars = 0;
 403                         return false;
 404                     }
 405                     buffer->numChars += numDone;
 406                 }
 407             }
 408             break;
 409
 410         default:
 411             if (CFStringEncodingIsValidEncoding(encoding)) {
 412                 const CFStringEncodingConverter *converter = CFStringEncodingGetConverter(encoding);
 413                 Boolean isASCIISuperset = __CFStringEncodingIsSupersetOfASCII(encoding);
 414
 415                 if (!converter) return false;
 416
 417                 if (!isASCIISuperset) buffer->isASCII = false;
 418
 419                 if (buffer->isASCII) {
 420                     for (idx = 0; idx < len; idx++) {
 421                         if (128 <= chars[idx]) {
 422                             buffer->isASCII = false;
 423                             break;
 424                         }
 425                     }
 426                 }
 427
 428                 if (converter->encodingClass == kCFStringEncodingConverterCheapEightBit) {
 429                     if (buffer->isASCII) {
 430                         buffer->numChars = len;
 431                         buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
 432                         buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : (UInt8 *)CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
 433                         memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
 434                     } else {
 435                         buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
 436                         buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : (UniChar *)CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
 437                         buffer->numChars = len;
 438                         if (kCFStringEncodingASCII == encoding || kCFStringEncodingISOLatin1 == encoding) {
 439                             for (idx = 0; idx < len; idx++) buffer->chars.unicode[idx] = (UniChar)chars[idx];
 440                         } else {
 441                             for (idx = 0; idx < len; idx++)
 442                                 if (chars[idx] < 0x80 && isASCIISuperset)
 443                                     buffer->chars.unicode[idx] = (UniChar)chars[idx];
 444                                 else if (!((CFStringEncodingCheapEightBitToUnicodeProc)converter->toUnicode)(0, chars[idx], buffer->chars.unicode + idx))
 445                                     return false;
 446                         }
 447                     }
 448                 } else {
 449                     if (buffer->isASCII) {
 450                         buffer->numChars = len;
 451                         buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
 452                         buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : (UInt8 *)CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
 453                         memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
 454                     } else {
 455                         CFIndex guessedLength = CFStringEncodingCharLengthForBytes(encoding, 0, bytes, len);
 456                         static UInt32 lossyFlag = (UInt32)-1;
 457
 458                         buffer->shouldFreeChars = !buffer->chars.unicode && (guessedLength <= MAX_LOCAL_UNICHARS) ? false : true;
 459                         buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (guessedLength <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : (UniChar *)CFAllocatorAllocate(buffer->allocator, guessedLength * sizeof(UniChar), 0));
 460
 461                         if (lossyFlag == (UInt32)-1) lossyFlag = (_CFExecutableLinkedOnOrAfter(CFSystemVersionPanther) ? 0 : kCFStringEncodingAllowLossyConversion);
 462
 463                         if (CFStringEncodingBytesToUnicode(encoding, lossyFlag|__CFGetASCIICompatibleFlag(), bytes, len, NULL, buffer->chars.unicode, (guessedLength > MAX_LOCAL_UNICHARS ? guessedLength : MAX_LOCAL_UNICHARS), &(buffer->numChars))) {
 464                             if (buffer->shouldFreeChars) CFAllocatorDeallocate(buffer->allocator, buffer->chars.unicode);
 465                             buffer->isASCII = !alwaysUnicode;
 466                             buffer->shouldFreeChars = false;
 467                             buffer->chars.ascii = NULL;
 468                             buffer->numChars = 0;
 469                             return false;
 470                         }
 471                     }
 472                 }
 473             } else {
 474                 return false;
 475             }
 476         }
 477     }
 478
 479     return true;
 480 }
 481
 482
 483 /* Create a byte stream from a CFString backing. Can convert a string piece at a time
 484    into a fixed size buffer. Returns number of characters converted.
 485    Characters that cannot be converted to the specified encoding are represented
 486    with the char specified by lossByte; if 0, then lossy conversion is not allowed
 487    and conversion stops, returning partial results.
 488    Pass buffer==NULL if you don't care about the converted string (but just the convertability,
 489    or number of bytes required, indicated by usedBufLen).
 490    Does not zero-terminate. If you want to create Pascal or C string, allow one extra byte at start or end.
 491
 492    Note: This function is intended to work through CFString functions, so it should work
 493    with NSStrings as well as CFStrings.
 494 */
 495 CFIndex __CFStringEncodeByteStream(CFStringRef string, CFIndex rangeLoc, CFIndex rangeLen, Boolean generatingExternalFile, CFStringEncoding encoding, char lossByte, uint8_t *buffer, CFIndex max, CFIndex *usedBufLen) {
 496     CFIndex totalBytesWritten = 0;      /* Number of written bytes */
 497     CFIndex numCharsProcessed = 0;      /* Number of processed chars */
 498     const UniChar *unichars;
 499
 500     if (encoding == kCFStringEncodingUTF8 && (unichars = CFStringGetCharactersPtr(string))) {
 501         static CFStringEncodingToBytesProc __CFToUTF8 = NULL;
 502
 503         if (!__CFToUTF8) {
 504             const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
 505             __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes;
 506         }
 507         numCharsProcessed = __CFToUTF8((generatingExternalFile ? kCFStringEncodingPrependBOM : 0), unichars + rangeLoc, rangeLen, buffer, (buffer ? max : 0), &totalBytesWritten);
 508
 509     } else if (encoding == kCFStringEncodingNonLossyASCII) {
 510         const char *hex = "0123456789abcdef";
 511         UniChar ch;
 512         CFStringInlineBuffer buf;
 513         CFStringInitInlineBuffer(string, &buf, CFRangeMake(rangeLoc, rangeLen));
 514         while (numCharsProcessed < rangeLen) {
 515             CFIndex reqLength; /* Required number of chars to encode this UniChar */
 516             CFIndex cnt;
 517             char tmp[6];
 518             ch = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed);
 519             if ((ch >= ' ' && ch <= '~' && ch != '\\') || (ch == '\n' || ch == '\r' || ch == '\t')) {
 520                 reqLength = 1;
 521                 tmp[0] = (char)ch;
 522             } else {
 523                 if (ch == '\\') {
 524                     tmp[1] = '\\';
 525                     reqLength = 2;
 526                 } else if (ch < 256) {  /* \nnn; note that this is not NEXTSTEP encoding but a (small) UniChar */
 527                     tmp[1] = '0' + (ch >> 6);
 528                     tmp[2] = '0' + ((ch >> 3) & 7);
 529                     tmp[3] = '0' + (ch & 7);
 530                     reqLength = 4;
 531                 } else {        /* \Unnnn */
 532                     tmp[1] = 'u'; // Changed to small+u in order to be aligned with Java
 533                     tmp[2] = hex[(ch >> 12) & 0x0f];
 534                     tmp[3] = hex[(ch >> 8) & 0x0f];
 535                     tmp[4] = hex[(ch >> 4) & 0x0f];
 536                     tmp[5] = hex[ch & 0x0f];
 537                     reqLength = 6;
 538                 }
 539                 tmp[0] = '\\';
 540             }
 541             if (buffer) {
 542                 if (totalBytesWritten + reqLength > max) break; /* Doesn't fit..
 543 .*/
 544                 for (cnt = 0; cnt < reqLength; cnt++) {
 545                     buffer[totalBytesWritten + cnt] = tmp[cnt];
 546                 }
 547             }
 548             totalBytesWritten += reqLength;
 549             numCharsProcessed++;
 550         }
 551     } else if ((encoding == kCFStringEncodingUTF16) || (encoding == kCFStringEncodingUTF16BE) || (encoding == kCFStringEncodingUTF16LE)) {
 552         CFIndex extraForBOM = (generatingExternalFile && (encoding == kCFStringEncodingUTF16) ? sizeof(UniChar) : 0);
 553         numCharsProcessed = rangeLen;
 554         if (buffer && (numCharsProcessed * (CFIndex)sizeof(UniChar) + extraForBOM > max)) {
 555             numCharsProcessed = (max > extraForBOM) ? ((max - extraForBOM) / sizeof(UniChar)) : 0;
 556         }
 557         totalBytesWritten = (numCharsProcessed * sizeof(UniChar)) + extraForBOM;
 558         if (buffer) {
 559             if (extraForBOM) {  /* Generate BOM */
 560 #if __CF_BIG_ENDIAN__
 561                 *buffer++ = 0xfe; *buffer++ = 0xff;
 562 #else
 563                 *buffer++ = 0xff; *buffer++ = 0xfe;
 564 #endif
 565             }
 566             CFStringGetCharacters(string, CFRangeMake(rangeLoc, numCharsProcessed), (UniChar *)buffer);
 567             if ((__CF_BIG_ENDIAN__ ?  kCFStringEncodingUTF16LE : kCFStringEncodingUTF16BE) == encoding) { // Need to swap
 568                 UTF16Char *characters = (UTF16Char *)buffer;
 569                 const UTF16Char *limit = characters + numCharsProcessed;
 570
 571                 while (characters < limit) {
 572                     *characters = CFSwapInt16(*characters);
 573                     ++characters;
 574                 }
 575             }
 576         }
 577     } else if ((encoding == kCFStringEncodingUTF32) || (encoding == kCFStringEncodingUTF32BE) || (encoding == kCFStringEncodingUTF32LE)) {
 578         UTF32Char character;
 579         CFStringInlineBuffer buf;
 580         UTF32Char *characters = (UTF32Char *)buffer;
 581
 582         bool swap = (encoding == (__CF_BIG_ENDIAN__ ? kCFStringEncodingUTF32LE : kCFStringEncodingUTF32BE) ? true : false);
 583         if (generatingExternalFile && (encoding == kCFStringEncodingUTF32)) {
 584             totalBytesWritten += sizeof(UTF32Char);
 585             if (characters) {
 586                 if (totalBytesWritten > max) { // insufficient buffer
 587                     totalBytesWritten = 0;
 588                 } else {
 589                     *(characters++) = 0x0000FEFF;
 590                 }
 591             }
 592         }
 593
 594         CFStringInitInlineBuffer(string, &buf, CFRangeMake(rangeLoc, rangeLen));
 595         while (numCharsProcessed < rangeLen) {
 596             character = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed);
 597
 598             if (CFUniCharIsSurrogateHighCharacter(character)) {
 599                 UTF16Char otherCharacter;
 600
 601                 if (((numCharsProcessed + 1) < rangeLen) && CFUniCharIsSurrogateLowCharacter((otherCharacter = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed + 1)))) {
 602                     character = CFUniCharGetLongCharacterForSurrogatePair(character, otherCharacter);
 603                 } else if (lossByte) {
 604                     character = lossByte;
 605                 } else {
 606                     break;
 607                 }
 608             } else if (CFUniCharIsSurrogateLowCharacter(character)) {
 609                 if (lossByte) {
 610                     character = lossByte;
 611                 } else {
 612                     break;
 613                 }
 614             }
 615
 616             totalBytesWritten += sizeof(UTF32Char);
 617
 618             if (characters) {
 619                 if (totalBytesWritten > max) {
 620                     totalBytesWritten -= sizeof(UTF32Char);
 621                     break;
 622                 }
 623                 *(characters++) = (swap ? CFSwapInt32(character) : character);
 624             }
 625
 626             numCharsProcessed += (character > 0xFFFF ? 2 : 1);
 627         }
 628     } else {
 629         CFIndex numChars;
 630         UInt32 flags;
 631         const unsigned char *cString = NULL;
 632         Boolean isASCIISuperset = __CFStringEncodingIsSupersetOfASCII(encoding);
 633
 634         if (!CF_IS_OBJC(CFStringGetTypeID(), string) && isASCIISuperset) { // Checking for NSString to avoid infinite recursion
 635             const unsigned char *ptr;
 636             if ((cString = (const unsigned char *)CFStringGetCStringPtr(string, __CFStringGetEightBitStringEncoding()))) {
 637                 ptr = (cString += rangeLoc);
 638                 if (__CFStringGetEightBitStringEncoding() == encoding) {
 639                     numCharsProcessed = (rangeLen < max || buffer == NULL ? rangeLen : max);
 640                     if (buffer) memmove(buffer, cString, numCharsProcessed);
 641                     if (usedBufLen) *usedBufLen = numCharsProcessed;
 642                     return numCharsProcessed;
 643                 }
 644                 while (*ptr < 0x80 && rangeLen > 0) {
 645                     ++ptr;
 646                     --rangeLen;
 647                 }
 648                 numCharsProcessed = ptr - cString;
 649                 if (buffer) {
 650                     numCharsProcessed = (numCharsProcessed < max ? numCharsProcessed : max);
 651                     memmove(buffer, cString, numCharsProcessed);
 652                     buffer += numCharsProcessed;
 653                     max -= numCharsProcessed;
 654                 }
 655                 if (!rangeLen || (buffer && (max == 0))) {
 656                     if (usedBufLen) *usedBufLen = numCharsProcessed;
 657                     return numCharsProcessed;
 658                 }
 659                 rangeLoc += numCharsProcessed;
 660                 totalBytesWritten += numCharsProcessed;
 661             }
 662             if (!cString && (cString = CFStringGetPascalStringPtr(string, __CFStringGetEightBitStringEncoding()))) {
 663                 ptr = (cString += (rangeLoc + 1));
 664                 if (__CFStringGetEightBitStringEncoding() == encoding) {
 665                     numCharsProcessed = (rangeLen < max || buffer == NULL ? rangeLen : max);
 666                     if (buffer) memmove(buffer, cString, numCharsProcessed);
 667                     if (usedBufLen) *usedBufLen = numCharsProcessed;
 668                     return numCharsProcessed;
 669                 }
 670                 while (*ptr < 0x80 && rangeLen > 0) {
 671                     ++ptr;
 672                     --rangeLen;
 673                 }
 674                 numCharsProcessed = ptr - cString;
 675                 if (buffer) {
 676                     numCharsProcessed = (numCharsProcessed < max ? numCharsProcessed : max);
 677                     memmove(buffer, cString, numCharsProcessed);
 678                     buffer += numCharsProcessed;
 679                     max -= numCharsProcessed;
 680                 }
 681                 if (!rangeLen || (buffer && (max == 0))) {
 682                     if (usedBufLen) *usedBufLen = numCharsProcessed;
 683                     return numCharsProcessed;
 684                 }
 685                 rangeLoc += numCharsProcessed;
 686                 totalBytesWritten += numCharsProcessed;
 687             }
 688         }
 689
 690         if (!buffer) max = 0;
 691
 692         // Special case for Foundation. When lossByte == 0xFF && encoding kCFStringEncodingASCII, we do the default ASCII fallback conversion
 693         // Aki 11/24/04 __CFGetASCIICompatibleFlag() is called only for non-ASCII superset encodings. Otherwise, it could lead to a deadlock (see 3890536).
 694         flags = (lossByte ? ((unsigned char)lossByte == 0xFF && encoding == kCFStringEncodingASCII ? kCFStringEncodingAllowLossyConversion : CFStringEncodingLossyByteToMask(lossByte)) : 0) | (generatingExternalFile ? kCFStringEncodingPrependBOM : 0) | (isASCIISuperset ? 0 : __CFGetASCIICompatibleFlag());
 695
 696         if (!cString && (cString = (const unsigned char *)CFStringGetCharactersPtr(string))) { // Must be Unicode string
 697             if (CFStringEncodingIsValidEncoding(encoding)) { // Converter available in CF
 698                 CFStringEncodingUnicodeToBytes(encoding, flags, (const UniChar *)cString + rangeLoc, rangeLen, &numCharsProcessed, buffer, max, &totalBytesWritten);
 699             } else {
 700                 return 0;
 701             }
 702         } else {
 703             UniChar charBuf[kCFCharConversionBufferLength];
 704             CFIndex currentLength;
 705             CFIndex usedLen;
 706             CFIndex lastUsedLen = 0, lastNumChars = 0;
 707             uint32_t result;
 708             Boolean isCFBuiltin = CFStringEncodingIsValidEncoding(encoding);
 709 #define MAX_DECOMP_LEN (6)
 710
 711             while (rangeLen > 0) {
 712                 currentLength = (rangeLen > kCFCharConversionBufferLength ? kCFCharConversionBufferLength : rangeLen);
 713                 CFStringGetCharacters(string, CFRangeMake(rangeLoc, currentLength), charBuf);
 714
 715                 // could be in the middle of surrogate pair; back up.
 716                 if ((rangeLen > kCFCharConversionBufferLength) && CFUniCharIsSurrogateHighCharacter(charBuf[kCFCharConversionBufferLength - 1])) --currentLength;
 717
 718                 if (isCFBuiltin) { // Converter available in CF
 719                     if ((result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, currentLength, &numChars, buffer, max, &usedLen)) != kCFStringEncodingConversionSuccess) {
 720                         if (kCFStringEncodingInvalidInputStream == result) {
 721                             CFRange composedRange;
 722                             // Check the tail
 723                             if ((rangeLen > kCFCharConversionBufferLength) && ((currentLength - numChars) < MAX_DECOMP_LEN)) {
 724                                 composedRange = CFStringGetRangeOfComposedCharactersAtIndex(string, rangeLoc + currentLength);
 725
 726                                 if ((composedRange.length <= MAX_DECOMP_LEN) && (composedRange.location < (rangeLoc + numChars))) {
 727                                     result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, composedRange.location - rangeLoc, &numChars, buffer, max, &usedLen);
 728                                 }
 729                             }
 730
 731                             // Check the head
 732                             if ((kCFStringEncodingConversionSuccess != result) && (lastNumChars > 0) && (numChars < MAX_DECOMP_LEN)) {
 733                                 composedRange = CFStringGetRangeOfComposedCharactersAtIndex(string, rangeLoc);
 734
 735                                 if ((composedRange.length <= MAX_DECOMP_LEN) && (composedRange.location < rangeLoc)) {
 736                                     // Try if the composed range can be converted
 737                                     CFStringGetCharacters(string, composedRange, charBuf);
 738
 739                                     if (CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, composedRange.length, &numChars, NULL, 0, &usedLen) == kCFStringEncodingConversionSuccess) { // OK let's try the last run
 740                                         CFIndex lastRangeLoc = rangeLoc - lastNumChars;
 741
 742                                         currentLength = composedRange.location - lastRangeLoc;
 743                                         CFStringGetCharacters(string, CFRangeMake(lastRangeLoc, currentLength), charBuf);
 744
 745                                         if ((result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, currentLength, &numChars, (max ? buffer - lastUsedLen : NULL), (max ? max + lastUsedLen : 0), &usedLen)) == kCFStringEncodingConversionSuccess) { // OK let's try the last run
 746                                             // Looks good. back up
 747                                             totalBytesWritten -= lastUsedLen;
 748                                             numCharsProcessed -= lastNumChars;
 749
 750                                             rangeLoc = lastRangeLoc;
 751                                             rangeLen += lastNumChars;
 752
 753                                             if (max) {
 754                                                 buffer -= lastUsedLen;
 755                                                 max += lastUsedLen;
 756                                             }
 757                                         }
 758                                     }
 759                                 }
 760                             }
 761                         }
 762
 763                         if (kCFStringEncodingConversionSuccess != result) { // really failed
 764                             totalBytesWritten += usedLen;
 765                             numCharsProcessed += numChars;
 766                             break;
 767                         }
 768                     }
 769                 } else {
 770                     return 0;
 771                 }
 772
 773                 totalBytesWritten += usedLen;
 774                 numCharsProcessed += numChars;
 775
 776                 rangeLoc += numChars;
 777                 rangeLen -= numChars;
 778                 if (max) {
 779                     buffer += usedLen;
 780                     max -= usedLen;
 781                     if (max <= 0) break;
 782                 }
 783                 lastUsedLen = usedLen; lastNumChars = numChars;
 784                 flags &= ~kCFStringEncodingPrependBOM;
 785             }
 786         }
 787     }
 788     if (usedBufLen) *usedBufLen = totalBytesWritten;
 789     return numCharsProcessed;
 790 }
 791
 792 CFStringRef CFStringCreateWithFileSystemRepresentation(CFAllocatorRef alloc, const char *buffer) {
 793     return CFStringCreateWithCString(alloc, buffer, CFStringFileSystemEncoding());
 794 }
 795
 796 CFIndex CFStringGetMaximumSizeOfFileSystemRepresentation(CFStringRef string) {
 797     CFIndex len = CFStringGetLength(string);
 798     CFStringEncoding enc = CFStringGetFastestEncoding(string);
 799     switch (enc) {
 800         case kCFStringEncodingASCII:
 801         case kCFStringEncodingMacRoman:
 802             return len * 3 + 1;
 803         default:
 804             return len * 9 + 1;
 805     }
 806 }
 807
 808 Boolean CFStringGetFileSystemRepresentation(CFStringRef string, char *buffer, CFIndex maxBufLen) {
 809 #if DEPLOYMENT_TARGET_MACOSX
 810 #define MAX_STACK_BUFFER_LEN    (255)
 811     const UTF16Char *characters = CFStringGetCharactersPtr(string);
 812     const char *bufferLimit = buffer + maxBufLen;
 813     CFIndex length = CFStringGetLength(string);
 814     CFIndex usedBufLen;
 815
 816     if (maxBufLen < length) return false; // Since we're using UTF-8, the byte length is never shorter than the char length. Also, it filters out 0 == maxBufLen
 817
 818     if (NULL == characters) {
 819         UTF16Char charactersBuffer[MAX_STACK_BUFFER_LEN];
 820         CFRange range = CFRangeMake(0, 0);
 821         const char *bytes = CFStringGetCStringPtr(string, __CFStringGetEightBitStringEncoding());
 822
 823         if (NULL != bytes) {
 824             const char *originalBytes = bytes;
 825             const char *bytesLimit = bytes + length;
 826
 827             while ((bytes < bytesLimit) && (buffer < bufferLimit) && (0 == (*bytes & 0x80))) *(buffer++) = *(bytes++);
 828
 829             range.location = bytes - originalBytes;
 830         }
 831         while ((range.location < length) && (buffer < bufferLimit)) {
 832             range.length = length - range.location;
 833             if (range.length > MAX_STACK_BUFFER_LEN) range.length = MAX_STACK_BUFFER_LEN;
 834
 835             CFStringGetCharacters(string, range, charactersBuffer);
 836             if ((range.length == MAX_STACK_BUFFER_LEN) && CFUniCharIsSurrogateHighCharacter(charactersBuffer[MAX_STACK_BUFFER_LEN - 1])) --range.length; // Backup for a high surrogate
 837
 838             if (!CFUniCharDecompose(charactersBuffer, range.length, NULL, (void *)buffer, bufferLimit - buffer, &usedBufLen, true, kCFUniCharUTF8Format, true)) return false;
 839
 840             buffer += usedBufLen;
 841             range.location += range.length;
 842         }
 843     } else {
 844         if (!CFUniCharDecompose(characters, length, NULL, (void *)buffer, maxBufLen, &usedBufLen, true, kCFUniCharUTF8Format, true)) return false;
 845         buffer += usedBufLen;
 846     }
 847
 848     if (buffer < bufferLimit) { // Since the filename has its own limit, this is ok for now
 849         *buffer = '\0';
 850         return true;
 851     } else {
 852         return false;
 853     }
 854 #else __MACH__
 855     return CFStringGetCString(string, buffer, maxBufLen, CFStringFileSystemEncoding());
 856 #endif __MACH__
 857 }
 858
 859 Boolean _CFStringGetFileSystemRepresentation(CFStringRef string, uint8_t *buffer, CFIndex maxBufLen) {
 860     return CFStringGetFileSystemRepresentation(string, (char *)buffer, maxBufLen);
 861 }
 862