String.subproj/CFStringEncodings.c

   1 /*
   2  * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 /*      CFStringEncodings.c
  24         Copyright 1999-2002, Apple, Inc. All rights reserved.
  25         Responsibility: Aki Inoue
  26 */
  27
  28 #include "CFInternal.h"
  29 #include <CoreFoundation/CFString.h>
  30 #include <CoreFoundation/CFByteOrder.h>
  31 #include "CFUtilitiesPriv.h"
  32 #include <string.h>
  33 #include "CFStringEncodingConverterExt.h"
  34 #include "CFUniChar.h"
  35 #include "CFUnicodeDecomposition.h"
  36
  37 static UInt32 __CFWantsToUseASCIICompatibleConversion = (UInt32)-1;
  38 CF_INLINE UInt32 __CFGetASCIICompatibleFlag(void) {
  39     if (__CFWantsToUseASCIICompatibleConversion == (UInt32)-1) {
  40         __CFWantsToUseASCIICompatibleConversion = false;
  41     }
  42     return (__CFWantsToUseASCIICompatibleConversion ? kCFStringEncodingASCIICompatibleConversion : 0);
  43 }
  44
  45 void _CFStringEncodingSetForceASCIICompatibility(Boolean flag) {
  46     __CFWantsToUseASCIICompatibleConversion = (flag ? (UInt32)true : (UInt32)false);
  47 }
  48
  49 Boolean (*__CFCharToUniCharFunc)(UInt32 flags, uint8_t ch, UniChar *unicodeChar) = NULL;
  50
  51 // To avoid early initialization issues, we just initialize this here
  52 // This should not be const as it is changed
  53 UniChar __CFCharToUniCharTable[256] = {
  54   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
  55  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
  56  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
  57  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
  58  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
  59  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
  60  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
  61 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
  62 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
  63 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
  64 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
  65 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
  66 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
  67 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
  68 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
  69 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
  70 };
  71
  72 void __CFSetCharToUniCharFunc(Boolean (*func)(UInt32 flags, UInt8 ch, UniChar *unicodeChar)) {
  73     if (__CFCharToUniCharFunc != func) {
  74         int ch;
  75         __CFCharToUniCharFunc = func;
  76         if (func) {
  77             for (ch = 128; ch < 256; ch++) {
  78                 UniChar uch;
  79                 __CFCharToUniCharTable[ch] = (__CFCharToUniCharFunc(0, ch, &uch) ? uch : 0xFFFD);
  80             }
  81         } else {        // If we have no __CFCharToUniCharFunc, assume 128..255 return the value as-is
  82             for (ch = 128; ch < 256; ch++) __CFCharToUniCharTable[ch] = ch;
  83         }
  84     }
  85 }
  86
  87 __private_extern__ void __CFStrConvertBytesToUnicode(const uint8_t *bytes, UniChar *buffer, CFIndex numChars) {
  88     CFIndex idx;
  89     for (idx = 0; idx < numChars; idx++) buffer[idx] = __CFCharToUniCharTable[bytes[idx]];
  90 }
  91
  92
   93 /* The minimum length the output buffers should be in the above functions.
   94    NOTE(review): the unit (bytes vs. UniChars) is not stated here, and the constant is not referenced in the code visible in this part of the file -- confirm against its users. */
   95 #define kCFCharConversionBufferLength 512
   96
   97 /* Capacity of buffer->localBuffer, counted in 8-bit chars and in UniChars respectively. Both macros are sizeof expressions and require a variable named `buffer` to be in scope where they are used. */
   98 #define MAX_LOCAL_CHARS         (sizeof(buffer->localBuffer) / sizeof(uint8_t))
   99 #define MAX_LOCAL_UNICHARS      (sizeof(buffer->localBuffer) / sizeof(UniChar))
 100
 101 /* Convert a byte stream to ASCII (7-bit!) or Unicode, with a CFVarWidthCharBuffer struct on the stack. false return indicates an error occured during the conversion. The caller needs to free the returned buffer in either ascii or unicode (indicated by isASCII), if shouldFreeChars is true.
 102 9/18/98 __CFStringDecodeByteStream now avoids to allocate buffer if buffer->chars is not NULL
 103 Added useClientsMemoryPtr; if not-NULL, and the provided memory can be used as is, this is set to true
 104 __CFStringDecodeByteStream2() is kept around for any internal clients who might be using it; it should be deprecated
 105 !!! converterFlags is only used for the UTF8 converter at this point
 106 */
 107 Boolean __CFStringDecodeByteStream2(const uint8_t *bytes, UInt32 len, CFStringEncoding encoding, Boolean alwaysUnicode, CFVarWidthCharBuffer *buffer, Boolean *useClientsMemoryPtr) {
 108     return __CFStringDecodeByteStream3(bytes, len, encoding, alwaysUnicode, buffer, useClientsMemoryPtr, 0);
 109 }
 110
 111 enum {
 112     __NSNonLossyErrorMode = -1,
 113     __NSNonLossyASCIIMode = 0,
 114     __NSNonLossyBackslashMode = 1,
 115     __NSNonLossyHexInitialMode = __NSNonLossyBackslashMode + 1,
 116     __NSNonLossyHexFinalMode = __NSNonLossyHexInitialMode + 4,
 117     __NSNonLossyOctalInitialMode = __NSNonLossyHexFinalMode + 1,
 118     __NSNonLossyOctalFinalMode = __NSNonLossyHexFinalMode + 3
 119 };
 120
 121 Boolean __CFStringDecodeByteStream3(const uint8_t *bytes, UInt32 len, CFStringEncoding encoding, Boolean alwaysUnicode, CFVarWidthCharBuffer *buffer, Boolean *useClientsMemoryPtr, UInt32 converterFlags) {
 122
 123     if (useClientsMemoryPtr) *useClientsMemoryPtr = false;
 124
 125     buffer->isASCII = !alwaysUnicode;
 126     buffer->shouldFreeChars = false;
 127     buffer->numChars = 0;
 128
 129     if (0 == len) return true;
 130
 131     buffer->allocator = (buffer->allocator ? buffer->allocator : __CFGetDefaultAllocator());
 132
 133     if ((encoding == kCFStringEncodingUTF16) || (encoding == kCFStringEncodingUTF16BE) || (encoding == kCFStringEncodingUTF16LE)) { // UTF-16
 134         const UTF16Char *src = (const UTF16Char *)bytes;
 135         const UTF16Char *limit = (const UTF16Char *)(bytes + len);
 136         bool swap = false;
 137
 138         if (kCFStringEncodingUTF16 == encoding) {
 139             UTF16Char bom = ((*src == 0xFFFE) || (*src == 0xFEFF) ? *(src++) : 0);
 140
 141 #if defined(__BIG_ENDIAN__)
 142             if (bom == 0xFFFE) swap = true;
 143 #else
 144             if (bom != 0xFEFF) swap = true;
 145 #endif
 146             if (bom) useClientsMemoryPtr = NULL;
 147         } else {
 148 #if defined(__BIG_ENDIAN__)
 149             if (kCFStringEncodingUTF16LE == encoding) swap = true;
 150 #else
 151             if (kCFStringEncodingUTF16BE == encoding) swap = true;
 152 #endif
 153         }
 154
 155         buffer->numChars = limit - src;
 156
 157         if (useClientsMemoryPtr && !swap) { // If the caller is ready to deal with no-copy situation, and the situation is possible, indicate it...
 158             *useClientsMemoryPtr = true;
 159             buffer->chars.unicode = (UniChar *)src;
 160             buffer->isASCII = false;
 161         } else {
 162             if (buffer->isASCII) {      // Let's see if we can reduce the Unicode down to ASCII...
 163                 const UTF16Char *characters = src;
 164                 UTF16Char mask = (swap ? 0x80FF : 0xFF80);
 165
 166                 while (characters < limit) {
 167                     if (*(characters++) & mask) {
 168                         buffer->isASCII = false;
 169                         break;
 170                     }
 171                 }
 172             }
 173
 174             if (buffer->isASCII) {
 175                 uint8_t *dst;
 176                 if (NULL == buffer->chars.ascii) { // we never reallocate when buffer is supplied
 177                     if (buffer->numChars > MAX_LOCAL_CHARS) {
 178                         buffer->chars.ascii = CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(uint8_t)), 0);
 179                         buffer->shouldFreeChars = true;
 180                     } else {
 181                         buffer->chars.ascii = (uint8_t *)buffer->localBuffer;
 182                     }
 183                 }
 184                 dst = buffer->chars.ascii;
 185
 186                 if (swap) {
 187                     while (src < limit) *(dst++) = (*(src++) >> 8);
 188                 } else {
 189                     while (src < limit) *(dst++) = *(src++);
 190                 }
 191             } else {
 192                 UTF16Char *dst;
 193
 194                 if (NULL == buffer->chars.unicode) { // we never reallocate when buffer is supplied
 195                     if (buffer->numChars > MAX_LOCAL_UNICHARS) {
 196                         buffer->chars.unicode = CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(UTF16Char)), 0);
 197                         buffer->shouldFreeChars = true;
 198                     } else {
 199                         buffer->chars.unicode = (UTF16Char *)buffer->localBuffer;
 200                     }
 201                 }
 202                 dst = buffer->chars.unicode;
 203
 204                 if (swap) {
 205                     while (src < limit) *(dst++) = CFSwapInt16(*(src++));
 206                 } else {
 207                     memmove(dst, src, buffer->numChars * sizeof(UTF16Char));
 208                 }
 209             }
 210         }
 211     } else if ((encoding == kCFStringEncodingUTF32) || (encoding == kCFStringEncodingUTF32BE) || (encoding == kCFStringEncodingUTF32LE)) {
 212         const UTF32Char *src = (const UTF32Char *)bytes;
 213         const UTF32Char *limit = (const UTF32Char *)(bytes + len);
 214         bool swap = false;
 215
 216         if (kCFStringEncodingUTF32 == encoding) {
 217             UTF32Char bom = ((*src == 0xFFFE0000) || (*src == 0x0000FEFF) ? *(src++) : 0);
 218
 219 #if defined(__BIG_ENDIAN__)
 220             if (bom == 0xFFFE0000) swap = true;
 221 #else
 222             if (bom != 0x0000FEFF) swap = true;
 223 #endif
 224         } else {
 225 #if defined(__BIG_ENDIAN__)
 226             if (kCFStringEncodingUTF32LE == encoding) swap = true;
 227 #else
 228             if (kCFStringEncodingUTF32BE == encoding) swap = true;
 229 #endif
 230         }
 231
 232         buffer->numChars = limit - src;
 233
 234         {
 235             // Let's see if we have non-ASCII or non-BMP
 236             const UTF32Char *characters = src;
 237             UTF32Char asciiMask = (swap ? 0x80FFFFFF : 0xFFFFFF80);
 238             UTF32Char bmpMask = (swap ? 0x0000FFFF : 0xFFFF0000);
 239
 240             while (characters < limit) {
 241                 if (*characters & asciiMask) {
 242                     buffer->isASCII = false;
 243                     if (*characters & bmpMask) ++(buffer->numChars);
 244                 }
 245                 ++characters;
 246             }
 247         }
 248
 249         if (buffer->isASCII) {
 250             uint8_t *dst;
 251             if (NULL == buffer->chars.ascii) { // we never reallocate when buffer is supplied
 252                 if (buffer->numChars > MAX_LOCAL_CHARS) {
 253                     buffer->chars.ascii = CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(uint8_t)), 0);
 254                     buffer->shouldFreeChars = true;
 255                 } else {
 256                     buffer->chars.ascii = (uint8_t *)buffer->localBuffer;
 257                 }
 258             }
 259             dst = buffer->chars.ascii;
 260
 261             if (swap) {
 262                 while (src < limit) *(dst++) = (*(src++) >> 24);
 263             } else {
 264                 while (src < limit) *(dst++) = *(src++);
 265             }
 266         } else {
 267             if (NULL == buffer->chars.unicode) { // we never reallocate when buffer is supplied
 268                 if (buffer->numChars > MAX_LOCAL_UNICHARS) {
 269                     buffer->chars.unicode = CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(UTF16Char)), 0);
 270                     buffer->shouldFreeChars = true;
 271                 } else {
 272                     buffer->chars.unicode = (UTF16Char *)buffer->localBuffer;
 273                 }
 274             }
 275             CFUniCharFromUTF32(src, limit - src, buffer->chars.unicode, false,
 276 #if defined(__BIG_ENDIAN__)
 277             !swap
 278 #else
 279             swap
 280 #endif
 281             );
 282         }
 283     } else {
 284         UInt32 idx;
 285         const uint8_t *chars = (const uint8_t *)bytes;
 286         const uint8_t *end = chars + len;
 287
 288         switch (encoding) {
 289         case kCFStringEncodingNonLossyASCII: {
 290             UTF16Char currentValue = 0;
 291             uint8_t character;
 292             int8_t mode = __NSNonLossyASCIIMode;
 293
 294             buffer->isASCII = false;
 295             buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
 296             buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
 297             buffer->numChars = 0;
 298
 299             while (chars < end) {
 300                 character = (*chars++);
 301
 302                 switch (mode) {
 303                 case __NSNonLossyASCIIMode:
 304                     if (character == '\\') {
 305                         mode = __NSNonLossyBackslashMode;
 306                     } else if (character < 0x80) {
 307                         currentValue = character;
 308                     } else {
 309                         mode = __NSNonLossyErrorMode;
 310                     }
 311                     break;
 312
 313                 case __NSNonLossyBackslashMode:
 314                     if ((character == 'U') || (character == 'u')) {
 315                         mode = __NSNonLossyHexInitialMode;
 316                         currentValue = 0;
 317                     } else if ((character >= '0') && (character <= '9')) {
 318                         mode = __NSNonLossyOctalInitialMode;
 319                         currentValue = character - '0';
 320                     } else if (character == '\\') {
 321                         mode = __NSNonLossyASCIIMode;
 322                         currentValue = character;
 323                     } else {
 324                         mode = __NSNonLossyErrorMode;
 325                     }
 326                     break;
 327
 328                 default:
 329                     if (mode < __NSNonLossyHexFinalMode) {
 330                         if ((character >= '0') && (character <= '9')) {
 331                             currentValue = (currentValue << 4) | (character - '0');
 332                             if (++mode == __NSNonLossyHexFinalMode) mode = __NSNonLossyASCIIMode;
 333                         } else {
 334                             if (character >= 'a') character -= ('a' - 'A');
 335                             if ((character >= 'A') && (character <= 'F')) {
 336                                 currentValue = (currentValue << 4) | ((character - 'A') + 10);
 337                                 if (++mode == __NSNonLossyHexFinalMode) mode = __NSNonLossyASCIIMode;
 338                             } else {
 339                                 mode = __NSNonLossyErrorMode;
 340                             }
 341                         }
 342                     } else {
 343                         if ((character >= '0') && (character <= '9')) {
 344                             currentValue = (currentValue << 3) | (character - '0');
 345                             if (++mode == __NSNonLossyOctalFinalMode) mode = __NSNonLossyASCIIMode;
 346                         } else {
 347                             mode = __NSNonLossyErrorMode;
 348                         }
 349                     }
 350                     break;
 351                 }
 352
 353                 if (mode == __NSNonLossyASCIIMode) {
 354                     buffer->chars.unicode[buffer->numChars++] = currentValue;
 355                 } else if (mode == __NSNonLossyErrorMode) {
 356                     return false;
 357                 }
 358             }
 359             return (mode == __NSNonLossyASCIIMode);
 360         }
 361
 362         case kCFStringEncodingUTF8:
 363             if ((len >= 3) && (chars[0] == 0xef) && (chars[1] == 0xbb) && (chars[2] == 0xbf)) { // If UTF8 BOM, skip
 364                 chars += 3;
 365                 len -= 3;
 366                 if (0 == len) return true;
 367             }
 368             if (buffer->isASCII) {
 369                 for (idx = 0; idx < len; idx++) {
 370                     if (128 <= chars[idx]) {
 371                         buffer->isASCII = false;
 372                         break;
 373                     }
 374                 }
 375             }
 376             if (buffer->isASCII) {
 377                 buffer->numChars = len;
 378                 buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
 379                 buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
 380                 memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
 381             } else {
 382                 UInt32 numDone;
 383                 static CFStringEncodingToUnicodeProc __CFFromUTF8 = NULL;
 384
 385                 if (!__CFFromUTF8) {
 386                     const CFStringEncodingConverter *converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
 387                     __CFFromUTF8 = (CFStringEncodingToUnicodeProc)converter->toUnicode;
 388                 }
 389
 390                 buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
 391                 buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
 392                 buffer->numChars = 0;
 393                 while (chars < end) {
 394                     numDone = 0;
 395                     chars += __CFFromUTF8(converterFlags, chars, end - chars, &(buffer->chars.unicode[buffer->numChars]), len - buffer->numChars, &numDone);
 396
 397                     if (0 == numDone) {
 398                         if (buffer->shouldFreeChars) CFAllocatorDeallocate(buffer->allocator, buffer->chars.unicode);
 399                         buffer->isASCII = !alwaysUnicode;
 400                         buffer->shouldFreeChars = false;
 401                         buffer->chars.ascii = NULL;
 402                         buffer->numChars = 0;
 403                         return false;
 404                     }
 405                     buffer->numChars += numDone;
 406                 }
 407             }
 408             break;
 409
 410         default:
 411             if (CFStringEncodingIsValidEncoding(encoding)) {
 412                 const CFStringEncodingConverter *converter = CFStringEncodingGetConverter(encoding);
 413                 Boolean isASCIISuperset = __CFStringEncodingIsSupersetOfASCII(encoding);
 414
 415                 if (!converter) return false;
 416
 417                 if (!isASCIISuperset) buffer->isASCII = false;
 418
 419                 if (buffer->isASCII) {
 420                     for (idx = 0; idx < len; idx++) {
 421                         if (128 <= chars[idx]) {
 422                             buffer->isASCII = false;
 423                             break;
 424                         }
 425                     }
 426                 }
 427
 428                 if (converter->encodingClass == kCFStringEncodingConverterCheapEightBit) {
 429                     if (buffer->isASCII) {
 430                         buffer->numChars = len;
 431                         buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
 432                         buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
 433                         memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
 434                     } else {
 435                         buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
 436                         buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
 437                         buffer->numChars = len;
 438                         if (kCFStringEncodingASCII == encoding || kCFStringEncodingISOLatin1 == encoding) {
 439                             for (idx = 0; idx < len; idx++) buffer->chars.unicode[idx] = (UniChar)chars[idx];
 440                         } else {
 441                             for (idx = 0; idx < len; idx++)
 442                                 if (chars[idx] < 0x80 && isASCIISuperset)
 443                                     buffer->chars.unicode[idx] = (UniChar)chars[idx];
 444                                 else if (!((CFStringEncodingCheapEightBitToUnicodeProc)converter->toUnicode)(0, chars[idx], buffer->chars.unicode + idx))
 445                                     return false;
 446                         }
 447                     }
 448                 } else {
 449                     if (buffer->isASCII) {
 450                         buffer->numChars = len;
 451                         buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
 452                         buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
 453                         memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
 454                     } else {
 455                         UInt32 guessedLength = CFStringEncodingCharLengthForBytes(encoding, 0, bytes, len);
 456                         static UInt32 lossyFlag = (UInt32)-1;
 457
 458                         buffer->shouldFreeChars = !buffer->chars.unicode && (guessedLength <= MAX_LOCAL_UNICHARS) ? false : true;
 459                         buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (guessedLength <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, guessedLength * sizeof(UniChar), 0));
 460
 461                         if (lossyFlag == (UInt32)-1) lossyFlag = (_CFExecutableLinkedOnOrAfter(CFSystemVersionPanther) ? 0 : kCFStringEncodingAllowLossyConversion);
 462
 463                         if (CFStringEncodingBytesToUnicode(encoding, lossyFlag|__CFGetASCIICompatibleFlag(), bytes, len, NULL, buffer->chars.unicode, (guessedLength > MAX_LOCAL_UNICHARS ? guessedLength : MAX_LOCAL_UNICHARS), &(buffer->numChars))) {
 464                             if (buffer->shouldFreeChars) CFAllocatorDeallocate(buffer->allocator, buffer->chars.unicode);
 465                             buffer->isASCII = !alwaysUnicode;
 466                             buffer->shouldFreeChars = false;
 467                             buffer->chars.ascii = NULL;
 468                             buffer->numChars = 0;
 469                             return false;
 470                         }
 471                     }
 472                 }
 473             } else {
 474                 return false;
 475             }
 476         }
 477     }
 478
 479     return true;
 480 }
 481
 482
 483 /* Create a byte stream from a CFString backing. Can convert a string piece at a time
 484    into a fixed size buffer. Returns number of characters converted.
 485    Characters that cannot be converted to the specified encoding are represented
 486    with the char specified by lossByte; if 0, then lossy conversion is not allowed
 487    and conversion stops, returning partial results.
  488    Pass buffer==NULL if you don't care about the converted string (but just the convertibility,
 489    or number of bytes required, indicated by usedBufLen).
 490    Does not zero-terminate. If you want to create Pascal or C string, allow one extra byte at start or end.
 491
 492    Note: This function is intended to work through CFString functions, so it should work
 493    with NSStrings as well as CFStrings.
 494 */
 495 CFIndex __CFStringEncodeByteStream(CFStringRef string, CFIndex rangeLoc, CFIndex rangeLen, Boolean generatingExternalFile, CFStringEncoding encoding, char lossByte, uint8_t *buffer, CFIndex max, CFIndex *usedBufLen) {
 496     CFIndex totalBytesWritten = 0;      /* Number of written bytes */
 497     CFIndex numCharsProcessed = 0;      /* Number of processed chars */
 498     const UniChar *unichars;
 499
 500     if (encoding == kCFStringEncodingUTF8 && (unichars = CFStringGetCharactersPtr(string))) {
 501         static CFStringEncodingToBytesProc __CFToUTF8 = NULL;
 502
 503         if (!__CFToUTF8) {
 504             const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
 505             __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes;
 506         }
 507         numCharsProcessed = __CFToUTF8((generatingExternalFile ? kCFStringEncodingPrependBOM : 0), unichars + rangeLoc, rangeLen, buffer, (buffer ? max : 0), &totalBytesWritten);
 508
 509     } else if (encoding == kCFStringEncodingNonLossyASCII) {
 510         const char *hex = "0123456789abcdef";
 511         UniChar ch;
 512         CFStringInlineBuffer buf;
 513         CFStringInitInlineBuffer(string, &buf, CFRangeMake(rangeLoc, rangeLen));
 514         while (numCharsProcessed < rangeLen) {
 515             CFIndex reqLength; /* Required number of chars to encode this UniChar */
 516             CFIndex cnt;
 517             char tmp[6];
 518             ch = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed);
 519             if ((ch >= ' ' && ch <= '~' && ch != '\\') || (ch == '\n' || ch == '\r' || ch == '\t')) {
 520                 reqLength = 1;
 521                 tmp[0] = ch;
 522             } else {
 523                 if (ch == '\\') {
 524                     tmp[1] = '\\';
 525                     reqLength = 2;
 526                 } else if (ch < 256) {  /* \nnn; note that this is not NEXTSTEP encoding but a (small) UniChar */
 527                     tmp[1] = '0' + (ch >> 6);
 528                     tmp[2] = '0' + ((ch >> 3) & 7);
 529                     tmp[3] = '0' + (ch & 7);
 530                     reqLength = 4;
 531                 } else {        /* \Unnnn */
 532                     tmp[1] = 'u'; // Changed to small+u in order to be aligned with Java
 533                     tmp[2] = hex[(ch >> 12) & 0x0f];
 534                     tmp[3] = hex[(ch >> 8) & 0x0f];
 535                     tmp[4] = hex[(ch >> 4) & 0x0f];
 536                     tmp[5] = hex[ch & 0x0f];
 537                     reqLength = 6;
 538                 }
 539                 tmp[0] = '\\';
 540             }
 541             if (buffer) {
 542                 if (totalBytesWritten + reqLength > max) break; /* Doesn't fit..
 543 .*/
 544                 for (cnt = 0; cnt < reqLength; cnt++) {
 545                     buffer[totalBytesWritten + cnt] = tmp[cnt];
 546                 }
 547             }
 548             totalBytesWritten += reqLength;
 549             numCharsProcessed++;
 550         }
 551     } else if ((encoding == kCFStringEncodingUTF16) || (encoding == kCFStringEncodingUTF16BE) || (encoding == kCFStringEncodingUTF16LE)) {
 552         CFIndex extraForBOM = (generatingExternalFile && (encoding == kCFStringEncodingUTF16) ? sizeof(UniChar) : 0);
 553         numCharsProcessed = rangeLen;
 554         if (buffer && (numCharsProcessed * (CFIndex)sizeof(UniChar) + extraForBOM > max)) {
 555             numCharsProcessed = (max > extraForBOM) ? ((max - extraForBOM) / sizeof(UniChar)) : 0;
 556         }
 557         totalBytesWritten = (numCharsProcessed * sizeof(UniChar)) + extraForBOM;
 558         if (buffer) {
 559             if (extraForBOM) {  /* Generate BOM */
 560 #if defined(__BIG_ENDIAN__)
 561                 *buffer++ = 0xfe; *buffer++ = 0xff;
 562 #else
 563                 *buffer++ = 0xff; *buffer++ = 0xfe;
 564 #endif
 565             }
 566             CFStringGetCharacters(string, CFRangeMake(rangeLoc, numCharsProcessed), (UniChar *)buffer);
 567             if (
 568 #if defined(__BIG_ENDIAN__)
 569                 kCFStringEncodingUTF16LE
 570 #else
 571                 kCFStringEncodingUTF16BE
 572 #endif
 573                 == encoding) { // Need to swap
 574                 UTF16Char *characters = (UTF16Char *)buffer;
 575                 const UTF16Char *limit = characters + numCharsProcessed;
 576
 577                 while (characters < limit) {
 578                     *characters = CFSwapInt16(*characters);
 579                     ++characters;
 580                 }
 581             }
 582         }
 583     } else if ((encoding == kCFStringEncodingUTF32) || (encoding == kCFStringEncodingUTF32BE) || (encoding == kCFStringEncodingUTF32LE)) {
 584         UTF32Char character;
 585         CFStringInlineBuffer buf;
 586         UTF32Char *characters = (UTF32Char *)buffer;
 587
 588 #if defined(__BIG_ENDIAN__)
 589         bool swap = (encoding == kCFStringEncodingUTF32LE ? true : false);
 590 #else
 591         bool swap = (encoding == kCFStringEncodingUTF32BE ? true : false);
 592 #endif
 593
 594         if (generatingExternalFile && (encoding == kCFStringEncodingUTF32)) {
 595             totalBytesWritten += sizeof(UTF32Char);
 596             if (characters) {
 597                 if (totalBytesWritten > max) { // insufficient buffer
 598                     totalBytesWritten = 0;
 599                 } else {
 600 #if defined(__BIG_ENDIAN__)
 601                     *(characters++) = 0x0000FEFF;
 602 #else
 603                     *(characters++) = 0xFFFE0000;
 604 #endif
 605                 }
 606             }
 607         }
 608
 609         CFStringInitInlineBuffer(string, &buf, CFRangeMake(rangeLoc, rangeLen));
 610         while (numCharsProcessed < rangeLen) {
 611             character = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed);
 612
 613             if (CFUniCharIsSurrogateHighCharacter(character)) {
 614                 UTF16Char otherCharacter;
 615
 616                 if (((numCharsProcessed + 1) < rangeLen) && CFUniCharIsSurrogateLowCharacter((otherCharacter = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed + 1)))) {
 617                     character = CFUniCharGetLongCharacterForSurrogatePair(character, otherCharacter);
 618                 } else if (lossByte) {
 619                     character = lossByte;
 620                 } else {
 621                     break;
 622                 }
 623             } else if (CFUniCharIsSurrogateLowCharacter(character)) {
 624                 if (lossByte) {
 625                     character = lossByte;
 626                 } else {
 627                     break;
 628                 }
 629             }
 630
 631             totalBytesWritten += sizeof(UTF32Char);
 632
 633             if (characters) {
 634                 if (totalBytesWritten > max) {
 635                     totalBytesWritten -= sizeof(UTF32Char);
 636                     break;
 637                 }
 638                 *(characters++) = (swap ? CFSwapInt32(character) : character);
 639             }
 640
 641             numCharsProcessed += (character > 0xFFFF ? 2 : 1);
 642         }
 643     } else {
 644         CFIndex numChars;
 645         UInt32 flags;
 646         const unsigned char *cString = NULL;
 647         BOOL isASCIISuperset = __CFStringEncodingIsSupersetOfASCII(encoding);
 648
 649         if (!CF_IS_OBJC(CFStringGetTypeID(), string) && isASCIISuperset) { // Checking for NSString to avoid infinite recursion
 650             const unsigned char *ptr;
 651             if ((cString = CFStringGetCStringPtr(string, __CFStringGetEightBitStringEncoding()))) {
 652                 ptr = (cString += rangeLoc);
 653                 if (__CFStringGetEightBitStringEncoding() == encoding) {
 654                     numCharsProcessed = (rangeLen < max || buffer == NULL ? rangeLen : max);
 655                     if (buffer) memmove(buffer, cString, numCharsProcessed);
 656                     if (usedBufLen) *usedBufLen = numCharsProcessed;
 657                     return numCharsProcessed;
 658                 }
 659                 while (*ptr < 0x80 && rangeLen > 0) {
 660                     ++ptr;
 661                     --rangeLen;
 662                 }
 663                 numCharsProcessed = ptr - cString;
 664                 if (buffer) {
 665                     numCharsProcessed = (numCharsProcessed < max ? numCharsProcessed : max);
 666                     memmove(buffer, cString, numCharsProcessed);
 667                     buffer += numCharsProcessed;
 668                     max -= numCharsProcessed;
 669                 }
 670                 if (!rangeLen || (buffer && (max == 0))) {
 671                     if (usedBufLen) *usedBufLen = numCharsProcessed;
 672                     return numCharsProcessed;
 673                 }
 674                 rangeLoc += numCharsProcessed;
 675                 totalBytesWritten += numCharsProcessed;
 676             }
 677             if (!cString && (cString = CFStringGetPascalStringPtr(string, __CFStringGetEightBitStringEncoding()))) {
 678                 ptr = (cString += (rangeLoc + 1));
 679                 if (__CFStringGetEightBitStringEncoding() == encoding) {
 680                     numCharsProcessed = (rangeLen < max || buffer == NULL ? rangeLen : max);
 681                     if (buffer) memmove(buffer, cString, numCharsProcessed);
 682                     if (usedBufLen) *usedBufLen = numCharsProcessed;
 683                     return numCharsProcessed;
 684                 }
 685                 while (*ptr < 0x80 && rangeLen > 0) {
 686                     ++ptr;
 687                     --rangeLen;
 688                 }
 689                 numCharsProcessed = ptr - cString;
 690                 if (buffer) {
 691                     numCharsProcessed = (numCharsProcessed < max ? numCharsProcessed : max);
 692                     memmove(buffer, cString, numCharsProcessed);
 693                     buffer += numCharsProcessed;
 694                     max -= numCharsProcessed;
 695                 }
 696                 if (!rangeLen || (buffer && (max == 0))) {
 697                     if (usedBufLen) *usedBufLen = numCharsProcessed;
 698                     return numCharsProcessed;
 699                 }
 700                 rangeLoc += numCharsProcessed;
 701                 totalBytesWritten += numCharsProcessed;
 702             }
 703         }
 704
 705         if (!buffer) max = 0;
 706
 707         // Special case for Foundation. When lossByte == 0xFF && encoding kCFStringEncodingASCII, we do the default ASCII fallback conversion
 708         // Aki 11/24/04 __CFGetASCIICompatibleFlag() is called only for non-ASCII superset encodings. Otherwise, it could lead to a deadlock (see 3890536).
 709         flags = (lossByte ? ((unsigned char)lossByte == 0xFF && encoding == kCFStringEncodingASCII ? kCFStringEncodingAllowLossyConversion : CFStringEncodingLossyByteToMask(lossByte)) : 0) | (generatingExternalFile ? kCFStringEncodingPrependBOM : 0) | (isASCIISuperset ? 0 : __CFGetASCIICompatibleFlag());
 710
 711         if (!cString && (cString = (const char*)CFStringGetCharactersPtr(string))) { // Must be Unicode string
 712             if (CFStringEncodingIsValidEncoding(encoding)) { // Converter available in CF
 713                 CFStringEncodingUnicodeToBytes(encoding, flags, (const UniChar*)cString + rangeLoc, rangeLen, &numCharsProcessed, buffer, max, &totalBytesWritten);
 714             } else {
 715                 return 0;
 716             }
 717         } else {
 718             UniChar charBuf[kCFCharConversionBufferLength];
 719             UInt32 currentLength;
 720             UInt32 usedLen;
 721             uint32_t lastUsedLen = 0, lastNumChars = 0;
 722             uint32_t result;
 723             Boolean isCFBuiltin = CFStringEncodingIsValidEncoding(encoding);
 724 #define MAX_DECOMP_LEN (6)
 725
 726             while (rangeLen > 0) {
 727                 currentLength = (rangeLen > kCFCharConversionBufferLength ? kCFCharConversionBufferLength : rangeLen);
 728                 CFStringGetCharacters(string, CFRangeMake(rangeLoc, currentLength), charBuf);
 729
 730                 // could be in the middle of surrogate pair; back up.
 731                 if ((rangeLen > kCFCharConversionBufferLength) && CFUniCharIsSurrogateHighCharacter(charBuf[kCFCharConversionBufferLength - 1])) --currentLength;
 732
 733                 if (isCFBuiltin) { // Converter available in CF
 734                     if ((result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, currentLength, &numChars, buffer, max, &usedLen)) != kCFStringEncodingConversionSuccess) {
 735                         if (kCFStringEncodingInvalidInputStream == result) {
 736                             CFRange composedRange;
 737                             // Check the tail
 738                             if ((rangeLen > kCFCharConversionBufferLength) && ((currentLength - numChars) < MAX_DECOMP_LEN)) {
 739                                 composedRange = CFStringGetRangeOfComposedCharactersAtIndex(string, rangeLoc + currentLength);
 740
 741                                 if ((composedRange.length <= MAX_DECOMP_LEN) && (composedRange.location < (rangeLoc + numChars))) {
 742                                     result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, composedRange.location - rangeLoc, &numChars, buffer, max, &usedLen);
 743                                 }
 744                             }
 745
 746                             // Check the head
 747                             if ((kCFStringEncodingConversionSuccess != result) && (lastNumChars > 0) && (numChars < MAX_DECOMP_LEN)) {
 748                                 composedRange = CFStringGetRangeOfComposedCharactersAtIndex(string, rangeLoc);
 749
 750                                 if ((composedRange.length <= MAX_DECOMP_LEN) && (composedRange.location < rangeLoc)) {
 751                                     // Try if the composed range can be converted
 752                                     CFStringGetCharacters(string, composedRange, charBuf);
 753
 754                                     if (CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, composedRange.length, &numChars, NULL, 0, &usedLen) == kCFStringEncodingConversionSuccess) { // OK let's try the last run
 755                                         CFIndex lastRangeLoc = rangeLoc - lastNumChars;
 756
 757                                         currentLength = composedRange.location - lastRangeLoc;
 758                                         CFStringGetCharacters(string, CFRangeMake(lastRangeLoc, currentLength), charBuf);
 759
 760                                         if ((result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, currentLength, &numChars, (max ? buffer - lastUsedLen : NULL), (max ? max + lastUsedLen : 0), &usedLen)) == kCFStringEncodingConversionSuccess) { // OK let's try the last run
 761                                             // Looks good. back up
 762                                             totalBytesWritten -= lastUsedLen;
 763                                             numCharsProcessed -= lastNumChars;
 764
 765                                             rangeLoc = lastRangeLoc;
 766                                             rangeLen += lastNumChars;
 767
 768                                             if (max) {
 769                                                 buffer -= lastUsedLen;
 770                                                 max += lastUsedLen;
 771                                             }
 772                                         }
 773                                     }
 774                                 }
 775                             }
 776                         }
 777
 778                         if (kCFStringEncodingConversionSuccess != result) { // really failed
 779                             totalBytesWritten += usedLen;
 780                             numCharsProcessed += numChars;
 781                             break;
 782                         }
 783                     }
 784                 } else {
 785                     return 0;
 786                 }
 787
 788                 totalBytesWritten += usedLen;
 789                 numCharsProcessed += numChars;
 790
 791                 rangeLoc += numChars;
 792                 rangeLen -= numChars;
 793                 if (max) {
 794                     buffer += usedLen;
 795                     max -= usedLen;
 796                     if (max <= 0) break;
 797                 }
 798                 lastUsedLen = usedLen; lastNumChars = numChars;
 799                 flags &= ~kCFStringEncodingPrependBOM;
 800             }
 801         }
 802     }
 803     if (usedBufLen) *usedBufLen = totalBytesWritten;
 804     return numCharsProcessed;
 805 }
 806
 807 CFStringRef CFStringCreateWithFileSystemRepresentation(CFAllocatorRef alloc, const char *buffer) {
 808     return CFStringCreateWithCString(alloc, buffer, CFStringFileSystemEncoding());
 809 }
 810
 811 CFIndex CFStringGetMaximumSizeOfFileSystemRepresentation(CFStringRef string) {
 812     CFIndex len = CFStringGetLength(string);
 813     CFStringEncoding enc = CFStringGetFastestEncoding(string);
 814     switch (enc) {
 815         case kCFStringEncodingASCII:
 816         case kCFStringEncodingMacRoman:
 817             return len * 3 + 1;
 818         default:
 819             return len * 9 + 1;
 820     }
 821 }
 822
 823 Boolean CFStringGetFileSystemRepresentation(CFStringRef string, char *buffer, CFIndex maxBufLen) {
 824 #if defined(__MACH__)
 825 #define MAX_STACK_BUFFER_LEN    (255)
 826     const UTF16Char *characters = CFStringGetCharactersPtr(string);
 827     uint32_t usedBufLen;
 828
 829     if (NULL == characters) {
 830         CFIndex length = CFStringGetLength(string);
 831
 832         if (length > MAX_STACK_BUFFER_LEN) {
 833             UTF16Char charactersBuffer[MAX_STACK_BUFFER_LEN];
 834             CFRange range = CFRangeMake(0, MAX_STACK_BUFFER_LEN);
 835             uint32_t localUsedBufLen;
 836
 837             usedBufLen = 0;
 838
 839             while (length > 0) {
 840                 CFStringGetCharacters(string, range, charactersBuffer);
 841                 if (CFUniCharIsSurrogateHighCharacter(charactersBuffer[range.length - 1])) --range.length; // Backup for a high surrogate
 842
 843                 if (!CFUniCharDecompose(charactersBuffer, range.length, NULL, (void *)buffer, maxBufLen - usedBufLen, &localUsedBufLen, true, kCFUniCharUTF8Format, true)) return false;
 844                 buffer += localUsedBufLen;
 845                 usedBufLen += localUsedBufLen;
 846
 847                 length -= range.length;
 848                 range.location += range.length;
 849                 range.length = (length < MAX_STACK_BUFFER_LEN ? length : MAX_STACK_BUFFER_LEN);
 850             }
 851         } else {
 852             UTF16Char charactersBuffer[MAX_STACK_BUFFER_LEN];
 853
 854             CFStringGetCharacters(string, CFRangeMake(0, length), charactersBuffer);
 855             if (!CFUniCharDecompose(charactersBuffer, length, NULL, (void *)buffer, maxBufLen, &usedBufLen, true, kCFUniCharUTF8Format, true)) return false;
 856             buffer += usedBufLen;
 857         }
 858     } else {
 859         if (!CFUniCharDecompose(characters, CFStringGetLength(string), NULL, (void *)buffer, maxBufLen, &usedBufLen, true, kCFUniCharUTF8Format, true)) return false;
 860         buffer += usedBufLen;
 861     }
 862
 863     if (usedBufLen < (uint32_t)maxBufLen) { // Since the filename has its own limit, this is ok for now
 864         *buffer = '\0';
 865         return true;
 866     } else {
 867         return false;
 868     }
 869 #else __MACH__
 870     return CFStringGetCString(string, buffer, maxBufLen, CFStringFileSystemEncoding());
 871 #endif __MACH__
 872 }
 873
 874 Boolean _CFStringGetFileSystemRepresentation(CFStringRef string, uint8_t *buffer, CFIndex maxBufLen) {
 875     return CFStringGetFileSystemRepresentation(string, buffer, maxBufLen);
 876 }
 877