CFStringEncodingConverter.c

   1 /*
   2  * Copyright (c) 2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 /*      CFStringEncodingConverter.c
  24         Copyright 1998-2002, Apple, Inc. All rights reserved.
  25         Responsibility: Aki Inoue
  26 */
  27
  28 #include "CFInternal.h"
  29 #include <CoreFoundation/CFArray.h>
  30 #include <CoreFoundation/CFDictionary.h>
  31 #include "CFUniChar.h"
  32 #include "CFPriv.h"
  33 #include "CFUnicodeDecomposition.h"
  34 #include "CFStringEncodingConverterExt.h"
  35 #include "CFStringEncodingConverterPriv.h"
  36 #include <stdlib.h>
  37 #if !defined(__WIN32__)
  38 #include <pthread.h>
  39 #endif
  40
  41
  42 /* Macros
  43 */
  44 #define TO_BYTE(conv,flags,chars,numChars,bytes,max,used) (conv->_toBytes ? conv->toBytes(conv,flags,chars,numChars,bytes,max,used) : ((CFStringEncodingToBytesProc)conv->toBytes)(flags,chars,numChars,bytes,max,used))
  45 #define TO_UNICODE(conv,flags,bytes,numBytes,chars,max,used) (conv->_toUnicode ?  (flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical) ? conv->toCanonicalUnicode(conv,flags,bytes,numBytes,chars,max,used) : conv->toUnicode(conv,flags,bytes,numBytes,chars,max,used)) : ((CFStringEncodingToUnicodeProc)conv->toUnicode)(flags,bytes,numBytes,chars,max,used))
  46
  47 #define ASCIINewLine 0x0a
  48 #define kSurrogateHighStart 0xD800
  49 #define kSurrogateHighEnd 0xDBFF
  50 #define kSurrogateLowStart 0xDC00
  51 #define kSurrogateLowEnd 0xDFFF
  52
  53 /* Mapping 128..255 to lossy ASCII
  54 */
  55 static const struct {
  56     unsigned char chars[4];
  57 } _toLossyASCIITable[] = {
  58     {{' ', 0, 0, 0}}, // NO-BREAK SPACE
  59     {{'!', 0, 0, 0}}, // INVERTED EXCLAMATION MARK
  60     {{'c', 0, 0, 0}}, // CENT SIGN
  61     {{'L', 0, 0, 0}}, // POUND SIGN
  62     {{'$', 0, 0, 0}}, // CURRENCY SIGN
  63     {{'Y', 0, 0, 0}}, // YEN SIGN
  64     {{'|', 0, 0, 0}}, // BROKEN BAR
  65     {{0, 0, 0, 0}}, // SECTION SIGN
  66     {{0, 0, 0, 0}}, // DIAERESIS
  67     {{'(', 'C', ')', 0}}, // COPYRIGHT SIGN
  68     {{'a', 0, 0, 0}}, // FEMININE ORDINAL INDICATOR
  69     {{'<', '<', 0, 0}}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
  70     {{0, 0, 0, 0}}, // NOT SIGN
  71     {{'-', 0, 0, 0}}, // SOFT HYPHEN
  72     {{'(', 'R', ')', 0}}, // REGISTERED SIGN
  73     {{0, 0, 0, 0}}, // MACRON
  74     {{0, 0, 0, 0}}, // DEGREE SIGN
  75     {{'+', '-', 0, 0}}, // PLUS-MINUS SIGN
  76     {{'2', 0, 0, 0}}, // SUPERSCRIPT TWO
  77     {{'3', 0, 0, 0}}, // SUPERSCRIPT THREE
  78     {{0, 0, 0, 0}}, // ACUTE ACCENT
  79     {{0, 0, 0, 0}}, // MICRO SIGN
  80     {{0, 0, 0, 0}}, // PILCROW SIGN
  81     {{0, 0, 0, 0}}, // MIDDLE DOT
  82     {{0, 0, 0, 0}}, // CEDILLA
  83     {{'1', 0, 0, 0}}, // SUPERSCRIPT ONE
  84     {{'o', 0, 0, 0}}, // MASCULINE ORDINAL INDICATOR
  85     {{'>', '>', 0, 0}}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
  86     {{'1', '/', '4', 0}}, // VULGAR FRACTION ONE QUARTER
  87     {{'1', '/', '2', 0}}, // VULGAR FRACTION ONE HALF
  88     {{'3', '/', '4', 0}}, // VULGAR FRACTION THREE QUARTERS
  89     {{'?', 0, 0, 0}}, // INVERTED QUESTION MARK
  90     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH GRAVE
  91     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH ACUTE
  92     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
  93     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH TILDE
  94     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH DIAERESIS
  95     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH RING ABOVE
  96     {{'A', 'E', 0, 0}}, // LATIN CAPITAL LETTER AE
  97     {{'C', 0, 0, 0}}, // LATIN CAPITAL LETTER C WITH CEDILLA
  98     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH GRAVE
  99     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH ACUTE
 100     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
 101     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH DIAERESIS
 102     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH GRAVE
 103     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH ACUTE
 104     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
 105     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH DIAERESIS
 106     {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER ETH (Icelandic)
 107     {{'N', 0, 0, 0}}, // LATIN CAPITAL LETTER N WITH TILDE
 108     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH GRAVE
 109     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH ACUTE
 110     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
 111     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH TILDE
 112     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH DIAERESIS
 113     {{'X', 0, 0, 0}}, // MULTIPLICATION SIGN
 114     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH STROKE
 115     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH GRAVE
 116     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH ACUTE
 117     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
 118     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH DIAERESIS
 119     {{'Y', 0, 0, 0}}, // LATIN CAPITAL LETTER Y WITH ACUTE
 120     {{'t', 'h', 0, 0}}, // LATIN CAPITAL LETTER THORN (Icelandic)
 121     {{'s', 0, 0, 0}}, // LATIN SMALL LETTER SHARP S (German)
 122     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH GRAVE
 123     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH ACUTE
 124     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH CIRCUMFLEX
 125     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH TILDE
 126     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH DIAERESIS
 127     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH RING ABOVE
 128     {{'a', 'e', 0, 0}}, // LATIN SMALL LETTER AE
 129     {{'c', 0, 0, 0}}, // LATIN SMALL LETTER C WITH CEDILLA
 130     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH GRAVE
 131     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH ACUTE
 132     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH CIRCUMFLEX
 133     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH DIAERESIS
 134     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH GRAVE
 135     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH ACUTE
 136     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH CIRCUMFLEX
 137     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH DIAERESIS
 138     {{'T', 'H', 0, 0}}, // LATIN SMALL LETTER ETH (Icelandic)
 139     {{'n', 0, 0, 0}}, // LATIN SMALL LETTER N WITH TILDE
 140     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH GRAVE
 141     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH ACUTE
 142     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH CIRCUMFLEX
 143     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH TILDE
 144     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH DIAERESIS
 145     {{'/', 0, 0, 0}}, // DIVISION SIGN
 146     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH STROKE
 147     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH GRAVE
 148     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH ACUTE
 149     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH CIRCUMFLEX
 150     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH DIAERESIS
 151     {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH ACUTE
 152     {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER THORN (Icelandic)
 153     {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH DIAERESIS
 154 };
 155
 156 CF_INLINE CFIndex __CFToASCIILatin1Fallback(UniChar character, uint8_t *bytes, CFIndex maxByteLen) {
 157     const uint8_t *losChars = (const uint8_t*)_toLossyASCIITable + (character - 0xA0) * sizeof(uint8_t[4]);
 158     CFIndex numBytes = 0;
 159     CFIndex idx, max = (maxByteLen && (maxByteLen < 4) ? maxByteLen : 4);
 160
 161     for (idx = 0;idx < max;idx++) {
 162         if (losChars[idx]) {
 163             if (maxByteLen) bytes[idx] = losChars[idx];
 164             ++numBytes;
 165         } else {
 166             break;
 167         }
 168     }
 169
 170     return numBytes;
 171 }
 172
 173 static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 174     CFIndex processCharLen = 1, filledBytesLen = 1;
 175     uint8_t byte = '?';
 176
 177     if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range
 178         byte = (uint8_t)(*characters - 0x80);
 179     } else if (*characters < 0x100) {
 180         *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen);
 181         return 1;
 182     } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) {
 183         processCharLen = (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1);
 184     } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) {
 185         byte = ' ';
 186     } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) {
 187         byte = ASCIINewLine;
 188     } else if (*characters == 0x2026) { // ellipsis
 189         if (0 == maxByteLen) {
 190             filledBytesLen = 3;
 191         } else if (maxByteLen > 2) {
 192             memset(bytes, '.', 3);
 193             *usedByteLen = 3;
 194             return processCharLen;
 195         }
 196     } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) {
 197         UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
 198
 199         (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH);
 200         if (*decomposed < 0x80) {
 201             byte = (uint8_t)(*decomposed);
 202         } else {
 203             UTF16Char theChar = *decomposed;
 204
 205             return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen);
 206         }
 207     }
 208
 209     if (maxByteLen) *bytes = byte;
 210     *usedByteLen = filledBytesLen;
 211     return processCharLen;
 212 }
 213
 214 static CFIndex __CFDefaultToUnicodeFallbackProc(const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 215     if (maxCharLen) *characters = (UniChar)'?';
 216     *usedCharLen = 1;
 217     return 1;
 218 }
 219
 220 #define TO_BYTE_FALLBACK(conv,chars,numChars,bytes,max,used) (conv->toBytesFallback(chars,numChars,bytes,max,used))
 221 #define TO_UNICODE_FALLBACK(conv,bytes,numBytes,chars,max,used) (conv->toUnicodeFallback(bytes,numBytes,chars,max,used))
 222
 223 #define EXTRA_BASE (0x0F00)
 224
 225 /* Wrapper funcs for non-standard converters
 226 */
 227 static CFIndex __CFToBytesCheapEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 228     CFIndex processedCharLen = 0;
 229     CFIndex length = (maxByteLen && (maxByteLen < numChars) ? maxByteLen : numChars);
 230     uint8_t byte;
 231
 232     while (processedCharLen < length) {
 233         if (!((CFStringEncodingCheapEightBitToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters[processedCharLen], &byte)) break;
 234
 235         if (maxByteLen) bytes[processedCharLen] = byte;
 236         processedCharLen++;
 237     }
 238
 239     *usedByteLen = processedCharLen;
 240     return processedCharLen;
 241 }
 242
 243 static CFIndex __CFToUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 244     CFIndex processedByteLen = 0;
 245     CFIndex length = (maxCharLen && (maxCharLen < numBytes) ? maxCharLen : numBytes);
 246     UniChar character;
 247
 248     while (processedByteLen < length) {
 249         if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], &character)) break;
 250
 251         if (maxCharLen) characters[processedByteLen] = character;
 252         processedByteLen++;
 253     }
 254
 255     *usedCharLen = processedByteLen;
 256     return processedByteLen;
 257 }
 258
 259 static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 260     CFIndex processedByteLen = 0;
 261     CFIndex theUsedCharLen = 0;
 262     UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
 263     CFIndex usedLen;
 264     UniChar character;
 265     bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
 266
 267     while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 268         if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], &character)) break;
 269
 270         if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
 271             CFIndex idx;
 272
 273             usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
 274             *usedCharLen = theUsedCharLen;
 275
 276             for (idx = 0;idx < usedLen;idx++) {
 277                 if (charBuffer[idx] > 0xFFFF) { // Non-BMP
 278                     if (theUsedCharLen + 2 > maxCharLen)  return processedByteLen;
 279                     theUsedCharLen += 2;
 280                     if (maxCharLen) {
 281                         charBuffer[idx] = charBuffer[idx] - 0x10000;
 282                         *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
 283                         *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
 284                     }
 285                 } else {
 286                     if (theUsedCharLen + 1 > maxCharLen)  return processedByteLen;
 287                     ++theUsedCharLen;
 288                     *(characters++) = charBuffer[idx];
 289                 }
 290             }
 291         } else {
 292             if (maxCharLen) *(characters++) = character;
 293             ++theUsedCharLen;
 294         }
 295         processedByteLen++;
 296     }
 297
 298     *usedCharLen = theUsedCharLen;
 299     return processedByteLen;
 300 }
 301
 302 static CFIndex __CFToBytesStandardEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 303     CFIndex processedCharLen = 0;
 304     uint8_t byte;
 305     CFIndex usedLen;
 306
 307     *usedByteLen = 0;
 308
 309     while (numChars && (!maxByteLen || (*usedByteLen < maxByteLen))) {
 310         if (!(usedLen = ((CFStringEncodingStandardEightBitToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters, numChars, &byte))) break;
 311
 312         if (maxByteLen) bytes[*usedByteLen] = byte;
 313         (*usedByteLen)++;
 314         characters += usedLen;
 315         numChars -= usedLen;
 316         processedCharLen += usedLen;
 317     }
 318
 319     return processedCharLen;
 320 }
 321
 322 static CFIndex __CFToUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 323     CFIndex processedByteLen = 0;
 324 #if 0 || 0
 325     UniChar charBuffer[20]; // Dynamic stack allocation is GNU specific
 326 #else
 327     UniChar charBuffer[((const _CFEncodingConverter*)converter)->maxLen];
 328 #endif
 329     CFIndex usedLen;
 330
 331     *usedCharLen = 0;
 332
 333     while ((processedByteLen < numBytes) && (!maxCharLen || (*usedCharLen < maxCharLen))) {
 334         if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], charBuffer))) break;
 335
 336         if (maxCharLen) {
 337             CFIndex idx;
 338
 339             if (*usedCharLen + usedLen > maxCharLen) break;
 340
 341             for (idx = 0;idx < usedLen;idx++) {
 342                 characters[*usedCharLen + idx] = charBuffer[idx];
 343             }
 344         }
 345         *usedCharLen += usedLen;
 346         processedByteLen++;
 347     }
 348
 349     return processedByteLen;
 350 }
 351
 352 static CFIndex __CFToCanonicalUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 353     CFIndex processedByteLen = 0;
 354 #if 0 || 0
 355     UniChar charBuffer[20]; // Dynamic stack allocation is GNU specific
 356 #else
 357     UniChar charBuffer[((const _CFEncodingConverter*)converter)->maxLen];
 358 #endif
 359     UTF32Char decompBuffer[MAX_DECOMPOSED_LENGTH];
 360     CFIndex usedLen;
 361     CFIndex decompedLen;
 362     CFIndex idx, decompIndex;
 363     bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
 364     CFIndex theUsedCharLen = 0;
 365
 366     while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 367         if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], charBuffer))) break;
 368
 369         for (idx = 0;idx < usedLen;idx++) {
 370             if (CFUniCharIsDecomposableCharacter(charBuffer[idx], isHFSPlus)) {
 371                 decompedLen = CFUniCharDecomposeCharacter(charBuffer[idx], decompBuffer, MAX_DECOMPOSED_LENGTH);
 372                 *usedCharLen = theUsedCharLen;
 373
 374                 for (decompIndex = 0;decompIndex < decompedLen;decompIndex++) {
 375                     if (decompBuffer[decompIndex] > 0xFFFF) { // Non-BMP
 376                         if (theUsedCharLen + 2 > maxCharLen)  return processedByteLen;
 377                         theUsedCharLen += 2;
 378                         if (maxCharLen) {
 379                             charBuffer[idx] = charBuffer[idx] - 0x10000;
 380                             *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL;
 381                             *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL;
 382                         }
 383                     } else {
 384                         if (theUsedCharLen + 1 > maxCharLen)  return processedByteLen;
 385                         ++theUsedCharLen;
 386                         *(characters++) = charBuffer[idx];
 387                     }
 388                 }
 389             } else {
 390                 if (maxCharLen) *(characters++) = charBuffer[idx];
 391                 ++theUsedCharLen;
 392             }
 393         }
 394         processedByteLen++;
 395     }
 396
 397     *usedCharLen = theUsedCharLen;
 398     return processedByteLen;
 399 }
 400
 401 static CFIndex __CFToBytesCheapMultiByteWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 402     CFIndex processedCharLen = 0;
 403 #if 0 || 0
 404     uint8_t byteBuffer[20]; // Dynamic stack allocation is GNU specific
 405 #else
 406     uint8_t byteBuffer[((const _CFEncodingConverter*)converter)->maxLen];
 407 #endif
 408     CFIndex usedLen;
 409
 410     *usedByteLen = 0;
 411
 412     while ((processedCharLen < numChars) && (!maxByteLen || (*usedByteLen < maxByteLen))) {
 413         if (!(usedLen = ((CFStringEncodingCheapMultiByteToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters[processedCharLen], byteBuffer))) break;
 414
 415         if (maxByteLen) {
 416             CFIndex idx;
 417
 418             if (*usedByteLen + usedLen > maxByteLen) break;
 419
 420             for (idx = 0;idx <usedLen;idx++) {
 421                 bytes[*usedByteLen + idx] = byteBuffer[idx];
 422             }
 423         }
 424
 425         *usedByteLen += usedLen;
 426         processedCharLen++;
 427     }
 428
 429     return processedCharLen;
 430 }
 431
 432 static CFIndex __CFToUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 433     CFIndex processedByteLen = 0;
 434     UniChar character;
 435     CFIndex usedLen;
 436
 437     *usedCharLen = 0;
 438
 439     while (numBytes && (!maxCharLen || (*usedCharLen < maxCharLen))) {
 440         if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes, numBytes, &character))) break;
 441
 442         if (maxCharLen) *(characters++) = character;
 443         (*usedCharLen)++;
 444         processedByteLen += usedLen;
 445         bytes += usedLen;
 446         numBytes -= usedLen;
 447     }
 448
 449     return processedByteLen;
 450 }
 451
 452 static CFIndex __CFToCanonicalUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 453     CFIndex processedByteLen = 0;
 454     UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
 455     UniChar character;
 456     CFIndex usedLen;
 457     CFIndex decomposedLen;
 458     CFIndex theUsedCharLen = 0;
 459     bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
 460
 461     while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 462         if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes, numBytes, &character))) break;
 463
 464         if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
 465             CFIndex idx;
 466
 467             decomposedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
 468             *usedCharLen = theUsedCharLen;
 469
 470             for (idx = 0;idx < decomposedLen;idx++) {
 471                 if (charBuffer[idx] > 0xFFFF) { // Non-BMP
 472                     if (theUsedCharLen + 2 > maxCharLen)  return processedByteLen;
 473                     theUsedCharLen += 2;
 474                     if (maxCharLen) {
 475                         charBuffer[idx] = charBuffer[idx] - 0x10000;
 476                         *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
 477                         *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
 478                     }
 479                 } else {
 480                     if (theUsedCharLen + 1 > maxCharLen)  return processedByteLen;
 481                     ++theUsedCharLen;
 482                     *(characters++) = charBuffer[idx];
 483                 }
 484             }
 485         } else {
 486             if (maxCharLen) *(characters++) = character;
 487             ++theUsedCharLen;
 488         }
 489
 490         processedByteLen += usedLen;
 491         bytes += usedLen;
 492         numBytes -= usedLen;
 493     }
 494     *usedCharLen = theUsedCharLen;
 495     return processedByteLen;
 496 }
 497
 498 /* static functions
 499 */
 500 static _CFConverterEntry __CFConverterEntryASCII = {
 501     kCFStringEncodingASCII, NULL,
 502     "Western (ASCII)", {"us-ascii", "ascii", "iso-646-us", NULL}, NULL, NULL, NULL, NULL,
 503     kCFStringEncodingMacRoman // We use string encoding's script range here
 504 };
 505
 506 static _CFConverterEntry __CFConverterEntryISOLatin1 = {
 507     kCFStringEncodingISOLatin1, NULL,
 508     "Western (ISO Latin 1)", {"iso-8859-1", "latin1","iso-latin-1", NULL}, NULL, NULL, NULL, NULL,
 509     kCFStringEncodingMacRoman // We use string encoding's script range here
 510 };
 511
 512 static _CFConverterEntry __CFConverterEntryMacRoman = {
 513     kCFStringEncodingMacRoman, NULL,
 514     "Western (Mac OS Roman)", {"macintosh", "mac", "x-mac-roman", NULL}, NULL, NULL, NULL, NULL,
 515     kCFStringEncodingMacRoman // We use string encoding's script range here
 516 };
 517
 518 static _CFConverterEntry __CFConverterEntryWinLatin1 = {
 519     kCFStringEncodingWindowsLatin1, NULL,
 520     "Western (Windows Latin 1)", {"windows-1252", "cp1252", "windows latin1", NULL}, NULL, NULL, NULL, NULL,
 521     kCFStringEncodingMacRoman // We use string encoding's script range here
 522 };
 523
 524 static _CFConverterEntry __CFConverterEntryNextStepLatin = {
 525     kCFStringEncodingNextStepLatin, NULL,
 526     "Western (NextStep)", {"x-nextstep", NULL, NULL, NULL}, NULL, NULL, NULL, NULL,
 527     kCFStringEncodingMacRoman // We use string encoding's script range here
 528 };
 529
 530 static _CFConverterEntry __CFConverterEntryUTF8 = {
 531     kCFStringEncodingUTF8, NULL,
 532     "UTF-8", {"utf-8", "unicode-1-1-utf8", NULL, NULL}, NULL, NULL, NULL, NULL,
 533     kCFStringEncodingUnicode // We use string encoding's script range here
 534 };
 535
 536 CF_INLINE _CFConverterEntry *__CFStringEncodingConverterGetEntry(uint32_t encoding) {
 537     switch (encoding) {
 538         case kCFStringEncodingInvalidId:
 539         case kCFStringEncodingASCII:
 540             return &__CFConverterEntryASCII;
 541
 542         case kCFStringEncodingISOLatin1:
 543             return &__CFConverterEntryISOLatin1;
 544
 545         case kCFStringEncodingMacRoman:
 546             return &__CFConverterEntryMacRoman;
 547
 548         case kCFStringEncodingWindowsLatin1:
 549             return &__CFConverterEntryWinLatin1;
 550
 551         case kCFStringEncodingNextStepLatin:
 552             return &__CFConverterEntryNextStepLatin;
 553
 554         case kCFStringEncodingUTF8:
 555             return &__CFConverterEntryUTF8;
 556
 557         default: {
 558             return NULL;
 559         }
 560     }
 561 }
 562
 563 CF_INLINE _CFEncodingConverter *__CFEncodingConverterFromDefinition(const CFStringEncodingConverter *definition) {
 564 #define NUM_OF_ENTRIES_CYCLE (10)
 565     static CFSpinLock_t _indexLock = CFSpinLockInit;
 566     static uint32_t _currentIndex = 0;
 567     static uint32_t _allocatedSize = 0;
 568     static _CFEncodingConverter *_allocatedEntries = NULL;
 569     _CFEncodingConverter *converter;
 570
 571
 572     __CFSpinLock(&_indexLock);
 573     if ((_currentIndex + 1) >= _allocatedSize) {
 574         _currentIndex = 0;
 575         _allocatedSize = 0;
 576         _allocatedEntries = NULL;
 577     }
 578     if (_allocatedEntries == NULL) { // Not allocated yet
 579         _allocatedEntries = (_CFEncodingConverter *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(_CFEncodingConverter) * NUM_OF_ENTRIES_CYCLE, 0);
 580         _allocatedSize = NUM_OF_ENTRIES_CYCLE;
 581         converter = &(_allocatedEntries[_currentIndex]);
 582     } else {
 583         converter = &(_allocatedEntries[++_currentIndex]);
 584     }
 585     __CFSpinUnlock(&_indexLock);
 586
 587     switch (definition->encodingClass) {
 588         case kCFStringEncodingConverterStandard:
 589             converter->toBytes = (_CFToBytesProc)definition->toBytes;
 590             converter->toUnicode = (_CFToUnicodeProc)definition->toUnicode;
 591             converter->toCanonicalUnicode = (_CFToUnicodeProc)definition->toUnicode;
 592             converter->_toBytes = NULL;
 593             converter->_toUnicode = NULL;
 594             converter->maxLen = 2;
 595             break;
 596
 597         case kCFStringEncodingConverterCheapEightBit:
 598             converter->toBytes = __CFToBytesCheapEightBitWrapper;
 599             converter->toUnicode = __CFToUnicodeCheapEightBitWrapper;
 600             converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapEightBitWrapper;
 601             converter->_toBytes = definition->toBytes;
 602             converter->_toUnicode = definition->toUnicode;
 603             converter->maxLen = 1;
 604             break;
 605
 606         case kCFStringEncodingConverterStandardEightBit:
 607             converter->toBytes = __CFToBytesStandardEightBitWrapper;
 608             converter->toUnicode = __CFToUnicodeStandardEightBitWrapper;
 609             converter->toCanonicalUnicode = __CFToCanonicalUnicodeStandardEightBitWrapper;
 610             converter->_toBytes = definition->toBytes;
 611             converter->_toUnicode = definition->toUnicode;
 612             converter->maxLen = definition->maxDecomposedCharLen;
 613             break;
 614
 615         case kCFStringEncodingConverterCheapMultiByte:
 616             converter->toBytes = __CFToBytesCheapMultiByteWrapper;
 617             converter->toUnicode = __CFToUnicodeCheapMultiByteWrapper;
 618             converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapMultiByteWrapper;
 619             converter->_toBytes = definition->toBytes;
 620             converter->_toUnicode = definition->toUnicode;
 621             converter->maxLen = definition->maxBytesPerChar;
 622             break;
 623
 624         case kCFStringEncodingConverterPlatformSpecific:
 625             converter->toBytes = NULL;
 626             converter->toUnicode = NULL;
 627             converter->toCanonicalUnicode = NULL;
 628             converter->_toBytes = NULL;
 629             converter->_toUnicode = NULL;
 630             converter->maxLen = 0;
 631             converter->toBytesLen = NULL;
 632             converter->toUnicodeLen = NULL;
 633             converter->toBytesFallback = NULL;
 634             converter->toUnicodeFallback = NULL;
 635             converter->toBytesPrecompose = NULL;
 636             converter->isValidCombiningChar = NULL;
 637             return converter;
 638
 639         default: // Shouln't be here
 640             return NULL;
 641     }
 642
 643     converter->toBytesLen = (definition->toBytesLen ? definition->toBytesLen : (CFStringEncodingToBytesLenProc)(uintptr_t)definition->maxBytesPerChar);
 644     converter->toUnicodeLen = (definition->toUnicodeLen ? definition->toUnicodeLen : (CFStringEncodingToUnicodeLenProc)(uintptr_t)definition->maxDecomposedCharLen);
 645     converter->toBytesFallback = (definition->toBytesFallback ? definition->toBytesFallback : __CFDefaultToBytesFallbackProc);
 646     converter->toUnicodeFallback = (definition->toUnicodeFallback ? definition->toUnicodeFallback : __CFDefaultToUnicodeFallbackProc);
 647     converter->toBytesPrecompose = (definition->toBytesPrecompose ? definition->toBytesPrecompose : NULL);
 648     converter->isValidCombiningChar = (definition->isValidCombiningChar ? definition->isValidCombiningChar : NULL);
 649
 650     return converter;
 651 }
 652
 653 CF_INLINE const CFStringEncodingConverter *__CFStringEncodingConverterGetDefinition(_CFConverterEntry *entry) {
 654     if (!entry) return NULL;
 655
 656     switch (entry->encoding) {
 657         case kCFStringEncodingASCII:
 658             return &__CFConverterASCII;
 659
 660         case kCFStringEncodingISOLatin1:
 661             return &__CFConverterISOLatin1;
 662
 663         case kCFStringEncodingMacRoman:
 664             return &__CFConverterMacRoman;
 665
 666         case kCFStringEncodingWindowsLatin1:
 667             return &__CFConverterWinLatin1;
 668
 669         case kCFStringEncodingNextStepLatin:
 670             return &__CFConverterNextStepLatin;
 671
 672         case kCFStringEncodingUTF8:
 673             return &__CFConverterUTF8;
 674
 675         default:
 676             return NULL;
 677     }
 678 }
 679
 680 static const _CFEncodingConverter *__CFGetConverter(uint32_t encoding) {
 681     _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding);
 682
 683     if (!entry) return NULL;
 684
 685     if (!entry->converter) {
 686         const CFStringEncodingConverter *definition = __CFStringEncodingConverterGetDefinition(entry);
 687
 688         if (definition) {
 689             entry->converter = __CFEncodingConverterFromDefinition(definition);
 690             entry->toBytesFallback = definition->toBytesFallback;
 691             entry->toUnicodeFallback = definition->toUnicodeFallback;
 692         }
 693     }
 694
 695     return (_CFEncodingConverter *)entry->converter;
 696 }
 697
 698 /* Public API
 699 */
 700 uint32_t CFStringEncodingUnicodeToBytes(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 701     if (encoding == kCFStringEncodingUTF8) {
 702         static CFStringEncodingToBytesProc __CFToUTF8 = NULL;
 703         CFIndex convertedCharLen;
 704         CFIndex usedLen;
 705
 706
 707         if ((flags & kCFStringEncodingUseCanonical) || (flags & kCFStringEncodingUseHFSPlusCanonical)) {
 708             (void)CFUniCharDecompose(characters, numChars, &convertedCharLen, (void *)bytes, maxByteLen, &usedLen, true, kCFUniCharUTF8Format, (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false));
 709         } else {
 710             if (!__CFToUTF8) {
 711                 const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
 712                 __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes;
 713             }
 714             convertedCharLen = __CFToUTF8(0, characters, numChars, bytes, maxByteLen, &usedLen);
 715         }
 716         if (usedCharLen) *usedCharLen = convertedCharLen;
 717         if (usedByteLen) *usedByteLen = usedLen;
 718
 719         if (convertedCharLen == numChars) {
 720             return kCFStringEncodingConversionSuccess;
 721         } else if (maxByteLen && (maxByteLen == usedLen)) {
 722             return kCFStringEncodingInsufficientOutputBufferLength;
 723         } else {
 724             return kCFStringEncodingInvalidInputStream;
 725         }
 726     } else {
 727         const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 728         CFIndex usedLen = 0;
 729         CFIndex localUsedByteLen;
 730         CFIndex theUsedByteLen = 0;
 731         uint32_t theResult = kCFStringEncodingConversionSuccess;
 732         CFStringEncodingToBytesPrecomposeProc toBytesPrecompose = NULL;
 733         CFStringEncodingIsValidCombiningCharacterProc isValidCombiningChar = NULL;
 734
 735         if (!converter) return kCFStringEncodingConverterUnavailable;
 736
 737         if (flags & kCFStringEncodingSubstituteCombinings) {
 738             if (!(flags & kCFStringEncodingAllowLossyConversion)) isValidCombiningChar = converter->isValidCombiningChar;
 739        } else {
 740             isValidCombiningChar = converter->isValidCombiningChar;
 741             if (!(flags & kCFStringEncodingIgnoreCombinings)) {
 742                 toBytesPrecompose = converter->toBytesPrecompose;
 743                 flags |= kCFStringEncodingComposeCombinings;
 744             }
 745         }
 746
 747
 748         while ((usedLen < numChars) && (!maxByteLen || (theUsedByteLen < maxByteLen))) {
 749             if ((usedLen += TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) < numChars) {
 750                 CFIndex dummy;
 751
 752                 if (isValidCombiningChar && (usedLen > 0) && isValidCombiningChar(characters[usedLen])) {
 753                     if (toBytesPrecompose) {
 754                         CFIndex localUsedLen = usedLen;
 755
 756                         while (isValidCombiningChar(characters[--usedLen]));
 757                         theUsedByteLen += localUsedByteLen;
 758                         if (converter->maxLen > 1) {
 759                             TO_BYTE(converter, flags, characters + usedLen, localUsedLen - usedLen, NULL, 0, &localUsedByteLen);
 760                             theUsedByteLen -= localUsedByteLen;
 761                         } else {
 762                             theUsedByteLen--;
 763                         }
 764                         if ((localUsedLen = toBytesPrecompose(flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) > 0) {
 765                             usedLen += localUsedLen;
 766                             if ((usedLen < numChars) && isValidCombiningChar(characters[usedLen])) { // There is a non-base char not combined remaining
 767                                 theUsedByteLen += localUsedByteLen;
 768                                 theResult = kCFStringEncodingInvalidInputStream;
 769                                 break;
 770                             }
 771                         } else if (flags & kCFStringEncodingAllowLossyConversion) {
 772                             uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
 773
 774                             if (lossyByte) {
 775                                                                 while (isValidCombiningChar(characters[++usedLen]));
 776                                 localUsedByteLen = 1;
 777                                 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
 778                             } else {
 779                                 ++usedLen;
 780                                 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
 781                             }
 782                         } else {
 783                             theResult = kCFStringEncodingInvalidInputStream;
 784                             break;
 785                         }
 786                     } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up
 787                                     theUsedByteLen += localUsedByteLen;
 788                                     theResult = kCFStringEncodingInsufficientOutputBufferLength;
 789                                     break;
 790                     } else if (flags & kCFStringEncodingIgnoreCombinings) {
 791                         while ((++usedLen < numChars) && isValidCombiningChar(characters[usedLen]));
 792                     } else {
 793                         uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
 794
 795                         theUsedByteLen += localUsedByteLen;
 796                         if (lossyByte) {
 797                             ++usedLen;
 798                             localUsedByteLen = 1;
 799                             if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
 800                         } else {
 801                             usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
 802                         }
 803                     }
 804                 } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up
 805                     theUsedByteLen += localUsedByteLen;
 806
 807                     if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) {
 808                         CFIndex localUsedLen;
 809
 810                         localUsedByteLen = 0;
 811                         while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen;
 812                     }
 813                     if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength;
 814                     break;
 815                 } else if (flags & kCFStringEncodingAllowLossyConversion) {
 816                     uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
 817
 818                     theUsedByteLen += localUsedByteLen;
 819                     if (lossyByte) {
 820                         ++usedLen;
 821                         localUsedByteLen = 1;
 822                         if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
 823                     } else {
 824                         usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
 825                     }
 826                 } else {
 827                     theUsedByteLen += localUsedByteLen;
 828                     theResult = kCFStringEncodingInvalidInputStream;
 829                     break;
 830                 }
 831             }
 832             theUsedByteLen += localUsedByteLen;
 833         }
 834
 835         if (usedLen < numChars && maxByteLen && theResult == kCFStringEncodingConversionSuccess) {
 836             if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) {
 837                 CFIndex localUsedLen;
 838
 839                 localUsedByteLen = 0;
 840                 while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen;
 841             }
 842             if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength;
 843         }
 844         if (usedByteLen) *usedByteLen = theUsedByteLen;
 845         if (usedCharLen) *usedCharLen = usedLen;
 846
 847         return theResult;
 848     }
 849 }
 850
 851 uint32_t CFStringEncodingBytesToUnicode(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 852     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 853     CFIndex usedLen = 0;
 854     CFIndex theUsedCharLen = 0;
 855     CFIndex localUsedCharLen;
 856     uint32_t theResult = kCFStringEncodingConversionSuccess;
 857
 858     if (!converter) return kCFStringEncodingConverterUnavailable;
 859
 860
 861     while ((usedLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 862         if ((usedLen += TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen)) < numBytes) {
 863             CFIndex tempUsedCharLen;
 864
 865             if (maxCharLen && ((maxCharLen == theUsedCharLen + localUsedCharLen) || (((flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) || (maxCharLen == theUsedCharLen + localUsedCharLen + 1)) && TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, NULL, 0, &tempUsedCharLen)))) { // buffer was filled up
 866                 theUsedCharLen += localUsedCharLen;
 867                 theResult = kCFStringEncodingInsufficientOutputBufferLength;
 868                 break;
 869             } else if (flags & kCFStringEncodingAllowLossyConversion) {
 870                 theUsedCharLen += localUsedCharLen;
 871                 usedLen += TO_UNICODE_FALLBACK(converter, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen);
 872             } else {
 873                 theUsedCharLen += localUsedCharLen;
 874                 theResult = kCFStringEncodingInvalidInputStream;
 875                 break;
 876             }
 877         }
 878         theUsedCharLen += localUsedCharLen;
 879     }
 880
 881     if (usedLen < numBytes && maxCharLen && theResult == kCFStringEncodingConversionSuccess) {
 882         theResult = kCFStringEncodingInsufficientOutputBufferLength;
 883     }
 884     if (usedCharLen) *usedCharLen = theUsedCharLen;
 885     if (usedByteLen) *usedByteLen = usedLen;
 886
 887     return theResult;
 888 }
 889
 890 __private_extern__ bool CFStringEncodingIsValidEncoding(uint32_t encoding) {
 891     return (CFStringEncodingGetConverter(encoding) ? true : false);
 892 }
 893
 894 __private_extern__ const char *CFStringEncodingName(uint32_t encoding) {
 895     _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding);
 896     if (entry) return entry->encodingName;
 897     return NULL;
 898 }
 899
 900 __private_extern__ const char **CFStringEncodingCanonicalCharsetNames(uint32_t encoding) {
 901     _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding);
 902     if (entry) return entry->ianaNames;
 903     return NULL;
 904 }
 905
 906 __private_extern__ uint32_t CFStringEncodingGetScriptCodeForEncoding(CFStringEncoding encoding) {
 907     _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding);
 908
 909     return (entry ? entry->scriptCode : ((encoding & 0x0FFF) == kCFStringEncodingUnicode ? kCFStringEncodingUnicode : (encoding < 0xFF ? encoding : kCFStringEncodingInvalidId)));
 910 }
 911
 912 __private_extern__ CFIndex CFStringEncodingCharLengthForBytes(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
 913     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 914
 915     if (converter) {
 916         uintptr_t switchVal = (uintptr_t)(converter->toUnicodeLen);
 917
 918         if (switchVal < 0xFFFF) {
 919             return switchVal * numBytes;
 920         } else {
 921             return converter->toUnicodeLen(flags, bytes, numBytes);
 922         }
 923     }
 924
 925     return 0;
 926 }
 927
 928 __private_extern__ CFIndex CFStringEncodingByteLengthForCharacters(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars) {
 929     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 930
 931     if (converter) {
 932         uintptr_t switchVal = (uintptr_t)(converter->toBytesLen);
 933
 934         if (switchVal < 0xFFFF) {
 935             return switchVal * numChars;
 936         } else {
 937             return converter->toBytesLen(flags, characters, numChars);
 938         }
 939     }
 940
 941     return 0;
 942 }
 943
 944 __private_extern__ void CFStringEncodingRegisterFallbackProcedures(uint32_t encoding, CFStringEncodingToBytesFallbackProc toBytes, CFStringEncodingToUnicodeFallbackProc toUnicode) {
 945     _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding);
 946
 947     if (entry && __CFGetConverter(encoding)) {
 948         ((_CFEncodingConverter*)entry->converter)->toBytesFallback = (toBytes ? toBytes : entry->toBytesFallback);
 949         ((_CFEncodingConverter*)entry->converter)->toUnicodeFallback = (toUnicode ? toUnicode : entry->toUnicodeFallback);
 950     }
 951 }
 952
 953 __private_extern__ const CFStringEncodingConverter *CFStringEncodingGetConverter(uint32_t encoding) {
 954     return __CFStringEncodingConverterGetDefinition(__CFStringEncodingConverterGetEntry(encoding));
 955 }
 956
/* Table handed out by CFStringEncodingListOfAvailableEncodings().
   The last element is kCFStringEncodingInvalidId; presumably it acts as the
   terminator, since the accessor returns only the pointer and no length
   (confirm against callers).
*/
static const uint32_t __CFBuiltinEncodings[] = {
    kCFStringEncodingMacRoman,
    kCFStringEncodingWindowsLatin1,
    kCFStringEncodingISOLatin1,
    kCFStringEncodingNextStepLatin,
    kCFStringEncodingASCII,
    kCFStringEncodingUTF8,
    /* The following seven are available only at the CFString level */
    kCFStringEncodingNonLossyASCII,

    kCFStringEncodingUTF16,
    kCFStringEncodingUTF16BE,
    kCFStringEncodingUTF16LE,

    kCFStringEncodingUTF32,
    kCFStringEncodingUTF32BE,
    kCFStringEncodingUTF32LE,

    kCFStringEncodingInvalidId,
};
 977
 978
 979 __private_extern__ const uint32_t *CFStringEncodingListOfAvailableEncodings(void) {
 980     return __CFBuiltinEncodings;
 981 }
 982
 983
/* Remove the helper macro names used by this file. TO_BYTE and TO_UNICODE are
   defined near the top of the file; the remaining names are presumably
   defined earlier in the file as well (not all are visible here).
*/
#undef TO_BYTE
#undef TO_UNICODE
#undef ASCIINewLine
#undef kSurrogateHighStart
#undef kSurrogateHighEnd
#undef kSurrogateLowStart
#undef kSurrogateLowEnd
#undef TO_BYTE_FALLBACK
#undef TO_UNICODE_FALLBACK
#undef EXTRA_BASE
#undef NUM_OF_ENTRIES_CYCLE
 995