CFStringEncodingConverter.c

   1 /*
   2  * Copyright (c) 2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 /*      CFStringEncodingConverter.c
  24         Copyright (c) 1998-2009, Apple Inc. All rights reserved.
  25         Responsibility: Aki Inoue
  26 */
  27
  28 #include "CFInternal.h"
  29 #include <CoreFoundation/CFArray.h>
  30 #include <CoreFoundation/CFDictionary.h>
  31 #include "CFICUConverters.h"
  32 #include <CoreFoundation/CFUniChar.h>
  33 #include <CoreFoundation/CFPriv.h>
  34 #include "CFUnicodeDecomposition.h"
  35 #include "CFStringEncodingConverterExt.h"
  36 #include "CFStringEncodingConverterPriv.h"
  37 #include <stdlib.h>
  38 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
  39 #include <pthread.h>
  40 #endif
  41
  42 typedef CFIndex (*_CFToBytesProc)(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen);
  43 typedef CFIndex (*_CFToUnicodeProc)(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen);
  44
  45 typedef struct {
  46     const CFStringEncodingConverter *definition;
  47     _CFToBytesProc toBytes;
  48     _CFToUnicodeProc toUnicode;
  49     _CFToUnicodeProc toCanonicalUnicode;
  50     CFStringEncodingToBytesFallbackProc toBytesFallback;
  51     CFStringEncodingToUnicodeFallbackProc toUnicodeFallback;
  52 } _CFEncodingConverter;
  53
  54 /* Macros
  55 */
  56 #define TO_BYTE(conv,flags,chars,numChars,bytes,max,used) (conv->toBytes ? conv->toBytes(conv,flags,chars,numChars,bytes,max,used) : ((CFStringEncodingToBytesProc)conv->definition->toBytes)(flags,chars,numChars,bytes,max,used))
  57 #define TO_UNICODE(conv,flags,bytes,numBytes,chars,max,used) (conv->toUnicode ?  (flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical) ? conv->toCanonicalUnicode(conv,flags,bytes,numBytes,chars,max,used) : conv->toUnicode(conv,flags,bytes,numBytes,chars,max,used)) : ((CFStringEncodingToUnicodeProc)conv->definition->toUnicode)(flags,bytes,numBytes,chars,max,used))
  58
  59 #define ASCIINewLine 0x0a
  60 #define kSurrogateHighStart 0xD800
  61 #define kSurrogateHighEnd 0xDBFF
  62 #define kSurrogateLowStart 0xDC00
  63 #define kSurrogateLowEnd 0xDFFF
  64
  65 static const uint8_t __CFMaximumConvertedLength = 20;
  66
  67 /* Mapping 128..255 to lossy ASCII
  68 */
  69 static const struct {
  70     unsigned char chars[4];
  71 } _toLossyASCIITable[] = {
  72     {{' ', 0, 0, 0}}, // NO-BREAK SPACE
  73     {{'!', 0, 0, 0}}, // INVERTED EXCLAMATION MARK
  74     {{'c', 0, 0, 0}}, // CENT SIGN
  75     {{'L', 0, 0, 0}}, // POUND SIGN
  76     {{'$', 0, 0, 0}}, // CURRENCY SIGN
  77     {{'Y', 0, 0, 0}}, // YEN SIGN
  78     {{'|', 0, 0, 0}}, // BROKEN BAR
  79     {{0, 0, 0, 0}}, // SECTION SIGN
  80     {{0, 0, 0, 0}}, // DIAERESIS
  81     {{'(', 'C', ')', 0}}, // COPYRIGHT SIGN
  82     {{'a', 0, 0, 0}}, // FEMININE ORDINAL INDICATOR
  83     {{'<', '<', 0, 0}}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
  84     {{0, 0, 0, 0}}, // NOT SIGN
  85     {{'-', 0, 0, 0}}, // SOFT HYPHEN
  86     {{'(', 'R', ')', 0}}, // REGISTERED SIGN
  87     {{0, 0, 0, 0}}, // MACRON
  88     {{0, 0, 0, 0}}, // DEGREE SIGN
  89     {{'+', '-', 0, 0}}, // PLUS-MINUS SIGN
  90     {{'2', 0, 0, 0}}, // SUPERSCRIPT TWO
  91     {{'3', 0, 0, 0}}, // SUPERSCRIPT THREE
  92     {{0, 0, 0, 0}}, // ACUTE ACCENT
  93     {{0, 0, 0, 0}}, // MICRO SIGN
  94     {{0, 0, 0, 0}}, // PILCROW SIGN
  95     {{0, 0, 0, 0}}, // MIDDLE DOT
  96     {{0, 0, 0, 0}}, // CEDILLA
  97     {{'1', 0, 0, 0}}, // SUPERSCRIPT ONE
  98     {{'o', 0, 0, 0}}, // MASCULINE ORDINAL INDICATOR
  99     {{'>', '>', 0, 0}}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
 100     {{'1', '/', '4', 0}}, // VULGAR FRACTION ONE QUARTER
 101     {{'1', '/', '2', 0}}, // VULGAR FRACTION ONE HALF
 102     {{'3', '/', '4', 0}}, // VULGAR FRACTION THREE QUARTERS
 103     {{'?', 0, 0, 0}}, // INVERTED QUESTION MARK
 104     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH GRAVE
 105     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH ACUTE
 106     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
 107     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH TILDE
 108     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH DIAERESIS
 109     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH RING ABOVE
 110     {{'A', 'E', 0, 0}}, // LATIN CAPITAL LETTER AE
 111     {{'C', 0, 0, 0}}, // LATIN CAPITAL LETTER C WITH CEDILLA
 112     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH GRAVE
 113     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH ACUTE
 114     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
 115     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH DIAERESIS
 116     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH GRAVE
 117     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH ACUTE
 118     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
 119     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH DIAERESIS
 120     {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER ETH (Icelandic)
 121     {{'N', 0, 0, 0}}, // LATIN CAPITAL LETTER N WITH TILDE
 122     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH GRAVE
 123     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH ACUTE
 124     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
 125     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH TILDE
 126     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH DIAERESIS
 127     {{'X', 0, 0, 0}}, // MULTIPLICATION SIGN
 128     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH STROKE
 129     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH GRAVE
 130     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH ACUTE
 131     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
 132     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH DIAERESIS
 133     {{'Y', 0, 0, 0}}, // LATIN CAPITAL LETTER Y WITH ACUTE
 134     {{'t', 'h', 0, 0}}, // LATIN CAPITAL LETTER THORN (Icelandic)
 135     {{'s', 0, 0, 0}}, // LATIN SMALL LETTER SHARP S (German)
 136     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH GRAVE
 137     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH ACUTE
 138     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH CIRCUMFLEX
 139     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH TILDE
 140     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH DIAERESIS
 141     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH RING ABOVE
 142     {{'a', 'e', 0, 0}}, // LATIN SMALL LETTER AE
 143     {{'c', 0, 0, 0}}, // LATIN SMALL LETTER C WITH CEDILLA
 144     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH GRAVE
 145     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH ACUTE
 146     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH CIRCUMFLEX
 147     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH DIAERESIS
 148     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH GRAVE
 149     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH ACUTE
 150     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH CIRCUMFLEX
 151     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH DIAERESIS
 152     {{'T', 'H', 0, 0}}, // LATIN SMALL LETTER ETH (Icelandic)
 153     {{'n', 0, 0, 0}}, // LATIN SMALL LETTER N WITH TILDE
 154     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH GRAVE
 155     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH ACUTE
 156     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH CIRCUMFLEX
 157     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH TILDE
 158     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH DIAERESIS
 159     {{'/', 0, 0, 0}}, // DIVISION SIGN
 160     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH STROKE
 161     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH GRAVE
 162     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH ACUTE
 163     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH CIRCUMFLEX
 164     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH DIAERESIS
 165     {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH ACUTE
 166     {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER THORN (Icelandic)
 167     {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH DIAERESIS
 168 };
 169
 170 CF_INLINE CFIndex __CFToASCIILatin1Fallback(UniChar character, uint8_t *bytes, CFIndex maxByteLen) {
 171     const uint8_t *losChars = (const uint8_t*)_toLossyASCIITable + (character - 0xA0) * sizeof(uint8_t[4]);
 172     CFIndex numBytes = 0;
 173     CFIndex idx, max = (maxByteLen && (maxByteLen < 4) ? maxByteLen : 4);
 174
 175     for (idx = 0;idx < max;idx++) {
 176         if (losChars[idx]) {
 177             if (maxByteLen) bytes[idx] = losChars[idx];
 178             ++numBytes;
 179         } else {
 180             break;
 181         }
 182     }
 183
 184     return numBytes;
 185 }
 186
 187 static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 188     CFIndex processCharLen = 1, filledBytesLen = 1;
 189     uint8_t byte = '?';
 190
 191     if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range
 192         byte = (uint8_t)(*characters - 0x80);
 193     } else if (*characters < 0x100) {
 194         *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen);
 195         return 1;
 196     } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) {
 197         processCharLen = (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1);
 198     } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) {
 199         byte = ' ';
 200     } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) {
 201         byte = ASCIINewLine;
 202     } else if (*characters == 0x2026) { // ellipsis
 203         if (0 == maxByteLen) {
 204             filledBytesLen = 3;
 205         } else if (maxByteLen > 2) {
 206             memset(bytes, '.', 3);
 207             *usedByteLen = 3;
 208             return processCharLen;
 209         }
 210     } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) {
 211         UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
 212
 213         (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH);
 214         if (*decomposed < 0x80) {
 215             byte = (uint8_t)(*decomposed);
 216         } else {
 217             UTF16Char theChar = *decomposed;
 218
 219             return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen);
 220         }
 221     }
 222
 223     if (maxByteLen) *bytes = byte;
 224     *usedByteLen = filledBytesLen;
 225     return processCharLen;
 226 }
 227
 228 static CFIndex __CFDefaultToUnicodeFallbackProc(const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 229     if (maxCharLen) *characters = (UniChar)'?';
 230     *usedCharLen = 1;
 231     return 1;
 232 }
 233
 234 #define TO_BYTE_FALLBACK(conv,chars,numChars,bytes,max,used) (conv->toBytesFallback(chars,numChars,bytes,max,used))
 235 #define TO_UNICODE_FALLBACK(conv,bytes,numBytes,chars,max,used) (conv->toUnicodeFallback(bytes,numBytes,chars,max,used))
 236
 237 #define EXTRA_BASE (0x0F00)
 238
 239 /* Wrapper funcs for non-standard converters
 240 */
 241 static CFIndex __CFToBytesCheapEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 242     CFIndex processedCharLen = 0;
 243     CFIndex length = (maxByteLen && (maxByteLen < numChars) ? maxByteLen : numChars);
 244     uint8_t byte;
 245
 246     while (processedCharLen < length) {
 247         if (!((CFStringEncodingCheapEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], &byte)) break;
 248
 249         if (maxByteLen) bytes[processedCharLen] = byte;
 250         processedCharLen++;
 251     }
 252
 253     *usedByteLen = processedCharLen;
 254     return processedCharLen;
 255 }
 256
 257 static CFIndex __CFToUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 258     CFIndex processedByteLen = 0;
 259     CFIndex length = (maxCharLen && (maxCharLen < numBytes) ? maxCharLen : numBytes);
 260     UniChar character;
 261
 262     while (processedByteLen < length) {
 263         if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break;
 264
 265         if (maxCharLen) characters[processedByteLen] = character;
 266         processedByteLen++;
 267     }
 268
 269     *usedCharLen = processedByteLen;
 270     return processedByteLen;
 271 }
 272
 273 static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 274     CFIndex processedByteLen = 0;
 275     CFIndex theUsedCharLen = 0;
 276     UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
 277     CFIndex usedLen;
 278     UniChar character;
 279     bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
 280
 281     while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 282         if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break;
 283
 284         if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
 285             CFIndex idx;
 286
 287             usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
 288             *usedCharLen = theUsedCharLen;
 289
 290             for (idx = 0;idx < usedLen;idx++) {
 291                 if (charBuffer[idx] > 0xFFFF) { // Non-BMP
 292                     if (theUsedCharLen + 2 > maxCharLen)  return processedByteLen;
 293                     theUsedCharLen += 2;
 294                     if (maxCharLen) {
 295                         charBuffer[idx] = charBuffer[idx] - 0x10000;
 296                         *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
 297                         *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
 298                     }
 299                 } else {
 300                     if (theUsedCharLen + 1 > maxCharLen)  return processedByteLen;
 301                     ++theUsedCharLen;
 302                     *(characters++) = charBuffer[idx];
 303                 }
 304             }
 305         } else {
 306             if (maxCharLen) *(characters++) = character;
 307             ++theUsedCharLen;
 308         }
 309         processedByteLen++;
 310     }
 311
 312     *usedCharLen = theUsedCharLen;
 313     return processedByteLen;
 314 }
 315
 316 static CFIndex __CFToBytesStandardEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 317     CFIndex processedCharLen = 0;
 318     uint8_t byte;
 319     CFIndex usedLen;
 320
 321     *usedByteLen = 0;
 322
 323     while (numChars && (!maxByteLen || (*usedByteLen < maxByteLen))) {
 324         if (!(usedLen = ((CFStringEncodingStandardEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters, numChars, &byte))) break;
 325
 326         if (maxByteLen) bytes[*usedByteLen] = byte;
 327         (*usedByteLen)++;
 328         characters += usedLen;
 329         numChars -= usedLen;
 330         processedCharLen += usedLen;
 331     }
 332
 333     return processedCharLen;
 334 }
 335
 336 static CFIndex __CFToUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 337     CFIndex processedByteLen = 0;
 338     UniChar charBuffer[__CFMaximumConvertedLength];
 339     CFIndex usedLen;
 340
 341     *usedCharLen = 0;
 342
 343     while ((processedByteLen < numBytes) && (!maxCharLen || (*usedCharLen < maxCharLen))) {
 344         if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break;
 345
 346         if (maxCharLen) {
 347             CFIndex idx;
 348
 349             if (*usedCharLen + usedLen > maxCharLen) break;
 350
 351             for (idx = 0;idx < usedLen;idx++) {
 352                 characters[*usedCharLen + idx] = charBuffer[idx];
 353             }
 354         }
 355         *usedCharLen += usedLen;
 356         processedByteLen++;
 357     }
 358
 359     return processedByteLen;
 360 }
 361
 362 static CFIndex __CFToCanonicalUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 363     CFIndex processedByteLen = 0;
 364     UniChar charBuffer[__CFMaximumConvertedLength];
 365     UTF32Char decompBuffer[MAX_DECOMPOSED_LENGTH];
 366     CFIndex usedLen;
 367     CFIndex decompedLen;
 368     CFIndex idx, decompIndex;
 369     bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
 370     CFIndex theUsedCharLen = 0;
 371
 372     while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 373         if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break;
 374
 375         for (idx = 0;idx < usedLen;idx++) {
 376             if (CFUniCharIsDecomposableCharacter(charBuffer[idx], isHFSPlus)) {
 377                 decompedLen = CFUniCharDecomposeCharacter(charBuffer[idx], decompBuffer, MAX_DECOMPOSED_LENGTH);
 378                 *usedCharLen = theUsedCharLen;
 379
 380                 for (decompIndex = 0;decompIndex < decompedLen;decompIndex++) {
 381                     if (decompBuffer[decompIndex] > 0xFFFF) { // Non-BMP
 382                         if (theUsedCharLen + 2 > maxCharLen)  return processedByteLen;
 383                         theUsedCharLen += 2;
 384                         if (maxCharLen) {
 385                             charBuffer[idx] = charBuffer[idx] - 0x10000;
 386                             *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL;
 387                             *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL;
 388                         }
 389                     } else {
 390                         if (theUsedCharLen + 1 > maxCharLen)  return processedByteLen;
 391                         ++theUsedCharLen;
 392                         *(characters++) = charBuffer[idx];
 393                     }
 394                 }
 395             } else {
 396                 if (maxCharLen) *(characters++) = charBuffer[idx];
 397                 ++theUsedCharLen;
 398             }
 399         }
 400         processedByteLen++;
 401     }
 402
 403     *usedCharLen = theUsedCharLen;
 404     return processedByteLen;
 405 }
 406
 407 static CFIndex __CFToBytesCheapMultiByteWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 408     CFIndex processedCharLen = 0;
 409     uint8_t byteBuffer[__CFMaximumConvertedLength];
 410     CFIndex usedLen;
 411
 412     *usedByteLen = 0;
 413
 414     while ((processedCharLen < numChars) && (!maxByteLen || (*usedByteLen < maxByteLen))) {
 415         if (!(usedLen = ((CFStringEncodingCheapMultiByteToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], byteBuffer))) break;
 416
 417         if (maxByteLen) {
 418             CFIndex idx;
 419
 420             if (*usedByteLen + usedLen > maxByteLen) break;
 421
 422             for (idx = 0;idx <usedLen;idx++) {
 423                 bytes[*usedByteLen + idx] = byteBuffer[idx];
 424             }
 425         }
 426
 427         *usedByteLen += usedLen;
 428         processedCharLen++;
 429     }
 430
 431     return processedCharLen;
 432 }
 433
 434 static CFIndex __CFToUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 435     CFIndex processedByteLen = 0;
 436     UniChar character;
 437     CFIndex usedLen;
 438
 439     *usedCharLen = 0;
 440
 441     while (numBytes && (!maxCharLen || (*usedCharLen < maxCharLen))) {
 442         if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break;
 443
 444         if (maxCharLen) *(characters++) = character;
 445         (*usedCharLen)++;
 446         processedByteLen += usedLen;
 447         bytes += usedLen;
 448         numBytes -= usedLen;
 449     }
 450
 451     return processedByteLen;
 452 }
 453
 454 static CFIndex __CFToCanonicalUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 455     CFIndex processedByteLen = 0;
 456     UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
 457     UniChar character;
 458     CFIndex usedLen;
 459     CFIndex decomposedLen;
 460     CFIndex theUsedCharLen = 0;
 461     bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
 462
 463     while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 464         if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break;
 465
 466         if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
 467             CFIndex idx;
 468
 469             decomposedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
 470             *usedCharLen = theUsedCharLen;
 471
 472             for (idx = 0;idx < decomposedLen;idx++) {
 473                 if (charBuffer[idx] > 0xFFFF) { // Non-BMP
 474                     if (theUsedCharLen + 2 > maxCharLen)  return processedByteLen;
 475                     theUsedCharLen += 2;
 476                     if (maxCharLen) {
 477                         charBuffer[idx] = charBuffer[idx] - 0x10000;
 478                         *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
 479                         *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
 480                     }
 481                 } else {
 482                     if (theUsedCharLen + 1 > maxCharLen)  return processedByteLen;
 483                     ++theUsedCharLen;
 484                     *(characters++) = charBuffer[idx];
 485                 }
 486             }
 487         } else {
 488             if (maxCharLen) *(characters++) = character;
 489             ++theUsedCharLen;
 490         }
 491
 492         processedByteLen += usedLen;
 493         bytes += usedLen;
 494         numBytes -= usedLen;
 495     }
 496     *usedCharLen = theUsedCharLen;
 497     return processedByteLen;
 498 }
 499
 500 /* static functions
 501 */
 502 CF_INLINE _CFEncodingConverter *__CFEncodingConverterFromDefinition(const CFStringEncodingConverter *definition, CFStringEncoding encoding) {
 503 #define NUM_OF_ENTRIES_CYCLE (10)
 504     static uint32_t _currentIndex = 0;
 505     static uint32_t _allocatedSize = 0;
 506     static _CFEncodingConverter *_allocatedEntries = NULL;
 507     _CFEncodingConverter *converter;
 508
 509
 510     if ((_currentIndex + 1) >= _allocatedSize) {
 511         _currentIndex = 0;
 512         _allocatedSize = 0;
 513         _allocatedEntries = NULL;
 514     }
 515     if (_allocatedEntries == NULL) { // Not allocated yet
 516         _allocatedEntries = (_CFEncodingConverter *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(_CFEncodingConverter) * NUM_OF_ENTRIES_CYCLE, 0);
 517         _allocatedSize = NUM_OF_ENTRIES_CYCLE;
 518         converter = &(_allocatedEntries[_currentIndex]);
 519     } else {
 520         converter = &(_allocatedEntries[++_currentIndex]);
 521     }
 522
 523     memset(converter, 0, sizeof(_CFEncodingConverter));
 524
 525     converter->definition = definition;
 526
 527     switch (definition->encodingClass) {
 528         case kCFStringEncodingConverterStandard:
 529             converter->toBytes = NULL;
 530             converter->toUnicode = NULL;
 531             converter->toCanonicalUnicode = NULL;
 532             break;
 533
 534         case kCFStringEncodingConverterCheapEightBit:
 535             converter->toBytes = __CFToBytesCheapEightBitWrapper;
 536             converter->toUnicode = __CFToUnicodeCheapEightBitWrapper;
 537             converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapEightBitWrapper;
 538             break;
 539
 540         case kCFStringEncodingConverterStandardEightBit:
 541             converter->toBytes = __CFToBytesStandardEightBitWrapper;
 542             converter->toUnicode = __CFToUnicodeStandardEightBitWrapper;
 543             converter->toCanonicalUnicode = __CFToCanonicalUnicodeStandardEightBitWrapper;
 544             break;
 545
 546         case kCFStringEncodingConverterCheapMultiByte:
 547             converter->toBytes = __CFToBytesCheapMultiByteWrapper;
 548             converter->toUnicode = __CFToUnicodeCheapMultiByteWrapper;
 549             converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapMultiByteWrapper;
 550             break;
 551
 552         case kCFStringEncodingConverterICU:
 553             converter->toBytes = (_CFToBytesProc)__CFStringEncodingGetICUName(encoding);
 554             break;
 555
 556         case kCFStringEncodingConverterPlatformSpecific:
 557             break;
 558
 559         default: // Shouln't be here
 560             return NULL;
 561     }
 562
 563     converter->toBytesFallback = (definition->toBytesFallback ? definition->toBytesFallback : __CFDefaultToBytesFallbackProc);
 564     converter->toUnicodeFallback = (definition->toUnicodeFallback ? definition->toUnicodeFallback : __CFDefaultToUnicodeFallbackProc);
 565
 566     return converter;
 567 }
 568
 569 CF_INLINE const CFStringEncodingConverter *__CFStringEncodingConverterGetDefinition(CFStringEncoding encoding) {
 570     switch (encoding) {
 571         case kCFStringEncodingUTF8:
 572             return &__CFConverterUTF8;
 573
 574         case kCFStringEncodingMacRoman:
 575             return &__CFConverterMacRoman;
 576
 577         case kCFStringEncodingWindowsLatin1:
 578             return &__CFConverterWinLatin1;
 579
 580         case kCFStringEncodingASCII:
 581             return &__CFConverterASCII;
 582
 583         case kCFStringEncodingISOLatin1:
 584             return &__CFConverterISOLatin1;
 585
 586
 587         case kCFStringEncodingNextStepLatin:
 588             return &__CFConverterNextStepLatin;
 589
 590
 591         default:
 592             return __CFStringEncodingGetExternalConverter(encoding);
 593     }
 594 }
 595
 596 static const _CFEncodingConverter *__CFGetConverter(uint32_t encoding) {
 597     const _CFEncodingConverter *converter = NULL;
 598     const _CFEncodingConverter **commonConverterSlot = NULL;
 599     static _CFEncodingConverter *commonConverters[3] = {NULL, NULL, NULL}; // UTF8, MacRoman/WinLatin1, and the default encoding*
 600     static CFMutableDictionaryRef mappingTable = NULL;
 601     static CFSpinLock_t lock = CFSpinLockInit;
 602
 603     switch (encoding) {
 604         case kCFStringEncodingUTF8: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[0]); break;
 605
 606             /* the swith here should avoid possible bootstrap issues in the default: case below when invoked from CFStringGetSystemEncoding() */
 607 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
 608         case kCFStringEncodingMacRoman: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[1]); break;
 609 #elif DEPLOYMENT_TARGET_WINDOWS
 610         case kCFStringEncodingWindowsLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break;
 611 #else
 612 #warning This case must match __defaultEncoding value defined in CFString.c
 613         case kCFStringEncodingISOLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break;
 614 #endif /* DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED */
 615
 616         default: if (CFStringGetSystemEncoding() == encoding) commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[2]); break;
 617     }
 618
 619     __CFSpinLock(&lock);
 620     converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot);
 621     __CFSpinUnlock(&lock);
 622
 623     if (NULL == converter) {
 624         const CFStringEncodingConverter *definition = __CFStringEncodingConverterGetDefinition(encoding);
 625
 626         if (NULL != definition) {
 627             __CFSpinLock(&lock);
 628             converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot);
 629
 630             if (NULL == converter) {
 631                 converter = __CFEncodingConverterFromDefinition(definition, encoding);
 632
 633                 if (NULL == commonConverterSlot) {
 634                     if (NULL == mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, NULL, NULL);
 635
 636                     CFDictionarySetValue(mappingTable, (const void *)(uintptr_t)encoding, converter);
 637                 } else {
 638                     *commonConverterSlot = converter;
 639                 }
 640             }
 641             __CFSpinUnlock(&lock);
 642         }
 643     }
 644
 645     return converter;
 646 }
 647
 648 /* Public API
 649 */
 650 uint32_t CFStringEncodingUnicodeToBytes(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 651     if (encoding == kCFStringEncodingUTF8) {
 652         static CFStringEncodingToBytesProc __CFToUTF8 = NULL;
 653         CFIndex convertedCharLen;
 654         CFIndex usedLen;
 655
 656
 657         if ((flags & kCFStringEncodingUseCanonical) || (flags & kCFStringEncodingUseHFSPlusCanonical)) {
 658             (void)CFUniCharDecompose(characters, numChars, &convertedCharLen, (void *)bytes, maxByteLen, &usedLen, true, kCFUniCharUTF8Format, (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false));
 659         } else {
 660             if (!__CFToUTF8) {
 661                 const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
 662                 __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes;
 663             }
 664             convertedCharLen = __CFToUTF8(0, characters, numChars, bytes, maxByteLen, &usedLen);
 665         }
 666         if (usedCharLen) *usedCharLen = convertedCharLen;
 667         if (usedByteLen) *usedByteLen = usedLen;
 668
 669         if (convertedCharLen == numChars) {
 670             return kCFStringEncodingConversionSuccess;
 671         } else if ((maxByteLen > 0) && ((maxByteLen - usedLen) < 10)) { // could be filled outbuf
 672             UTF16Char character = characters[convertedCharLen];
 673
 674             if (((character >= kSurrogateLowStart) && (character <= kSurrogateLowEnd)) || ((character >= kSurrogateHighStart) && (character <= kSurrogateHighEnd) && ((1 == (numChars - convertedCharLen)) || (characters[convertedCharLen + 1] < kSurrogateLowStart) || (characters[convertedCharLen + 1] > kSurrogateLowEnd)))) return kCFStringEncodingInvalidInputStream;
 675
 676             return kCFStringEncodingInsufficientOutputBufferLength;
 677         } else {
 678             return kCFStringEncodingInvalidInputStream;
 679         }
 680     } else {
 681         const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 682         CFIndex usedLen = 0;
 683         CFIndex localUsedByteLen;
 684         CFIndex theUsedByteLen = 0;
 685         uint32_t theResult = kCFStringEncodingConversionSuccess;
 686         CFStringEncodingToBytesPrecomposeProc toBytesPrecompose = NULL;
 687         CFStringEncodingIsValidCombiningCharacterProc isValidCombiningChar = NULL;
 688
 689         if (!converter) return kCFStringEncodingConverterUnavailable;
 690
 691         if (flags & kCFStringEncodingSubstituteCombinings) {
 692             if (!(flags & kCFStringEncodingAllowLossyConversion)) isValidCombiningChar = converter->definition->isValidCombiningChar;
 693        } else {
 694             isValidCombiningChar = converter->definition->isValidCombiningChar;
 695             if (!(flags & kCFStringEncodingIgnoreCombinings)) {
 696                 toBytesPrecompose = converter->definition->toBytesPrecompose;
 697                 flags |= kCFStringEncodingComposeCombinings;
 698             }
 699         }
 700
 701         if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToBytes((const char *)converter->toBytes, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen);
 702
 703         /* Platform converter */
 704         if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformUnicodeToBytes(encoding, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen);
 705
 706         while ((usedLen < numChars) && (!maxByteLen || (theUsedByteLen < maxByteLen))) {
 707             if ((usedLen += TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) < numChars) {
 708                 CFIndex dummy;
 709
 710                 if (isValidCombiningChar && (usedLen > 0) && isValidCombiningChar(characters[usedLen])) {
 711                     if (toBytesPrecompose) {
 712                         CFIndex localUsedLen = usedLen;
 713
 714                         while (isValidCombiningChar(characters[--usedLen]));
 715                         theUsedByteLen += localUsedByteLen;
 716                         if (converter->definition->maxBytesPerChar > 1) {
 717                             TO_BYTE(converter, flags, characters + usedLen, localUsedLen - usedLen, NULL, 0, &localUsedByteLen);
 718                             theUsedByteLen -= localUsedByteLen;
 719                         } else {
 720                             theUsedByteLen--;
 721                         }
 722                         if ((localUsedLen = toBytesPrecompose(flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) > 0) {
 723                             usedLen += localUsedLen;
 724                             if ((usedLen < numChars) && isValidCombiningChar(characters[usedLen])) { // There is a non-base char not combined remaining
 725                                 theUsedByteLen += localUsedByteLen;
 726                                 theResult = kCFStringEncodingInvalidInputStream;
 727                                 break;
 728                             }
 729                         } else if (flags & kCFStringEncodingAllowLossyConversion) {
 730                             uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
 731
 732                             if (lossyByte) {
 733                                 while (isValidCombiningChar(characters[++usedLen]));
 734                                 localUsedByteLen = 1;
 735                                 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
 736                             } else {
 737                                 ++usedLen;
 738                                 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
 739                             }
 740                         } else {
 741                             theResult = kCFStringEncodingInvalidInputStream;
 742                             break;
 743                         }
 744                     } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up
 745                                     theUsedByteLen += localUsedByteLen;
 746                                     theResult = kCFStringEncodingInsufficientOutputBufferLength;
 747                                     break;
 748                     } else if (flags & kCFStringEncodingIgnoreCombinings) {
 749                         while ((++usedLen < numChars) && isValidCombiningChar(characters[usedLen]));
 750                     } else {
 751                         uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
 752
 753                         theUsedByteLen += localUsedByteLen;
 754                         if (lossyByte) {
 755                             ++usedLen;
 756                             localUsedByteLen = 1;
 757                             if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
 758                         } else {
 759                             usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
 760                         }
 761                     }
 762                 } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up
 763                     theUsedByteLen += localUsedByteLen;
 764
 765                     if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) {
 766                         CFIndex localUsedLen;
 767
 768                         localUsedByteLen = 0;
 769                         while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen;
 770                     }
 771                     if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength;
 772                     break;
 773                 } else if (flags & kCFStringEncodingAllowLossyConversion) {
 774                     uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
 775
 776                     theUsedByteLen += localUsedByteLen;
 777                     if (lossyByte) {
 778                         ++usedLen;
 779                         localUsedByteLen = 1;
 780                         if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
 781                     } else {
 782                         usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
 783                     }
 784                 } else {
 785                     theUsedByteLen += localUsedByteLen;
 786                     theResult = kCFStringEncodingInvalidInputStream;
 787                     break;
 788                 }
 789             }
 790             theUsedByteLen += localUsedByteLen;
 791         }
 792
 793         if (usedLen < numChars && maxByteLen && theResult == kCFStringEncodingConversionSuccess) {
 794             if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) {
 795                 CFIndex localUsedLen;
 796
 797                 localUsedByteLen = 0;
 798                 while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen;
 799             }
 800             if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength;
 801         }
 802         if (usedByteLen) *usedByteLen = theUsedByteLen;
 803         if (usedCharLen) *usedCharLen = usedLen;
 804
 805         return theResult;
 806     }
 807 }
 808
 809 uint32_t CFStringEncodingBytesToUnicode(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 810     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 811     CFIndex usedLen = 0;
 812     CFIndex theUsedCharLen = 0;
 813     CFIndex localUsedCharLen;
 814     uint32_t theResult = kCFStringEncodingConversionSuccess;
 815
 816     if (!converter) return kCFStringEncodingConverterUnavailable;
 817
 818     if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToUnicode((const char *)converter->toBytes, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen);
 819
 820     /* Platform converter */
 821     if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformBytesToUnicode(encoding, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen);
 822
 823     while ((usedLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 824         if ((usedLen += TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen)) < numBytes) {
 825             CFIndex tempUsedCharLen;
 826
 827             if (maxCharLen && ((maxCharLen == theUsedCharLen + localUsedCharLen) || (((flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) || (maxCharLen == theUsedCharLen + localUsedCharLen + 1)) && TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, NULL, 0, &tempUsedCharLen)))) { // buffer was filled up
 828                 theUsedCharLen += localUsedCharLen;
 829                 theResult = kCFStringEncodingInsufficientOutputBufferLength;
 830                 break;
 831             } else if (flags & kCFStringEncodingAllowLossyConversion) {
 832                 theUsedCharLen += localUsedCharLen;
 833                 usedLen += TO_UNICODE_FALLBACK(converter, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen);
 834             } else {
 835                 theUsedCharLen += localUsedCharLen;
 836                 theResult = kCFStringEncodingInvalidInputStream;
 837                 break;
 838             }
 839         }
 840         theUsedCharLen += localUsedCharLen;
 841     }
 842
 843     if (usedLen < numBytes && maxCharLen && theResult == kCFStringEncodingConversionSuccess) {
 844         theResult = kCFStringEncodingInsufficientOutputBufferLength;
 845     }
 846     if (usedCharLen) *usedCharLen = theUsedCharLen;
 847     if (usedByteLen) *usedByteLen = usedLen;
 848
 849     return theResult;
 850 }
 851
 852 __private_extern__ bool CFStringEncodingIsValidEncoding(uint32_t encoding) {
 853     return (CFStringEncodingGetConverter(encoding) ? true : false);
 854 }
 855
 856 __private_extern__ CFIndex CFStringEncodingCharLengthForBytes(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
 857     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 858
 859     if (converter) {
 860         if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUCharLength((const char *)converter->toBytes, flags, bytes, numBytes);
 861
 862         if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformCharLengthForBytes(encoding, flags, bytes, numBytes);
 863
 864         if (1 == converter->definition->maxBytesPerChar) return numBytes;
 865
 866         if (NULL == converter->definition->toUnicodeLen) {
 867             CFIndex usedByteLen = 0;
 868             CFIndex totalLength = 0;
 869             CFIndex usedCharLen;
 870
 871             while (numBytes > 0) {
 872                 usedByteLen = TO_UNICODE(converter, flags, bytes, numBytes, NULL, 0, &usedCharLen);
 873
 874                 bytes += usedByteLen;
 875                 numBytes -= usedByteLen;
 876                 totalLength += usedCharLen;
 877
 878                 if (numBytes > 0) {
 879                     if (0 == (flags & kCFStringEncodingAllowLossyConversion)) return 0;
 880
 881                     usedByteLen = TO_UNICODE_FALLBACK(converter, bytes, numBytes, NULL, 0, &usedCharLen);
 882
 883                     bytes += usedByteLen;
 884                     numBytes -= usedByteLen;
 885                     totalLength += usedCharLen;
 886                 }
 887             }
 888
 889             return totalLength;
 890         } else {
 891             return converter->definition->toUnicodeLen(flags, bytes, numBytes);
 892         }
 893     }
 894
 895     return 0;
 896 }
 897
 898 __private_extern__ CFIndex CFStringEncodingByteLengthForCharacters(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars) {
 899     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 900
 901     if (converter) {
 902         if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUByteLength((const char *)converter->toBytes, flags, characters, numChars);
 903
 904         if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformByteLengthForCharacters(encoding, flags, characters, numChars);
 905
 906         if (1 == converter->definition->maxBytesPerChar) return numChars;
 907
 908         if (NULL == converter->definition->toBytesLen) {
 909             CFIndex usedCharLen;
 910
 911             return ((kCFStringEncodingConversionSuccess == CFStringEncodingUnicodeToBytes(encoding, flags, characters, numChars, &usedCharLen, NULL, 0, NULL)) ? usedCharLen : 0);
 912         } else {
 913             return converter->definition->toBytesLen(flags, characters, numChars);
 914         }
 915     }
 916
 917     return 0;
 918 }
 919
 920 __private_extern__ void CFStringEncodingRegisterFallbackProcedures(uint32_t encoding, CFStringEncodingToBytesFallbackProc toBytes, CFStringEncodingToUnicodeFallbackProc toUnicode) {
 921     _CFEncodingConverter *converter = (_CFEncodingConverter *)__CFGetConverter(encoding);
 922
 923     if (NULL != converter) {
 924        const CFStringEncodingConverter *body = CFStringEncodingGetConverter(encoding);
 925
 926         converter->toBytesFallback = ((NULL == toBytes) ? ((NULL == body) ? __CFDefaultToBytesFallbackProc : body->toBytesFallback) : toBytes);
 927         converter->toUnicodeFallback = ((NULL == toUnicode) ? ((NULL == body) ? __CFDefaultToUnicodeFallbackProc : body->toUnicodeFallback) : toUnicode);
 928     }
 929 }
 930
 931 __private_extern__ const CFStringEncodingConverter *CFStringEncodingGetConverter(uint32_t encoding) {
 932     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 933
 934     return ((NULL == converter) ? NULL : converter->definition);
 935 }
 936
 937 static const CFStringEncoding __CFBuiltinEncodings[] = {
 938     kCFStringEncodingMacRoman,
 939     kCFStringEncodingWindowsLatin1,
 940     kCFStringEncodingISOLatin1,
 941     kCFStringEncodingNextStepLatin,
 942     kCFStringEncodingASCII,
 943     kCFStringEncodingUTF8,
 944     /* These seven are available only in CFString-level */
 945     kCFStringEncodingNonLossyASCII,
 946
 947     kCFStringEncodingUTF16,
 948     kCFStringEncodingUTF16BE,
 949     kCFStringEncodingUTF16LE,
 950
 951     kCFStringEncodingUTF32,
 952     kCFStringEncodingUTF32BE,
 953     kCFStringEncodingUTF32LE,
 954
 955     kCFStringEncodingInvalidId,
 956 };
 957
 958 static CFComparisonResult __CFStringEncodingComparator(const void *v1, const void *v2, void *context) {
 959     CFComparisonResult val1 = (*(const CFStringEncoding *)v1) & 0xFFFF;
 960     CFComparisonResult val2 = (*(const CFStringEncoding *)v2) & 0xFFFF;
 961
 962     return ((val1 == val2) ? ((CFComparisonResult)(*(const CFStringEncoding *)v1) - (CFComparisonResult)(*(const CFStringEncoding *)v2)) : val1 - val2);
 963 }
 964
 965 static void __CFStringEncodingFliterDupes(CFStringEncoding *encodings, CFIndex numSlots) {
 966     CFStringEncoding last = kCFStringEncodingInvalidId;
 967     const CFStringEncoding *limitEncodings = encodings + numSlots;
 968
 969     while (encodings < limitEncodings) {
 970         if (last == *encodings) {
 971             if ((encodings + 1) < limitEncodings) memmove(encodings, encodings + 1, sizeof(CFStringEncoding) * (limitEncodings - encodings - 1));
 972             --limitEncodings;
 973         } else {
 974             last = *(encodings++);
 975         }
 976     }
 977 }
 978
 979 __private_extern__ const CFStringEncoding *CFStringEncodingListOfAvailableEncodings(void) {
 980     static const CFStringEncoding *encodings = NULL;
 981
 982     if (NULL == encodings) {
 983         CFStringEncoding *list = (CFStringEncoding *)__CFBuiltinEncodings;
 984         CFIndex numICUConverters = 0, numPlatformConverters = 0;
 985         CFStringEncoding *icuConverters = __CFStringEncodingCreateICUEncodings(NULL, &numICUConverters);
 986         CFStringEncoding *platformConverters = __CFStringEncodingCreateListOfAvailablePlatformConverters(NULL, &numPlatformConverters);
 987
 988         if ((NULL != icuConverters) || (NULL != platformConverters)) {
 989             CFIndex numSlots = (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters + numPlatformConverters;
 990
 991             list = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * numSlots, 0);
 992
 993             memcpy(list, __CFBuiltinEncodings, sizeof(__CFBuiltinEncodings));
 994
 995             if (NULL != icuConverters) {
 996                 memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)), icuConverters, sizeof(CFStringEncoding) * numICUConverters);
 997                 CFAllocatorDeallocate(NULL, icuConverters);
 998             }
 999
1000             if (NULL != platformConverters) {
1001                 memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters, platformConverters, sizeof(CFStringEncoding) * numPlatformConverters);
1002                 CFAllocatorDeallocate(NULL, platformConverters);
1003             }
1004
1005             CFQSortArray(list, numSlots, sizeof(CFStringEncoding), (CFComparatorFunction)__CFStringEncodingComparator, NULL);
1006             __CFStringEncodingFliterDupes(list, numSlots);
1007         }
1008         if (!OSAtomicCompareAndSwapPtrBarrier(NULL, list, (void * volatile *)&encodings) && (list != __CFBuiltinEncodings)) CFAllocatorDeallocate(NULL, list);
1009     }
1010
1011     return encodings;
1012 }
1013
1014 #undef TO_BYTE
1015 #undef TO_UNICODE
1016 #undef ASCIINewLine
1017 #undef kSurrogateHighStart
1018 #undef kSurrogateHighEnd
1019 #undef kSurrogateLowStart
1020 #undef kSurrogateLowEnd
1021 #undef TO_BYTE_FALLBACK
1022 #undef TO_UNICODE_FALLBACK
1023 #undef EXTRA_BASE
1024 #undef NUM_OF_ENTRIES_CYCLE
1025