CFStringEncodingConverter.c

   1 /*
   2  * Copyright (c) 2012 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24 /*      CFStringEncodingConverter.c
  25         Copyright (c) 1998-2011, Apple Inc. All rights reserved.
  26         Responsibility: Aki Inoue
  27 */
  28
  29 #include "CFInternal.h"
  30 #include <CoreFoundation/CFArray.h>
  31 #include <CoreFoundation/CFDictionary.h>
  32 #include "CFICUConverters.h"
  33 #include <CoreFoundation/CFUniChar.h>
  34 #include <CoreFoundation/CFPriv.h>
  35 #include "CFUnicodeDecomposition.h"
  36 #include "CFStringEncodingConverterExt.h"
  37 #include "CFStringEncodingConverterPriv.h"
  38 #include <stdlib.h>
  39 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
  40 #include <pthread.h>
  41 #endif
  42
  43 typedef CFIndex (*_CFToBytesProc)(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen);
  44 typedef CFIndex (*_CFToUnicodeProc)(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen);
  45
  46 typedef struct {
  47     const CFStringEncodingConverter *definition;
  48     _CFToBytesProc toBytes;
  49     _CFToUnicodeProc toUnicode;
  50     _CFToUnicodeProc toCanonicalUnicode;
  51     CFStringEncodingToBytesFallbackProc toBytesFallback;
  52     CFStringEncodingToUnicodeFallbackProc toUnicodeFallback;
  53 } _CFEncodingConverter;
  54
  55 /* Macros
  56 */
  57 #define TO_BYTE(conv,flags,chars,numChars,bytes,max,used) (conv->toBytes ? conv->toBytes(conv,flags,chars,numChars,bytes,max,used) : ((CFStringEncodingToBytesProc)conv->definition->toBytes)(flags,chars,numChars,bytes,max,used))
  58 #define TO_UNICODE(conv,flags,bytes,numBytes,chars,max,used) (conv->toUnicode ?  (flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical) ? conv->toCanonicalUnicode(conv,flags,bytes,numBytes,chars,max,used) : conv->toUnicode(conv,flags,bytes,numBytes,chars,max,used)) : ((CFStringEncodingToUnicodeProc)conv->definition->toUnicode)(flags,bytes,numBytes,chars,max,used))
  59
  60 #define ASCIINewLine 0x0a
  61 #define kSurrogateHighStart 0xD800
  62 #define kSurrogateHighEnd 0xDBFF
  63 #define kSurrogateLowStart 0xDC00
  64 #define kSurrogateLowEnd 0xDFFF
  65
  66 static const uint8_t __CFMaximumConvertedLength = 20;
  67
  68 /* Mapping 128..255 to lossy ASCII
  69 */
  70 static const struct {
  71     unsigned char chars[4];
  72 } _toLossyASCIITable[] = {
  73     {{' ', 0, 0, 0}}, // NO-BREAK SPACE
  74     {{'!', 0, 0, 0}}, // INVERTED EXCLAMATION MARK
  75     {{'c', 0, 0, 0}}, // CENT SIGN
  76     {{'L', 0, 0, 0}}, // POUND SIGN
  77     {{'$', 0, 0, 0}}, // CURRENCY SIGN
  78     {{'Y', 0, 0, 0}}, // YEN SIGN
  79     {{'|', 0, 0, 0}}, // BROKEN BAR
  80     {{0, 0, 0, 0}}, // SECTION SIGN
  81     {{0, 0, 0, 0}}, // DIAERESIS
  82     {{'(', 'C', ')', 0}}, // COPYRIGHT SIGN
  83     {{'a', 0, 0, 0}}, // FEMININE ORDINAL INDICATOR
  84     {{'<', '<', 0, 0}}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
  85     {{0, 0, 0, 0}}, // NOT SIGN
  86     {{'-', 0, 0, 0}}, // SOFT HYPHEN
  87     {{'(', 'R', ')', 0}}, // REGISTERED SIGN
  88     {{0, 0, 0, 0}}, // MACRON
  89     {{0, 0, 0, 0}}, // DEGREE SIGN
  90     {{'+', '-', 0, 0}}, // PLUS-MINUS SIGN
  91     {{'2', 0, 0, 0}}, // SUPERSCRIPT TWO
  92     {{'3', 0, 0, 0}}, // SUPERSCRIPT THREE
  93     {{0, 0, 0, 0}}, // ACUTE ACCENT
  94     {{0, 0, 0, 0}}, // MICRO SIGN
  95     {{0, 0, 0, 0}}, // PILCROW SIGN
  96     {{0, 0, 0, 0}}, // MIDDLE DOT
  97     {{0, 0, 0, 0}}, // CEDILLA
  98     {{'1', 0, 0, 0}}, // SUPERSCRIPT ONE
  99     {{'o', 0, 0, 0}}, // MASCULINE ORDINAL INDICATOR
 100     {{'>', '>', 0, 0}}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
 101     {{'1', '/', '4', 0}}, // VULGAR FRACTION ONE QUARTER
 102     {{'1', '/', '2', 0}}, // VULGAR FRACTION ONE HALF
 103     {{'3', '/', '4', 0}}, // VULGAR FRACTION THREE QUARTERS
 104     {{'?', 0, 0, 0}}, // INVERTED QUESTION MARK
 105     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH GRAVE
 106     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH ACUTE
 107     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
 108     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH TILDE
 109     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH DIAERESIS
 110     {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH RING ABOVE
 111     {{'A', 'E', 0, 0}}, // LATIN CAPITAL LETTER AE
 112     {{'C', 0, 0, 0}}, // LATIN CAPITAL LETTER C WITH CEDILLA
 113     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH GRAVE
 114     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH ACUTE
 115     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
 116     {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH DIAERESIS
 117     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH GRAVE
 118     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH ACUTE
 119     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
 120     {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH DIAERESIS
 121     {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER ETH (Icelandic)
 122     {{'N', 0, 0, 0}}, // LATIN CAPITAL LETTER N WITH TILDE
 123     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH GRAVE
 124     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH ACUTE
 125     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
 126     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH TILDE
 127     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH DIAERESIS
 128     {{'X', 0, 0, 0}}, // MULTIPLICATION SIGN
 129     {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH STROKE
 130     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH GRAVE
 131     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH ACUTE
 132     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
 133     {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH DIAERESIS
 134     {{'Y', 0, 0, 0}}, // LATIN CAPITAL LETTER Y WITH ACUTE
 135     {{'t', 'h', 0, 0}}, // LATIN CAPITAL LETTER THORN (Icelandic)
 136     {{'s', 0, 0, 0}}, // LATIN SMALL LETTER SHARP S (German)
 137     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH GRAVE
 138     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH ACUTE
 139     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH CIRCUMFLEX
 140     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH TILDE
 141     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH DIAERESIS
 142     {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH RING ABOVE
 143     {{'a', 'e', 0, 0}}, // LATIN SMALL LETTER AE
 144     {{'c', 0, 0, 0}}, // LATIN SMALL LETTER C WITH CEDILLA
 145     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH GRAVE
 146     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH ACUTE
 147     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH CIRCUMFLEX
 148     {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH DIAERESIS
 149     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH GRAVE
 150     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH ACUTE
 151     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH CIRCUMFLEX
 152     {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH DIAERESIS
 153     {{'T', 'H', 0, 0}}, // LATIN SMALL LETTER ETH (Icelandic)
 154     {{'n', 0, 0, 0}}, // LATIN SMALL LETTER N WITH TILDE
 155     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH GRAVE
 156     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH ACUTE
 157     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH CIRCUMFLEX
 158     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH TILDE
 159     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH DIAERESIS
 160     {{'/', 0, 0, 0}}, // DIVISION SIGN
 161     {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH STROKE
 162     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH GRAVE
 163     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH ACUTE
 164     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH CIRCUMFLEX
 165     {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH DIAERESIS
 166     {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH ACUTE
 167     {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER THORN (Icelandic)
 168     {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH DIAERESIS
 169 };
 170
 171 CF_INLINE CFIndex __CFToASCIILatin1Fallback(UniChar character, uint8_t *bytes, CFIndex maxByteLen) {
 172     const uint8_t *losChars = (const uint8_t*)_toLossyASCIITable + (character - 0xA0) * sizeof(uint8_t[4]);
 173     CFIndex numBytes = 0;
 174     CFIndex idx, max = (maxByteLen && (maxByteLen < 4) ? maxByteLen : 4);
 175
 176     for (idx = 0;idx < max;idx++) {
 177         if (losChars[idx]) {
 178             if (maxByteLen) bytes[idx] = losChars[idx];
 179             ++numBytes;
 180         } else {
 181             break;
 182         }
 183     }
 184
 185     return numBytes;
 186 }
 187
 188 static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 189     CFIndex processCharLen = 1, filledBytesLen = 1;
 190     uint8_t byte = '?';
 191
 192     if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range
 193         byte = (uint8_t)(*characters - 0x80);
 194     } else if (*characters < 0x100) {
 195         *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen);
 196         return 1;
 197     } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) {
 198         processCharLen = (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1);
 199     } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) {
 200         byte = ' ';
 201     } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) {
 202         byte = ASCIINewLine;
 203     } else if (*characters == 0x2026) { // ellipsis
 204         if (0 == maxByteLen) {
 205             filledBytesLen = 3;
 206         } else if (maxByteLen > 2) {
 207             memset(bytes, '.', 3);
 208             *usedByteLen = 3;
 209             return processCharLen;
 210         }
 211     } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) {
 212         UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
 213
 214         (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH);
 215         if (*decomposed < 0x80) {
 216             byte = (uint8_t)(*decomposed);
 217         } else {
 218             UTF16Char theChar = *decomposed;
 219
 220             return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen);
 221         }
 222     }
 223
 224     if (maxByteLen) *bytes = byte;
 225     *usedByteLen = filledBytesLen;
 226     return processCharLen;
 227 }
 228
 229 static CFIndex __CFDefaultToUnicodeFallbackProc(const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 230     if (maxCharLen) *characters = (UniChar)'?';
 231     *usedCharLen = 1;
 232     return 1;
 233 }
 234
 235 #define TO_BYTE_FALLBACK(conv,chars,numChars,bytes,max,used) (conv->toBytesFallback(chars,numChars,bytes,max,used))
 236 #define TO_UNICODE_FALLBACK(conv,bytes,numBytes,chars,max,used) (conv->toUnicodeFallback(bytes,numBytes,chars,max,used))
 237
 238 #define EXTRA_BASE (0x0F00)
 239
 240 /* Wrapper funcs for non-standard converters
 241 */
 242 static CFIndex __CFToBytesCheapEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 243     CFIndex processedCharLen = 0;
 244     CFIndex length = (maxByteLen && (maxByteLen < numChars) ? maxByteLen : numChars);
 245     uint8_t byte;
 246
 247     while (processedCharLen < length) {
 248         if (!((CFStringEncodingCheapEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], &byte)) break;
 249
 250         if (maxByteLen) bytes[processedCharLen] = byte;
 251         processedCharLen++;
 252     }
 253
 254     *usedByteLen = processedCharLen;
 255     return processedCharLen;
 256 }
 257
 258 static CFIndex __CFToUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 259     CFIndex processedByteLen = 0;
 260     CFIndex length = (maxCharLen && (maxCharLen < numBytes) ? maxCharLen : numBytes);
 261     UniChar character;
 262
 263     while (processedByteLen < length) {
 264         if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break;
 265
 266         if (maxCharLen) characters[processedByteLen] = character;
 267         processedByteLen++;
 268     }
 269
 270     *usedCharLen = processedByteLen;
 271     return processedByteLen;
 272 }
 273
 274 static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 275     CFIndex processedByteLen = 0;
 276     CFIndex theUsedCharLen = 0;
 277     UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
 278     CFIndex usedLen;
 279     UniChar character;
 280     bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
 281
 282     while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 283         if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break;
 284
 285         if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
 286             CFIndex idx;
 287
 288             usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
 289             *usedCharLen = theUsedCharLen;
 290
 291             for (idx = 0;idx < usedLen;idx++) {
 292                 if (charBuffer[idx] > 0xFFFF) { // Non-BMP
 293                     if (theUsedCharLen + 2 > maxCharLen)  return processedByteLen;
 294                     theUsedCharLen += 2;
 295                     if (maxCharLen) {
 296                         charBuffer[idx] = charBuffer[idx] - 0x10000;
 297                         *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
 298                         *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
 299                     }
 300                 } else {
 301                     if (theUsedCharLen + 1 > maxCharLen)  return processedByteLen;
 302                     ++theUsedCharLen;
 303                     *(characters++) = charBuffer[idx];
 304                 }
 305             }
 306         } else {
 307             if (maxCharLen) *(characters++) = character;
 308             ++theUsedCharLen;
 309         }
 310         processedByteLen++;
 311     }
 312
 313     *usedCharLen = theUsedCharLen;
 314     return processedByteLen;
 315 }
 316
 317 static CFIndex __CFToBytesStandardEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 318     CFIndex processedCharLen = 0;
 319     uint8_t byte;
 320     CFIndex usedLen;
 321
 322     *usedByteLen = 0;
 323
 324     while (numChars && (!maxByteLen || (*usedByteLen < maxByteLen))) {
 325         if (!(usedLen = ((CFStringEncodingStandardEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters, numChars, &byte))) break;
 326
 327         if (maxByteLen) bytes[*usedByteLen] = byte;
 328         (*usedByteLen)++;
 329         characters += usedLen;
 330         numChars -= usedLen;
 331         processedCharLen += usedLen;
 332     }
 333
 334     return processedCharLen;
 335 }
 336
 337 static CFIndex __CFToUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 338     CFIndex processedByteLen = 0;
 339     UniChar charBuffer[__CFMaximumConvertedLength];
 340     CFIndex usedLen;
 341
 342     *usedCharLen = 0;
 343
 344     while ((processedByteLen < numBytes) && (!maxCharLen || (*usedCharLen < maxCharLen))) {
 345         if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break;
 346
 347         if (maxCharLen) {
 348             CFIndex idx;
 349
 350             if (*usedCharLen + usedLen > maxCharLen) break;
 351
 352             for (idx = 0;idx < usedLen;idx++) {
 353                 characters[*usedCharLen + idx] = charBuffer[idx];
 354             }
 355         }
 356         *usedCharLen += usedLen;
 357         processedByteLen++;
 358     }
 359
 360     return processedByteLen;
 361 }
 362
 363 static CFIndex __CFToCanonicalUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 364     CFIndex processedByteLen = 0;
 365     UniChar charBuffer[__CFMaximumConvertedLength];
 366     UTF32Char decompBuffer[MAX_DECOMPOSED_LENGTH];
 367     CFIndex usedLen;
 368     CFIndex decompedLen;
 369     CFIndex idx, decompIndex;
 370     bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
 371     CFIndex theUsedCharLen = 0;
 372
 373     while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 374         if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break;
 375
 376         for (idx = 0;idx < usedLen;idx++) {
 377             if (CFUniCharIsDecomposableCharacter(charBuffer[idx], isHFSPlus)) {
 378                 decompedLen = CFUniCharDecomposeCharacter(charBuffer[idx], decompBuffer, MAX_DECOMPOSED_LENGTH);
 379                 *usedCharLen = theUsedCharLen;
 380
 381                 for (decompIndex = 0;decompIndex < decompedLen;decompIndex++) {
 382                     if (decompBuffer[decompIndex] > 0xFFFF) { // Non-BMP
 383                         if (theUsedCharLen + 2 > maxCharLen)  return processedByteLen;
 384                         theUsedCharLen += 2;
 385                         if (maxCharLen) {
 386                             charBuffer[idx] = charBuffer[idx] - 0x10000;
 387                             *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL;
 388                             *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL;
 389                         }
 390                     } else {
 391                         if (theUsedCharLen + 1 > maxCharLen)  return processedByteLen;
 392                         ++theUsedCharLen;
 393                         *(characters++) = charBuffer[idx];
 394                     }
 395                 }
 396             } else {
 397                 if (maxCharLen) *(characters++) = charBuffer[idx];
 398                 ++theUsedCharLen;
 399             }
 400         }
 401         processedByteLen++;
 402     }
 403
 404     *usedCharLen = theUsedCharLen;
 405     return processedByteLen;
 406 }
 407
 408 static CFIndex __CFToBytesCheapMultiByteWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 409     CFIndex processedCharLen = 0;
 410     uint8_t byteBuffer[__CFMaximumConvertedLength];
 411     CFIndex usedLen;
 412
 413     *usedByteLen = 0;
 414
 415     while ((processedCharLen < numChars) && (!maxByteLen || (*usedByteLen < maxByteLen))) {
 416         if (!(usedLen = ((CFStringEncodingCheapMultiByteToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], byteBuffer))) break;
 417
 418         if (maxByteLen) {
 419             CFIndex idx;
 420
 421             if (*usedByteLen + usedLen > maxByteLen) break;
 422
 423             for (idx = 0;idx <usedLen;idx++) {
 424                 bytes[*usedByteLen + idx] = byteBuffer[idx];
 425             }
 426         }
 427
 428         *usedByteLen += usedLen;
 429         processedCharLen++;
 430     }
 431
 432     return processedCharLen;
 433 }
 434
 435 static CFIndex __CFToUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 436     CFIndex processedByteLen = 0;
 437     UniChar character;
 438     CFIndex usedLen;
 439
 440     *usedCharLen = 0;
 441
 442     while (numBytes && (!maxCharLen || (*usedCharLen < maxCharLen))) {
 443         if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break;
 444
 445         if (maxCharLen) *(characters++) = character;
 446         (*usedCharLen)++;
 447         processedByteLen += usedLen;
 448         bytes += usedLen;
 449         numBytes -= usedLen;
 450     }
 451
 452     return processedByteLen;
 453 }
 454
 455 static CFIndex __CFToCanonicalUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 456     CFIndex processedByteLen = 0;
 457     UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
 458     UniChar character;
 459     CFIndex usedLen;
 460     CFIndex decomposedLen;
 461     CFIndex theUsedCharLen = 0;
 462     bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
 463
 464     while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 465         if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break;
 466
 467         if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
 468             CFIndex idx;
 469
 470             decomposedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
 471             *usedCharLen = theUsedCharLen;
 472
 473             for (idx = 0;idx < decomposedLen;idx++) {
 474                 if (charBuffer[idx] > 0xFFFF) { // Non-BMP
 475                     if (theUsedCharLen + 2 > maxCharLen)  return processedByteLen;
 476                     theUsedCharLen += 2;
 477                     if (maxCharLen) {
 478                         charBuffer[idx] = charBuffer[idx] - 0x10000;
 479                         *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
 480                         *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
 481                     }
 482                 } else {
 483                     if (theUsedCharLen + 1 > maxCharLen)  return processedByteLen;
 484                     ++theUsedCharLen;
 485                     *(characters++) = charBuffer[idx];
 486                 }
 487             }
 488         } else {
 489             if (maxCharLen) *(characters++) = character;
 490             ++theUsedCharLen;
 491         }
 492
 493         processedByteLen += usedLen;
 494         bytes += usedLen;
 495         numBytes -= usedLen;
 496     }
 497     *usedCharLen = theUsedCharLen;
 498     return processedByteLen;
 499 }
 500
 501 /* static functions
 502 */
 503 CF_INLINE _CFEncodingConverter *__CFEncodingConverterFromDefinition(const CFStringEncodingConverter *definition, CFStringEncoding encoding) {
 504 #define NUM_OF_ENTRIES_CYCLE (10)
 505     static uint32_t _currentIndex = 0;
 506     static uint32_t _allocatedSize = 0;
 507     static _CFEncodingConverter *_allocatedEntries = NULL;
 508     _CFEncodingConverter *converter;
 509
 510
 511     if ((_currentIndex + 1) >= _allocatedSize) {
 512         _currentIndex = 0;
 513         _allocatedSize = 0;
 514         _allocatedEntries = NULL;
 515     }
 516     if (_allocatedEntries == NULL) { // Not allocated yet
 517         _allocatedEntries = (_CFEncodingConverter *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(_CFEncodingConverter) * NUM_OF_ENTRIES_CYCLE, 0);
 518         _allocatedSize = NUM_OF_ENTRIES_CYCLE;
 519         converter = &(_allocatedEntries[_currentIndex]);
 520     } else {
 521         converter = &(_allocatedEntries[++_currentIndex]);
 522     }
 523
 524     memset(converter, 0, sizeof(_CFEncodingConverter));
 525
 526     converter->definition = definition;
 527
 528     switch (definition->encodingClass) {
 529         case kCFStringEncodingConverterStandard:
 530             converter->toBytes = NULL;
 531             converter->toUnicode = NULL;
 532             converter->toCanonicalUnicode = NULL;
 533             break;
 534
 535         case kCFStringEncodingConverterCheapEightBit:
 536             converter->toBytes = __CFToBytesCheapEightBitWrapper;
 537             converter->toUnicode = __CFToUnicodeCheapEightBitWrapper;
 538             converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapEightBitWrapper;
 539             break;
 540
 541         case kCFStringEncodingConverterStandardEightBit:
 542             converter->toBytes = __CFToBytesStandardEightBitWrapper;
 543             converter->toUnicode = __CFToUnicodeStandardEightBitWrapper;
 544             converter->toCanonicalUnicode = __CFToCanonicalUnicodeStandardEightBitWrapper;
 545             break;
 546
 547         case kCFStringEncodingConverterCheapMultiByte:
 548             converter->toBytes = __CFToBytesCheapMultiByteWrapper;
 549             converter->toUnicode = __CFToUnicodeCheapMultiByteWrapper;
 550             converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapMultiByteWrapper;
 551             break;
 552
 553         case kCFStringEncodingConverterICU:
 554             converter->toBytes = (_CFToBytesProc)__CFStringEncodingGetICUName(encoding);
 555             break;
 556
 557         case kCFStringEncodingConverterPlatformSpecific:
 558             break;
 559
 560         default: // Shouln't be here
 561             return NULL;
 562     }
 563
 564     converter->toBytesFallback = (definition->toBytesFallback ? definition->toBytesFallback : __CFDefaultToBytesFallbackProc);
 565     converter->toUnicodeFallback = (definition->toUnicodeFallback ? definition->toUnicodeFallback : __CFDefaultToUnicodeFallbackProc);
 566
 567     return converter;
 568 }
 569
 570 CF_INLINE const CFStringEncodingConverter *__CFStringEncodingConverterGetDefinition(CFStringEncoding encoding) {
 571     switch (encoding) {
 572         case kCFStringEncodingUTF8:
 573             return &__CFConverterUTF8;
 574
 575         case kCFStringEncodingMacRoman:
 576             return &__CFConverterMacRoman;
 577
 578         case kCFStringEncodingWindowsLatin1:
 579             return &__CFConverterWinLatin1;
 580
 581         case kCFStringEncodingASCII:
 582             return &__CFConverterASCII;
 583
 584         case kCFStringEncodingISOLatin1:
 585             return &__CFConverterISOLatin1;
 586
 587
 588         case kCFStringEncodingNextStepLatin:
 589             return &__CFConverterNextStepLatin;
 590
 591
 592         default:
 593             return __CFStringEncodingGetExternalConverter(encoding);
 594     }
 595 }
 596
 597 static const _CFEncodingConverter *__CFGetConverter(uint32_t encoding) {
 598     const _CFEncodingConverter *converter = NULL;
 599     const _CFEncodingConverter **commonConverterSlot = NULL;
 600     static _CFEncodingConverter *commonConverters[3] = {NULL, NULL, NULL}; // UTF8, MacRoman/WinLatin1, and the default encoding*
 601     static CFMutableDictionaryRef mappingTable = NULL;
 602     static CFSpinLock_t lock = CFSpinLockInit;
 603
 604     switch (encoding) {
 605         case kCFStringEncodingUTF8: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[0]); break;
 606
 607             /* the swith here should avoid possible bootstrap issues in the default: case below when invoked from CFStringGetSystemEncoding() */
 608 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_LINUX
 609         case kCFStringEncodingMacRoman: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[1]); break;
 610 #elif DEPLOYMENT_TARGET_WINDOWS
 611         case kCFStringEncodingWindowsLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break;
 612 #else
 613 #warning This case must match __defaultEncoding value defined in CFString.c
 614         case kCFStringEncodingISOLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break;
 615 #endif /* DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED */
 616
 617         default: if (CFStringGetSystemEncoding() == encoding) commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[2]); break;
 618     }
 619
 620     __CFSpinLock(&lock);
 621     converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot);
 622     __CFSpinUnlock(&lock);
 623
 624     if (NULL == converter) {
 625         const CFStringEncodingConverter *definition = __CFStringEncodingConverterGetDefinition(encoding);
 626
 627         if (NULL != definition) {
 628             __CFSpinLock(&lock);
 629             converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot);
 630
 631             if (NULL == converter) {
 632                 converter = __CFEncodingConverterFromDefinition(definition, encoding);
 633
 634                 if (NULL == commonConverterSlot) {
 635                     if (NULL == mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, NULL, NULL);
 636
 637                     CFDictionarySetValue(mappingTable, (const void *)(uintptr_t)encoding, converter);
 638                 } else {
 639                     *commonConverterSlot = converter;
 640                 }
 641             }
 642             __CFSpinUnlock(&lock);
 643         }
 644     }
 645
 646     return converter;
 647 }
 648
 649 /* Public API
 650 */
 651 uint32_t CFStringEncodingUnicodeToBytes(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 652     if (encoding == kCFStringEncodingUTF8) {
 653         static CFStringEncodingToBytesProc __CFToUTF8 = NULL;
 654         CFIndex convertedCharLen;
 655         CFIndex usedLen;
 656
 657
 658         if ((flags & kCFStringEncodingUseCanonical) || (flags & kCFStringEncodingUseHFSPlusCanonical)) {
 659             (void)CFUniCharDecompose(characters, numChars, &convertedCharLen, (void *)bytes, maxByteLen, &usedLen, true, kCFUniCharUTF8Format, (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false));
 660         } else {
 661             if (!__CFToUTF8) {
 662                 const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
 663                 __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes;
 664             }
 665             convertedCharLen = __CFToUTF8(0, characters, numChars, bytes, maxByteLen, &usedLen);
 666         }
 667         if (usedCharLen) *usedCharLen = convertedCharLen;
 668         if (usedByteLen) *usedByteLen = usedLen;
 669
 670         if (convertedCharLen == numChars) {
 671             return kCFStringEncodingConversionSuccess;
 672         } else if ((maxByteLen > 0) && ((maxByteLen - usedLen) < 10)) { // could be filled outbuf
 673             UTF16Char character = characters[convertedCharLen];
 674
 675             if (((character >= kSurrogateLowStart) && (character <= kSurrogateLowEnd)) || ((character >= kSurrogateHighStart) && (character <= kSurrogateHighEnd) && ((1 == (numChars - convertedCharLen)) || (characters[convertedCharLen + 1] < kSurrogateLowStart) || (characters[convertedCharLen + 1] > kSurrogateLowEnd)))) return kCFStringEncodingInvalidInputStream;
 676
 677             return kCFStringEncodingInsufficientOutputBufferLength;
 678         } else {
 679             return kCFStringEncodingInvalidInputStream;
 680         }
 681     } else {
 682         const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 683         CFIndex usedLen = 0;
 684         CFIndex localUsedByteLen;
 685         CFIndex theUsedByteLen = 0;
 686         uint32_t theResult = kCFStringEncodingConversionSuccess;
 687         CFStringEncodingToBytesPrecomposeProc toBytesPrecompose = NULL;
 688         CFStringEncodingIsValidCombiningCharacterProc isValidCombiningChar = NULL;
 689
 690         if (!converter) return kCFStringEncodingConverterUnavailable;
 691
 692         if (flags & kCFStringEncodingSubstituteCombinings) {
 693             if (!(flags & kCFStringEncodingAllowLossyConversion)) isValidCombiningChar = converter->definition->isValidCombiningChar;
 694        } else {
 695             isValidCombiningChar = converter->definition->isValidCombiningChar;
 696             if (!(flags & kCFStringEncodingIgnoreCombinings)) {
 697                 toBytesPrecompose = converter->definition->toBytesPrecompose;
 698                 flags |= kCFStringEncodingComposeCombinings;
 699             }
 700         }
 701
 702         if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToBytes((const char *)converter->toBytes, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen);
 703
 704         /* Platform converter */
 705         if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformUnicodeToBytes(encoding, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen);
 706
 707         while ((usedLen < numChars) && (!maxByteLen || (theUsedByteLen < maxByteLen))) {
 708             if ((usedLen += TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) < numChars) {
 709                 CFIndex dummy;
 710
 711                 if (isValidCombiningChar && (usedLen > 0) && isValidCombiningChar(characters[usedLen])) {
 712                     if (toBytesPrecompose) {
 713                         CFIndex localUsedLen = usedLen;
 714
 715                         while (isValidCombiningChar(characters[--usedLen]));
 716                         theUsedByteLen += localUsedByteLen;
 717                         if (converter->definition->maxBytesPerChar > 1) {
 718                             TO_BYTE(converter, flags, characters + usedLen, localUsedLen - usedLen, NULL, 0, &localUsedByteLen);
 719                             theUsedByteLen -= localUsedByteLen;
 720                         } else {
 721                             theUsedByteLen--;
 722                         }
 723                         if ((localUsedLen = toBytesPrecompose(flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) > 0) {
 724                             usedLen += localUsedLen;
 725                             if ((usedLen < numChars) && isValidCombiningChar(characters[usedLen])) { // There is a non-base char not combined remaining
 726                                 theUsedByteLen += localUsedByteLen;
 727                                 theResult = kCFStringEncodingInvalidInputStream;
 728                                 break;
 729                             }
 730                         } else if (flags & kCFStringEncodingAllowLossyConversion) {
 731                             uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
 732
 733                             if (lossyByte) {
 734                                 while (isValidCombiningChar(characters[++usedLen]));
 735                                 localUsedByteLen = 1;
 736                                 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
 737                             } else {
 738                                 ++usedLen;
 739                                 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
 740                             }
 741                         } else {
 742                             theResult = kCFStringEncodingInvalidInputStream;
 743                             break;
 744                         }
 745                     } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up
 746                                     theUsedByteLen += localUsedByteLen;
 747                                     theResult = kCFStringEncodingInsufficientOutputBufferLength;
 748                                     break;
 749                     } else if (flags & kCFStringEncodingIgnoreCombinings) {
 750                         while ((++usedLen < numChars) && isValidCombiningChar(characters[usedLen]));
 751                     } else {
 752                         uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
 753
 754                         theUsedByteLen += localUsedByteLen;
 755                         if (lossyByte) {
 756                             ++usedLen;
 757                             localUsedByteLen = 1;
 758                             if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
 759                         } else {
 760                             usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
 761                         }
 762                     }
 763                 } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up
 764                     theUsedByteLen += localUsedByteLen;
 765
 766                     if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) {
 767                         CFIndex localUsedLen;
 768
 769                         localUsedByteLen = 0;
 770                         while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen;
 771                     }
 772                     if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength;
 773                     break;
 774                 } else if (flags & kCFStringEncodingAllowLossyConversion) {
 775                     uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
 776
 777                     theUsedByteLen += localUsedByteLen;
 778                     if (lossyByte) {
 779                         ++usedLen;
 780                         localUsedByteLen = 1;
 781                         if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
 782                     } else {
 783                         usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
 784                     }
 785                 } else {
 786                     theUsedByteLen += localUsedByteLen;
 787                     theResult = kCFStringEncodingInvalidInputStream;
 788                     break;
 789                 }
 790             }
 791             theUsedByteLen += localUsedByteLen;
 792         }
 793
 794         if (usedLen < numChars && maxByteLen && theResult == kCFStringEncodingConversionSuccess) {
 795             if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) {
 796                 CFIndex localUsedLen;
 797
 798                 localUsedByteLen = 0;
 799                 while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen;
 800             }
 801             if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength;
 802         }
 803         if (usedByteLen) *usedByteLen = theUsedByteLen;
 804         if (usedCharLen) *usedCharLen = usedLen;
 805
 806         return theResult;
 807     }
 808 }
 809
 810 uint32_t CFStringEncodingBytesToUnicode(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
 811     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 812     CFIndex usedLen = 0;
 813     CFIndex theUsedCharLen = 0;
 814     CFIndex localUsedCharLen;
 815     uint32_t theResult = kCFStringEncodingConversionSuccess;
 816
 817     if (!converter) return kCFStringEncodingConverterUnavailable;
 818
 819     if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToUnicode((const char *)converter->toBytes, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen);
 820
 821     /* Platform converter */
 822     if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformBytesToUnicode(encoding, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen);
 823
 824     while ((usedLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
 825         if ((usedLen += TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen)) < numBytes) {
 826             CFIndex tempUsedCharLen;
 827
 828             if (maxCharLen && ((maxCharLen == theUsedCharLen + localUsedCharLen) || (((flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) || (maxCharLen == theUsedCharLen + localUsedCharLen + 1)) && TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, NULL, 0, &tempUsedCharLen)))) { // buffer was filled up
 829                 theUsedCharLen += localUsedCharLen;
 830                 theResult = kCFStringEncodingInsufficientOutputBufferLength;
 831                 break;
 832             } else if (flags & kCFStringEncodingAllowLossyConversion) {
 833                 theUsedCharLen += localUsedCharLen;
 834                 usedLen += TO_UNICODE_FALLBACK(converter, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen);
 835             } else {
 836                 theUsedCharLen += localUsedCharLen;
 837                 theResult = kCFStringEncodingInvalidInputStream;
 838                 break;
 839             }
 840         }
 841         theUsedCharLen += localUsedCharLen;
 842     }
 843
 844     if (usedLen < numBytes && maxCharLen && theResult == kCFStringEncodingConversionSuccess) {
 845         theResult = kCFStringEncodingInsufficientOutputBufferLength;
 846     }
 847     if (usedCharLen) *usedCharLen = theUsedCharLen;
 848     if (usedByteLen) *usedByteLen = usedLen;
 849
 850     return theResult;
 851 }
 852
 853 __private_extern__ bool CFStringEncodingIsValidEncoding(uint32_t encoding) {
 854     return (CFStringEncodingGetConverter(encoding) ? true : false);
 855 }
 856
 857 __private_extern__ CFIndex CFStringEncodingCharLengthForBytes(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
 858     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 859
 860     if (converter) {
 861         if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUCharLength((const char *)converter->toBytes, flags, bytes, numBytes);
 862
 863         if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformCharLengthForBytes(encoding, flags, bytes, numBytes);
 864
 865         if (1 == converter->definition->maxBytesPerChar) return numBytes;
 866
 867         if (NULL == converter->definition->toUnicodeLen) {
 868             CFIndex usedByteLen = 0;
 869             CFIndex totalLength = 0;
 870             CFIndex usedCharLen;
 871
 872             while (numBytes > 0) {
 873                 usedByteLen = TO_UNICODE(converter, flags, bytes, numBytes, NULL, 0, &usedCharLen);
 874
 875                 bytes += usedByteLen;
 876                 numBytes -= usedByteLen;
 877                 totalLength += usedCharLen;
 878
 879                 if (numBytes > 0) {
 880                     if (0 == (flags & kCFStringEncodingAllowLossyConversion)) return 0;
 881
 882                     usedByteLen = TO_UNICODE_FALLBACK(converter, bytes, numBytes, NULL, 0, &usedCharLen);
 883
 884                     bytes += usedByteLen;
 885                     numBytes -= usedByteLen;
 886                     totalLength += usedCharLen;
 887                 }
 888             }
 889
 890             return totalLength;
 891         } else {
 892             return converter->definition->toUnicodeLen(flags, bytes, numBytes);
 893         }
 894     }
 895
 896     return 0;
 897 }
 898
 899 __private_extern__ CFIndex CFStringEncodingByteLengthForCharacters(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars) {
 900     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 901
 902     if (converter) {
 903         if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUByteLength((const char *)converter->toBytes, flags, characters, numChars);
 904
 905         if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformByteLengthForCharacters(encoding, flags, characters, numChars);
 906
 907         if (1 == converter->definition->maxBytesPerChar) return numChars;
 908
 909         if (NULL == converter->definition->toBytesLen) {
 910             CFIndex usedByteLen;
 911
 912             return ((kCFStringEncodingConversionSuccess == CFStringEncodingUnicodeToBytes(encoding, flags, characters, numChars, NULL, NULL, 0, &usedByteLen)) ? usedByteLen : 0);
 913         } else {
 914             return converter->definition->toBytesLen(flags, characters, numChars);
 915         }
 916     }
 917
 918     return 0;
 919 }
 920
 921 __private_extern__ void CFStringEncodingRegisterFallbackProcedures(uint32_t encoding, CFStringEncodingToBytesFallbackProc toBytes, CFStringEncodingToUnicodeFallbackProc toUnicode) {
 922     _CFEncodingConverter *converter = (_CFEncodingConverter *)__CFGetConverter(encoding);
 923
 924     if (NULL != converter) {
 925        const CFStringEncodingConverter *body = CFStringEncodingGetConverter(encoding);
 926
 927         converter->toBytesFallback = ((NULL == toBytes) ? ((NULL == body) ? __CFDefaultToBytesFallbackProc : body->toBytesFallback) : toBytes);
 928         converter->toUnicodeFallback = ((NULL == toUnicode) ? ((NULL == body) ? __CFDefaultToUnicodeFallbackProc : body->toUnicodeFallback) : toUnicode);
 929     }
 930 }
 931
 932 __private_extern__ const CFStringEncodingConverter *CFStringEncodingGetConverter(uint32_t encoding) {
 933     const _CFEncodingConverter *converter = __CFGetConverter(encoding);
 934
 935     return ((NULL == converter) ? NULL : converter->definition);
 936 }
 937
/* Encodings that are always reported by CFStringEncodingListOfAvailableEncodings().
   The table is returned as-is when no ICU or platform converters are found, and
   otherwise copied to the front of the merged list.
   NOTE(review): the trailing kCFStringEncodingInvalidId presumably acts as the
   list terminator, since no element count is handed back to callers -- confirm.
*/
static const CFStringEncoding __CFBuiltinEncodings[] = {
    kCFStringEncodingMacRoman,
    kCFStringEncodingWindowsLatin1,
    kCFStringEncodingISOLatin1,
    kCFStringEncodingNextStepLatin,
    kCFStringEncodingASCII,
    kCFStringEncodingUTF8,
    /* These seven are available only in CFString-level */
    kCFStringEncodingNonLossyASCII,

    kCFStringEncodingUTF16,
    kCFStringEncodingUTF16BE,
    kCFStringEncodingUTF16LE,

    kCFStringEncodingUTF32,
    kCFStringEncodingUTF32BE,
    kCFStringEncodingUTF32LE,

    kCFStringEncodingInvalidId,
};
 958
 959 static CFComparisonResult __CFStringEncodingComparator(const void *v1, const void *v2, void *context) {
 960     CFComparisonResult val1 = (*(const CFStringEncoding *)v1) & 0xFFFF;
 961     CFComparisonResult val2 = (*(const CFStringEncoding *)v2) & 0xFFFF;
 962
 963     return ((val1 == val2) ? ((CFComparisonResult)(*(const CFStringEncoding *)v1) - (CFComparisonResult)(*(const CFStringEncoding *)v2)) : val1 - val2);
 964 }
 965
 966 static void __CFStringEncodingFliterDupes(CFStringEncoding *encodings, CFIndex numSlots) {
 967     CFStringEncoding last = kCFStringEncodingInvalidId;
 968     const CFStringEncoding *limitEncodings = encodings + numSlots;
 969
 970     while (encodings < limitEncodings) {
 971         if (last == *encodings) {
 972             if ((encodings + 1) < limitEncodings) memmove(encodings, encodings + 1, sizeof(CFStringEncoding) * (limitEncodings - encodings - 1));
 973             --limitEncodings;
 974         } else {
 975             last = *(encodings++);
 976         }
 977     }
 978 }
 979
 980 __private_extern__ const CFStringEncoding *CFStringEncodingListOfAvailableEncodings(void) {
 981     static const CFStringEncoding *encodings = NULL;
 982
 983     if (NULL == encodings) {
 984         CFStringEncoding *list = (CFStringEncoding *)__CFBuiltinEncodings;
 985         CFIndex numICUConverters = 0, numPlatformConverters = 0;
 986         CFStringEncoding *icuConverters = __CFStringEncodingCreateICUEncodings(NULL, &numICUConverters);
 987         CFStringEncoding *platformConverters = __CFStringEncodingCreateListOfAvailablePlatformConverters(NULL, &numPlatformConverters);
 988
 989         if ((NULL != icuConverters) || (NULL != platformConverters)) {
 990             CFIndex numSlots = (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters + numPlatformConverters;
 991
 992             list = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * numSlots, 0);
 993
 994             memcpy(list, __CFBuiltinEncodings, sizeof(__CFBuiltinEncodings));
 995
 996             if (NULL != icuConverters) {
 997                 memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)), icuConverters, sizeof(CFStringEncoding) * numICUConverters);
 998                 CFAllocatorDeallocate(NULL, icuConverters);
 999             }
1000
1001             if (NULL != platformConverters) {
1002                 memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters, platformConverters, sizeof(CFStringEncoding) * numPlatformConverters);
1003                 CFAllocatorDeallocate(NULL, platformConverters);
1004             }
1005
1006             CFQSortArray(list, numSlots, sizeof(CFStringEncoding), (CFComparatorFunction)__CFStringEncodingComparator, NULL);
1007             __CFStringEncodingFliterDupes(list, numSlots);
1008         }
1009         if (!OSAtomicCompareAndSwapPtrBarrier(NULL, list, (void * volatile *)&encodings) && (list != __CFBuiltinEncodings)) CFAllocatorDeallocate(NULL, list);
1010     }
1011
1012     return encodings;
1013 }
1014
/* Remove the helper macros used by the conversion routines above so they do
   not remain defined past this point (presumably all are defined earlier in
   this file -- TODO confirm for the ones not used in this section).
*/
#undef TO_BYTE
#undef TO_UNICODE
#undef ASCIINewLine
#undef kSurrogateHighStart
#undef kSurrogateHighEnd
#undef kSurrogateLowStart
#undef kSurrogateLowEnd
#undef TO_BYTE_FALLBACK
#undef TO_UNICODE_FALLBACK
#undef EXTRA_BASE
#undef NUM_OF_ENTRIES_CYCLE
1026