X-Git-Url: https://git.saurik.com/apple/cf.git/blobdiff_plain/47a9ab1f151d80a00a045f81937ddac81c51a463..bd5b749cf7786ae858ab372fc8f64179736c6515:/CFStringEncodingConverter.c diff --git a/CFStringEncodingConverter.c b/CFStringEncodingConverter.c new file mode 100644 index 0000000..3e92768 --- /dev/null +++ b/CFStringEncodingConverter.c @@ -0,0 +1,995 @@ +/* + * Copyright (c) 2008 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* CFStringEncodingConverter.c + Copyright 1998-2002, Apple, Inc. All rights reserved. + Responsibility: Aki Inoue +*/ + +#include "CFInternal.h" +#include +#include +#include "CFUniChar.h" +#include "CFPriv.h" +#include "CFUnicodeDecomposition.h" +#include "CFStringEncodingConverterExt.h" +#include "CFStringEncodingConverterPriv.h" +#include +#if !defined(__WIN32__) +#include +#endif + + +/* Macros +*/ +#define TO_BYTE(conv,flags,chars,numChars,bytes,max,used) (conv->_toBytes ? conv->toBytes(conv,flags,chars,numChars,bytes,max,used) : ((CFStringEncodingToBytesProc)conv->toBytes)(flags,chars,numChars,bytes,max,used)) +#define TO_UNICODE(conv,flags,bytes,numBytes,chars,max,used) (conv->_toUnicode ? (flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical) ? conv->toCanonicalUnicode(conv,flags,bytes,numBytes,chars,max,used) : conv->toUnicode(conv,flags,bytes,numBytes,chars,max,used)) : ((CFStringEncodingToUnicodeProc)conv->toUnicode)(flags,bytes,numBytes,chars,max,used)) + +#define ASCIINewLine 0x0a +#define kSurrogateHighStart 0xD800 +#define kSurrogateHighEnd 0xDBFF +#define kSurrogateLowStart 0xDC00 +#define kSurrogateLowEnd 0xDFFF + +/* Mapping 128..255 to lossy ASCII +*/ +static const struct { + unsigned char chars[4]; +} _toLossyASCIITable[] = { + {{' ', 0, 0, 0}}, // NO-BREAK SPACE + {{'!', 0, 0, 0}}, // INVERTED EXCLAMATION MARK + {{'c', 0, 0, 0}}, // CENT SIGN + {{'L', 0, 0, 0}}, // POUND SIGN + {{'$', 0, 0, 0}}, // CURRENCY SIGN + {{'Y', 0, 0, 0}}, // YEN SIGN + {{'|', 0, 0, 0}}, // BROKEN BAR + {{0, 0, 0, 0}}, // SECTION SIGN + {{0, 0, 0, 0}}, // DIAERESIS + {{'(', 'C', ')', 0}}, // COPYRIGHT SIGN + {{'a', 0, 0, 0}}, // FEMININE ORDINAL INDICATOR + {{'<', '<', 0, 0}}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + {{0, 0, 0, 0}}, // NOT SIGN + {{'-', 0, 0, 0}}, // SOFT HYPHEN + {{'(', 'R', ')', 0}}, // REGISTERED SIGN + {{0, 0, 0, 0}}, // MACRON + {{0, 0, 0, 0}}, // DEGREE SIGN + {{'+', '-', 0, 0}}, // PLUS-MINUS SIGN + {{'2', 0, 0, 0}}, // SUPERSCRIPT TWO + {{'3', 0, 0, 0}}, // SUPERSCRIPT THREE + {{0, 0, 0, 0}}, // ACUTE ACCENT + {{0, 0, 0, 0}}, // MICRO SIGN + {{0, 0, 0, 0}}, // PILCROW SIGN + {{0, 0, 0, 0}}, // MIDDLE DOT + {{0, 0, 0, 0}}, // CEDILLA + {{'1', 0, 0, 0}}, // SUPERSCRIPT ONE + {{'o', 0, 0, 0}}, // MASCULINE ORDINAL INDICATOR + {{'>', '>', 0, 0}}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + {{'1', '/', '4', 0}}, // VULGAR FRACTION ONE QUARTER + {{'1', '/', '2', 0}}, // VULGAR FRACTION ONE HALF + {{'3', '/', '4', 0}}, // VULGAR FRACTION THREE QUARTERS + {{'?', 0, 0, 0}}, // INVERTED QUESTION MARK + {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH GRAVE + {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH ACUTE + {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX + {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH TILDE + {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH DIAERESIS + {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH RING ABOVE + {{'A', 'E', 0, 0}}, // LATIN CAPITAL LETTER AE + {{'C', 0, 0, 0}}, // LATIN CAPITAL LETTER C WITH CEDILLA + {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH GRAVE + {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH ACUTE + {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX + {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH DIAERESIS + {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH GRAVE + {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH ACUTE + {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX + {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH DIAERESIS + {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER ETH (Icelandic) + {{'N', 0, 0, 0}}, // LATIN CAPITAL LETTER N WITH TILDE + {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH GRAVE + {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH ACUTE + {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX + {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH TILDE + {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH DIAERESIS + {{'X', 0, 0, 0}}, // MULTIPLICATION SIGN + {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH STROKE + {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH GRAVE + {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH ACUTE + {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX + {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH DIAERESIS + {{'Y', 0, 0, 0}}, // LATIN CAPITAL LETTER Y WITH ACUTE + {{'t', 'h', 0, 0}}, // LATIN CAPITAL LETTER THORN (Icelandic) + {{'s', 0, 0, 0}}, // LATIN SMALL LETTER SHARP S (German) + {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH GRAVE + {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH ACUTE + {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH CIRCUMFLEX + {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH TILDE + {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH DIAERESIS + {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH RING ABOVE + {{'a', 'e', 0, 0}}, // LATIN SMALL LETTER AE + {{'c', 0, 0, 0}}, // LATIN SMALL LETTER C WITH CEDILLA + {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH GRAVE + {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH ACUTE + {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH CIRCUMFLEX + {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH DIAERESIS + {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH GRAVE + {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH ACUTE + {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH CIRCUMFLEX + {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH DIAERESIS + {{'T', 'H', 0, 0}}, // LATIN SMALL LETTER ETH (Icelandic) + {{'n', 0, 0, 0}}, // LATIN SMALL LETTER N WITH TILDE + {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH GRAVE + {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH ACUTE + {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH CIRCUMFLEX + {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH TILDE + {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH DIAERESIS + {{'/', 0, 0, 0}}, // DIVISION SIGN + {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH STROKE + {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH GRAVE + {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH ACUTE + {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH CIRCUMFLEX + {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH DIAERESIS + {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH ACUTE + {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER THORN (Icelandic) + {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH DIAERESIS +}; + +CF_INLINE CFIndex __CFToASCIILatin1Fallback(UniChar character, uint8_t *bytes, CFIndex maxByteLen) { + const uint8_t *losChars = (const uint8_t*)_toLossyASCIITable + (character - 0xA0) * sizeof(uint8_t[4]); + CFIndex numBytes = 0; + CFIndex idx, max = (maxByteLen && (maxByteLen < 4) ? maxByteLen : 4); + + for (idx = 0;idx < max;idx++) { + if (losChars[idx]) { + if (maxByteLen) bytes[idx] = losChars[idx]; + ++numBytes; + } else { + break; + } + } + + return numBytes; +} + +static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { + CFIndex processCharLen = 1, filledBytesLen = 1; + uint8_t byte = '?'; + + if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range + byte = (uint8_t)(*characters - 0x80); + } else if (*characters < 0x100) { + *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen); + return 1; + } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) { + processCharLen = (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1); + } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) { + byte = ' '; + } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) { + byte = ASCIINewLine; + } else if (*characters == 0x2026) { // ellipsis + if (0 == maxByteLen) { + filledBytesLen = 3; + } else if (maxByteLen > 2) { + memset(bytes, '.', 3); + *usedByteLen = 3; + return processCharLen; + } + } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) { + UTF32Char decomposed[MAX_DECOMPOSED_LENGTH]; + + (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH); + if (*decomposed < 0x80) { + byte = (uint8_t)(*decomposed); + } else { + UTF16Char theChar = *decomposed; + + return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen); + } + } + + if (maxByteLen) *bytes = byte; + *usedByteLen = filledBytesLen; + return processCharLen; +} + +static CFIndex __CFDefaultToUnicodeFallbackProc(const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { + if (maxCharLen) *characters = (UniChar)'?'; + *usedCharLen = 1; + return 1; +} + +#define TO_BYTE_FALLBACK(conv,chars,numChars,bytes,max,used) (conv->toBytesFallback(chars,numChars,bytes,max,used)) +#define TO_UNICODE_FALLBACK(conv,bytes,numBytes,chars,max,used) (conv->toUnicodeFallback(bytes,numBytes,chars,max,used)) + +#define EXTRA_BASE (0x0F00) + +/* Wrapper funcs for non-standard converters +*/ +static CFIndex __CFToBytesCheapEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { + CFIndex processedCharLen = 0; + CFIndex length = (maxByteLen && (maxByteLen < numChars) ? maxByteLen : numChars); + uint8_t byte; + + while (processedCharLen < length) { + if (!((CFStringEncodingCheapEightBitToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters[processedCharLen], &byte)) break; + + if (maxByteLen) bytes[processedCharLen] = byte; + processedCharLen++; + } + + *usedByteLen = processedCharLen; + return processedCharLen; +} + +static CFIndex __CFToUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { + CFIndex processedByteLen = 0; + CFIndex length = (maxCharLen && (maxCharLen < numBytes) ? maxCharLen : numBytes); + UniChar character; + + while (processedByteLen < length) { + if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], &character)) break; + + if (maxCharLen) characters[processedByteLen] = character; + processedByteLen++; + } + + *usedCharLen = processedByteLen; + return processedByteLen; +} + +static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { + CFIndex processedByteLen = 0; + CFIndex theUsedCharLen = 0; + UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; + CFIndex usedLen; + UniChar character; + bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); + + while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { + if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], &character)) break; + + if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { + CFIndex idx; + + usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); + *usedCharLen = theUsedCharLen; + + for (idx = 0;idx < usedLen;idx++) { + if (charBuffer[idx] > 0xFFFF) { // Non-BMP + if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; + theUsedCharLen += 2; + if (maxCharLen) { + charBuffer[idx] = charBuffer[idx] - 0x10000; + *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL; + *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL; + } + } else { + if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; + ++theUsedCharLen; + *(characters++) = charBuffer[idx]; + } + } + } else { + if (maxCharLen) *(characters++) = character; + ++theUsedCharLen; + } + processedByteLen++; + } + + *usedCharLen = theUsedCharLen; + return processedByteLen; +} + +static CFIndex __CFToBytesStandardEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { + CFIndex processedCharLen = 0; + uint8_t byte; + CFIndex usedLen; + + *usedByteLen = 0; + + while (numChars && (!maxByteLen || (*usedByteLen < maxByteLen))) { + if (!(usedLen = ((CFStringEncodingStandardEightBitToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters, numChars, &byte))) break; + + if (maxByteLen) bytes[*usedByteLen] = byte; + (*usedByteLen)++; + characters += usedLen; + numChars -= usedLen; + processedCharLen += usedLen; + } + + return processedCharLen; +} + +static CFIndex __CFToUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { + CFIndex processedByteLen = 0; +#if 0 || 0 + UniChar charBuffer[20]; // Dynamic stack allocation is GNU specific +#else + UniChar charBuffer[((const _CFEncodingConverter*)converter)->maxLen]; +#endif + CFIndex usedLen; + + *usedCharLen = 0; + + while ((processedByteLen < numBytes) && (!maxCharLen || (*usedCharLen < maxCharLen))) { + if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], charBuffer))) break; + + if (maxCharLen) { + CFIndex idx; + + if (*usedCharLen + usedLen > maxCharLen) break; + + for (idx = 0;idx < usedLen;idx++) { + characters[*usedCharLen + idx] = charBuffer[idx]; + } + } + *usedCharLen += usedLen; + processedByteLen++; + } + + return processedByteLen; +} + +static CFIndex __CFToCanonicalUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { + CFIndex processedByteLen = 0; +#if 0 || 0 + UniChar charBuffer[20]; // Dynamic stack allocation is GNU specific +#else + UniChar charBuffer[((const _CFEncodingConverter*)converter)->maxLen]; +#endif + UTF32Char decompBuffer[MAX_DECOMPOSED_LENGTH]; + CFIndex usedLen; + CFIndex decompedLen; + CFIndex idx, decompIndex; + bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); + CFIndex theUsedCharLen = 0; + + while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { + if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], charBuffer))) break; + + for (idx = 0;idx < usedLen;idx++) { + if (CFUniCharIsDecomposableCharacter(charBuffer[idx], isHFSPlus)) { + decompedLen = CFUniCharDecomposeCharacter(charBuffer[idx], decompBuffer, MAX_DECOMPOSED_LENGTH); + *usedCharLen = theUsedCharLen; + + for (decompIndex = 0;decompIndex < decompedLen;decompIndex++) { + if (decompBuffer[decompIndex] > 0xFFFF) { // Non-BMP + if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; + theUsedCharLen += 2; + if (maxCharLen) { + charBuffer[idx] = charBuffer[idx] - 0x10000; + *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL; + *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL; + } + } else { + if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; + ++theUsedCharLen; + *(characters++) = charBuffer[idx]; + } + } + } else { + if (maxCharLen) *(characters++) = charBuffer[idx]; + ++theUsedCharLen; + } + } + processedByteLen++; + } + + *usedCharLen = theUsedCharLen; + return processedByteLen; +} + +static CFIndex __CFToBytesCheapMultiByteWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { + CFIndex processedCharLen = 0; +#if 0 || 0 + uint8_t byteBuffer[20]; // Dynamic stack allocation is GNU specific +#else + uint8_t byteBuffer[((const _CFEncodingConverter*)converter)->maxLen]; +#endif + CFIndex usedLen; + + *usedByteLen = 0; + + while ((processedCharLen < numChars) && (!maxByteLen || (*usedByteLen < maxByteLen))) { + if (!(usedLen = ((CFStringEncodingCheapMultiByteToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters[processedCharLen], byteBuffer))) break; + + if (maxByteLen) { + CFIndex idx; + + if (*usedByteLen + usedLen > maxByteLen) break; + + for (idx = 0;idx _toUnicode)(flags, bytes, numBytes, &character))) break; + + if (maxCharLen) *(characters++) = character; + (*usedCharLen)++; + processedByteLen += usedLen; + bytes += usedLen; + numBytes -= usedLen; + } + + return processedByteLen; +} + +static CFIndex __CFToCanonicalUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { + CFIndex processedByteLen = 0; + UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; + UniChar character; + CFIndex usedLen; + CFIndex decomposedLen; + CFIndex theUsedCharLen = 0; + bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); + + while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) { + if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes, numBytes, &character))) break; + + if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { + CFIndex idx; + + decomposedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); + *usedCharLen = theUsedCharLen; + + for (idx = 0;idx < decomposedLen;idx++) { + if (charBuffer[idx] > 0xFFFF) { // Non-BMP + if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; + theUsedCharLen += 2; + if (maxCharLen) { + charBuffer[idx] = charBuffer[idx] - 0x10000; + *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL; + *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL; + } + } else { + if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; + ++theUsedCharLen; + *(characters++) = charBuffer[idx]; + } + } + } else { + if (maxCharLen) *(characters++) = character; + ++theUsedCharLen; + } + + processedByteLen += usedLen; + bytes += usedLen; + numBytes -= usedLen; + } + *usedCharLen = theUsedCharLen; + return processedByteLen; +} + +/* static functions +*/ +static _CFConverterEntry __CFConverterEntryASCII = { + kCFStringEncodingASCII, NULL, + "Western (ASCII)", {"us-ascii", "ascii", "iso-646-us", NULL}, NULL, NULL, NULL, NULL, + kCFStringEncodingMacRoman // We use string encoding's script range here +}; + +static _CFConverterEntry __CFConverterEntryISOLatin1 = { + kCFStringEncodingISOLatin1, NULL, + "Western (ISO Latin 1)", {"iso-8859-1", "latin1","iso-latin-1", NULL}, NULL, NULL, NULL, NULL, + kCFStringEncodingMacRoman // We use string encoding's script range here +}; + +static _CFConverterEntry __CFConverterEntryMacRoman = { + kCFStringEncodingMacRoman, NULL, + "Western (Mac OS Roman)", {"macintosh", "mac", "x-mac-roman", NULL}, NULL, NULL, NULL, NULL, + kCFStringEncodingMacRoman // We use string encoding's script range here +}; + +static _CFConverterEntry __CFConverterEntryWinLatin1 = { + kCFStringEncodingWindowsLatin1, NULL, + "Western (Windows Latin 1)", {"windows-1252", "cp1252", "windows latin1", NULL}, NULL, NULL, NULL, NULL, + kCFStringEncodingMacRoman // We use string encoding's script range here +}; + +static _CFConverterEntry __CFConverterEntryNextStepLatin = { + kCFStringEncodingNextStepLatin, NULL, + "Western (NextStep)", {"x-nextstep", NULL, NULL, NULL}, NULL, NULL, NULL, NULL, + kCFStringEncodingMacRoman // We use string encoding's script range here +}; + +static _CFConverterEntry __CFConverterEntryUTF8 = { + kCFStringEncodingUTF8, NULL, + "UTF-8", {"utf-8", "unicode-1-1-utf8", NULL, NULL}, NULL, NULL, NULL, NULL, + kCFStringEncodingUnicode // We use string encoding's script range here +}; + +CF_INLINE _CFConverterEntry *__CFStringEncodingConverterGetEntry(uint32_t encoding) { + switch (encoding) { + case kCFStringEncodingInvalidId: + case kCFStringEncodingASCII: + return &__CFConverterEntryASCII; + + case kCFStringEncodingISOLatin1: + return &__CFConverterEntryISOLatin1; + + case kCFStringEncodingMacRoman: + return &__CFConverterEntryMacRoman; + + case kCFStringEncodingWindowsLatin1: + return &__CFConverterEntryWinLatin1; + + case kCFStringEncodingNextStepLatin: + return &__CFConverterEntryNextStepLatin; + + case kCFStringEncodingUTF8: + return &__CFConverterEntryUTF8; + + default: { + return NULL; + } + } +} + +CF_INLINE _CFEncodingConverter *__CFEncodingConverterFromDefinition(const CFStringEncodingConverter *definition) { +#define NUM_OF_ENTRIES_CYCLE (10) + static CFSpinLock_t _indexLock = CFSpinLockInit; + static uint32_t _currentIndex = 0; + static uint32_t _allocatedSize = 0; + static _CFEncodingConverter *_allocatedEntries = NULL; + _CFEncodingConverter *converter; + + + __CFSpinLock(&_indexLock); + if ((_currentIndex + 1) >= _allocatedSize) { + _currentIndex = 0; + _allocatedSize = 0; + _allocatedEntries = NULL; + } + if (_allocatedEntries == NULL) { // Not allocated yet + _allocatedEntries = (_CFEncodingConverter *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(_CFEncodingConverter) * NUM_OF_ENTRIES_CYCLE, 0); + _allocatedSize = NUM_OF_ENTRIES_CYCLE; + converter = &(_allocatedEntries[_currentIndex]); + } else { + converter = &(_allocatedEntries[++_currentIndex]); + } + __CFSpinUnlock(&_indexLock); + + switch (definition->encodingClass) { + case kCFStringEncodingConverterStandard: + converter->toBytes = (_CFToBytesProc)definition->toBytes; + converter->toUnicode = (_CFToUnicodeProc)definition->toUnicode; + converter->toCanonicalUnicode = (_CFToUnicodeProc)definition->toUnicode; + converter->_toBytes = NULL; + converter->_toUnicode = NULL; + converter->maxLen = 2; + break; + + case kCFStringEncodingConverterCheapEightBit: + converter->toBytes = __CFToBytesCheapEightBitWrapper; + converter->toUnicode = __CFToUnicodeCheapEightBitWrapper; + converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapEightBitWrapper; + converter->_toBytes = definition->toBytes; + converter->_toUnicode = definition->toUnicode; + converter->maxLen = 1; + break; + + case kCFStringEncodingConverterStandardEightBit: + converter->toBytes = __CFToBytesStandardEightBitWrapper; + converter->toUnicode = __CFToUnicodeStandardEightBitWrapper; + converter->toCanonicalUnicode = __CFToCanonicalUnicodeStandardEightBitWrapper; + converter->_toBytes = definition->toBytes; + converter->_toUnicode = definition->toUnicode; + converter->maxLen = definition->maxDecomposedCharLen; + break; + + case kCFStringEncodingConverterCheapMultiByte: + converter->toBytes = __CFToBytesCheapMultiByteWrapper; + converter->toUnicode = __CFToUnicodeCheapMultiByteWrapper; + converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapMultiByteWrapper; + converter->_toBytes = definition->toBytes; + converter->_toUnicode = definition->toUnicode; + converter->maxLen = definition->maxBytesPerChar; + break; + + case kCFStringEncodingConverterPlatformSpecific: + converter->toBytes = NULL; + converter->toUnicode = NULL; + converter->toCanonicalUnicode = NULL; + converter->_toBytes = NULL; + converter->_toUnicode = NULL; + converter->maxLen = 0; + converter->toBytesLen = NULL; + converter->toUnicodeLen = NULL; + converter->toBytesFallback = NULL; + converter->toUnicodeFallback = NULL; + converter->toBytesPrecompose = NULL; + converter->isValidCombiningChar = NULL; + return converter; + + default: // Shouln't be here + return NULL; + } + + converter->toBytesLen = (definition->toBytesLen ? definition->toBytesLen : (CFStringEncodingToBytesLenProc)(uintptr_t)definition->maxBytesPerChar); + converter->toUnicodeLen = (definition->toUnicodeLen ? definition->toUnicodeLen : (CFStringEncodingToUnicodeLenProc)(uintptr_t)definition->maxDecomposedCharLen); + converter->toBytesFallback = (definition->toBytesFallback ? definition->toBytesFallback : __CFDefaultToBytesFallbackProc); + converter->toUnicodeFallback = (definition->toUnicodeFallback ? definition->toUnicodeFallback : __CFDefaultToUnicodeFallbackProc); + converter->toBytesPrecompose = (definition->toBytesPrecompose ? definition->toBytesPrecompose : NULL); + converter->isValidCombiningChar = (definition->isValidCombiningChar ? definition->isValidCombiningChar : NULL); + + return converter; +} + +CF_INLINE const CFStringEncodingConverter *__CFStringEncodingConverterGetDefinition(_CFConverterEntry *entry) { + if (!entry) return NULL; + + switch (entry->encoding) { + case kCFStringEncodingASCII: + return &__CFConverterASCII; + + case kCFStringEncodingISOLatin1: + return &__CFConverterISOLatin1; + + case kCFStringEncodingMacRoman: + return &__CFConverterMacRoman; + + case kCFStringEncodingWindowsLatin1: + return &__CFConverterWinLatin1; + + case kCFStringEncodingNextStepLatin: + return &__CFConverterNextStepLatin; + + case kCFStringEncodingUTF8: + return &__CFConverterUTF8; + + default: + return NULL; + } +} + +static const _CFEncodingConverter *__CFGetConverter(uint32_t encoding) { + _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); + + if (!entry) return NULL; + + if (!entry->converter) { + const CFStringEncodingConverter *definition = __CFStringEncodingConverterGetDefinition(entry); + + if (definition) { + entry->converter = __CFEncodingConverterFromDefinition(definition); + entry->toBytesFallback = definition->toBytesFallback; + entry->toUnicodeFallback = definition->toUnicodeFallback; + } + } + + return (_CFEncodingConverter *)entry->converter; +} + +/* Public API +*/ +uint32_t CFStringEncodingUnicodeToBytes(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { + if (encoding == kCFStringEncodingUTF8) { + static CFStringEncodingToBytesProc __CFToUTF8 = NULL; + CFIndex convertedCharLen; + CFIndex usedLen; + + + if ((flags & kCFStringEncodingUseCanonical) || (flags & kCFStringEncodingUseHFSPlusCanonical)) { + (void)CFUniCharDecompose(characters, numChars, &convertedCharLen, (void *)bytes, maxByteLen, &usedLen, true, kCFUniCharUTF8Format, (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false)); + } else { + if (!__CFToUTF8) { + const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8); + __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes; + } + convertedCharLen = __CFToUTF8(0, characters, numChars, bytes, maxByteLen, &usedLen); + } + if (usedCharLen) *usedCharLen = convertedCharLen; + if (usedByteLen) *usedByteLen = usedLen; + + if (convertedCharLen == numChars) { + return kCFStringEncodingConversionSuccess; + } else if (maxByteLen && (maxByteLen == usedLen)) { + return kCFStringEncodingInsufficientOutputBufferLength; + } else { + return kCFStringEncodingInvalidInputStream; + } + } else { + const _CFEncodingConverter *converter = __CFGetConverter(encoding); + CFIndex usedLen = 0; + CFIndex localUsedByteLen; + CFIndex theUsedByteLen = 0; + uint32_t theResult = kCFStringEncodingConversionSuccess; + CFStringEncodingToBytesPrecomposeProc toBytesPrecompose = NULL; + CFStringEncodingIsValidCombiningCharacterProc isValidCombiningChar = NULL; + + if (!converter) return kCFStringEncodingConverterUnavailable; + + if (flags & kCFStringEncodingSubstituteCombinings) { + if (!(flags & kCFStringEncodingAllowLossyConversion)) isValidCombiningChar = converter->isValidCombiningChar; + } else { + isValidCombiningChar = converter->isValidCombiningChar; + if (!(flags & kCFStringEncodingIgnoreCombinings)) { + toBytesPrecompose = converter->toBytesPrecompose; + flags |= kCFStringEncodingComposeCombinings; + } + } + + + while ((usedLen < numChars) && (!maxByteLen || (theUsedByteLen < maxByteLen))) { + if ((usedLen += TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) < numChars) { + CFIndex dummy; + + if (isValidCombiningChar && (usedLen > 0) && isValidCombiningChar(characters[usedLen])) { + if (toBytesPrecompose) { + CFIndex localUsedLen = usedLen; + + while (isValidCombiningChar(characters[--usedLen])); + theUsedByteLen += localUsedByteLen; + if (converter->maxLen > 1) { + TO_BYTE(converter, flags, characters + usedLen, localUsedLen - usedLen, NULL, 0, &localUsedByteLen); + theUsedByteLen -= localUsedByteLen; + } else { + theUsedByteLen--; + } + if ((localUsedLen = toBytesPrecompose(flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) > 0) { + usedLen += localUsedLen; + if ((usedLen < numChars) && isValidCombiningChar(characters[usedLen])) { // There is a non-base char not combined remaining + theUsedByteLen += localUsedByteLen; + theResult = kCFStringEncodingInvalidInputStream; + break; + } + } else if (flags & kCFStringEncodingAllowLossyConversion) { + uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); + + if (lossyByte) { + while (isValidCombiningChar(characters[++usedLen])); + localUsedByteLen = 1; + if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; + } else { + ++usedLen; + usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); + } + } else { + theResult = kCFStringEncodingInvalidInputStream; + break; + } + } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up + theUsedByteLen += localUsedByteLen; + theResult = kCFStringEncodingInsufficientOutputBufferLength; + break; + } else if (flags & kCFStringEncodingIgnoreCombinings) { + while ((++usedLen < numChars) && isValidCombiningChar(characters[usedLen])); + } else { + uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); + + theUsedByteLen += localUsedByteLen; + if (lossyByte) { + ++usedLen; + localUsedByteLen = 1; + if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; + } else { + usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); + } + } + } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up + theUsedByteLen += localUsedByteLen; + + if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) { + CFIndex localUsedLen; + + localUsedByteLen = 0; + while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen; + } + if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength; + break; + } else if (flags & kCFStringEncodingAllowLossyConversion) { + uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); + + theUsedByteLen += localUsedByteLen; + if (lossyByte) { + ++usedLen; + localUsedByteLen = 1; + if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; + } else { + usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); + } + } else { + theUsedByteLen += localUsedByteLen; + theResult = kCFStringEncodingInvalidInputStream; + break; + } + } + theUsedByteLen += localUsedByteLen; + } + + if (usedLen < numChars && maxByteLen && theResult == kCFStringEncodingConversionSuccess) { + if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) { + CFIndex localUsedLen; + + localUsedByteLen = 0; + while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen; + } + if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength; + } + if (usedByteLen) *usedByteLen = theUsedByteLen; + if (usedCharLen) *usedCharLen = usedLen; + + return theResult; + } +} + +uint32_t CFStringEncodingBytesToUnicode(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { + const _CFEncodingConverter *converter = __CFGetConverter(encoding); + CFIndex usedLen = 0; + CFIndex theUsedCharLen = 0; + CFIndex localUsedCharLen; + uint32_t theResult = kCFStringEncodingConversionSuccess; + + if (!converter) return kCFStringEncodingConverterUnavailable; + + + while ((usedLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { + if ((usedLen += TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen)) < numBytes) { + CFIndex tempUsedCharLen; + + if (maxCharLen && ((maxCharLen == theUsedCharLen + localUsedCharLen) || (((flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) || (maxCharLen == theUsedCharLen + localUsedCharLen + 1)) && TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, NULL, 0, &tempUsedCharLen)))) { // buffer was filled up + theUsedCharLen += localUsedCharLen; + theResult = kCFStringEncodingInsufficientOutputBufferLength; + break; + } else if (flags & kCFStringEncodingAllowLossyConversion) { + theUsedCharLen += localUsedCharLen; + usedLen += TO_UNICODE_FALLBACK(converter, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen); + } else { + theUsedCharLen += localUsedCharLen; + theResult = kCFStringEncodingInvalidInputStream; + break; + } + } + theUsedCharLen += localUsedCharLen; + } + + if (usedLen < numBytes && maxCharLen && theResult == kCFStringEncodingConversionSuccess) { + theResult = kCFStringEncodingInsufficientOutputBufferLength; + } + if (usedCharLen) *usedCharLen = theUsedCharLen; + if (usedByteLen) *usedByteLen = usedLen; + + return theResult; +} + +__private_extern__ bool CFStringEncodingIsValidEncoding(uint32_t encoding) { + return (CFStringEncodingGetConverter(encoding) ? true : false); +} + +__private_extern__ const char *CFStringEncodingName(uint32_t encoding) { + _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); + if (entry) return entry->encodingName; + return NULL; +} + +__private_extern__ const char **CFStringEncodingCanonicalCharsetNames(uint32_t encoding) { + _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); + if (entry) return entry->ianaNames; + return NULL; +} + +__private_extern__ uint32_t CFStringEncodingGetScriptCodeForEncoding(CFStringEncoding encoding) { + _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); + + return (entry ? entry->scriptCode : ((encoding & 0x0FFF) == kCFStringEncodingUnicode ? kCFStringEncodingUnicode : (encoding < 0xFF ? encoding : kCFStringEncodingInvalidId))); +} + +__private_extern__ CFIndex CFStringEncodingCharLengthForBytes(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) { + const _CFEncodingConverter *converter = __CFGetConverter(encoding); + + if (converter) { + uintptr_t switchVal = (uintptr_t)(converter->toUnicodeLen); + + if (switchVal < 0xFFFF) { + return switchVal * numBytes; + } else { + return converter->toUnicodeLen(flags, bytes, numBytes); + } + } + + return 0; +} + +__private_extern__ CFIndex CFStringEncodingByteLengthForCharacters(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars) { + const _CFEncodingConverter *converter = __CFGetConverter(encoding); + + if (converter) { + uintptr_t switchVal = (uintptr_t)(converter->toBytesLen); + + if (switchVal < 0xFFFF) { + return switchVal * numChars; + } else { + return converter->toBytesLen(flags, characters, numChars); + } + } + + return 0; +} + +__private_extern__ void CFStringEncodingRegisterFallbackProcedures(uint32_t encoding, CFStringEncodingToBytesFallbackProc toBytes, CFStringEncodingToUnicodeFallbackProc toUnicode) { + _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); + + if (entry && __CFGetConverter(encoding)) { + ((_CFEncodingConverter*)entry->converter)->toBytesFallback = (toBytes ? toBytes : entry->toBytesFallback); + ((_CFEncodingConverter*)entry->converter)->toUnicodeFallback = (toUnicode ? toUnicode : entry->toUnicodeFallback); + } +} + +__private_extern__ const CFStringEncodingConverter *CFStringEncodingGetConverter(uint32_t encoding) { + return __CFStringEncodingConverterGetDefinition(__CFStringEncodingConverterGetEntry(encoding)); +} + +static const uint32_t __CFBuiltinEncodings[] = { + kCFStringEncodingMacRoman, + kCFStringEncodingWindowsLatin1, + kCFStringEncodingISOLatin1, + kCFStringEncodingNextStepLatin, + kCFStringEncodingASCII, + kCFStringEncodingUTF8, + /* These seven are available only in CFString-level */ + kCFStringEncodingNonLossyASCII, + + kCFStringEncodingUTF16, + kCFStringEncodingUTF16BE, + kCFStringEncodingUTF16LE, + + kCFStringEncodingUTF32, + kCFStringEncodingUTF32BE, + kCFStringEncodingUTF32LE, + + kCFStringEncodingInvalidId, +}; + + +__private_extern__ const uint32_t *CFStringEncodingListOfAvailableEncodings(void) { + return __CFBuiltinEncodings; +} + + +#undef TO_BYTE +#undef TO_UNICODE +#undef ASCIINewLine +#undef kSurrogateHighStart +#undef kSurrogateHighEnd +#undef kSurrogateLowStart +#undef kSurrogateLowEnd +#undef TO_BYTE_FALLBACK +#undef TO_UNICODE_FALLBACK +#undef EXTRA_BASE +#undef NUM_OF_ENTRIES_CYCLE +