X-Git-Url: https://git.saurik.com/apple/cf.git/blobdiff_plain/47a9ab1f151d80a00a045f81937ddac81c51a463..bd5b749cf7786ae858ab372fc8f64179736c6515:/StringEncodings.subproj/CFStringEncodingConverter.c diff --git a/StringEncodings.subproj/CFStringEncodingConverter.c b/StringEncodings.subproj/CFStringEncodingConverter.c deleted file mode 100644 index 00a41cd..0000000 --- a/StringEncodings.subproj/CFStringEncodingConverter.c +++ /dev/null @@ -1,988 +0,0 @@ -/* - * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -/* CFStringEncodingConverter.c - Copyright 1998-2002, Apple, Inc. All rights reserved. - Responsibility: Aki Inoue -*/ - -#include "CFInternal.h" -#include -#include -#include "CFUniChar.h" -#include "CFUtilitiesPriv.h" -#include "CFUnicodeDecomposition.h" -#include "CFStringEncodingConverterExt.h" -#include "CFStringEncodingConverterPriv.h" -#include -#if !defined(__MACOS8__) -#ifdef __WIN32__ -#include -#else // Mach, HP-UX, Solaris -#include -#endif -#endif __MACOS8__ - - -/* Macros -*/ -#define TO_BYTE(conv,flags,chars,numChars,bytes,max,used) (conv->_toBytes ? conv->toBytes(conv,flags,chars,numChars,bytes,max,used) : ((CFStringEncodingToBytesProc)conv->toBytes)(flags,chars,numChars,bytes,max,used)) -#define TO_UNICODE(conv,flags,bytes,numBytes,chars,max,used) (conv->_toUnicode ? (flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical) ? conv->toCanonicalUnicode(conv,flags,bytes,numBytes,chars,max,used) : conv->toUnicode(conv,flags,bytes,numBytes,chars,max,used)) : ((CFStringEncodingToUnicodeProc)conv->toUnicode)(flags,bytes,numBytes,chars,max,used)) - -#define LineSeparator 0x2028 -#define ParagraphSeparator 0x2029 -#define ASCIINewLine 0x0a -#define kSurrogateHighStart 0xD800 -#define kSurrogateHighEnd 0xDBFF -#define kSurrogateLowStart 0xDC00 -#define kSurrogateLowEnd 0xDFFF - -/* Mapping 128..255 to lossy ASCII -*/ -static const struct { - unsigned char chars[4]; -} _toLossyASCIITable[] = { - {{' ', 0, 0, 0}}, // NO-BREAK SPACE - {{'!', 0, 0, 0}}, // INVERTED EXCLAMATION MARK - {{'c', 0, 0, 0}}, // CENT SIGN - {{'L', 0, 0, 0}}, // POUND SIGN - {{'$', 0, 0, 0}}, // CURRENCY SIGN - {{'Y', 0, 0, 0}}, // YEN SIGN - {{'|', 0, 0, 0}}, // BROKEN BAR - {{0, 0, 0, 0}}, // SECTION SIGN - {{0, 0, 0, 0}}, // DIAERESIS - {{'(', 'C', ')', 0}}, // COPYRIGHT SIGN - {{'a', 0, 0, 0}}, // FEMININE ORDINAL INDICATOR - {{'<', '<', 0, 0}}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK - {{0, 0, 0, 0}}, // NOT SIGN - {{'-', 0, 0, 0}}, // SOFT HYPHEN - {{'(', 'R', ')', 0}}, // REGISTERED SIGN - {{0, 0, 0, 0}}, // MACRON - {{0, 0, 0, 0}}, // DEGREE SIGN - {{'+', '-', 0, 0}}, // PLUS-MINUS SIGN - {{'2', 0, 0, 0}}, // SUPERSCRIPT TWO - {{'3', 0, 0, 0}}, // SUPERSCRIPT THREE - {{0, 0, 0, 0}}, // ACUTE ACCENT - {{0, 0, 0, 0}}, // MICRO SIGN - {{0, 0, 0, 0}}, // PILCROW SIGN - {{0, 0, 0, 0}}, // MIDDLE DOT - {{0, 0, 0, 0}}, // CEDILLA - {{'1', 0, 0, 0}}, // SUPERSCRIPT ONE - {{'o', 0, 0, 0}}, // MASCULINE ORDINAL INDICATOR - {{'>', '>', 0, 0}}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK - {{'1', '/', '4', 0}}, // VULGAR FRACTION ONE QUARTER - {{'1', '/', '2', 0}}, // VULGAR FRACTION ONE HALF - {{'3', '/', '4', 0}}, // VULGAR FRACTION THREE QUARTERS - {{'?', 0, 0, 0}}, // INVERTED QUESTION MARK - {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH GRAVE - {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH ACUTE - {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX - {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH TILDE - {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH DIAERESIS - {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH RING ABOVE - {{'A', 'E', 0, 0}}, // LATIN CAPITAL LETTER AE - {{'C', 0, 0, 0}}, // LATIN CAPITAL LETTER C WITH CEDILLA - {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH GRAVE - {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH ACUTE - {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX - {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH DIAERESIS - {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH GRAVE - {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH ACUTE - {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX - {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH DIAERESIS - {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER ETH (Icelandic) - {{'N', 0, 0, 0}}, // LATIN CAPITAL LETTER N WITH TILDE - {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH GRAVE - {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH ACUTE - {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX - {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH TILDE - {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH DIAERESIS - {{'X', 0, 0, 0}}, // MULTIPLICATION SIGN - {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH STROKE - {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH GRAVE - {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH ACUTE - {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX - {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH DIAERESIS - {{'Y', 0, 0, 0}}, // LATIN CAPITAL LETTER Y WITH ACUTE - {{'t', 'h', 0, 0}}, // LATIN CAPITAL LETTER THORN (Icelandic) - {{'s', 0, 0, 0}}, // LATIN SMALL LETTER SHARP S (German) - {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH GRAVE - {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH ACUTE - {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH CIRCUMFLEX - {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH TILDE - {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH DIAERESIS - {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH RING ABOVE - {{'a', 'e', 0, 0}}, // LATIN SMALL LETTER AE - {{'c', 0, 0, 0}}, // LATIN SMALL LETTER C WITH CEDILLA - {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH GRAVE - {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH ACUTE - {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH CIRCUMFLEX - {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH DIAERESIS - {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH GRAVE - {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH ACUTE - {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH CIRCUMFLEX - {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH DIAERESIS - {{'T', 'H', 0, 0}}, // LATIN SMALL LETTER ETH (Icelandic) - {{'n', 0, 0, 0}}, // LATIN SMALL LETTER N WITH TILDE - {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH GRAVE - {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH ACUTE - {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH CIRCUMFLEX - {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH TILDE - {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH DIAERESIS - {{'/', 0, 0, 0}}, // DIVISION SIGN - {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH STROKE - {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH GRAVE - {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH ACUTE - {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH CIRCUMFLEX - {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH DIAERESIS - {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH ACUTE - {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER THORN (Icelandic) - {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH DIAERESIS -}; - -CF_INLINE UInt32 __CFToASCIILatin1Fallback(UniChar character, UInt8 *bytes, UInt32 maxByteLen) { - const char *losChars = (const unsigned char*)_toLossyASCIITable + (character - 0xA0) * sizeof(unsigned char[4]); - unsigned int numBytes = 0; - int idx, max = (maxByteLen && (maxByteLen < 4) ? maxByteLen : 4); - - for (idx = 0;idx < max;idx++) { - if (losChars[idx]) { - if (maxByteLen) bytes[idx] = losChars[idx]; - ++numBytes; - } else { - break; - } - } - - return numBytes; -} - -static UInt32 __CFDefaultToBytesFallbackProc(const UniChar *characters, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) { - if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range - if (maxByteLen) *bytes = (UInt8)(*characters - 0x80); - *usedByteLen = 1; - return 1; - } else if (*characters < 0x100) { - *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen); - return 1; - } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) { - if (maxByteLen) *bytes = '?'; - *usedByteLen = 1; - return (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1); - } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) { - if (maxByteLen) *bytes = ' '; - *usedByteLen = 1; - return 1; - } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) { - if (maxByteLen) *bytes = ASCIINewLine; - *usedByteLen = 1; - return 1; - } else if (!CFUniCharIsMemberOf(*characters, kCFUniCharLetterCharacterSet)) { - *usedByteLen = 0; - return 1; - } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) { - UTF32Char decomposed[MAX_DECOMPOSED_LENGTH]; - - (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH); - if (*decomposed < 0x80) { - if (maxByteLen) *bytes = (UInt8)(*decomposed); - *usedByteLen = 1; - return 1; - } else { - UTF16Char theChar = *decomposed; - - return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen); - } - } else { - if (maxByteLen) *bytes = '?'; - *usedByteLen = 1; - return 1; - } -} - -static UInt32 __CFDefaultToUnicodeFallbackProc(const uint8_t *bytes, UInt32 numBytes, UniChar *characters, UInt32 maxCharLen, UInt32 *usedCharLen) { - if (maxCharLen) *characters = (UniChar)'?'; - *usedCharLen = 1; - return 1; -} - -#define TO_BYTE_FALLBACK(conv,chars,numChars,bytes,max,used) (conv->toBytesFallback(chars,numChars,bytes,max,used)) -#define TO_UNICODE_FALLBACK(conv,bytes,numBytes,chars,max,used) (conv->toUnicodeFallback(bytes,numBytes,chars,max,used)) - -#define EXTRA_BASE (0x0F00) - -/* Wrapper funcs for non-standard converters -*/ -static UInt32 __CFToBytesCheapEightBitWrapper(const void *converter, UInt32 flags, const UniChar *characters, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) { - UInt32 processedCharLen = 0; - UInt32 length = (maxByteLen && (maxByteLen < numChars) ? maxByteLen : numChars); - uint8_t byte; - - while (processedCharLen < length) { - if (!((CFStringEncodingCheapEightBitToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters[processedCharLen], &byte)) break; - - if (maxByteLen) bytes[processedCharLen] = byte; - processedCharLen++; - } - - *usedByteLen = processedCharLen; - return processedCharLen; -} - -static UInt32 __CFToUnicodeCheapEightBitWrapper(const void *converter, UInt32 flags, const uint8_t *bytes, UInt32 numBytes, UniChar *characters, UInt32 maxCharLen, UInt32 *usedCharLen) { - UInt32 processedByteLen = 0; - UInt32 length = (maxCharLen && (maxCharLen < numBytes) ? maxCharLen : numBytes); - UniChar character; - - while (processedByteLen < length) { - if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], &character)) break; - - if (maxCharLen) characters[processedByteLen] = character; - processedByteLen++; - } - - *usedCharLen = processedByteLen; - return processedByteLen; -} - -static UInt32 __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, UInt32 flags, const uint8_t *bytes, UInt32 numBytes, UniChar *characters, UInt32 maxCharLen, UInt32 *usedCharLen) { - UInt32 processedByteLen = 0; - UInt32 theUsedCharLen = 0; - UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; - UInt32 usedLen; - UniChar character; - bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); - - while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { - if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], &character)) break; - - if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { - uint32_t idx; - - usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); - *usedCharLen = theUsedCharLen; - - for (idx = 0;idx < usedLen;idx++) { - if (charBuffer[idx] > 0xFFFF) { // Non-BMP - if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; - theUsedCharLen += 2; - if (maxCharLen) { - charBuffer[idx] = charBuffer[idx] - 0x10000; - *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL; - *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL; - } - } else { - if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; - ++theUsedCharLen; - *(characters++) = charBuffer[idx]; - } - } - } else { - if (maxCharLen) *(characters++) = character; - ++theUsedCharLen; - } - processedByteLen++; - } - - *usedCharLen = theUsedCharLen; - return processedByteLen; -} - -static UInt32 __CFToBytesStandardEightBitWrapper(const void *converter, UInt32 flags, const UniChar *characters, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) { - UInt32 processedCharLen = 0; - uint8_t byte; - UInt32 usedLen; - - *usedByteLen = 0; - - while (numChars && (!maxByteLen || (*usedByteLen < maxByteLen))) { - if (!(usedLen = ((CFStringEncodingStandardEightBitToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters, numChars, &byte))) break; - - if (maxByteLen) bytes[*usedByteLen] = byte; - (*usedByteLen)++; - characters += usedLen; - numChars -= usedLen; - processedCharLen += usedLen; - } - - return processedCharLen; -} - -static UInt32 __CFToUnicodeStandardEightBitWrapper(const void *converter, UInt32 flags, const uint8_t *bytes, UInt32 numBytes, UniChar *characters, UInt32 maxCharLen, UInt32 *usedCharLen) { - UInt32 processedByteLen = 0; -#if defined(__MACOS8__) || defined(__WIN32__) - UniChar charBuffer[20]; // Dynamic stack allocation is GNU specific -#else - UniChar charBuffer[((const _CFEncodingConverter*)converter)->maxLen]; -#endif - UInt32 usedLen; - - *usedCharLen = 0; - - while ((processedByteLen < numBytes) && (!maxCharLen || (*usedCharLen < maxCharLen))) { - if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], charBuffer))) break; - - if (maxCharLen) { - uint16_t idx; - - if (*usedCharLen + usedLen > maxCharLen) break; - - for (idx = 0;idx < usedLen;idx++) { - characters[*usedCharLen + idx] = charBuffer[idx]; - } - } - *usedCharLen += usedLen; - processedByteLen++; - } - - return processedByteLen; -} - -static UInt32 __CFToCanonicalUnicodeStandardEightBitWrapper(const void *converter, UInt32 flags, const uint8_t *bytes, UInt32 numBytes, UniChar *characters, UInt32 maxCharLen, UInt32 *usedCharLen) { - UInt32 processedByteLen = 0; -#if defined(__MACOS8__) || defined(__WIN32__) - UniChar charBuffer[20]; // Dynamic stack allocation is GNU specific -#else - UniChar charBuffer[((const _CFEncodingConverter*)converter)->maxLen]; -#endif - UTF32Char decompBuffer[MAX_DECOMPOSED_LENGTH]; - UInt32 usedLen; - UInt32 decompedLen; - UInt32 idx, decompIndex; - bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); - UInt32 theUsedCharLen = 0; - - while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { - if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes[processedByteLen], charBuffer))) break; - - for (idx = 0;idx < usedLen;idx++) { - if (CFUniCharIsDecomposableCharacter(charBuffer[idx], isHFSPlus)) { - decompedLen = CFUniCharDecomposeCharacter(charBuffer[idx], decompBuffer, MAX_DECOMPOSED_LENGTH); - *usedCharLen = theUsedCharLen; - - for (decompIndex = 0;decompIndex < decompedLen;decompIndex++) { - if (decompBuffer[decompIndex] > 0xFFFF) { // Non-BMP - if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; - theUsedCharLen += 2; - if (maxCharLen) { - charBuffer[idx] = charBuffer[idx] - 0x10000; - *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL; - *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL; - } - } else { - if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; - ++theUsedCharLen; - *(characters++) = charBuffer[idx]; - } - } - } else { - if (maxCharLen) *(characters++) = charBuffer[idx]; - ++theUsedCharLen; - } - } - processedByteLen++; - } - - *usedCharLen = theUsedCharLen; - return processedByteLen; -} - -static UInt32 __CFToBytesCheapMultiByteWrapper(const void *converter, UInt32 flags, const UniChar *characters, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) { - UInt32 processedCharLen = 0; -#if defined(__MACOS8__) || defined(__WIN32__) - uint8_t byteBuffer[20]; // Dynamic stack allocation is GNU specific -#else - uint8_t byteBuffer[((const _CFEncodingConverter*)converter)->maxLen]; -#endif - UInt32 usedLen; - - *usedByteLen = 0; - - while ((processedCharLen < numChars) && (!maxByteLen || (*usedByteLen < maxByteLen))) { - if (!(usedLen = ((CFStringEncodingCheapMultiByteToBytesProc)((const _CFEncodingConverter*)converter)->_toBytes)(flags, characters[processedCharLen], byteBuffer))) break; - - if (maxByteLen) { - uint16_t idx; - - if (*usedByteLen + usedLen > maxByteLen) break; - - for (idx = 0;idx _toUnicode)(flags, bytes, numBytes, &character))) break; - - if (maxCharLen) *(characters++) = character; - (*usedCharLen)++; - processedByteLen += usedLen; - bytes += usedLen; - numBytes -= usedLen; - } - - return processedByteLen; -} - -static UInt32 __CFToCanonicalUnicodeCheapMultiByteWrapper(const void *converter, UInt32 flags, const uint8_t *bytes, UInt32 numBytes, UniChar *characters, UInt32 maxCharLen, UInt32 *usedCharLen) { - UInt32 processedByteLen = 0; - UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; - UniChar character; - UInt32 usedLen; - UInt32 decomposedLen; - UInt32 theUsedCharLen = 0; - bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); - - while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) { - if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->_toUnicode)(flags, bytes, numBytes, &character))) break; - - if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { - uint32_t idx; - - decomposedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); - *usedCharLen = theUsedCharLen; - - for (idx = 0;idx < decomposedLen;idx++) { - if (charBuffer[idx] > 0xFFFF) { // Non-BMP - if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; - theUsedCharLen += 2; - if (maxCharLen) { - charBuffer[idx] = charBuffer[idx] - 0x10000; - *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL; - *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL; - } - } else { - if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; - ++theUsedCharLen; - *(characters++) = charBuffer[idx]; - } - } - } else { - if (maxCharLen) *(characters++) = character; - ++theUsedCharLen; - } - - processedByteLen += usedLen; - bytes += usedLen; - numBytes -= usedLen; - } - *usedCharLen = theUsedCharLen; - return processedByteLen; -} - -/* static functions -*/ -static _CFConverterEntry __CFConverterEntryASCII = { - kCFStringEncodingASCII, NULL, - "Western (ASCII)", {"us-ascii", "ascii", "iso-646-us", NULL}, NULL, NULL, NULL, NULL, - kCFStringEncodingMacRoman // We use string encoding's script range here -}; - -static _CFConverterEntry __CFConverterEntryISOLatin1 = { - kCFStringEncodingISOLatin1, NULL, - "Western (ISO Latin 1)", {"iso-8859-1", "latin1","iso-latin-1", NULL}, NULL, NULL, NULL, NULL, - kCFStringEncodingMacRoman // We use string encoding's script range here -}; - -static _CFConverterEntry __CFConverterEntryMacRoman = { - kCFStringEncodingMacRoman, NULL, - "Western (Mac OS Roman)", {"macintosh", "mac", "x-mac-roman", NULL}, NULL, NULL, NULL, NULL, - kCFStringEncodingMacRoman // We use string encoding's script range here -}; - -static _CFConverterEntry __CFConverterEntryWinLatin1 = { - kCFStringEncodingWindowsLatin1, NULL, - "Western (Windows Latin 1)", {"windows-1252", "cp1252", "windows latin1", NULL}, NULL, NULL, NULL, NULL, - kCFStringEncodingMacRoman // We use string encoding's script range here -}; - -static _CFConverterEntry __CFConverterEntryNextStepLatin = { - kCFStringEncodingNextStepLatin, NULL, - "Western (NextStep)", {"x-nextstep", NULL, NULL, NULL}, NULL, NULL, NULL, NULL, - kCFStringEncodingMacRoman // We use string encoding's script range here -}; - -static _CFConverterEntry __CFConverterEntryUTF8 = { - kCFStringEncodingUTF8, NULL, - "UTF-8", {"utf-8", "unicode-1-1-utf8", NULL, NULL}, NULL, NULL, NULL, NULL, - kCFStringEncodingUnicode // We use string encoding's script range here -}; - -CF_INLINE _CFConverterEntry *__CFStringEncodingConverterGetEntry(UInt32 encoding) { - switch (encoding) { - case kCFStringEncodingInvalidId: - case kCFStringEncodingASCII: - return &__CFConverterEntryASCII; - - case kCFStringEncodingISOLatin1: - return &__CFConverterEntryISOLatin1; - - case kCFStringEncodingMacRoman: - return &__CFConverterEntryMacRoman; - - case kCFStringEncodingWindowsLatin1: - return &__CFConverterEntryWinLatin1; - - case kCFStringEncodingNextStepLatin: - return &__CFConverterEntryNextStepLatin; - - case kCFStringEncodingUTF8: - return &__CFConverterEntryUTF8; - - default: { - return NULL; - } - } -} - -CF_INLINE _CFEncodingConverter *__CFEncodingConverterFromDefinition(const CFStringEncodingConverter *definition) { -#define NUM_OF_ENTRIES_CYCLE (10) - static CFSpinLock_t _indexLock = 0; - static UInt32 _currentIndex = 0; - static UInt32 _allocatedSize = 0; - static _CFEncodingConverter *_allocatedEntries = NULL; - _CFEncodingConverter *converter; - - - __CFSpinLock(&_indexLock); - if ((_currentIndex + 1) >= _allocatedSize) { - _currentIndex = 0; - _allocatedSize = 0; - _allocatedEntries = NULL; - } - if (_allocatedEntries == NULL) { // Not allocated yet - _allocatedEntries = (_CFEncodingConverter *)CFAllocatorAllocate(NULL, sizeof(_CFEncodingConverter) * NUM_OF_ENTRIES_CYCLE, 0); - _allocatedSize = NUM_OF_ENTRIES_CYCLE; - converter = &(_allocatedEntries[_currentIndex]); - } else { - converter = &(_allocatedEntries[++_currentIndex]); - } - __CFSpinUnlock(&_indexLock); - - switch (definition->encodingClass) { - case kCFStringEncodingConverterStandard: - converter->toBytes = definition->toBytes; - converter->toUnicode = definition->toUnicode; - converter->toCanonicalUnicode = definition->toUnicode; - converter->_toBytes = NULL; - converter->_toUnicode = NULL; - converter->maxLen = 2; - break; - - case kCFStringEncodingConverterCheapEightBit: - converter->toBytes = __CFToBytesCheapEightBitWrapper; - converter->toUnicode = __CFToUnicodeCheapEightBitWrapper; - converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapEightBitWrapper; - converter->_toBytes = definition->toBytes; - converter->_toUnicode = definition->toUnicode; - converter->maxLen = 1; - break; - - case kCFStringEncodingConverterStandardEightBit: - converter->toBytes = __CFToBytesStandardEightBitWrapper; - converter->toUnicode = __CFToUnicodeStandardEightBitWrapper; - converter->toCanonicalUnicode = __CFToCanonicalUnicodeStandardEightBitWrapper; - converter->_toBytes = definition->toBytes; - converter->_toUnicode = definition->toUnicode; - converter->maxLen = definition->maxDecomposedCharLen; - break; - - case kCFStringEncodingConverterCheapMultiByte: - converter->toBytes = __CFToBytesCheapMultiByteWrapper; - converter->toUnicode = __CFToUnicodeCheapMultiByteWrapper; - converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapMultiByteWrapper; - converter->_toBytes = definition->toBytes; - converter->_toUnicode = definition->toUnicode; - converter->maxLen = definition->maxBytesPerChar; - break; - - case kCFStringEncodingConverterPlatformSpecific: - converter->toBytes = NULL; - converter->toUnicode = NULL; - converter->toCanonicalUnicode = NULL; - converter->_toBytes = NULL; - converter->_toUnicode = NULL; - converter->maxLen = 0; - converter->toBytesLen = NULL; - converter->toUnicodeLen = NULL; - converter->toBytesFallback = NULL; - converter->toUnicodeFallback = NULL; - converter->toBytesPrecompose = NULL; - converter->isValidCombiningChar = NULL; - return converter; - - default: // Shouln't be here - return NULL; - } - - converter->toBytesLen = (definition->toBytesLen ? definition->toBytesLen : (CFStringEncodingToBytesLenProc)(UInt32)definition->maxBytesPerChar); - converter->toUnicodeLen = (definition->toUnicodeLen ? definition->toUnicodeLen : (CFStringEncodingToUnicodeLenProc)(UInt32)definition->maxDecomposedCharLen); - converter->toBytesFallback = (definition->toBytesFallback ? definition->toBytesFallback : __CFDefaultToBytesFallbackProc); - converter->toUnicodeFallback = (definition->toUnicodeFallback ? definition->toUnicodeFallback : __CFDefaultToUnicodeFallbackProc); - converter->toBytesPrecompose = (definition->toBytesPrecompose ? definition->toBytesPrecompose : NULL); - converter->isValidCombiningChar = (definition->isValidCombiningChar ? definition->isValidCombiningChar : NULL); - - return converter; -} - -CF_INLINE const CFStringEncodingConverter *__CFStringEncodingConverterGetDefinition(_CFConverterEntry *entry) { - if (!entry) return NULL; - - switch (entry->encoding) { - case kCFStringEncodingASCII: - return &__CFConverterASCII; - - case kCFStringEncodingISOLatin1: - return &__CFConverterISOLatin1; - - case kCFStringEncodingMacRoman: - return &__CFConverterMacRoman; - - case kCFStringEncodingWindowsLatin1: - return &__CFConverterWinLatin1; - - case kCFStringEncodingNextStepLatin: - return &__CFConverterNextStepLatin; - - case kCFStringEncodingUTF8: - return &__CFConverterUTF8; - - default: - return NULL; - } -} - -static const _CFEncodingConverter *__CFGetConverter(UInt32 encoding) { - _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); - - if (!entry) return NULL; - - if (!entry->converter) { - const CFStringEncodingConverter *definition = __CFStringEncodingConverterGetDefinition(entry); - - if (definition) { - entry->converter = __CFEncodingConverterFromDefinition(definition); - entry->toBytesFallback = definition->toBytesFallback; - entry->toUnicodeFallback = definition->toUnicodeFallback; - } - } - - return (_CFEncodingConverter *)entry->converter; -} - -/* Public API -*/ -UInt32 CFStringEncodingUnicodeToBytes(UInt32 encoding, UInt32 flags, const UniChar *characters, UInt32 numChars, UInt32 *usedCharLen, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) { - if (encoding == kCFStringEncodingUTF8) { - static CFStringEncodingToBytesProc __CFToUTF8 = NULL; - uint32_t convertedCharLen; - uint32_t usedLen; - - - if ((flags & kCFStringEncodingUseCanonical) || (flags & kCFStringEncodingUseHFSPlusCanonical)) { - (void)CFUniCharDecompose(characters, numChars, &convertedCharLen, (void *)bytes, maxByteLen, &usedLen, true, kCFUniCharUTF8Format, (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false)); - } else { - if (!__CFToUTF8) { - const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8); - __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes; - } - convertedCharLen = __CFToUTF8(0, characters, numChars, bytes, maxByteLen, (UInt32 *)&usedLen); - } - if (usedCharLen) *usedCharLen = convertedCharLen; - if (usedByteLen) *usedByteLen = usedLen; - - if (convertedCharLen == numChars) { - return kCFStringEncodingConversionSuccess; - } else if (maxByteLen && (maxByteLen == usedLen)) { - return kCFStringEncodingInsufficientOutputBufferLength; - } else { - return kCFStringEncodingInvalidInputStream; - } - } else { - const _CFEncodingConverter *converter = __CFGetConverter(encoding); - UInt32 usedLen = 0; - UInt32 localUsedByteLen; - UInt32 theUsedByteLen = 0; - UInt32 theResult = kCFStringEncodingConversionSuccess; - CFStringEncodingToBytesPrecomposeProc toBytesPrecompose = NULL; - CFStringEncodingIsValidCombiningCharacterProc isValidCombiningChar = NULL; - - if (!converter) return kCFStringEncodingConverterUnavailable; - - if (flags & kCFStringEncodingSubstituteCombinings) { - if (!(flags & kCFStringEncodingAllowLossyConversion)) isValidCombiningChar = converter->isValidCombiningChar; - } else { - isValidCombiningChar = converter->isValidCombiningChar; - if (!(flags & kCFStringEncodingIgnoreCombinings)) { - toBytesPrecompose = converter->toBytesPrecompose; - flags |= kCFStringEncodingComposeCombinings; - } - } - - - while ((usedLen < numChars) && (!maxByteLen || (theUsedByteLen < maxByteLen))) { - if ((usedLen += TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) < numChars) { - UInt32 dummy; - - if (isValidCombiningChar && (usedLen > 0) && isValidCombiningChar(characters[usedLen])) { - if (toBytesPrecompose) { - UInt32 localUsedLen = usedLen; - - while (isValidCombiningChar(characters[--usedLen])); - theUsedByteLen += localUsedByteLen; - if (converter->maxLen > 1) { - TO_BYTE(converter, flags, characters + usedLen, localUsedLen - usedLen, NULL, 0, &localUsedByteLen); - theUsedByteLen -= localUsedByteLen; - } else { - theUsedByteLen--; - } - if ((localUsedLen = toBytesPrecompose(flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) > 0) { - usedLen += localUsedLen; - if ((usedLen < numChars) && isValidCombiningChar(characters[usedLen])) { // There is a non-base char not combined remaining - theUsedByteLen += localUsedByteLen; - theResult = kCFStringEncodingInvalidInputStream; - break; - } - } else if (flags & kCFStringEncodingAllowLossyConversion) { - uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); - - if (lossyByte) { - while (isValidCombiningChar(characters[++usedLen])); - localUsedByteLen = 1; - if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; - } else { - ++usedLen; - usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); - } - } else { - theResult = kCFStringEncodingInvalidInputStream; - break; - } - } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up - theUsedByteLen += localUsedByteLen; - theResult = kCFStringEncodingInsufficientOutputBufferLength; - break; - } else if (flags & kCFStringEncodingIgnoreCombinings) { - while ((++usedLen < numChars) && isValidCombiningChar(characters[usedLen])); - } else { - uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); - - theUsedByteLen += localUsedByteLen; - if (lossyByte) { - ++usedLen; - localUsedByteLen = 1; - if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; - } else { - usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); - } - } - } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up - theUsedByteLen += localUsedByteLen; - - if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) { - UInt32 localUsedLen; - - localUsedByteLen = 0; - while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen; - } - if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength; - break; - } else if (flags & kCFStringEncodingAllowLossyConversion) { - uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags); - - theUsedByteLen += localUsedByteLen; - if (lossyByte) { - ++usedLen; - localUsedByteLen = 1; - if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte; - } else { - usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen); - } - } else { - theUsedByteLen += localUsedByteLen; - theResult = kCFStringEncodingInvalidInputStream; - break; - } - } - theUsedByteLen += localUsedByteLen; - } - - if (usedLen < numChars && maxByteLen && theResult == kCFStringEncodingConversionSuccess) { - if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) { - UInt32 localUsedLen; - - localUsedByteLen = 0; - while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen; - } - if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength; - } - if (usedByteLen) *usedByteLen = theUsedByteLen; - if (usedCharLen) *usedCharLen = usedLen; - - return theResult; - } -} - -UInt32 CFStringEncodingBytesToUnicode(UInt32 encoding, UInt32 flags, const uint8_t *bytes, UInt32 numBytes, UInt32 *usedByteLen, UniChar *characters, UInt32 maxCharLen, UInt32 *usedCharLen) { - const _CFEncodingConverter *converter = __CFGetConverter(encoding); - UInt32 usedLen = 0; - UInt32 theUsedCharLen = 0; - UInt32 localUsedCharLen; - UInt32 theResult = kCFStringEncodingConversionSuccess; - - if (!converter) return kCFStringEncodingConverterUnavailable; - - - while ((usedLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { - if ((usedLen += TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen)) < numBytes) { - UInt32 tempUsedCharLen; - - if (maxCharLen && ((maxCharLen == theUsedCharLen + localUsedCharLen) || ((flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) && TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, NULL, 0, &tempUsedCharLen)))) { // buffer was filled up - theUsedCharLen += localUsedCharLen; - theResult = kCFStringEncodingInsufficientOutputBufferLength; - break; - } else if (flags & kCFStringEncodingAllowLossyConversion) { - theUsedCharLen += localUsedCharLen; - usedLen += TO_UNICODE_FALLBACK(converter, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen); - } else { - theUsedCharLen += localUsedCharLen; - theResult = kCFStringEncodingInvalidInputStream; - break; - } - } - theUsedCharLen += localUsedCharLen; - } - - if (usedLen < numBytes && maxCharLen && theResult == kCFStringEncodingConversionSuccess) { - theResult = kCFStringEncodingInsufficientOutputBufferLength; - } - if (usedCharLen) *usedCharLen = theUsedCharLen; - if (usedByteLen) *usedByteLen = usedLen; - - return theResult; -} - -__private_extern__ Boolean CFStringEncodingIsValidEncoding(UInt32 encoding) { - return (CFStringEncodingGetConverter(encoding) ? true : false); -} - -__private_extern__ const char *CFStringEncodingName(UInt32 encoding) { - _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); - if (entry) return entry->encodingName; - return NULL; -} - -__private_extern__ const char **CFStringEncodingCanonicalCharsetNames(UInt32 encoding) { - _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); - if (entry) return entry->ianaNames; - return NULL; -} - -__private_extern__ UInt32 CFStringEncodingGetScriptCodeForEncoding(CFStringEncoding encoding) { - _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); - - return (entry ? entry->scriptCode : ((encoding & 0x0FFF) == kCFStringEncodingUnicode ? kCFStringEncodingUnicode : (encoding < 0xFF ? encoding : kCFStringEncodingInvalidId))); -} - -__private_extern__ UInt32 CFStringEncodingCharLengthForBytes(UInt32 encoding, UInt32 flags, const uint8_t *bytes, UInt32 numBytes) { - const _CFEncodingConverter *converter = __CFGetConverter(encoding); - - if (converter) { - UInt32 switchVal = (UInt32)(converter->toUnicodeLen); - - if (switchVal < 0xFFFF) - return switchVal * numBytes; - else - return converter->toUnicodeLen(flags, bytes, numBytes); - } - - return 0; -} - -__private_extern__ UInt32 CFStringEncodingByteLengthForCharacters(UInt32 encoding, UInt32 flags, const UniChar *characters, UInt32 numChars) { - const _CFEncodingConverter *converter = __CFGetConverter(encoding); - - if (converter) { - UInt32 switchVal = (UInt32)(converter->toBytesLen); - - if (switchVal < 0xFFFF) - return switchVal * numChars; - else - return converter->toBytesLen(flags, characters, numChars); - } - - return 0; -} - -__private_extern__ void CFStringEncodingRegisterFallbackProcedures(UInt32 encoding, CFStringEncodingToBytesFallbackProc toBytes, CFStringEncodingToUnicodeFallbackProc toUnicode) { - _CFConverterEntry *entry = __CFStringEncodingConverterGetEntry(encoding); - - if (entry && __CFGetConverter(encoding)) { - ((_CFEncodingConverter*)entry->converter)->toBytesFallback = (toBytes ? toBytes : entry->toBytesFallback); - ((_CFEncodingConverter*)entry->converter)->toUnicodeFallback = (toUnicode ? toUnicode : entry->toUnicodeFallback); - } -} - -__private_extern__ const CFStringEncodingConverter *CFStringEncodingGetConverter(UInt32 encoding) { - return __CFStringEncodingConverterGetDefinition(__CFStringEncodingConverterGetEntry(encoding)); -} - -static const UInt32 __CFBuiltinEncodings[] = { - kCFStringEncodingMacRoman, - kCFStringEncodingWindowsLatin1, - kCFStringEncodingISOLatin1, - kCFStringEncodingNextStepLatin, - kCFStringEncodingASCII, - kCFStringEncodingUTF8, - /* These seven are available only in CFString-level */ - kCFStringEncodingNonLossyASCII, - - kCFStringEncodingUTF16, - kCFStringEncodingUTF16BE, - kCFStringEncodingUTF16LE, - - kCFStringEncodingUTF32, - kCFStringEncodingUTF32BE, - kCFStringEncodingUTF32LE, - - kCFStringEncodingInvalidId, -}; - - -__private_extern__ const UInt32 *CFStringEncodingListOfAvailableEncodings(void) { - return __CFBuiltinEncodings; -} -