]> git.saurik.com Git - apple/cf.git/blame - CFStringEncodingConverter.c
CF-1153.18.tar.gz
[apple/cf.git] / CFStringEncodingConverter.c
CommitLineData
9ce05555 1/*
e29e285d 2 * Copyright (c) 2015 Apple Inc. All rights reserved.
9ce05555
A
3 *
4 * @APPLE_LICENSE_HEADER_START@
d7384798 5 *
9ce05555
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
d7384798 12 *
9ce05555
A
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
d7384798 20 *
9ce05555
A
21 * @APPLE_LICENSE_HEADER_END@
22 */
f64f9b69 23
9ce05555 24/* CFStringEncodingConverter.c
d7384798 25 Copyright (c) 1998-2014, Apple Inc. All rights reserved.
9ce05555
A
26 Responsibility: Aki Inoue
27*/
28
29#include "CFInternal.h"
30#include <CoreFoundation/CFArray.h>
31#include <CoreFoundation/CFDictionary.h>
cf7d2af9
A
32#include "CFICUConverters.h"
33#include <CoreFoundation/CFUniChar.h>
34#include <CoreFoundation/CFPriv.h>
9ce05555
A
35#include "CFUnicodeDecomposition.h"
36#include "CFStringEncodingConverterExt.h"
37#include "CFStringEncodingConverterPriv.h"
38#include <stdlib.h>
9ce05555 39
cf7d2af9
A
40typedef CFIndex (*_CFToBytesProc)(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen);
41typedef CFIndex (*_CFToUnicodeProc)(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen);
42
43typedef struct {
44 const CFStringEncodingConverter *definition;
45 _CFToBytesProc toBytes;
46 _CFToUnicodeProc toUnicode;
47 _CFToUnicodeProc toCanonicalUnicode;
48 CFStringEncodingToBytesFallbackProc toBytesFallback;
49 CFStringEncodingToUnicodeFallbackProc toUnicodeFallback;
50} _CFEncodingConverter;
9ce05555
A
51
52/* Macros
53*/
cf7d2af9
A
/* Dispatch a conversion through the converter record.
   If the record has a wrapper proc it is called with the record itself as the
   first argument; otherwise the proc stored in the definition is called
   directly.  TO_UNICODE selects the canonical wrapper when either canonical
   flag is set.
   Every argument is parenthesized so that expression arguments (e.g. a flags
   value built with '|') expand with the intended precedence.  NOTE: `conv` and
   `flags` are still evaluated more than once; do not pass expressions with
   side effects. */
#define TO_BYTE(conv,flags,chars,numChars,bytes,max,used) ((conv)->toBytes ? (conv)->toBytes((conv),(flags),(chars),(numChars),(bytes),(max),(used)) : ((CFStringEncodingToBytesProc)(conv)->definition->toBytes)((flags),(chars),(numChars),(bytes),(max),(used)))
#define TO_UNICODE(conv,flags,bytes,numBytes,chars,max,used) ((conv)->toUnicode ? (((flags) & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) ? (conv)->toCanonicalUnicode((conv),(flags),(bytes),(numBytes),(chars),(max),(used)) : (conv)->toUnicode((conv),(flags),(bytes),(numBytes),(chars),(max),(used))) : ((CFStringEncodingToUnicodeProc)(conv)->definition->toUnicode)((flags),(bytes),(numBytes),(chars),(max),(used)))
9ce05555 56
9ce05555
A
57#define ASCIINewLine 0x0a
58#define kSurrogateHighStart 0xD800
59#define kSurrogateHighEnd 0xDBFF
60#define kSurrogateLowStart 0xDC00
61#define kSurrogateLowEnd 0xDFFF
62
cf7d2af9
A
63static const uint8_t __CFMaximumConvertedLength = 20;
64
9ce05555
A
/* Mapping U+00A0..U+00FF (160..255) to lossy ASCII.
   Entry i describes character 0xA0 + i.  Each entry holds up to four ASCII
   bytes, terminated by 0 when shorter; an all-zero entry means the character
   has no replacement and produces no output.
*/
static const struct {
    unsigned char chars[4];
} _toLossyASCIITable[] = {
    {{' ', 0, 0, 0}}, // NO-BREAK SPACE
    {{'!', 0, 0, 0}}, // INVERTED EXCLAMATION MARK
    {{'c', 0, 0, 0}}, // CENT SIGN
    {{'L', 0, 0, 0}}, // POUND SIGN
    {{'$', 0, 0, 0}}, // CURRENCY SIGN
    {{'Y', 0, 0, 0}}, // YEN SIGN
    {{'|', 0, 0, 0}}, // BROKEN BAR
    {{0, 0, 0, 0}}, // SECTION SIGN
    {{0, 0, 0, 0}}, // DIAERESIS
    {{'(', 'C', ')', 0}}, // COPYRIGHT SIGN
    {{'a', 0, 0, 0}}, // FEMININE ORDINAL INDICATOR
    {{'<', '<', 0, 0}}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    {{0, 0, 0, 0}}, // NOT SIGN
    {{'-', 0, 0, 0}}, // SOFT HYPHEN
    {{'(', 'R', ')', 0}}, // REGISTERED SIGN
    {{0, 0, 0, 0}}, // MACRON
    {{0, 0, 0, 0}}, // DEGREE SIGN
    {{'+', '-', 0, 0}}, // PLUS-MINUS SIGN
    {{'2', 0, 0, 0}}, // SUPERSCRIPT TWO
    {{'3', 0, 0, 0}}, // SUPERSCRIPT THREE
    {{0, 0, 0, 0}}, // ACUTE ACCENT
    {{0, 0, 0, 0}}, // MICRO SIGN
    {{0, 0, 0, 0}}, // PILCROW SIGN
    {{0, 0, 0, 0}}, // MIDDLE DOT
    {{0, 0, 0, 0}}, // CEDILLA
    {{'1', 0, 0, 0}}, // SUPERSCRIPT ONE
    {{'o', 0, 0, 0}}, // MASCULINE ORDINAL INDICATOR
    {{'>', '>', 0, 0}}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    {{'1', '/', '4', 0}}, // VULGAR FRACTION ONE QUARTER
    {{'1', '/', '2', 0}}, // VULGAR FRACTION ONE HALF
    {{'3', '/', '4', 0}}, // VULGAR FRACTION THREE QUARTERS
    {{'?', 0, 0, 0}}, // INVERTED QUESTION MARK
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH GRAVE
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH ACUTE
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH TILDE
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH DIAERESIS
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH RING ABOVE
    {{'A', 'E', 0, 0}}, // LATIN CAPITAL LETTER AE
    {{'C', 0, 0, 0}}, // LATIN CAPITAL LETTER C WITH CEDILLA
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH GRAVE
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH ACUTE
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH DIAERESIS
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH GRAVE
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH ACUTE
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH DIAERESIS
    {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER ETH (Icelandic)
    {{'N', 0, 0, 0}}, // LATIN CAPITAL LETTER N WITH TILDE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH GRAVE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH ACUTE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH TILDE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH DIAERESIS
    {{'X', 0, 0, 0}}, // MULTIPLICATION SIGN
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH STROKE
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH GRAVE
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH ACUTE
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH DIAERESIS
    {{'Y', 0, 0, 0}}, // LATIN CAPITAL LETTER Y WITH ACUTE
    {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER THORN (Icelandic) -- upper case, matching the capital letter
    {{'s', 0, 0, 0}}, // LATIN SMALL LETTER SHARP S (German)
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH GRAVE
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH ACUTE
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH CIRCUMFLEX
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH TILDE
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH DIAERESIS
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH RING ABOVE
    {{'a', 'e', 0, 0}}, // LATIN SMALL LETTER AE
    {{'c', 0, 0, 0}}, // LATIN SMALL LETTER C WITH CEDILLA
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH GRAVE
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH ACUTE
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH CIRCUMFLEX
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH DIAERESIS
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH GRAVE
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH ACUTE
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH CIRCUMFLEX
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH DIAERESIS
    {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER ETH (Icelandic) -- lower case, matching the small letter
    {{'n', 0, 0, 0}}, // LATIN SMALL LETTER N WITH TILDE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH GRAVE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH ACUTE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH CIRCUMFLEX
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH TILDE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH DIAERESIS
    {{'/', 0, 0, 0}}, // DIVISION SIGN
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH STROKE
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH GRAVE
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH ACUTE
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH CIRCUMFLEX
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH DIAERESIS
    {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH ACUTE
    {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER THORN (Icelandic)
    {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH DIAERESIS
};
167
bd5b749c
A
168CF_INLINE CFIndex __CFToASCIILatin1Fallback(UniChar character, uint8_t *bytes, CFIndex maxByteLen) {
169 const uint8_t *losChars = (const uint8_t*)_toLossyASCIITable + (character - 0xA0) * sizeof(uint8_t[4]);
170 CFIndex numBytes = 0;
171 CFIndex idx, max = (maxByteLen && (maxByteLen < 4) ? maxByteLen : 4);
9ce05555
A
172
173 for (idx = 0;idx < max;idx++) {
174 if (losChars[idx]) {
175 if (maxByteLen) bytes[idx] = losChars[idx];
176 ++numBytes;
177 } else {
178 break;
179 }
180 }
181
182 return numBytes;
183}
184
bd5b749c
A
185static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
186 CFIndex processCharLen = 1, filledBytesLen = 1;
187 uint8_t byte = '?';
188
9ce05555 189 if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range
bd5b749c 190 byte = (uint8_t)(*characters - 0x80);
9ce05555
A
191 } else if (*characters < 0x100) {
192 *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen);
193 return 1;
194 } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) {
bd5b749c 195 processCharLen = (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1);
9ce05555 196 } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) {
bd5b749c 197 byte = ' ';
9ce05555 198 } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) {
bd5b749c
A
199 byte = ASCIINewLine;
200 } else if (*characters == 0x2026) { // ellipsis
201 if (0 == maxByteLen) {
202 filledBytesLen = 3;
203 } else if (maxByteLen > 2) {
204 memset(bytes, '.', 3);
205 *usedByteLen = 3;
206 return processCharLen;
207 }
9ce05555
A
208 } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) {
209 UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
210
211 (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH);
212 if (*decomposed < 0x80) {
bd5b749c 213 byte = (uint8_t)(*decomposed);
9ce05555
A
214 } else {
215 UTF16Char theChar = *decomposed;
216
217 return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen);
218 }
9ce05555 219 }
bd5b749c
A
220
221 if (maxByteLen) *bytes = byte;
222 *usedByteLen = filledBytesLen;
223 return processCharLen;
9ce05555
A
224}
225
bd5b749c 226static CFIndex __CFDefaultToUnicodeFallbackProc(const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
9ce05555
A
227 if (maxCharLen) *characters = (UniChar)'?';
228 *usedCharLen = 1;
229 return 1;
230}
231
/* Invoke the fallback procs stored in a converter record.  Arguments are
   parenthesized so that expression arguments expand with the intended
   precedence. */
#define TO_BYTE_FALLBACK(conv,chars,numChars,bytes,max,used) ((conv)->toBytesFallback((chars),(numChars),(bytes),(max),(used)))
#define TO_UNICODE_FALLBACK(conv,bytes,numBytes,chars,max,used) ((conv)->toUnicodeFallback((bytes),(numBytes),(chars),(max),(used)))
234
235#define EXTRA_BASE (0x0F00)
236
237/* Wrapper funcs for non-standard converters
238*/
bd5b749c
A
239static CFIndex __CFToBytesCheapEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
240 CFIndex processedCharLen = 0;
241 CFIndex length = (maxByteLen && (maxByteLen < numChars) ? maxByteLen : numChars);
9ce05555
A
242 uint8_t byte;
243
244 while (processedCharLen < length) {
cf7d2af9 245 if (!((CFStringEncodingCheapEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], &byte)) break;
9ce05555
A
246
247 if (maxByteLen) bytes[processedCharLen] = byte;
248 processedCharLen++;
249 }
250
251 *usedByteLen = processedCharLen;
252 return processedCharLen;
253}
254
bd5b749c
A
255static CFIndex __CFToUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
256 CFIndex processedByteLen = 0;
257 CFIndex length = (maxCharLen && (maxCharLen < numBytes) ? maxCharLen : numBytes);
9ce05555
A
258 UniChar character;
259
260 while (processedByteLen < length) {
cf7d2af9 261 if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break;
9ce05555
A
262
263 if (maxCharLen) characters[processedByteLen] = character;
264 processedByteLen++;
265 }
266
267 *usedCharLen = processedByteLen;
268 return processedByteLen;
269}
270
bd5b749c
A
271static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
272 CFIndex processedByteLen = 0;
273 CFIndex theUsedCharLen = 0;
9ce05555 274 UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
bd5b749c 275 CFIndex usedLen;
9ce05555
A
276 UniChar character;
277 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
278
279 while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
cf7d2af9 280 if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break;
9ce05555
A
281
282 if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
bd5b749c 283 CFIndex idx;
9ce05555
A
284
285 usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
286 *usedCharLen = theUsedCharLen;
287
288 for (idx = 0;idx < usedLen;idx++) {
289 if (charBuffer[idx] > 0xFFFF) { // Non-BMP
290 if (theUsedCharLen + 2 > maxCharLen) return processedByteLen;
291 theUsedCharLen += 2;
292 if (maxCharLen) {
293 charBuffer[idx] = charBuffer[idx] - 0x10000;
bd5b749c
A
294 *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
295 *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
9ce05555
A
296 }
297 } else {
298 if (theUsedCharLen + 1 > maxCharLen) return processedByteLen;
299 ++theUsedCharLen;
300 *(characters++) = charBuffer[idx];
301 }
302 }
303 } else {
304 if (maxCharLen) *(characters++) = character;
305 ++theUsedCharLen;
306 }
307 processedByteLen++;
308 }
309
310 *usedCharLen = theUsedCharLen;
311 return processedByteLen;
312}
313
bd5b749c
A
314static CFIndex __CFToBytesStandardEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
315 CFIndex processedCharLen = 0;
9ce05555 316 uint8_t byte;
bd5b749c 317 CFIndex usedLen;
9ce05555
A
318
319 *usedByteLen = 0;
320
321 while (numChars && (!maxByteLen || (*usedByteLen < maxByteLen))) {
cf7d2af9 322 if (!(usedLen = ((CFStringEncodingStandardEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters, numChars, &byte))) break;
9ce05555
A
323
324 if (maxByteLen) bytes[*usedByteLen] = byte;
325 (*usedByteLen)++;
326 characters += usedLen;
327 numChars -= usedLen;
328 processedCharLen += usedLen;
329 }
330
331 return processedCharLen;
332}
333
bd5b749c
A
334static CFIndex __CFToUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
335 CFIndex processedByteLen = 0;
cf7d2af9 336 UniChar charBuffer[__CFMaximumConvertedLength];
bd5b749c 337 CFIndex usedLen;
9ce05555
A
338
339 *usedCharLen = 0;
340
341 while ((processedByteLen < numBytes) && (!maxCharLen || (*usedCharLen < maxCharLen))) {
cf7d2af9 342 if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break;
9ce05555
A
343
344 if (maxCharLen) {
bd5b749c 345 CFIndex idx;
9ce05555
A
346
347 if (*usedCharLen + usedLen > maxCharLen) break;
348
349 for (idx = 0;idx < usedLen;idx++) {
350 characters[*usedCharLen + idx] = charBuffer[idx];
351 }
352 }
353 *usedCharLen += usedLen;
354 processedByteLen++;
355 }
356
357 return processedByteLen;
358}
359
bd5b749c
A
360static CFIndex __CFToCanonicalUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
361 CFIndex processedByteLen = 0;
cf7d2af9 362 UniChar charBuffer[__CFMaximumConvertedLength];
9ce05555 363 UTF32Char decompBuffer[MAX_DECOMPOSED_LENGTH];
bd5b749c
A
364 CFIndex usedLen;
365 CFIndex decompedLen;
366 CFIndex idx, decompIndex;
9ce05555 367 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
bd5b749c 368 CFIndex theUsedCharLen = 0;
9ce05555
A
369
370 while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
cf7d2af9 371 if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break;
9ce05555
A
372
373 for (idx = 0;idx < usedLen;idx++) {
374 if (CFUniCharIsDecomposableCharacter(charBuffer[idx], isHFSPlus)) {
375 decompedLen = CFUniCharDecomposeCharacter(charBuffer[idx], decompBuffer, MAX_DECOMPOSED_LENGTH);
376 *usedCharLen = theUsedCharLen;
377
378 for (decompIndex = 0;decompIndex < decompedLen;decompIndex++) {
379 if (decompBuffer[decompIndex] > 0xFFFF) { // Non-BMP
380 if (theUsedCharLen + 2 > maxCharLen) return processedByteLen;
381 theUsedCharLen += 2;
382 if (maxCharLen) {
383 charBuffer[idx] = charBuffer[idx] - 0x10000;
384 *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL;
385 *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL;
386 }
387 } else {
388 if (theUsedCharLen + 1 > maxCharLen) return processedByteLen;
389 ++theUsedCharLen;
390 *(characters++) = charBuffer[idx];
391 }
392 }
393 } else {
394 if (maxCharLen) *(characters++) = charBuffer[idx];
395 ++theUsedCharLen;
396 }
397 }
398 processedByteLen++;
399 }
400
401 *usedCharLen = theUsedCharLen;
402 return processedByteLen;
403}
404
bd5b749c
A
405static CFIndex __CFToBytesCheapMultiByteWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
406 CFIndex processedCharLen = 0;
cf7d2af9 407 uint8_t byteBuffer[__CFMaximumConvertedLength];
bd5b749c 408 CFIndex usedLen;
9ce05555
A
409
410 *usedByteLen = 0;
411
412 while ((processedCharLen < numChars) && (!maxByteLen || (*usedByteLen < maxByteLen))) {
cf7d2af9 413 if (!(usedLen = ((CFStringEncodingCheapMultiByteToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], byteBuffer))) break;
9ce05555
A
414
415 if (maxByteLen) {
bd5b749c 416 CFIndex idx;
9ce05555
A
417
418 if (*usedByteLen + usedLen > maxByteLen) break;
419
420 for (idx = 0;idx <usedLen;idx++) {
421 bytes[*usedByteLen + idx] = byteBuffer[idx];
422 }
423 }
424
425 *usedByteLen += usedLen;
426 processedCharLen++;
427 }
428
429 return processedCharLen;
430}
431
bd5b749c
A
432static CFIndex __CFToUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
433 CFIndex processedByteLen = 0;
9ce05555 434 UniChar character;
bd5b749c 435 CFIndex usedLen;
9ce05555
A
436
437 *usedCharLen = 0;
438
439 while (numBytes && (!maxCharLen || (*usedCharLen < maxCharLen))) {
cf7d2af9 440 if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break;
9ce05555
A
441
442 if (maxCharLen) *(characters++) = character;
443 (*usedCharLen)++;
444 processedByteLen += usedLen;
445 bytes += usedLen;
446 numBytes -= usedLen;
447 }
448
449 return processedByteLen;
450}
451
bd5b749c
A
452static CFIndex __CFToCanonicalUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
453 CFIndex processedByteLen = 0;
9ce05555
A
454 UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
455 UniChar character;
bd5b749c
A
456 CFIndex usedLen;
457 CFIndex decomposedLen;
458 CFIndex theUsedCharLen = 0;
9ce05555
A
459 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
460
461 while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
cf7d2af9 462 if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break;
9ce05555
A
463
464 if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
bd5b749c 465 CFIndex idx;
9ce05555
A
466
467 decomposedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
468 *usedCharLen = theUsedCharLen;
469
470 for (idx = 0;idx < decomposedLen;idx++) {
471 if (charBuffer[idx] > 0xFFFF) { // Non-BMP
472 if (theUsedCharLen + 2 > maxCharLen) return processedByteLen;
473 theUsedCharLen += 2;
474 if (maxCharLen) {
475 charBuffer[idx] = charBuffer[idx] - 0x10000;
bd5b749c
A
476 *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
477 *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
9ce05555
A
478 }
479 } else {
480 if (theUsedCharLen + 1 > maxCharLen) return processedByteLen;
481 ++theUsedCharLen;
482 *(characters++) = charBuffer[idx];
483 }
484 }
485 } else {
486 if (maxCharLen) *(characters++) = character;
487 ++theUsedCharLen;
488 }
489
490 processedByteLen += usedLen;
491 bytes += usedLen;
492 numBytes -= usedLen;
493 }
494 *usedCharLen = theUsedCharLen;
495 return processedByteLen;
496}
497
498/* static functions
499*/
cf7d2af9 500CF_INLINE _CFEncodingConverter *__CFEncodingConverterFromDefinition(const CFStringEncodingConverter *definition, CFStringEncoding encoding) {
9ce05555 501#define NUM_OF_ENTRIES_CYCLE (10)
bd5b749c
A
502 static uint32_t _currentIndex = 0;
503 static uint32_t _allocatedSize = 0;
9ce05555
A
504 static _CFEncodingConverter *_allocatedEntries = NULL;
505 _CFEncodingConverter *converter;
506
507
9ce05555
A
508 if ((_currentIndex + 1) >= _allocatedSize) {
509 _currentIndex = 0;
510 _allocatedSize = 0;
511 _allocatedEntries = NULL;
512 }
513 if (_allocatedEntries == NULL) { // Not allocated yet
bd5b749c 514 _allocatedEntries = (_CFEncodingConverter *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(_CFEncodingConverter) * NUM_OF_ENTRIES_CYCLE, 0);
9ce05555
A
515 _allocatedSize = NUM_OF_ENTRIES_CYCLE;
516 converter = &(_allocatedEntries[_currentIndex]);
517 } else {
518 converter = &(_allocatedEntries[++_currentIndex]);
519 }
cf7d2af9
A
520
521 memset(converter, 0, sizeof(_CFEncodingConverter));
522
523 converter->definition = definition;
9ce05555
A
524
525 switch (definition->encodingClass) {
526 case kCFStringEncodingConverterStandard:
cf7d2af9
A
527 converter->toBytes = NULL;
528 converter->toUnicode = NULL;
529 converter->toCanonicalUnicode = NULL;
9ce05555
A
530 break;
531
532 case kCFStringEncodingConverterCheapEightBit:
533 converter->toBytes = __CFToBytesCheapEightBitWrapper;
534 converter->toUnicode = __CFToUnicodeCheapEightBitWrapper;
535 converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapEightBitWrapper;
9ce05555
A
536 break;
537
538 case kCFStringEncodingConverterStandardEightBit:
539 converter->toBytes = __CFToBytesStandardEightBitWrapper;
540 converter->toUnicode = __CFToUnicodeStandardEightBitWrapper;
541 converter->toCanonicalUnicode = __CFToCanonicalUnicodeStandardEightBitWrapper;
9ce05555
A
542 break;
543
544 case kCFStringEncodingConverterCheapMultiByte:
545 converter->toBytes = __CFToBytesCheapMultiByteWrapper;
546 converter->toUnicode = __CFToUnicodeCheapMultiByteWrapper;
547 converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapMultiByteWrapper;
cf7d2af9
A
548 break;
549
856091c5 550#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
cf7d2af9
A
551 case kCFStringEncodingConverterICU:
552 converter->toBytes = (_CFToBytesProc)__CFStringEncodingGetICUName(encoding);
9ce05555 553 break;
856091c5 554#endif
9ce05555
A
555
556 case kCFStringEncodingConverterPlatformSpecific:
cf7d2af9 557 break;
9ce05555
A
558
559 default: // Shouln't be here
560 return NULL;
561 }
562
9ce05555
A
563 converter->toBytesFallback = (definition->toBytesFallback ? definition->toBytesFallback : __CFDefaultToBytesFallbackProc);
564 converter->toUnicodeFallback = (definition->toUnicodeFallback ? definition->toUnicodeFallback : __CFDefaultToUnicodeFallbackProc);
9ce05555
A
565
566 return converter;
567}
568
cf7d2af9
A
569CF_INLINE const CFStringEncodingConverter *__CFStringEncodingConverterGetDefinition(CFStringEncoding encoding) {
570 switch (encoding) {
571 case kCFStringEncodingUTF8:
572 return &__CFConverterUTF8;
573
574 case kCFStringEncodingMacRoman:
575 return &__CFConverterMacRoman;
576
577 case kCFStringEncodingWindowsLatin1:
578 return &__CFConverterWinLatin1;
579
9ce05555
A
580 case kCFStringEncodingASCII:
581 return &__CFConverterASCII;
582
583 case kCFStringEncodingISOLatin1:
584 return &__CFConverterISOLatin1;
585
9ce05555
A
586
587 case kCFStringEncodingNextStepLatin:
588 return &__CFConverterNextStepLatin;
589
9ce05555
A
590
591 default:
cf7d2af9 592 return __CFStringEncodingGetExternalConverter(encoding);
9ce05555
A
593 }
594}
595
bd5b749c 596static const _CFEncodingConverter *__CFGetConverter(uint32_t encoding) {
cf7d2af9
A
597 const _CFEncodingConverter *converter = NULL;
598 const _CFEncodingConverter **commonConverterSlot = NULL;
599 static _CFEncodingConverter *commonConverters[3] = {NULL, NULL, NULL}; // UTF8, MacRoman/WinLatin1, and the default encoding*
600 static CFMutableDictionaryRef mappingTable = NULL;
d7384798 601 static OSSpinLock lock = OS_SPINLOCK_INIT;
cf7d2af9
A
602
603 switch (encoding) {
604 case kCFStringEncodingUTF8: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[0]); break;
605
606 /* the swith here should avoid possible bootstrap issues in the default: case below when invoked from CFStringGetSystemEncoding() */
856091c5 607#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_EMBEDDED_MINI || DEPLOYMENT_TARGET_LINUX
cf7d2af9
A
608 case kCFStringEncodingMacRoman: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[1]); break;
609#elif DEPLOYMENT_TARGET_WINDOWS
610 case kCFStringEncodingWindowsLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break;
611#else
612#warning This case must match __defaultEncoding value defined in CFString.c
613 case kCFStringEncodingISOLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break;
856091c5 614#endif
cf7d2af9
A
615
616 default: if (CFStringGetSystemEncoding() == encoding) commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[2]); break;
617 }
618
d7384798 619 OSSpinLockLock(&lock);
cf7d2af9 620 converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot);
d7384798 621 OSSpinLockUnlock(&lock);
9ce05555 622
cf7d2af9
A
623 if (NULL == converter) {
624 const CFStringEncodingConverter *definition = __CFStringEncodingConverterGetDefinition(encoding);
9ce05555 625
cf7d2af9 626 if (NULL != definition) {
d7384798 627 OSSpinLockLock(&lock);
cf7d2af9 628 converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot);
9ce05555 629
cf7d2af9
A
630 if (NULL == converter) {
631 converter = __CFEncodingConverterFromDefinition(definition, encoding);
632
633 if (NULL == commonConverterSlot) {
634 if (NULL == mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, NULL, NULL);
635
636 CFDictionarySetValue(mappingTable, (const void *)(uintptr_t)encoding, converter);
637 } else {
638 *commonConverterSlot = converter;
639 }
640 }
d7384798 641 OSSpinLockUnlock(&lock);
9ce05555
A
642 }
643 }
644
cf7d2af9 645 return converter;
9ce05555
A
646}
647
648/* Public API
649*/
bd5b749c 650uint32_t CFStringEncodingUnicodeToBytes(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
9ce05555
A
651 if (encoding == kCFStringEncodingUTF8) {
652 static CFStringEncodingToBytesProc __CFToUTF8 = NULL;
bd5b749c
A
653 CFIndex convertedCharLen;
654 CFIndex usedLen;
9ce05555
A
655
656
657 if ((flags & kCFStringEncodingUseCanonical) || (flags & kCFStringEncodingUseHFSPlusCanonical)) {
658 (void)CFUniCharDecompose(characters, numChars, &convertedCharLen, (void *)bytes, maxByteLen, &usedLen, true, kCFUniCharUTF8Format, (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false));
659 } else {
660 if (!__CFToUTF8) {
661 const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
662 __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes;
663 }
bd5b749c 664 convertedCharLen = __CFToUTF8(0, characters, numChars, bytes, maxByteLen, &usedLen);
9ce05555
A
665 }
666 if (usedCharLen) *usedCharLen = convertedCharLen;
667 if (usedByteLen) *usedByteLen = usedLen;
668
669 if (convertedCharLen == numChars) {
670 return kCFStringEncodingConversionSuccess;
cf7d2af9
A
671 } else if ((maxByteLen > 0) && ((maxByteLen - usedLen) < 10)) { // could be filled outbuf
672 UTF16Char character = characters[convertedCharLen];
673
674 if (((character >= kSurrogateLowStart) && (character <= kSurrogateLowEnd)) || ((character >= kSurrogateHighStart) && (character <= kSurrogateHighEnd) && ((1 == (numChars - convertedCharLen)) || (characters[convertedCharLen + 1] < kSurrogateLowStart) || (characters[convertedCharLen + 1] > kSurrogateLowEnd)))) return kCFStringEncodingInvalidInputStream;
675
9ce05555
A
676 return kCFStringEncodingInsufficientOutputBufferLength;
677 } else {
678 return kCFStringEncodingInvalidInputStream;
679 }
680 } else {
681 const _CFEncodingConverter *converter = __CFGetConverter(encoding);
bd5b749c
A
682 CFIndex usedLen = 0;
683 CFIndex localUsedByteLen;
684 CFIndex theUsedByteLen = 0;
685 uint32_t theResult = kCFStringEncodingConversionSuccess;
9ce05555
A
686 CFStringEncodingToBytesPrecomposeProc toBytesPrecompose = NULL;
687 CFStringEncodingIsValidCombiningCharacterProc isValidCombiningChar = NULL;
688
689 if (!converter) return kCFStringEncodingConverterUnavailable;
690
691 if (flags & kCFStringEncodingSubstituteCombinings) {
cf7d2af9 692 if (!(flags & kCFStringEncodingAllowLossyConversion)) isValidCombiningChar = converter->definition->isValidCombiningChar;
9ce05555 693 } else {
cf7d2af9 694 isValidCombiningChar = converter->definition->isValidCombiningChar;
9ce05555 695 if (!(flags & kCFStringEncodingIgnoreCombinings)) {
cf7d2af9 696 toBytesPrecompose = converter->definition->toBytesPrecompose;
9ce05555
A
697 flags |= kCFStringEncodingComposeCombinings;
698 }
699 }
700
856091c5 701#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
cf7d2af9 702 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToBytes((const char *)converter->toBytes, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen);
856091c5 703#endif
cf7d2af9
A
704
705 /* Platform converter */
706 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformUnicodeToBytes(encoding, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen);
9ce05555
A
707
708 while ((usedLen < numChars) && (!maxByteLen || (theUsedByteLen < maxByteLen))) {
709 if ((usedLen += TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) < numChars) {
bd5b749c 710 CFIndex dummy;
9ce05555
A
711
712 if (isValidCombiningChar && (usedLen > 0) && isValidCombiningChar(characters[usedLen])) {
713 if (toBytesPrecompose) {
bd5b749c 714 CFIndex localUsedLen = usedLen;
9ce05555
A
715
716 while (isValidCombiningChar(characters[--usedLen]));
717 theUsedByteLen += localUsedByteLen;
cf7d2af9 718 if (converter->definition->maxBytesPerChar > 1) {
9ce05555
A
719 TO_BYTE(converter, flags, characters + usedLen, localUsedLen - usedLen, NULL, 0, &localUsedByteLen);
720 theUsedByteLen -= localUsedByteLen;
721 } else {
722 theUsedByteLen--;
723 }
724 if ((localUsedLen = toBytesPrecompose(flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) > 0) {
725 usedLen += localUsedLen;
726 if ((usedLen < numChars) && isValidCombiningChar(characters[usedLen])) { // There is a non-base char not combined remaining
727 theUsedByteLen += localUsedByteLen;
728 theResult = kCFStringEncodingInvalidInputStream;
729 break;
730 }
731 } else if (flags & kCFStringEncodingAllowLossyConversion) {
732 uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
733
734 if (lossyByte) {
cf7d2af9 735 while (isValidCombiningChar(characters[++usedLen]));
9ce05555
A
736 localUsedByteLen = 1;
737 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
738 } else {
739 ++usedLen;
740 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
741 }
742 } else {
743 theResult = kCFStringEncodingInvalidInputStream;
744 break;
745 }
746 } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up
747 theUsedByteLen += localUsedByteLen;
748 theResult = kCFStringEncodingInsufficientOutputBufferLength;
749 break;
750 } else if (flags & kCFStringEncodingIgnoreCombinings) {
751 while ((++usedLen < numChars) && isValidCombiningChar(characters[usedLen]));
752 } else {
753 uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
754
755 theUsedByteLen += localUsedByteLen;
756 if (lossyByte) {
757 ++usedLen;
758 localUsedByteLen = 1;
759 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
760 } else {
761 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
762 }
763 }
764 } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up
765 theUsedByteLen += localUsedByteLen;
766
767 if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) {
bd5b749c 768 CFIndex localUsedLen;
9ce05555
A
769
770 localUsedByteLen = 0;
db04bbf9
A
771 // after the buffer is full, we still try out all the rest of the characters
772 // if all characters cannot be converted, we mark the result as insufficient output buffer
773 while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) {
774 if (localUsedByteLen == 0) {
775 usedLen += localUsedLen;
776 }
777 }
9ce05555
A
778 }
779 if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength;
780 break;
781 } else if (flags & kCFStringEncodingAllowLossyConversion) {
782 uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
783
784 theUsedByteLen += localUsedByteLen;
785 if (lossyByte) {
786 ++usedLen;
787 localUsedByteLen = 1;
788 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
789 } else {
790 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
791 }
792 } else {
793 theUsedByteLen += localUsedByteLen;
794 theResult = kCFStringEncodingInvalidInputStream;
795 break;
796 }
797 }
798 theUsedByteLen += localUsedByteLen;
799 }
800
801 if (usedLen < numChars && maxByteLen && theResult == kCFStringEncodingConversionSuccess) {
802 if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) {
bd5b749c 803 CFIndex localUsedLen;
9ce05555
A
804
805 localUsedByteLen = 0;
db04bbf9
A
806 while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) {
807 if (!localUsedByteLen) {
808 usedLen += localUsedLen;
809 }
810 }
9ce05555
A
811 }
812 if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength;
813 }
814 if (usedByteLen) *usedByteLen = theUsedByteLen;
815 if (usedCharLen) *usedCharLen = usedLen;
816
817 return theResult;
818 }
819}
820
bd5b749c 821uint32_t CFStringEncodingBytesToUnicode(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
9ce05555 822 const _CFEncodingConverter *converter = __CFGetConverter(encoding);
bd5b749c
A
823 CFIndex usedLen = 0;
824 CFIndex theUsedCharLen = 0;
825 CFIndex localUsedCharLen;
826 uint32_t theResult = kCFStringEncodingConversionSuccess;
9ce05555
A
827
828 if (!converter) return kCFStringEncodingConverterUnavailable;
829
856091c5 830#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
cf7d2af9 831 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToUnicode((const char *)converter->toBytes, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen);
856091c5 832#endif
cf7d2af9
A
833
834 /* Platform converter */
835 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformBytesToUnicode(encoding, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen);
9ce05555
A
836
837 while ((usedLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
838 if ((usedLen += TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen)) < numBytes) {
bd5b749c 839 CFIndex tempUsedCharLen;
9ce05555 840
bd5b749c 841 if (maxCharLen && ((maxCharLen == theUsedCharLen + localUsedCharLen) || (((flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) || (maxCharLen == theUsedCharLen + localUsedCharLen + 1)) && TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, NULL, 0, &tempUsedCharLen)))) { // buffer was filled up
9ce05555
A
842 theUsedCharLen += localUsedCharLen;
843 theResult = kCFStringEncodingInsufficientOutputBufferLength;
844 break;
845 } else if (flags & kCFStringEncodingAllowLossyConversion) {
846 theUsedCharLen += localUsedCharLen;
847 usedLen += TO_UNICODE_FALLBACK(converter, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen);
848 } else {
849 theUsedCharLen += localUsedCharLen;
850 theResult = kCFStringEncodingInvalidInputStream;
851 break;
852 }
853 }
854 theUsedCharLen += localUsedCharLen;
855 }
856
857 if (usedLen < numBytes && maxCharLen && theResult == kCFStringEncodingConversionSuccess) {
858 theResult = kCFStringEncodingInsufficientOutputBufferLength;
859 }
860 if (usedCharLen) *usedCharLen = theUsedCharLen;
861 if (usedByteLen) *usedByteLen = usedLen;
862
863 return theResult;
864}
865
a48904a4 866CF_PRIVATE bool CFStringEncodingIsValidEncoding(uint32_t encoding) {
9ce05555
A
867 return (CFStringEncodingGetConverter(encoding) ? true : false);
868}
869
a48904a4 870CF_PRIVATE CFIndex CFStringEncodingCharLengthForBytes(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
cf7d2af9 871 const _CFEncodingConverter *converter = __CFGetConverter(encoding);
9ce05555 872
cf7d2af9 873 if (converter) {
856091c5 874#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
cf7d2af9 875 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUCharLength((const char *)converter->toBytes, flags, bytes, numBytes);
856091c5
A
876#endif
877
cf7d2af9 878 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformCharLengthForBytes(encoding, flags, bytes, numBytes);
9ce05555 879
cf7d2af9 880 if (1 == converter->definition->maxBytesPerChar) return numBytes;
9ce05555 881
cf7d2af9
A
882 if (NULL == converter->definition->toUnicodeLen) {
883 CFIndex usedByteLen = 0;
884 CFIndex totalLength = 0;
885 CFIndex usedCharLen;
9ce05555 886
cf7d2af9
A
887 while (numBytes > 0) {
888 usedByteLen = TO_UNICODE(converter, flags, bytes, numBytes, NULL, 0, &usedCharLen);
889
890 bytes += usedByteLen;
891 numBytes -= usedByteLen;
892 totalLength += usedCharLen;
893
894 if (numBytes > 0) {
895 if (0 == (flags & kCFStringEncodingAllowLossyConversion)) return 0;
9ce05555 896
cf7d2af9
A
897 usedByteLen = TO_UNICODE_FALLBACK(converter, bytes, numBytes, NULL, 0, &usedCharLen);
898
899 bytes += usedByteLen;
900 numBytes -= usedByteLen;
901 totalLength += usedCharLen;
902 }
903 }
904
905 return totalLength;
bd5b749c 906 } else {
cf7d2af9
A
907 return converter->definition->toUnicodeLen(flags, bytes, numBytes);
908 }
9ce05555
A
909 }
910
911 return 0;
912}
913
a48904a4 914CF_PRIVATE CFIndex CFStringEncodingByteLengthForCharacters(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars) {
9ce05555
A
915 const _CFEncodingConverter *converter = __CFGetConverter(encoding);
916
917 if (converter) {
856091c5 918#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
cf7d2af9 919 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUByteLength((const char *)converter->toBytes, flags, characters, numChars);
856091c5 920#endif
cf7d2af9
A
921
922 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformByteLengthForCharacters(encoding, flags, characters, numChars);
923
924 if (1 == converter->definition->maxBytesPerChar) return numChars;
925
926 if (NULL == converter->definition->toBytesLen) {
8ca704e1 927 CFIndex usedByteLen;
9ce05555 928
8ca704e1 929 return ((kCFStringEncodingConversionSuccess == CFStringEncodingUnicodeToBytes(encoding, flags, characters, numChars, NULL, NULL, 0, &usedByteLen)) ? usedByteLen : 0);
bd5b749c 930 } else {
cf7d2af9
A
931 return converter->definition->toBytesLen(flags, characters, numChars);
932 }
9ce05555
A
933 }
934
935 return 0;
936}
937
a48904a4 938void CFStringEncodingRegisterFallbackProcedures(uint32_t encoding, CFStringEncodingToBytesFallbackProc toBytes, CFStringEncodingToUnicodeFallbackProc toUnicode) {
cf7d2af9
A
939 _CFEncodingConverter *converter = (_CFEncodingConverter *)__CFGetConverter(encoding);
940
941 if (NULL != converter) {
942 const CFStringEncodingConverter *body = CFStringEncodingGetConverter(encoding);
9ce05555 943
cf7d2af9
A
944 converter->toBytesFallback = ((NULL == toBytes) ? ((NULL == body) ? __CFDefaultToBytesFallbackProc : body->toBytesFallback) : toBytes);
945 converter->toUnicodeFallback = ((NULL == toUnicode) ? ((NULL == body) ? __CFDefaultToUnicodeFallbackProc : body->toUnicodeFallback) : toUnicode);
9ce05555
A
946 }
947}
948
a48904a4 949CF_PRIVATE const CFStringEncodingConverter *CFStringEncodingGetConverter(uint32_t encoding) {
cf7d2af9
A
950 const _CFEncodingConverter *converter = __CFGetConverter(encoding);
951
952 return ((NULL == converter) ? NULL : converter->definition);
9ce05555
A
953}
954
cf7d2af9 955static const CFStringEncoding __CFBuiltinEncodings[] = {
9ce05555
A
956 kCFStringEncodingMacRoman,
957 kCFStringEncodingWindowsLatin1,
958 kCFStringEncodingISOLatin1,
959 kCFStringEncodingNextStepLatin,
960 kCFStringEncodingASCII,
961 kCFStringEncodingUTF8,
d8925383 962 /* These seven are available only in CFString-level */
9ce05555 963 kCFStringEncodingNonLossyASCII,
d8925383
A
964
965 kCFStringEncodingUTF16,
966 kCFStringEncodingUTF16BE,
967 kCFStringEncodingUTF16LE,
968
969 kCFStringEncodingUTF32,
970 kCFStringEncodingUTF32BE,
971 kCFStringEncodingUTF32LE,
972
9ce05555
A
973 kCFStringEncodingInvalidId,
974};
975
cf7d2af9
A
976static CFComparisonResult __CFStringEncodingComparator(const void *v1, const void *v2, void *context) {
977 CFComparisonResult val1 = (*(const CFStringEncoding *)v1) & 0xFFFF;
978 CFComparisonResult val2 = (*(const CFStringEncoding *)v2) & 0xFFFF;
9ce05555 979
cf7d2af9 980 return ((val1 == val2) ? ((CFComparisonResult)(*(const CFStringEncoding *)v1) - (CFComparisonResult)(*(const CFStringEncoding *)v2)) : val1 - val2);
9ce05555
A
981}
982
cf7d2af9
A
983static void __CFStringEncodingFliterDupes(CFStringEncoding *encodings, CFIndex numSlots) {
984 CFStringEncoding last = kCFStringEncodingInvalidId;
985 const CFStringEncoding *limitEncodings = encodings + numSlots;
986
987 while (encodings < limitEncodings) {
988 if (last == *encodings) {
989 if ((encodings + 1) < limitEncodings) memmove(encodings, encodings + 1, sizeof(CFStringEncoding) * (limitEncodings - encodings - 1));
990 --limitEncodings;
991 } else {
992 last = *(encodings++);
993 }
994 }
995}
996
a48904a4 997CF_PRIVATE const CFStringEncoding *CFStringEncodingListOfAvailableEncodings(void) {
cf7d2af9
A
998 static const CFStringEncoding *encodings = NULL;
999
1000 if (NULL == encodings) {
1001 CFStringEncoding *list = (CFStringEncoding *)__CFBuiltinEncodings;
1002 CFIndex numICUConverters = 0, numPlatformConverters = 0;
856091c5 1003#if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX
cf7d2af9 1004 CFStringEncoding *icuConverters = __CFStringEncodingCreateICUEncodings(NULL, &numICUConverters);
856091c5
A
1005#else
1006 CFStringEncoding *icuConverters = NULL;
1007#endif
cf7d2af9
A
1008 CFStringEncoding *platformConverters = __CFStringEncodingCreateListOfAvailablePlatformConverters(NULL, &numPlatformConverters);
1009
1010 if ((NULL != icuConverters) || (NULL != platformConverters)) {
1011 CFIndex numSlots = (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters + numPlatformConverters;
1012
1013 list = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * numSlots, 0);
1014
1015 memcpy(list, __CFBuiltinEncodings, sizeof(__CFBuiltinEncodings));
1016
1017 if (NULL != icuConverters) {
1018 memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)), icuConverters, sizeof(CFStringEncoding) * numICUConverters);
1019 CFAllocatorDeallocate(NULL, icuConverters);
1020 }
1021
1022 if (NULL != platformConverters) {
1023 memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters, platformConverters, sizeof(CFStringEncoding) * numPlatformConverters);
1024 CFAllocatorDeallocate(NULL, platformConverters);
1025 }
1026
1027 CFQSortArray(list, numSlots, sizeof(CFStringEncoding), (CFComparatorFunction)__CFStringEncodingComparator, NULL);
1028 __CFStringEncodingFliterDupes(list, numSlots);
1029 }
1030 if (!OSAtomicCompareAndSwapPtrBarrier(NULL, list, (void * volatile *)&encodings) && (list != __CFBuiltinEncodings)) CFAllocatorDeallocate(NULL, list);
1031 }
1032
1033 return encodings;
1034}
bd5b749c
A
1035
/* Retire the helper macros at the end of the file (those not used in the code above are presumably defined earlier in this file). */

/* Converter dispatch helpers */
#undef TO_BYTE
#undef TO_BYTE_FALLBACK
#undef TO_UNICODE
#undef TO_UNICODE_FALLBACK

/* Surrogate range constants */
#undef kSurrogateHighStart
#undef kSurrogateHighEnd
#undef kSurrogateLowStart
#undef kSurrogateLowEnd

/* Miscellaneous */
#undef ASCIINewLine
#undef EXTRA_BASE
#undef NUM_OF_ENTRIES_CYCLE
1047