]> git.saurik.com Git - apple/cf.git/blob - CFStringEncodingConverter.c
52b24765c5c74646592650c278cf6d3e42e85d52
[apple/cf.git] / CFStringEncodingConverter.c
1 /*
2 * Copyright (c) 2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* CFStringEncodingConverter.c
24 Copyright (c) 1998-2009, Apple Inc. All rights reserved.
25 Responsibility: Aki Inoue
26 */
27
28 #include "CFInternal.h"
29 #include <CoreFoundation/CFArray.h>
30 #include <CoreFoundation/CFDictionary.h>
31 #include "CFICUConverters.h"
32 #include <CoreFoundation/CFUniChar.h>
33 #include <CoreFoundation/CFPriv.h>
34 #include "CFUnicodeDecomposition.h"
35 #include "CFStringEncodingConverterExt.h"
36 #include "CFStringEncodingConverterPriv.h"
37 #include <stdlib.h>
38 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
39 #include <pthread.h>
40 #endif
41
42 typedef CFIndex (*_CFToBytesProc)(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen);
43 typedef CFIndex (*_CFToUnicodeProc)(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen);
44
45 typedef struct {
46 const CFStringEncodingConverter *definition;
47 _CFToBytesProc toBytes;
48 _CFToUnicodeProc toUnicode;
49 _CFToUnicodeProc toCanonicalUnicode;
50 CFStringEncodingToBytesFallbackProc toBytesFallback;
51 CFStringEncodingToUnicodeFallbackProc toUnicodeFallback;
52 } _CFEncodingConverter;
53
/* Macros
*/
/* TO_BYTE: convert Unicode to bytes through `conv`.  Uses the wrapper stored in
   conv->toBytes when there is one, otherwise calls the definition's own toBytes
   with the standard prototype.  Every macro parameter is parenthesized so that
   compound argument expressions (e.g. `a | b` for flags) expand safely.
   Note: `conv` is still evaluated more than once. */
#define TO_BYTE(conv,flags,chars,numChars,bytes,max,used) ((conv)->toBytes ? (conv)->toBytes((conv),(flags),(chars),(numChars),(bytes),(max),(used)) : ((CFStringEncodingToBytesProc)(conv)->definition->toBytes)((flags),(chars),(numChars),(bytes),(max),(used)))
/* TO_UNICODE: convert bytes to Unicode through `conv`.  When a wrapper exists,
   the canonical variant is chosen if either canonical flag is set in `flags`;
   without a wrapper the definition's own toUnicode is called directly. */
#define TO_UNICODE(conv,flags,bytes,numBytes,chars,max,used) ((conv)->toUnicode ? (((flags) & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) ? (conv)->toCanonicalUnicode((conv),(flags),(bytes),(numBytes),(chars),(max),(used)) : (conv)->toUnicode((conv),(flags),(bytes),(numBytes),(chars),(max),(used))) : ((CFStringEncodingToUnicodeProc)(conv)->definition->toUnicode)((flags),(bytes),(numBytes),(chars),(max),(used)))

#define ASCIINewLine 0x0a
/* UTF-16 surrogate ranges */
#define kSurrogateHighStart 0xD800
#define kSurrogateHighEnd 0xDBFF
#define kSurrogateLowStart 0xDC00
#define kSurrogateLowEnd 0xDFFF

/* Size of the scratch buffers the wrappers use for one converted character */
static const uint8_t __CFMaximumConvertedLength = 20;
66
/* Mapping U+00A0..U+00FF to lossy ASCII.
   Entry i describes character 0xA0 + i.  Each entry holds up to four ASCII
   bytes and is zero-terminated when shorter; an all-zero entry means the
   character has no replacement.
   The case of the replacement follows the case of the source letter
   (CAPITAL THORN -> "TH", SMALL ETH -> "th").
*/
static const struct {
    unsigned char chars[4];
} _toLossyASCIITable[] = {
    {{' ', 0, 0, 0}}, // NO-BREAK SPACE
    {{'!', 0, 0, 0}}, // INVERTED EXCLAMATION MARK
    {{'c', 0, 0, 0}}, // CENT SIGN
    {{'L', 0, 0, 0}}, // POUND SIGN
    {{'$', 0, 0, 0}}, // CURRENCY SIGN
    {{'Y', 0, 0, 0}}, // YEN SIGN
    {{'|', 0, 0, 0}}, // BROKEN BAR
    {{0, 0, 0, 0}}, // SECTION SIGN
    {{0, 0, 0, 0}}, // DIAERESIS
    {{'(', 'C', ')', 0}}, // COPYRIGHT SIGN
    {{'a', 0, 0, 0}}, // FEMININE ORDINAL INDICATOR
    {{'<', '<', 0, 0}}, // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    {{0, 0, 0, 0}}, // NOT SIGN
    {{'-', 0, 0, 0}}, // SOFT HYPHEN
    {{'(', 'R', ')', 0}}, // REGISTERED SIGN
    {{0, 0, 0, 0}}, // MACRON
    {{0, 0, 0, 0}}, // DEGREE SIGN
    {{'+', '-', 0, 0}}, // PLUS-MINUS SIGN
    {{'2', 0, 0, 0}}, // SUPERSCRIPT TWO
    {{'3', 0, 0, 0}}, // SUPERSCRIPT THREE
    {{0, 0, 0, 0}}, // ACUTE ACCENT
    {{0, 0, 0, 0}}, // MICRO SIGN
    {{0, 0, 0, 0}}, // PILCROW SIGN
    {{0, 0, 0, 0}}, // MIDDLE DOT
    {{0, 0, 0, 0}}, // CEDILLA
    {{'1', 0, 0, 0}}, // SUPERSCRIPT ONE
    {{'o', 0, 0, 0}}, // MASCULINE ORDINAL INDICATOR
    {{'>', '>', 0, 0}}, // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    {{'1', '/', '4', 0}}, // VULGAR FRACTION ONE QUARTER
    {{'1', '/', '2', 0}}, // VULGAR FRACTION ONE HALF
    {{'3', '/', '4', 0}}, // VULGAR FRACTION THREE QUARTERS
    {{'?', 0, 0, 0}}, // INVERTED QUESTION MARK
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH GRAVE
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH ACUTE
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH TILDE
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH DIAERESIS
    {{'A', 0, 0, 0}}, // LATIN CAPITAL LETTER A WITH RING ABOVE
    {{'A', 'E', 0, 0}}, // LATIN CAPITAL LETTER AE
    {{'C', 0, 0, 0}}, // LATIN CAPITAL LETTER C WITH CEDILLA
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH GRAVE
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH ACUTE
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    {{'E', 0, 0, 0}}, // LATIN CAPITAL LETTER E WITH DIAERESIS
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH GRAVE
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH ACUTE
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
    {{'I', 0, 0, 0}}, // LATIN CAPITAL LETTER I WITH DIAERESIS
    {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER ETH (Icelandic)
    {{'N', 0, 0, 0}}, // LATIN CAPITAL LETTER N WITH TILDE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH GRAVE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH ACUTE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH TILDE
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH DIAERESIS
    {{'X', 0, 0, 0}}, // MULTIPLICATION SIGN
    {{'O', 0, 0, 0}}, // LATIN CAPITAL LETTER O WITH STROKE
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH GRAVE
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH ACUTE
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
    {{'U', 0, 0, 0}}, // LATIN CAPITAL LETTER U WITH DIAERESIS
    {{'Y', 0, 0, 0}}, // LATIN CAPITAL LETTER Y WITH ACUTE
    {{'T', 'H', 0, 0}}, // LATIN CAPITAL LETTER THORN (Icelandic)
    {{'s', 0, 0, 0}}, // LATIN SMALL LETTER SHARP S (German)
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH GRAVE
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH ACUTE
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH CIRCUMFLEX
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH TILDE
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH DIAERESIS
    {{'a', 0, 0, 0}}, // LATIN SMALL LETTER A WITH RING ABOVE
    {{'a', 'e', 0, 0}}, // LATIN SMALL LETTER AE
    {{'c', 0, 0, 0}}, // LATIN SMALL LETTER C WITH CEDILLA
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH GRAVE
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH ACUTE
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH CIRCUMFLEX
    {{'e', 0, 0, 0}}, // LATIN SMALL LETTER E WITH DIAERESIS
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH GRAVE
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH ACUTE
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH CIRCUMFLEX
    {{'i', 0, 0, 0}}, // LATIN SMALL LETTER I WITH DIAERESIS
    {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER ETH (Icelandic)
    {{'n', 0, 0, 0}}, // LATIN SMALL LETTER N WITH TILDE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH GRAVE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH ACUTE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH CIRCUMFLEX
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH TILDE
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH DIAERESIS
    {{'/', 0, 0, 0}}, // DIVISION SIGN
    {{'o', 0, 0, 0}}, // LATIN SMALL LETTER O WITH STROKE
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH GRAVE
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH ACUTE
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH CIRCUMFLEX
    {{'u', 0, 0, 0}}, // LATIN SMALL LETTER U WITH DIAERESIS
    {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH ACUTE
    {{'t', 'h', 0, 0}}, // LATIN SMALL LETTER THORN (Icelandic)
    {{'y', 0, 0, 0}}, // LATIN SMALL LETTER Y WITH DIAERESIS
};
169
170 CF_INLINE CFIndex __CFToASCIILatin1Fallback(UniChar character, uint8_t *bytes, CFIndex maxByteLen) {
171 const uint8_t *losChars = (const uint8_t*)_toLossyASCIITable + (character - 0xA0) * sizeof(uint8_t[4]);
172 CFIndex numBytes = 0;
173 CFIndex idx, max = (maxByteLen && (maxByteLen < 4) ? maxByteLen : 4);
174
175 for (idx = 0;idx < max;idx++) {
176 if (losChars[idx]) {
177 if (maxByteLen) bytes[idx] = losChars[idx];
178 ++numBytes;
179 } else {
180 break;
181 }
182 }
183
184 return numBytes;
185 }
186
187 static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
188 CFIndex processCharLen = 1, filledBytesLen = 1;
189 uint8_t byte = '?';
190
191 if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range
192 byte = (uint8_t)(*characters - 0x80);
193 } else if (*characters < 0x100) {
194 *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen);
195 return 1;
196 } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) {
197 processCharLen = (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1);
198 } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) {
199 byte = ' ';
200 } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) {
201 byte = ASCIINewLine;
202 } else if (*characters == 0x2026) { // ellipsis
203 if (0 == maxByteLen) {
204 filledBytesLen = 3;
205 } else if (maxByteLen > 2) {
206 memset(bytes, '.', 3);
207 *usedByteLen = 3;
208 return processCharLen;
209 }
210 } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) {
211 UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
212
213 (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH);
214 if (*decomposed < 0x80) {
215 byte = (uint8_t)(*decomposed);
216 } else {
217 UTF16Char theChar = *decomposed;
218
219 return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen);
220 }
221 }
222
223 if (maxByteLen) *bytes = byte;
224 *usedByteLen = filledBytesLen;
225 return processCharLen;
226 }
227
228 static CFIndex __CFDefaultToUnicodeFallbackProc(const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
229 if (maxCharLen) *characters = (UniChar)'?';
230 *usedCharLen = 1;
231 return 1;
232 }
233
/* Invoke the fallback procs stored in the converter record.  Parameters are
   parenthesized so that compound argument expressions expand safely. */
#define TO_BYTE_FALLBACK(conv,chars,numChars,bytes,max,used) ((conv)->toBytesFallback((chars),(numChars),(bytes),(max),(used)))
#define TO_UNICODE_FALLBACK(conv,bytes,numBytes,chars,max,used) ((conv)->toUnicodeFallback((bytes),(numBytes),(chars),(max),(used)))

#define EXTRA_BASE (0x0F00)
238
239 /* Wrapper funcs for non-standard converters
240 */
241 static CFIndex __CFToBytesCheapEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
242 CFIndex processedCharLen = 0;
243 CFIndex length = (maxByteLen && (maxByteLen < numChars) ? maxByteLen : numChars);
244 uint8_t byte;
245
246 while (processedCharLen < length) {
247 if (!((CFStringEncodingCheapEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], &byte)) break;
248
249 if (maxByteLen) bytes[processedCharLen] = byte;
250 processedCharLen++;
251 }
252
253 *usedByteLen = processedCharLen;
254 return processedCharLen;
255 }
256
257 static CFIndex __CFToUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
258 CFIndex processedByteLen = 0;
259 CFIndex length = (maxCharLen && (maxCharLen < numBytes) ? maxCharLen : numBytes);
260 UniChar character;
261
262 while (processedByteLen < length) {
263 if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break;
264
265 if (maxCharLen) characters[processedByteLen] = character;
266 processedByteLen++;
267 }
268
269 *usedCharLen = processedByteLen;
270 return processedByteLen;
271 }
272
273 static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
274 CFIndex processedByteLen = 0;
275 CFIndex theUsedCharLen = 0;
276 UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
277 CFIndex usedLen;
278 UniChar character;
279 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
280
281 while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
282 if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break;
283
284 if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
285 CFIndex idx;
286
287 usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
288 *usedCharLen = theUsedCharLen;
289
290 for (idx = 0;idx < usedLen;idx++) {
291 if (charBuffer[idx] > 0xFFFF) { // Non-BMP
292 if (theUsedCharLen + 2 > maxCharLen) return processedByteLen;
293 theUsedCharLen += 2;
294 if (maxCharLen) {
295 charBuffer[idx] = charBuffer[idx] - 0x10000;
296 *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
297 *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
298 }
299 } else {
300 if (theUsedCharLen + 1 > maxCharLen) return processedByteLen;
301 ++theUsedCharLen;
302 *(characters++) = charBuffer[idx];
303 }
304 }
305 } else {
306 if (maxCharLen) *(characters++) = character;
307 ++theUsedCharLen;
308 }
309 processedByteLen++;
310 }
311
312 *usedCharLen = theUsedCharLen;
313 return processedByteLen;
314 }
315
316 static CFIndex __CFToBytesStandardEightBitWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
317 CFIndex processedCharLen = 0;
318 uint8_t byte;
319 CFIndex usedLen;
320
321 *usedByteLen = 0;
322
323 while (numChars && (!maxByteLen || (*usedByteLen < maxByteLen))) {
324 if (!(usedLen = ((CFStringEncodingStandardEightBitToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters, numChars, &byte))) break;
325
326 if (maxByteLen) bytes[*usedByteLen] = byte;
327 (*usedByteLen)++;
328 characters += usedLen;
329 numChars -= usedLen;
330 processedCharLen += usedLen;
331 }
332
333 return processedCharLen;
334 }
335
336 static CFIndex __CFToUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
337 CFIndex processedByteLen = 0;
338 UniChar charBuffer[__CFMaximumConvertedLength];
339 CFIndex usedLen;
340
341 *usedCharLen = 0;
342
343 while ((processedByteLen < numBytes) && (!maxCharLen || (*usedCharLen < maxCharLen))) {
344 if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break;
345
346 if (maxCharLen) {
347 CFIndex idx;
348
349 if (*usedCharLen + usedLen > maxCharLen) break;
350
351 for (idx = 0;idx < usedLen;idx++) {
352 characters[*usedCharLen + idx] = charBuffer[idx];
353 }
354 }
355 *usedCharLen += usedLen;
356 processedByteLen++;
357 }
358
359 return processedByteLen;
360 }
361
362 static CFIndex __CFToCanonicalUnicodeStandardEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
363 CFIndex processedByteLen = 0;
364 UniChar charBuffer[__CFMaximumConvertedLength];
365 UTF32Char decompBuffer[MAX_DECOMPOSED_LENGTH];
366 CFIndex usedLen;
367 CFIndex decompedLen;
368 CFIndex idx, decompIndex;
369 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
370 CFIndex theUsedCharLen = 0;
371
372 while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
373 if (!(usedLen = ((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], charBuffer))) break;
374
375 for (idx = 0;idx < usedLen;idx++) {
376 if (CFUniCharIsDecomposableCharacter(charBuffer[idx], isHFSPlus)) {
377 decompedLen = CFUniCharDecomposeCharacter(charBuffer[idx], decompBuffer, MAX_DECOMPOSED_LENGTH);
378 *usedCharLen = theUsedCharLen;
379
380 for (decompIndex = 0;decompIndex < decompedLen;decompIndex++) {
381 if (decompBuffer[decompIndex] > 0xFFFF) { // Non-BMP
382 if (theUsedCharLen + 2 > maxCharLen) return processedByteLen;
383 theUsedCharLen += 2;
384 if (maxCharLen) {
385 charBuffer[idx] = charBuffer[idx] - 0x10000;
386 *(characters++) = (charBuffer[idx] >> 10) + 0xD800UL;
387 *(characters++) = (charBuffer[idx] & 0x3FF) + 0xDC00UL;
388 }
389 } else {
390 if (theUsedCharLen + 1 > maxCharLen) return processedByteLen;
391 ++theUsedCharLen;
392 *(characters++) = charBuffer[idx];
393 }
394 }
395 } else {
396 if (maxCharLen) *(characters++) = charBuffer[idx];
397 ++theUsedCharLen;
398 }
399 }
400 processedByteLen++;
401 }
402
403 *usedCharLen = theUsedCharLen;
404 return processedByteLen;
405 }
406
407 static CFIndex __CFToBytesCheapMultiByteWrapper(const void *converter, uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
408 CFIndex processedCharLen = 0;
409 uint8_t byteBuffer[__CFMaximumConvertedLength];
410 CFIndex usedLen;
411
412 *usedByteLen = 0;
413
414 while ((processedCharLen < numChars) && (!maxByteLen || (*usedByteLen < maxByteLen))) {
415 if (!(usedLen = ((CFStringEncodingCheapMultiByteToBytesProc)((const _CFEncodingConverter*)converter)->definition->toBytes)(flags, characters[processedCharLen], byteBuffer))) break;
416
417 if (maxByteLen) {
418 CFIndex idx;
419
420 if (*usedByteLen + usedLen > maxByteLen) break;
421
422 for (idx = 0;idx <usedLen;idx++) {
423 bytes[*usedByteLen + idx] = byteBuffer[idx];
424 }
425 }
426
427 *usedByteLen += usedLen;
428 processedCharLen++;
429 }
430
431 return processedCharLen;
432 }
433
434 static CFIndex __CFToUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
435 CFIndex processedByteLen = 0;
436 UniChar character;
437 CFIndex usedLen;
438
439 *usedCharLen = 0;
440
441 while (numBytes && (!maxCharLen || (*usedCharLen < maxCharLen))) {
442 if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break;
443
444 if (maxCharLen) *(characters++) = character;
445 (*usedCharLen)++;
446 processedByteLen += usedLen;
447 bytes += usedLen;
448 numBytes -= usedLen;
449 }
450
451 return processedByteLen;
452 }
453
454 static CFIndex __CFToCanonicalUnicodeCheapMultiByteWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
455 CFIndex processedByteLen = 0;
456 UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
457 UniChar character;
458 CFIndex usedLen;
459 CFIndex decomposedLen;
460 CFIndex theUsedCharLen = 0;
461 bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
462
463 while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
464 if (!(usedLen = ((CFStringEncodingCheapMultiByteToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes, numBytes, &character))) break;
465
466 if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
467 CFIndex idx;
468
469 decomposedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
470 *usedCharLen = theUsedCharLen;
471
472 for (idx = 0;idx < decomposedLen;idx++) {
473 if (charBuffer[idx] > 0xFFFF) { // Non-BMP
474 if (theUsedCharLen + 2 > maxCharLen) return processedByteLen;
475 theUsedCharLen += 2;
476 if (maxCharLen) {
477 charBuffer[idx] = charBuffer[idx] - 0x10000;
478 *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
479 *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
480 }
481 } else {
482 if (theUsedCharLen + 1 > maxCharLen) return processedByteLen;
483 ++theUsedCharLen;
484 *(characters++) = charBuffer[idx];
485 }
486 }
487 } else {
488 if (maxCharLen) *(characters++) = character;
489 ++theUsedCharLen;
490 }
491
492 processedByteLen += usedLen;
493 bytes += usedLen;
494 numBytes -= usedLen;
495 }
496 *usedCharLen = theUsedCharLen;
497 return processedByteLen;
498 }
499
500 /* static functions
501 */
502 CF_INLINE _CFEncodingConverter *__CFEncodingConverterFromDefinition(const CFStringEncodingConverter *definition, CFStringEncoding encoding) {
503 #define NUM_OF_ENTRIES_CYCLE (10)
504 static uint32_t _currentIndex = 0;
505 static uint32_t _allocatedSize = 0;
506 static _CFEncodingConverter *_allocatedEntries = NULL;
507 _CFEncodingConverter *converter;
508
509
510 if ((_currentIndex + 1) >= _allocatedSize) {
511 _currentIndex = 0;
512 _allocatedSize = 0;
513 _allocatedEntries = NULL;
514 }
515 if (_allocatedEntries == NULL) { // Not allocated yet
516 _allocatedEntries = (_CFEncodingConverter *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(_CFEncodingConverter) * NUM_OF_ENTRIES_CYCLE, 0);
517 _allocatedSize = NUM_OF_ENTRIES_CYCLE;
518 converter = &(_allocatedEntries[_currentIndex]);
519 } else {
520 converter = &(_allocatedEntries[++_currentIndex]);
521 }
522
523 memset(converter, 0, sizeof(_CFEncodingConverter));
524
525 converter->definition = definition;
526
527 switch (definition->encodingClass) {
528 case kCFStringEncodingConverterStandard:
529 converter->toBytes = NULL;
530 converter->toUnicode = NULL;
531 converter->toCanonicalUnicode = NULL;
532 break;
533
534 case kCFStringEncodingConverterCheapEightBit:
535 converter->toBytes = __CFToBytesCheapEightBitWrapper;
536 converter->toUnicode = __CFToUnicodeCheapEightBitWrapper;
537 converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapEightBitWrapper;
538 break;
539
540 case kCFStringEncodingConverterStandardEightBit:
541 converter->toBytes = __CFToBytesStandardEightBitWrapper;
542 converter->toUnicode = __CFToUnicodeStandardEightBitWrapper;
543 converter->toCanonicalUnicode = __CFToCanonicalUnicodeStandardEightBitWrapper;
544 break;
545
546 case kCFStringEncodingConverterCheapMultiByte:
547 converter->toBytes = __CFToBytesCheapMultiByteWrapper;
548 converter->toUnicode = __CFToUnicodeCheapMultiByteWrapper;
549 converter->toCanonicalUnicode = __CFToCanonicalUnicodeCheapMultiByteWrapper;
550 break;
551
552 case kCFStringEncodingConverterICU:
553 converter->toBytes = (_CFToBytesProc)__CFStringEncodingGetICUName(encoding);
554 break;
555
556 case kCFStringEncodingConverterPlatformSpecific:
557 break;
558
559 default: // Shouln't be here
560 return NULL;
561 }
562
563 converter->toBytesFallback = (definition->toBytesFallback ? definition->toBytesFallback : __CFDefaultToBytesFallbackProc);
564 converter->toUnicodeFallback = (definition->toUnicodeFallback ? definition->toUnicodeFallback : __CFDefaultToUnicodeFallbackProc);
565
566 return converter;
567 }
568
569 CF_INLINE const CFStringEncodingConverter *__CFStringEncodingConverterGetDefinition(CFStringEncoding encoding) {
570 switch (encoding) {
571 case kCFStringEncodingUTF8:
572 return &__CFConverterUTF8;
573
574 case kCFStringEncodingMacRoman:
575 return &__CFConverterMacRoman;
576
577 case kCFStringEncodingWindowsLatin1:
578 return &__CFConverterWinLatin1;
579
580 case kCFStringEncodingASCII:
581 return &__CFConverterASCII;
582
583 case kCFStringEncodingISOLatin1:
584 return &__CFConverterISOLatin1;
585
586
587 case kCFStringEncodingNextStepLatin:
588 return &__CFConverterNextStepLatin;
589
590
591 default:
592 return __CFStringEncodingGetExternalConverter(encoding);
593 }
594 }
595
596 static const _CFEncodingConverter *__CFGetConverter(uint32_t encoding) {
597 const _CFEncodingConverter *converter = NULL;
598 const _CFEncodingConverter **commonConverterSlot = NULL;
599 static _CFEncodingConverter *commonConverters[3] = {NULL, NULL, NULL}; // UTF8, MacRoman/WinLatin1, and the default encoding*
600 static CFMutableDictionaryRef mappingTable = NULL;
601 static CFSpinLock_t lock = CFSpinLockInit;
602
603 switch (encoding) {
604 case kCFStringEncodingUTF8: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[0]); break;
605
606 /* the swith here should avoid possible bootstrap issues in the default: case below when invoked from CFStringGetSystemEncoding() */
607 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
608 case kCFStringEncodingMacRoman: commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[1]); break;
609 #elif DEPLOYMENT_TARGET_WINDOWS
610 case kCFStringEncodingWindowsLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break;
611 #else
612 #warning This case must match __defaultEncoding value defined in CFString.c
613 case kCFStringEncodingISOLatin1: commonConverterSlot = (const _CFEncodingConverter **)(&(commonConverters[1])); break;
614 #endif /* DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED */
615
616 default: if (CFStringGetSystemEncoding() == encoding) commonConverterSlot = (const _CFEncodingConverter **)&(commonConverters[2]); break;
617 }
618
619 __CFSpinLock(&lock);
620 converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot);
621 __CFSpinUnlock(&lock);
622
623 if (NULL == converter) {
624 const CFStringEncodingConverter *definition = __CFStringEncodingConverterGetDefinition(encoding);
625
626 if (NULL != definition) {
627 __CFSpinLock(&lock);
628 converter = ((NULL == commonConverterSlot) ? ((NULL == mappingTable) ? NULL : (const _CFEncodingConverter *)CFDictionaryGetValue(mappingTable, (const void *)(uintptr_t)encoding)) : *commonConverterSlot);
629
630 if (NULL == converter) {
631 converter = __CFEncodingConverterFromDefinition(definition, encoding);
632
633 if (NULL == commonConverterSlot) {
634 if (NULL == mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, NULL, NULL);
635
636 CFDictionarySetValue(mappingTable, (const void *)(uintptr_t)encoding, converter);
637 } else {
638 *commonConverterSlot = converter;
639 }
640 }
641 __CFSpinUnlock(&lock);
642 }
643 }
644
645 return converter;
646 }
647
648 /* Public API
649 */
650 uint32_t CFStringEncodingUnicodeToBytes(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
651 if (encoding == kCFStringEncodingUTF8) {
652 static CFStringEncodingToBytesProc __CFToUTF8 = NULL;
653 CFIndex convertedCharLen;
654 CFIndex usedLen;
655
656
657 if ((flags & kCFStringEncodingUseCanonical) || (flags & kCFStringEncodingUseHFSPlusCanonical)) {
658 (void)CFUniCharDecompose(characters, numChars, &convertedCharLen, (void *)bytes, maxByteLen, &usedLen, true, kCFUniCharUTF8Format, (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false));
659 } else {
660 if (!__CFToUTF8) {
661 const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
662 __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes;
663 }
664 convertedCharLen = __CFToUTF8(0, characters, numChars, bytes, maxByteLen, &usedLen);
665 }
666 if (usedCharLen) *usedCharLen = convertedCharLen;
667 if (usedByteLen) *usedByteLen = usedLen;
668
669 if (convertedCharLen == numChars) {
670 return kCFStringEncodingConversionSuccess;
671 } else if ((maxByteLen > 0) && ((maxByteLen - usedLen) < 10)) { // could be filled outbuf
672 UTF16Char character = characters[convertedCharLen];
673
674 if (((character >= kSurrogateLowStart) && (character <= kSurrogateLowEnd)) || ((character >= kSurrogateHighStart) && (character <= kSurrogateHighEnd) && ((1 == (numChars - convertedCharLen)) || (characters[convertedCharLen + 1] < kSurrogateLowStart) || (characters[convertedCharLen + 1] > kSurrogateLowEnd)))) return kCFStringEncodingInvalidInputStream;
675
676 return kCFStringEncodingInsufficientOutputBufferLength;
677 } else {
678 return kCFStringEncodingInvalidInputStream;
679 }
680 } else {
681 const _CFEncodingConverter *converter = __CFGetConverter(encoding);
682 CFIndex usedLen = 0;
683 CFIndex localUsedByteLen;
684 CFIndex theUsedByteLen = 0;
685 uint32_t theResult = kCFStringEncodingConversionSuccess;
686 CFStringEncodingToBytesPrecomposeProc toBytesPrecompose = NULL;
687 CFStringEncodingIsValidCombiningCharacterProc isValidCombiningChar = NULL;
688
689 if (!converter) return kCFStringEncodingConverterUnavailable;
690
691 if (flags & kCFStringEncodingSubstituteCombinings) {
692 if (!(flags & kCFStringEncodingAllowLossyConversion)) isValidCombiningChar = converter->definition->isValidCombiningChar;
693 } else {
694 isValidCombiningChar = converter->definition->isValidCombiningChar;
695 if (!(flags & kCFStringEncodingIgnoreCombinings)) {
696 toBytesPrecompose = converter->definition->toBytesPrecompose;
697 flags |= kCFStringEncodingComposeCombinings;
698 }
699 }
700
701 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToBytes((const char *)converter->toBytes, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen);
702
703 /* Platform converter */
704 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformUnicodeToBytes(encoding, flags, characters, numChars, usedCharLen, bytes, maxByteLen, usedByteLen);
705
706 while ((usedLen < numChars) && (!maxByteLen || (theUsedByteLen < maxByteLen))) {
707 if ((usedLen += TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) < numChars) {
708 CFIndex dummy;
709
710 if (isValidCombiningChar && (usedLen > 0) && isValidCombiningChar(characters[usedLen])) {
711 if (toBytesPrecompose) {
712 CFIndex localUsedLen = usedLen;
713
714 while (isValidCombiningChar(characters[--usedLen]));
715 theUsedByteLen += localUsedByteLen;
716 if (converter->definition->maxBytesPerChar > 1) {
717 TO_BYTE(converter, flags, characters + usedLen, localUsedLen - usedLen, NULL, 0, &localUsedByteLen);
718 theUsedByteLen -= localUsedByteLen;
719 } else {
720 theUsedByteLen--;
721 }
722 if ((localUsedLen = toBytesPrecompose(flags, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen)) > 0) {
723 usedLen += localUsedLen;
724 if ((usedLen < numChars) && isValidCombiningChar(characters[usedLen])) { // There is a non-base char not combined remaining
725 theUsedByteLen += localUsedByteLen;
726 theResult = kCFStringEncodingInvalidInputStream;
727 break;
728 }
729 } else if (flags & kCFStringEncodingAllowLossyConversion) {
730 uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
731
732 if (lossyByte) {
733 while (isValidCombiningChar(characters[++usedLen]));
734 localUsedByteLen = 1;
735 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
736 } else {
737 ++usedLen;
738 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
739 }
740 } else {
741 theResult = kCFStringEncodingInvalidInputStream;
742 break;
743 }
744 } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up
745 theUsedByteLen += localUsedByteLen;
746 theResult = kCFStringEncodingInsufficientOutputBufferLength;
747 break;
748 } else if (flags & kCFStringEncodingIgnoreCombinings) {
749 while ((++usedLen < numChars) && isValidCombiningChar(characters[usedLen]));
750 } else {
751 uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
752
753 theUsedByteLen += localUsedByteLen;
754 if (lossyByte) {
755 ++usedLen;
756 localUsedByteLen = 1;
757 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
758 } else {
759 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
760 }
761 }
762 } else if (maxByteLen && ((maxByteLen == theUsedByteLen + localUsedByteLen) || TO_BYTE(converter, flags, characters + usedLen, numChars - usedLen, NULL, 0, &dummy))) { // buffer was filled up
763 theUsedByteLen += localUsedByteLen;
764
765 if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) {
766 CFIndex localUsedLen;
767
768 localUsedByteLen = 0;
769 while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen;
770 }
771 if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength;
772 break;
773 } else if (flags & kCFStringEncodingAllowLossyConversion) {
774 uint8_t lossyByte = CFStringEncodingMaskToLossyByte(flags);
775
776 theUsedByteLen += localUsedByteLen;
777 if (lossyByte) {
778 ++usedLen;
779 localUsedByteLen = 1;
780 if (maxByteLen) *(bytes + theUsedByteLen) = lossyByte;
781 } else {
782 usedLen += TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, bytes + theUsedByteLen, (maxByteLen ? maxByteLen - theUsedByteLen : 0), &localUsedByteLen);
783 }
784 } else {
785 theUsedByteLen += localUsedByteLen;
786 theResult = kCFStringEncodingInvalidInputStream;
787 break;
788 }
789 }
790 theUsedByteLen += localUsedByteLen;
791 }
792
793 if (usedLen < numChars && maxByteLen && theResult == kCFStringEncodingConversionSuccess) {
794 if (flags & kCFStringEncodingAllowLossyConversion && !CFStringEncodingMaskToLossyByte(flags)) {
795 CFIndex localUsedLen;
796
797 localUsedByteLen = 0;
798 while ((usedLen < numChars) && !localUsedByteLen && (localUsedLen = TO_BYTE_FALLBACK(converter, characters + usedLen, numChars - usedLen, NULL, 0, &localUsedByteLen))) usedLen += localUsedLen;
799 }
800 if (usedLen < numChars) theResult = kCFStringEncodingInsufficientOutputBufferLength;
801 }
802 if (usedByteLen) *usedByteLen = theUsedByteLen;
803 if (usedCharLen) *usedCharLen = usedLen;
804
805 return theResult;
806 }
807 }
808
809 uint32_t CFStringEncodingBytesToUnicode(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
810 const _CFEncodingConverter *converter = __CFGetConverter(encoding);
811 CFIndex usedLen = 0;
812 CFIndex theUsedCharLen = 0;
813 CFIndex localUsedCharLen;
814 uint32_t theResult = kCFStringEncodingConversionSuccess;
815
816 if (!converter) return kCFStringEncodingConverterUnavailable;
817
818 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUToUnicode((const char *)converter->toBytes, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen);
819
820 /* Platform converter */
821 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformBytesToUnicode(encoding, flags, bytes, numBytes, usedByteLen, characters, maxCharLen, usedCharLen);
822
823 while ((usedLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
824 if ((usedLen += TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen)) < numBytes) {
825 CFIndex tempUsedCharLen;
826
827 if (maxCharLen && ((maxCharLen == theUsedCharLen + localUsedCharLen) || (((flags & (kCFStringEncodingUseCanonical|kCFStringEncodingUseHFSPlusCanonical)) || (maxCharLen == theUsedCharLen + localUsedCharLen + 1)) && TO_UNICODE(converter, flags, bytes + usedLen, numBytes - usedLen, NULL, 0, &tempUsedCharLen)))) { // buffer was filled up
828 theUsedCharLen += localUsedCharLen;
829 theResult = kCFStringEncodingInsufficientOutputBufferLength;
830 break;
831 } else if (flags & kCFStringEncodingAllowLossyConversion) {
832 theUsedCharLen += localUsedCharLen;
833 usedLen += TO_UNICODE_FALLBACK(converter, bytes + usedLen, numBytes - usedLen, characters + theUsedCharLen, (maxCharLen ? maxCharLen - theUsedCharLen : 0), &localUsedCharLen);
834 } else {
835 theUsedCharLen += localUsedCharLen;
836 theResult = kCFStringEncodingInvalidInputStream;
837 break;
838 }
839 }
840 theUsedCharLen += localUsedCharLen;
841 }
842
843 if (usedLen < numBytes && maxCharLen && theResult == kCFStringEncodingConversionSuccess) {
844 theResult = kCFStringEncodingInsufficientOutputBufferLength;
845 }
846 if (usedCharLen) *usedCharLen = theUsedCharLen;
847 if (usedByteLen) *usedByteLen = usedLen;
848
849 return theResult;
850 }
851
852 __private_extern__ bool CFStringEncodingIsValidEncoding(uint32_t encoding) {
853 return (CFStringEncodingGetConverter(encoding) ? true : false);
854 }
855
856 __private_extern__ CFIndex CFStringEncodingCharLengthForBytes(uint32_t encoding, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
857 const _CFEncodingConverter *converter = __CFGetConverter(encoding);
858
859 if (converter) {
860 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUCharLength((const char *)converter->toBytes, flags, bytes, numBytes);
861
862 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformCharLengthForBytes(encoding, flags, bytes, numBytes);
863
864 if (1 == converter->definition->maxBytesPerChar) return numBytes;
865
866 if (NULL == converter->definition->toUnicodeLen) {
867 CFIndex usedByteLen = 0;
868 CFIndex totalLength = 0;
869 CFIndex usedCharLen;
870
871 while (numBytes > 0) {
872 usedByteLen = TO_UNICODE(converter, flags, bytes, numBytes, NULL, 0, &usedCharLen);
873
874 bytes += usedByteLen;
875 numBytes -= usedByteLen;
876 totalLength += usedCharLen;
877
878 if (numBytes > 0) {
879 if (0 == (flags & kCFStringEncodingAllowLossyConversion)) return 0;
880
881 usedByteLen = TO_UNICODE_FALLBACK(converter, bytes, numBytes, NULL, 0, &usedCharLen);
882
883 bytes += usedByteLen;
884 numBytes -= usedByteLen;
885 totalLength += usedCharLen;
886 }
887 }
888
889 return totalLength;
890 } else {
891 return converter->definition->toUnicodeLen(flags, bytes, numBytes);
892 }
893 }
894
895 return 0;
896 }
897
898 __private_extern__ CFIndex CFStringEncodingByteLengthForCharacters(uint32_t encoding, uint32_t flags, const UniChar *characters, CFIndex numChars) {
899 const _CFEncodingConverter *converter = __CFGetConverter(encoding);
900
901 if (converter) {
902 if (kCFStringEncodingConverterICU == converter->definition->encodingClass) return __CFStringEncodingICUByteLength((const char *)converter->toBytes, flags, characters, numChars);
903
904 if (kCFStringEncodingConverterPlatformSpecific == converter->definition->encodingClass) return __CFStringEncodingPlatformByteLengthForCharacters(encoding, flags, characters, numChars);
905
906 if (1 == converter->definition->maxBytesPerChar) return numChars;
907
908 if (NULL == converter->definition->toBytesLen) {
909 CFIndex usedCharLen;
910
911 return ((kCFStringEncodingConversionSuccess == CFStringEncodingUnicodeToBytes(encoding, flags, characters, numChars, &usedCharLen, NULL, 0, NULL)) ? usedCharLen : 0);
912 } else {
913 return converter->definition->toBytesLen(flags, characters, numChars);
914 }
915 }
916
917 return 0;
918 }
919
920 __private_extern__ void CFStringEncodingRegisterFallbackProcedures(uint32_t encoding, CFStringEncodingToBytesFallbackProc toBytes, CFStringEncodingToUnicodeFallbackProc toUnicode) {
921 _CFEncodingConverter *converter = (_CFEncodingConverter *)__CFGetConverter(encoding);
922
923 if (NULL != converter) {
924 const CFStringEncodingConverter *body = CFStringEncodingGetConverter(encoding);
925
926 converter->toBytesFallback = ((NULL == toBytes) ? ((NULL == body) ? __CFDefaultToBytesFallbackProc : body->toBytesFallback) : toBytes);
927 converter->toUnicodeFallback = ((NULL == toUnicode) ? ((NULL == body) ? __CFDefaultToUnicodeFallbackProc : body->toUnicodeFallback) : toUnicode);
928 }
929 }
930
931 __private_extern__ const CFStringEncodingConverter *CFStringEncodingGetConverter(uint32_t encoding) {
932 const _CFEncodingConverter *converter = __CFGetConverter(encoding);
933
934 return ((NULL == converter) ? NULL : converter->definition);
935 }
936
937 static const CFStringEncoding __CFBuiltinEncodings[] = {
938 kCFStringEncodingMacRoman,
939 kCFStringEncodingWindowsLatin1,
940 kCFStringEncodingISOLatin1,
941 kCFStringEncodingNextStepLatin,
942 kCFStringEncodingASCII,
943 kCFStringEncodingUTF8,
944 /* These seven are available only in CFString-level */
945 kCFStringEncodingNonLossyASCII,
946
947 kCFStringEncodingUTF16,
948 kCFStringEncodingUTF16BE,
949 kCFStringEncodingUTF16LE,
950
951 kCFStringEncodingUTF32,
952 kCFStringEncodingUTF32BE,
953 kCFStringEncodingUTF32LE,
954
955 kCFStringEncodingInvalidId,
956 };
957
958 static CFComparisonResult __CFStringEncodingComparator(const void *v1, const void *v2, void *context) {
959 CFComparisonResult val1 = (*(const CFStringEncoding *)v1) & 0xFFFF;
960 CFComparisonResult val2 = (*(const CFStringEncoding *)v2) & 0xFFFF;
961
962 return ((val1 == val2) ? ((CFComparisonResult)(*(const CFStringEncoding *)v1) - (CFComparisonResult)(*(const CFStringEncoding *)v2)) : val1 - val2);
963 }
964
965 static void __CFStringEncodingFliterDupes(CFStringEncoding *encodings, CFIndex numSlots) {
966 CFStringEncoding last = kCFStringEncodingInvalidId;
967 const CFStringEncoding *limitEncodings = encodings + numSlots;
968
969 while (encodings < limitEncodings) {
970 if (last == *encodings) {
971 if ((encodings + 1) < limitEncodings) memmove(encodings, encodings + 1, sizeof(CFStringEncoding) * (limitEncodings - encodings - 1));
972 --limitEncodings;
973 } else {
974 last = *(encodings++);
975 }
976 }
977 }
978
979 __private_extern__ const CFStringEncoding *CFStringEncodingListOfAvailableEncodings(void) {
980 static const CFStringEncoding *encodings = NULL;
981
982 if (NULL == encodings) {
983 CFStringEncoding *list = (CFStringEncoding *)__CFBuiltinEncodings;
984 CFIndex numICUConverters = 0, numPlatformConverters = 0;
985 CFStringEncoding *icuConverters = __CFStringEncodingCreateICUEncodings(NULL, &numICUConverters);
986 CFStringEncoding *platformConverters = __CFStringEncodingCreateListOfAvailablePlatformConverters(NULL, &numPlatformConverters);
987
988 if ((NULL != icuConverters) || (NULL != platformConverters)) {
989 CFIndex numSlots = (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters + numPlatformConverters;
990
991 list = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * numSlots, 0);
992
993 memcpy(list, __CFBuiltinEncodings, sizeof(__CFBuiltinEncodings));
994
995 if (NULL != icuConverters) {
996 memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)), icuConverters, sizeof(CFStringEncoding) * numICUConverters);
997 CFAllocatorDeallocate(NULL, icuConverters);
998 }
999
1000 if (NULL != platformConverters) {
1001 memcpy(list + (sizeof(__CFBuiltinEncodings) / sizeof(*__CFBuiltinEncodings)) + numICUConverters, platformConverters, sizeof(CFStringEncoding) * numPlatformConverters);
1002 CFAllocatorDeallocate(NULL, platformConverters);
1003 }
1004
1005 CFQSortArray(list, numSlots, sizeof(CFStringEncoding), (CFComparatorFunction)__CFStringEncodingComparator, NULL);
1006 __CFStringEncodingFliterDupes(list, numSlots);
1007 }
1008 if (!OSAtomicCompareAndSwapPtrBarrier(NULL, list, (void * volatile *)&encodings) && (list != __CFBuiltinEncodings)) CFAllocatorDeallocate(NULL, list);
1009 }
1010
1011 return encodings;
1012 }
1013
1014 #undef TO_BYTE
1015 #undef TO_UNICODE
1016 #undef ASCIINewLine
1017 #undef kSurrogateHighStart
1018 #undef kSurrogateHighEnd
1019 #undef kSurrogateLowStart
1020 #undef kSurrogateLowEnd
1021 #undef TO_BYTE_FALLBACK
1022 #undef TO_UNICODE_FALLBACK
1023 #undef EXTRA_BASE
1024 #undef NUM_OF_ENTRIES_CYCLE
1025