]> git.saurik.com Git - apple/cf.git/blob - CFStringEncodings.c
CF-476.13.tar.gz
[apple/cf.git] / CFStringEncodings.c
1 /*
2 * Copyright (c) 2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* CFStringEncodings.c
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Aki Inoue
26 */
27
28 #include "CFInternal.h"
29 #include <CoreFoundation/CFString.h>
30 #include <CoreFoundation/CFByteOrder.h>
31 #include "CFPriv.h"
32 #include <string.h>
33 #include "CFStringEncodingConverterExt.h"
34 #include "CFUniChar.h"
35 #include "CFUnicodeDecomposition.h"
36
37 static UInt32 __CFWantsToUseASCIICompatibleConversion = (UInt32)-1;
38 CF_INLINE UInt32 __CFGetASCIICompatibleFlag(void) {
39 if (__CFWantsToUseASCIICompatibleConversion == (UInt32)-1) {
40 __CFWantsToUseASCIICompatibleConversion = false;
41 }
42 return (__CFWantsToUseASCIICompatibleConversion ? kCFStringEncodingASCIICompatibleConversion : 0);
43 }
44
45 void _CFStringEncodingSetForceASCIICompatibility(Boolean flag) {
46 __CFWantsToUseASCIICompatibleConversion = (flag ? (UInt32)true : (UInt32)false);
47 }
48
49 Boolean (*__CFCharToUniCharFunc)(UInt32 flags, uint8_t ch, UniChar *unicodeChar) = NULL;
50
51 // To avoid early initialization issues, we just initialize this here
52 // This should not be const as it is changed
53 UniChar __CFCharToUniCharTable[256] = {
54 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
55 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
56 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
57 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
58 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
59 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
60 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
61 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
62 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
63 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
64 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
65 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
66 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
67 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
68 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
69 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
70 };
71
72 void __CFSetCharToUniCharFunc(Boolean (*func)(UInt32 flags, UInt8 ch, UniChar *unicodeChar)) {
73 if (__CFCharToUniCharFunc != func) {
74 int ch;
75 __CFCharToUniCharFunc = func;
76 if (func) {
77 for (ch = 128; ch < 256; ch++) {
78 UniChar uch;
79 __CFCharToUniCharTable[ch] = (__CFCharToUniCharFunc(0, ch, &uch) ? uch : 0xFFFD);
80 }
81 } else { // If we have no __CFCharToUniCharFunc, assume 128..255 return the value as-is
82 for (ch = 128; ch < 256; ch++) __CFCharToUniCharTable[ch] = ch;
83 }
84 }
85 }
86
87 __private_extern__ void __CFStrConvertBytesToUnicode(const uint8_t *bytes, UniChar *buffer, CFIndex numChars) {
88 CFIndex idx;
89 for (idx = 0; idx < numChars; idx++) buffer[idx] = __CFCharToUniCharTable[bytes[idx]];
90 }
91
92
/* Length, in UniChars, of the stack scratch buffer used for chunked conversions
   (the minimum length the output buffers should be).
*/
#define kCFCharConversionBufferLength 512


/* Capacity of a CFVarWidthCharBuffer's inline storage, counted in 8-bit and in
   UniChar elements. Both expect a variable named `buffer` to be in scope. */
#define MAX_LOCAL_CHARS (sizeof(buffer->localBuffer) / sizeof(uint8_t))
#define MAX_LOCAL_UNICHARS (sizeof(buffer->localBuffer) / sizeof(UniChar))
100
101 /* Convert a byte stream to ASCII (7-bit!) or Unicode, with a CFVarWidthCharBuffer struct on the stack. false return indicates an error occured during the conversion. The caller needs to free the returned buffer in either ascii or unicode (indicated by isASCII), if shouldFreeChars is true.
102 9/18/98 __CFStringDecodeByteStream now avoids to allocate buffer if buffer->chars is not NULL
103 Added useClientsMemoryPtr; if not-NULL, and the provided memory can be used as is, this is set to true
104 __CFStringDecodeByteStream2() is kept around for any internal clients who might be using it; it should be deprecated
105 !!! converterFlags is only used for the UTF8 converter at this point
106 */
107 Boolean __CFStringDecodeByteStream2(const uint8_t *bytes, UInt32 len, CFStringEncoding encoding, Boolean alwaysUnicode, CFVarWidthCharBuffer *buffer, Boolean *useClientsMemoryPtr) {
108 return __CFStringDecodeByteStream3(bytes, len, encoding, alwaysUnicode, buffer, useClientsMemoryPtr, 0);
109 }
110
/* States of the NonLossyASCII decoder.
   Hex escapes walk the states 2..5 (one per digit) and finish on reaching 6;
   octal escapes walk 7..8 and finish on reaching 9. */
enum {
    __NSNonLossyErrorMode        = -1,  /* malformed input */
    __NSNonLossyASCIIMode        = 0,   /* plain character; also "escape completed" */
    __NSNonLossyBackslashMode    = 1,   /* just consumed a backslash */
    __NSNonLossyHexInitialMode   = 2,   /* __NSNonLossyBackslashMode + 1 */
    __NSNonLossyHexFinalMode     = 6,   /* __NSNonLossyHexInitialMode + 4 */
    __NSNonLossyOctalInitialMode = 7,   /* __NSNonLossyHexFinalMode + 1 */
    __NSNonLossyOctalFinalMode   = 9    /* __NSNonLossyHexFinalMode + 3 */
};
120
121 Boolean __CFStringDecodeByteStream3(const uint8_t *bytes, CFIndex len, CFStringEncoding encoding, Boolean alwaysUnicode, CFVarWidthCharBuffer *buffer, Boolean *useClientsMemoryPtr, UInt32 converterFlags) {
122
123 if (useClientsMemoryPtr) *useClientsMemoryPtr = false;
124
125 buffer->isASCII = !alwaysUnicode;
126 buffer->shouldFreeChars = false;
127 buffer->numChars = 0;
128
129 if (0 == len) return true;
130
131 buffer->allocator = (buffer->allocator ? buffer->allocator : __CFGetDefaultAllocator());
132
133 if ((encoding == kCFStringEncodingUTF16) || (encoding == kCFStringEncodingUTF16BE) || (encoding == kCFStringEncodingUTF16LE)) { // UTF-16
134 const UTF16Char *src = (const UTF16Char *)bytes;
135 const UTF16Char *limit = (const UTF16Char *)(bytes + len);
136 bool swap = false;
137
138 if (kCFStringEncodingUTF16 == encoding) {
139 UTF16Char bom = ((*src == 0xFFFE) || (*src == 0xFEFF) ? *(src++) : 0);
140
141 #if __CF_BIG_ENDIAN__
142 if (bom == 0xFFFE) swap = true;
143 #else
144 if (bom != 0xFEFF) swap = true;
145 #endif
146 if (bom) useClientsMemoryPtr = NULL;
147 } else {
148 #if __CF_BIG_ENDIAN__
149 if (kCFStringEncodingUTF16LE == encoding) swap = true;
150 #else
151 if (kCFStringEncodingUTF16BE == encoding) swap = true;
152 #endif
153 }
154
155 buffer->numChars = limit - src;
156
157 if (useClientsMemoryPtr && !swap) { // If the caller is ready to deal with no-copy situation, and the situation is possible, indicate it...
158 *useClientsMemoryPtr = true;
159 buffer->chars.unicode = (UniChar *)src;
160 buffer->isASCII = false;
161 } else {
162 if (buffer->isASCII) { // Let's see if we can reduce the Unicode down to ASCII...
163 const UTF16Char *characters = src;
164 UTF16Char mask = (swap ? 0x80FF : 0xFF80);
165
166 while (characters < limit) {
167 if (*(characters++) & mask) {
168 buffer->isASCII = false;
169 break;
170 }
171 }
172 }
173
174 if (buffer->isASCII) {
175 uint8_t *dst;
176 if (NULL == buffer->chars.ascii) { // we never reallocate when buffer is supplied
177 if (buffer->numChars > MAX_LOCAL_CHARS) {
178 buffer->chars.ascii = (UInt8 *)CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(uint8_t)), 0);
179 buffer->shouldFreeChars = true;
180 } else {
181 buffer->chars.ascii = (uint8_t *)buffer->localBuffer;
182 }
183 }
184 dst = buffer->chars.ascii;
185
186 if (swap) {
187 while (src < limit) *(dst++) = (*(src++) >> 8);
188 } else {
189 while (src < limit) *(dst++) = (uint8_t)*(src++);
190 }
191 } else {
192 UTF16Char *dst;
193
194 if (NULL == buffer->chars.unicode) { // we never reallocate when buffer is supplied
195 if (buffer->numChars > MAX_LOCAL_UNICHARS) {
196 buffer->chars.unicode = (UniChar *)CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(UTF16Char)), 0);
197 buffer->shouldFreeChars = true;
198 } else {
199 buffer->chars.unicode = (UTF16Char *)buffer->localBuffer;
200 }
201 }
202 dst = buffer->chars.unicode;
203
204 if (swap) {
205 while (src < limit) *(dst++) = CFSwapInt16(*(src++));
206 } else {
207 memmove(dst, src, buffer->numChars * sizeof(UTF16Char));
208 }
209 }
210 }
211 } else if ((encoding == kCFStringEncodingUTF32) || (encoding == kCFStringEncodingUTF32BE) || (encoding == kCFStringEncodingUTF32LE)) {
212 const UTF32Char *src = (const UTF32Char *)bytes;
213 const UTF32Char *limit = (const UTF32Char *)(bytes + len);
214 bool swap = false;
215 static bool strictUTF32 = (bool)-1;
216
217 if ((bool)-1 == strictUTF32) strictUTF32 = (_CFExecutableLinkedOnOrAfter(CFSystemVersionLeopard) != 0);
218
219 if (kCFStringEncodingUTF32 == encoding) {
220 UTF32Char bom = ((*src == 0xFFFE0000) || (*src == 0x0000FEFF) ? *(src++) : 0);
221
222 #if __CF_BIG_ENDIAN__
223 if (bom == 0xFFFE0000) swap = true;
224 #else
225 if (bom != 0x0000FEFF) swap = true;
226 #endif
227 } else {
228 #if __CF_BIG_ENDIAN__
229 if (kCFStringEncodingUTF32LE == encoding) swap = true;
230 #else
231 if (kCFStringEncodingUTF32BE == encoding) swap = true;
232 #endif
233 }
234
235 buffer->numChars = limit - src;
236
237 {
238 // Let's see if we have non-ASCII or non-BMP
239 const UTF32Char *characters = src;
240 UTF32Char asciiMask = (swap ? 0x80FFFFFF : 0xFFFFFF80);
241 UTF32Char bmpMask = (swap ? 0x0000FFFF : 0xFFFF0000);
242
243 while (characters < limit) {
244 if (*characters & asciiMask) {
245 buffer->isASCII = false;
246 if (*characters & bmpMask) {
247 if (strictUTF32 && ((swap ? (UTF32Char)CFSwapInt32(*characters) : *characters) > 0x10FFFF)) return false; // outside of Unicode Scaler Value
248 ++(buffer->numChars);
249 }
250 }
251 ++characters;
252 }
253 }
254
255 if (buffer->isASCII) {
256 uint8_t *dst;
257 if (NULL == buffer->chars.ascii) { // we never reallocate when buffer is supplied
258 if (buffer->numChars > MAX_LOCAL_CHARS) {
259 buffer->chars.ascii = (UInt8 *)CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(uint8_t)), 0);
260 buffer->shouldFreeChars = true;
261 } else {
262 buffer->chars.ascii = (uint8_t *)buffer->localBuffer;
263 }
264 }
265 dst = buffer->chars.ascii;
266
267 if (swap) {
268 while (src < limit) *(dst++) = (*(src++) >> 24);
269 } else {
270 while (src < limit) *(dst++) = *(src++);
271 }
272 } else {
273 if (NULL == buffer->chars.unicode) { // we never reallocate when buffer is supplied
274 if (buffer->numChars > MAX_LOCAL_UNICHARS) {
275 buffer->chars.unicode = (UniChar *)CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(UTF16Char)), 0);
276 buffer->shouldFreeChars = true;
277 } else {
278 buffer->chars.unicode = (UTF16Char *)buffer->localBuffer;
279 }
280 }
281 return (CFUniCharFromUTF32(src, limit - src, buffer->chars.unicode, (strictUTF32 ? false : true), __CF_BIG_ENDIAN__ ? !swap : swap) ? TRUE : FALSE);
282 }
283 } else {
284 CFIndex idx;
285 const uint8_t *chars = (const uint8_t *)bytes;
286 const uint8_t *end = chars + len;
287
288 switch (encoding) {
289 case kCFStringEncodingNonLossyASCII: {
290 UTF16Char currentValue = 0;
291 uint8_t character;
292 int8_t mode = __NSNonLossyASCIIMode;
293
294 buffer->isASCII = false;
295 buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
296 buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : (UniChar *)CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
297 buffer->numChars = 0;
298
299 while (chars < end) {
300 character = (*chars++);
301
302 switch (mode) {
303 case __NSNonLossyASCIIMode:
304 if (character == '\\') {
305 mode = __NSNonLossyBackslashMode;
306 } else if (character < 0x80) {
307 currentValue = character;
308 } else {
309 mode = __NSNonLossyErrorMode;
310 }
311 break;
312
313 case __NSNonLossyBackslashMode:
314 if ((character == 'U') || (character == 'u')) {
315 mode = __NSNonLossyHexInitialMode;
316 currentValue = 0;
317 } else if ((character >= '0') && (character <= '9')) {
318 mode = __NSNonLossyOctalInitialMode;
319 currentValue = character - '0';
320 } else if (character == '\\') {
321 mode = __NSNonLossyASCIIMode;
322 currentValue = character;
323 } else {
324 mode = __NSNonLossyErrorMode;
325 }
326 break;
327
328 default:
329 if (mode < __NSNonLossyHexFinalMode) {
330 if ((character >= '0') && (character <= '9')) {
331 currentValue = (currentValue << 4) | (character - '0');
332 if (++mode == __NSNonLossyHexFinalMode) mode = __NSNonLossyASCIIMode;
333 } else {
334 if (character >= 'a') character -= ('a' - 'A');
335 if ((character >= 'A') && (character <= 'F')) {
336 currentValue = (currentValue << 4) | ((character - 'A') + 10);
337 if (++mode == __NSNonLossyHexFinalMode) mode = __NSNonLossyASCIIMode;
338 } else {
339 mode = __NSNonLossyErrorMode;
340 }
341 }
342 } else {
343 if ((character >= '0') && (character <= '9')) {
344 currentValue = (currentValue << 3) | (character - '0');
345 if (++mode == __NSNonLossyOctalFinalMode) mode = __NSNonLossyASCIIMode;
346 } else {
347 mode = __NSNonLossyErrorMode;
348 }
349 }
350 break;
351 }
352
353 if (mode == __NSNonLossyASCIIMode) {
354 buffer->chars.unicode[buffer->numChars++] = currentValue;
355 } else if (mode == __NSNonLossyErrorMode) {
356 return false;
357 }
358 }
359 return (mode == __NSNonLossyASCIIMode);
360 }
361
362 case kCFStringEncodingUTF8:
363 if ((len >= 3) && (chars[0] == 0xef) && (chars[1] == 0xbb) && (chars[2] == 0xbf)) { // If UTF8 BOM, skip
364 chars += 3;
365 len -= 3;
366 if (0 == len) return true;
367 }
368 if (buffer->isASCII) {
369 for (idx = 0; idx < len; idx++) {
370 if (128 <= chars[idx]) {
371 buffer->isASCII = false;
372 break;
373 }
374 }
375 }
376 if (buffer->isASCII) {
377 buffer->numChars = len;
378 buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
379 buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : (UInt8 *)CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
380 memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
381 } else {
382 CFIndex numDone;
383 static CFStringEncodingToUnicodeProc __CFFromUTF8 = NULL;
384
385 if (!__CFFromUTF8) {
386 const CFStringEncodingConverter *converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
387 __CFFromUTF8 = (CFStringEncodingToUnicodeProc)converter->toUnicode;
388 }
389
390 buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
391 buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : (UniChar *)CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
392 buffer->numChars = 0;
393 while (chars < end) {
394 numDone = 0;
395 chars += __CFFromUTF8(converterFlags, chars, end - chars, &(buffer->chars.unicode[buffer->numChars]), len - buffer->numChars, &numDone);
396
397 if (0 == numDone) {
398 if (buffer->shouldFreeChars) CFAllocatorDeallocate(buffer->allocator, buffer->chars.unicode);
399 buffer->isASCII = !alwaysUnicode;
400 buffer->shouldFreeChars = false;
401 buffer->chars.ascii = NULL;
402 buffer->numChars = 0;
403 return false;
404 }
405 buffer->numChars += numDone;
406 }
407 }
408 break;
409
410 default:
411 if (CFStringEncodingIsValidEncoding(encoding)) {
412 const CFStringEncodingConverter *converter = CFStringEncodingGetConverter(encoding);
413 Boolean isASCIISuperset = __CFStringEncodingIsSupersetOfASCII(encoding);
414
415 if (!converter) return false;
416
417 if (!isASCIISuperset) buffer->isASCII = false;
418
419 if (buffer->isASCII) {
420 for (idx = 0; idx < len; idx++) {
421 if (128 <= chars[idx]) {
422 buffer->isASCII = false;
423 break;
424 }
425 }
426 }
427
428 if (converter->encodingClass == kCFStringEncodingConverterCheapEightBit) {
429 if (buffer->isASCII) {
430 buffer->numChars = len;
431 buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
432 buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : (UInt8 *)CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
433 memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
434 } else {
435 buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
436 buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : (UniChar *)CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
437 buffer->numChars = len;
438 if (kCFStringEncodingASCII == encoding || kCFStringEncodingISOLatin1 == encoding) {
439 for (idx = 0; idx < len; idx++) buffer->chars.unicode[idx] = (UniChar)chars[idx];
440 } else {
441 for (idx = 0; idx < len; idx++)
442 if (chars[idx] < 0x80 && isASCIISuperset)
443 buffer->chars.unicode[idx] = (UniChar)chars[idx];
444 else if (!((CFStringEncodingCheapEightBitToUnicodeProc)converter->toUnicode)(0, chars[idx], buffer->chars.unicode + idx))
445 return false;
446 }
447 }
448 } else {
449 if (buffer->isASCII) {
450 buffer->numChars = len;
451 buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
452 buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : (UInt8 *)CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
453 memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
454 } else {
455 CFIndex guessedLength = CFStringEncodingCharLengthForBytes(encoding, 0, bytes, len);
456 static UInt32 lossyFlag = (UInt32)-1;
457
458 buffer->shouldFreeChars = !buffer->chars.unicode && (guessedLength <= MAX_LOCAL_UNICHARS) ? false : true;
459 buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (guessedLength <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : (UniChar *)CFAllocatorAllocate(buffer->allocator, guessedLength * sizeof(UniChar), 0));
460
461 if (lossyFlag == (UInt32)-1) lossyFlag = (_CFExecutableLinkedOnOrAfter(CFSystemVersionPanther) ? 0 : kCFStringEncodingAllowLossyConversion);
462
463 if (CFStringEncodingBytesToUnicode(encoding, lossyFlag|__CFGetASCIICompatibleFlag(), bytes, len, NULL, buffer->chars.unicode, (guessedLength > MAX_LOCAL_UNICHARS ? guessedLength : MAX_LOCAL_UNICHARS), &(buffer->numChars))) {
464 if (buffer->shouldFreeChars) CFAllocatorDeallocate(buffer->allocator, buffer->chars.unicode);
465 buffer->isASCII = !alwaysUnicode;
466 buffer->shouldFreeChars = false;
467 buffer->chars.ascii = NULL;
468 buffer->numChars = 0;
469 return false;
470 }
471 }
472 }
473 } else {
474 return false;
475 }
476 }
477 }
478
479 return true;
480 }
481
482
483 /* Create a byte stream from a CFString backing. Can convert a string piece at a time
484 into a fixed size buffer. Returns number of characters converted.
485 Characters that cannot be converted to the specified encoding are represented
486 with the char specified by lossByte; if 0, then lossy conversion is not allowed
487 and conversion stops, returning partial results.
488 Pass buffer==NULL if you don't care about the converted string (but just the convertability,
489 or number of bytes required, indicated by usedBufLen).
490 Does not zero-terminate. If you want to create Pascal or C string, allow one extra byte at start or end.
491
492 Note: This function is intended to work through CFString functions, so it should work
493 with NSStrings as well as CFStrings.
494 */
495 CFIndex __CFStringEncodeByteStream(CFStringRef string, CFIndex rangeLoc, CFIndex rangeLen, Boolean generatingExternalFile, CFStringEncoding encoding, char lossByte, uint8_t *buffer, CFIndex max, CFIndex *usedBufLen) {
496 CFIndex totalBytesWritten = 0; /* Number of written bytes */
497 CFIndex numCharsProcessed = 0; /* Number of processed chars */
498 const UniChar *unichars;
499
500 if (encoding == kCFStringEncodingUTF8 && (unichars = CFStringGetCharactersPtr(string))) {
501 static CFStringEncodingToBytesProc __CFToUTF8 = NULL;
502
503 if (!__CFToUTF8) {
504 const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
505 __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes;
506 }
507 numCharsProcessed = __CFToUTF8((generatingExternalFile ? kCFStringEncodingPrependBOM : 0), unichars + rangeLoc, rangeLen, buffer, (buffer ? max : 0), &totalBytesWritten);
508
509 } else if (encoding == kCFStringEncodingNonLossyASCII) {
510 const char *hex = "0123456789abcdef";
511 UniChar ch;
512 CFStringInlineBuffer buf;
513 CFStringInitInlineBuffer(string, &buf, CFRangeMake(rangeLoc, rangeLen));
514 while (numCharsProcessed < rangeLen) {
515 CFIndex reqLength; /* Required number of chars to encode this UniChar */
516 CFIndex cnt;
517 char tmp[6];
518 ch = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed);
519 if ((ch >= ' ' && ch <= '~' && ch != '\\') || (ch == '\n' || ch == '\r' || ch == '\t')) {
520 reqLength = 1;
521 tmp[0] = (char)ch;
522 } else {
523 if (ch == '\\') {
524 tmp[1] = '\\';
525 reqLength = 2;
526 } else if (ch < 256) { /* \nnn; note that this is not NEXTSTEP encoding but a (small) UniChar */
527 tmp[1] = '0' + (ch >> 6);
528 tmp[2] = '0' + ((ch >> 3) & 7);
529 tmp[3] = '0' + (ch & 7);
530 reqLength = 4;
531 } else { /* \Unnnn */
532 tmp[1] = 'u'; // Changed to small+u in order to be aligned with Java
533 tmp[2] = hex[(ch >> 12) & 0x0f];
534 tmp[3] = hex[(ch >> 8) & 0x0f];
535 tmp[4] = hex[(ch >> 4) & 0x0f];
536 tmp[5] = hex[ch & 0x0f];
537 reqLength = 6;
538 }
539 tmp[0] = '\\';
540 }
541 if (buffer) {
542 if (totalBytesWritten + reqLength > max) break; /* Doesn't fit..
543 .*/
544 for (cnt = 0; cnt < reqLength; cnt++) {
545 buffer[totalBytesWritten + cnt] = tmp[cnt];
546 }
547 }
548 totalBytesWritten += reqLength;
549 numCharsProcessed++;
550 }
551 } else if ((encoding == kCFStringEncodingUTF16) || (encoding == kCFStringEncodingUTF16BE) || (encoding == kCFStringEncodingUTF16LE)) {
552 CFIndex extraForBOM = (generatingExternalFile && (encoding == kCFStringEncodingUTF16) ? sizeof(UniChar) : 0);
553 numCharsProcessed = rangeLen;
554 if (buffer && (numCharsProcessed * (CFIndex)sizeof(UniChar) + extraForBOM > max)) {
555 numCharsProcessed = (max > extraForBOM) ? ((max - extraForBOM) / sizeof(UniChar)) : 0;
556 }
557 totalBytesWritten = (numCharsProcessed * sizeof(UniChar)) + extraForBOM;
558 if (buffer) {
559 if (extraForBOM) { /* Generate BOM */
560 #if __CF_BIG_ENDIAN__
561 *buffer++ = 0xfe; *buffer++ = 0xff;
562 #else
563 *buffer++ = 0xff; *buffer++ = 0xfe;
564 #endif
565 }
566 CFStringGetCharacters(string, CFRangeMake(rangeLoc, numCharsProcessed), (UniChar *)buffer);
567 if ((__CF_BIG_ENDIAN__ ? kCFStringEncodingUTF16LE : kCFStringEncodingUTF16BE) == encoding) { // Need to swap
568 UTF16Char *characters = (UTF16Char *)buffer;
569 const UTF16Char *limit = characters + numCharsProcessed;
570
571 while (characters < limit) {
572 *characters = CFSwapInt16(*characters);
573 ++characters;
574 }
575 }
576 }
577 } else if ((encoding == kCFStringEncodingUTF32) || (encoding == kCFStringEncodingUTF32BE) || (encoding == kCFStringEncodingUTF32LE)) {
578 UTF32Char character;
579 CFStringInlineBuffer buf;
580 UTF32Char *characters = (UTF32Char *)buffer;
581
582 bool swap = (encoding == (__CF_BIG_ENDIAN__ ? kCFStringEncodingUTF32LE : kCFStringEncodingUTF32BE) ? true : false);
583 if (generatingExternalFile && (encoding == kCFStringEncodingUTF32)) {
584 totalBytesWritten += sizeof(UTF32Char);
585 if (characters) {
586 if (totalBytesWritten > max) { // insufficient buffer
587 totalBytesWritten = 0;
588 } else {
589 *(characters++) = 0x0000FEFF;
590 }
591 }
592 }
593
594 CFStringInitInlineBuffer(string, &buf, CFRangeMake(rangeLoc, rangeLen));
595 while (numCharsProcessed < rangeLen) {
596 character = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed);
597
598 if (CFUniCharIsSurrogateHighCharacter(character)) {
599 UTF16Char otherCharacter;
600
601 if (((numCharsProcessed + 1) < rangeLen) && CFUniCharIsSurrogateLowCharacter((otherCharacter = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed + 1)))) {
602 character = CFUniCharGetLongCharacterForSurrogatePair(character, otherCharacter);
603 } else if (lossByte) {
604 character = lossByte;
605 } else {
606 break;
607 }
608 } else if (CFUniCharIsSurrogateLowCharacter(character)) {
609 if (lossByte) {
610 character = lossByte;
611 } else {
612 break;
613 }
614 }
615
616 totalBytesWritten += sizeof(UTF32Char);
617
618 if (characters) {
619 if (totalBytesWritten > max) {
620 totalBytesWritten -= sizeof(UTF32Char);
621 break;
622 }
623 *(characters++) = (swap ? CFSwapInt32(character) : character);
624 }
625
626 numCharsProcessed += (character > 0xFFFF ? 2 : 1);
627 }
628 } else {
629 CFIndex numChars;
630 UInt32 flags;
631 const unsigned char *cString = NULL;
632 Boolean isASCIISuperset = __CFStringEncodingIsSupersetOfASCII(encoding);
633
634 if (!CF_IS_OBJC(CFStringGetTypeID(), string) && isASCIISuperset) { // Checking for NSString to avoid infinite recursion
635 const unsigned char *ptr;
636 if ((cString = (const unsigned char *)CFStringGetCStringPtr(string, __CFStringGetEightBitStringEncoding()))) {
637 ptr = (cString += rangeLoc);
638 if (__CFStringGetEightBitStringEncoding() == encoding) {
639 numCharsProcessed = (rangeLen < max || buffer == NULL ? rangeLen : max);
640 if (buffer) memmove(buffer, cString, numCharsProcessed);
641 if (usedBufLen) *usedBufLen = numCharsProcessed;
642 return numCharsProcessed;
643 }
644 while (*ptr < 0x80 && rangeLen > 0) {
645 ++ptr;
646 --rangeLen;
647 }
648 numCharsProcessed = ptr - cString;
649 if (buffer) {
650 numCharsProcessed = (numCharsProcessed < max ? numCharsProcessed : max);
651 memmove(buffer, cString, numCharsProcessed);
652 buffer += numCharsProcessed;
653 max -= numCharsProcessed;
654 }
655 if (!rangeLen || (buffer && (max == 0))) {
656 if (usedBufLen) *usedBufLen = numCharsProcessed;
657 return numCharsProcessed;
658 }
659 rangeLoc += numCharsProcessed;
660 totalBytesWritten += numCharsProcessed;
661 }
662 if (!cString && (cString = CFStringGetPascalStringPtr(string, __CFStringGetEightBitStringEncoding()))) {
663 ptr = (cString += (rangeLoc + 1));
664 if (__CFStringGetEightBitStringEncoding() == encoding) {
665 numCharsProcessed = (rangeLen < max || buffer == NULL ? rangeLen : max);
666 if (buffer) memmove(buffer, cString, numCharsProcessed);
667 if (usedBufLen) *usedBufLen = numCharsProcessed;
668 return numCharsProcessed;
669 }
670 while (*ptr < 0x80 && rangeLen > 0) {
671 ++ptr;
672 --rangeLen;
673 }
674 numCharsProcessed = ptr - cString;
675 if (buffer) {
676 numCharsProcessed = (numCharsProcessed < max ? numCharsProcessed : max);
677 memmove(buffer, cString, numCharsProcessed);
678 buffer += numCharsProcessed;
679 max -= numCharsProcessed;
680 }
681 if (!rangeLen || (buffer && (max == 0))) {
682 if (usedBufLen) *usedBufLen = numCharsProcessed;
683 return numCharsProcessed;
684 }
685 rangeLoc += numCharsProcessed;
686 totalBytesWritten += numCharsProcessed;
687 }
688 }
689
690 if (!buffer) max = 0;
691
692 // Special case for Foundation. When lossByte == 0xFF && encoding kCFStringEncodingASCII, we do the default ASCII fallback conversion
693 // Aki 11/24/04 __CFGetASCIICompatibleFlag() is called only for non-ASCII superset encodings. Otherwise, it could lead to a deadlock (see 3890536).
694 flags = (lossByte ? ((unsigned char)lossByte == 0xFF && encoding == kCFStringEncodingASCII ? kCFStringEncodingAllowLossyConversion : CFStringEncodingLossyByteToMask(lossByte)) : 0) | (generatingExternalFile ? kCFStringEncodingPrependBOM : 0) | (isASCIISuperset ? 0 : __CFGetASCIICompatibleFlag());
695
696 if (!cString && (cString = (const unsigned char *)CFStringGetCharactersPtr(string))) { // Must be Unicode string
697 if (CFStringEncodingIsValidEncoding(encoding)) { // Converter available in CF
698 CFStringEncodingUnicodeToBytes(encoding, flags, (const UniChar *)cString + rangeLoc, rangeLen, &numCharsProcessed, buffer, max, &totalBytesWritten);
699 } else {
700 return 0;
701 }
702 } else {
703 UniChar charBuf[kCFCharConversionBufferLength];
704 CFIndex currentLength;
705 CFIndex usedLen;
706 CFIndex lastUsedLen = 0, lastNumChars = 0;
707 uint32_t result;
708 Boolean isCFBuiltin = CFStringEncodingIsValidEncoding(encoding);
709 #define MAX_DECOMP_LEN (6)
710
711 while (rangeLen > 0) {
712 currentLength = (rangeLen > kCFCharConversionBufferLength ? kCFCharConversionBufferLength : rangeLen);
713 CFStringGetCharacters(string, CFRangeMake(rangeLoc, currentLength), charBuf);
714
715 // could be in the middle of surrogate pair; back up.
716 if ((rangeLen > kCFCharConversionBufferLength) && CFUniCharIsSurrogateHighCharacter(charBuf[kCFCharConversionBufferLength - 1])) --currentLength;
717
718 if (isCFBuiltin) { // Converter available in CF
719 if ((result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, currentLength, &numChars, buffer, max, &usedLen)) != kCFStringEncodingConversionSuccess) {
720 if (kCFStringEncodingInvalidInputStream == result) {
721 CFRange composedRange;
722 // Check the tail
723 if ((rangeLen > kCFCharConversionBufferLength) && ((currentLength - numChars) < MAX_DECOMP_LEN)) {
724 composedRange = CFStringGetRangeOfComposedCharactersAtIndex(string, rangeLoc + currentLength);
725
726 if ((composedRange.length <= MAX_DECOMP_LEN) && (composedRange.location < (rangeLoc + numChars))) {
727 result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, composedRange.location - rangeLoc, &numChars, buffer, max, &usedLen);
728 }
729 }
730
731 // Check the head
732 if ((kCFStringEncodingConversionSuccess != result) && (lastNumChars > 0) && (numChars < MAX_DECOMP_LEN)) {
733 composedRange = CFStringGetRangeOfComposedCharactersAtIndex(string, rangeLoc);
734
735 if ((composedRange.length <= MAX_DECOMP_LEN) && (composedRange.location < rangeLoc)) {
736 // Try if the composed range can be converted
737 CFStringGetCharacters(string, composedRange, charBuf);
738
739 if (CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, composedRange.length, &numChars, NULL, 0, &usedLen) == kCFStringEncodingConversionSuccess) { // OK let's try the last run
740 CFIndex lastRangeLoc = rangeLoc - lastNumChars;
741
742 currentLength = composedRange.location - lastRangeLoc;
743 CFStringGetCharacters(string, CFRangeMake(lastRangeLoc, currentLength), charBuf);
744
745 if ((result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, currentLength, &numChars, (max ? buffer - lastUsedLen : NULL), (max ? max + lastUsedLen : 0), &usedLen)) == kCFStringEncodingConversionSuccess) { // OK let's try the last run
746 // Looks good. back up
747 totalBytesWritten -= lastUsedLen;
748 numCharsProcessed -= lastNumChars;
749
750 rangeLoc = lastRangeLoc;
751 rangeLen += lastNumChars;
752
753 if (max) {
754 buffer -= lastUsedLen;
755 max += lastUsedLen;
756 }
757 }
758 }
759 }
760 }
761 }
762
763 if (kCFStringEncodingConversionSuccess != result) { // really failed
764 totalBytesWritten += usedLen;
765 numCharsProcessed += numChars;
766 break;
767 }
768 }
769 } else {
770 return 0;
771 }
772
773 totalBytesWritten += usedLen;
774 numCharsProcessed += numChars;
775
776 rangeLoc += numChars;
777 rangeLen -= numChars;
778 if (max) {
779 buffer += usedLen;
780 max -= usedLen;
781 if (max <= 0) break;
782 }
783 lastUsedLen = usedLen; lastNumChars = numChars;
784 flags &= ~kCFStringEncodingPrependBOM;
785 }
786 }
787 }
788 if (usedBufLen) *usedBufLen = totalBytesWritten;
789 return numCharsProcessed;
790 }
791
792 CFStringRef CFStringCreateWithFileSystemRepresentation(CFAllocatorRef alloc, const char *buffer) {
793 return CFStringCreateWithCString(alloc, buffer, CFStringFileSystemEncoding());
794 }
795
796 CFIndex CFStringGetMaximumSizeOfFileSystemRepresentation(CFStringRef string) {
797 CFIndex len = CFStringGetLength(string);
798 CFStringEncoding enc = CFStringGetFastestEncoding(string);
799 switch (enc) {
800 case kCFStringEncodingASCII:
801 case kCFStringEncodingMacRoman:
802 return len * 3 + 1;
803 default:
804 return len * 9 + 1;
805 }
806 }
807
808 Boolean CFStringGetFileSystemRepresentation(CFStringRef string, char *buffer, CFIndex maxBufLen) {
809 #if DEPLOYMENT_TARGET_MACOSX
810 #define MAX_STACK_BUFFER_LEN (255)
811 const UTF16Char *characters = CFStringGetCharactersPtr(string);
812 const char *bufferLimit = buffer + maxBufLen;
813 CFIndex length = CFStringGetLength(string);
814 CFIndex usedBufLen;
815
816 if (maxBufLen < length) return false; // Since we're using UTF-8, the byte length is never shorter than the char length. Also, it filters out 0 == maxBufLen
817
818 if (NULL == characters) {
819 UTF16Char charactersBuffer[MAX_STACK_BUFFER_LEN];
820 CFRange range = CFRangeMake(0, 0);
821 const char *bytes = CFStringGetCStringPtr(string, __CFStringGetEightBitStringEncoding());
822
823 if (NULL != bytes) {
824 const char *originalBytes = bytes;
825 const char *bytesLimit = bytes + length;
826
827 while ((bytes < bytesLimit) && (buffer < bufferLimit) && (0 == (*bytes & 0x80))) *(buffer++) = *(bytes++);
828
829 range.location = bytes - originalBytes;
830 }
831 while ((range.location < length) && (buffer < bufferLimit)) {
832 range.length = length - range.location;
833 if (range.length > MAX_STACK_BUFFER_LEN) range.length = MAX_STACK_BUFFER_LEN;
834
835 CFStringGetCharacters(string, range, charactersBuffer);
836 if ((range.length == MAX_STACK_BUFFER_LEN) && CFUniCharIsSurrogateHighCharacter(charactersBuffer[MAX_STACK_BUFFER_LEN - 1])) --range.length; // Backup for a high surrogate
837
838 if (!CFUniCharDecompose(charactersBuffer, range.length, NULL, (void *)buffer, bufferLimit - buffer, &usedBufLen, true, kCFUniCharUTF8Format, true)) return false;
839
840 buffer += usedBufLen;
841 range.location += range.length;
842 }
843 } else {
844 if (!CFUniCharDecompose(characters, length, NULL, (void *)buffer, maxBufLen, &usedBufLen, true, kCFUniCharUTF8Format, true)) return false;
845 buffer += usedBufLen;
846 }
847
848 if (buffer < bufferLimit) { // Since the filename has its own limit, this is ok for now
849 *buffer = '\0';
850 return true;
851 } else {
852 return false;
853 }
854 #else __MACH__
855 return CFStringGetCString(string, buffer, maxBufLen, CFStringFileSystemEncoding());
856 #endif __MACH__
857 }
858
859 Boolean _CFStringGetFileSystemRepresentation(CFStringRef string, uint8_t *buffer, CFIndex maxBufLen) {
860 return CFStringGetFileSystemRepresentation(string, (char *)buffer, maxBufLen);
861 }
862