2 * Copyright (c) 2008 Apple Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
23 /* CFStringEncodings.c
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Aki Inoue
28 #include "CFInternal.h"
29 #include <CoreFoundation/CFString.h>
30 #include <CoreFoundation/CFByteOrder.h>
33 #include "CFStringEncodingConverterExt.h"
34 #include "CFUniChar.h"
35 #include "CFUnicodeDecomposition.h"
37 static UInt32 __CFWantsToUseASCIICompatibleConversion
= (UInt32
)-1;
38 CF_INLINE UInt32
__CFGetASCIICompatibleFlag(void) {
39 if (__CFWantsToUseASCIICompatibleConversion
== (UInt32
)-1) {
40 __CFWantsToUseASCIICompatibleConversion
= false;
42 return (__CFWantsToUseASCIICompatibleConversion
? kCFStringEncodingASCIICompatibleConversion
: 0);
45 void _CFStringEncodingSetForceASCIICompatibility(Boolean flag
) {
46 __CFWantsToUseASCIICompatibleConversion
= (flag
? (UInt32
)true : (UInt32
)false);
49 Boolean (*__CFCharToUniCharFunc
)(UInt32 flags
, uint8_t ch
, UniChar
*unicodeChar
) = NULL
;
51 // To avoid early initialization issues, we just initialize this here
52 // This should not be const as it is changed
53 UniChar __CFCharToUniCharTable
[256] = {
54 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
55 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
56 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
57 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
58 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
59 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
60 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
61 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
62 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
63 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
64 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
65 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
66 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
67 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
68 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
69 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
72 void __CFSetCharToUniCharFunc(Boolean (*func
)(UInt32 flags
, UInt8 ch
, UniChar
*unicodeChar
)) {
73 if (__CFCharToUniCharFunc
!= func
) {
75 __CFCharToUniCharFunc
= func
;
77 for (ch
= 128; ch
< 256; ch
++) {
79 __CFCharToUniCharTable
[ch
] = (__CFCharToUniCharFunc(0, ch
, &uch
) ? uch
: 0xFFFD);
81 } else { // If we have no __CFCharToUniCharFunc, assume 128..255 return the value as-is
82 for (ch
= 128; ch
< 256; ch
++) __CFCharToUniCharTable
[ch
] = ch
;
87 __private_extern__
void __CFStrConvertBytesToUnicode(const uint8_t *bytes
, UniChar
*buffer
, CFIndex numChars
) {
89 for (idx
= 0; idx
< numChars
; idx
++) buffer
[idx
] = __CFCharToUniCharTable
[bytes
[idx
]];
/* The minimum length the output buffers should be in the above functions
*/
#define kCFCharConversionBufferLength 512

/* Capacity of a CFVarWidthCharBuffer's inline storage, counted in bytes and
   in UniChars respectively. Both expand to an expression on a variable named
   `buffer`, so they are only usable where such a pointer is in scope. */
#define MAX_LOCAL_CHARS		(sizeof(buffer->localBuffer) / sizeof(uint8_t))
#define MAX_LOCAL_UNICHARS	(sizeof(buffer->localBuffer) / sizeof(UniChar))
/* Convert a byte stream to ASCII (7-bit!) or Unicode, with a CFVarWidthCharBuffer struct on the stack. A false return indicates that an error occurred during the conversion. The caller needs to free the returned buffer in either ascii or unicode (indicated by isASCII), if shouldFreeChars is true.
   9/18/98 __CFStringDecodeByteStream now avoids allocating a buffer if buffer->chars is not NULL
   Added useClientsMemoryPtr; if not-NULL, and the provided memory can be used as is, this is set to true
   __CFStringDecodeByteStream2() is kept around for any internal clients who might be using it; it should be deprecated
   !!! converterFlags is only used for the UTF8 converter at this point
*/
107 Boolean
__CFStringDecodeByteStream2(const uint8_t *bytes
, UInt32 len
, CFStringEncoding encoding
, Boolean alwaysUnicode
, CFVarWidthCharBuffer
*buffer
, Boolean
*useClientsMemoryPtr
) {
108 return __CFStringDecodeByteStream3(bytes
, len
, encoding
, alwaysUnicode
, buffer
, useClientsMemoryPtr
, 0);
/* States of the kCFStringEncodingNonLossyASCII decoder.
   The Hex and Octal states are counters: the decoder increments the mode once
   per digit consumed and returns to ASCII mode when it reaches the matching
   "Final" value (4 hex digits after \u; 3 octal digits in total, the first of
   which is consumed while still in Backslash mode). */
enum {
    __NSNonLossyErrorMode = -1,
    __NSNonLossyASCIIMode = 0,
    __NSNonLossyBackslashMode = 1,
    __NSNonLossyHexInitialMode = __NSNonLossyBackslashMode + 1,
    __NSNonLossyHexFinalMode = __NSNonLossyHexInitialMode + 4,
    __NSNonLossyOctalInitialMode = __NSNonLossyHexFinalMode + 1,
    __NSNonLossyOctalFinalMode = __NSNonLossyHexFinalMode + 3
};
121 Boolean
__CFStringDecodeByteStream3(const uint8_t *bytes
, CFIndex len
, CFStringEncoding encoding
, Boolean alwaysUnicode
, CFVarWidthCharBuffer
*buffer
, Boolean
*useClientsMemoryPtr
, UInt32 converterFlags
) {
123 if (useClientsMemoryPtr
) *useClientsMemoryPtr
= false;
125 buffer
->isASCII
= !alwaysUnicode
;
126 buffer
->shouldFreeChars
= false;
127 buffer
->numChars
= 0;
129 if (0 == len
) return true;
131 buffer
->allocator
= (buffer
->allocator
? buffer
->allocator
: __CFGetDefaultAllocator());
133 if ((encoding
== kCFStringEncodingUTF16
) || (encoding
== kCFStringEncodingUTF16BE
) || (encoding
== kCFStringEncodingUTF16LE
)) { // UTF-16
134 const UTF16Char
*src
= (const UTF16Char
*)bytes
;
135 const UTF16Char
*limit
= (const UTF16Char
*)(bytes
+ len
);
138 if (kCFStringEncodingUTF16
== encoding
) {
139 UTF16Char bom
= ((*src
== 0xFFFE) || (*src
== 0xFEFF) ? *(src
++) : 0);
141 #if __CF_BIG_ENDIAN__
142 if (bom
== 0xFFFE) swap
= true;
144 if (bom
!= 0xFEFF) swap
= true;
146 if (bom
) useClientsMemoryPtr
= NULL
;
148 #if __CF_BIG_ENDIAN__
149 if (kCFStringEncodingUTF16LE
== encoding
) swap
= true;
151 if (kCFStringEncodingUTF16BE
== encoding
) swap
= true;
155 buffer
->numChars
= limit
- src
;
157 if (useClientsMemoryPtr
&& !swap
) { // If the caller is ready to deal with no-copy situation, and the situation is possible, indicate it...
158 *useClientsMemoryPtr
= true;
159 buffer
->chars
.unicode
= (UniChar
*)src
;
160 buffer
->isASCII
= false;
162 if (buffer
->isASCII
) { // Let's see if we can reduce the Unicode down to ASCII...
163 const UTF16Char
*characters
= src
;
164 UTF16Char mask
= (swap
? 0x80FF : 0xFF80);
166 while (characters
< limit
) {
167 if (*(characters
++) & mask
) {
168 buffer
->isASCII
= false;
174 if (buffer
->isASCII
) {
176 if (NULL
== buffer
->chars
.ascii
) { // we never reallocate when buffer is supplied
177 if (buffer
->numChars
> MAX_LOCAL_CHARS
) {
178 buffer
->chars
.ascii
= (UInt8
*)CFAllocatorAllocate(buffer
->allocator
, (buffer
->numChars
* sizeof(uint8_t)), 0);
179 buffer
->shouldFreeChars
= true;
181 buffer
->chars
.ascii
= (uint8_t *)buffer
->localBuffer
;
184 dst
= buffer
->chars
.ascii
;
187 while (src
< limit
) *(dst
++) = (*(src
++) >> 8);
189 while (src
< limit
) *(dst
++) = (uint8_t)*(src
++);
194 if (NULL
== buffer
->chars
.unicode
) { // we never reallocate when buffer is supplied
195 if (buffer
->numChars
> MAX_LOCAL_UNICHARS
) {
196 buffer
->chars
.unicode
= (UniChar
*)CFAllocatorAllocate(buffer
->allocator
, (buffer
->numChars
* sizeof(UTF16Char
)), 0);
197 buffer
->shouldFreeChars
= true;
199 buffer
->chars
.unicode
= (UTF16Char
*)buffer
->localBuffer
;
202 dst
= buffer
->chars
.unicode
;
205 while (src
< limit
) *(dst
++) = CFSwapInt16(*(src
++));
207 memmove(dst
, src
, buffer
->numChars
* sizeof(UTF16Char
));
211 } else if ((encoding
== kCFStringEncodingUTF32
) || (encoding
== kCFStringEncodingUTF32BE
) || (encoding
== kCFStringEncodingUTF32LE
)) {
212 const UTF32Char
*src
= (const UTF32Char
*)bytes
;
213 const UTF32Char
*limit
= (const UTF32Char
*)(bytes
+ len
);
215 static bool strictUTF32
= (bool)-1;
217 if ((bool)-1 == strictUTF32
) strictUTF32
= (_CFExecutableLinkedOnOrAfter(CFSystemVersionLeopard
) != 0);
219 if (kCFStringEncodingUTF32
== encoding
) {
220 UTF32Char bom
= ((*src
== 0xFFFE0000) || (*src
== 0x0000FEFF) ? *(src
++) : 0);
222 #if __CF_BIG_ENDIAN__
223 if (bom
== 0xFFFE0000) swap
= true;
225 if (bom
!= 0x0000FEFF) swap
= true;
228 #if __CF_BIG_ENDIAN__
229 if (kCFStringEncodingUTF32LE
== encoding
) swap
= true;
231 if (kCFStringEncodingUTF32BE
== encoding
) swap
= true;
235 buffer
->numChars
= limit
- src
;
238 // Let's see if we have non-ASCII or non-BMP
239 const UTF32Char
*characters
= src
;
240 UTF32Char asciiMask
= (swap
? 0x80FFFFFF : 0xFFFFFF80);
241 UTF32Char bmpMask
= (swap
? 0x0000FFFF : 0xFFFF0000);
243 while (characters
< limit
) {
244 if (*characters
& asciiMask
) {
245 buffer
->isASCII
= false;
246 if (*characters
& bmpMask
) {
247 if (strictUTF32
&& ((swap
? (UTF32Char
)CFSwapInt32(*characters
) : *characters
) > 0x10FFFF)) return false; // outside of Unicode Scaler Value
248 ++(buffer
->numChars
);
255 if (buffer
->isASCII
) {
257 if (NULL
== buffer
->chars
.ascii
) { // we never reallocate when buffer is supplied
258 if (buffer
->numChars
> MAX_LOCAL_CHARS
) {
259 buffer
->chars
.ascii
= (UInt8
*)CFAllocatorAllocate(buffer
->allocator
, (buffer
->numChars
* sizeof(uint8_t)), 0);
260 buffer
->shouldFreeChars
= true;
262 buffer
->chars
.ascii
= (uint8_t *)buffer
->localBuffer
;
265 dst
= buffer
->chars
.ascii
;
268 while (src
< limit
) *(dst
++) = (*(src
++) >> 24);
270 while (src
< limit
) *(dst
++) = *(src
++);
273 if (NULL
== buffer
->chars
.unicode
) { // we never reallocate when buffer is supplied
274 if (buffer
->numChars
> MAX_LOCAL_UNICHARS
) {
275 buffer
->chars
.unicode
= (UniChar
*)CFAllocatorAllocate(buffer
->allocator
, (buffer
->numChars
* sizeof(UTF16Char
)), 0);
276 buffer
->shouldFreeChars
= true;
278 buffer
->chars
.unicode
= (UTF16Char
*)buffer
->localBuffer
;
281 return (CFUniCharFromUTF32(src
, limit
- src
, buffer
->chars
.unicode
, (strictUTF32
? false : true), __CF_BIG_ENDIAN__
? !swap
: swap
) ? TRUE
: FALSE
);
285 const uint8_t *chars
= (const uint8_t *)bytes
;
286 const uint8_t *end
= chars
+ len
;
289 case kCFStringEncodingNonLossyASCII
: {
290 UTF16Char currentValue
= 0;
292 int8_t mode
= __NSNonLossyASCIIMode
;
294 buffer
->isASCII
= false;
295 buffer
->shouldFreeChars
= !buffer
->chars
.unicode
&& (len
<= MAX_LOCAL_UNICHARS
) ? false : true;
296 buffer
->chars
.unicode
= (buffer
->chars
.unicode
? buffer
->chars
.unicode
: (len
<= MAX_LOCAL_UNICHARS
) ? (UniChar
*)buffer
->localBuffer
: (UniChar
*)CFAllocatorAllocate(buffer
->allocator
, len
* sizeof(UniChar
), 0));
297 buffer
->numChars
= 0;
299 while (chars
< end
) {
300 character
= (*chars
++);
303 case __NSNonLossyASCIIMode
:
304 if (character
== '\\') {
305 mode
= __NSNonLossyBackslashMode
;
306 } else if (character
< 0x80) {
307 currentValue
= character
;
309 mode
= __NSNonLossyErrorMode
;
313 case __NSNonLossyBackslashMode
:
314 if ((character
== 'U') || (character
== 'u')) {
315 mode
= __NSNonLossyHexInitialMode
;
317 } else if ((character
>= '0') && (character
<= '9')) {
318 mode
= __NSNonLossyOctalInitialMode
;
319 currentValue
= character
- '0';
320 } else if (character
== '\\') {
321 mode
= __NSNonLossyASCIIMode
;
322 currentValue
= character
;
324 mode
= __NSNonLossyErrorMode
;
329 if (mode
< __NSNonLossyHexFinalMode
) {
330 if ((character
>= '0') && (character
<= '9')) {
331 currentValue
= (currentValue
<< 4) | (character
- '0');
332 if (++mode
== __NSNonLossyHexFinalMode
) mode
= __NSNonLossyASCIIMode
;
334 if (character
>= 'a') character
-= ('a' - 'A');
335 if ((character
>= 'A') && (character
<= 'F')) {
336 currentValue
= (currentValue
<< 4) | ((character
- 'A') + 10);
337 if (++mode
== __NSNonLossyHexFinalMode
) mode
= __NSNonLossyASCIIMode
;
339 mode
= __NSNonLossyErrorMode
;
343 if ((character
>= '0') && (character
<= '9')) {
344 currentValue
= (currentValue
<< 3) | (character
- '0');
345 if (++mode
== __NSNonLossyOctalFinalMode
) mode
= __NSNonLossyASCIIMode
;
347 mode
= __NSNonLossyErrorMode
;
353 if (mode
== __NSNonLossyASCIIMode
) {
354 buffer
->chars
.unicode
[buffer
->numChars
++] = currentValue
;
355 } else if (mode
== __NSNonLossyErrorMode
) {
359 return (mode
== __NSNonLossyASCIIMode
);
362 case kCFStringEncodingUTF8
:
363 if ((len
>= 3) && (chars
[0] == 0xef) && (chars
[1] == 0xbb) && (chars
[2] == 0xbf)) { // If UTF8 BOM, skip
366 if (0 == len
) return true;
368 if (buffer
->isASCII
) {
369 for (idx
= 0; idx
< len
; idx
++) {
370 if (128 <= chars
[idx
]) {
371 buffer
->isASCII
= false;
376 if (buffer
->isASCII
) {
377 buffer
->numChars
= len
;
378 buffer
->shouldFreeChars
= !buffer
->chars
.ascii
&& (len
<= MAX_LOCAL_CHARS
) ? false : true;
379 buffer
->chars
.ascii
= (buffer
->chars
.ascii
? buffer
->chars
.ascii
: (len
<= MAX_LOCAL_CHARS
) ? (uint8_t *)buffer
->localBuffer
: (UInt8
*)CFAllocatorAllocate(buffer
->allocator
, len
* sizeof(uint8_t), 0));
380 memmove(buffer
->chars
.ascii
, chars
, len
* sizeof(uint8_t));
383 static CFStringEncodingToUnicodeProc __CFFromUTF8
= NULL
;
386 const CFStringEncodingConverter
*converter
= CFStringEncodingGetConverter(kCFStringEncodingUTF8
);
387 __CFFromUTF8
= (CFStringEncodingToUnicodeProc
)converter
->toUnicode
;
390 buffer
->shouldFreeChars
= !buffer
->chars
.unicode
&& (len
<= MAX_LOCAL_UNICHARS
) ? false : true;
391 buffer
->chars
.unicode
= (buffer
->chars
.unicode
? buffer
->chars
.unicode
: (len
<= MAX_LOCAL_UNICHARS
) ? (UniChar
*)buffer
->localBuffer
: (UniChar
*)CFAllocatorAllocate(buffer
->allocator
, len
* sizeof(UniChar
), 0));
392 buffer
->numChars
= 0;
393 while (chars
< end
) {
395 chars
+= __CFFromUTF8(converterFlags
, chars
, end
- chars
, &(buffer
->chars
.unicode
[buffer
->numChars
]), len
- buffer
->numChars
, &numDone
);
398 if (buffer
->shouldFreeChars
) CFAllocatorDeallocate(buffer
->allocator
, buffer
->chars
.unicode
);
399 buffer
->isASCII
= !alwaysUnicode
;
400 buffer
->shouldFreeChars
= false;
401 buffer
->chars
.ascii
= NULL
;
402 buffer
->numChars
= 0;
405 buffer
->numChars
+= numDone
;
411 if (CFStringEncodingIsValidEncoding(encoding
)) {
412 const CFStringEncodingConverter
*converter
= CFStringEncodingGetConverter(encoding
);
413 Boolean isASCIISuperset
= __CFStringEncodingIsSupersetOfASCII(encoding
);
415 if (!converter
) return false;
417 if (!isASCIISuperset
) buffer
->isASCII
= false;
419 if (buffer
->isASCII
) {
420 for (idx
= 0; idx
< len
; idx
++) {
421 if (128 <= chars
[idx
]) {
422 buffer
->isASCII
= false;
428 if (converter
->encodingClass
== kCFStringEncodingConverterCheapEightBit
) {
429 if (buffer
->isASCII
) {
430 buffer
->numChars
= len
;
431 buffer
->shouldFreeChars
= !buffer
->chars
.ascii
&& (len
<= MAX_LOCAL_CHARS
) ? false : true;
432 buffer
->chars
.ascii
= (buffer
->chars
.ascii
? buffer
->chars
.ascii
: (len
<= MAX_LOCAL_CHARS
) ? (uint8_t *)buffer
->localBuffer
: (UInt8
*)CFAllocatorAllocate(buffer
->allocator
, len
* sizeof(uint8_t), 0));
433 memmove(buffer
->chars
.ascii
, chars
, len
* sizeof(uint8_t));
435 buffer
->shouldFreeChars
= !buffer
->chars
.unicode
&& (len
<= MAX_LOCAL_UNICHARS
) ? false : true;
436 buffer
->chars
.unicode
= (buffer
->chars
.unicode
? buffer
->chars
.unicode
: (len
<= MAX_LOCAL_UNICHARS
) ? (UniChar
*)buffer
->localBuffer
: (UniChar
*)CFAllocatorAllocate(buffer
->allocator
, len
* sizeof(UniChar
), 0));
437 buffer
->numChars
= len
;
438 if (kCFStringEncodingASCII
== encoding
|| kCFStringEncodingISOLatin1
== encoding
) {
439 for (idx
= 0; idx
< len
; idx
++) buffer
->chars
.unicode
[idx
] = (UniChar
)chars
[idx
];
441 for (idx
= 0; idx
< len
; idx
++)
442 if (chars
[idx
] < 0x80 && isASCIISuperset
)
443 buffer
->chars
.unicode
[idx
] = (UniChar
)chars
[idx
];
444 else if (!((CFStringEncodingCheapEightBitToUnicodeProc
)converter
->toUnicode
)(0, chars
[idx
], buffer
->chars
.unicode
+ idx
))
449 if (buffer
->isASCII
) {
450 buffer
->numChars
= len
;
451 buffer
->shouldFreeChars
= !buffer
->chars
.ascii
&& (len
<= MAX_LOCAL_CHARS
) ? false : true;
452 buffer
->chars
.ascii
= (buffer
->chars
.ascii
? buffer
->chars
.ascii
: (len
<= MAX_LOCAL_CHARS
) ? (uint8_t *)buffer
->localBuffer
: (UInt8
*)CFAllocatorAllocate(buffer
->allocator
, len
* sizeof(uint8_t), 0));
453 memmove(buffer
->chars
.ascii
, chars
, len
* sizeof(uint8_t));
455 CFIndex guessedLength
= CFStringEncodingCharLengthForBytes(encoding
, 0, bytes
, len
);
456 static UInt32 lossyFlag
= (UInt32
)-1;
458 buffer
->shouldFreeChars
= !buffer
->chars
.unicode
&& (guessedLength
<= MAX_LOCAL_UNICHARS
) ? false : true;
459 buffer
->chars
.unicode
= (buffer
->chars
.unicode
? buffer
->chars
.unicode
: (guessedLength
<= MAX_LOCAL_UNICHARS
) ? (UniChar
*)buffer
->localBuffer
: (UniChar
*)CFAllocatorAllocate(buffer
->allocator
, guessedLength
* sizeof(UniChar
), 0));
461 if (lossyFlag
== (UInt32
)-1) lossyFlag
= (_CFExecutableLinkedOnOrAfter(CFSystemVersionPanther
) ? 0 : kCFStringEncodingAllowLossyConversion
);
463 if (CFStringEncodingBytesToUnicode(encoding
, lossyFlag
|__CFGetASCIICompatibleFlag(), bytes
, len
, NULL
, buffer
->chars
.unicode
, (guessedLength
> MAX_LOCAL_UNICHARS
? guessedLength
: MAX_LOCAL_UNICHARS
), &(buffer
->numChars
))) {
464 if (buffer
->shouldFreeChars
) CFAllocatorDeallocate(buffer
->allocator
, buffer
->chars
.unicode
);
465 buffer
->isASCII
= !alwaysUnicode
;
466 buffer
->shouldFreeChars
= false;
467 buffer
->chars
.ascii
= NULL
;
468 buffer
->numChars
= 0;
/* Create a byte stream from a CFString backing. Can convert a string piece at a time
   into a fixed size buffer. Returns number of characters converted.
   Characters that cannot be converted to the specified encoding are represented
   with the char specified by lossByte; if 0, then lossy conversion is not allowed
   and conversion stops, returning partial results.
   Pass buffer==NULL if you don't care about the converted string (but just the convertibility,
   or number of bytes required, indicated by usedBufLen).
   Does not zero-terminate. If you want to create Pascal or C string, allow one extra byte at start or end.

   Note: This function is intended to work through CFString functions, so it should work
   with NSStrings as well as CFStrings.
*/
495 CFIndex
__CFStringEncodeByteStream(CFStringRef string
, CFIndex rangeLoc
, CFIndex rangeLen
, Boolean generatingExternalFile
, CFStringEncoding encoding
, char lossByte
, uint8_t *buffer
, CFIndex max
, CFIndex
*usedBufLen
) {
496 CFIndex totalBytesWritten
= 0; /* Number of written bytes */
497 CFIndex numCharsProcessed
= 0; /* Number of processed chars */
498 const UniChar
*unichars
;
500 if (encoding
== kCFStringEncodingUTF8
&& (unichars
= CFStringGetCharactersPtr(string
))) {
501 static CFStringEncodingToBytesProc __CFToUTF8
= NULL
;
504 const CFStringEncodingConverter
*utf8Converter
= CFStringEncodingGetConverter(kCFStringEncodingUTF8
);
505 __CFToUTF8
= (CFStringEncodingToBytesProc
)utf8Converter
->toBytes
;
507 numCharsProcessed
= __CFToUTF8((generatingExternalFile
? kCFStringEncodingPrependBOM
: 0), unichars
+ rangeLoc
, rangeLen
, buffer
, (buffer
? max
: 0), &totalBytesWritten
);
509 } else if (encoding
== kCFStringEncodingNonLossyASCII
) {
510 const char *hex
= "0123456789abcdef";
512 CFStringInlineBuffer buf
;
513 CFStringInitInlineBuffer(string
, &buf
, CFRangeMake(rangeLoc
, rangeLen
));
514 while (numCharsProcessed
< rangeLen
) {
515 CFIndex reqLength
; /* Required number of chars to encode this UniChar */
518 ch
= CFStringGetCharacterFromInlineBuffer(&buf
, numCharsProcessed
);
519 if ((ch
>= ' ' && ch
<= '~' && ch
!= '\\') || (ch
== '\n' || ch
== '\r' || ch
== '\t')) {
526 } else if (ch
< 256) { /* \nnn; note that this is not NEXTSTEP encoding but a (small) UniChar */
527 tmp
[1] = '0' + (ch
>> 6);
528 tmp
[2] = '0' + ((ch
>> 3) & 7);
529 tmp
[3] = '0' + (ch
& 7);
531 } else { /* \Unnnn */
532 tmp
[1] = 'u'; // Changed to small+u in order to be aligned with Java
533 tmp
[2] = hex
[(ch
>> 12) & 0x0f];
534 tmp
[3] = hex
[(ch
>> 8) & 0x0f];
535 tmp
[4] = hex
[(ch
>> 4) & 0x0f];
536 tmp
[5] = hex
[ch
& 0x0f];
542 if (totalBytesWritten
+ reqLength
> max
) break; /* Doesn't fit..
544 for (cnt
= 0; cnt
< reqLength
; cnt
++) {
545 buffer
[totalBytesWritten
+ cnt
] = tmp
[cnt
];
548 totalBytesWritten
+= reqLength
;
551 } else if ((encoding
== kCFStringEncodingUTF16
) || (encoding
== kCFStringEncodingUTF16BE
) || (encoding
== kCFStringEncodingUTF16LE
)) {
552 CFIndex extraForBOM
= (generatingExternalFile
&& (encoding
== kCFStringEncodingUTF16
) ? sizeof(UniChar
) : 0);
553 numCharsProcessed
= rangeLen
;
554 if (buffer
&& (numCharsProcessed
* (CFIndex
)sizeof(UniChar
) + extraForBOM
> max
)) {
555 numCharsProcessed
= (max
> extraForBOM
) ? ((max
- extraForBOM
) / sizeof(UniChar
)) : 0;
557 totalBytesWritten
= (numCharsProcessed
* sizeof(UniChar
)) + extraForBOM
;
559 if (extraForBOM
) { /* Generate BOM */
560 #if __CF_BIG_ENDIAN__
561 *buffer
++ = 0xfe; *buffer
++ = 0xff;
563 *buffer
++ = 0xff; *buffer
++ = 0xfe;
566 CFStringGetCharacters(string
, CFRangeMake(rangeLoc
, numCharsProcessed
), (UniChar
*)buffer
);
567 if ((__CF_BIG_ENDIAN__
? kCFStringEncodingUTF16LE
: kCFStringEncodingUTF16BE
) == encoding
) { // Need to swap
568 UTF16Char
*characters
= (UTF16Char
*)buffer
;
569 const UTF16Char
*limit
= characters
+ numCharsProcessed
;
571 while (characters
< limit
) {
572 *characters
= CFSwapInt16(*characters
);
577 } else if ((encoding
== kCFStringEncodingUTF32
) || (encoding
== kCFStringEncodingUTF32BE
) || (encoding
== kCFStringEncodingUTF32LE
)) {
579 CFStringInlineBuffer buf
;
580 UTF32Char
*characters
= (UTF32Char
*)buffer
;
582 bool swap
= (encoding
== (__CF_BIG_ENDIAN__
? kCFStringEncodingUTF32LE
: kCFStringEncodingUTF32BE
) ? true : false);
583 if (generatingExternalFile
&& (encoding
== kCFStringEncodingUTF32
)) {
584 totalBytesWritten
+= sizeof(UTF32Char
);
586 if (totalBytesWritten
> max
) { // insufficient buffer
587 totalBytesWritten
= 0;
589 *(characters
++) = 0x0000FEFF;
594 CFStringInitInlineBuffer(string
, &buf
, CFRangeMake(rangeLoc
, rangeLen
));
595 while (numCharsProcessed
< rangeLen
) {
596 character
= CFStringGetCharacterFromInlineBuffer(&buf
, numCharsProcessed
);
598 if (CFUniCharIsSurrogateHighCharacter(character
)) {
599 UTF16Char otherCharacter
;
601 if (((numCharsProcessed
+ 1) < rangeLen
) && CFUniCharIsSurrogateLowCharacter((otherCharacter
= CFStringGetCharacterFromInlineBuffer(&buf
, numCharsProcessed
+ 1)))) {
602 character
= CFUniCharGetLongCharacterForSurrogatePair(character
, otherCharacter
);
603 } else if (lossByte
) {
604 character
= lossByte
;
608 } else if (CFUniCharIsSurrogateLowCharacter(character
)) {
610 character
= lossByte
;
616 totalBytesWritten
+= sizeof(UTF32Char
);
619 if (totalBytesWritten
> max
) {
620 totalBytesWritten
-= sizeof(UTF32Char
);
623 *(characters
++) = (swap
? CFSwapInt32(character
) : character
);
626 numCharsProcessed
+= (character
> 0xFFFF ? 2 : 1);
631 const unsigned char *cString
= NULL
;
632 Boolean isASCIISuperset
= __CFStringEncodingIsSupersetOfASCII(encoding
);
634 if (!CF_IS_OBJC(CFStringGetTypeID(), string
) && isASCIISuperset
) { // Checking for NSString to avoid infinite recursion
635 const unsigned char *ptr
;
636 if ((cString
= (const unsigned char *)CFStringGetCStringPtr(string
, __CFStringGetEightBitStringEncoding()))) {
637 ptr
= (cString
+= rangeLoc
);
638 if (__CFStringGetEightBitStringEncoding() == encoding
) {
639 numCharsProcessed
= (rangeLen
< max
|| buffer
== NULL
? rangeLen
: max
);
640 if (buffer
) memmove(buffer
, cString
, numCharsProcessed
);
641 if (usedBufLen
) *usedBufLen
= numCharsProcessed
;
642 return numCharsProcessed
;
644 while (*ptr
< 0x80 && rangeLen
> 0) {
648 numCharsProcessed
= ptr
- cString
;
650 numCharsProcessed
= (numCharsProcessed
< max
? numCharsProcessed
: max
);
651 memmove(buffer
, cString
, numCharsProcessed
);
652 buffer
+= numCharsProcessed
;
653 max
-= numCharsProcessed
;
655 if (!rangeLen
|| (buffer
&& (max
== 0))) {
656 if (usedBufLen
) *usedBufLen
= numCharsProcessed
;
657 return numCharsProcessed
;
659 rangeLoc
+= numCharsProcessed
;
660 totalBytesWritten
+= numCharsProcessed
;
662 if (!cString
&& (cString
= CFStringGetPascalStringPtr(string
, __CFStringGetEightBitStringEncoding()))) {
663 ptr
= (cString
+= (rangeLoc
+ 1));
664 if (__CFStringGetEightBitStringEncoding() == encoding
) {
665 numCharsProcessed
= (rangeLen
< max
|| buffer
== NULL
? rangeLen
: max
);
666 if (buffer
) memmove(buffer
, cString
, numCharsProcessed
);
667 if (usedBufLen
) *usedBufLen
= numCharsProcessed
;
668 return numCharsProcessed
;
670 while (*ptr
< 0x80 && rangeLen
> 0) {
674 numCharsProcessed
= ptr
- cString
;
676 numCharsProcessed
= (numCharsProcessed
< max
? numCharsProcessed
: max
);
677 memmove(buffer
, cString
, numCharsProcessed
);
678 buffer
+= numCharsProcessed
;
679 max
-= numCharsProcessed
;
681 if (!rangeLen
|| (buffer
&& (max
== 0))) {
682 if (usedBufLen
) *usedBufLen
= numCharsProcessed
;
683 return numCharsProcessed
;
685 rangeLoc
+= numCharsProcessed
;
686 totalBytesWritten
+= numCharsProcessed
;
690 if (!buffer
) max
= 0;
692 // Special case for Foundation. When lossByte == 0xFF && encoding kCFStringEncodingASCII, we do the default ASCII fallback conversion
693 // Aki 11/24/04 __CFGetASCIICompatibleFlag() is called only for non-ASCII superset encodings. Otherwise, it could lead to a deadlock (see 3890536).
694 flags
= (lossByte
? ((unsigned char)lossByte
== 0xFF && encoding
== kCFStringEncodingASCII
? kCFStringEncodingAllowLossyConversion
: CFStringEncodingLossyByteToMask(lossByte
)) : 0) | (generatingExternalFile
? kCFStringEncodingPrependBOM
: 0) | (isASCIISuperset
? 0 : __CFGetASCIICompatibleFlag());
696 if (!cString
&& (cString
= (const unsigned char *)CFStringGetCharactersPtr(string
))) { // Must be Unicode string
697 if (CFStringEncodingIsValidEncoding(encoding
)) { // Converter available in CF
698 CFStringEncodingUnicodeToBytes(encoding
, flags
, (const UniChar
*)cString
+ rangeLoc
, rangeLen
, &numCharsProcessed
, buffer
, max
, &totalBytesWritten
);
703 UniChar charBuf
[kCFCharConversionBufferLength
];
704 CFIndex currentLength
;
706 CFIndex lastUsedLen
= 0, lastNumChars
= 0;
708 Boolean isCFBuiltin
= CFStringEncodingIsValidEncoding(encoding
);
709 #define MAX_DECOMP_LEN (6)
711 while (rangeLen
> 0) {
712 currentLength
= (rangeLen
> kCFCharConversionBufferLength
? kCFCharConversionBufferLength
: rangeLen
);
713 CFStringGetCharacters(string
, CFRangeMake(rangeLoc
, currentLength
), charBuf
);
715 // could be in the middle of surrogate pair; back up.
716 if ((rangeLen
> kCFCharConversionBufferLength
) && CFUniCharIsSurrogateHighCharacter(charBuf
[kCFCharConversionBufferLength
- 1])) --currentLength
;
718 if (isCFBuiltin
) { // Converter available in CF
719 if ((result
= CFStringEncodingUnicodeToBytes(encoding
, flags
, charBuf
, currentLength
, &numChars
, buffer
, max
, &usedLen
)) != kCFStringEncodingConversionSuccess
) {
720 if (kCFStringEncodingInvalidInputStream
== result
) {
721 CFRange composedRange
;
723 if ((rangeLen
> kCFCharConversionBufferLength
) && ((currentLength
- numChars
) < MAX_DECOMP_LEN
)) {
724 composedRange
= CFStringGetRangeOfComposedCharactersAtIndex(string
, rangeLoc
+ currentLength
);
726 if ((composedRange
.length
<= MAX_DECOMP_LEN
) && (composedRange
.location
< (rangeLoc
+ numChars
))) {
727 result
= CFStringEncodingUnicodeToBytes(encoding
, flags
, charBuf
, composedRange
.location
- rangeLoc
, &numChars
, buffer
, max
, &usedLen
);
732 if ((kCFStringEncodingConversionSuccess
!= result
) && (lastNumChars
> 0) && (numChars
< MAX_DECOMP_LEN
)) {
733 composedRange
= CFStringGetRangeOfComposedCharactersAtIndex(string
, rangeLoc
);
735 if ((composedRange
.length
<= MAX_DECOMP_LEN
) && (composedRange
.location
< rangeLoc
)) {
736 // Try if the composed range can be converted
737 CFStringGetCharacters(string
, composedRange
, charBuf
);
739 if (CFStringEncodingUnicodeToBytes(encoding
, flags
, charBuf
, composedRange
.length
, &numChars
, NULL
, 0, &usedLen
) == kCFStringEncodingConversionSuccess
) { // OK let's try the last run
740 CFIndex lastRangeLoc
= rangeLoc
- lastNumChars
;
742 currentLength
= composedRange
.location
- lastRangeLoc
;
743 CFStringGetCharacters(string
, CFRangeMake(lastRangeLoc
, currentLength
), charBuf
);
745 if ((result
= CFStringEncodingUnicodeToBytes(encoding
, flags
, charBuf
, currentLength
, &numChars
, (max
? buffer
- lastUsedLen
: NULL
), (max
? max
+ lastUsedLen
: 0), &usedLen
)) == kCFStringEncodingConversionSuccess
) { // OK let's try the last run
746 // Looks good. back up
747 totalBytesWritten
-= lastUsedLen
;
748 numCharsProcessed
-= lastNumChars
;
750 rangeLoc
= lastRangeLoc
;
751 rangeLen
+= lastNumChars
;
754 buffer
-= lastUsedLen
;
763 if (kCFStringEncodingConversionSuccess
!= result
) { // really failed
764 totalBytesWritten
+= usedLen
;
765 numCharsProcessed
+= numChars
;
773 totalBytesWritten
+= usedLen
;
774 numCharsProcessed
+= numChars
;
776 rangeLoc
+= numChars
;
777 rangeLen
-= numChars
;
783 lastUsedLen
= usedLen
; lastNumChars
= numChars
;
784 flags
&= ~kCFStringEncodingPrependBOM
;
788 if (usedBufLen
) *usedBufLen
= totalBytesWritten
;
789 return numCharsProcessed
;
792 CFStringRef
CFStringCreateWithFileSystemRepresentation(CFAllocatorRef alloc
, const char *buffer
) {
793 return CFStringCreateWithCString(alloc
, buffer
, CFStringFileSystemEncoding());
796 CFIndex
CFStringGetMaximumSizeOfFileSystemRepresentation(CFStringRef string
) {
797 CFIndex len
= CFStringGetLength(string
);
798 CFStringEncoding enc
= CFStringGetFastestEncoding(string
);
800 case kCFStringEncodingASCII
:
801 case kCFStringEncodingMacRoman
:
808 Boolean
CFStringGetFileSystemRepresentation(CFStringRef string
, char *buffer
, CFIndex maxBufLen
) {
809 #if DEPLOYMENT_TARGET_MACOSX
810 #define MAX_STACK_BUFFER_LEN (255)
811 const UTF16Char
*characters
= CFStringGetCharactersPtr(string
);
812 const char *bufferLimit
= buffer
+ maxBufLen
;
813 CFIndex length
= CFStringGetLength(string
);
816 if (maxBufLen
< length
) return false; // Since we're using UTF-8, the byte length is never shorter than the char length. Also, it filters out 0 == maxBufLen
818 if (NULL
== characters
) {
819 UTF16Char charactersBuffer
[MAX_STACK_BUFFER_LEN
];
820 CFRange range
= CFRangeMake(0, 0);
821 const char *bytes
= CFStringGetCStringPtr(string
, __CFStringGetEightBitStringEncoding());
824 const char *originalBytes
= bytes
;
825 const char *bytesLimit
= bytes
+ length
;
827 while ((bytes
< bytesLimit
) && (buffer
< bufferLimit
) && (0 == (*bytes
& 0x80))) *(buffer
++) = *(bytes
++);
829 range
.location
= bytes
- originalBytes
;
831 while ((range
.location
< length
) && (buffer
< bufferLimit
)) {
832 range
.length
= length
- range
.location
;
833 if (range
.length
> MAX_STACK_BUFFER_LEN
) range
.length
= MAX_STACK_BUFFER_LEN
;
835 CFStringGetCharacters(string
, range
, charactersBuffer
);
836 if ((range
.length
== MAX_STACK_BUFFER_LEN
) && CFUniCharIsSurrogateHighCharacter(charactersBuffer
[MAX_STACK_BUFFER_LEN
- 1])) --range
.length
; // Backup for a high surrogate
838 if (!CFUniCharDecompose(charactersBuffer
, range
.length
, NULL
, (void *)buffer
, bufferLimit
- buffer
, &usedBufLen
, true, kCFUniCharUTF8Format
, true)) return false;
840 buffer
+= usedBufLen
;
841 range
.location
+= range
.length
;
844 if (!CFUniCharDecompose(characters
, length
, NULL
, (void *)buffer
, maxBufLen
, &usedBufLen
, true, kCFUniCharUTF8Format
, true)) return false;
845 buffer
+= usedBufLen
;
848 if (buffer
< bufferLimit
) { // Since the filename has its own limit, this is ok for now
855 return CFStringGetCString(string
, buffer
, maxBufLen
, CFStringFileSystemEncoding());
859 Boolean
_CFStringGetFileSystemRepresentation(CFStringRef string
, uint8_t *buffer
, CFIndex maxBufLen
) {
860 return CFStringGetFileSystemRepresentation(string
, (char *)buffer
, maxBufLen
);