]> git.saurik.com Git - apple/cf.git/blob - String.subproj/CFStringEncodings.c
CF-368.25.tar.gz
[apple/cf.git] / String.subproj / CFStringEncodings.c
1 /*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* CFStringEncodings.c
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Aki Inoue
26 */
27
28 #include "CFInternal.h"
29 #include <CoreFoundation/CFString.h>
30 #include <CoreFoundation/CFByteOrder.h>
31 #include "CFUtilitiesPriv.h"
32 #include <string.h>
33 #include "CFStringEncodingConverterExt.h"
34 #include "CFUniChar.h"
35 #include "CFUnicodeDecomposition.h"
36
37 static UInt32 __CFWantsToUseASCIICompatibleConversion = (UInt32)-1;
38 CF_INLINE UInt32 __CFGetASCIICompatibleFlag(void) {
39 if (__CFWantsToUseASCIICompatibleConversion == (UInt32)-1) {
40 __CFWantsToUseASCIICompatibleConversion = false;
41 }
42 return (__CFWantsToUseASCIICompatibleConversion ? kCFStringEncodingASCIICompatibleConversion : 0);
43 }
44
45 void _CFStringEncodingSetForceASCIICompatibility(Boolean flag) {
46 __CFWantsToUseASCIICompatibleConversion = (flag ? (UInt32)true : (UInt32)false);
47 }
48
49 Boolean (*__CFCharToUniCharFunc)(UInt32 flags, uint8_t ch, UniChar *unicodeChar) = NULL;
50
51 // To avoid early initialization issues, we just initialize this here
52 // This should not be const as it is changed
53 UniChar __CFCharToUniCharTable[256] = {
54 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
55 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
56 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
57 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
58 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
59 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
60 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
61 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
62 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
63 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
64 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
65 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
66 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
67 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
68 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
69 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
70 };
71
72 void __CFSetCharToUniCharFunc(Boolean (*func)(UInt32 flags, UInt8 ch, UniChar *unicodeChar)) {
73 if (__CFCharToUniCharFunc != func) {
74 int ch;
75 __CFCharToUniCharFunc = func;
76 if (func) {
77 for (ch = 128; ch < 256; ch++) {
78 UniChar uch;
79 __CFCharToUniCharTable[ch] = (__CFCharToUniCharFunc(0, ch, &uch) ? uch : 0xFFFD);
80 }
81 } else { // If we have no __CFCharToUniCharFunc, assume 128..255 return the value as-is
82 for (ch = 128; ch < 256; ch++) __CFCharToUniCharTable[ch] = ch;
83 }
84 }
85 }
86
87 __private_extern__ void __CFStrConvertBytesToUnicode(const uint8_t *bytes, UniChar *buffer, CFIndex numChars) {
88 CFIndex idx;
89 for (idx = 0; idx < numChars; idx++) buffer[idx] = __CFCharToUniCharTable[bytes[idx]];
90 }
91
92
93 /* Length, in UniChars, of the scratch buffer used while converting a string
   piece by piece; __CFStringEncodeByteStream() sizes its on-stack charBuf with it.
94 */
95 #define kCFCharConversionBufferLength 512
96
97
/* Capacity of the CFVarWidthCharBuffer's embedded localBuffer, expressed in
   8-bit characters and in UniChars respectively.
   Both macros expand to an expression that refers to a variable named `buffer`,
   so they can only be used where such a variable (with a localBuffer member) is in scope.
*/
98 #define MAX_LOCAL_CHARS (sizeof(buffer->localBuffer) / sizeof(uint8_t))
99 #define MAX_LOCAL_UNICHARS (sizeof(buffer->localBuffer) / sizeof(UniChar))
100
101 /* Convert a byte stream to ASCII (7-bit!) or Unicode, with a CFVarWidthCharBuffer struct on the stack. false return indicates an error occured during the conversion. The caller needs to free the returned buffer in either ascii or unicode (indicated by isASCII), if shouldFreeChars is true.
102 9/18/98 __CFStringDecodeByteStream now avoids to allocate buffer if buffer->chars is not NULL
103 Added useClientsMemoryPtr; if not-NULL, and the provided memory can be used as is, this is set to true
104 __CFStringDecodeByteStream2() is kept around for any internal clients who might be using it; it should be deprecated
105 !!! converterFlags is only used for the UTF8 converter at this point
106 */
107 Boolean __CFStringDecodeByteStream2(const uint8_t *bytes, UInt32 len, CFStringEncoding encoding, Boolean alwaysUnicode, CFVarWidthCharBuffer *buffer, Boolean *useClientsMemoryPtr) {
108 return __CFStringDecodeByteStream3(bytes, len, encoding, alwaysUnicode, buffer, useClientsMemoryPtr, 0);
109 }
110
/* Parser states used while decoding kCFStringEncodingNonLossyASCII.
   The hex states form a run of four steps (one per hex digit after \u) and
   the octal states a run of two steps (the first octal digit is consumed in
   the backslash state); reaching a "Final" value means the escape is complete. */
enum {
    __NSNonLossyErrorMode        = -1,                                /* invalid input seen */
    __NSNonLossyASCIIMode        = 0,                                 /* plain character expected */
    __NSNonLossyBackslashMode    = __NSNonLossyASCIIMode + 1,         /* just consumed a backslash */
    __NSNonLossyHexInitialMode   = __NSNonLossyBackslashMode + 1,     /* expecting first hex digit */
    __NSNonLossyHexFinalMode     = __NSNonLossyHexInitialMode + 4,    /* all four hex digits read */
    __NSNonLossyOctalInitialMode = __NSNonLossyHexFinalMode + 1,      /* first octal digit read */
    __NSNonLossyOctalFinalMode   = __NSNonLossyOctalInitialMode + 2   /* all three octal digits read */
};
120
/* Decodes `len` bytes at `bytes`, interpreted as `encoding`, into `buffer`.
   On success buffer->isASCII tells whether the result is in buffer->chars.ascii (7-bit) or in
   buffer->chars.unicode (UTF-16), and buffer->numChars holds the number of characters produced.

   alwaysUnicode        when true the ASCII representation is never attempted (isASCII starts out false).
   buffer               if buffer->chars is non-NULL on entry it is used as the destination and nothing is
                        allocated; otherwise buffer->localBuffer is used when the result fits, and storage
                        from buffer->allocator (the default allocator when NULL) when it does not.
   useClientsMemoryPtr  optional; cleared on entry and set to true only on the UTF-16 path when the caller's
                        bytes can be used in place, in which case buffer->chars.unicode points into `bytes`.
   converterFlags       passed to the UTF-8 converter only.

   Returns true on success (len == 0 succeeds and produces no characters); returns false when the bytes
   cannot be decoded or no converter is available for `encoding`.
*/
121 Boolean __CFStringDecodeByteStream3(const uint8_t *bytes, UInt32 len, CFStringEncoding encoding, Boolean alwaysUnicode, CFVarWidthCharBuffer *buffer, Boolean *useClientsMemoryPtr, UInt32 converterFlags) {
122
123 if (useClientsMemoryPtr) *useClientsMemoryPtr = false;
124
// Start by assuming the result can be ASCII unless the caller insists on Unicode
125 buffer->isASCII = !alwaysUnicode;
126 buffer->shouldFreeChars = false;
127 buffer->numChars = 0;
128
129 if (0 == len) return true;
130
131 buffer->allocator = (buffer->allocator ? buffer->allocator : __CFGetDefaultAllocator());
132
133 if ((encoding == kCFStringEncodingUTF16) || (encoding == kCFStringEncodingUTF16BE) || (encoding == kCFStringEncodingUTF16LE)) { // UTF-16
134 const UTF16Char *src = (const UTF16Char *)bytes;
135 const UTF16Char *limit = (const UTF16Char *)(bytes + len);
136 bool swap = false;
137
138 if (kCFStringEncodingUTF16 == encoding) {
// Generic UTF-16: a leading BOM (in either byte order) is consumed and decides whether to swap.
// Without a BOM, bom is 0: no swap on big-endian builds, swap on little-endian builds.
139 UTF16Char bom = ((*src == 0xFFFE) || (*src == 0xFEFF) ? *(src++) : 0);
140
141 #if defined(__BIG_ENDIAN__)
142 if (bom == 0xFFFE) swap = true;
143 #else
144 if (bom != 0xFEFF) swap = true;
145 #endif
// A BOM was consumed: clear the local pointer so the no-copy path below is not taken
146 if (bom) useClientsMemoryPtr = NULL;
147 } else {
148 #if defined(__BIG_ENDIAN__)
149 if (kCFStringEncodingUTF16LE == encoding) swap = true;
150 #else
151 if (kCFStringEncodingUTF16BE == encoding) swap = true;
152 #endif
153 }
154
155 buffer->numChars = limit - src;
156
157 if (useClientsMemoryPtr && !swap) { // If the caller is ready to deal with no-copy situation, and the situation is possible, indicate it...
158 *useClientsMemoryPtr = true;
159 buffer->chars.unicode = (UniChar *)src;
160 buffer->isASCII = false;
161 } else {
162 if (buffer->isASCII) { // Let's see if we can reduce the Unicode down to ASCII...
163 const UTF16Char *characters = src;
// The units are tested without swapping first, so for swapped input the mask is mirrored
164 UTF16Char mask = (swap ? 0x80FF : 0xFF80);
165
166 while (characters < limit) {
167 if (*(characters++) & mask) {
168 buffer->isASCII = false;
169 break;
170 }
171 }
172 }
173
174 if (buffer->isASCII) {
175 uint8_t *dst;
176 if (NULL == buffer->chars.ascii) { // we never reallocate when buffer is supplied
177 if (buffer->numChars > MAX_LOCAL_CHARS) {
178 buffer->chars.ascii = CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(uint8_t)), 0);
179 buffer->shouldFreeChars = true;
180 } else {
181 buffer->chars.ascii = (uint8_t *)buffer->localBuffer;
182 }
183 }
184 dst = buffer->chars.ascii;
185
// Narrow each unit to one byte; for swapped input the character value is in the high byte
186 if (swap) {
187 while (src < limit) *(dst++) = (*(src++) >> 8);
188 } else {
189 while (src < limit) *(dst++) = *(src++);
190 }
191 } else {
192 UTF16Char *dst;
193
194 if (NULL == buffer->chars.unicode) { // we never reallocate when buffer is supplied
195 if (buffer->numChars > MAX_LOCAL_UNICHARS) {
196 buffer->chars.unicode = CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(UTF16Char)), 0);
197 buffer->shouldFreeChars = true;
198 } else {
199 buffer->chars.unicode = (UTF16Char *)buffer->localBuffer;
200 }
201 }
202 dst = buffer->chars.unicode;
203
204 if (swap) {
205 while (src < limit) *(dst++) = CFSwapInt16(*(src++));
206 } else {
207 memmove(dst, src, buffer->numChars * sizeof(UTF16Char));
208 }
209 }
210 }
211 } else if ((encoding == kCFStringEncodingUTF32) || (encoding == kCFStringEncodingUTF32BE) || (encoding == kCFStringEncodingUTF32LE)) {
// UTF-32: same BOM / byte-order handling as UTF-16 above, but no no-copy path
212 const UTF32Char *src = (const UTF32Char *)bytes;
213 const UTF32Char *limit = (const UTF32Char *)(bytes + len);
214 bool swap = false;
215
216 if (kCFStringEncodingUTF32 == encoding) {
217 UTF32Char bom = ((*src == 0xFFFE0000) || (*src == 0x0000FEFF) ? *(src++) : 0);
218
219 #if defined(__BIG_ENDIAN__)
220 if (bom == 0xFFFE0000) swap = true;
221 #else
222 if (bom != 0x0000FEFF) swap = true;
223 #endif
224 } else {
225 #if defined(__BIG_ENDIAN__)
226 if (kCFStringEncodingUTF32LE == encoding) swap = true;
227 #else
228 if (kCFStringEncodingUTF32BE == encoding) swap = true;
229 #endif
230 }
231
232 buffer->numChars = limit - src;
233
234 {
235 // Let's see if we have non-ASCII or non-BMP
236 const UTF32Char *characters = src;
237 UTF32Char asciiMask = (swap ? 0x80FFFFFF : 0xFFFFFF80);
238 UTF32Char bmpMask = (swap ? 0x0000FFFF : 0xFFFF0000);
239
// Unlike the UTF-16 scan this loop does not stop early: every value with bits above the low 16
// (as seen through the swap-aware mask) adds one extra output unit to numChars
240 while (characters < limit) {
241 if (*characters & asciiMask) {
242 buffer->isASCII = false;
243 if (*characters & bmpMask) ++(buffer->numChars);
244 }
245 ++characters;
246 }
247 }
248
249 if (buffer->isASCII) {
250 uint8_t *dst;
251 if (NULL == buffer->chars.ascii) { // we never reallocate when buffer is supplied
252 if (buffer->numChars > MAX_LOCAL_CHARS) {
253 buffer->chars.ascii = CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(uint8_t)), 0);
254 buffer->shouldFreeChars = true;
255 } else {
256 buffer->chars.ascii = (uint8_t *)buffer->localBuffer;
257 }
258 }
259 dst = buffer->chars.ascii;
260
261 if (swap) {
262 while (src < limit) *(dst++) = (*(src++) >> 24);
263 } else {
264 while (src < limit) *(dst++) = *(src++);
265 }
266 } else {
267 if (NULL == buffer->chars.unicode) { // we never reallocate when buffer is supplied
268 if (buffer->numChars > MAX_LOCAL_UNICHARS) {
269 buffer->chars.unicode = CFAllocatorAllocate(buffer->allocator, (buffer->numChars * sizeof(UTF16Char)), 0);
270 buffer->shouldFreeChars = true;
271 } else {
272 buffer->chars.unicode = (UTF16Char *)buffer->localBuffer;
273 }
274 }
// NOTE(review): the last argument is !swap on big-endian builds and swap on little-endian builds;
// presumably it tells CFUniCharFromUTF32 that the input is big-endian -- confirm against its declaration
275 CFUniCharFromUTF32(src, limit - src, buffer->chars.unicode, false,
276 #if defined(__BIG_ENDIAN__)
277 !swap
278 #else
279 swap
280 #endif
281 );
282 }
283 } else {
// Byte-oriented encodings
284 UInt32 idx;
285 const uint8_t *chars = (const uint8_t *)bytes;
286 const uint8_t *end = chars + len;
287
288 switch (encoding) {
289 case kCFStringEncodingNonLossyASCII: {
// Small state machine over the input bytes: plain 7-bit characters, "\\" for a backslash,
// "\u"/"\U" followed by four hex digits, or a backslash followed by three digits read as octal.
// The output is always Unicode and never longer than the input (len UniChars are reserved).
290 UTF16Char currentValue = 0;
291 uint8_t character;
292 int8_t mode = __NSNonLossyASCIIMode;
293
294 buffer->isASCII = false;
// NOTE(review): this expression is also true when the caller supplied buffer->chars.unicode
// (the same pattern recurs in the UTF-8 and default cases below) -- confirm that is intended
295 buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
296 buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
297 buffer->numChars = 0;
298
299 while (chars < end) {
300 character = (*chars++);
301
302 switch (mode) {
303 case __NSNonLossyASCIIMode:
304 if (character == '\\') {
305 mode = __NSNonLossyBackslashMode;
306 } else if (character < 0x80) {
307 currentValue = character;
308 } else {
309 mode = __NSNonLossyErrorMode;
310 }
311 break;
312
313 case __NSNonLossyBackslashMode:
314 if ((character == 'U') || (character == 'u')) {
315 mode = __NSNonLossyHexInitialMode;
316 currentValue = 0;
317 } else if ((character >= '0') && (character <= '9')) {
318 mode = __NSNonLossyOctalInitialMode;
319 currentValue = character - '0';
320 } else if (character == '\\') {
321 mode = __NSNonLossyASCIIMode;
322 currentValue = character;
323 } else {
324 mode = __NSNonLossyErrorMode;
325 }
326 break;
327
328 default:
// Inside an escape: modes below HexFinal are hex digit positions, the rest are octal positions.
// Each accepted digit advances the mode; reaching the Final value completes the escape.
329 if (mode < __NSNonLossyHexFinalMode) {
330 if ((character >= '0') && (character <= '9')) {
331 currentValue = (currentValue << 4) | (character - '0');
332 if (++mode == __NSNonLossyHexFinalMode) mode = __NSNonLossyASCIIMode;
333 } else {
// Fold lower-case letters to upper-case before the A..F range check
334 if (character >= 'a') character -= ('a' - 'A');
335 if ((character >= 'A') && (character <= 'F')) {
336 currentValue = (currentValue << 4) | ((character - 'A') + 10);
337 if (++mode == __NSNonLossyHexFinalMode) mode = __NSNonLossyASCIIMode;
338 } else {
339 mode = __NSNonLossyErrorMode;
340 }
341 }
342 } else {
343 if ((character >= '0') && (character <= '9')) {
344 currentValue = (currentValue << 3) | (character - '0');
345 if (++mode == __NSNonLossyOctalFinalMode) mode = __NSNonLossyASCIIMode;
346 } else {
347 mode = __NSNonLossyErrorMode;
348 }
349 }
350 break;
351 }
352
// Back in ASCII mode means a complete character (plain or escaped) is available
353 if (mode == __NSNonLossyASCIIMode) {
354 buffer->chars.unicode[buffer->numChars++] = currentValue;
355 } else if (mode == __NSNonLossyErrorMode) {
// NOTE(review): unlike the UTF-8 failure path below, storage allocated above is not released
// and the buffer fields are not reset here -- confirm what callers expect on a false return
356 return false;
357 }
358 }
// Input that ends in the middle of an escape is an error
359 return (mode == __NSNonLossyASCIIMode);
360 }
361
362 case kCFStringEncodingUTF8:
363 if ((len >= 3) && (chars[0] == 0xef) && (chars[1] == 0xbb) && (chars[2] == 0xbf)) { // If UTF8 BOM, skip
364 chars += 3;
365 len -= 3;
366 if (0 == len) return true;
367 }
// Pure 7-bit input can be copied as-is when ASCII output is allowed
368 if (buffer->isASCII) {
369 for (idx = 0; idx < len; idx++) {
370 if (128 <= chars[idx]) {
371 buffer->isASCII = false;
372 break;
373 }
374 }
375 }
376 if (buffer->isASCII) {
377 buffer->numChars = len;
378 buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
379 buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
380 memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
381 } else {
382 UInt32 numDone;
// The UTF-8 toUnicode proc is looked up once and cached in this function-local static
383 static CFStringEncodingToUnicodeProc __CFFromUTF8 = NULL;
384
385 if (!__CFFromUTF8) {
386 const CFStringEncodingConverter *converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
387 __CFFromUTF8 = (CFStringEncodingToUnicodeProc)converter->toUnicode;
388 }
389
// len UniChars are reserved for the output
390 buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
391 buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
392 buffer->numChars = 0;
393 while (chars < end) {
394 numDone = 0;
395 chars += __CFFromUTF8(converterFlags, chars, end - chars, &(buffer->chars.unicode[buffer->numChars]), len - buffer->numChars, &numDone);
396
// No output produced for the remaining input: give up, release the storage and reset the buffer
397 if (0 == numDone) {
398 if (buffer->shouldFreeChars) CFAllocatorDeallocate(buffer->allocator, buffer->chars.unicode);
399 buffer->isASCII = !alwaysUnicode;
400 buffer->shouldFreeChars = false;
401 buffer->chars.ascii = NULL;
402 buffer->numChars = 0;
403 return false;
404 }
405 buffer->numChars += numDone;
406 }
407 }
408 break;
409
410 default:
// Any other encoding goes through the converter registered for it
411 if (CFStringEncodingIsValidEncoding(encoding)) {
412 const CFStringEncodingConverter *converter = CFStringEncodingGetConverter(encoding);
413 Boolean isASCIISuperset = __CFStringEncodingIsSupersetOfASCII(encoding);
414
415 if (!converter) return false;
416
417 if (!isASCIISuperset) buffer->isASCII = false;
418
419 if (buffer->isASCII) {
420 for (idx = 0; idx < len; idx++) {
421 if (128 <= chars[idx]) {
422 buffer->isASCII = false;
423 break;
424 }
425 }
426 }
427
428 if (converter->encodingClass == kCFStringEncodingConverterCheapEightBit) {
// One byte per character: the output length always equals len
429 if (buffer->isASCII) {
430 buffer->numChars = len;
431 buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
432 buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
433 memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
434 } else {
435 buffer->shouldFreeChars = !buffer->chars.unicode && (len <= MAX_LOCAL_UNICHARS) ? false : true;
436 buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (len <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(UniChar), 0));
437 buffer->numChars = len;
// ASCII and ISO Latin 1 bytes are widened directly; other encodings consult the converter
// only for bytes that are not plain ASCII
438 if (kCFStringEncodingASCII == encoding || kCFStringEncodingISOLatin1 == encoding) {
439 for (idx = 0; idx < len; idx++) buffer->chars.unicode[idx] = (UniChar)chars[idx];
440 } else {
441 for (idx = 0; idx < len; idx++)
442 if (chars[idx] < 0x80 && isASCIISuperset)
443 buffer->chars.unicode[idx] = (UniChar)chars[idx];
444 else if (!((CFStringEncodingCheapEightBitToUnicodeProc)converter->toUnicode)(0, chars[idx], buffer->chars.unicode + idx))
// NOTE(review): as in the NonLossyASCII case, this failure return does not release or reset the buffer
445 return false;
446 }
447 }
448 } else {
449 if (buffer->isASCII) {
450 buffer->numChars = len;
451 buffer->shouldFreeChars = !buffer->chars.ascii && (len <= MAX_LOCAL_CHARS) ? false : true;
452 buffer->chars.ascii = (buffer->chars.ascii ? buffer->chars.ascii : (len <= MAX_LOCAL_CHARS) ? (uint8_t *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, len * sizeof(uint8_t), 0));
453 memmove(buffer->chars.ascii, chars, len * sizeof(uint8_t));
454 } else {
// The output buffer is sized from the length reported by CFStringEncodingCharLengthForBytes
455 UInt32 guessedLength = CFStringEncodingCharLengthForBytes(encoding, 0, bytes, len);
// Computed once and cached: lossy conversion is allowed only for executables not linked on or after Panther
456 static UInt32 lossyFlag = (UInt32)-1;
457
458 buffer->shouldFreeChars = !buffer->chars.unicode && (guessedLength <= MAX_LOCAL_UNICHARS) ? false : true;
459 buffer->chars.unicode = (buffer->chars.unicode ? buffer->chars.unicode : (guessedLength <= MAX_LOCAL_UNICHARS) ? (UniChar *)buffer->localBuffer : CFAllocatorAllocate(buffer->allocator, guessedLength * sizeof(UniChar), 0));
460
461 if (lossyFlag == (UInt32)-1) lossyFlag = (_CFExecutableLinkedOnOrAfter(CFSystemVersionPanther) ? 0 : kCFStringEncodingAllowLossyConversion);
462
// A non-zero result from the converter is treated as failure: release the storage and reset the buffer
463 if (CFStringEncodingBytesToUnicode(encoding, lossyFlag|__CFGetASCIICompatibleFlag(), bytes, len, NULL, buffer->chars.unicode, (guessedLength > MAX_LOCAL_UNICHARS ? guessedLength : MAX_LOCAL_UNICHARS), &(buffer->numChars))) {
464 if (buffer->shouldFreeChars) CFAllocatorDeallocate(buffer->allocator, buffer->chars.unicode);
465 buffer->isASCII = !alwaysUnicode;
466 buffer->shouldFreeChars = false;
467 buffer->chars.ascii = NULL;
468 buffer->numChars = 0;
469 return false;
470 }
471 }
472 }
473 } else {
474 return false;
475 }
476 }
477 }
478
479 return true;
480 }
481
482
483 /* Create a byte stream from a CFString backing. Can convert a string piece at a time
484 into a fixed size buffer. Returns number of characters converted.
485 Characters that cannot be converted to the specified encoding are represented
486 with the char specified by lossByte; if 0, then lossy conversion is not allowed
487 and conversion stops, returning partial results.
488 Pass buffer==NULL if you don't care about the converted string (but just the convertability,
489 or number of bytes required, indicated by usedBufLen).
490 Does not zero-terminate. If you want to create Pascal or C string, allow one extra byte at start or end.
491
492 Note: This function is intended to work through CFString functions, so it should work
493 with NSStrings as well as CFStrings.
494 */
/* Parameters:
   string, rangeLoc, rangeLen  the characters to convert.
   generatingExternalFile      when true, a BOM is requested: via kCFStringEncodingPrependBOM for UTF-8 and
                               converter-based encodings, and written directly for the generic UTF-16 and
                               UTF-32 encodings.
   encoding                    target encoding.
   lossByte                    replacement for unconvertible characters; 0 disallows lossy conversion.
   buffer, max                 destination and its capacity in bytes; buffer may be NULL to only measure.
   usedBufLen                  optional; receives the number of bytes written (or required).
   Returns the number of characters of the range that were processed.
*/
495 CFIndex __CFStringEncodeByteStream(CFStringRef string, CFIndex rangeLoc, CFIndex rangeLen, Boolean generatingExternalFile, CFStringEncoding encoding, char lossByte, uint8_t *buffer, CFIndex max, CFIndex *usedBufLen) {
496 CFIndex totalBytesWritten = 0; /* Number of written bytes */
497 CFIndex numCharsProcessed = 0; /* Number of processed chars */
498 const UniChar *unichars;
499
// UTF-8 fast path: only taken when the string exposes its UniChar storage directly
500 if (encoding == kCFStringEncodingUTF8 && (unichars = CFStringGetCharactersPtr(string))) {
// The UTF-8 toBytes proc is looked up once and cached in this function-local static
501 static CFStringEncodingToBytesProc __CFToUTF8 = NULL;
502
503 if (!__CFToUTF8) {
504 const CFStringEncodingConverter *utf8Converter = CFStringEncodingGetConverter(kCFStringEncodingUTF8);
505 __CFToUTF8 = (CFStringEncodingToBytesProc)utf8Converter->toBytes;
506 }
507 numCharsProcessed = __CFToUTF8((generatingExternalFile ? kCFStringEncodingPrependBOM : 0), unichars + rangeLoc, rangeLen, buffer, (buffer ? max : 0), &totalBytesWritten);
508
509 } else if (encoding == kCFStringEncodingNonLossyASCII) {
// Escaped 7-bit output: printable ASCII (except backslash) plus \n, \r and \t are emitted as-is;
// a backslash becomes "\\"; other values below 256 become \ooo (three octal digits);
// everything else becomes \uhhhh (four lower-case hex digits).
510 const char *hex = "0123456789abcdef";
511 UniChar ch;
512 CFStringInlineBuffer buf;
513 CFStringInitInlineBuffer(string, &buf, CFRangeMake(rangeLoc, rangeLen));
514 while (numCharsProcessed < rangeLen) {
515 CFIndex reqLength; /* Required number of chars to encode this UniChar */
516 CFIndex cnt;
517 char tmp[6];
518 ch = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed);
519 if ((ch >= ' ' && ch <= '~' && ch != '\\') || (ch == '\n' || ch == '\r' || ch == '\t')) {
520 reqLength = 1;
521 tmp[0] = ch;
522 } else {
523 if (ch == '\\') {
524 tmp[1] = '\\';
525 reqLength = 2;
526 } else if (ch < 256) { /* \nnn; note that this is not NEXTSTEP encoding but a (small) UniChar */
527 tmp[1] = '0' + (ch >> 6);
528 tmp[2] = '0' + ((ch >> 3) & 7);
529 tmp[3] = '0' + (ch & 7);
530 reqLength = 4;
531 } else { /* \Unnnn */
532 tmp[1] = 'u'; // Changed to small+u in order to be aligned with Java
533 tmp[2] = hex[(ch >> 12) & 0x0f];
534 tmp[3] = hex[(ch >> 8) & 0x0f];
535 tmp[4] = hex[(ch >> 4) & 0x0f];
536 tmp[5] = hex[ch & 0x0f];
537 reqLength = 6;
538 }
539 tmp[0] = '\\';
540 }
// An escape is never split: stop before a character whose full escape does not fit
541 if (buffer) {
542 if (totalBytesWritten + reqLength > max) break; /* Doesn't fit..
543 .*/
544 for (cnt = 0; cnt < reqLength; cnt++) {
545 buffer[totalBytesWritten + cnt] = tmp[cnt];
546 }
547 }
548 totalBytesWritten += reqLength;
549 numCharsProcessed++;
550 }
551 } else if ((encoding == kCFStringEncodingUTF16) || (encoding == kCFStringEncodingUTF16BE) || (encoding == kCFStringEncodingUTF16LE)) {
// UTF-16: characters are copied out directly; a BOM is added only for the generic encoding
552 CFIndex extraForBOM = (generatingExternalFile && (encoding == kCFStringEncodingUTF16) ? sizeof(UniChar) : 0);
553 numCharsProcessed = rangeLen;
// Clamp to the number of whole UniChars that fit after the BOM
554 if (buffer && (numCharsProcessed * (CFIndex)sizeof(UniChar) + extraForBOM > max)) {
555 numCharsProcessed = (max > extraForBOM) ? ((max - extraForBOM) / sizeof(UniChar)) : 0;
556 }
557 totalBytesWritten = (numCharsProcessed * sizeof(UniChar)) + extraForBOM;
558 if (buffer) {
559 if (extraForBOM) { /* Generate BOM */
560 #if defined(__BIG_ENDIAN__)
561 *buffer++ = 0xfe; *buffer++ = 0xff;
562 #else
563 *buffer++ = 0xff; *buffer++ = 0xfe;
564 #endif
565 }
566 CFStringGetCharacters(string, CFRangeMake(rangeLoc, numCharsProcessed), (UniChar *)buffer);
// Swap in place only when the requested byte order differs from the build's byte order
567 if (
568 #if defined(__BIG_ENDIAN__)
569 kCFStringEncodingUTF16LE
570 #else
571 kCFStringEncodingUTF16BE
572 #endif
573 == encoding) { // Need to swap
574 UTF16Char *characters = (UTF16Char *)buffer;
575 const UTF16Char *limit = characters + numCharsProcessed;
576
577 while (characters < limit) {
578 *characters = CFSwapInt16(*characters);
579 ++characters;
580 }
581 }
582 }
583 } else if ((encoding == kCFStringEncodingUTF32) || (encoding == kCFStringEncodingUTF32BE) || (encoding == kCFStringEncodingUTF32LE)) {
// UTF-32: surrogate pairs are combined into one value; unpaired surrogates are replaced by lossByte or stop the conversion
584 UTF32Char character;
585 CFStringInlineBuffer buf;
586 UTF32Char *characters = (UTF32Char *)buffer;
587
588 #if defined(__BIG_ENDIAN__)
589 bool swap = (encoding == kCFStringEncodingUTF32LE ? true : false);
590 #else
591 bool swap = (encoding == kCFStringEncodingUTF32BE ? true : false);
592 #endif
593
594 if (generatingExternalFile && (encoding == kCFStringEncodingUTF32)) {
595 totalBytesWritten += sizeof(UTF32Char);
596 if (characters) {
597 if (totalBytesWritten > max) { // insufficient buffer
598 totalBytesWritten = 0;
599 } else {
// NOTE(review): on little-endian builds the value stored here is 0xFFFE0000 while the characters below
// are stored unswapped for kCFStringEncodingUTF32 -- looks like the BOM and data byte orders disagree; confirm
600 #if defined(__BIG_ENDIAN__)
601 *(characters++) = 0x0000FEFF;
602 #else
603 *(characters++) = 0xFFFE0000;
604 #endif
605 }
606 }
607 }
608
609 CFStringInitInlineBuffer(string, &buf, CFRangeMake(rangeLoc, rangeLen));
610 while (numCharsProcessed < rangeLen) {
611 character = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed);
612
613 if (CFUniCharIsSurrogateHighCharacter(character)) {
614 UTF16Char otherCharacter;
615
616 if (((numCharsProcessed + 1) < rangeLen) && CFUniCharIsSurrogateLowCharacter((otherCharacter = CFStringGetCharacterFromInlineBuffer(&buf, numCharsProcessed + 1)))) {
617 character = CFUniCharGetLongCharacterForSurrogatePair(character, otherCharacter);
618 } else if (lossByte) {
619 character = lossByte;
620 } else {
621 break;
622 }
623 } else if (CFUniCharIsSurrogateLowCharacter(character)) {
624 if (lossByte) {
625 character = lossByte;
626 } else {
627 break;
628 }
629 }
630
631 totalBytesWritten += sizeof(UTF32Char);
632
633 if (characters) {
// Undo the tentative count when this character does not fit
634 if (totalBytesWritten > max) {
635 totalBytesWritten -= sizeof(UTF32Char);
636 break;
637 }
638 *(characters++) = (swap ? CFSwapInt32(character) : character);
639 }
640
// A combined surrogate pair consumed two source characters
641 numCharsProcessed += (character > 0xFFFF ? 2 : 1);
642 }
643 } else {
// Everything else goes through the encoding converters, with shortcuts for 8-bit backed CFStrings
644 CFIndex numChars;
645 UInt32 flags;
646 const unsigned char *cString = NULL;
647 BOOL isASCIISuperset = __CFStringEncodingIsSupersetOfASCII(encoding);
648
649 if (!CF_IS_OBJC(CFStringGetTypeID(), string) && isASCIISuperset) { // Checking for NSString to avoid infinite recursion
650 const unsigned char *ptr;
// C-string backed: copy bytes straight through when the backing encoding is the target encoding,
// otherwise copy only the leading run of bytes below 0x80 and convert the remainder further down
651 if ((cString = CFStringGetCStringPtr(string, __CFStringGetEightBitStringEncoding()))) {
652 ptr = (cString += rangeLoc);
653 if (__CFStringGetEightBitStringEncoding() == encoding) {
654 numCharsProcessed = (rangeLen < max || buffer == NULL ? rangeLen : max);
655 if (buffer) memmove(buffer, cString, numCharsProcessed);
656 if (usedBufLen) *usedBufLen = numCharsProcessed;
657 return numCharsProcessed;
658 }
// NOTE(review): *ptr is read before rangeLen is tested, so the byte just past the range is inspected
659 while (*ptr < 0x80 && rangeLen > 0) {
660 ++ptr;
661 --rangeLen;
662 }
663 numCharsProcessed = ptr - cString;
664 if (buffer) {
665 numCharsProcessed = (numCharsProcessed < max ? numCharsProcessed : max);
666 memmove(buffer, cString, numCharsProcessed);
667 buffer += numCharsProcessed;
668 max -= numCharsProcessed;
669 }
// Done when the whole range was plain ASCII or the output buffer is full
670 if (!rangeLen || (buffer && (max == 0))) {
671 if (usedBufLen) *usedBufLen = numCharsProcessed;
672 return numCharsProcessed;
673 }
674 rangeLoc += numCharsProcessed;
675 totalBytesWritten += numCharsProcessed;
676 }
// Pascal-string backed: same logic as above, with the +1 skipping the leading length byte
677 if (!cString && (cString = CFStringGetPascalStringPtr(string, __CFStringGetEightBitStringEncoding()))) {
678 ptr = (cString += (rangeLoc + 1));
679 if (__CFStringGetEightBitStringEncoding() == encoding) {
680 numCharsProcessed = (rangeLen < max || buffer == NULL ? rangeLen : max);
681 if (buffer) memmove(buffer, cString, numCharsProcessed);
682 if (usedBufLen) *usedBufLen = numCharsProcessed;
683 return numCharsProcessed;
684 }
685 while (*ptr < 0x80 && rangeLen > 0) {
686 ++ptr;
687 --rangeLen;
688 }
689 numCharsProcessed = ptr - cString;
690 if (buffer) {
691 numCharsProcessed = (numCharsProcessed < max ? numCharsProcessed : max);
692 memmove(buffer, cString, numCharsProcessed);
693 buffer += numCharsProcessed;
694 max -= numCharsProcessed;
695 }
696 if (!rangeLen || (buffer && (max == 0))) {
697 if (usedBufLen) *usedBufLen = numCharsProcessed;
698 return numCharsProcessed;
699 }
700 rangeLoc += numCharsProcessed;
701 totalBytesWritten += numCharsProcessed;
702 }
703 }
704
// From here on max == 0 doubles as "no output buffer"
705 if (!buffer) max = 0;
706
707 // Special case for Foundation. When lossByte == 0xFF && encoding kCFStringEncodingASCII, we do the default ASCII fallback conversion
708 // Aki 11/24/04 __CFGetASCIICompatibleFlag() is called only for non-ASCII superset encodings. Otherwise, it could lead to a deadlock (see 3890536).
709 flags = (lossByte ? ((unsigned char)lossByte == 0xFF && encoding == kCFStringEncodingASCII ? kCFStringEncodingAllowLossyConversion : CFStringEncodingLossyByteToMask(lossByte)) : 0) | (generatingExternalFile ? kCFStringEncodingPrependBOM : 0) | (isASCIISuperset ? 0 : __CFGetASCIICompatibleFlag());
710
711 if (!cString && (cString = (const char*)CFStringGetCharactersPtr(string))) { // Must be Unicode string
712 if (CFStringEncodingIsValidEncoding(encoding)) { // Converter available in CF
713 CFStringEncodingUnicodeToBytes(encoding, flags, (const UniChar*)cString + rangeLoc, rangeLen, &numCharsProcessed, buffer, max, &totalBytesWritten);
714 } else {
715 return 0;
716 }
717 } else {
// No direct UniChar access: pull the characters out in chunks of at most kCFCharConversionBufferLength
// and convert chunk by chunk, remembering the previous chunk's counts so it can be redone if needed
718 UniChar charBuf[kCFCharConversionBufferLength];
719 UInt32 currentLength;
720 UInt32 usedLen;
721 uint32_t lastUsedLen = 0, lastNumChars = 0;
722 uint32_t result;
723 Boolean isCFBuiltin = CFStringEncodingIsValidEncoding(encoding);
724 #define MAX_DECOMP_LEN (6)
725
726 while (rangeLen > 0) {
727 currentLength = (rangeLen > kCFCharConversionBufferLength ? kCFCharConversionBufferLength : rangeLen);
728 CFStringGetCharacters(string, CFRangeMake(rangeLoc, currentLength), charBuf);
729
730 // could be in the middle of surrogate pair; back up.
731 if ((rangeLen > kCFCharConversionBufferLength) && CFUniCharIsSurrogateHighCharacter(charBuf[kCFCharConversionBufferLength - 1])) --currentLength;
732
733 if (isCFBuiltin) { // Converter available in CF
734 if ((result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, currentLength, &numChars, buffer, max, &usedLen)) != kCFStringEncodingConversionSuccess) {
// An invalid-input result may only mean that a chunk boundary split a composed character sequence;
// try again with the boundary moved before treating it as a real failure
735 if (kCFStringEncodingInvalidInputStream == result) {
736 CFRange composedRange;
737 // Check the tail
738 if ((rangeLen > kCFCharConversionBufferLength) && ((currentLength - numChars) < MAX_DECOMP_LEN)) {
739 composedRange = CFStringGetRangeOfComposedCharactersAtIndex(string, rangeLoc + currentLength);
740
741 if ((composedRange.length <= MAX_DECOMP_LEN) && (composedRange.location < (rangeLoc + numChars))) {
742 result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, composedRange.location - rangeLoc, &numChars, buffer, max, &usedLen);
743 }
744 }
745
746 // Check the head
747 if ((kCFStringEncodingConversionSuccess != result) && (lastNumChars > 0) && (numChars < MAX_DECOMP_LEN)) {
748 composedRange = CFStringGetRangeOfComposedCharactersAtIndex(string, rangeLoc);
749
750 if ((composedRange.length <= MAX_DECOMP_LEN) && (composedRange.location < rangeLoc)) {
751 // Try if the composed range can be converted
752 CFStringGetCharacters(string, composedRange, charBuf);
753
754 if (CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, composedRange.length, &numChars, NULL, 0, &usedLen) == kCFStringEncodingConversionSuccess) { // OK let's try the last run
755 CFIndex lastRangeLoc = rangeLoc - lastNumChars;
756
// Redo the previous chunk, this time ending just before the composed sequence
757 currentLength = composedRange.location - lastRangeLoc;
758 CFStringGetCharacters(string, CFRangeMake(lastRangeLoc, currentLength), charBuf);
759
760 if ((result = CFStringEncodingUnicodeToBytes(encoding, flags, charBuf, currentLength, &numChars, (max ? buffer - lastUsedLen : NULL), (max ? max + lastUsedLen : 0), &usedLen)) == kCFStringEncodingConversionSuccess) { // OK let's try the last run
761 // Looks good. back up
762 totalBytesWritten -= lastUsedLen;
763 numCharsProcessed -= lastNumChars;
764
765 rangeLoc = lastRangeLoc;
766 rangeLen += lastNumChars;
767
768 if (max) {
769 buffer -= lastUsedLen;
770 max += lastUsedLen;
771 }
772 }
773 }
774 }
775 }
776 }
777
// Still failing after the retries: account for the partial progress and stop
778 if (kCFStringEncodingConversionSuccess != result) { // really failed
779 totalBytesWritten += usedLen;
780 numCharsProcessed += numChars;
781 break;
782 }
783 }
784 } else {
785 return 0;
786 }
787
// Commit this chunk and advance to the next one
788 totalBytesWritten += usedLen;
789 numCharsProcessed += numChars;
790
791 rangeLoc += numChars;
792 rangeLen -= numChars;
793 if (max) {
794 buffer += usedLen;
795 max -= usedLen;
796 if (max <= 0) break;
797 }
798 lastUsedLen = usedLen; lastNumChars = numChars;
// The BOM is requested for the first chunk only
799 flags &= ~kCFStringEncodingPrependBOM;
800 }
801 }
802 }
803 if (usedBufLen) *usedBufLen = totalBytesWritten;
804 return numCharsProcessed;
805 }
806
807 CFStringRef CFStringCreateWithFileSystemRepresentation(CFAllocatorRef alloc, const char *buffer) {
808 return CFStringCreateWithCString(alloc, buffer, CFStringFileSystemEncoding());
809 }
810
811 CFIndex CFStringGetMaximumSizeOfFileSystemRepresentation(CFStringRef string) {
812 CFIndex len = CFStringGetLength(string);
813 CFStringEncoding enc = CFStringGetFastestEncoding(string);
814 switch (enc) {
815 case kCFStringEncodingASCII:
816 case kCFStringEncodingMacRoman:
817 return len * 3 + 1;
818 default:
819 return len * 9 + 1;
820 }
821 }
822
823 Boolean CFStringGetFileSystemRepresentation(CFStringRef string, char *buffer, CFIndex maxBufLen) {
824 #if defined(__MACH__)
825 #define MAX_STACK_BUFFER_LEN (255)
826 const UTF16Char *characters = CFStringGetCharactersPtr(string);
827 uint32_t usedBufLen;
828
829 if (NULL == characters) {
830 CFIndex length = CFStringGetLength(string);
831
832 if (length > MAX_STACK_BUFFER_LEN) {
833 UTF16Char charactersBuffer[MAX_STACK_BUFFER_LEN];
834 CFRange range = CFRangeMake(0, MAX_STACK_BUFFER_LEN);
835 uint32_t localUsedBufLen;
836
837 usedBufLen = 0;
838
839 while (length > 0) {
840 CFStringGetCharacters(string, range, charactersBuffer);
841 if (CFUniCharIsSurrogateHighCharacter(charactersBuffer[range.length - 1])) --range.length; // Backup for a high surrogate
842
843 if (!CFUniCharDecompose(charactersBuffer, range.length, NULL, (void *)buffer, maxBufLen - usedBufLen, &localUsedBufLen, true, kCFUniCharUTF8Format, true)) return false;
844 buffer += localUsedBufLen;
845 usedBufLen += localUsedBufLen;
846
847 length -= range.length;
848 range.location += range.length;
849 range.length = (length < MAX_STACK_BUFFER_LEN ? length : MAX_STACK_BUFFER_LEN);
850 }
851 } else {
852 UTF16Char charactersBuffer[MAX_STACK_BUFFER_LEN];
853
854 CFStringGetCharacters(string, CFRangeMake(0, length), charactersBuffer);
855 if (!CFUniCharDecompose(charactersBuffer, length, NULL, (void *)buffer, maxBufLen, &usedBufLen, true, kCFUniCharUTF8Format, true)) return false;
856 buffer += usedBufLen;
857 }
858 } else {
859 if (!CFUniCharDecompose(characters, CFStringGetLength(string), NULL, (void *)buffer, maxBufLen, &usedBufLen, true, kCFUniCharUTF8Format, true)) return false;
860 buffer += usedBufLen;
861 }
862
863 if (usedBufLen < (uint32_t)maxBufLen) { // Since the filename has its own limit, this is ok for now
864 *buffer = '\0';
865 return true;
866 } else {
867 return false;
868 }
869 #else __MACH__
870 return CFStringGetCString(string, buffer, maxBufLen, CFStringFileSystemEncoding());
871 #endif __MACH__
872 }
873
874 Boolean _CFStringGetFileSystemRepresentation(CFStringRef string, uint8_t *buffer, CFIndex maxBufLen) {
875 return CFStringGetFileSystemRepresentation(string, buffer, maxBufLen);
876 }
877