2 * Copyright (c) 2008 Apple Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
23 /* CFStringUtilities.c
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Aki Inoue
28 #include "CFInternal.h"
29 #include "CFStringEncodingConverterExt.h"
30 #include "CFUniChar.h"
31 #include <CoreFoundation/CFStringEncodingExt.h>
32 #include <CoreFoundation/CFPreferences.h>
34 #if (DEPLOYMENT_TARGET_MACOSX) || DEPLOYMENT_TARGET_LINUX
36 #elif defined(__WIN32__)
/*
 * Reports whether theEncoding can be used for string conversion.
 *
 * The case labels enumerate the encodings the source marks as built-in; the
 * only return visible here defers to CFStringEncodingIsValidEncoding().
 * NOTE(review): the lines between the last case label and that return are not
 * visible in this excerpt - presumably the built-in cases return true and the
 * converter query is the default branch; confirm against the full file.
 */
42 Boolean
CFStringIsEncodingAvailable(CFStringEncoding theEncoding
) {
43 switch (theEncoding
) {
44 case kCFStringEncodingASCII
: // Built-in encodings
45 case kCFStringEncodingMacRoman
:
46 case kCFStringEncodingUTF8
:
47 case kCFStringEncodingNonLossyASCII
:
48 case kCFStringEncodingWindowsLatin1
:
49 case kCFStringEncodingNextStepLatin
:
50 case kCFStringEncodingUTF16
:
51 case kCFStringEncodingUTF16BE
:
52 case kCFStringEncodingUTF16LE
:
53 case kCFStringEncodingUTF32
:
54 case kCFStringEncodingUTF32BE
:
55 case kCFStringEncodingUTF32LE
:
// Ask the encoding-converter layer about the encoding.
59 return CFStringEncodingIsValidEncoding(theEncoding
);
/*
 * Returns the list produced by CFStringEncodingListOfAvailableEncodings(),
 * cast to a pointer to const CFStringEncoding. The IANA name lookup in this
 * file walks the result until it reads kCFStringEncodingInvalidId.
 */
63 const CFStringEncoding
* CFStringGetListOfAvailableEncodings() {
64 return (const CFStringEncoding
*)CFStringEncodingListOfAvailableEncodings();
/*
 * Returns a display name for theEncoding.
 *
 * A function-static dictionary keyed by the encoding value caches names. The
 * Unicode variants and non-lossy ASCII get fixed names; the other visible path
 * builds a CFString from the C string returned by CFStringEncodingName() and
 * adds it to the cache.
 * NOTE(review): the guards around these statements, the default label and any
 * release of the created string are not visible in this excerpt.
 * NOTE(review): no locking around the static table is visible here - confirm
 * the threading expectations of callers.
 */
67 CFStringRef
CFStringGetNameOfEncoding(CFStringEncoding theEncoding
) {
68 static CFMutableDictionaryRef mappingTable
= NULL
;
// Consult the cache first; theName stays NULL when the table does not exist yet.
69 CFStringRef theName
= mappingTable
? (CFStringRef
)CFDictionaryGetValue(mappingTable
, (const void*)(uintptr_t)theEncoding
) : NULL
;
72 switch (theEncoding
) {
73 case kCFStringEncodingUTF8
: theName
= CFSTR("Unicode (UTF-8)"); break;
74 case kCFStringEncodingUTF16
: theName
= CFSTR("Unicode (UTF-16)"); break;
75 case kCFStringEncodingUTF16BE
: theName
= CFSTR("Unicode (UTF-16BE)"); break;
76 case kCFStringEncodingUTF16LE
: theName
= CFSTR("Unicode (UTF-16LE)"); break;
77 case kCFStringEncodingUTF32
: theName
= CFSTR("Unicode (UTF-32)"); break;
78 case kCFStringEncodingUTF32BE
: theName
= CFSTR("Unicode (UTF-32BE)"); break;
79 case kCFStringEncodingUTF32LE
: theName
= CFSTR("Unicode (UTF-32LE)"); break;
80 case kCFStringEncodingNonLossyASCII
: theName
= CFSTR("Non-lossy ASCII"); break;
// Name supplied by the encoding-converter layer, wrapped as an ASCII CFString.
83 const char *encodingName
= CFStringEncodingName(theEncoding
);
86 theName
= CFStringCreateWithCString(kCFAllocatorSystemDefault
, encodingName
, kCFStringEncodingASCII
);
// Keys are raw encoding numbers stored as pointers, hence NULL key callbacks;
// values use the CFType value callbacks.
93 if (!mappingTable
) mappingTable
= CFDictionaryCreateMutable(kCFAllocatorSystemDefault
, 0, (const CFDictionaryKeyCallBacks
*)NULL
, &kCFTypeDictionaryValueCallBacks
);
95 CFDictionaryAddValue(mappingTable
, (const void*)(uintptr_t)theEncoding
, (const void*)theName
);
/*
 * Converts an IANA charset name to a CFStringEncoding.
 *
 * "utf-8" and "iso-8859-1" are matched directly with a case-insensitive
 * compare. Any other name is lowercased into a mutable copy and looked up in a
 * function-static dictionary. That dictionary is filled on first use from
 * CFStringEncodingCanonicalCharsetNames() for each entry of
 * CFStringGetListOfAvailableEncodings(), plus the fixed Unicode names below.
 * result starts as kCFStringEncodingInvalidId and is overwritten only when the
 * lowercased name is present in the table.
 * NOTE(review): the loop over each encoding's name list, the advance of the
 * encodings pointer, the release of each created name and the final return are
 * not visible in this excerpt.
 */
103 CFStringEncoding
CFStringConvertIANACharSetNameToEncoding(CFStringRef charsetName
) {
104 static CFMutableDictionaryRef mappingTable
= NULL
;
105 CFStringEncoding result
= kCFStringEncodingInvalidId
;
106 CFMutableStringRef lowerCharsetName
;
108 /* Check for common encodings first */
109 if (CFStringCompare(charsetName
, CFSTR("utf-8"), kCFCompareCaseInsensitive
) == kCFCompareEqualTo
) {
110 return kCFStringEncodingUTF8
;
111 } else if (CFStringCompare(charsetName
, CFSTR("iso-8859-1"), kCFCompareCaseInsensitive
) == kCFCompareEqualTo
) {
112 return kCFStringEncodingISOLatin1
;
115 /* Create lowercase copy */
116 lowerCharsetName
= CFStringCreateMutableCopy(kCFAllocatorSystemDefault
, 0, charsetName
);
117 CFStringLowercase(lowerCharsetName
, NULL
);
119 if (mappingTable
== NULL
) {
// Keys are CFStrings (CFType key callbacks); values are raw encoding numbers
// stored as pointers, hence NULL value callbacks.
120 CFMutableDictionaryRef table
= CFDictionaryCreateMutable(kCFAllocatorSystemDefault
, 0, &kCFTypeDictionaryKeyCallBacks
, (const CFDictionaryValueCallBacks
*)NULL
);
121 const CFStringEncoding
*encodings
= CFStringGetListOfAvailableEncodings();
// The encoding list is terminated by kCFStringEncodingInvalidId.
123 while (*encodings
!= kCFStringEncodingInvalidId
) {
124 const char **nameList
= CFStringEncodingCanonicalCharsetNames(*encodings
);
128 CFStringRef name
= CFStringCreateWithCString(kCFAllocatorSystemDefault
, *nameList
++, kCFStringEncodingASCII
);
131 CFDictionaryAddValue(table
, (const void*)name
, (const void*)(uintptr_t)*encodings
);
138 // Adding Unicode names
139 CFDictionaryAddValue(table
, (const void*)CFSTR("unicode-1-1"), (const void*)kCFStringEncodingUTF16
);
140 CFDictionaryAddValue(table
, (const void*)CFSTR("iso-10646-ucs-2"), (const void*)kCFStringEncodingUTF16
);
141 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-16"), (const void*)kCFStringEncodingUTF16
);
142 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-16be"), (const void*)kCFStringEncodingUTF16BE
);
143 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-16le"), (const void*)kCFStringEncodingUTF16LE
);
144 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-32"), (const void*)kCFStringEncodingUTF32
);
145 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-32be"), (const void*)kCFStringEncodingUTF32BE
);
146 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-32le"), (const void*)kCFStringEncodingUTF32LE
);
// Publish the table only after it has been populated.
148 mappingTable
= table
;
// Presence is tested separately from the value fetch: values are plain
// integers stored as pointers, so the fetched value alone cannot signal absence.
151 if (CFDictionaryContainsKey(mappingTable
, (const void*)lowerCharsetName
)) {
152 result
= (CFStringEncoding
)(uintptr_t)CFDictionaryGetValue(mappingTable
, (const void*)lowerCharsetName
);
// The lowercase copy was created in this function and is released here.
155 CFRelease(lowerCharsetName
);
/*
 * Returns an IANA charset name for encoding.
 *
 * A function-static dictionary keyed by the encoding value caches names. The
 * UTF-16 and UTF-32 variants get fixed names; the other visible path takes the
 * first entry of CFStringEncodingCanonicalCharsetNames(), converts it to a
 * CFString and replaces it with an uppercased mutable copy before caching.
 * NOTE(review): the switch header, any cases before UTF-16, the release calls
 * and the final return are not visible in this excerpt.
 */
160 CFStringRef
CFStringConvertEncodingToIANACharSetName(CFStringEncoding encoding
) {
161 static CFMutableDictionaryRef mappingTable
= NULL
;
// Consult the cache first; theName stays NULL when the table does not exist yet.
162 CFStringRef theName
= mappingTable
? (CFStringRef
)CFDictionaryGetValue(mappingTable
, (const void*)(uintptr_t)encoding
) : NULL
;
166 case kCFStringEncodingUTF16
: theName
= CFSTR("UTF-16"); break;
167 case kCFStringEncodingUTF16BE
: theName
= CFSTR("UTF-16BE"); break;
168 case kCFStringEncodingUTF16LE
: theName
= CFSTR("UTF-16LE"); break;
169 case kCFStringEncodingUTF32
: theName
= CFSTR("UTF-32"); break;
170 case kCFStringEncodingUTF32BE
: theName
= CFSTR("UTF-32BE"); break;
171 case kCFStringEncodingUTF32LE
: theName
= CFSTR("UTF-32LE"); break;
175 const char **nameList
= CFStringEncodingCanonicalCharsetNames(encoding
);
// Only the first canonical name is used, and only when the list is non-empty.
177 if (nameList
&& *nameList
) {
178 CFMutableStringRef upperCaseName
;
180 theName
= CFStringCreateWithCString(kCFAllocatorSystemDefault
, *nameList
, kCFStringEncodingASCII
);
182 upperCaseName
= CFStringCreateMutableCopy(kCFAllocatorSystemDefault
, 0, theName
);
183 CFStringUppercase(upperCaseName
, 0);
// From here on theName refers to the uppercased copy.
185 theName
= upperCaseName
;
// Keys are raw encoding numbers stored as pointers, hence NULL key callbacks.
193 if (!mappingTable
) mappingTable
= CFDictionaryCreateMutable(kCFAllocatorSystemDefault
, 0, (const CFDictionaryKeyCallBacks
*)NULL
, &kCFTypeDictionaryValueCallBacks
);
195 CFDictionaryAddValue(mappingTable
, (const void*)(uintptr_t)encoding
, (const void*)theName
);
/*
 * NS-prefixed string-encoding numbers used as the NSString-side values by
 * CFStringConvertEncodingToNSStringEncoding() and
 * CFStringConvertNSStringEncodingToEncoding().
 * NOTE(review): presumably these mirror Foundation's NSStringEncoding values -
 * confirm. The opening and closing lines of the enclosing declaration are not
 * visible in this excerpt.
 */
204 NSASCIIStringEncoding
= 1, /* 0..127 only */
205 NSNEXTSTEPStringEncoding
= 2,
206 NSJapaneseEUCStringEncoding
= 3,
207 NSUTF8StringEncoding
= 4,
208 NSISOLatin1StringEncoding
= 5,
209 NSSymbolStringEncoding
= 6,
210 NSNonLossyASCIIStringEncoding
= 7,
211 NSShiftJISStringEncoding
= 8,
212 NSISOLatin2StringEncoding
= 9,
213 NSUnicodeStringEncoding
= 10,
214 NSWindowsCP1251StringEncoding
= 11, /* Cyrillic; same as AdobeStandardCyrillic */
215 NSWindowsCP1252StringEncoding
= 12, /* WinLatin1 */
216 NSWindowsCP1253StringEncoding
= 13, /* Greek */
217 NSWindowsCP1254StringEncoding
= 14, /* Turkish */
218 NSWindowsCP1250StringEncoding
= 15, /* WinLatin2 */
219 NSISO2022JPStringEncoding
= 21, /* ISO 2022 Japanese encoding for e-mail */
220 NSMacOSRomanStringEncoding
= 30,
222 NSProprietaryStringEncoding
= 65536 /* Installation-specific encoding */
/*
 * Marker bit (bit 31) set on NSString-side encoding numbers that carry a raw
 * CFStringEncoding value instead of one of the NS* constants.
 * The literal is unsigned: shifting a signed 1 into the sign bit of a 32-bit
 * int is undefined behavior, and a negative mask would sign-extend when it is
 * combined with a 64-bit unsigned long.
 */
#define NSENCODING_MASK (1U << 31)
/*
 * Maps a CFStringEncoding to an NSString-side encoding number.
 *
 * The switch looks only at the low 12 bits of theEncoding. For the Unicode
 * family the full value is then compared to tell UTF-16 from UTF-8. An
 * encoding with no dedicated NS* constant is returned as
 * NSENCODING_MASK | theEncoding.
 * NOTE(review): the #endif lines, the default label and the closing braces are
 * not visible in this excerpt.
 */
227 unsigned long CFStringConvertEncodingToNSStringEncoding(CFStringEncoding theEncoding
) {
228 switch (theEncoding
& 0xFFF) {
229 case kCFStringEncodingASCII
: return NSASCIIStringEncoding
;
230 case kCFStringEncodingNextStepLatin
: return NSNEXTSTEPStringEncoding
;
231 case kCFStringEncodingISOLatin1
: return NSISOLatin1StringEncoding
;
232 case kCFStringEncodingNonLossyASCII
: return NSNonLossyASCIIStringEncoding
;
233 case kCFStringEncodingWindowsLatin1
: return NSWindowsCP1252StringEncoding
;
234 case kCFStringEncodingMacRoman
: return NSMacOSRomanStringEncoding
;
// The following mappings are compiled only for the Mac OS X deployment target.
235 #if DEPLOYMENT_TARGET_MACOSX
236 case kCFStringEncodingEUC_JP
: return NSJapaneseEUCStringEncoding
;
237 case kCFStringEncodingMacSymbol
: return NSSymbolStringEncoding
;
238 case kCFStringEncodingDOSJapanese
: return NSShiftJISStringEncoding
;
239 case kCFStringEncodingISOLatin2
: return NSISOLatin2StringEncoding
;
240 case kCFStringEncodingWindowsCyrillic
: return NSWindowsCP1251StringEncoding
;
241 case kCFStringEncodingWindowsGreek
: return NSWindowsCP1253StringEncoding
;
242 case kCFStringEncodingWindowsLatin5
: return NSWindowsCP1254StringEncoding
;
243 case kCFStringEncodingWindowsLatin2
: return NSWindowsCP1250StringEncoding
;
244 case kCFStringEncodingISO_2022_JP
: return NSISO2022JPStringEncoding
;
246 #if DEPLOYMENT_TARGET_MACOSX
// The low 12 bits only identify the Unicode family; compare the full value.
247 case kCFStringEncodingUnicode
:
248 if (theEncoding
== kCFStringEncodingUTF16
) return NSUnicodeStringEncoding
;
249 else if (theEncoding
== kCFStringEncodingUTF8
) return NSUTF8StringEncoding
;
251 /* fall-through for other encoding schemes */
// No dedicated constant: tag the raw CFStringEncoding with the marker bit.
254 return NSENCODING_MASK
| theEncoding
;
/*
 * Maps an NSString-side encoding number back to a CFStringEncoding.
 *
 * Known NS* constants are translated by the switch. Any other value that has
 * the NSENCODING_MASK bit set is returned with that bit cleared; a value
 * without it yields kCFStringEncodingInvalidId.
 * NOTE(review): the #endif line and the closing braces are not visible in this
 * excerpt.
 */
258 CFStringEncoding
CFStringConvertNSStringEncodingToEncoding(unsigned long theEncoding
) {
259 switch (theEncoding
) {
260 case NSASCIIStringEncoding
: return kCFStringEncodingASCII
;
261 case NSNEXTSTEPStringEncoding
: return kCFStringEncodingNextStepLatin
;
262 case NSUTF8StringEncoding
: return kCFStringEncodingUTF8
;
263 case NSISOLatin1StringEncoding
: return kCFStringEncodingISOLatin1
;
264 case NSNonLossyASCIIStringEncoding
: return kCFStringEncodingNonLossyASCII
;
265 case NSUnicodeStringEncoding
: return kCFStringEncodingUTF16
;
266 case NSWindowsCP1252StringEncoding
: return kCFStringEncodingWindowsLatin1
;
267 case NSMacOSRomanStringEncoding
: return kCFStringEncodingMacRoman
;
// The following mappings are compiled only for the Mac OS X deployment target.
268 #if DEPLOYMENT_TARGET_MACOSX
269 case NSSymbolStringEncoding
: return kCFStringEncodingMacSymbol
;
270 case NSJapaneseEUCStringEncoding
: return kCFStringEncodingEUC_JP
;
271 case NSShiftJISStringEncoding
: return kCFStringEncodingDOSJapanese
;
272 case NSISO2022JPStringEncoding
: return kCFStringEncodingISO_2022_JP
;
273 case NSISOLatin2StringEncoding
: return kCFStringEncodingISOLatin2
;
274 case NSWindowsCP1251StringEncoding
: return kCFStringEncodingWindowsCyrillic
;
275 case NSWindowsCP1253StringEncoding
: return kCFStringEncodingWindowsGreek
;
276 case NSWindowsCP1254StringEncoding
: return kCFStringEncodingWindowsLatin5
;
277 case NSWindowsCP1250StringEncoding
: return kCFStringEncodingWindowsLatin2
;
// A value tagged with the marker bit carries a raw CFStringEncoding.
280 return ((theEncoding
& NSENCODING_MASK
) ? theEncoding
& ~NSENCODING_MASK
: kCFStringEncodingInvalidId
);
/*
 * Code page bases used by the Windows code page conversions in this file:
 * MACCODEPAGE_BASE is added to (and subtracted from) Mac OS script encoding
 * values; ISO8859CODEPAGE_BASE is combined with the ISO 8859 encoding's offset
 * from 0x200.
 */
284 #define MACCODEPAGE_BASE (10000)
285 #define ISO8859CODEPAGE_BASE (28590)
/*
 * DOS code page numbers, indexed by the DOS encoding's offset from 0x400
 * (CFStringConvertEncodingToWindowsCodepage() reads it that way).
 * A slot holding -1 (stored as 0xFFFF) has no code page. Slot 0x41F used to
 * hold the octal literal 01, which made that unmapped slot report code page 1;
 * it now carries the same -1 marker as the other empty slots.
 */
static const uint16_t _CFToDOSCodePageList[] = {
    437, -1, -1, -1, -1, 737, 775, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x400
    850, 851, 852, 855, 857, 860, 861, 862, 863, 864, 865, 866, 869, 874, -1, -1, // 0x410
    932, 936, 949, 950, // 0x420
};
/*
 * Windows (ANSI) code page numbers consumed by
 * CFStringConvertEncodingToWindowsCodepage() for the 0x0500 encoding family.
 * NOTE(review): the closing line of the initializer is not visible in this
 * excerpt.
 */
293 static const uint16_t _CFToWindowsCodePageList
[] = {
294 1252, 1250, 1251, 1253, 1254, 1255, 1256, 1257, 1258,
/*
 * Code page numbers for the EUC encoding family, consumed by
 * CFStringConvertEncodingToWindowsCodepage(). The reverse table in this file
 * pairs these numbers with EUC_JP (51932), EUC_CN (51936), EUC_TW (51950) and
 * EUC_KR (51949).
 * NOTE(review): the closing line of the initializer is not visible in this
 * excerpt.
 */
297 static const uint16_t _CFEUCToCodePage
[] = { // 0x900
298 51932, 51936, 51950, 51949,
301 UInt32
CFStringConvertEncodingToWindowsCodepage(CFStringEncoding theEncoding
) {
302 #if DEPLOYMENT_TARGET_MACOSX
303 CFStringEncoding encodingBase
= theEncoding
& 0x0FFF;
306 switch (theEncoding
& 0x0F00) {
307 #if DEPLOYMENT_TARGET_MACOSX
308 case 0: // Mac OS script
309 if (encodingBase
<= kCFStringEncodingMacCentralEurRoman
) {
310 return MACCODEPAGE_BASE
+ encodingBase
;
311 } else if (encodingBase
== kCFStringEncodingMacTurkish
) {
313 } else if (encodingBase
== kCFStringEncodingMacCroatian
) {
315 } else if (encodingBase
== kCFStringEncodingMacIcelandic
) {
321 case 0x100: // Unicode
322 switch (theEncoding
) {
323 case kCFStringEncodingUTF8
: return 65001;
324 case kCFStringEncodingUTF16
: return 1200;
325 case kCFStringEncodingUTF16BE
: return 1201;
326 case kCFStringEncodingUTF32
: return 65005;
327 case kCFStringEncodingUTF32BE
: return 65006;
331 #if (DEPLOYMENT_TARGET_MACOSX)
332 case 0x0200: // ISO 8859 series
333 if (encodingBase
<= kCFStringEncodingISOLatin10
) return ISO8859CODEPAGE_BASE
+ (encodingBase
- 0x200);
336 case 0x0400: // DOS codepage
337 if (encodingBase
<= kCFStringEncodingDOSChineseTrad
) return _CFToDOSCodePageList
[encodingBase
- 0x400];
340 case 0x0500: // ANSI (Windows) codepage
341 if (encodingBase
<= kCFStringEncodingWindowsVietnamese
) return _CFToWindowsCodePageList
[theEncoding
- 0x500];
342 else if (encodingBase
== kCFStringEncodingWindowsKoreanJohab
) return 1361;
345 case 0x600: // National standards
346 if (encodingBase
== kCFStringEncodingASCII
) return 20127;
347 else if (encodingBase
== kCFStringEncodingGB_18030_2000
) return 54936;
350 case 0x0800: // ISO 2022 series
351 switch (encodingBase
) {
352 case kCFStringEncodingISO_2022_JP
: return 50220;
353 case kCFStringEncodingISO_2022_CN
: return 50227;
354 case kCFStringEncodingISO_2022_KR
: return 50225;
358 case 0x0900: // EUC series
359 if (encodingBase
<= kCFStringEncodingEUC_KR
) return _CFEUCToCodePage
[encodingBase
- 0x0900];
363 case 0x0A00: // Misc encodings
364 switch (encodingBase
) {
365 case kCFStringEncodingKOI8_R
: return 20866;
366 case kCFStringEncodingHZ_GB_2312
: return 52936;
367 case kCFStringEncodingKOI8_U
: return 21866;
371 case 0x0C00: // IBM EBCDIC encodings
372 if (encodingBase
== kCFStringEncodingEBCDIC_CP037
) return 37;
377 return kCFStringEncodingInvalidId
;
/*
 * Windows code page number -> CFStringEncoding pairs, compiled only for the
 * Mac OS X deployment target. Entries are listed in ascending code page order;
 * bsearchEncoding() binary-searches the table and reads each entry as two
 * consecutive uint16_t values (code page first, encoding second).
 * NOTE(review): the struct member declarations and the closing line of the
 * initializer are not visible in this excerpt.
 */
380 #if DEPLOYMENT_TARGET_MACOSX
381 static const struct {
384 } _CFACPToCFTable
[] = {
385 {37, kCFStringEncodingEBCDIC_CP037
},
386 {437, kCFStringEncodingDOSLatinUS
},
387 {737, kCFStringEncodingDOSGreek
},
388 {775, kCFStringEncodingDOSBalticRim
},
389 {850, kCFStringEncodingDOSLatin1
},
390 {851, kCFStringEncodingDOSGreek1
},
391 {852, kCFStringEncodingDOSLatin2
},
392 {855, kCFStringEncodingDOSCyrillic
},
393 {857, kCFStringEncodingDOSTurkish
},
394 {860, kCFStringEncodingDOSPortuguese
},
395 {861, kCFStringEncodingDOSIcelandic
},
396 {862, kCFStringEncodingDOSHebrew
},
397 {863, kCFStringEncodingDOSCanadianFrench
},
398 {864, kCFStringEncodingDOSArabic
},
399 {865, kCFStringEncodingDOSNordic
},
400 {866, kCFStringEncodingDOSRussian
},
401 {869, kCFStringEncodingDOSGreek2
},
402 {874, kCFStringEncodingDOSThai
},
403 {932, kCFStringEncodingDOSJapanese
},
404 {936, kCFStringEncodingDOSChineseSimplif
},
405 {949, kCFStringEncodingDOSKorean
},
406 {950, kCFStringEncodingDOSChineseTrad
},
407 {1250, kCFStringEncodingWindowsLatin2
},
408 {1251, kCFStringEncodingWindowsCyrillic
},
409 {1252, kCFStringEncodingWindowsLatin1
},
410 {1253, kCFStringEncodingWindowsGreek
},
411 {1254, kCFStringEncodingWindowsLatin5
},
412 {1255, kCFStringEncodingWindowsHebrew
},
413 {1256, kCFStringEncodingWindowsArabic
},
414 {1257, kCFStringEncodingWindowsBalticRim
},
415 {1258, kCFStringEncodingWindowsVietnamese
},
416 {1361, kCFStringEncodingWindowsKoreanJohab
},
417 {20127, kCFStringEncodingASCII
},
418 {20866, kCFStringEncodingKOI8_R
},
419 {21866, kCFStringEncodingKOI8_U
},
420 {50220, kCFStringEncodingISO_2022_JP
},
421 {50225, kCFStringEncodingISO_2022_KR
},
422 {50227, kCFStringEncodingISO_2022_CN
},
423 {51932, kCFStringEncodingEUC_JP
},
424 {51936, kCFStringEncodingEUC_CN
},
425 {51949, kCFStringEncodingEUC_KR
},
426 {51950, kCFStringEncodingEUC_TW
},
427 {52936, kCFStringEncodingHZ_GB_2312
},
428 {54936, kCFStringEncodingGB_18030_2000
},
431 static SInt32
bsearchEncoding(uint16_t target
) {
432 const unsigned int *start
, *end
, *divider
;
433 unsigned int size
= sizeof(_CFACPToCFTable
) / sizeof(UInt32
);
435 start
= (const unsigned int*)_CFACPToCFTable
; end
= (const unsigned int*)_CFACPToCFTable
+ (size
- 1);
436 while (start
<= end
) {
437 divider
= start
+ ((end
- start
) / 2);
439 if (*(const uint16_t*)divider
== target
) return *((const uint16_t*)divider
+ 1);
440 else if (*(const uint16_t*)divider
> target
) end
= divider
- 1;
441 else if (*(const uint16_t*)(divider
+ 1) > target
) return *((const uint16_t*)divider
+ 1);
442 else start
= divider
+ 1;
444 return (kCFStringEncodingInvalidId
);
448 CFStringEncoding
CFStringConvertWindowsCodepageToEncoding(UInt32 theEncoding
) {
449 if (theEncoding
== 0 || theEncoding
== 1) { // ID for default (system) codepage
450 return CFStringGetSystemEncoding();
451 } else if ((theEncoding
>= MACCODEPAGE_BASE
) && (theEncoding
< 20000)) { // Mac script
452 if (theEncoding
<= 10029) return theEncoding
- MACCODEPAGE_BASE
; // up to Mac Central European
453 #if (DEPLOYMENT_TARGET_MACOSX)
454 else if (theEncoding
== 10079) return kCFStringEncodingMacIcelandic
;
455 else if (theEncoding
== 10081) return kCFStringEncodingMacTurkish
;
456 else if (theEncoding
== 10082) return kCFStringEncodingMacCroatian
;
458 } else if ((theEncoding
>= ISO8859CODEPAGE_BASE
) && (theEncoding
<= 28605)) { // ISO 8859
459 return (theEncoding
- ISO8859CODEPAGE_BASE
) + 0x200;
460 } else if (theEncoding
== 65001) { // UTF-8
461 return kCFStringEncodingUTF8
;
462 } else if (theEncoding
== 12000) { // UTF-16
463 return kCFStringEncodingUTF16
;
464 } else if (theEncoding
== 12001) { // UTF-16BE
465 return kCFStringEncodingUTF16BE
;
466 } else if (theEncoding
== 65005) { // UTF-32
467 return kCFStringEncodingUTF32
;
468 } else if (theEncoding
== 65006) { // UTF-32BE
469 return kCFStringEncodingUTF32BE
;
471 #if DEPLOYMENT_TARGET_MACOSX
472 return bsearchEncoding(theEncoding
);
476 return kCFStringEncodingInvalidId
;
479 CFStringEncoding
CFStringGetMostCompatibleMacStringEncoding(CFStringEncoding encoding
) {
480 CFStringEncoding macEncoding
;
482 macEncoding
= CFStringEncodingGetScriptCodeForEncoding(encoding
);