2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
23 /* CFStringUtilities.c
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Aki Inoue
28 #include "CFInternal.h"
29 #include "CFStringEncodingConverterExt.h"
30 #include "CFUniChar.h"
31 #include <CoreFoundation/CFStringEncodingExt.h>
33 #if defined(__MACH__) || defined(__LINUX__)
35 #elif defined(__WIN32__)
41 Boolean
CFStringIsEncodingAvailable(CFStringEncoding theEncoding
) {
42 switch (theEncoding
) {
43 case kCFStringEncodingASCII
: // Built-in encodings
44 case kCFStringEncodingMacRoman
:
45 case kCFStringEncodingUTF8
:
46 case kCFStringEncodingNonLossyASCII
:
47 case kCFStringEncodingWindowsLatin1
:
48 case kCFStringEncodingNextStepLatin
:
49 case kCFStringEncodingUTF16
:
50 case kCFStringEncodingUTF16BE
:
51 case kCFStringEncodingUTF16LE
:
52 case kCFStringEncodingUTF32
:
53 case kCFStringEncodingUTF32BE
:
54 case kCFStringEncodingUTF32LE
:
58 return CFStringEncodingIsValidEncoding(theEncoding
);
62 const CFStringEncoding
* CFStringGetListOfAvailableEncodings() {
63 return CFStringEncodingListOfAvailableEncodings();
66 CFStringRef
CFStringGetNameOfEncoding(CFStringEncoding theEncoding
) {
67 static CFMutableDictionaryRef mappingTable
= NULL
;
68 CFStringRef theName
= mappingTable
? CFDictionaryGetValue(mappingTable
, (const void*)theEncoding
) : NULL
;
71 switch (theEncoding
) {
72 case kCFStringEncodingUTF8
: theName
= CFSTR("Unicode (UTF-8)"); break;
73 case kCFStringEncodingUTF16
: theName
= CFSTR("Unicode (UTF-16)"); break;
74 case kCFStringEncodingUTF16BE
: theName
= CFSTR("Unicode (UTF-16BE)"); break;
75 case kCFStringEncodingUTF16LE
: theName
= CFSTR("Unicode (UTF-16LE)"); break;
76 case kCFStringEncodingUTF32
: theName
= CFSTR("Unicode (UTF-32)"); break;
77 case kCFStringEncodingUTF32BE
: theName
= CFSTR("Unicode (UTF-32BE)"); break;
78 case kCFStringEncodingUTF32LE
: theName
= CFSTR("Unicode (UTF-32LE)"); break;
79 case kCFStringEncodingNonLossyASCII
: theName
= CFSTR("Non-lossy ASCII"); break;
82 const uint8_t *encodingName
= CFStringEncodingName(theEncoding
);
85 theName
= CFStringCreateWithCString(NULL
, encodingName
, kCFStringEncodingASCII
);
92 if (!mappingTable
) mappingTable
= CFDictionaryCreateMutable(NULL
, 0, (const CFDictionaryKeyCallBacks
*)NULL
, &kCFTypeDictionaryValueCallBacks
);
94 CFDictionaryAddValue(mappingTable
, (const void*)theEncoding
, (const void*)theName
);
102 CFStringEncoding
CFStringConvertIANACharSetNameToEncoding(CFStringRef charsetName
) {
103 static CFMutableDictionaryRef mappingTable
= NULL
;
104 CFStringEncoding result
= kCFStringEncodingInvalidId
;
105 CFMutableStringRef lowerCharsetName
;
107 /* Check for common encodings first */
108 if (CFStringCompare(charsetName
, CFSTR("utf-8"), kCFCompareCaseInsensitive
) == kCFCompareEqualTo
) {
109 return kCFStringEncodingUTF8
;
110 } else if (CFStringCompare(charsetName
, CFSTR("iso-8859-1"), kCFCompareCaseInsensitive
) == kCFCompareEqualTo
) {
111 return kCFStringEncodingISOLatin1
;
114 /* Create lowercase copy */
115 lowerCharsetName
= CFStringCreateMutableCopy(NULL
, 0, charsetName
);
116 CFStringLowercase(lowerCharsetName
, NULL
);
118 if (mappingTable
== NULL
) {
119 CFMutableDictionaryRef table
= CFDictionaryCreateMutable(NULL
, 0, &kCFTypeDictionaryKeyCallBacks
, (const CFDictionaryValueCallBacks
*)NULL
);
120 const CFStringEncoding
*encodings
= CFStringGetListOfAvailableEncodings();
122 while (*encodings
!= kCFStringEncodingInvalidId
) {
123 const char **nameList
= CFStringEncodingCanonicalCharsetNames(*encodings
);
127 CFStringRef name
= CFStringCreateWithCString(NULL
, *nameList
++, kCFStringEncodingASCII
);
130 CFDictionaryAddValue(table
, (const void*)name
, (const void*)*encodings
);
137 // Adding Unicode names
138 CFDictionaryAddValue(table
, (const void*)CFSTR("unicode-1-1"), (const void*)kCFStringEncodingUTF16
);
139 CFDictionaryAddValue(table
, (const void*)CFSTR("iso-10646-ucs-2"), (const void*)kCFStringEncodingUTF16
);
140 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-16"), (const void*)kCFStringEncodingUTF16
);
141 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-16be"), (const void*)kCFStringEncodingUTF16BE
);
142 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-16le"), (const void*)kCFStringEncodingUTF16LE
);
143 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-32"), (const void*)kCFStringEncodingUTF32
);
144 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-32be"), (const void*)kCFStringEncodingUTF32BE
);
145 CFDictionaryAddValue(table
, (const void*)CFSTR("utf-32le"), (const void*)kCFStringEncodingUTF32LE
);
147 mappingTable
= table
;
150 if (CFDictionaryContainsKey(mappingTable
, (const void*)lowerCharsetName
)) {
151 result
= (CFStringEncoding
)CFDictionaryGetValue(mappingTable
, (const void*)lowerCharsetName
);
154 CFRelease(lowerCharsetName
);
159 CFStringRef
CFStringConvertEncodingToIANACharSetName(CFStringEncoding encoding
) {
160 static CFMutableDictionaryRef mappingTable
= NULL
;
161 CFStringRef theName
= mappingTable
? (CFStringRef
)CFDictionaryGetValue(mappingTable
, (const void*)encoding
) : NULL
;
165 case kCFStringEncodingUTF16
: theName
= CFSTR("UTF-16"); break;
166 case kCFStringEncodingUTF16BE
: theName
= CFSTR("UTF-16BE"); break;
167 case kCFStringEncodingUTF16LE
: theName
= CFSTR("UTF-16LE"); break;
168 case kCFStringEncodingUTF32
: theName
= CFSTR("UTF-32"); break;
169 case kCFStringEncodingUTF32BE
: theName
= CFSTR("UTF-32BE"); break;
170 case kCFStringEncodingUTF32LE
: theName
= CFSTR("UTF-32LE"); break;
174 const char **nameList
= CFStringEncodingCanonicalCharsetNames(encoding
);
176 if (nameList
&& *nameList
) {
177 CFMutableStringRef upperCaseName
;
179 theName
= CFStringCreateWithCString(NULL
, *nameList
, kCFStringEncodingASCII
);
181 upperCaseName
= CFStringCreateMutableCopy(NULL
, 0, theName
);
182 CFStringUppercase(upperCaseName
, 0);
184 theName
= upperCaseName
;
192 if (!mappingTable
) mappingTable
= CFDictionaryCreateMutable(NULL
, 0, (const CFDictionaryKeyCallBacks
*)NULL
, &kCFTypeDictionaryValueCallBacks
);
194 CFDictionaryAddValue(mappingTable
, (const void*)encoding
, (const void*)theName
);
/*
 * Numeric NSStringEncoding identifiers used by the CFStringEncoding <->
 * NSStringEncoding converters in this file.
 */
enum {
    NSASCIIStringEncoding = 1,              /* 0..127 only */
    NSNEXTSTEPStringEncoding = 2,
    NSJapaneseEUCStringEncoding = 3,
    NSUTF8StringEncoding = 4,
    NSISOLatin1StringEncoding = 5,
    NSSymbolStringEncoding = 6,
    NSNonLossyASCIIStringEncoding = 7,
    NSShiftJISStringEncoding = 8,
    NSISOLatin2StringEncoding = 9,
    NSUnicodeStringEncoding = 10,
    NSWindowsCP1251StringEncoding = 11,     /* Cyrillic; same as AdobeStandardCyrillic */
    NSWindowsCP1252StringEncoding = 12,     /* WinLatin1 */
    NSWindowsCP1253StringEncoding = 13,     /* Greek */
    NSWindowsCP1254StringEncoding = 14,     /* Turkish */
    NSWindowsCP1250StringEncoding = 15,     /* WinLatin2 */
    NSISO2022JPStringEncoding = 21,         /* ISO 2022 Japanese encoding for e-mail */
    NSMacOSRomanStringEncoding = 30,

    NSProprietaryStringEncoding = 65536     /* Installation-specific encoding */
};
/*
 * High bit that tags an NSStringEncoding value as "CFStringEncoding in
 * disguise". Unsigned literal: shifting a signed 1 into bit 31 is undefined.
 */
#define NSENCODING_MASK (1U << 31)
226 UInt32
CFStringConvertEncodingToNSStringEncoding(CFStringEncoding theEncoding
) {
227 switch (theEncoding
& 0xFFF) {
228 case kCFStringEncodingASCII
: return NSASCIIStringEncoding
;
229 case kCFStringEncodingNextStepLatin
: return NSNEXTSTEPStringEncoding
;
230 case kCFStringEncodingISOLatin1
: return NSISOLatin1StringEncoding
;
231 case kCFStringEncodingNonLossyASCII
: return NSNonLossyASCIIStringEncoding
;
232 case kCFStringEncodingWindowsLatin1
: return NSWindowsCP1252StringEncoding
;
233 case kCFStringEncodingMacRoman
: return NSMacOSRomanStringEncoding
;
234 #if defined(__MACH__)
235 case kCFStringEncodingEUC_JP
: return NSJapaneseEUCStringEncoding
;
236 case kCFStringEncodingMacSymbol
: return NSSymbolStringEncoding
;
237 case kCFStringEncodingDOSJapanese
: return NSShiftJISStringEncoding
;
238 case kCFStringEncodingISOLatin2
: return NSISOLatin2StringEncoding
;
239 case kCFStringEncodingWindowsCyrillic
: return NSWindowsCP1251StringEncoding
;
240 case kCFStringEncodingWindowsGreek
: return NSWindowsCP1253StringEncoding
;
241 case kCFStringEncodingWindowsLatin5
: return NSWindowsCP1254StringEncoding
;
242 case kCFStringEncodingWindowsLatin2
: return NSWindowsCP1250StringEncoding
;
243 case kCFStringEncodingISO_2022_JP
: return NSISO2022JPStringEncoding
;
244 case kCFStringEncodingUnicode
:
245 if (theEncoding
== kCFStringEncodingUTF16
) return NSUnicodeStringEncoding
;
246 else if (theEncoding
== kCFStringEncodingUTF8
) return NSUTF8StringEncoding
;
248 /* fall-through for other encoding schemes */
251 return NSENCODING_MASK
| theEncoding
;
255 CFStringEncoding
CFStringConvertNSStringEncodingToEncoding(UInt32 theEncoding
) {
256 switch (theEncoding
) {
257 case NSASCIIStringEncoding
: return kCFStringEncodingASCII
;
258 case NSNEXTSTEPStringEncoding
: return kCFStringEncodingNextStepLatin
;
259 case NSUTF8StringEncoding
: return kCFStringEncodingUTF8
;
260 case NSISOLatin1StringEncoding
: return kCFStringEncodingISOLatin1
;
261 case NSNonLossyASCIIStringEncoding
: return kCFStringEncodingNonLossyASCII
;
262 case NSUnicodeStringEncoding
: return kCFStringEncodingUTF16
;
263 case NSWindowsCP1252StringEncoding
: return kCFStringEncodingWindowsLatin1
;
264 case NSMacOSRomanStringEncoding
: return kCFStringEncodingMacRoman
;
265 #if defined(__MACH__)
266 case NSSymbolStringEncoding
: return kCFStringEncodingMacSymbol
;
267 case NSJapaneseEUCStringEncoding
: return kCFStringEncodingEUC_JP
;
268 case NSShiftJISStringEncoding
: return kCFStringEncodingDOSJapanese
;
269 case NSISO2022JPStringEncoding
: return kCFStringEncodingISO_2022_JP
;
270 case NSISOLatin2StringEncoding
: return kCFStringEncodingISOLatin2
;
271 case NSWindowsCP1251StringEncoding
: return kCFStringEncodingWindowsCyrillic
;
272 case NSWindowsCP1253StringEncoding
: return kCFStringEncodingWindowsGreek
;
273 case NSWindowsCP1254StringEncoding
: return kCFStringEncodingWindowsLatin5
;
274 case NSWindowsCP1250StringEncoding
: return kCFStringEncodingWindowsLatin2
;
277 return ((theEncoding
& NSENCODING_MASK
) ? theEncoding
& ~NSENCODING_MASK
: kCFStringEncodingInvalidId
);
/* Windows code page = MACCODEPAGE_BASE + Mac script encoding value. */
#define MACCODEPAGE_BASE (10000)
/* Windows code page = ISO8859CODEPAGE_BASE + (ISO 8859 encoding value - 0x200). */
#define ISO8859CODEPAGE_BASE (28590)
/*
 * DOS code pages indexed by (encoding value - 0x400). 0xFFFF marks a slot
 * with no code page; it is the value the former "-1" initialisers already
 * produced in a uint16_t. The last slot of the 0x410 row previously read
 * "01", a typo for the same sentinel.
 */
static const uint16_t _CFToDOSCodePageList[] = {
    437, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 737, 775, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, // 0x400
    850, 851, 852, 855, 857, 860, 861, 862, 863, 864, 865, 866, 869, 874, 0xFFFF, 0xFFFF, // 0x410
    932, 936, 949, 950, // 0x420
};
/* ANSI (Windows) code pages indexed by (encoding value - 0x500). */
static const uint16_t _CFToWindowsCodePageList[] = {
    1252, 1250, 1251, 1253, 1254, 1255, 1256, 1257, 1258,
};
/* EUC code pages, in the order EUC-JP, EUC-CN, EUC-TW, EUC-KR. */
static const uint16_t _CFEUCToCodePage[] = { // 0x900
    51932, 51936, 51950, 51949,
};
298 UInt32
CFStringConvertEncodingToWindowsCodepage(CFStringEncoding theEncoding
) {
299 #if defined(__MACH__)
300 CFStringEncoding encodingBase
= theEncoding
& 0x0FFF;
303 switch (theEncoding
& 0x0F00) {
304 #if defined(__MACH__)
305 case 0: // Mac OS script
306 if (encodingBase
<= kCFStringEncodingMacCentralEurRoman
) {
307 return MACCODEPAGE_BASE
+ encodingBase
;
308 } else if (encodingBase
== kCFStringEncodingMacTurkish
) {
310 } else if (encodingBase
== kCFStringEncodingMacCroatian
) {
312 } else if (encodingBase
== kCFStringEncodingMacIcelandic
) {
318 case 0x100: // Unicode
319 switch (theEncoding
) {
320 case kCFStringEncodingUTF8
: return 65001;
321 case kCFStringEncodingUTF16
: return 1200;
322 case kCFStringEncodingUTF16BE
: return 1201;
323 case kCFStringEncodingUTF32
: return 65005;
324 case kCFStringEncodingUTF32BE
: return 65006;
328 #if defined(__MACH__)
329 case 0x0200: // ISO 8859 series
330 if (encodingBase
<= kCFStringEncodingISOLatin10
) return ISO8859CODEPAGE_BASE
+ (encodingBase
- 0x200);
333 case 0x0400: // DOS codepage
334 if (encodingBase
<= kCFStringEncodingDOSChineseTrad
) return _CFToDOSCodePageList
[encodingBase
- 0x400];
337 case 0x0500: // ANSI (Windows) codepage
338 if (encodingBase
<= kCFStringEncodingWindowsVietnamese
) return _CFToWindowsCodePageList
[theEncoding
- 0x500];
339 else if (encodingBase
== kCFStringEncodingWindowsKoreanJohab
) return 1361;
342 case 0x600: // National standards
343 if (encodingBase
== kCFStringEncodingASCII
) return 20127;
344 else if (encodingBase
== kCFStringEncodingGB_18030_2000
) return 54936;
347 case 0x0800: // ISO 2022 series
348 switch (encodingBase
) {
349 case kCFStringEncodingISO_2022_JP
: return 50220;
350 case kCFStringEncodingISO_2022_CN
: return 50227;
351 case kCFStringEncodingISO_2022_KR
: return 50225;
355 case 0x0900: // EUC series
356 if (encodingBase
<= kCFStringEncodingEUC_KR
) return _CFEUCToCodePage
[encodingBase
- 0x0900];
360 case 0x0A00: // Misc encodings
361 switch (encodingBase
) {
362 case kCFStringEncodingKOI8_R
: return 20866;
363 case kCFStringEncodingHZ_GB_2312
: return 52936;
364 case kCFStringEncodingKOI8_U
: return 21866;
368 case 0x0C00: // IBM EBCDIC encodings
369 if (encodingBase
== kCFStringEncodingEBCDIC_CP037
) return 37;
374 return kCFStringEncodingInvalidId
;
#if defined(__MACH__)
/*
 * Windows code page -> CFStringEncoding pairs, sorted by ascending code page
 * so bsearchEncoding() can binary-search them.
 */
static const struct {
    uint16_t acp;
    uint16_t encoding;
} _CFACPToCFTable[] = {
    {37, kCFStringEncodingEBCDIC_CP037},
    {437, kCFStringEncodingDOSLatinUS},
    {737, kCFStringEncodingDOSGreek},
    {775, kCFStringEncodingDOSBalticRim},
    {850, kCFStringEncodingDOSLatin1},
    {851, kCFStringEncodingDOSGreek1},
    {852, kCFStringEncodingDOSLatin2},
    {855, kCFStringEncodingDOSCyrillic},
    {857, kCFStringEncodingDOSTurkish},
    {860, kCFStringEncodingDOSPortuguese},
    {861, kCFStringEncodingDOSIcelandic},
    {862, kCFStringEncodingDOSHebrew},
    {863, kCFStringEncodingDOSCanadianFrench},
    {864, kCFStringEncodingDOSArabic},
    {865, kCFStringEncodingDOSNordic},
    {866, kCFStringEncodingDOSRussian},
    {869, kCFStringEncodingDOSGreek2},
    {874, kCFStringEncodingDOSThai},
    {932, kCFStringEncodingDOSJapanese},
    {936, kCFStringEncodingDOSChineseSimplif},
    {949, kCFStringEncodingDOSKorean},
    {950, kCFStringEncodingDOSChineseTrad},
    {1250, kCFStringEncodingWindowsLatin2},
    {1251, kCFStringEncodingWindowsCyrillic},
    {1252, kCFStringEncodingWindowsLatin1},
    {1253, kCFStringEncodingWindowsGreek},
    {1254, kCFStringEncodingWindowsLatin5},
    {1255, kCFStringEncodingWindowsHebrew},
    {1256, kCFStringEncodingWindowsArabic},
    {1257, kCFStringEncodingWindowsBalticRim},
    {1258, kCFStringEncodingWindowsVietnamese},
    {1361, kCFStringEncodingWindowsKoreanJohab},
    {20127, kCFStringEncodingASCII},
    {20866, kCFStringEncodingKOI8_R},
    {21866, kCFStringEncodingKOI8_U},
    {50220, kCFStringEncodingISO_2022_JP},
    {50225, kCFStringEncodingISO_2022_KR},
    {50227, kCFStringEncodingISO_2022_CN},
    {51932, kCFStringEncodingEUC_JP},
    {51936, kCFStringEncodingEUC_CN},
    {51949, kCFStringEncodingEUC_KR},
    {51950, kCFStringEncodingEUC_TW},
    {52936, kCFStringEncodingHZ_GB_2312},
    {54936, kCFStringEncodingGB_18030_2000},
};

/*
 * Looks up a Windows code page in _CFACPToCFTable.
 *
 * Returns the paired encoding on an exact match and
 * kCFStringEncodingInvalidId otherwise. The search is exact-match only: a
 * code page that falls between two table entries is not mapped to its lower
 * neighbour, and no element beyond the table is read.
 */
static SInt32 bsearchEncoding(uint16_t target) {
    unsigned int low = 0;
    unsigned int high = sizeof(_CFACPToCFTable) / sizeof(_CFACPToCFTable[0]); /* one past the last candidate */

    while (low < high) {
        unsigned int middle = low + ((high - low) / 2);
        uint16_t candidate = _CFACPToCFTable[middle].acp;

        if (candidate == target) return _CFACPToCFTable[middle].encoding;
        else if (candidate > target) high = middle;
        else low = middle + 1;
    }
    return (kCFStringEncodingInvalidId);
}
#endif /* __MACH__ */
445 CFStringEncoding
CFStringConvertWindowsCodepageToEncoding(UInt32 theEncoding
) {
446 if (theEncoding
== 0 || theEncoding
== 1) { // ID for default (system) codepage
447 return CFStringGetSystemEncoding();
448 } else if ((theEncoding
>= MACCODEPAGE_BASE
) && (theEncoding
< 20000)) { // Mac script
449 if (theEncoding
<= 10029) return theEncoding
- MACCODEPAGE_BASE
; // up to Mac Central European
450 #if defined(__MACH__)
451 else if (theEncoding
== 10079) return kCFStringEncodingMacIcelandic
;
452 else if (theEncoding
== 10081) return kCFStringEncodingMacTurkish
;
453 else if (theEncoding
== 10082) return kCFStringEncodingMacCroatian
;
455 } else if ((theEncoding
>= ISO8859CODEPAGE_BASE
) && (theEncoding
<= 28605)) { // ISO 8859
456 return (theEncoding
- ISO8859CODEPAGE_BASE
) + 0x200;
457 } else if (theEncoding
== 65001) { // UTF-8
458 return kCFStringEncodingUTF8
;
459 } else if (theEncoding
== 12000) { // UTF-16
460 return kCFStringEncodingUTF16
;
461 } else if (theEncoding
== 12001) { // UTF-16BE
462 return kCFStringEncodingUTF16BE
;
463 } else if (theEncoding
== 65005) { // UTF-32
464 return kCFStringEncodingUTF32
;
465 } else if (theEncoding
== 65006) { // UTF-32BE
466 return kCFStringEncodingUTF32BE
;
468 #if defined(__MACH__)
469 return bsearchEncoding(theEncoding
);
473 return kCFStringEncodingInvalidId
;
476 CFStringEncoding
CFStringGetMostCompatibleMacStringEncoding(CFStringEncoding encoding
) {
477 CFStringEncoding macEncoding
;
479 macEncoding
= CFStringEncodingGetScriptCodeForEncoding(encoding
);