]> git.saurik.com Git - apple/cf.git/blob - String.subproj/CFStringUtilities.c
CF-368.27.tar.gz
[apple/cf.git] / String.subproj / CFStringUtilities.c
1 /*
2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 /* CFStringUtilities.c
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Aki Inoue
26 */
27
28 #include "CFInternal.h"
29 #include "CFStringEncodingConverterExt.h"
30 #include "CFUniChar.h"
31 #include <CoreFoundation/CFStringEncodingExt.h>
32 #include <limits.h>
33 #if defined(__MACH__) || defined(__LINUX__)
34 #include <stdlib.h>
35 #elif defined(__WIN32__)
36 #include <stdlib.h>
37 #include <tchar.h>
38 #endif
39
40
41 Boolean CFStringIsEncodingAvailable(CFStringEncoding theEncoding) {
42 switch (theEncoding) {
43 case kCFStringEncodingASCII: // Built-in encodings
44 case kCFStringEncodingMacRoman:
45 case kCFStringEncodingUTF8:
46 case kCFStringEncodingNonLossyASCII:
47 case kCFStringEncodingWindowsLatin1:
48 case kCFStringEncodingNextStepLatin:
49 case kCFStringEncodingUTF16:
50 case kCFStringEncodingUTF16BE:
51 case kCFStringEncodingUTF16LE:
52 case kCFStringEncodingUTF32:
53 case kCFStringEncodingUTF32BE:
54 case kCFStringEncodingUTF32LE:
55 return true;
56
57 default:
58 return CFStringEncodingIsValidEncoding(theEncoding);
59 }
60 }
61
62 const CFStringEncoding* CFStringGetListOfAvailableEncodings() {
63 return CFStringEncodingListOfAvailableEncodings();
64 }
65
66 CFStringRef CFStringGetNameOfEncoding(CFStringEncoding theEncoding) {
67 static CFMutableDictionaryRef mappingTable = NULL;
68 CFStringRef theName = mappingTable ? CFDictionaryGetValue(mappingTable, (const void*)theEncoding) : NULL;
69
70 if (!theName) {
71 switch (theEncoding) {
72 case kCFStringEncodingUTF8: theName = CFSTR("Unicode (UTF-8)"); break;
73 case kCFStringEncodingUTF16: theName = CFSTR("Unicode (UTF-16)"); break;
74 case kCFStringEncodingUTF16BE: theName = CFSTR("Unicode (UTF-16BE)"); break;
75 case kCFStringEncodingUTF16LE: theName = CFSTR("Unicode (UTF-16LE)"); break;
76 case kCFStringEncodingUTF32: theName = CFSTR("Unicode (UTF-32)"); break;
77 case kCFStringEncodingUTF32BE: theName = CFSTR("Unicode (UTF-32BE)"); break;
78 case kCFStringEncodingUTF32LE: theName = CFSTR("Unicode (UTF-32LE)"); break;
79 case kCFStringEncodingNonLossyASCII: theName = CFSTR("Non-lossy ASCII"); break;
80
81 default: {
82 const uint8_t *encodingName = CFStringEncodingName(theEncoding);
83
84 if (encodingName) {
85 theName = CFStringCreateWithCString(NULL, encodingName, kCFStringEncodingASCII);
86 }
87 }
88 break;
89 }
90
91 if (theName) {
92 if (!mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, (const CFDictionaryKeyCallBacks *)NULL, &kCFTypeDictionaryValueCallBacks);
93
94 CFDictionaryAddValue(mappingTable, (const void*)theEncoding, (const void*)theName);
95 CFRelease(theName);
96 }
97 }
98
99 return theName;
100 }
101
102 CFStringEncoding CFStringConvertIANACharSetNameToEncoding(CFStringRef charsetName) {
103 static CFMutableDictionaryRef mappingTable = NULL;
104 CFStringEncoding result = kCFStringEncodingInvalidId;
105 CFMutableStringRef lowerCharsetName;
106
107 /* Check for common encodings first */
108 if (CFStringCompare(charsetName, CFSTR("utf-8"), kCFCompareCaseInsensitive) == kCFCompareEqualTo) {
109 return kCFStringEncodingUTF8;
110 } else if (CFStringCompare(charsetName, CFSTR("iso-8859-1"), kCFCompareCaseInsensitive) == kCFCompareEqualTo) {
111 return kCFStringEncodingISOLatin1;
112 }
113
114 /* Create lowercase copy */
115 lowerCharsetName = CFStringCreateMutableCopy(NULL, 0, charsetName);
116 CFStringLowercase(lowerCharsetName, NULL);
117
118 if (mappingTable == NULL) {
119 CFMutableDictionaryRef table = CFDictionaryCreateMutable(NULL, 0, &kCFTypeDictionaryKeyCallBacks, (const CFDictionaryValueCallBacks *)NULL);
120 const CFStringEncoding *encodings = CFStringGetListOfAvailableEncodings();
121
122 while (*encodings != kCFStringEncodingInvalidId) {
123 const char **nameList = CFStringEncodingCanonicalCharsetNames(*encodings);
124
125 if (nameList) {
126 while (*nameList) {
127 CFStringRef name = CFStringCreateWithCString(NULL, *nameList++, kCFStringEncodingASCII);
128
129 if (name) {
130 CFDictionaryAddValue(table, (const void*)name, (const void*)*encodings);
131 CFRelease(name);
132 }
133 }
134 }
135 encodings++;
136 }
137 // Adding Unicode names
138 CFDictionaryAddValue(table, (const void*)CFSTR("unicode-1-1"), (const void*)kCFStringEncodingUTF16);
139 CFDictionaryAddValue(table, (const void*)CFSTR("iso-10646-ucs-2"), (const void*)kCFStringEncodingUTF16);
140 CFDictionaryAddValue(table, (const void*)CFSTR("utf-16"), (const void*)kCFStringEncodingUTF16);
141 CFDictionaryAddValue(table, (const void*)CFSTR("utf-16be"), (const void*)kCFStringEncodingUTF16BE);
142 CFDictionaryAddValue(table, (const void*)CFSTR("utf-16le"), (const void*)kCFStringEncodingUTF16LE);
143 CFDictionaryAddValue(table, (const void*)CFSTR("utf-32"), (const void*)kCFStringEncodingUTF32);
144 CFDictionaryAddValue(table, (const void*)CFSTR("utf-32be"), (const void*)kCFStringEncodingUTF32BE);
145 CFDictionaryAddValue(table, (const void*)CFSTR("utf-32le"), (const void*)kCFStringEncodingUTF32LE);
146
147 mappingTable = table;
148 }
149
150 if (CFDictionaryContainsKey(mappingTable, (const void*)lowerCharsetName)) {
151 result = (CFStringEncoding)CFDictionaryGetValue(mappingTable, (const void*)lowerCharsetName);
152 }
153
154 CFRelease(lowerCharsetName);
155
156 return result;
157 }
158
159 CFStringRef CFStringConvertEncodingToIANACharSetName(CFStringEncoding encoding) {
160 static CFMutableDictionaryRef mappingTable = NULL;
161 CFStringRef theName = mappingTable ? (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)encoding) : NULL;
162
163 if (!theName) {
164 switch (encoding) {
165 case kCFStringEncodingUTF16: theName = CFSTR("UTF-16"); break;
166 case kCFStringEncodingUTF16BE: theName = CFSTR("UTF-16BE"); break;
167 case kCFStringEncodingUTF16LE: theName = CFSTR("UTF-16LE"); break;
168 case kCFStringEncodingUTF32: theName = CFSTR("UTF-32"); break;
169 case kCFStringEncodingUTF32BE: theName = CFSTR("UTF-32BE"); break;
170 case kCFStringEncodingUTF32LE: theName = CFSTR("UTF-32LE"); break;
171
172
173 default: {
174 const char **nameList = CFStringEncodingCanonicalCharsetNames(encoding);
175
176 if (nameList && *nameList) {
177 CFMutableStringRef upperCaseName;
178
179 theName = CFStringCreateWithCString(NULL, *nameList, kCFStringEncodingASCII);
180 if (theName) {
181 upperCaseName = CFStringCreateMutableCopy(NULL, 0, theName);
182 CFStringUppercase(upperCaseName, 0);
183 CFRelease(theName);
184 theName = upperCaseName;
185 }
186 }
187 }
188 break;
189 }
190
191 if (theName) {
192 if (!mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, (const CFDictionaryKeyCallBacks *)NULL, &kCFTypeDictionaryValueCallBacks);
193
194 CFDictionaryAddValue(mappingTable, (const void*)encoding, (const void*)theName);
195 CFRelease(theName);
196 }
197 }
198
199 return theName;
200 }
201
/*
 * NSStringEncoding values used by the two CF <-> NSStringEncoding
 * conversion functions in this file.
 */
enum {
    NSASCIIStringEncoding = 1, /* 0..127 only */
    NSNEXTSTEPStringEncoding = 2,
    NSJapaneseEUCStringEncoding = 3,
    NSUTF8StringEncoding = 4,
    NSISOLatin1StringEncoding = 5,
    NSSymbolStringEncoding = 6,
    NSNonLossyASCIIStringEncoding = 7,
    NSShiftJISStringEncoding = 8,
    NSISOLatin2StringEncoding = 9,
    NSUnicodeStringEncoding = 10,
    NSWindowsCP1251StringEncoding = 11, /* Cyrillic; same as AdobeStandardCyrillic */
    NSWindowsCP1252StringEncoding = 12, /* WinLatin1 */
    NSWindowsCP1253StringEncoding = 13, /* Greek */
    NSWindowsCP1254StringEncoding = 14, /* Turkish */
    NSWindowsCP1250StringEncoding = 15, /* WinLatin2 */
    NSISO2022JPStringEncoding = 21, /* ISO 2022 Japanese encoding for e-mail */
    NSMacOSRomanStringEncoding = 30,

    NSProprietaryStringEncoding = 65536 /* Installation-specific encoding */
};

/*
 * Top bit of a 32-bit value. The conversion functions OR it onto a
 * CFStringEncoding that has no dedicated NSStringEncoding constant, and
 * strip it again on the way back.
 * The literal is unsigned because shifting a signed 1 into bit 31 is
 * undefined behaviour when int is 32 bits wide.
 */
#define NSENCODING_MASK (1U << 31)
225
226 UInt32 CFStringConvertEncodingToNSStringEncoding(CFStringEncoding theEncoding) {
227 switch (theEncoding & 0xFFF) {
228 case kCFStringEncodingASCII: return NSASCIIStringEncoding;
229 case kCFStringEncodingNextStepLatin: return NSNEXTSTEPStringEncoding;
230 case kCFStringEncodingISOLatin1: return NSISOLatin1StringEncoding;
231 case kCFStringEncodingNonLossyASCII: return NSNonLossyASCIIStringEncoding;
232 case kCFStringEncodingWindowsLatin1: return NSWindowsCP1252StringEncoding;
233 case kCFStringEncodingMacRoman: return NSMacOSRomanStringEncoding;
234 #if defined(__MACH__)
235 case kCFStringEncodingEUC_JP: return NSJapaneseEUCStringEncoding;
236 case kCFStringEncodingMacSymbol: return NSSymbolStringEncoding;
237 case kCFStringEncodingDOSJapanese: return NSShiftJISStringEncoding;
238 case kCFStringEncodingISOLatin2: return NSISOLatin2StringEncoding;
239 case kCFStringEncodingWindowsCyrillic: return NSWindowsCP1251StringEncoding;
240 case kCFStringEncodingWindowsGreek: return NSWindowsCP1253StringEncoding;
241 case kCFStringEncodingWindowsLatin5: return NSWindowsCP1254StringEncoding;
242 case kCFStringEncodingWindowsLatin2: return NSWindowsCP1250StringEncoding;
243 case kCFStringEncodingISO_2022_JP: return NSISO2022JPStringEncoding;
244 case kCFStringEncodingUnicode:
245 if (theEncoding == kCFStringEncodingUTF16) return NSUnicodeStringEncoding;
246 else if (theEncoding == kCFStringEncodingUTF8) return NSUTF8StringEncoding;
247 #endif // __MACH__
248 /* fall-through for other encoding schemes */
249
250 default:
251 return NSENCODING_MASK | theEncoding;
252 }
253 }
254
255 CFStringEncoding CFStringConvertNSStringEncodingToEncoding(UInt32 theEncoding) {
256 switch (theEncoding) {
257 case NSASCIIStringEncoding: return kCFStringEncodingASCII;
258 case NSNEXTSTEPStringEncoding: return kCFStringEncodingNextStepLatin;
259 case NSUTF8StringEncoding: return kCFStringEncodingUTF8;
260 case NSISOLatin1StringEncoding: return kCFStringEncodingISOLatin1;
261 case NSNonLossyASCIIStringEncoding: return kCFStringEncodingNonLossyASCII;
262 case NSUnicodeStringEncoding: return kCFStringEncodingUTF16;
263 case NSWindowsCP1252StringEncoding: return kCFStringEncodingWindowsLatin1;
264 case NSMacOSRomanStringEncoding: return kCFStringEncodingMacRoman;
265 #if defined(__MACH__)
266 case NSSymbolStringEncoding: return kCFStringEncodingMacSymbol;
267 case NSJapaneseEUCStringEncoding: return kCFStringEncodingEUC_JP;
268 case NSShiftJISStringEncoding: return kCFStringEncodingDOSJapanese;
269 case NSISO2022JPStringEncoding: return kCFStringEncodingISO_2022_JP;
270 case NSISOLatin2StringEncoding: return kCFStringEncodingISOLatin2;
271 case NSWindowsCP1251StringEncoding: return kCFStringEncodingWindowsCyrillic;
272 case NSWindowsCP1253StringEncoding: return kCFStringEncodingWindowsGreek;
273 case NSWindowsCP1254StringEncoding: return kCFStringEncodingWindowsLatin5;
274 case NSWindowsCP1250StringEncoding: return kCFStringEncodingWindowsLatin2;
275 #endif // __MACH__
276 default:
277 return ((theEncoding & NSENCODING_MASK) ? theEncoding & ~NSENCODING_MASK : kCFStringEncodingInvalidId);
278 }
279 }
280
281 #define MACCODEPAGE_BASE (10000)
282 #define ISO8859CODEPAGE_BASE (28590)
283
284 static const uint16_t _CFToDOSCodePageList[] = {
285 437, -1, -1, -1, -1, 737, 775, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x400
286 850, 851, 852, 855, 857, 860, 861, 862, 863, 864, 865, 866, 869, 874, -1, 01, // 0x410
287 932, 936, 949 , 950, // 0x420
288 };
289
290 static const uint16_t _CFToWindowsCodePageList[] = {
291 1252, 1250, 1251, 1253, 1254, 1255, 1256, 1257, 1258,
292 };
293
294 static const uint16_t _CFEUCToCodePage[] = { // 0x900
295 51932, 51936, 51950, 51949,
296 };
297
298 UInt32 CFStringConvertEncodingToWindowsCodepage(CFStringEncoding theEncoding) {
299 #if defined(__MACH__)
300 CFStringEncoding encodingBase = theEncoding & 0x0FFF;
301 #endif
302
303 switch (theEncoding & 0x0F00) {
304 #if defined(__MACH__)
305 case 0: // Mac OS script
306 if (encodingBase <= kCFStringEncodingMacCentralEurRoman) {
307 return MACCODEPAGE_BASE + encodingBase;
308 } else if (encodingBase == kCFStringEncodingMacTurkish) {
309 return 10081;
310 } else if (encodingBase == kCFStringEncodingMacCroatian) {
311 return 10082;
312 } else if (encodingBase == kCFStringEncodingMacIcelandic) {
313 return 10079;
314 }
315 break;
316 #endif
317
318 case 0x100: // Unicode
319 switch (theEncoding) {
320 case kCFStringEncodingUTF8: return 65001;
321 case kCFStringEncodingUTF16: return 1200;
322 case kCFStringEncodingUTF16BE: return 1201;
323 case kCFStringEncodingUTF32: return 65005;
324 case kCFStringEncodingUTF32BE: return 65006;
325 }
326 break;
327
328 #if defined(__MACH__)
329 case 0x0200: // ISO 8859 series
330 if (encodingBase <= kCFStringEncodingISOLatin10) return ISO8859CODEPAGE_BASE + (encodingBase - 0x200);
331 break;
332
333 case 0x0400: // DOS codepage
334 if (encodingBase <= kCFStringEncodingDOSChineseTrad) return _CFToDOSCodePageList[encodingBase - 0x400];
335 break;
336
337 case 0x0500: // ANSI (Windows) codepage
338 if (encodingBase <= kCFStringEncodingWindowsVietnamese) return _CFToWindowsCodePageList[theEncoding - 0x500];
339 else if (encodingBase == kCFStringEncodingWindowsKoreanJohab) return 1361;
340 break;
341
342 case 0x600: // National standards
343 if (encodingBase == kCFStringEncodingASCII) return 20127;
344 else if (encodingBase == kCFStringEncodingGB_18030_2000) return 54936;
345 break;
346
347 case 0x0800: // ISO 2022 series
348 switch (encodingBase) {
349 case kCFStringEncodingISO_2022_JP: return 50220;
350 case kCFStringEncodingISO_2022_CN: return 50227;
351 case kCFStringEncodingISO_2022_KR: return 50225;
352 }
353 break;
354
355 case 0x0900: // EUC series
356 if (encodingBase <= kCFStringEncodingEUC_KR) return _CFEUCToCodePage[encodingBase - 0x0900];
357 break;
358
359
360 case 0x0A00: // Misc encodings
361 switch (encodingBase) {
362 case kCFStringEncodingKOI8_R: return 20866;
363 case kCFStringEncodingHZ_GB_2312: return 52936;
364 case kCFStringEncodingKOI8_U: return 21866;
365 }
366 break;
367
368 case 0x0C00: // IBM EBCDIC encodings
369 if (encodingBase == kCFStringEncodingEBCDIC_CP037) return 37;
370 break;
371 #endif // __MACH__
372 }
373
374 return kCFStringEncodingInvalidId;
375 }
376
377 #if defined(__MACH__)
378 static const struct {
379 uint16_t acp;
380 uint16_t encoding;
381 } _CFACPToCFTable[] = {
382 {37, kCFStringEncodingEBCDIC_CP037},
383 {437, kCFStringEncodingDOSLatinUS},
384 {737, kCFStringEncodingDOSGreek},
385 {775, kCFStringEncodingDOSBalticRim},
386 {850, kCFStringEncodingDOSLatin1},
387 {851, kCFStringEncodingDOSGreek1},
388 {852, kCFStringEncodingDOSLatin2},
389 {855, kCFStringEncodingDOSCyrillic},
390 {857, kCFStringEncodingDOSTurkish},
391 {860, kCFStringEncodingDOSPortuguese},
392 {861, kCFStringEncodingDOSIcelandic},
393 {862, kCFStringEncodingDOSHebrew},
394 {863, kCFStringEncodingDOSCanadianFrench},
395 {864, kCFStringEncodingDOSArabic},
396 {865, kCFStringEncodingDOSNordic},
397 {866, kCFStringEncodingDOSRussian},
398 {869, kCFStringEncodingDOSGreek2},
399 {874, kCFStringEncodingDOSThai},
400 {932, kCFStringEncodingDOSJapanese},
401 {936, kCFStringEncodingDOSChineseSimplif},
402 {949, kCFStringEncodingDOSKorean},
403 {950, kCFStringEncodingDOSChineseTrad},
404 {1250, kCFStringEncodingWindowsLatin2},
405 {1251, kCFStringEncodingWindowsCyrillic},
406 {1252, kCFStringEncodingWindowsLatin1},
407 {1253, kCFStringEncodingWindowsGreek},
408 {1254, kCFStringEncodingWindowsLatin5},
409 {1255, kCFStringEncodingWindowsHebrew},
410 {1256, kCFStringEncodingWindowsArabic},
411 {1257, kCFStringEncodingWindowsBalticRim},
412 {1258, kCFStringEncodingWindowsVietnamese},
413 {1361, kCFStringEncodingWindowsKoreanJohab},
414 {20127, kCFStringEncodingASCII},
415 {20866, kCFStringEncodingKOI8_R},
416 {21866, kCFStringEncodingKOI8_U},
417 {50220, kCFStringEncodingISO_2022_JP},
418 {50225, kCFStringEncodingISO_2022_KR},
419 {50227, kCFStringEncodingISO_2022_CN},
420 {51932, kCFStringEncodingEUC_JP},
421 {51936, kCFStringEncodingEUC_CN},
422 {51949, kCFStringEncodingEUC_KR},
423 {51950, kCFStringEncodingEUC_TW},
424 {52936, kCFStringEncodingHZ_GB_2312},
425 {54936, kCFStringEncodingGB_18030_2000},
426 };
427
428 static SInt32 bsearchEncoding(uint16_t target) {
429 const unsigned int *start, *end, *divider;
430 unsigned int size = sizeof(_CFACPToCFTable) / sizeof(UInt32);
431
432 start = (const unsigned int*)_CFACPToCFTable; end = (const unsigned int*)_CFACPToCFTable + (size - 1);
433 while (start <= end) {
434 divider = start + ((end - start) / 2);
435
436 if (*(const uint16_t*)divider == target) return *((const uint16_t*)divider + 1);
437 else if (*(const uint16_t*)divider > target) end = divider - 1;
438 else if (*(const uint16_t*)(divider + 1) > target) return *((const uint16_t*)divider + 1);
439 else start = divider + 1;
440 }
441 return (kCFStringEncodingInvalidId);
442 }
443 #endif
444
445 CFStringEncoding CFStringConvertWindowsCodepageToEncoding(UInt32 theEncoding) {
446 if (theEncoding == 0 || theEncoding == 1) { // ID for default (system) codepage
447 return CFStringGetSystemEncoding();
448 } else if ((theEncoding >= MACCODEPAGE_BASE) && (theEncoding < 20000)) { // Mac script
449 if (theEncoding <= 10029) return theEncoding - MACCODEPAGE_BASE; // up to Mac Central European
450 #if defined(__MACH__)
451 else if (theEncoding == 10079) return kCFStringEncodingMacIcelandic;
452 else if (theEncoding == 10081) return kCFStringEncodingMacTurkish;
453 else if (theEncoding == 10082) return kCFStringEncodingMacCroatian;
454 #endif
455 } else if ((theEncoding >= ISO8859CODEPAGE_BASE) && (theEncoding <= 28605)) { // ISO 8859
456 return (theEncoding - ISO8859CODEPAGE_BASE) + 0x200;
457 } else if (theEncoding == 65001) { // UTF-8
458 return kCFStringEncodingUTF8;
459 } else if (theEncoding == 12000) { // UTF-16
460 return kCFStringEncodingUTF16;
461 } else if (theEncoding == 12001) { // UTF-16BE
462 return kCFStringEncodingUTF16BE;
463 } else if (theEncoding == 65005) { // UTF-32
464 return kCFStringEncodingUTF32;
465 } else if (theEncoding == 65006) { // UTF-32BE
466 return kCFStringEncodingUTF32BE;
467 } else {
468 #if defined(__MACH__)
469 return bsearchEncoding(theEncoding);
470 #endif
471 }
472
473 return kCFStringEncodingInvalidId;
474 }
475
476 CFStringEncoding CFStringGetMostCompatibleMacStringEncoding(CFStringEncoding encoding) {
477 CFStringEncoding macEncoding;
478
479 macEncoding = CFStringEncodingGetScriptCodeForEncoding(encoding);
480
481 return macEncoding;
482 }
483
484