]> git.saurik.com Git - apple/cf.git/blob - StringEncodings.subproj/CFBuiltinConverters.c
CF-299.35.tar.gz
[apple/cf.git] / StringEncodings.subproj / CFBuiltinConverters.c
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /* CFBuiltinConverters.c
26 Copyright 1999-2002, Apple, Inc. All rights reserved.
27 Responsibility: Aki Inoue
28 */
29
30 #include "CFStringEncodingConverterExt.h"
31 #include "CFUniChar.h"
32 #include "CFUnicodeDecomposition.h"
33 #include "CFUnicodePrecomposition.h"
34 #include "CFStringEncodingConverterPriv.h"
35 #include "CFInternal.h"
36
/* U+2029 PARAGRAPH SEPARATOR; single-byte encoders in this file emit ASCIINewLine for it. */
#define ParagraphSeparator 0x2029
/* Line feed byte written in place of ParagraphSeparator. */
#define ASCIINewLine 0x0A
39
40 /* Precomposition */
41 static const UInt32 __CFLatin1CombiningCharBitmap[] = { // 0x300 ~ 0x35FF
42 0xFBB94010, 0x01800000, 0x0000000,
43 };
44
45 Boolean CFStringEncodingIsValidCombiningCharacterForLatin1(UniChar character) {
46 return ((character >= 0x300) && (character < 0x360) && (__CFLatin1CombiningCharBitmap[(character - 0x300) / 32] & (1 << (31 - ((character - 0x300) % 32)))) ? true : false);
47 }
48
49 UniChar CFStringEncodingPrecomposeLatinCharacter(const UniChar *character, UInt32 numChars, UInt32 *usedChars) {
50 if (numChars > 0) {
51 UTF32Char ch = *(character++), nextCh, composedChar;
52 UInt32 usedCharLen = 1;
53
54 if (CFUniCharIsSurrogateHighCharacter(ch) || CFUniCharIsSurrogateLowCharacter(ch)) {
55 if (usedChars) (*usedChars) = usedCharLen;
56 return ch;
57 }
58
59 while (usedCharLen < numChars) {
60 nextCh = *(character++);
61
62 if (CFUniCharIsSurrogateHighCharacter(nextCh) || CFUniCharIsSurrogateLowCharacter(nextCh)) break;
63
64 if (CFUniCharIsMemberOf(nextCh, kCFUniCharNonBaseCharacterSet) && ((composedChar = CFUniCharPrecomposeCharacter(ch, nextCh)) != 0xFFFD)) {
65 if (composedChar > 0xFFFF) { // Non-base
66 break;
67 } else {
68 ch = composedChar;
69 }
70 } else {
71 break;
72 }
73 ++usedCharLen;
74 }
75 if (usedChars) (*usedChars) = usedCharLen;
76 return ch;
77 }
78 return 0xFFFD;
79 }
80
81 /* ASCII */
82 static Boolean __CFToASCII(UInt32 flags, UniChar character, uint8_t *byte) {
83 if (character < 0x80) {
84 *byte = (uint8_t)character;
85 } else if (character == ParagraphSeparator) {
86 *byte = ASCIINewLine;
87 } else {
88 return false;
89 }
90 return true;
91 }
92
93 static Boolean __CFFromASCII(UInt32 flags, uint8_t byte, UniChar *character) {
94 if (byte < 0x80) {
95 *character = (UniChar)byte;
96 return true;
97 } else {
98 return false;
99 }
100 }
101
102
103 __private_extern__ CFStringEncodingConverter __CFConverterASCII = {
104 __CFToASCII, __CFFromASCII, 1, 1, kCFStringEncodingConverterCheapEightBit,
105 NULL, NULL, NULL, NULL, NULL, NULL,
106 };
107
108 /* ISO Latin 1 (8859-1) */
109 static Boolean __CFToISOLatin1(UInt32 flags, UniChar character, uint8_t *byte) {
110 if (character <= 0xFF) {
111 *byte = (uint8_t)character;
112 } else if (character == ParagraphSeparator) {
113 *byte = ASCIINewLine;
114 } else {
115 return false;
116 }
117
118 return true;
119 }
120
121 static Boolean __CFFromISOLatin1(UInt32 flags, uint8_t byte, UniChar *character) {
122 *character = (UniChar)byte;
123 return true;
124 }
125
126 static UInt32 __CFToISOLatin1Precompose(UInt32 flags, const UniChar *character, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) {
127 uint8_t byte;
128 UInt32 usedCharLen;
129
130 if (__CFToISOLatin1(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
131 if (maxByteLen) *bytes = byte;
132 *usedByteLen = 1;
133 return usedCharLen;
134 } else {
135 return 0;
136 }
137 }
138
139 __private_extern__ CFStringEncodingConverter __CFConverterISOLatin1 = {
140 __CFToISOLatin1, __CFFromISOLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit,
141 NULL, NULL, NULL, NULL, __CFToISOLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
142 };
143
144 /* Mac Roman */
145 #define NUM_MACROMAN_FROM_UNI 129
146 static const CFStringEncodingUnicodeTo8BitCharMap macRoman_from_uni[NUM_MACROMAN_FROM_UNI] = {
147 { 0x00A0, 0xCA }, /* NO-BREAK SPACE */
148 { 0x00A1, 0xC1 }, /* INVERTED EXCLAMATION MARK */
149 { 0x00A2, 0xA2 }, /* CENT SIGN */
150 { 0x00A3, 0xA3 }, /* POUND SIGN */
151 { 0x00A5, 0xB4 }, /* YEN SIGN */
152 { 0x00A7, 0xA4 }, /* SECTION SIGN */
153 { 0x00A8, 0xAC }, /* DIAERESIS */
154 { 0x00A9, 0xA9 }, /* COPYRIGHT SIGN */
155 { 0x00AA, 0xBB }, /* FEMININE ORDINAL INDICATOR */
156 { 0x00AB, 0xC7 }, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
157 { 0x00AC, 0xC2 }, /* NOT SIGN */
158 { 0x00AE, 0xA8 }, /* REGISTERED SIGN */
159 { 0x00AF, 0xF8 }, /* MACRON */
160 { 0x00B0, 0xA1 }, /* DEGREE SIGN */
161 { 0x00B1, 0xB1 }, /* PLUS-MINUS SIGN */
162 { 0x00B4, 0xAB }, /* ACUTE ACCENT */
163 { 0x00B5, 0xB5 }, /* MICRO SIGN */
164 { 0x00B6, 0xA6 }, /* PILCROW SIGN */
165 { 0x00B7, 0xE1 }, /* MIDDLE DOT */
166 { 0x00B8, 0xFC }, /* CEDILLA */
167 { 0x00BA, 0xBC }, /* MASCULINE ORDINAL INDICATOR */
168 { 0x00BB, 0xC8 }, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
169 { 0x00BF, 0xC0 }, /* INVERTED QUESTION MARK */
170 { 0x00C0, 0xCB }, /* LATIN CAPITAL LETTER A WITH GRAVE */
171 { 0x00C1, 0xE7 }, /* LATIN CAPITAL LETTER A WITH ACUTE */
172 { 0x00C2, 0xE5 }, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
173 { 0x00C3, 0xCC }, /* LATIN CAPITAL LETTER A WITH TILDE */
174 { 0x00C4, 0x80 }, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
175 { 0x00C5, 0x81 }, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
176 { 0x00C6, 0xAE }, /* LATIN CAPITAL LIGATURE AE */
177 { 0x00C7, 0x82 }, /* LATIN CAPITAL LETTER C WITH CEDILLA */
178 { 0x00C8, 0xE9 }, /* LATIN CAPITAL LETTER E WITH GRAVE */
179 { 0x00C9, 0x83 }, /* LATIN CAPITAL LETTER E WITH ACUTE */
180 { 0x00CA, 0xE6 }, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
181 { 0x00CB, 0xE8 }, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
182 { 0x00CC, 0xED }, /* LATIN CAPITAL LETTER I WITH GRAVE */
183 { 0x00CD, 0xEA }, /* LATIN CAPITAL LETTER I WITH ACUTE */
184 { 0x00CE, 0xEB }, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
185 { 0x00CF, 0xEC }, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
186 { 0x00D1, 0x84 }, /* LATIN CAPITAL LETTER N WITH TILDE */
187 { 0x00D2, 0xF1 }, /* LATIN CAPITAL LETTER O WITH GRAVE */
188 { 0x00D3, 0xEE }, /* LATIN CAPITAL LETTER O WITH ACUTE */
189 { 0x00D4, 0xEF }, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
190 { 0x00D5, 0xCD }, /* LATIN CAPITAL LETTER O WITH TILDE */
191 { 0x00D6, 0x85 }, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
192 { 0x00D8, 0xAF }, /* LATIN CAPITAL LETTER O WITH STROKE */
193 { 0x00D9, 0xF4 }, /* LATIN CAPITAL LETTER U WITH GRAVE */
194 { 0x00DA, 0xF2 }, /* LATIN CAPITAL LETTER U WITH ACUTE */
195 { 0x00DB, 0xF3 }, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
196 { 0x00DC, 0x86 }, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
197 { 0x00DF, 0xA7 }, /* LATIN SMALL LETTER SHARP S */
198 { 0x00E0, 0x88 }, /* LATIN SMALL LETTER A WITH GRAVE */
199 { 0x00E1, 0x87 }, /* LATIN SMALL LETTER A WITH ACUTE */
200 { 0x00E2, 0x89 }, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
201 { 0x00E3, 0x8B }, /* LATIN SMALL LETTER A WITH TILDE */
202 { 0x00E4, 0x8A }, /* LATIN SMALL LETTER A WITH DIAERESIS */
203 { 0x00E5, 0x8C }, /* LATIN SMALL LETTER A WITH RING ABOVE */
204 { 0x00E6, 0xBE }, /* LATIN SMALL LIGATURE AE */
205 { 0x00E7, 0x8D }, /* LATIN SMALL LETTER C WITH CEDILLA */
206 { 0x00E8, 0x8F }, /* LATIN SMALL LETTER E WITH GRAVE */
207 { 0x00E9, 0x8E }, /* LATIN SMALL LETTER E WITH ACUTE */
208 { 0x00EA, 0x90 }, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
209 { 0x00EB, 0x91 }, /* LATIN SMALL LETTER E WITH DIAERESIS */
210 { 0x00EC, 0x93 }, /* LATIN SMALL LETTER I WITH GRAVE */
211 { 0x00ED, 0x92 }, /* LATIN SMALL LETTER I WITH ACUTE */
212 { 0x00EE, 0x94 }, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
213 { 0x00EF, 0x95 }, /* LATIN SMALL LETTER I WITH DIAERESIS */
214 { 0x00F1, 0x96 }, /* LATIN SMALL LETTER N WITH TILDE */
215 { 0x00F2, 0x98 }, /* LATIN SMALL LETTER O WITH GRAVE */
216 { 0x00F3, 0x97 }, /* LATIN SMALL LETTER O WITH ACUTE */
217 { 0x00F4, 0x99 }, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
218 { 0x00F5, 0x9B }, /* LATIN SMALL LETTER O WITH TILDE */
219 { 0x00F6, 0x9A }, /* LATIN SMALL LETTER O WITH DIAERESIS */
220 { 0x00F7, 0xD6 }, /* DIVISION SIGN */
221 { 0x00F8, 0xBF }, /* LATIN SMALL LETTER O WITH STROKE */
222 { 0x00F9, 0x9D }, /* LATIN SMALL LETTER U WITH GRAVE */
223 { 0x00FA, 0x9C }, /* LATIN SMALL LETTER U WITH ACUTE */
224 { 0x00FB, 0x9E }, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
225 { 0x00FC, 0x9F }, /* LATIN SMALL LETTER U WITH DIAERESIS */
226 { 0x00FF, 0xD8 }, /* LATIN SMALL LETTER Y WITH DIAERESIS */
227 { 0x0131, 0xF5 }, /* LATIN SMALL LETTER DOTLESS I */
228 { 0x0152, 0xCE }, /* LATIN CAPITAL LIGATURE OE */
229 { 0x0153, 0xCF }, /* LATIN SMALL LIGATURE OE */
230 { 0x0178, 0xD9 }, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
231 { 0x0192, 0xC4 }, /* LATIN SMALL LETTER F WITH HOOK */
232 { 0x02C6, 0xF6 }, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
233 { 0x02C7, 0xFF }, /* CARON */
234 { 0x02D8, 0xF9 }, /* BREVE */
235 { 0x02D9, 0xFA }, /* DOT ABOVE */
236 { 0x02DA, 0xFB }, /* RING ABOVE */
237 { 0x02DB, 0xFE }, /* OGONEK */
238 { 0x02DC, 0xF7 }, /* SMALL TILDE */
239 { 0x02DD, 0xFD }, /* DOUBLE ACUTE ACCENT */
240 { 0x03A9, 0xBD }, /* OHM SIGN (Canonical ?) */
241 { 0x03C0, 0xB9 }, /* GREEK SMALL LETTER PI */
242 { 0x2013, 0xD0 }, /* EN DASH */
243 { 0x2014, 0xD1 }, /* EM DASH */
244 { 0x2018, 0xD4 }, /* LEFT SINGLE QUOTATION MARK */
245 { 0x2019, 0xD5 }, /* RIGHT SINGLE QUOTATION MARK */
246 { 0x201A, 0xE2 }, /* SINGLE LOW-9 QUOTATION MARK */
247 { 0x201C, 0xD2 }, /* LEFT DOUBLE QUOTATION MARK */
248 { 0x201D, 0xD3 }, /* RIGHT DOUBLE QUOTATION MARK */
249 { 0x201E, 0xE3 }, /* DOUBLE LOW-9 QUOTATION MARK */
250 { 0x2020, 0xA0 }, /* DAGGER */
251 { 0x2021, 0xE0 }, /* DOUBLE DAGGER */
252 { 0x2022, 0xA5 }, /* BULLET */
253 { 0x2026, 0xC9 }, /* HORIZONTAL ELLIPSIS */
254 { 0x2030, 0xE4 }, /* PER MILLE SIGN */
255 { 0x2039, 0xDC }, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
256 { 0x203A, 0xDD }, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
257 { 0x2044, 0xDA }, /* FRACTION SLASH */
258 { 0x20AC, 0xDB }, /* EURO SIGN */
259 { 0x2122, 0xAA }, /* TRADE MARK SIGN */
260 { 0x2126, 0xBD }, /* OHM SIGN */
261 { 0x2202, 0xB6 }, /* PARTIAL DIFFERENTIAL */
262 { 0x2206, 0xC6 }, /* INCREMENT */
263 { 0x220F, 0xB8 }, /* N-ARY PRODUCT */
264 { 0x2211, 0xB7 }, /* N-ARY SUMMATION */
265 { 0x221A, 0xC3 }, /* SQUARE ROOT */
266 { 0x221E, 0xB0 }, /* INFINITY */
267 { 0x222B, 0xBA }, /* INTEGRAL */
268 { 0x2248, 0xC5 }, /* ALMOST EQUAL TO */
269 { 0x2260, 0xAD }, /* NOT EQUAL TO */
270 { 0x2264, 0xB2 }, /* LESS-THAN OR EQUAL TO */
271 { 0x2265, 0xB3 }, /* GREATER-THAN OR EQUAL TO */
272 { 0x25CA, 0xD7 }, /* LOZENGE */
273 { 0xF8FF, 0xF0 }, /* Apple logo */
274 { 0xFB01, 0xDE }, /* LATIN SMALL LIGATURE FI */
275 { 0xFB02, 0xDF }, /* LATIN SMALL LIGATURE FL */
276 };
277
278 static Boolean __CFToMacRoman(UInt32 flags, UniChar character, uint8_t *byte) {
279 if (character < 0x80) {
280 *byte = (uint8_t)character;
281 return true;
282 } else {
283 return CFStringEncodingUnicodeTo8BitEncoding(macRoman_from_uni, NUM_MACROMAN_FROM_UNI, character, byte);
284 }
285 }
286
287 static const UniChar macRoman_to_uni[128] = {
288 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
289 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
290 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
291 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
292 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
293 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
294 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
295 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
296 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
297 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
298 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
299 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
300 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
301 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
302 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
303 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
304 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
305 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
306 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
307 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
308 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
309 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
310 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
311 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
312 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
313 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
314 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
315 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
316 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
317 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
318 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
319 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
320 0x2020, /* DAGGER */
321 0x00B0, /* DEGREE SIGN */
322 0x00A2, /* CENT SIGN */
323 0x00A3, /* POUND SIGN */
324 0x00A7, /* SECTION SIGN */
325 0x2022, /* BULLET */
326 0x00B6, /* PILCROW SIGN */
327 0x00DF, /* LATIN SMALL LETTER SHARP S */
328 0x00AE, /* REGISTERED SIGN */
329 0x00A9, /* COPYRIGHT SIGN */
330 0x2122, /* TRADE MARK SIGN */
331 0x00B4, /* ACUTE ACCENT */
332 0x00A8, /* DIAERESIS */
333 0x2260, /* NOT EQUAL TO */
334 0x00C6, /* LATIN CAPITAL LIGATURE AE */
335 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
336 0x221E, /* INFINITY */
337 0x00B1, /* PLUS-MINUS SIGN */
338 0x2264, /* LESS-THAN OR EQUAL TO */
339 0x2265, /* GREATER-THAN OR EQUAL TO */
340 0x00A5, /* YEN SIGN */
341 0x00B5, /* MICRO SIGN */
342 0x2202, /* PARTIAL DIFFERENTIAL */
343 0x2211, /* N-ARY SUMMATION */
344 0x220F, /* N-ARY PRODUCT */
345 0x03C0, /* GREEK SMALL LETTER PI */
346 0x222B, /* INTEGRAL */
347 0x00AA, /* FEMININE ORDINAL INDICATOR */
348 0x00BA, /* MASCULINE ORDINAL INDICATOR */
349 0x03A9, /* OHM SIGN (Canonical mapping) */
350 0x00E6, /* LATIN SMALL LIGATURE AE */
351 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
352 0x00BF, /* INVERTED QUESTION MARK */
353 0x00A1, /* INVERTED EXCLAMATION MARK */
354 0x00AC, /* NOT SIGN */
355 0x221A, /* SQUARE ROOT */
356 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
357 0x2248, /* ALMOST EQUAL TO */
358 0x2206, /* INCREMENT */
359 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
360 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
361 0x2026, /* HORIZONTAL ELLIPSIS */
362 0x00A0, /* NO-BREAK SPACE */
363 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
364 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
365 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
366 0x0152, /* LATIN CAPITAL LIGATURE OE */
367 0x0153, /* LATIN SMALL LIGATURE OE */
368 0x2013, /* EN DASH */
369 0x2014, /* EM DASH */
370 0x201C, /* LEFT DOUBLE QUOTATION MARK */
371 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
372 0x2018, /* LEFT SINGLE QUOTATION MARK */
373 0x2019, /* RIGHT SINGLE QUOTATION MARK */
374 0x00F7, /* DIVISION SIGN */
375 0x25CA, /* LOZENGE */
376 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
377 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
378 0x2044, /* FRACTION SLASH */
379 0x20AC, /* EURO SIGN */
380 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
381 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
382 0xFB01, /* LATIN SMALL LIGATURE FI */
383 0xFB02, /* LATIN SMALL LIGATURE FL */
384 0x2021, /* DOUBLE DAGGER */
385 0x00B7, /* MIDDLE DOT */
386 0x201A, /* SINGLE LOW-9 QUOTATION MARK */
387 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
388 0x2030, /* PER MILLE SIGN */
389 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
390 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
391 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
392 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
393 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
394 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
395 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
396 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
397 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
398 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
399 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
400 0xF8FF, /* Apple logo */
401 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
402 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
403 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
404 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
405 0x0131, /* LATIN SMALL LETTER DOTLESS I */
406 0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
407 0x02DC, /* SMALL TILDE */
408 0x00AF, /* MACRON */
409 0x02D8, /* BREVE */
410 0x02D9, /* DOT ABOVE */
411 0x02DA, /* RING ABOVE */
412 0x00B8, /* CEDILLA */
413 0x02DD, /* DOUBLE ACUTE ACCENT */
414 0x02DB, /* OGONEK */
415 0x02C7, /* CARON */
416 };
417
418 static Boolean __CFFromMacRoman(UInt32 flags, uint8_t byte, UniChar *character) {
419 *character = (byte < 0x80 ? (UniChar)byte : macRoman_to_uni[byte - 0x80]);
420 return true;
421 }
422
423 static UInt32 __CFToMacRomanPrecompose(UInt32 flags, const UniChar *character, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) {
424 uint8_t byte;
425 UInt32 usedCharLen;
426
427 if (__CFToMacRoman(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
428 if (maxByteLen) *bytes = byte;
429 *usedByteLen = 1;
430 return usedCharLen;
431 } else {
432 return 0;
433 }
434 }
435
436 __private_extern__ CFStringEncodingConverter __CFConverterMacRoman = {
437 __CFToMacRoman, __CFFromMacRoman, 1, 1, kCFStringEncodingConverterCheapEightBit,
438 NULL, NULL, NULL, NULL, __CFToMacRomanPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
439 };
440
441 /* Win Latin1 (ANSI CodePage 1252) */
442 #define NUM_1252_FROM_UNI 27
443 static const CFStringEncodingUnicodeTo8BitCharMap cp1252_from_uni[NUM_1252_FROM_UNI] = {
444 {0x0152, 0x8C}, // LATIN CAPITAL LIGATURE OE
445 {0x0153, 0x9C}, // LATIN SMALL LIGATURE OE
446 {0x0160, 0x8A}, // LATIN CAPITAL LETTER S WITH CARON
447 {0x0161, 0x9A}, // LATIN SMALL LETTER S WITH CARON
448 {0x0178, 0x9F}, // LATIN CAPITAL LETTER Y WITH DIAERESIS
449 {0x017D, 0x8E}, // LATIN CAPITAL LETTER Z WITH CARON
450 {0x017E, 0x9E}, // LATIN SMALL LETTER Z WITH CARON
451 {0x0192, 0x83}, // LATIN SMALL LETTER F WITH HOOK
452 {0x02C6, 0x88}, // MODIFIER LETTER CIRCUMFLEX ACCENT
453 {0x02DC, 0x98}, // SMALL TILDE
454 {0x2013, 0x96}, // EN DASH
455 {0x2014, 0x97}, // EM DASH
456 {0x2018, 0x91}, // LEFT SINGLE QUOTATION MARK
457 {0x2019, 0x92}, // RIGHT SINGLE QUOTATION MARK
458 {0x201A, 0x82}, // SINGLE LOW-9 QUOTATION MARK
459 {0x201C, 0x93}, // LEFT DOUBLE QUOTATION MARK
460 {0x201D, 0x94}, // RIGHT DOUBLE QUOTATION MARK
461 {0x201E, 0x84}, // DOUBLE LOW-9 QUOTATION MARK
462 {0x2020, 0x86}, // DAGGER
463 {0x2021, 0x87}, // DOUBLE DAGGER
464 {0x2022, 0x95}, // BULLET
465 {0x2026, 0x85}, // HORIZONTAL ELLIPSIS
466 {0x2030, 0x89}, // PER MILLE SIGN
467 {0x2039, 0x8B}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
468 {0x203A, 0x9B}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
469 {0x20AC, 0x80}, // EURO SIGN
470 {0x2122, 0x99}, // TRADE MARK SIGN
471 };
472
473 static Boolean __CFToWinLatin1(UInt32 flags, UniChar character, uint8_t *byte) {
474 if ((character < 0x80) || ((character > 0x9F) && (character <= 0x00FF))) {
475 *byte = (uint8_t)character;
476 return true;
477 }
478 return CFStringEncodingUnicodeTo8BitEncoding(cp1252_from_uni, NUM_1252_FROM_UNI, character, byte);
479 }
480
/*
 * Windows code page 1252 -> Unicode for bytes 0x80..0x9F; the decoder in this
 * file indexes the table with (byte - 0x80). Unassigned bytes hold 0xFFFD.
 */
static const unsigned short cp1252_to_uni[32] = {
    /* 0x80 */
    0x20AC, /* EURO SIGN */
    0xFFFD, /* NOT USED */
    0x201A, /* SINGLE LOW-9 QUOTATION MARK */
    0x0192, /* LATIN SMALL LETTER F WITH HOOK */
    0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
    0x2026, /* HORIZONTAL ELLIPSIS */
    0x2020, /* DAGGER */
    0x2021, /* DOUBLE DAGGER */
    0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
    0x2030, /* PER MILLE SIGN */
    0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
    0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
    0x0152, /* LATIN CAPITAL LIGATURE OE */
    0xFFFD, /* NOT USED */
    0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
    0xFFFD, /* NOT USED */
    /* 0x90 */
    0xFFFD, /* NOT USED */
    0x2018, /* LEFT SINGLE QUOTATION MARK */
    0x2019, /* RIGHT SINGLE QUOTATION MARK */
    0x201C, /* LEFT DOUBLE QUOTATION MARK */
    0x201D, /* RIGHT DOUBLE QUOTATION MARK */
    0x2022, /* BULLET */
    0x2013, /* EN DASH */
    0x2014, /* EM DASH */
    0x02DC, /* SMALL TILDE */
    0x2122, /* TRADE MARK SIGN */
    0x0161, /* LATIN SMALL LETTER S WITH CARON */
    0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
    0x0153, /* LATIN SMALL LIGATURE OE */
    0xFFFD, /* NOT USED */
    0x017E, /* LATIN SMALL LETTER Z WITH CARON */
    0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
};
515
516 static Boolean __CFFromWinLatin1(UInt32 flags, uint8_t byte, UniChar *character) {
517 *character = (byte < 0x80 || byte > 0x9F ? (UniChar)byte : cp1252_to_uni[byte - 0x80]);
518 return (*character != 0xFFFD);
519 }
520
521 static UInt32 __CFToWinLatin1Precompose(UInt32 flags, const UniChar *character, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) {
522 uint8_t byte;
523 UInt32 usedCharLen;
524
525 if (__CFToWinLatin1(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
526 if (maxByteLen) *bytes = byte;
527 *usedByteLen = 1;
528 return usedCharLen;
529 } else {
530 return 0;
531 }
532 }
533
534 __private_extern__ CFStringEncodingConverter __CFConverterWinLatin1 = {
535 __CFToWinLatin1, __CFFromWinLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit,
536 NULL, NULL, NULL, NULL, __CFToWinLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
537 };
538
539 /* NEXTSTEP Encoding */
540 #define NUM_NEXTSTEP_FROM_UNI 128
541
542 static const CFStringEncodingUnicodeTo8BitCharMap nextstep_from_tab[NUM_NEXTSTEP_FROM_UNI] = {
543 { 0x00a0, 0x80 },
544 { 0x00a1, 0xa1 },
545 { 0x00a2, 0xa2 },
546 { 0x00a3, 0xa3 },
547 { 0x00a4, 0xa8 },
548 { 0x00a5, 0xa5 },
549 { 0x00a6, 0xb5 },
550 { 0x00a7, 0xa7 },
551 { 0x00a8, 0xc8 },
552 { 0x00a9, 0xa0 },
553 { 0x00aa, 0xe3 },
554 { 0x00ab, 0xab },
555 { 0x00ac, 0xbe },
556 /* { 0x00ad, 0x2d }, <= 96/10/25 rick removed; converts soft-hyphen to hyphen! */
557 { 0x00ae, 0xb0 },
558 { 0x00af, 0xc5 },
559 { 0x00b1, 0xd1 },
560 { 0x00b2, 0xc9 },
561 { 0x00b3, 0xcc },
562 { 0x00b4, 0xc2 },
563 { 0x00b5, 0x9d },
564 { 0x00b6, 0xb6 },
565 { 0x00b7, 0xb4 },
566 { 0x00b8, 0xcb },
567 { 0x00b9, 0xc0 },
568 { 0x00ba, 0xeb },
569 { 0x00bb, 0xbb },
570 { 0x00bc, 0xd2 },
571 { 0x00bd, 0xd3 },
572 { 0x00be, 0xd4 },
573 { 0x00bf, 0xbf },
574 { 0x00c0, 0x81 },
575 { 0x00c1, 0x82 },
576 { 0x00c2, 0x83 },
577 { 0x00c3, 0x84 },
578 { 0x00c4, 0x85 },
579 { 0x00c5, 0x86 },
580 { 0x00c6, 0xe1 },
581 { 0x00c7, 0x87 },
582 { 0x00c8, 0x88 },
583 { 0x00c9, 0x89 },
584 { 0x00ca, 0x8a },
585 { 0x00cb, 0x8b },
586 { 0x00cc, 0x8c },
587 { 0x00cd, 0x8d },
588 { 0x00ce, 0x8e },
589 { 0x00cf, 0x8f },
590 { 0x00d0, 0x90 },
591 { 0x00d1, 0x91 },
592 { 0x00d2, 0x92 },
593 { 0x00d3, 0x93 },
594 { 0x00d4, 0x94 },
595 { 0x00d5, 0x95 },
596 { 0x00d6, 0x96 },
597 { 0x00d7, 0x9e },
598 { 0x00d8, 0xe9 },
599 { 0x00d9, 0x97 },
600 { 0x00da, 0x98 },
601 { 0x00db, 0x99 },
602 { 0x00dc, 0x9a },
603 { 0x00dd, 0x9b },
604 { 0x00de, 0x9c },
605 { 0x00df, 0xfb },
606 { 0x00e0, 0xd5 },
607 { 0x00e1, 0xd6 },
608 { 0x00e2, 0xd7 },
609 { 0x00e3, 0xd8 },
610 { 0x00e4, 0xd9 },
611 { 0x00e5, 0xda },
612 { 0x00e6, 0xf1 },
613 { 0x00e7, 0xdb },
614 { 0x00e8, 0xdc },
615 { 0x00e9, 0xdd },
616 { 0x00ea, 0xde },
617 { 0x00eb, 0xdf },
618 { 0x00ec, 0xe0 },
619 { 0x00ed, 0xe2 },
620 { 0x00ee, 0xe4 },
621 { 0x00ef, 0xe5 },
622 { 0x00f0, 0xe6 },
623 { 0x00f1, 0xe7 },
624 { 0x00f2, 0xec },
625 { 0x00f3, 0xed },
626 { 0x00f4, 0xee },
627 { 0x00f5, 0xef },
628 { 0x00f6, 0xf0 },
629 { 0x00f7, 0x9f },
630 { 0x00f8, 0xf9 },
631 { 0x00f9, 0xf2 },
632 { 0x00fa, 0xf3 },
633 { 0x00fb, 0xf4 },
634 { 0x00fc, 0xf6 },
635 { 0x00fd, 0xf7 },
636 { 0x00fe, 0xfc },
637 { 0x00ff, 0xfd },
638 { 0x0131, 0xf5 },
639 { 0x0141, 0xe8 },
640 { 0x0142, 0xf8 },
641 { 0x0152, 0xea },
642 { 0x0153, 0xfa },
643 { 0x0192, 0xa6 },
644 { 0x02c6, 0xc3 },
645 { 0x02c7, 0xcf },
646 { 0x02cb, 0xc1 },
647 { 0x02d8, 0xc6 },
648 { 0x02d9, 0xc7 },
649 { 0x02da, 0xca },
650 { 0x02db, 0xce },
651 { 0x02dc, 0xc4 },
652 { 0x02dd, 0xcd },
653 { 0x2013, 0xb1 },
654 { 0x2014, 0xd0 },
655 { 0x2019, 0xa9 },
656 { 0x201a, 0xb8 },
657 { 0x201c, 0xaa },
658 { 0x201d, 0xba },
659 { 0x201e, 0xb9 },
660 { 0x2020, 0xb2 },
661 { 0x2021, 0xb3 },
662 { 0x2022, 0xb7 },
663 { 0x2026, 0xbc },
664 { 0x2029, 0x0a }, /* ParagraphSeparator -> ASCIINewLine */
665 { 0x2030, 0xbd },
666 { 0x2039, 0xac },
667 { 0x203a, 0xad },
668 { 0x2044, 0xa4 },
669 { 0xfb01, 0xae },
670 { 0xfb02, 0xaf },
671 { 0xfffd, 0xff },
672 };
673
674 static Boolean __CFToNextStepLatin(UInt32 flags, UniChar character, uint8_t *byte) {
675 if (character < 0x80) {
676 *byte = (uint8_t)character;
677 return true;
678 } else {
679 return CFStringEncodingUnicodeTo8BitEncoding(nextstep_from_tab, NUM_NEXTSTEP_FROM_UNI, character, byte);
680 }
681 };
682
683 static const UniChar NSToPrecompUnicodeTable[128] = {
684 /* NextStep Encoding Unicode */
685 /* 128 figspace */ 0x00a0, /* 0x2007 is fig space */
686 /* 129 Agrave */ 0x00c0,
687 /* 130 Aacute */ 0x00c1,
688 /* 131 Acircumflex */ 0x00c2,
689 /* 132 Atilde */ 0x00c3,
690 /* 133 Adieresis */ 0x00c4,
691 /* 134 Aring */ 0x00c5,
692 /* 135 Ccedilla */ 0x00c7,
693 /* 136 Egrave */ 0x00c8,
694 /* 137 Eacute */ 0x00c9,
695 /* 138 Ecircumflex */ 0x00ca,
696 /* 139 Edieresis */ 0x00cb,
697 /* 140 Igrave */ 0x00cc,
698 /* 141 Iacute */ 0x00cd,
699 /* 142 Icircumflex */ 0x00ce,
700 /* 143 Idieresis */ 0x00cf,
701 /* 144 Eth */ 0x00d0,
702 /* 145 Ntilde */ 0x00d1,
703 /* 146 Ograve */ 0x00d2,
704 /* 147 Oacute */ 0x00d3,
705 /* 148 Ocircumflex */ 0x00d4,
706 /* 149 Otilde */ 0x00d5,
707 /* 150 Odieresis */ 0x00d6,
708 /* 151 Ugrave */ 0x00d9,
709 /* 152 Uacute */ 0x00da,
710 /* 153 Ucircumflex */ 0x00db,
711 /* 154 Udieresis */ 0x00dc,
712 /* 155 Yacute */ 0x00dd,
713 /* 156 Thorn */ 0x00de,
714 /* 157 mu */ 0x00b5,
715 /* 158 multiply */ 0x00d7,
716 /* 159 divide */ 0x00f7,
717 /* 160 copyright */ 0x00a9,
718 /* 161 exclamdown */ 0x00a1,
719 /* 162 cent */ 0x00a2,
720 /* 163 sterling */ 0x00a3,
721 /* 164 fraction */ 0x2044,
722 /* 165 yen */ 0x00a5,
723 /* 166 florin */ 0x0192,
724 /* 167 section */ 0x00a7,
725 /* 168 currency */ 0x00a4,
726 /* 169 quotesingle */ 0x2019,
727 /* 170 quotedblleft */ 0x201c,
728 /* 171 guillemotleft */ 0x00ab,
729 /* 172 guilsinglleft */ 0x2039,
730 /* 173 guilsinglright */ 0x203a,
731 /* 174 fi */ 0xFB01,
732 /* 175 fl */ 0xFB02,
733 /* 176 registered */ 0x00ae,
734 /* 177 endash */ 0x2013,
735 /* 178 dagger */ 0x2020,
736 /* 179 daggerdbl */ 0x2021,
737 /* 180 periodcentered */ 0x00b7,
738 /* 181 brokenbar */ 0x00a6,
739 /* 182 paragraph */ 0x00b6,
740 /* 183 bullet */ 0x2022,
741 /* 184 quotesinglbase */ 0x201a,
742 /* 185 quotedblbase */ 0x201e,
743 /* 186 quotedblright */ 0x201d,
744 /* 187 guillemotright */ 0x00bb,
745 /* 188 ellipsis */ 0x2026,
746 /* 189 perthousand */ 0x2030,
747 /* 190 logicalnot */ 0x00ac,
748 /* 191 questiondown */ 0x00bf,
749 /* 192 onesuperior */ 0x00b9,
750 /* 193 grave */ 0x02cb,
751 /* 194 acute */ 0x00b4,
752 /* 195 circumflex */ 0x02c6,
753 /* 196 tilde */ 0x02dc,
754 /* 197 macron */ 0x00af,
755 /* 198 breve */ 0x02d8,
756 /* 199 dotaccent */ 0x02d9,
757 /* 200 dieresis */ 0x00a8,
758 /* 201 twosuperior */ 0x00b2,
759 /* 202 ring */ 0x02da,
760 /* 203 cedilla */ 0x00b8,
761 /* 204 threesuperior */ 0x00b3,
762 /* 205 hungarumlaut */ 0x02dd,
763 /* 206 ogonek */ 0x02db,
764 /* 207 caron */ 0x02c7,
765 /* 208 emdash */ 0x2014,
766 /* 209 plusminus */ 0x00b1,
767 /* 210 onequarter */ 0x00bc,
768 /* 211 onehalf */ 0x00bd,
769 /* 212 threequarters */ 0x00be,
770 /* 213 agrave */ 0x00e0,
771 /* 214 aacute */ 0x00e1,
772 /* 215 acircumflex */ 0x00e2,
773 /* 216 atilde */ 0x00e3,
774 /* 217 adieresis */ 0x00e4,
775 /* 218 aring */ 0x00e5,
776 /* 219 ccedilla */ 0x00e7,
777 /* 220 egrave */ 0x00e8,
778 /* 221 eacute */ 0x00e9,
779 /* 222 ecircumflex */ 0x00ea,
780 /* 223 edieresis */ 0x00eb,
781 /* 224 igrave */ 0x00ec,
782 /* 225 AE */ 0x00c6,
783 /* 226 iacute */ 0x00ed,
784 /* 227 ordfeminine */ 0x00aa,
785 /* 228 icircumflex */ 0x00ee,
786 /* 229 idieresis */ 0x00ef,
787 /* 230 eth */ 0x00f0,
788 /* 231 ntilde */ 0x00f1,
789 /* 232 Lslash */ 0x0141,
790 /* 233 Oslash */ 0x00d8,
791 /* 234 OE */ 0x0152,
792 /* 235 ordmasculine */ 0x00ba,
793 /* 236 ograve */ 0x00f2,
794 /* 237 oacute */ 0x00f3,
795 /* 238 ocircumflex */ 0x00f4,
796 /* 239 otilde */ 0x00f5,
797 /* 240 odieresis */ 0x00f6,
798 /* 241 ae */ 0x00e6,
799 /* 242 ugrave */ 0x00f9,
800 /* 243 uacute */ 0x00fa,
801 /* 244 ucircumflex */ 0x00fb,
802 /* 245 dotlessi */ 0x0131,
803 /* 246 udieresis */ 0x00fc,
804 /* 247 yacute */ 0x00fd,
805 /* 248 lslash */ 0x0142,
806 /* 249 oslash */ 0x00f8,
807 /* 250 oe */ 0x0153,
808 /* 251 germandbls */ 0x00df,
809 /* 252 thorn */ 0x00fe,
810 /* 253 ydieresis */ 0x00ff,
811 /* 254 .notdef */ 0xFFFD,
812 /* 255 .notdef */ 0xFFFD
813 };
814
815 static Boolean __CFFromNextStepLatin(UInt32 flags, uint8_t byte, UniChar *character) {
816 return ((*character = (byte < 0x80 ? (UniChar)byte : NSToPrecompUnicodeTable[byte - 0x80])) != 0xFFFD);
817 }
818
819 static UInt32 __CFToNextStepLatinPrecompose(UInt32 flags, const UniChar *character, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) {
820 uint8_t byte;
821 UInt32 usedCharLen;
822
823 if (__CFToNextStepLatin(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
824 if (maxByteLen) *bytes = byte;
825 *usedByteLen = 1;
826 return usedCharLen;
827 } else {
828 return 0;
829 }
830 }
831
832 __private_extern__ CFStringEncodingConverter __CFConverterNextStepLatin = {
833 __CFToNextStepLatin, __CFFromNextStepLatin, 1, 1, kCFStringEncodingConverterCheapEightBit,
834 NULL, NULL, NULL, NULL, __CFToNextStepLatinPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
835 };
836
837 /* UTF8 */
838 /*
839 * Copyright 2001 Unicode, Inc.
840 *
841 * Disclaimer
842 *
843 * This source code is provided as is by Unicode, Inc. No claims are
844 * made as to fitness for any particular purpose. No warranties of any
845 * kind are expressed or implied. The recipient agrees to determine
846 * applicability of information provided. If this file has been
847 * purchased on magnetic or optical media from Unicode, Inc., the
848 * sole remedy for any claim will be exchange of defective media
849 * within 90 days of receipt.
850 *
851 * Limitations on Rights to Redistribute This Code
852 *
853 * Unicode, Inc. hereby grants the right to freely use the information
854 * supplied in this file in the creation of products supporting the
855 * Unicode Standard, and to make copies of this file in any form
856 * for internal or external distribution as long as this notice
857 * remains attached.
858 */
859
860 static const UInt32 kReplacementCharacter = 0x0000FFFDUL;
861 static const UInt32 kMaximumUCS2 = 0x0000FFFFUL;
862 static const UInt32 kMaximumUTF16 = 0x0010FFFFUL;
863 static const UInt32 kMaximumUCS4 = 0x7FFFFFFFUL;
864
865 static const int halfShift = 10;
866 static const UInt32 halfBase = 0x0010000UL;
867 static const UInt32 halfMask = 0x3FFUL;
868 static const UInt32 kSurrogateHighStart = 0xD800UL;
869 static const UInt32 kSurrogateHighEnd = 0xDBFFUL;
870 static const UInt32 kSurrogateLowStart = 0xDC00UL;
871 static const UInt32 kSurrogateLowEnd = 0xDFFFUL;
872
/*
 * Indexed by the first byte of a UTF-8 sequence; yields the number of
 * trailing bytes that are supposed to follow that byte.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 - 0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 - 0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 - 0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 - 0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 - 0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 - 0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 - 0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 - 0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 - 0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 - 0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 - 0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
887
888 /*
889 * Magic values subtracted from a buffer value during UTF8 conversion.
890 * This table contains as many values as there might be trailing bytes
891 * in a UTF-8 sequence.
892 */
893 static const UTF32Char offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
894 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
895
/*
 * Marker bits OR-ed into the first byte of an encoded sequence, indexed by
 * the total number of bytes in that sequence.
 */
static const uint8_t firstByteMark[7] = {
    0x00, /* index 0 */
    0x00, /* 1 byte  */
    0xC0, /* 2 bytes */
    0xE0, /* 3 bytes */
    0xF0, /* 4 bytes */
    0xF8, /* 5 bytes */
    0xFC  /* 6 bytes */
};
897
898 /* This code is similar in effect to making successive calls on the mbtowc and wctomb routines in FSS-UTF. However, it is considerably different in code:
899 * it is adapted to be consistent with UTF16,
900 * constants have been gathered.
901 * loops & conditionals have been removed as much as possible for
902 * efficiency, in favor of drop-through switch statements.
903 */
904
905 CF_INLINE uint16_t __CFUTF8BytesToWriteForCharacter(UInt32 ch) {
906 if (ch < 0x80) return 1;
907 else if (ch < 0x800) return 2;
908 else if (ch < 0x10000) return 3;
909 else if (ch < 0x200000) return 4;
910 else if (ch < 0x4000000) return 5;
911 else if (ch <= kMaximumUCS4) return 6;
912 else return 0;
913 }
914
915 CF_INLINE uint16_t __CFToUTF8Core(UInt32 ch, uint8_t *bytes, UInt32 maxByteLen) {
916 uint16_t bytesToWrite = __CFUTF8BytesToWriteForCharacter(ch);
917 const UInt32 byteMask = 0xBF;
918 const UInt32 byteMark = 0x80;
919
920 if (!bytesToWrite) {
921 bytesToWrite = 2;
922 ch = kReplacementCharacter;
923 }
924
925 if (maxByteLen < bytesToWrite) return 0;
926
927 switch (bytesToWrite) { /* note: code falls through cases! */
928 case 6: bytes[5] = (ch | byteMark) & byteMask; ch >>= 6;
929 case 5: bytes[4] = (ch | byteMark) & byteMask; ch >>= 6;
930 case 4: bytes[3] = (ch | byteMark) & byteMask; ch >>= 6;
931 case 3: bytes[2] = (ch | byteMark) & byteMask; ch >>= 6;
932 case 2: bytes[1] = (ch | byteMark) & byteMask; ch >>= 6;
933 case 1: bytes[0] = ch | firstByteMark[bytesToWrite];
934 }
935 return bytesToWrite;
936 }
937
938 static UInt32 __CFToUTF8(UInt32 flags, const UniChar *characters, UInt32 numChars, uint8_t *bytes, UInt32 maxByteLen, UInt32 *usedByteLen) {
939 uint16_t bytesWritten;
940 UInt32 ch;
941 const UniChar *beginCharacter = characters;
942 const UniChar *endCharacter = characters + numChars;
943 const uint8_t *beginBytes = bytes;
944 const uint8_t *endBytes = bytes + maxByteLen;
945 bool isStrict = (flags & kCFStringEncodingUseHFSPlusCanonical ? false : true);
946
947 while ((characters < endCharacter) && (!maxByteLen || (bytes < endBytes))) {
948 ch = *(characters++);
949
950 if (ch < 0x80) { // ASCII
951 if (maxByteLen) *bytes = ch;
952 ++bytes;
953 } else {
954 if (ch >= kSurrogateHighStart) {
955 if (ch <= kSurrogateHighEnd) {
956 if ((characters < endCharacter) && ((*characters >= kSurrogateLowStart) && (*characters <= kSurrogateLowEnd))) {
957 ch = ((ch - kSurrogateHighStart) << halfShift) + (*(characters++) - kSurrogateLowStart) + halfBase;
958 } else if (isStrict) {
959 --characters;
960 break;
961 }
962 } else if (isStrict && (ch <= kSurrogateLowEnd)) {
963 --characters;
964 break;
965 }
966 }
967
968 if (!(bytesWritten = (maxByteLen ? __CFToUTF8Core(ch, bytes, endBytes - bytes) : __CFUTF8BytesToWriteForCharacter(ch)))) {
969 --characters;
970 break;
971 }
972 bytes += bytesWritten;
973 }
974 }
975
976 if (usedByteLen) *usedByteLen = bytes - beginBytes;
977 return characters - beginCharacter;
978 }
979
980 /*
981 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
982 * This must be called with the length pre-determined by the first byte.
983 * If not calling this from ConvertUTF8to*, then the length can be set by:
984 * length = trailingBytesForUTF8[*source]+1;
985 * and the sequence is illegal right away if there aren't that many bytes
986 * available.
987 * If presented with a length > 4, this returns false. The Unicode
988 * definition of UTF-8 goes up to 4-byte sequences.
989 */
990
991 CF_INLINE bool __CFIsLegalUTF8(const uint8_t *source, int length) {
992 uint8_t a;
993 const uint8_t *srcptr = source+length;
994 switch (length) {
995 default: return false;
996 /* Everything else falls through when "true"... */
997 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
998 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
999 case 2: if ((a = (*--srcptr)) > 0xBF) return false;
1000 switch (*source) {
1001 /* no fall-through in this inner switch */
1002 case 0xE0: if (a < 0xA0) return false; break;
1003 case 0xF0: if (a < 0x90) return false; break;
1004 case 0xF4: if (a > 0x8F) return false; break;
1005 default: if (a < 0x80) return false;
1006 }
1007 case 1: if (*source >= 0x80 && *source < 0xC2) return false;
1008 if (*source > 0xF4) return false;
1009 }
1010 return true;
1011 }
1012
1013 static UInt32 __CFFromUTF8(UInt32 flags, const uint8_t *bytes, UInt32 numBytes, UniChar *characters, UInt32 maxCharLen, UInt32 *usedCharLen) {
1014 const uint8_t *source = bytes;
1015 uint16_t extraBytesToRead;
1016 UInt32 theUsedCharLen = 0;
1017 UInt32 ch;
1018 Boolean isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
1019 Boolean needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
1020 Boolean strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
1021 UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
1022 int32_t decompLength;
1023 bool isStrict = !isHFSPlus;
1024
1025 while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
1026 extraBytesToRead = trailingBytesForUTF8[*source];
1027
1028 if (extraBytesToRead > --numBytes) break;
1029 numBytes -= extraBytesToRead;
1030
1031 /* Do this check whether lenient or strict */
1032 // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
1033 // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
1034 if ((strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1)) || (extraBytesToRead > 3)) {
1035 if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
1036 numBytes += extraBytesToRead;
1037 ++source;
1038 if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
1039 ++theUsedCharLen;
1040 continue;
1041 } else {
1042 break;
1043 }
1044 }
1045
1046 ch = 0;
1047 /*
1048 * The cases all fall through. See "Note A" below.
1049 */
1050 switch (extraBytesToRead) {
1051 case 3: ch += *source++; ch <<= 6;
1052 case 2: ch += *source++; ch <<= 6;
1053 case 1: ch += *source++; ch <<= 6;
1054 case 0: ch += *source++;
1055 }
1056 ch -= offsetsFromUTF8[extraBytesToRead];
1057
1058 if (ch <= kMaximumUCS2) {
1059 if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
1060 source -= (extraBytesToRead + 1);
1061 break;
1062 }
1063 if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1064 decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1065
1066 if (maxCharLen) {
1067 if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, (uint32_t *)&theUsedCharLen, kCFUniCharUTF16Format)) break;
1068 } else {
1069 theUsedCharLen += decompLength;
1070 }
1071 } else {
1072 if (maxCharLen) *(characters++) = (UTF16Char)ch;
1073 ++theUsedCharLen;
1074 }
1075 } else if (ch > kMaximumUTF16) {
1076 if (isStrict) {
1077 source -= (extraBytesToRead + 1);
1078 break;
1079 }
1080 if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
1081 ++theUsedCharLen;
1082 } else {
1083 if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1084 decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1085
1086 if (maxCharLen) {
1087 if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, (uint32_t *)&theUsedCharLen, kCFUniCharUTF16Format)) break;
1088 } else {
1089 while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
1090 }
1091 } else {
1092 if (maxCharLen) {
1093 if ((theUsedCharLen + 2) > maxCharLen) break;
1094 ch -= halfBase;
1095 *(characters++) = (ch >> halfShift) + kSurrogateHighStart;
1096 *(characters++) = (ch & halfMask) + kSurrogateLowStart;
1097 }
1098 theUsedCharLen += 2;
1099 }
1100 }
1101 }
1102
1103 if (usedCharLen) *usedCharLen = theUsedCharLen;
1104
1105 return source - bytes;
1106 }
1107
1108 static UInt32 __CFToUTF8Len(UInt32 flags, const UniChar *characters, UInt32 numChars) {
1109 UInt32 bytesToWrite = 0;
1110 UInt32 ch;
1111
1112 while (numChars) {
1113 ch = *characters++;
1114 numChars--;
1115 if ((ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd) && numChars && (*characters >= kSurrogateLowStart && *characters <= kSurrogateLowEnd)) {
1116 ch = ((ch - kSurrogateHighStart) << halfShift) + (*characters++ - kSurrogateLowStart) + halfBase;
1117 numChars--;
1118 }
1119 bytesToWrite += __CFUTF8BytesToWriteForCharacter(ch);
1120 }
1121
1122 return bytesToWrite;
1123 }
1124
1125 static UInt32 __CFFromUTF8Len(UInt32 flags, const uint8_t *source, UInt32 numBytes) {
1126 uint16_t extraBytesToRead;
1127 UInt32 theUsedCharLen = 0;
1128 UInt32 ch;
1129 Boolean isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
1130 Boolean needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
1131 Boolean strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
1132 UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
1133 int32_t decompLength;
1134 bool isStrict = !isHFSPlus;
1135
1136 while (numBytes) {
1137 extraBytesToRead = trailingBytesForUTF8[*source];
1138
1139 if (extraBytesToRead > --numBytes) break;
1140 numBytes -= extraBytesToRead;
1141
1142 /* Do this check whether lenient or strict */
1143 // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
1144 // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
1145 if ((strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1)) || (extraBytesToRead > 3)) {
1146 if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
1147 numBytes += extraBytesToRead;
1148 ++source;
1149 ++theUsedCharLen;
1150 continue;
1151 } else {
1152 break;
1153 }
1154 }
1155
1156
1157 ch = 0;
1158 /*
1159 * The cases all fall through. See "Note A" below.
1160 */
1161 switch (extraBytesToRead) {
1162 case 3: ch += *source++; ch <<= 6;
1163 case 2: ch += *source++; ch <<= 6;
1164 case 1: ch += *source++; ch <<= 6;
1165 case 0: ch += *source++;
1166 }
1167 ch -= offsetsFromUTF8[extraBytesToRead];
1168
1169 if (ch <= kMaximumUCS2) {
1170 if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
1171 break;
1172 }
1173 if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1174 decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1175 theUsedCharLen += decompLength;
1176 } else {
1177 ++theUsedCharLen;
1178 }
1179 } else if (ch > kMaximumUTF16) {
1180 ++theUsedCharLen;
1181 } else {
1182 if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1183 decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1184 while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
1185 } else {
1186 theUsedCharLen += 2;
1187 }
1188 }
1189 }
1190
1191 return theUsedCharLen;
1192 }
1193
1194 __private_extern__ CFStringEncodingConverter __CFConverterUTF8 = {
1195 __CFToUTF8, __CFFromUTF8, 6, 2, kCFStringEncodingConverterStandard,
1196 __CFToUTF8Len, __CFFromUTF8Len, NULL, NULL, NULL, NULL,
1197 };