]>
Commit | Line | Data |
---|---|---|
9ce05555 | 1 | /* |
8ca704e1 | 2 | * Copyright (c) 2011 Apple Inc. All rights reserved. |
9ce05555 A |
3 | * |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
9ce05555 A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
f64f9b69 | 23 | |
9ce05555 | 24 | /* CFBuiltinConverters.c |
8ca704e1 | 25 | Copyright (c) 1999-2011, Apple Inc. All rights reserved. |
9ce05555 A |
26 | Responsibility: Aki Inoue |
27 | */ | |
28 | ||
29 | #include "CFStringEncodingConverterExt.h" | |
30 | #include "CFUniChar.h" | |
31 | #include "CFUnicodeDecomposition.h" | |
32 | #include "CFUnicodePrecomposition.h" | |
33 | #include "CFStringEncodingConverterPriv.h" | |
34 | #include "CFInternal.h" | |
35 | ||
36 | #define ParagraphSeparator 0x2029 | |
37 | #define ASCIINewLine 0x0a | |
bd5b749c A |
38 | static int8_t __CFMapsParagraphSeparator = -1; |
39 | ||
40 | CF_INLINE bool __CFIsParagraphSeparator(UTF16Char character) { | |
41 | if (-1 == __CFMapsParagraphSeparator) __CFMapsParagraphSeparator = (_CFExecutableLinkedOnOrAfter(CFSystemVersionLeopard) ? false : true); | |
42 | ||
43 | return ((__CFMapsParagraphSeparator && (ParagraphSeparator == character)) ? true : false); | |
44 | } | |
9ce05555 A |
45 | |
46 | /* Precomposition */ | |
bd5b749c | 47 | static const uint32_t __CFLatin1CombiningCharBitmap[] = { // 0x300 ~ 0x35FF |
9ce05555 A |
48 | 0xFBB94010, 0x01800000, 0x0000000, |
49 | }; | |
50 | ||
bd5b749c | 51 | bool CFStringEncodingIsValidCombiningCharacterForLatin1(UniChar character) { |
9ce05555 A |
52 | return ((character >= 0x300) && (character < 0x360) && (__CFLatin1CombiningCharBitmap[(character - 0x300) / 32] & (1 << (31 - ((character - 0x300) % 32)))) ? true : false); |
53 | } | |
54 | ||
bd5b749c | 55 | UniChar CFStringEncodingPrecomposeLatinCharacter(const UniChar *character, CFIndex numChars, CFIndex *usedChars) { |
9ce05555 A |
56 | if (numChars > 0) { |
57 | UTF32Char ch = *(character++), nextCh, composedChar; | |
bd5b749c | 58 | CFIndex usedCharLen = 1; |
9ce05555 A |
59 | |
60 | if (CFUniCharIsSurrogateHighCharacter(ch) || CFUniCharIsSurrogateLowCharacter(ch)) { | |
61 | if (usedChars) (*usedChars) = usedCharLen; | |
62 | return ch; | |
63 | } | |
64 | ||
65 | while (usedCharLen < numChars) { | |
66 | nextCh = *(character++); | |
67 | ||
68 | if (CFUniCharIsSurrogateHighCharacter(nextCh) || CFUniCharIsSurrogateLowCharacter(nextCh)) break; | |
69 | ||
70 | if (CFUniCharIsMemberOf(nextCh, kCFUniCharNonBaseCharacterSet) && ((composedChar = CFUniCharPrecomposeCharacter(ch, nextCh)) != 0xFFFD)) { | |
71 | if (composedChar > 0xFFFF) { // Non-base | |
72 | break; | |
73 | } else { | |
74 | ch = composedChar; | |
75 | } | |
76 | } else { | |
77 | break; | |
78 | } | |
79 | ++usedCharLen; | |
80 | } | |
81 | if (usedChars) (*usedChars) = usedCharLen; | |
cf7d2af9 | 82 | if (usedCharLen > 1) return ch; |
9ce05555 A |
83 | } |
84 | return 0xFFFD; | |
85 | } | |
86 | ||
87 | /* ASCII */ | |
bd5b749c | 88 | static bool __CFToASCII(uint32_t flags, UniChar character, uint8_t *byte) { |
9ce05555 A |
89 | if (character < 0x80) { |
90 | *byte = (uint8_t)character; | |
bd5b749c | 91 | } else if (__CFIsParagraphSeparator(character)) { |
9ce05555 A |
92 | *byte = ASCIINewLine; |
93 | } else { | |
94 | return false; | |
95 | } | |
96 | return true; | |
97 | } | |
98 | ||
bd5b749c | 99 | static bool __CFFromASCII(uint32_t flags, uint8_t byte, UniChar *character) { |
9ce05555 A |
100 | if (byte < 0x80) { |
101 | *character = (UniChar)byte; | |
102 | return true; | |
103 | } else { | |
104 | return false; | |
105 | } | |
106 | } | |
107 | ||
108 | ||
d8925383 | 109 | __private_extern__ const CFStringEncodingConverter __CFConverterASCII = { |
9ce05555 A |
110 | __CFToASCII, __CFFromASCII, 1, 1, kCFStringEncodingConverterCheapEightBit, |
111 | NULL, NULL, NULL, NULL, NULL, NULL, | |
112 | }; | |
113 | ||
114 | /* ISO Latin 1 (8859-1) */ | |
bd5b749c | 115 | static bool __CFToISOLatin1(uint32_t flags, UniChar character, uint8_t *byte) { |
9ce05555 A |
116 | if (character <= 0xFF) { |
117 | *byte = (uint8_t)character; | |
bd5b749c | 118 | } else if (__CFIsParagraphSeparator(character)) { |
9ce05555 A |
119 | *byte = ASCIINewLine; |
120 | } else { | |
121 | return false; | |
122 | } | |
123 | ||
124 | return true; | |
125 | } | |
126 | ||
bd5b749c | 127 | static bool __CFFromISOLatin1(uint32_t flags, uint8_t byte, UniChar *character) { |
9ce05555 A |
128 | *character = (UniChar)byte; |
129 | return true; | |
130 | } | |
131 | ||
bd5b749c | 132 | static CFIndex __CFToISOLatin1Precompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
9ce05555 | 133 | uint8_t byte; |
bd5b749c | 134 | CFIndex usedCharLen; |
9ce05555 A |
135 | |
136 | if (__CFToISOLatin1(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) { | |
137 | if (maxByteLen) *bytes = byte; | |
138 | *usedByteLen = 1; | |
139 | return usedCharLen; | |
140 | } else { | |
141 | return 0; | |
142 | } | |
143 | } | |
144 | ||
d8925383 | 145 | __private_extern__ const CFStringEncodingConverter __CFConverterISOLatin1 = { |
9ce05555 A |
146 | __CFToISOLatin1, __CFFromISOLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit, |
147 | NULL, NULL, NULL, NULL, __CFToISOLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1, | |
148 | }; | |
149 | ||
150 | /* Mac Roman */ | |
/*
 * Unicode -> Mac OS Roman table, used by __CFToMacRoman for characters at or
 * above 0x80. Each entry is { Unicode value, Mac OS Roman byte }.
 *
 * Entries are listed in ascending Unicode order; presumably the lookup in
 * CFStringEncodingUnicodeTo8BitEncoding relies on that ordering (confirm
 * before inserting rows out of order). NUM_MACROMAN_FROM_UNI must equal the
 * number of rows. U+03A9 and U+2126 intentionally share byte 0xBD.
 */
151 | #define NUM_MACROMAN_FROM_UNI 129 | |
152 | static const CFStringEncodingUnicodeTo8BitCharMap macRoman_from_uni[NUM_MACROMAN_FROM_UNI] = { | |
153 | { 0x00A0, 0xCA }, /* NO-BREAK SPACE */ | |
154 | { 0x00A1, 0xC1 }, /* INVERTED EXCLAMATION MARK */ | |
155 | { 0x00A2, 0xA2 }, /* CENT SIGN */ | |
156 | { 0x00A3, 0xA3 }, /* POUND SIGN */ | |
157 | { 0x00A5, 0xB4 }, /* YEN SIGN */ | |
158 | { 0x00A7, 0xA4 }, /* SECTION SIGN */ | |
159 | { 0x00A8, 0xAC }, /* DIAERESIS */ | |
160 | { 0x00A9, 0xA9 }, /* COPYRIGHT SIGN */ | |
161 | { 0x00AA, 0xBB }, /* FEMININE ORDINAL INDICATOR */ | |
162 | { 0x00AB, 0xC7 }, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */ | |
163 | { 0x00AC, 0xC2 }, /* NOT SIGN */ | |
164 | { 0x00AE, 0xA8 }, /* REGISTERED SIGN */ | |
165 | { 0x00AF, 0xF8 }, /* MACRON */ | |
166 | { 0x00B0, 0xA1 }, /* DEGREE SIGN */ | |
167 | { 0x00B1, 0xB1 }, /* PLUS-MINUS SIGN */ | |
168 | { 0x00B4, 0xAB }, /* ACUTE ACCENT */ | |
169 | { 0x00B5, 0xB5 }, /* MICRO SIGN */ | |
170 | { 0x00B6, 0xA6 }, /* PILCROW SIGN */ | |
171 | { 0x00B7, 0xE1 }, /* MIDDLE DOT */ | |
172 | { 0x00B8, 0xFC }, /* CEDILLA */ | |
173 | { 0x00BA, 0xBC }, /* MASCULINE ORDINAL INDICATOR */ | |
174 | { 0x00BB, 0xC8 }, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */ | |
175 | { 0x00BF, 0xC0 }, /* INVERTED QUESTION MARK */ | |
176 | { 0x00C0, 0xCB }, /* LATIN CAPITAL LETTER A WITH GRAVE */ | |
177 | { 0x00C1, 0xE7 }, /* LATIN CAPITAL LETTER A WITH ACUTE */ | |
178 | { 0x00C2, 0xE5 }, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ | |
179 | { 0x00C3, 0xCC }, /* LATIN CAPITAL LETTER A WITH TILDE */ | |
180 | { 0x00C4, 0x80 }, /* LATIN CAPITAL LETTER A WITH DIAERESIS */ | |
181 | { 0x00C5, 0x81 }, /* LATIN CAPITAL LETTER A WITH RING ABOVE */ | |
182 | { 0x00C6, 0xAE }, /* LATIN CAPITAL LIGATURE AE */ | |
183 | { 0x00C7, 0x82 }, /* LATIN CAPITAL LETTER C WITH CEDILLA */ | |
184 | { 0x00C8, 0xE9 }, /* LATIN CAPITAL LETTER E WITH GRAVE */ | |
185 | { 0x00C9, 0x83 }, /* LATIN CAPITAL LETTER E WITH ACUTE */ | |
186 | { 0x00CA, 0xE6 }, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */ | |
187 | { 0x00CB, 0xE8 }, /* LATIN CAPITAL LETTER E WITH DIAERESIS */ | |
188 | { 0x00CC, 0xED }, /* LATIN CAPITAL LETTER I WITH GRAVE */ | |
189 | { 0x00CD, 0xEA }, /* LATIN CAPITAL LETTER I WITH ACUTE */ | |
190 | { 0x00CE, 0xEB }, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */ | |
191 | { 0x00CF, 0xEC }, /* LATIN CAPITAL LETTER I WITH DIAERESIS */ | |
192 | { 0x00D1, 0x84 }, /* LATIN CAPITAL LETTER N WITH TILDE */ | |
193 | { 0x00D2, 0xF1 }, /* LATIN CAPITAL LETTER O WITH GRAVE */ | |
194 | { 0x00D3, 0xEE }, /* LATIN CAPITAL LETTER O WITH ACUTE */ | |
195 | { 0x00D4, 0xEF }, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */ | |
196 | { 0x00D5, 0xCD }, /* LATIN CAPITAL LETTER O WITH TILDE */ | |
197 | { 0x00D6, 0x85 }, /* LATIN CAPITAL LETTER O WITH DIAERESIS */ | |
198 | { 0x00D8, 0xAF }, /* LATIN CAPITAL LETTER O WITH STROKE */ | |
199 | { 0x00D9, 0xF4 }, /* LATIN CAPITAL LETTER U WITH GRAVE */ | |
200 | { 0x00DA, 0xF2 }, /* LATIN CAPITAL LETTER U WITH ACUTE */ | |
201 | { 0x00DB, 0xF3 }, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */ | |
202 | { 0x00DC, 0x86 }, /* LATIN CAPITAL LETTER U WITH DIAERESIS */ | |
203 | { 0x00DF, 0xA7 }, /* LATIN SMALL LETTER SHARP S */ | |
204 | { 0x00E0, 0x88 }, /* LATIN SMALL LETTER A WITH GRAVE */ | |
205 | { 0x00E1, 0x87 }, /* LATIN SMALL LETTER A WITH ACUTE */ | |
206 | { 0x00E2, 0x89 }, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */ | |
207 | { 0x00E3, 0x8B }, /* LATIN SMALL LETTER A WITH TILDE */ | |
208 | { 0x00E4, 0x8A }, /* LATIN SMALL LETTER A WITH DIAERESIS */ | |
209 | { 0x00E5, 0x8C }, /* LATIN SMALL LETTER A WITH RING ABOVE */ | |
210 | { 0x00E6, 0xBE }, /* LATIN SMALL LIGATURE AE */ | |
211 | { 0x00E7, 0x8D }, /* LATIN SMALL LETTER C WITH CEDILLA */ | |
212 | { 0x00E8, 0x8F }, /* LATIN SMALL LETTER E WITH GRAVE */ | |
213 | { 0x00E9, 0x8E }, /* LATIN SMALL LETTER E WITH ACUTE */ | |
214 | { 0x00EA, 0x90 }, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */ | |
215 | { 0x00EB, 0x91 }, /* LATIN SMALL LETTER E WITH DIAERESIS */ | |
216 | { 0x00EC, 0x93 }, /* LATIN SMALL LETTER I WITH GRAVE */ | |
217 | { 0x00ED, 0x92 }, /* LATIN SMALL LETTER I WITH ACUTE */ | |
218 | { 0x00EE, 0x94 }, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */ | |
219 | { 0x00EF, 0x95 }, /* LATIN SMALL LETTER I WITH DIAERESIS */ | |
220 | { 0x00F1, 0x96 }, /* LATIN SMALL LETTER N WITH TILDE */ | |
221 | { 0x00F2, 0x98 }, /* LATIN SMALL LETTER O WITH GRAVE */ | |
222 | { 0x00F3, 0x97 }, /* LATIN SMALL LETTER O WITH ACUTE */ | |
223 | { 0x00F4, 0x99 }, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */ | |
224 | { 0x00F5, 0x9B }, /* LATIN SMALL LETTER O WITH TILDE */ | |
225 | { 0x00F6, 0x9A }, /* LATIN SMALL LETTER O WITH DIAERESIS */ | |
226 | { 0x00F7, 0xD6 }, /* DIVISION SIGN */ | |
227 | { 0x00F8, 0xBF }, /* LATIN SMALL LETTER O WITH STROKE */ | |
228 | { 0x00F9, 0x9D }, /* LATIN SMALL LETTER U WITH GRAVE */ | |
229 | { 0x00FA, 0x9C }, /* LATIN SMALL LETTER U WITH ACUTE */ | |
230 | { 0x00FB, 0x9E }, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */ | |
231 | { 0x00FC, 0x9F }, /* LATIN SMALL LETTER U WITH DIAERESIS */ | |
232 | { 0x00FF, 0xD8 }, /* LATIN SMALL LETTER Y WITH DIAERESIS */ | |
233 | { 0x0131, 0xF5 }, /* LATIN SMALL LETTER DOTLESS I */ | |
234 | { 0x0152, 0xCE }, /* LATIN CAPITAL LIGATURE OE */ | |
235 | { 0x0153, 0xCF }, /* LATIN SMALL LIGATURE OE */ | |
236 | { 0x0178, 0xD9 }, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */ | |
237 | { 0x0192, 0xC4 }, /* LATIN SMALL LETTER F WITH HOOK */ | |
238 | { 0x02C6, 0xF6 }, /* MODIFIER LETTER CIRCUMFLEX ACCENT */ | |
239 | { 0x02C7, 0xFF }, /* CARON */ | |
240 | { 0x02D8, 0xF9 }, /* BREVE */ | |
241 | { 0x02D9, 0xFA }, /* DOT ABOVE */ | |
242 | { 0x02DA, 0xFB }, /* RING ABOVE */ | |
243 | { 0x02DB, 0xFE }, /* OGONEK */ | |
244 | { 0x02DC, 0xF7 }, /* SMALL TILDE */ | |
245 | { 0x02DD, 0xFD }, /* DOUBLE ACUTE ACCENT */ | |
246 | { 0x03A9, 0xBD }, /* GREEK CAPITAL LETTER OMEGA (canonical equivalent of U+2126 OHM SIGN) */ | |
247 | { 0x03C0, 0xB9 }, /* GREEK SMALL LETTER PI */ | |
248 | { 0x2013, 0xD0 }, /* EN DASH */ | |
249 | { 0x2014, 0xD1 }, /* EM DASH */ | |
250 | { 0x2018, 0xD4 }, /* LEFT SINGLE QUOTATION MARK */ | |
251 | { 0x2019, 0xD5 }, /* RIGHT SINGLE QUOTATION MARK */ | |
252 | { 0x201A, 0xE2 }, /* SINGLE LOW-9 QUOTATION MARK */ | |
253 | { 0x201C, 0xD2 }, /* LEFT DOUBLE QUOTATION MARK */ | |
254 | { 0x201D, 0xD3 }, /* RIGHT DOUBLE QUOTATION MARK */ | |
255 | { 0x201E, 0xE3 }, /* DOUBLE LOW-9 QUOTATION MARK */ | |
256 | { 0x2020, 0xA0 }, /* DAGGER */ | |
257 | { 0x2021, 0xE0 }, /* DOUBLE DAGGER */ | |
258 | { 0x2022, 0xA5 }, /* BULLET */ | |
259 | { 0x2026, 0xC9 }, /* HORIZONTAL ELLIPSIS */ | |
260 | { 0x2030, 0xE4 }, /* PER MILLE SIGN */ | |
261 | { 0x2039, 0xDC }, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */ | |
262 | { 0x203A, 0xDD }, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */ | |
263 | { 0x2044, 0xDA }, /* FRACTION SLASH */ | |
264 | { 0x20AC, 0xDB }, /* EURO SIGN */ | |
265 | { 0x2122, 0xAA }, /* TRADE MARK SIGN */ | |
266 | { 0x2126, 0xBD }, /* OHM SIGN */ | |
267 | { 0x2202, 0xB6 }, /* PARTIAL DIFFERENTIAL */ | |
268 | { 0x2206, 0xC6 }, /* INCREMENT */ | |
269 | { 0x220F, 0xB8 }, /* N-ARY PRODUCT */ | |
270 | { 0x2211, 0xB7 }, /* N-ARY SUMMATION */ | |
271 | { 0x221A, 0xC3 }, /* SQUARE ROOT */ | |
272 | { 0x221E, 0xB0 }, /* INFINITY */ | |
273 | { 0x222B, 0xBA }, /* INTEGRAL */ | |
274 | { 0x2248, 0xC5 }, /* ALMOST EQUAL TO */ | |
275 | { 0x2260, 0xAD }, /* NOT EQUAL TO */ | |
276 | { 0x2264, 0xB2 }, /* LESS-THAN OR EQUAL TO */ | |
277 | { 0x2265, 0xB3 }, /* GREATER-THAN OR EQUAL TO */ | |
278 | { 0x25CA, 0xD7 }, /* LOZENGE */ | |
279 | { 0xF8FF, 0xF0 }, /* Apple logo */ | |
280 | { 0xFB01, 0xDE }, /* LATIN SMALL LIGATURE FI */ | |
281 | { 0xFB02, 0xDF }, /* LATIN SMALL LIGATURE FL */ | |
282 | }; | |
283 | ||
bd5b749c | 284 | static bool __CFToMacRoman(uint32_t flags, UniChar character, uint8_t *byte) { |
9ce05555 A |
285 | if (character < 0x80) { |
286 | *byte = (uint8_t)character; | |
287 | return true; | |
288 | } else { | |
289 | return CFStringEncodingUnicodeTo8BitEncoding(macRoman_from_uni, NUM_MACROMAN_FROM_UNI, character, byte); | |
290 | } | |
291 | } | |
292 | ||
/*
 * Mac OS Roman -> Unicode table for the upper half of the byte range.
 * Entry i holds the Unicode value for byte 0x80 + i; __CFFromMacRoman indexes
 * it with (byte - 0x80), so the table must keep exactly 128 entries in byte
 * order.
 */
293 | static const UniChar macRoman_to_uni[128] = { | |
294 | 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */ | |
295 | 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */ | |
296 | 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */ | |
297 | 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */ | |
298 | 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */ | |
299 | 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */ | |
300 | 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */ | |
301 | 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */ | |
302 | 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */ | |
303 | 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */ | |
304 | 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */ | |
305 | 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */ | |
306 | 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */ | |
307 | 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */ | |
308 | 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */ | |
309 | 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */ | |
310 | 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */ | |
311 | 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */ | |
312 | 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */ | |
313 | 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */ | |
314 | 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */ | |
315 | 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */ | |
316 | 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */ | |
317 | 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */ | |
318 | 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */ | |
319 | 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */ | |
320 | 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */ | |
321 | 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */ | |
322 | 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */ | |
323 | 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */ | |
324 | 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */ | |
325 | 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */ | |
326 | 0x2020, /* DAGGER */ | |
327 | 0x00B0, /* DEGREE SIGN */ | |
328 | 0x00A2, /* CENT SIGN */ | |
329 | 0x00A3, /* POUND SIGN */ | |
330 | 0x00A7, /* SECTION SIGN */ | |
331 | 0x2022, /* BULLET */ | |
332 | 0x00B6, /* PILCROW SIGN */ | |
333 | 0x00DF, /* LATIN SMALL LETTER SHARP S */ | |
334 | 0x00AE, /* REGISTERED SIGN */ | |
335 | 0x00A9, /* COPYRIGHT SIGN */ | |
336 | 0x2122, /* TRADE MARK SIGN */ | |
337 | 0x00B4, /* ACUTE ACCENT */ | |
338 | 0x00A8, /* DIAERESIS */ | |
339 | 0x2260, /* NOT EQUAL TO */ | |
340 | 0x00C6, /* LATIN CAPITAL LIGATURE AE */ | |
341 | 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */ | |
342 | 0x221E, /* INFINITY */ | |
343 | 0x00B1, /* PLUS-MINUS SIGN */ | |
344 | 0x2264, /* LESS-THAN OR EQUAL TO */ | |
345 | 0x2265, /* GREATER-THAN OR EQUAL TO */ | |
346 | 0x00A5, /* YEN SIGN */ | |
347 | 0x00B5, /* MICRO SIGN */ | |
348 | 0x2202, /* PARTIAL DIFFERENTIAL */ | |
349 | 0x2211, /* N-ARY SUMMATION */ | |
350 | 0x220F, /* N-ARY PRODUCT */ | |
351 | 0x03C0, /* GREEK SMALL LETTER PI */ | |
352 | 0x222B, /* INTEGRAL */ | |
353 | 0x00AA, /* FEMININE ORDINAL INDICATOR */ | |
354 | 0x00BA, /* MASCULINE ORDINAL INDICATOR */ | |
355 | 0x03A9, /* GREEK CAPITAL LETTER OMEGA (canonical mapping of U+2126 OHM SIGN) */ | |
356 | 0x00E6, /* LATIN SMALL LIGATURE AE */ | |
357 | 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */ | |
358 | 0x00BF, /* INVERTED QUESTION MARK */ | |
359 | 0x00A1, /* INVERTED EXCLAMATION MARK */ | |
360 | 0x00AC, /* NOT SIGN */ | |
361 | 0x221A, /* SQUARE ROOT */ | |
362 | 0x0192, /* LATIN SMALL LETTER F WITH HOOK */ | |
363 | 0x2248, /* ALMOST EQUAL TO */ | |
364 | 0x2206, /* INCREMENT */ | |
365 | 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */ | |
366 | 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */ | |
367 | 0x2026, /* HORIZONTAL ELLIPSIS */ | |
368 | 0x00A0, /* NO-BREAK SPACE */ | |
369 | 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */ | |
370 | 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */ | |
371 | 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */ | |
372 | 0x0152, /* LATIN CAPITAL LIGATURE OE */ | |
373 | 0x0153, /* LATIN SMALL LIGATURE OE */ | |
374 | 0x2013, /* EN DASH */ | |
375 | 0x2014, /* EM DASH */ | |
376 | 0x201C, /* LEFT DOUBLE QUOTATION MARK */ | |
377 | 0x201D, /* RIGHT DOUBLE QUOTATION MARK */ | |
378 | 0x2018, /* LEFT SINGLE QUOTATION MARK */ | |
379 | 0x2019, /* RIGHT SINGLE QUOTATION MARK */ | |
380 | 0x00F7, /* DIVISION SIGN */ | |
381 | 0x25CA, /* LOZENGE */ | |
382 | 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */ | |
383 | 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */ | |
384 | 0x2044, /* FRACTION SLASH */ | |
385 | 0x20AC, /* EURO SIGN */ | |
386 | 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */ | |
387 | 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */ | |
388 | 0xFB01, /* LATIN SMALL LIGATURE FI */ | |
389 | 0xFB02, /* LATIN SMALL LIGATURE FL */ | |
390 | 0x2021, /* DOUBLE DAGGER */ | |
391 | 0x00B7, /* MIDDLE DOT */ | |
392 | 0x201A, /* SINGLE LOW-9 QUOTATION MARK */ | |
393 | 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */ | |
394 | 0x2030, /* PER MILLE SIGN */ | |
395 | 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ | |
396 | 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */ | |
397 | 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */ | |
398 | 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */ | |
399 | 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */ | |
400 | 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */ | |
401 | 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */ | |
402 | 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */ | |
403 | 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */ | |
404 | 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */ | |
405 | 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */ | |
406 | 0xF8FF, /* Apple logo */ | |
407 | 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */ | |
408 | 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */ | |
409 | 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */ | |
410 | 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */ | |
411 | 0x0131, /* LATIN SMALL LETTER DOTLESS I */ | |
412 | 0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */ | |
413 | 0x02DC, /* SMALL TILDE */ | |
414 | 0x00AF, /* MACRON */ | |
415 | 0x02D8, /* BREVE */ | |
416 | 0x02D9, /* DOT ABOVE */ | |
417 | 0x02DA, /* RING ABOVE */ | |
418 | 0x00B8, /* CEDILLA */ | |
419 | 0x02DD, /* DOUBLE ACUTE ACCENT */ | |
420 | 0x02DB, /* OGONEK */ | |
421 | 0x02C7, /* CARON */ | |
422 | }; | |
423 | ||
bd5b749c | 424 | static bool __CFFromMacRoman(uint32_t flags, uint8_t byte, UniChar *character) { |
9ce05555 A |
425 | *character = (byte < 0x80 ? (UniChar)byte : macRoman_to_uni[byte - 0x80]); |
426 | return true; | |
427 | } | |
428 | ||
bd5b749c | 429 | static CFIndex __CFToMacRomanPrecompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
9ce05555 | 430 | uint8_t byte; |
bd5b749c | 431 | CFIndex usedCharLen; |
9ce05555 A |
432 | |
433 | if (__CFToMacRoman(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) { | |
434 | if (maxByteLen) *bytes = byte; | |
435 | *usedByteLen = 1; | |
436 | return usedCharLen; | |
437 | } else { | |
438 | return 0; | |
439 | } | |
440 | } | |
441 | ||
d8925383 | 442 | __private_extern__ const CFStringEncodingConverter __CFConverterMacRoman = { |
9ce05555 A |
443 | __CFToMacRoman, __CFFromMacRoman, 1, 1, kCFStringEncodingConverterCheapEightBit, |
444 | NULL, NULL, NULL, NULL, __CFToMacRomanPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1, | |
445 | }; | |
446 | ||
447 | /* Win Latin1 (ANSI CodePage 1252) */ | |
448 | #define NUM_1252_FROM_UNI 27 | |
449 | static const CFStringEncodingUnicodeTo8BitCharMap cp1252_from_uni[NUM_1252_FROM_UNI] = { | |
450 | {0x0152, 0x8C}, // LATIN CAPITAL LIGATURE OE | |
451 | {0x0153, 0x9C}, // LATIN SMALL LIGATURE OE | |
452 | {0x0160, 0x8A}, // LATIN CAPITAL LETTER S WITH CARON | |
453 | {0x0161, 0x9A}, // LATIN SMALL LETTER S WITH CARON | |
454 | {0x0178, 0x9F}, // LATIN CAPITAL LETTER Y WITH DIAERESIS | |
455 | {0x017D, 0x8E}, // LATIN CAPITAL LETTER Z WITH CARON | |
456 | {0x017E, 0x9E}, // LATIN SMALL LETTER Z WITH CARON | |
457 | {0x0192, 0x83}, // LATIN SMALL LETTER F WITH HOOK | |
458 | {0x02C6, 0x88}, // MODIFIER LETTER CIRCUMFLEX ACCENT | |
459 | {0x02DC, 0x98}, // SMALL TILDE | |
460 | {0x2013, 0x96}, // EN DASH | |
461 | {0x2014, 0x97}, // EM DASH | |
462 | {0x2018, 0x91}, // LEFT SINGLE QUOTATION MARK | |
463 | {0x2019, 0x92}, // RIGHT SINGLE QUOTATION MARK | |
464 | {0x201A, 0x82}, // SINGLE LOW-9 QUOTATION MARK | |
465 | {0x201C, 0x93}, // LEFT DOUBLE QUOTATION MARK | |
466 | {0x201D, 0x94}, // RIGHT DOUBLE QUOTATION MARK | |
467 | {0x201E, 0x84}, // DOUBLE LOW-9 QUOTATION MARK | |
468 | {0x2020, 0x86}, // DAGGER | |
469 | {0x2021, 0x87}, // DOUBLE DAGGER | |
470 | {0x2022, 0x95}, // BULLET | |
471 | {0x2026, 0x85}, // HORIZONTAL ELLIPSIS | |
472 | {0x2030, 0x89}, // PER MILLE SIGN | |
473 | {0x2039, 0x8B}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK | |
474 | {0x203A, 0x9B}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK | |
475 | {0x20AC, 0x80}, // EURO SIGN | |
476 | {0x2122, 0x99}, // TRADE MARK SIGN | |
477 | }; | |
478 | ||
bd5b749c | 479 | static bool __CFToWinLatin1(uint32_t flags, UniChar character, uint8_t *byte) { |
9ce05555 A |
480 | if ((character < 0x80) || ((character > 0x9F) && (character <= 0x00FF))) { |
481 | *byte = (uint8_t)character; | |
482 | return true; | |
483 | } | |
484 | return CFStringEncodingUnicodeTo8BitEncoding(cp1252_from_uni, NUM_1252_FROM_UNI, character, byte); | |
485 | } | |
486 | ||
/*
 * Windows code page 1252 -> Unicode table for bytes 0x80..0x9F.
 * Entry i holds the Unicode value for byte 0x80 + i; 0xFFFD marks a byte that
 * the code page leaves unassigned.
 */
static const uint16_t cp1252_to_uni[32] = {
    /* 0x80 */ 0x20AC, /* EURO SIGN */
    /* 0x81 */ 0xFFFD, /* NOT USED */
    /* 0x82 */ 0x201A, /* SINGLE LOW-9 QUOTATION MARK */
    /* 0x83 */ 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
    /* 0x84 */ 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
    /* 0x85 */ 0x2026, /* HORIZONTAL ELLIPSIS */
    /* 0x86 */ 0x2020, /* DAGGER */
    /* 0x87 */ 0x2021, /* DOUBLE DAGGER */
    /* 0x88 */ 0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
    /* 0x89 */ 0x2030, /* PER MILLE SIGN */
    /* 0x8A */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
    /* 0x8B */ 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
    /* 0x8C */ 0x0152, /* LATIN CAPITAL LIGATURE OE */
    /* 0x8D */ 0xFFFD, /* NOT USED */
    /* 0x8E */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
    /* 0x8F */ 0xFFFD, /* NOT USED */
    /* 0x90 */ 0xFFFD, /* NOT USED */
    /* 0x91 */ 0x2018, /* LEFT SINGLE QUOTATION MARK */
    /* 0x92 */ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
    /* 0x93 */ 0x201C, /* LEFT DOUBLE QUOTATION MARK */
    /* 0x94 */ 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
    /* 0x95 */ 0x2022, /* BULLET */
    /* 0x96 */ 0x2013, /* EN DASH */
    /* 0x97 */ 0x2014, /* EM DASH */
    /* 0x98 */ 0x02DC, /* SMALL TILDE */
    /* 0x99 */ 0x2122, /* TRADE MARK SIGN */
    /* 0x9A */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
    /* 0x9B */ 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
    /* 0x9C */ 0x0153, /* LATIN SMALL LIGATURE OE */
    /* 0x9D */ 0xFFFD, /* NOT USED */
    /* 0x9E */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
    /* 0x9F */ 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
};
521 | ||
bd5b749c | 522 | static bool __CFFromWinLatin1(uint32_t flags, uint8_t byte, UniChar *character) { |
9ce05555 A |
523 | *character = (byte < 0x80 || byte > 0x9F ? (UniChar)byte : cp1252_to_uni[byte - 0x80]); |
524 | return (*character != 0xFFFD); | |
525 | } | |
526 | ||
bd5b749c | 527 | static CFIndex __CFToWinLatin1Precompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
9ce05555 | 528 | uint8_t byte; |
bd5b749c | 529 | CFIndex usedCharLen; |
9ce05555 A |
530 | |
531 | if (__CFToWinLatin1(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) { | |
532 | if (maxByteLen) *bytes = byte; | |
533 | *usedByteLen = 1; | |
534 | return usedCharLen; | |
535 | } else { | |
536 | return 0; | |
537 | } | |
538 | } | |
539 | ||
d8925383 | 540 | __private_extern__ const CFStringEncodingConverter __CFConverterWinLatin1 = { |
9ce05555 A |
541 | __CFToWinLatin1, __CFFromWinLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit, |
542 | NULL, NULL, NULL, NULL, __CFToWinLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1, | |
543 | }; | |
544 | ||
545 | /* NEXTSTEP Encoding */ | |
/*
 * Unicode -> NEXTSTEP encoding table, used by __CFToNextStepLatin for
 * characters at or above 0x80. Each entry is { Unicode value, NEXTSTEP byte }.
 *
 * Entries are listed in ascending Unicode order; presumably the lookup in
 * CFStringEncodingUnicodeTo8BitEncoding relies on that ordering (confirm
 * before inserting rows out of order). NUM_NEXTSTEP_FROM_UNI must equal the
 * number of active rows; the commented-out U+00AD row is not counted.
 * The last row maps U+FFFD to byte 0xFF.
 */
bd5b749c | 546 | #define NUM_NEXTSTEP_FROM_UNI 127
9ce05555 A |
547 | |
548 | static const CFStringEncodingUnicodeTo8BitCharMap nextstep_from_tab[NUM_NEXTSTEP_FROM_UNI] = { | |
549 | { 0x00a0, 0x80 }, | |
550 | { 0x00a1, 0xa1 }, | |
551 | { 0x00a2, 0xa2 }, | |
552 | { 0x00a3, 0xa3 }, | |
553 | { 0x00a4, 0xa8 }, | |
554 | { 0x00a5, 0xa5 }, | |
555 | { 0x00a6, 0xb5 }, | |
556 | { 0x00a7, 0xa7 }, | |
557 | { 0x00a8, 0xc8 }, | |
558 | { 0x00a9, 0xa0 }, | |
559 | { 0x00aa, 0xe3 }, | |
560 | { 0x00ab, 0xab }, | |
561 | { 0x00ac, 0xbe }, | |
562 | /* { 0x00ad, 0x2d }, <= 96/10/25 rick removed; converts soft-hyphen to hyphen! */ | |
563 | { 0x00ae, 0xb0 }, | |
564 | { 0x00af, 0xc5 }, | |
565 | { 0x00b1, 0xd1 }, | |
566 | { 0x00b2, 0xc9 }, | |
567 | { 0x00b3, 0xcc }, | |
568 | { 0x00b4, 0xc2 }, | |
569 | { 0x00b5, 0x9d }, | |
570 | { 0x00b6, 0xb6 }, | |
571 | { 0x00b7, 0xb4 }, | |
572 | { 0x00b8, 0xcb }, | |
573 | { 0x00b9, 0xc0 }, | |
574 | { 0x00ba, 0xeb }, | |
575 | { 0x00bb, 0xbb }, | |
576 | { 0x00bc, 0xd2 }, | |
577 | { 0x00bd, 0xd3 }, | |
578 | { 0x00be, 0xd4 }, | |
579 | { 0x00bf, 0xbf }, | |
580 | { 0x00c0, 0x81 }, | |
581 | { 0x00c1, 0x82 }, | |
582 | { 0x00c2, 0x83 }, | |
583 | { 0x00c3, 0x84 }, | |
584 | { 0x00c4, 0x85 }, | |
585 | { 0x00c5, 0x86 }, | |
586 | { 0x00c6, 0xe1 }, | |
587 | { 0x00c7, 0x87 }, | |
588 | { 0x00c8, 0x88 }, | |
589 | { 0x00c9, 0x89 }, | |
590 | { 0x00ca, 0x8a }, | |
591 | { 0x00cb, 0x8b }, | |
592 | { 0x00cc, 0x8c }, | |
593 | { 0x00cd, 0x8d }, | |
594 | { 0x00ce, 0x8e }, | |
595 | { 0x00cf, 0x8f }, | |
596 | { 0x00d0, 0x90 }, | |
597 | { 0x00d1, 0x91 }, | |
598 | { 0x00d2, 0x92 }, | |
599 | { 0x00d3, 0x93 }, | |
600 | { 0x00d4, 0x94 }, | |
601 | { 0x00d5, 0x95 }, | |
602 | { 0x00d6, 0x96 }, | |
603 | { 0x00d7, 0x9e }, | |
604 | { 0x00d8, 0xe9 }, | |
605 | { 0x00d9, 0x97 }, | |
606 | { 0x00da, 0x98 }, | |
607 | { 0x00db, 0x99 }, | |
608 | { 0x00dc, 0x9a }, | |
609 | { 0x00dd, 0x9b }, | |
610 | { 0x00de, 0x9c }, | |
611 | { 0x00df, 0xfb }, | |
612 | { 0x00e0, 0xd5 }, | |
613 | { 0x00e1, 0xd6 }, | |
614 | { 0x00e2, 0xd7 }, | |
615 | { 0x00e3, 0xd8 }, | |
616 | { 0x00e4, 0xd9 }, | |
617 | { 0x00e5, 0xda }, | |
618 | { 0x00e6, 0xf1 }, | |
619 | { 0x00e7, 0xdb }, | |
620 | { 0x00e8, 0xdc }, | |
621 | { 0x00e9, 0xdd }, | |
622 | { 0x00ea, 0xde }, | |
623 | { 0x00eb, 0xdf }, | |
624 | { 0x00ec, 0xe0 }, | |
625 | { 0x00ed, 0xe2 }, | |
626 | { 0x00ee, 0xe4 }, | |
627 | { 0x00ef, 0xe5 }, | |
628 | { 0x00f0, 0xe6 }, | |
629 | { 0x00f1, 0xe7 }, | |
630 | { 0x00f2, 0xec }, | |
631 | { 0x00f3, 0xed }, | |
632 | { 0x00f4, 0xee }, | |
633 | { 0x00f5, 0xef }, | |
634 | { 0x00f6, 0xf0 }, | |
635 | { 0x00f7, 0x9f }, | |
636 | { 0x00f8, 0xf9 }, | |
637 | { 0x00f9, 0xf2 }, | |
638 | { 0x00fa, 0xf3 }, | |
639 | { 0x00fb, 0xf4 }, | |
640 | { 0x00fc, 0xf6 }, | |
641 | { 0x00fd, 0xf7 }, | |
642 | { 0x00fe, 0xfc }, | |
643 | { 0x00ff, 0xfd }, | |
644 | { 0x0131, 0xf5 }, | |
645 | { 0x0141, 0xe8 }, | |
646 | { 0x0142, 0xf8 }, | |
647 | { 0x0152, 0xea }, | |
648 | { 0x0153, 0xfa }, | |
649 | { 0x0192, 0xa6 }, | |
650 | { 0x02c6, 0xc3 }, | |
651 | { 0x02c7, 0xcf }, | |
652 | { 0x02cb, 0xc1 }, | |
653 | { 0x02d8, 0xc6 }, | |
654 | { 0x02d9, 0xc7 }, | |
655 | { 0x02da, 0xca }, | |
656 | { 0x02db, 0xce }, | |
657 | { 0x02dc, 0xc4 }, | |
658 | { 0x02dd, 0xcd }, | |
659 | { 0x2013, 0xb1 }, | |
660 | { 0x2014, 0xd0 }, | |
661 | { 0x2019, 0xa9 }, | |
662 | { 0x201a, 0xb8 }, | |
663 | { 0x201c, 0xaa }, | |
664 | { 0x201d, 0xba }, | |
665 | { 0x201e, 0xb9 }, | |
666 | { 0x2020, 0xb2 }, | |
667 | { 0x2021, 0xb3 }, | |
668 | { 0x2022, 0xb7 }, | |
669 | { 0x2026, 0xbc }, | |
9ce05555 A |
670 | { 0x2030, 0xbd },
671 | { 0x2039, 0xac }, | |
672 | { 0x203a, 0xad }, | |
673 | { 0x2044, 0xa4 }, | |
674 | { 0xfb01, 0xae }, | |
675 | { 0xfb02, 0xaf }, | |
676 | { 0xfffd, 0xff }, | |
677 | }; | |
678 | ||
bd5b749c | 679 | static bool __CFToNextStepLatin(uint32_t flags, UniChar character, uint8_t *byte) { |
9ce05555 A |
680 | if (character < 0x80) { |
681 | *byte = (uint8_t)character; | |
682 | return true; | |
bd5b749c A |
683 | } else if (__CFIsParagraphSeparator(character)) { |
684 | *byte = ASCIINewLine; | |
685 | return true; | |
9ce05555 A |
686 | } else { |
687 | return CFStringEncodingUnicodeTo8BitEncoding(nextstep_from_tab, NUM_NEXTSTEP_FROM_UNI, character, byte); | |
688 | } | |
689 | }; | |
690 | ||
/*
 * Maps the upper half of the NextStep Latin encoding to precomposed Unicode.
 * The table is indexed by (byte - 0x80), so entry 0 corresponds to byte 128
 * and entry 127 to byte 255; the number in each row's comment is the byte
 * value.  0xFFFD marks bytes with no assignment: __CFFromNextStepLatin
 * reports failure when it reads that value.
 */
static const UniChar NSToPrecompUnicodeTable[128] = {
    /* NextStep Encoding Unicode */
    /* 128 figspace */ 0x00a0, /* 0x2007 is fig space */
    /* 129 Agrave */ 0x00c0,
    /* 130 Aacute */ 0x00c1,
    /* 131 Acircumflex */ 0x00c2,
    /* 132 Atilde */ 0x00c3,
    /* 133 Adieresis */ 0x00c4,
    /* 134 Aring */ 0x00c5,
    /* 135 Ccedilla */ 0x00c7,
    /* 136 Egrave */ 0x00c8,
    /* 137 Eacute */ 0x00c9,
    /* 138 Ecircumflex */ 0x00ca,
    /* 139 Edieresis */ 0x00cb,
    /* 140 Igrave */ 0x00cc,
    /* 141 Iacute */ 0x00cd,
    /* 142 Icircumflex */ 0x00ce,
    /* 143 Idieresis */ 0x00cf,
    /* 144 Eth */ 0x00d0,
    /* 145 Ntilde */ 0x00d1,
    /* 146 Ograve */ 0x00d2,
    /* 147 Oacute */ 0x00d3,
    /* 148 Ocircumflex */ 0x00d4,
    /* 149 Otilde */ 0x00d5,
    /* 150 Odieresis */ 0x00d6,
    /* 151 Ugrave */ 0x00d9,
    /* 152 Uacute */ 0x00da,
    /* 153 Ucircumflex */ 0x00db,
    /* 154 Udieresis */ 0x00dc,
    /* 155 Yacute */ 0x00dd,
    /* 156 Thorn */ 0x00de,
    /* 157 mu */ 0x00b5,
    /* 158 multiply */ 0x00d7,
    /* 159 divide */ 0x00f7,
    /* 160 copyright */ 0x00a9,
    /* 161 exclamdown */ 0x00a1,
    /* 162 cent */ 0x00a2,
    /* 163 sterling */ 0x00a3,
    /* 164 fraction */ 0x2044,
    /* 165 yen */ 0x00a5,
    /* 166 florin */ 0x0192,
    /* 167 section */ 0x00a7,
    /* 168 currency */ 0x00a4,
    /* 169 quotesingle */ 0x2019,
    /* 170 quotedblleft */ 0x201c,
    /* 171 guillemotleft */ 0x00ab,
    /* 172 guilsinglleft */ 0x2039,
    /* 173 guilsinglright */ 0x203a,
    /* 174 fi */ 0xFB01,
    /* 175 fl */ 0xFB02,
    /* 176 registered */ 0x00ae,
    /* 177 endash */ 0x2013,
    /* 178 dagger */ 0x2020,
    /* 179 daggerdbl */ 0x2021,
    /* 180 periodcentered */ 0x00b7,
    /* 181 brokenbar */ 0x00a6,
    /* 182 paragraph */ 0x00b6,
    /* 183 bullet */ 0x2022,
    /* 184 quotesinglbase */ 0x201a,
    /* 185 quotedblbase */ 0x201e,
    /* 186 quotedblright */ 0x201d,
    /* 187 guillemotright */ 0x00bb,
    /* 188 ellipsis */ 0x2026,
    /* 189 perthousand */ 0x2030,
    /* 190 logicalnot */ 0x00ac,
    /* 191 questiondown */ 0x00bf,
    /* 192 onesuperior */ 0x00b9,
    /* 193 grave */ 0x02cb,
    /* 194 acute */ 0x00b4,
    /* 195 circumflex */ 0x02c6,
    /* 196 tilde */ 0x02dc,
    /* 197 macron */ 0x00af,
    /* 198 breve */ 0x02d8,
    /* 199 dotaccent */ 0x02d9,
    /* 200 dieresis */ 0x00a8,
    /* 201 twosuperior */ 0x00b2,
    /* 202 ring */ 0x02da,
    /* 203 cedilla */ 0x00b8,
    /* 204 threesuperior */ 0x00b3,
    /* 205 hungarumlaut */ 0x02dd,
    /* 206 ogonek */ 0x02db,
    /* 207 caron */ 0x02c7,
    /* 208 emdash */ 0x2014,
    /* 209 plusminus */ 0x00b1,
    /* 210 onequarter */ 0x00bc,
    /* 211 onehalf */ 0x00bd,
    /* 212 threequarters */ 0x00be,
    /* 213 agrave */ 0x00e0,
    /* 214 aacute */ 0x00e1,
    /* 215 acircumflex */ 0x00e2,
    /* 216 atilde */ 0x00e3,
    /* 217 adieresis */ 0x00e4,
    /* 218 aring */ 0x00e5,
    /* 219 ccedilla */ 0x00e7,
    /* 220 egrave */ 0x00e8,
    /* 221 eacute */ 0x00e9,
    /* 222 ecircumflex */ 0x00ea,
    /* 223 edieresis */ 0x00eb,
    /* 224 igrave */ 0x00ec,
    /* 225 AE */ 0x00c6,
    /* 226 iacute */ 0x00ed,
    /* 227 ordfeminine */ 0x00aa,
    /* 228 icircumflex */ 0x00ee,
    /* 229 idieresis */ 0x00ef,
    /* 230 eth */ 0x00f0,
    /* 231 ntilde */ 0x00f1,
    /* 232 Lslash */ 0x0141,
    /* 233 Oslash */ 0x00d8,
    /* 234 OE */ 0x0152,
    /* 235 ordmasculine */ 0x00ba,
    /* 236 ograve */ 0x00f2,
    /* 237 oacute */ 0x00f3,
    /* 238 ocircumflex */ 0x00f4,
    /* 239 otilde */ 0x00f5,
    /* 240 odieresis */ 0x00f6,
    /* 241 ae */ 0x00e6,
    /* 242 ugrave */ 0x00f9,
    /* 243 uacute */ 0x00fa,
    /* 244 ucircumflex */ 0x00fb,
    /* 245 dotlessi */ 0x0131,
    /* 246 udieresis */ 0x00fc,
    /* 247 yacute */ 0x00fd,
    /* 248 lslash */ 0x0142,
    /* 249 oslash */ 0x00f8,
    /* 250 oe */ 0x0153,
    /* 251 germandbls */ 0x00df,
    /* 252 thorn */ 0x00fe,
    /* 253 ydieresis */ 0x00ff,
    /* 254 .notdef */ 0xFFFD,
    /* 255 .notdef */ 0xFFFD
};
822 | ||
bd5b749c | 823 | static bool __CFFromNextStepLatin(uint32_t flags, uint8_t byte, UniChar *character) { |
9ce05555 A |
824 | return ((*character = (byte < 0x80 ? (UniChar)byte : NSToPrecompUnicodeTable[byte - 0x80])) != 0xFFFD); |
825 | } | |
826 | ||
bd5b749c | 827 | static CFIndex __CFToNextStepLatinPrecompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { |
9ce05555 | 828 | uint8_t byte; |
bd5b749c | 829 | CFIndex usedCharLen; |
9ce05555 A |
830 | |
831 | if (__CFToNextStepLatin(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) { | |
832 | if (maxByteLen) *bytes = byte; | |
833 | *usedByteLen = 1; | |
834 | return usedCharLen; | |
835 | } else { | |
836 | return 0; | |
837 | } | |
838 | } | |
839 | ||
/*
 * Converter definition for the NextStep Latin encoding.  The initializer is
 * positional: it supplies the per-character to/from routines defined above,
 * the precompose routine, and the Latin-1 combining-character predicate; the
 * NULL slots are left unimplemented.
 * NOTE(review): the two `1` values are presumably the maximum bytes per
 * character and characters per byte -- confirm against the
 * CFStringEncodingConverter declaration.
 */
__private_extern__ const CFStringEncodingConverter __CFConverterNextStepLatin = {
    __CFToNextStepLatin, __CFFromNextStepLatin, 1, 1, kCFStringEncodingConverterCheapEightBit,
    NULL, NULL, NULL, NULL, __CFToNextStepLatinPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
};
844 | ||
845 | /* UTF8 */ | |
846 | /* | |
847 | * Copyright 2001 Unicode, Inc. | |
848 | * | |
849 | * Disclaimer | |
850 | * | |
851 | * This source code is provided as is by Unicode, Inc. No claims are | |
852 | * made as to fitness for any particular purpose. No warranties of any | |
853 | * kind are expressed or implied. The recipient agrees to determine | |
854 | * applicability of information provided. If this file has been | |
855 | * purchased on magnetic or optical media from Unicode, Inc., the | |
856 | * sole remedy for any claim will be exchange of defective media | |
857 | * within 90 days of receipt. | |
858 | * | |
859 | * Limitations on Rights to Redistribute This Code | |
860 | * | |
861 | * Unicode, Inc. hereby grants the right to freely use the information | |
862 | * supplied in this file in the creation of products supporting the | |
863 | * Unicode Standard, and to make copies of this file in any form | |
864 | * for internal or external distribution as long as this notice | |
865 | * remains attached. | |
866 | */ | |
867 | ||
bd5b749c A |
/* Code point limits used by the UTF-8 converter. */
static const uint32_t kReplacementCharacter = 0xFFFDU;      /* U+FFFD */
static const uint32_t kMaximumUCS2          = 0xFFFFU;      /* last value that fits one UTF-16 unit */
static const uint32_t kMaximumUTF16         = 0x10FFFFU;    /* last value reachable with a surrogate pair */
static const uint32_t kMaximumUCS4          = 0x7FFFFFFFU;  /* last value the 6-byte form can hold */

/* Surrogate-pair arithmetic: each half carries 10 bits above 0x10000. */
static const int      halfShift = 10;
static const uint32_t halfBase  = 0x10000U;
static const uint32_t halfMask  = 0x3FFU;

/* Inclusive bounds of the high and low surrogate ranges. */
static const uint32_t kSurrogateHighStart = 0xD800U;
static const uint32_t kSurrogateHighEnd   = 0xDBFFU;
static const uint32_t kSurrogateLowStart  = 0xDC00U;
static const uint32_t kSurrogateLowEnd    = 0xDFFFU;
9ce05555 A |
880 | |
/*
 * Number of trailing bytes expected after a given first byte of a UTF-8
 * sequence.  Index with the first byte; the total sequence length is the
 * entry plus one.  Each row below covers sixteen consecutive byte values.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
895 | ||
/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence.
 *
 * The table is indexed by the number of trailing bytes.  The decoders in
 * this file add up the raw bytes of a sequence, shifting the running total
 * left by 6 between bytes, and then subtract the matching entry; that one
 * subtraction removes the marker bits of the first byte and of every
 * trailing byte.  For example entry 1 is (0xC0 << 6) + 0x80 and entry 2 is
 * (0xE0 << 12) + (0x80 << 6) + 0x80.
 */
static const UTF32Char offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL };
903 | ||
/*
 * Marker bits OR'ed into the first byte of an encoded sequence, indexed by
 * the total length of the sequence in bytes (see __CFToUTF8Core, which only
 * ever indexes this table with a length of 1 or more).
 */
static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
905 | ||
906 | /* This code is similar in effect to making successive calls on the mbtowc and wctomb routines in FSS-UTF. However, it is considerably different in code: | |
907 | * it is adapted to be consistent with UTF16, | |
908 | * constants have been gathered. | |
909 | * loops & conditionals have been removed as much as possible for | |
910 | * efficiency, in favor of drop-through switch statements. | |
911 | */ | |
912 | ||
bd5b749c | 913 | CF_INLINE uint16_t __CFUTF8BytesToWriteForCharacter(uint32_t ch) { |
9ce05555 A |
914 | if (ch < 0x80) return 1; |
915 | else if (ch < 0x800) return 2; | |
916 | else if (ch < 0x10000) return 3; | |
917 | else if (ch < 0x200000) return 4; | |
918 | else if (ch < 0x4000000) return 5; | |
919 | else if (ch <= kMaximumUCS4) return 6; | |
920 | else return 0; | |
921 | } | |
922 | ||
bd5b749c | 923 | CF_INLINE uint16_t __CFToUTF8Core(uint32_t ch, uint8_t *bytes, uint32_t maxByteLen) { |
9ce05555 | 924 | uint16_t bytesToWrite = __CFUTF8BytesToWriteForCharacter(ch); |
bd5b749c A |
925 | const uint32_t byteMask = 0xBF; |
926 | const uint32_t byteMark = 0x80; | |
9ce05555 A |
927 | |
928 | if (!bytesToWrite) { | |
929 | bytesToWrite = 2; | |
930 | ch = kReplacementCharacter; | |
931 | } | |
932 | ||
933 | if (maxByteLen < bytesToWrite) return 0; | |
934 | ||
935 | switch (bytesToWrite) { /* note: code falls through cases! */ | |
936 | case 6: bytes[5] = (ch | byteMark) & byteMask; ch >>= 6; | |
937 | case 5: bytes[4] = (ch | byteMark) & byteMask; ch >>= 6; | |
938 | case 4: bytes[3] = (ch | byteMark) & byteMask; ch >>= 6; | |
939 | case 3: bytes[2] = (ch | byteMark) & byteMask; ch >>= 6; | |
940 | case 2: bytes[1] = (ch | byteMark) & byteMask; ch >>= 6; | |
941 | case 1: bytes[0] = ch | firstByteMark[bytesToWrite]; | |
942 | } | |
943 | return bytesToWrite; | |
944 | } | |
945 | ||
/*
 * Converts a run of UTF-16 units to UTF-8.
 *
 *   flags        - kCFStringEncodingUseHFSPlusCanonical turns off the strict
 *                  surrogate checks; no other flag is consulted here.
 *   characters   - source UTF-16 units.
 *   numChars     - number of source units.
 *   bytes        - destination buffer; never written when maxByteLen is 0.
 *   maxByteLen   - destination capacity; 0 selects counting mode, where only
 *                  the required length is computed.
 *   usedByteLen  - optional; receives the number of bytes produced (or that
 *                  would be produced in counting mode).
 *
 * Returns the number of source units consumed.  Conversion stops early when
 * the destination is full or, in strict mode, at an unpaired surrogate; in
 * both cases the unit(s) that could not be converted are not counted.
 */
static CFIndex __CFToUTF8(uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
    uint16_t bytesWritten;
    uint32_t ch;
    const UniChar *beginCharacter = characters;
    const UniChar *endCharacter = characters + numChars;
    const uint8_t *beginBytes = bytes;
    const uint8_t *endBytes = bytes + maxByteLen;
    bool isStrict = (flags & kCFStringEncodingUseHFSPlusCanonical ? false : true);

    while ((characters < endCharacter) && (!maxByteLen || (bytes < endBytes))) {
        ch = *(characters++);

        if (ch < 0x80) { // ASCII
            if (maxByteLen) *bytes = ch;
            ++bytes;
        } else {
            if (ch >= kSurrogateHighStart) {
                if (ch <= kSurrogateHighEnd) {
                    // High surrogate: combine it with a following low surrogate when there is one
                    if ((characters < endCharacter) && ((*characters >= kSurrogateLowStart) && (*characters <= kSurrogateLowEnd))) {
                        ch = ((ch - kSurrogateHighStart) << halfShift) + (*(characters++) - kSurrogateLowStart) + halfBase;
                    } else if (isStrict) {
                        // Unpaired high surrogate: un-read it and stop
                        --characters;
                        break;
                    }
                } else if (isStrict && (ch <= kSurrogateLowEnd)) {
                    // Low surrogate with no preceding high surrogate: un-read it and stop
                    --characters;
                    break;
                }
            }

            // Encode into the buffer, or just measure when in counting mode
            if (!(bytesWritten = (maxByteLen ? __CFToUTF8Core(ch, bytes, endBytes - bytes) : __CFUTF8BytesToWriteForCharacter(ch)))) {
                // Nothing was produced: un-read the one or two units that made up ch
                characters -= (ch < 0x10000 ? 1 : 2);
                break;
            }
            bytes += bytesWritten;
        }
    }

    if (usedByteLen) *usedByteLen = bytes - beginBytes;
    return characters - beginCharacter;
}
987 | ||
988 | /* | |
989 | * Utility routine to tell whether a sequence of bytes is legal UTF-8. | |
990 | * This must be called with the length pre-determined by the first byte. | |
991 | * If not calling this from ConvertUTF8to*, then the length can be set by: | |
992 | * length = trailingBytesForUTF8[*source]+1; | |
993 | * and the sequence is illegal right away if there aren't that many bytes | |
994 | * available. | |
995 | * If presented with a length > 4, this returns false. The Unicode | |
996 | * definition of UTF-8 goes up to 4-byte sequences. | |
997 | */ | |
998 | ||
bd5b749c | 999 | CF_INLINE bool __CFIsLegalUTF8(const uint8_t *source, CFIndex length) { |
d8925383 A |
1000 | if (length > 4) return false; |
1001 | ||
1002 | const uint8_t *srcptr = source+length; | |
1003 | uint8_t head = *source; | |
1004 | ||
1005 | while (--srcptr > source) if ((*srcptr & 0xC0) != 0x80) return false; | |
1006 | ||
1007 | if (((head >= 0x80) && (head < 0xC2)) || (head > 0xF4)) return false; | |
1008 | ||
1009 | if (((head == 0xE0) && (*(source + 1) < 0xA0)) || ((head == 0xED) && (*(source + 1) > 0x9F)) || ((head == 0xF0) && (*(source + 1) < 0x90)) || ((head == 0xF4) && (*(source + 1) > 0x8F))) return false; | |
1010 | return true; | |
1011 | } | |
1012 | ||
bd5b749c | 1013 | static CFIndex __CFFromUTF8(uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { |
9ce05555 A |
1014 | const uint8_t *source = bytes; |
1015 | uint16_t extraBytesToRead; | |
bd5b749c A |
1016 | CFIndex theUsedCharLen = 0; |
1017 | uint32_t ch; | |
1018 | bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); | |
1019 | bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false); | |
1020 | bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true); | |
9ce05555 | 1021 | UTF32Char decomposed[MAX_DECOMPOSED_LENGTH]; |
bd5b749c | 1022 | CFIndex decompLength; |
9ce05555 A |
1023 | bool isStrict = !isHFSPlus; |
1024 | ||
1025 | while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) { | |
1026 | extraBytesToRead = trailingBytesForUTF8[*source]; | |
1027 | ||
1028 | if (extraBytesToRead > --numBytes) break; | |
1029 | numBytes -= extraBytesToRead; | |
1030 | ||
1031 | /* Do this check whether lenient or strict */ | |
1032 | // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps | |
1033 | // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release | |
d8925383 | 1034 | if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) { |
9ce05555 A |
1035 | if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) { |
1036 | numBytes += extraBytesToRead; | |
1037 | ++source; | |
1038 | if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter; | |
1039 | ++theUsedCharLen; | |
1040 | continue; | |
1041 | } else { | |
1042 | break; | |
1043 | } | |
1044 | } | |
1045 | ||
1046 | ch = 0; | |
1047 | /* | |
1048 | * The cases all fall through. See "Note A" below. | |
1049 | */ | |
1050 | switch (extraBytesToRead) { | |
1051 | case 3: ch += *source++; ch <<= 6; | |
1052 | case 2: ch += *source++; ch <<= 6; | |
1053 | case 1: ch += *source++; ch <<= 6; | |
1054 | case 0: ch += *source++; | |
1055 | } | |
1056 | ch -= offsetsFromUTF8[extraBytesToRead]; | |
1057 | ||
1058 | if (ch <= kMaximumUCS2) { | |
1059 | if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) { | |
1060 | source -= (extraBytesToRead + 1); | |
1061 | break; | |
1062 | } | |
1063 | if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) { | |
1064 | decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH); | |
1065 | ||
1066 | if (maxCharLen) { | |
bd5b749c | 1067 | if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) break; |
9ce05555 A |
1068 | } else { |
1069 | theUsedCharLen += decompLength; | |
1070 | } | |
1071 | } else { | |
1072 | if (maxCharLen) *(characters++) = (UTF16Char)ch; | |
1073 | ++theUsedCharLen; | |
1074 | } | |
1075 | } else if (ch > kMaximumUTF16) { | |
1076 | if (isStrict) { | |
1077 | source -= (extraBytesToRead + 1); | |
1078 | break; | |
1079 | } | |
1080 | if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter; | |
1081 | ++theUsedCharLen; | |
1082 | } else { | |
1083 | if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) { | |
1084 | decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH); | |
1085 | ||
1086 | if (maxCharLen) { | |
bd5b749c | 1087 | if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) break; |
9ce05555 A |
1088 | } else { |
1089 | while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2); | |
1090 | } | |
1091 | } else { | |
1092 | if (maxCharLen) { | |
1093 | if ((theUsedCharLen + 2) > maxCharLen) break; | |
1094 | ch -= halfBase; | |
1095 | *(characters++) = (ch >> halfShift) + kSurrogateHighStart; | |
1096 | *(characters++) = (ch & halfMask) + kSurrogateLowStart; | |
1097 | } | |
1098 | theUsedCharLen += 2; | |
1099 | } | |
1100 | } | |
1101 | } | |
1102 | ||
1103 | if (usedCharLen) *usedCharLen = theUsedCharLen; | |
1104 | ||
1105 | return source - bytes; | |
1106 | } | |
1107 | ||
bd5b749c A |
1108 | static CFIndex __CFToUTF8Len(uint32_t flags, const UniChar *characters, CFIndex numChars) { |
1109 | uint32_t bytesToWrite = 0; | |
1110 | uint32_t ch; | |
9ce05555 A |
1111 | |
1112 | while (numChars) { | |
1113 | ch = *characters++; | |
1114 | numChars--; | |
1115 | if ((ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd) && numChars && (*characters >= kSurrogateLowStart && *characters <= kSurrogateLowEnd)) { | |
1116 | ch = ((ch - kSurrogateHighStart) << halfShift) + (*characters++ - kSurrogateLowStart) + halfBase; | |
1117 | numChars--; | |
1118 | } | |
1119 | bytesToWrite += __CFUTF8BytesToWriteForCharacter(ch); | |
1120 | } | |
1121 | ||
1122 | return bytesToWrite; | |
1123 | } | |
1124 | ||
/*
 * Computes how many UTF-16 units the UTF-8 input would convert to, without
 * producing any output.  The flags are interpreted the same way as in
 * __CFFromUTF8.  Counting stops at a truncated sequence, at a rejected
 * sequence that is not replaced, or (unless the HFS+ flag is set) at an
 * encoded surrogate.
 *
 *   flags     - conversion flags, see above.
 *   source    - UTF-8 bytes to measure.
 *   numBytes  - number of bytes at source.
 *
 * Returns the number of UTF-16 units counted up to the point where counting
 * stopped.
 */
static CFIndex __CFFromUTF8Len(uint32_t flags, const uint8_t *source, CFIndex numBytes) {
    uint16_t extraBytesToRead;
    CFIndex theUsedCharLen = 0;
    uint32_t ch;
    bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
    bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
    bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
    UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
    CFIndex decompLength;
    bool isStrict = !isHFSPlus;

    while (numBytes) {
        extraBytesToRead = trailingBytesForUTF8[*source];

        /* Stop if the sequence runs past the end of the input. */
        if (extraBytesToRead > --numBytes) break;
        numBytes -= extraBytesToRead;

        /* Do this check whether lenient or strict */
        // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
        // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
        if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) {
            if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
                /* Count one unit for the first byte and hand the trailing bytes back. */
                numBytes += extraBytesToRead;
                ++source;
                ++theUsedCharLen;
                continue;
            } else {
                break;
            }
        }


        ch = 0;
        /*
         * Accumulate the raw bytes; every case falls through to the next.
         * The marker bits are removed afterwards via offsetsFromUTF8.
         */
        switch (extraBytesToRead) {
            case 3: ch += *source++; ch <<= 6;
            case 2: ch += *source++; ch <<= 6;
            case 1: ch += *source++; ch <<= 6;
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (ch <= kMaximumUCS2) {
            /* No rewind needed here: only the running count is returned. */
            if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
                break;
            }
            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
                theUsedCharLen += decompLength;
            } else {
                ++theUsedCharLen;
            }
        } else if (ch > kMaximumUTF16) {
            /* Counted as one unit regardless of isStrict. */
            ++theUsedCharLen;
        } else {
            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
                /* Each decomposed value at or above 0x10000 needs a surrogate pair. */
                while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
            } else {
                theUsedCharLen += 2;
            }
        }
    }

    return theUsedCharLen;
}
1193 | ||
d8925383 A |
/*
 * Converter definition for UTF-8.  The initializer is positional: it
 * supplies the buffer conversion routines and the two length-estimation
 * routines defined above; the remaining slots are NULL.
 * NOTE(review): `3, 2` are presumably the maximum bytes per UTF-16 unit and
 * the maximum UTF-16 units per byte sequence -- confirm against the
 * CFStringEncodingConverter declaration.
 */
__private_extern__ const CFStringEncodingConverter __CFConverterUTF8 = {
    __CFToUTF8, __CFFromUTF8, 3, 2, kCFStringEncodingConverterStandard,
    __CFToUTF8Len, __CFFromUTF8Len, NULL, NULL, NULL, NULL,
};