2 * Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
21 * @APPLE_LICENSE_HEADER_END@
23 /* CFBuiltinConverters.c
24 Copyright 1999-2002, Apple, Inc. All rights reserved.
25 Responsibility: Aki Inoue
28 #include "CFStringEncodingConverterExt.h"
29 #include "CFUniChar.h"
30 #include "CFUnicodeDecomposition.h"
31 #include "CFUnicodePrecomposition.h"
32 #include "CFStringEncodingConverterPriv.h"
33 #include "CFInternal.h"
/* Unicode PARAGRAPH SEPARATOR (U+2029); the 8-bit encoders in this file special-case it. */
#define ParagraphSeparator 0x2029
/* ASCII line feed: the byte written in place of ParagraphSeparator. */
#define ASCIINewLine 0x0a
39 static const UInt32 __CFLatin1CombiningCharBitmap
[] = { // 0x300 ~ 0x35FF
40 0xFBB94010, 0x01800000, 0x0000000,
43 Boolean
CFStringEncodingIsValidCombiningCharacterForLatin1(UniChar character
) {
44 return ((character
>= 0x300) && (character
< 0x360) && (__CFLatin1CombiningCharBitmap
[(character
- 0x300) / 32] & (1 << (31 - ((character
- 0x300) % 32)))) ? true : false);
47 UniChar
CFStringEncodingPrecomposeLatinCharacter(const UniChar
*character
, UInt32 numChars
, UInt32
*usedChars
) {
49 UTF32Char ch
= *(character
++), nextCh
, composedChar
;
50 UInt32 usedCharLen
= 1;
52 if (CFUniCharIsSurrogateHighCharacter(ch
) || CFUniCharIsSurrogateLowCharacter(ch
)) {
53 if (usedChars
) (*usedChars
) = usedCharLen
;
57 while (usedCharLen
< numChars
) {
58 nextCh
= *(character
++);
60 if (CFUniCharIsSurrogateHighCharacter(nextCh
) || CFUniCharIsSurrogateLowCharacter(nextCh
)) break;
62 if (CFUniCharIsMemberOf(nextCh
, kCFUniCharNonBaseCharacterSet
) && ((composedChar
= CFUniCharPrecomposeCharacter(ch
, nextCh
)) != 0xFFFD)) {
63 if (composedChar
> 0xFFFF) { // Non-base
73 if (usedChars
) (*usedChars
) = usedCharLen
;
80 static Boolean
__CFToASCII(UInt32 flags
, UniChar character
, uint8_t *byte
) {
81 if (character
< 0x80) {
82 *byte
= (uint8_t)character
;
83 } else if (character
== ParagraphSeparator
) {
91 static Boolean
__CFFromASCII(UInt32 flags
, uint8_t byte
, UniChar
*character
) {
93 *character
= (UniChar
)byte
;
101 __private_extern__
const CFStringEncodingConverter __CFConverterASCII
= {
102 __CFToASCII
, __CFFromASCII
, 1, 1, kCFStringEncodingConverterCheapEightBit
,
103 NULL
, NULL
, NULL
, NULL
, NULL
, NULL
,
106 /* ISO Latin 1 (8859-1) */
107 static Boolean
__CFToISOLatin1(UInt32 flags
, UniChar character
, uint8_t *byte
) {
108 if (character
<= 0xFF) {
109 *byte
= (uint8_t)character
;
110 } else if (character
== ParagraphSeparator
) {
111 *byte
= ASCIINewLine
;
119 static Boolean
__CFFromISOLatin1(UInt32 flags
, uint8_t byte
, UniChar
*character
) {
120 *character
= (UniChar
)byte
;
124 static UInt32
__CFToISOLatin1Precompose(UInt32 flags
, const UniChar
*character
, UInt32 numChars
, uint8_t *bytes
, UInt32 maxByteLen
, UInt32
*usedByteLen
) {
128 if (__CFToISOLatin1(flags
, CFStringEncodingPrecomposeLatinCharacter(character
, numChars
, &usedCharLen
), &byte
) && byte
&& (usedCharLen
> 1)) {
129 if (maxByteLen
) *bytes
= byte
;
137 __private_extern__
const CFStringEncodingConverter __CFConverterISOLatin1
= {
138 __CFToISOLatin1
, __CFFromISOLatin1
, 1, 1, kCFStringEncodingConverterCheapEightBit
,
139 NULL
, NULL
, NULL
, NULL
, __CFToISOLatin1Precompose
, CFStringEncodingIsValidCombiningCharacterForLatin1
,
143 #define NUM_MACROMAN_FROM_UNI 129
144 static const CFStringEncodingUnicodeTo8BitCharMap macRoman_from_uni
[NUM_MACROMAN_FROM_UNI
] = {
145 { 0x00A0, 0xCA }, /* NO-BREAK SPACE */
146 { 0x00A1, 0xC1 }, /* INVERTED EXCLAMATION MARK */
147 { 0x00A2, 0xA2 }, /* CENT SIGN */
148 { 0x00A3, 0xA3 }, /* POUND SIGN */
149 { 0x00A5, 0xB4 }, /* YEN SIGN */
150 { 0x00A7, 0xA4 }, /* SECTION SIGN */
151 { 0x00A8, 0xAC }, /* DIAERESIS */
152 { 0x00A9, 0xA9 }, /* COPYRIGHT SIGN */
153 { 0x00AA, 0xBB }, /* FEMININE ORDINAL INDICATOR */
154 { 0x00AB, 0xC7 }, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
155 { 0x00AC, 0xC2 }, /* NOT SIGN */
156 { 0x00AE, 0xA8 }, /* REGISTERED SIGN */
157 { 0x00AF, 0xF8 }, /* MACRON */
158 { 0x00B0, 0xA1 }, /* DEGREE SIGN */
159 { 0x00B1, 0xB1 }, /* PLUS-MINUS SIGN */
160 { 0x00B4, 0xAB }, /* ACUTE ACCENT */
161 { 0x00B5, 0xB5 }, /* MICRO SIGN */
162 { 0x00B6, 0xA6 }, /* PILCROW SIGN */
163 { 0x00B7, 0xE1 }, /* MIDDLE DOT */
164 { 0x00B8, 0xFC }, /* CEDILLA */
165 { 0x00BA, 0xBC }, /* MASCULINE ORDINAL INDICATOR */
166 { 0x00BB, 0xC8 }, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
167 { 0x00BF, 0xC0 }, /* INVERTED QUESTION MARK */
168 { 0x00C0, 0xCB }, /* LATIN CAPITAL LETTER A WITH GRAVE */
169 { 0x00C1, 0xE7 }, /* LATIN CAPITAL LETTER A WITH ACUTE */
170 { 0x00C2, 0xE5 }, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
171 { 0x00C3, 0xCC }, /* LATIN CAPITAL LETTER A WITH TILDE */
172 { 0x00C4, 0x80 }, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
173 { 0x00C5, 0x81 }, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
174 { 0x00C6, 0xAE }, /* LATIN CAPITAL LIGATURE AE */
175 { 0x00C7, 0x82 }, /* LATIN CAPITAL LETTER C WITH CEDILLA */
176 { 0x00C8, 0xE9 }, /* LATIN CAPITAL LETTER E WITH GRAVE */
177 { 0x00C9, 0x83 }, /* LATIN CAPITAL LETTER E WITH ACUTE */
178 { 0x00CA, 0xE6 }, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
179 { 0x00CB, 0xE8 }, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
180 { 0x00CC, 0xED }, /* LATIN CAPITAL LETTER I WITH GRAVE */
181 { 0x00CD, 0xEA }, /* LATIN CAPITAL LETTER I WITH ACUTE */
182 { 0x00CE, 0xEB }, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
183 { 0x00CF, 0xEC }, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
184 { 0x00D1, 0x84 }, /* LATIN CAPITAL LETTER N WITH TILDE */
185 { 0x00D2, 0xF1 }, /* LATIN CAPITAL LETTER O WITH GRAVE */
186 { 0x00D3, 0xEE }, /* LATIN CAPITAL LETTER O WITH ACUTE */
187 { 0x00D4, 0xEF }, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
188 { 0x00D5, 0xCD }, /* LATIN CAPITAL LETTER O WITH TILDE */
189 { 0x00D6, 0x85 }, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
190 { 0x00D8, 0xAF }, /* LATIN CAPITAL LETTER O WITH STROKE */
191 { 0x00D9, 0xF4 }, /* LATIN CAPITAL LETTER U WITH GRAVE */
192 { 0x00DA, 0xF2 }, /* LATIN CAPITAL LETTER U WITH ACUTE */
193 { 0x00DB, 0xF3 }, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
194 { 0x00DC, 0x86 }, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
195 { 0x00DF, 0xA7 }, /* LATIN SMALL LETTER SHARP S */
196 { 0x00E0, 0x88 }, /* LATIN SMALL LETTER A WITH GRAVE */
197 { 0x00E1, 0x87 }, /* LATIN SMALL LETTER A WITH ACUTE */
198 { 0x00E2, 0x89 }, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
199 { 0x00E3, 0x8B }, /* LATIN SMALL LETTER A WITH TILDE */
200 { 0x00E4, 0x8A }, /* LATIN SMALL LETTER A WITH DIAERESIS */
201 { 0x00E5, 0x8C }, /* LATIN SMALL LETTER A WITH RING ABOVE */
202 { 0x00E6, 0xBE }, /* LATIN SMALL LIGATURE AE */
203 { 0x00E7, 0x8D }, /* LATIN SMALL LETTER C WITH CEDILLA */
204 { 0x00E8, 0x8F }, /* LATIN SMALL LETTER E WITH GRAVE */
205 { 0x00E9, 0x8E }, /* LATIN SMALL LETTER E WITH ACUTE */
206 { 0x00EA, 0x90 }, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
207 { 0x00EB, 0x91 }, /* LATIN SMALL LETTER E WITH DIAERESIS */
208 { 0x00EC, 0x93 }, /* LATIN SMALL LETTER I WITH GRAVE */
209 { 0x00ED, 0x92 }, /* LATIN SMALL LETTER I WITH ACUTE */
210 { 0x00EE, 0x94 }, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
211 { 0x00EF, 0x95 }, /* LATIN SMALL LETTER I WITH DIAERESIS */
212 { 0x00F1, 0x96 }, /* LATIN SMALL LETTER N WITH TILDE */
213 { 0x00F2, 0x98 }, /* LATIN SMALL LETTER O WITH GRAVE */
214 { 0x00F3, 0x97 }, /* LATIN SMALL LETTER O WITH ACUTE */
215 { 0x00F4, 0x99 }, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
216 { 0x00F5, 0x9B }, /* LATIN SMALL LETTER O WITH TILDE */
217 { 0x00F6, 0x9A }, /* LATIN SMALL LETTER O WITH DIAERESIS */
218 { 0x00F7, 0xD6 }, /* DIVISION SIGN */
219 { 0x00F8, 0xBF }, /* LATIN SMALL LETTER O WITH STROKE */
220 { 0x00F9, 0x9D }, /* LATIN SMALL LETTER U WITH GRAVE */
221 { 0x00FA, 0x9C }, /* LATIN SMALL LETTER U WITH ACUTE */
222 { 0x00FB, 0x9E }, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
223 { 0x00FC, 0x9F }, /* LATIN SMALL LETTER U WITH DIAERESIS */
224 { 0x00FF, 0xD8 }, /* LATIN SMALL LETTER Y WITH DIAERESIS */
225 { 0x0131, 0xF5 }, /* LATIN SMALL LETTER DOTLESS I */
226 { 0x0152, 0xCE }, /* LATIN CAPITAL LIGATURE OE */
227 { 0x0153, 0xCF }, /* LATIN SMALL LIGATURE OE */
228 { 0x0178, 0xD9 }, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
229 { 0x0192, 0xC4 }, /* LATIN SMALL LETTER F WITH HOOK */
230 { 0x02C6, 0xF6 }, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
231 { 0x02C7, 0xFF }, /* CARON */
232 { 0x02D8, 0xF9 }, /* BREVE */
233 { 0x02D9, 0xFA }, /* DOT ABOVE */
234 { 0x02DA, 0xFB }, /* RING ABOVE */
235 { 0x02DB, 0xFE }, /* OGONEK */
236 { 0x02DC, 0xF7 }, /* SMALL TILDE */
237 { 0x02DD, 0xFD }, /* DOUBLE ACUTE ACCENT */
238 { 0x03A9, 0xBD }, /* OHM SIGN (Canonical ?) */
239 { 0x03C0, 0xB9 }, /* GREEK SMALL LETTER PI */
240 { 0x2013, 0xD0 }, /* EN DASH */
241 { 0x2014, 0xD1 }, /* EM DASH */
242 { 0x2018, 0xD4 }, /* LEFT SINGLE QUOTATION MARK */
243 { 0x2019, 0xD5 }, /* RIGHT SINGLE QUOTATION MARK */
244 { 0x201A, 0xE2 }, /* SINGLE LOW-9 QUOTATION MARK */
245 { 0x201C, 0xD2 }, /* LEFT DOUBLE QUOTATION MARK */
246 { 0x201D, 0xD3 }, /* RIGHT DOUBLE QUOTATION MARK */
247 { 0x201E, 0xE3 }, /* DOUBLE LOW-9 QUOTATION MARK */
248 { 0x2020, 0xA0 }, /* DAGGER */
249 { 0x2021, 0xE0 }, /* DOUBLE DAGGER */
250 { 0x2022, 0xA5 }, /* BULLET */
251 { 0x2026, 0xC9 }, /* HORIZONTAL ELLIPSIS */
252 { 0x2030, 0xE4 }, /* PER MILLE SIGN */
253 { 0x2039, 0xDC }, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
254 { 0x203A, 0xDD }, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
255 { 0x2044, 0xDA }, /* FRACTION SLASH */
256 { 0x20AC, 0xDB }, /* EURO SIGN */
257 { 0x2122, 0xAA }, /* TRADE MARK SIGN */
258 { 0x2126, 0xBD }, /* OHM SIGN */
259 { 0x2202, 0xB6 }, /* PARTIAL DIFFERENTIAL */
260 { 0x2206, 0xC6 }, /* INCREMENT */
261 { 0x220F, 0xB8 }, /* N-ARY PRODUCT */
262 { 0x2211, 0xB7 }, /* N-ARY SUMMATION */
263 { 0x221A, 0xC3 }, /* SQUARE ROOT */
264 { 0x221E, 0xB0 }, /* INFINITY */
265 { 0x222B, 0xBA }, /* INTEGRAL */
266 { 0x2248, 0xC5 }, /* ALMOST EQUAL TO */
267 { 0x2260, 0xAD }, /* NOT EQUAL TO */
268 { 0x2264, 0xB2 }, /* LESS-THAN OR EQUAL TO */
269 { 0x2265, 0xB3 }, /* GREATER-THAN OR EQUAL TO */
270 { 0x25CA, 0xD7 }, /* LOZENGE */
271 { 0xF8FF, 0xF0 }, /* Apple logo */
272 { 0xFB01, 0xDE }, /* LATIN SMALL LIGATURE FI */
273 { 0xFB02, 0xDF }, /* LATIN SMALL LIGATURE FL */
276 static Boolean
__CFToMacRoman(UInt32 flags
, UniChar character
, uint8_t *byte
) {
277 if (character
< 0x80) {
278 *byte
= (uint8_t)character
;
281 return CFStringEncodingUnicodeTo8BitEncoding(macRoman_from_uni
, NUM_MACROMAN_FROM_UNI
, character
, byte
);
285 static const UniChar macRoman_to_uni
[128] = {
286 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
287 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
288 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
289 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
290 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
291 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
292 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
293 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
294 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
295 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
296 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
297 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
298 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
299 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
300 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
301 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
302 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
303 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
304 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
305 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
306 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
307 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
308 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
309 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
310 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
311 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
312 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
313 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
314 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
315 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
316 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
317 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
319 0x00B0, /* DEGREE SIGN */
320 0x00A2, /* CENT SIGN */
321 0x00A3, /* POUND SIGN */
322 0x00A7, /* SECTION SIGN */
324 0x00B6, /* PILCROW SIGN */
325 0x00DF, /* LATIN SMALL LETTER SHARP S */
326 0x00AE, /* REGISTERED SIGN */
327 0x00A9, /* COPYRIGHT SIGN */
328 0x2122, /* TRADE MARK SIGN */
329 0x00B4, /* ACUTE ACCENT */
330 0x00A8, /* DIAERESIS */
331 0x2260, /* NOT EQUAL TO */
332 0x00C6, /* LATIN CAPITAL LIGATURE AE */
333 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
334 0x221E, /* INFINITY */
335 0x00B1, /* PLUS-MINUS SIGN */
336 0x2264, /* LESS-THAN OR EQUAL TO */
337 0x2265, /* GREATER-THAN OR EQUAL TO */
338 0x00A5, /* YEN SIGN */
339 0x00B5, /* MICRO SIGN */
340 0x2202, /* PARTIAL DIFFERENTIAL */
341 0x2211, /* N-ARY SUMMATION */
342 0x220F, /* N-ARY PRODUCT */
343 0x03C0, /* GREEK SMALL LETTER PI */
344 0x222B, /* INTEGRAL */
345 0x00AA, /* FEMININE ORDINAL INDICATOR */
346 0x00BA, /* MASCULINE ORDINAL INDICATOR */
347 0x03A9, /* OHM SIGN (Canonical mapping) */
348 0x00E6, /* LATIN SMALL LIGATURE AE */
349 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
350 0x00BF, /* INVERTED QUESTION MARK */
351 0x00A1, /* INVERTED EXCLAMATION MARK */
352 0x00AC, /* NOT SIGN */
353 0x221A, /* SQUARE ROOT */
354 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
355 0x2248, /* ALMOST EQUAL TO */
356 0x2206, /* INCREMENT */
357 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
358 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
359 0x2026, /* HORIZONTAL ELLIPSIS */
360 0x00A0, /* NO-BREAK SPACE */
361 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
362 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
363 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
364 0x0152, /* LATIN CAPITAL LIGATURE OE */
365 0x0153, /* LATIN SMALL LIGATURE OE */
366 0x2013, /* EN DASH */
367 0x2014, /* EM DASH */
368 0x201C, /* LEFT DOUBLE QUOTATION MARK */
369 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
370 0x2018, /* LEFT SINGLE QUOTATION MARK */
371 0x2019, /* RIGHT SINGLE QUOTATION MARK */
372 0x00F7, /* DIVISION SIGN */
373 0x25CA, /* LOZENGE */
374 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
375 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
376 0x2044, /* FRACTION SLASH */
377 0x20AC, /* EURO SIGN */
378 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
379 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
380 0xFB01, /* LATIN SMALL LIGATURE FI */
381 0xFB02, /* LATIN SMALL LIGATURE FL */
382 0x2021, /* DOUBLE DAGGER */
383 0x00B7, /* MIDDLE DOT */
384 0x201A, /* SINGLE LOW-9 QUOTATION MARK */
385 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
386 0x2030, /* PER MILLE SIGN */
387 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
388 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
389 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
390 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
391 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
392 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
393 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
394 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
395 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
396 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
397 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
398 0xF8FF, /* Apple logo */
399 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
400 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
401 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
402 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
403 0x0131, /* LATIN SMALL LETTER DOTLESS I */
404 0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
405 0x02DC, /* SMALL TILDE */
408 0x02D9, /* DOT ABOVE */
409 0x02DA, /* RING ABOVE */
410 0x00B8, /* CEDILLA */
411 0x02DD, /* DOUBLE ACUTE ACCENT */
416 static Boolean
__CFFromMacRoman(UInt32 flags
, uint8_t byte
, UniChar
*character
) {
417 *character
= (byte
< 0x80 ? (UniChar
)byte
: macRoman_to_uni
[byte
- 0x80]);
421 static UInt32
__CFToMacRomanPrecompose(UInt32 flags
, const UniChar
*character
, UInt32 numChars
, uint8_t *bytes
, UInt32 maxByteLen
, UInt32
*usedByteLen
) {
425 if (__CFToMacRoman(flags
, CFStringEncodingPrecomposeLatinCharacter(character
, numChars
, &usedCharLen
), &byte
) && byte
&& (usedCharLen
> 1)) {
426 if (maxByteLen
) *bytes
= byte
;
434 __private_extern__
const CFStringEncodingConverter __CFConverterMacRoman
= {
435 __CFToMacRoman
, __CFFromMacRoman
, 1, 1, kCFStringEncodingConverterCheapEightBit
,
436 NULL
, NULL
, NULL
, NULL
, __CFToMacRomanPrecompose
, CFStringEncodingIsValidCombiningCharacterForLatin1
,
439 /* Win Latin1 (ANSI CodePage 1252) */
440 #define NUM_1252_FROM_UNI 27
441 static const CFStringEncodingUnicodeTo8BitCharMap cp1252_from_uni
[NUM_1252_FROM_UNI
] = {
442 {0x0152, 0x8C}, // LATIN CAPITAL LIGATURE OE
443 {0x0153, 0x9C}, // LATIN SMALL LIGATURE OE
444 {0x0160, 0x8A}, // LATIN CAPITAL LETTER S WITH CARON
445 {0x0161, 0x9A}, // LATIN SMALL LETTER S WITH CARON
446 {0x0178, 0x9F}, // LATIN CAPITAL LETTER Y WITH DIAERESIS
447 {0x017D, 0x8E}, // LATIN CAPITAL LETTER Z WITH CARON
448 {0x017E, 0x9E}, // LATIN SMALL LETTER Z WITH CARON
449 {0x0192, 0x83}, // LATIN SMALL LETTER F WITH HOOK
450 {0x02C6, 0x88}, // MODIFIER LETTER CIRCUMFLEX ACCENT
451 {0x02DC, 0x98}, // SMALL TILDE
452 {0x2013, 0x96}, // EN DASH
453 {0x2014, 0x97}, // EM DASH
454 {0x2018, 0x91}, // LEFT SINGLE QUOTATION MARK
455 {0x2019, 0x92}, // RIGHT SINGLE QUOTATION MARK
456 {0x201A, 0x82}, // SINGLE LOW-9 QUOTATION MARK
457 {0x201C, 0x93}, // LEFT DOUBLE QUOTATION MARK
458 {0x201D, 0x94}, // RIGHT DOUBLE QUOTATION MARK
459 {0x201E, 0x84}, // DOUBLE LOW-9 QUOTATION MARK
460 {0x2020, 0x86}, // DAGGER
461 {0x2021, 0x87}, // DOUBLE DAGGER
462 {0x2022, 0x95}, // BULLET
463 {0x2026, 0x85}, // HORIZONTAL ELLIPSIS
464 {0x2030, 0x89}, // PER MILLE SIGN
465 {0x2039, 0x8B}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
466 {0x203A, 0x9B}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
467 {0x20AC, 0x80}, // EURO SIGN
468 {0x2122, 0x99}, // TRADE MARK SIGN
471 static Boolean
__CFToWinLatin1(UInt32 flags
, UniChar character
, uint8_t *byte
) {
472 if ((character
< 0x80) || ((character
> 0x9F) && (character
<= 0x00FF))) {
473 *byte
= (uint8_t)character
;
476 return CFStringEncodingUnicodeTo8BitEncoding(cp1252_from_uni
, NUM_1252_FROM_UNI
, character
, byte
);
/*
 * Windows-1252 -> Unicode for bytes 0x80..0x9F; entry i decodes byte
 * 0x80 + i (see __CFFromWinLatin1).  Unassigned bytes hold 0xFFFD, which
 * the decoder reports as a failure.  All 32 entries must be present and in
 * byte order.
 */
static const uint16_t cp1252_to_uni[32] = {
    0x20AC, // 0x80 EURO SIGN
    0xFFFD, // 0x81 NOT USED
    0x201A, // 0x82 SINGLE LOW-9 QUOTATION MARK
    0x0192, // 0x83 LATIN SMALL LETTER F WITH HOOK
    0x201E, // 0x84 DOUBLE LOW-9 QUOTATION MARK
    0x2026, // 0x85 HORIZONTAL ELLIPSIS
    0x2020, // 0x86 DAGGER
    0x2021, // 0x87 DOUBLE DAGGER
    0x02C6, // 0x88 MODIFIER LETTER CIRCUMFLEX ACCENT
    0x2030, // 0x89 PER MILLE SIGN
    0x0160, // 0x8A LATIN CAPITAL LETTER S WITH CARON
    0x2039, // 0x8B SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x0152, // 0x8C LATIN CAPITAL LIGATURE OE
    0xFFFD, // 0x8D NOT USED
    0x017D, // 0x8E LATIN CAPITAL LETTER Z WITH CARON
    0xFFFD, // 0x8F NOT USED
    0xFFFD, // 0x90 NOT USED
    0x2018, // 0x91 LEFT SINGLE QUOTATION MARK
    0x2019, // 0x92 RIGHT SINGLE QUOTATION MARK
    0x201C, // 0x93 LEFT DOUBLE QUOTATION MARK
    0x201D, // 0x94 RIGHT DOUBLE QUOTATION MARK
    0x2022, // 0x95 BULLET
    0x2013, // 0x96 EN DASH
    0x2014, // 0x97 EM DASH
    0x02DC, // 0x98 SMALL TILDE
    0x2122, // 0x99 TRADE MARK SIGN
    0x0161, // 0x9A LATIN SMALL LETTER S WITH CARON
    0x203A, // 0x9B SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x0153, // 0x9C LATIN SMALL LIGATURE OE
    0xFFFD, // 0x9D NOT USED
    0x017E, // 0x9E LATIN SMALL LETTER Z WITH CARON
    0x0178, // 0x9F LATIN CAPITAL LETTER Y WITH DIAERESIS
};
514 static Boolean
__CFFromWinLatin1(UInt32 flags
, uint8_t byte
, UniChar
*character
) {
515 *character
= (byte
< 0x80 || byte
> 0x9F ? (UniChar
)byte
: cp1252_to_uni
[byte
- 0x80]);
516 return (*character
!= 0xFFFD);
519 static UInt32
__CFToWinLatin1Precompose(UInt32 flags
, const UniChar
*character
, UInt32 numChars
, uint8_t *bytes
, UInt32 maxByteLen
, UInt32
*usedByteLen
) {
523 if (__CFToWinLatin1(flags
, CFStringEncodingPrecomposeLatinCharacter(character
, numChars
, &usedCharLen
), &byte
) && byte
&& (usedCharLen
> 1)) {
524 if (maxByteLen
) *bytes
= byte
;
532 __private_extern__
const CFStringEncodingConverter __CFConverterWinLatin1
= {
533 __CFToWinLatin1
, __CFFromWinLatin1
, 1, 1, kCFStringEncodingConverterCheapEightBit
,
534 NULL
, NULL
, NULL
, NULL
, __CFToWinLatin1Precompose
, CFStringEncodingIsValidCombiningCharacterForLatin1
,
537 /* NEXTSTEP Encoding */
538 #define NUM_NEXTSTEP_FROM_UNI 128
/*
 * Unicode -> NEXTSTEP byte lookup as { code point, byte } pairs; it is
 * handed to CFStringEncodingUnicodeTo8BitEncoding by __CFToNextStepLatin.
 * NOTE(review): the array is declared with NUM_NEXTSTEP_FROM_UNI (128)
 * elements but only the rows below are present in this copy of the file,
 * and the closing "};" is missing.  The remaining rows must be restored
 * from an intact copy before this table can be relied on.
 */
540 static const CFStringEncodingUnicodeTo8BitCharMap nextstep_from_tab
[NUM_NEXTSTEP_FROM_UNI
] = {
554 /* { 0x00ad, 0x2d }, <= 96/10/25 rick removed; converts soft-hyphen to hyphen! */
662 { 0x2029, 0x0a }, /* ParagraphSeparator -> ASCIINewLine */
672 static Boolean
__CFToNextStepLatin(UInt32 flags
, UniChar character
, uint8_t *byte
) {
673 if (character
< 0x80) {
674 *byte
= (uint8_t)character
;
677 return CFStringEncodingUnicodeTo8BitEncoding(nextstep_from_tab
, NUM_NEXTSTEP_FROM_UNI
, character
, byte
);
681 static const UniChar NSToPrecompUnicodeTable
[128] = {
682 /* NextStep Encoding Unicode */
683 /* 128 figspace */ 0x00a0, /* 0x2007 is fig space */
684 /* 129 Agrave */ 0x00c0,
685 /* 130 Aacute */ 0x00c1,
686 /* 131 Acircumflex */ 0x00c2,
687 /* 132 Atilde */ 0x00c3,
688 /* 133 Adieresis */ 0x00c4,
689 /* 134 Aring */ 0x00c5,
690 /* 135 Ccedilla */ 0x00c7,
691 /* 136 Egrave */ 0x00c8,
692 /* 137 Eacute */ 0x00c9,
693 /* 138 Ecircumflex */ 0x00ca,
694 /* 139 Edieresis */ 0x00cb,
695 /* 140 Igrave */ 0x00cc,
696 /* 141 Iacute */ 0x00cd,
697 /* 142 Icircumflex */ 0x00ce,
698 /* 143 Idieresis */ 0x00cf,
699 /* 144 Eth */ 0x00d0,
700 /* 145 Ntilde */ 0x00d1,
701 /* 146 Ograve */ 0x00d2,
702 /* 147 Oacute */ 0x00d3,
703 /* 148 Ocircumflex */ 0x00d4,
704 /* 149 Otilde */ 0x00d5,
705 /* 150 Odieresis */ 0x00d6,
706 /* 151 Ugrave */ 0x00d9,
707 /* 152 Uacute */ 0x00da,
708 /* 153 Ucircumflex */ 0x00db,
709 /* 154 Udieresis */ 0x00dc,
710 /* 155 Yacute */ 0x00dd,
711 /* 156 Thorn */ 0x00de,
713 /* 158 multiply */ 0x00d7,
714 /* 159 divide */ 0x00f7,
715 /* 160 copyright */ 0x00a9,
716 /* 161 exclamdown */ 0x00a1,
717 /* 162 cent */ 0x00a2,
718 /* 163 sterling */ 0x00a3,
719 /* 164 fraction */ 0x2044,
720 /* 165 yen */ 0x00a5,
721 /* 166 florin */ 0x0192,
722 /* 167 section */ 0x00a7,
723 /* 168 currency */ 0x00a4,
724 /* 169 quotesingle */ 0x2019,
725 /* 170 quotedblleft */ 0x201c,
726 /* 171 guillemotleft */ 0x00ab,
727 /* 172 guilsinglleft */ 0x2039,
728 /* 173 guilsinglright */ 0x203a,
731 /* 176 registered */ 0x00ae,
732 /* 177 endash */ 0x2013,
733 /* 178 dagger */ 0x2020,
734 /* 179 daggerdbl */ 0x2021,
735 /* 180 periodcentered */ 0x00b7,
736 /* 181 brokenbar */ 0x00a6,
737 /* 182 paragraph */ 0x00b6,
738 /* 183 bullet */ 0x2022,
739 /* 184 quotesinglbase */ 0x201a,
740 /* 185 quotedblbase */ 0x201e,
741 /* 186 quotedblright */ 0x201d,
742 /* 187 guillemotright */ 0x00bb,
743 /* 188 ellipsis */ 0x2026,
744 /* 189 perthousand */ 0x2030,
745 /* 190 logicalnot */ 0x00ac,
746 /* 191 questiondown */ 0x00bf,
747 /* 192 onesuperior */ 0x00b9,
748 /* 193 grave */ 0x02cb,
749 /* 194 acute */ 0x00b4,
750 /* 195 circumflex */ 0x02c6,
751 /* 196 tilde */ 0x02dc,
752 /* 197 macron */ 0x00af,
753 /* 198 breve */ 0x02d8,
754 /* 199 dotaccent */ 0x02d9,
755 /* 200 dieresis */ 0x00a8,
756 /* 201 twosuperior */ 0x00b2,
757 /* 202 ring */ 0x02da,
758 /* 203 cedilla */ 0x00b8,
759 /* 204 threesuperior */ 0x00b3,
760 /* 205 hungarumlaut */ 0x02dd,
761 /* 206 ogonek */ 0x02db,
762 /* 207 caron */ 0x02c7,
763 /* 208 emdash */ 0x2014,
764 /* 209 plusminus */ 0x00b1,
765 /* 210 onequarter */ 0x00bc,
766 /* 211 onehalf */ 0x00bd,
767 /* 212 threequarters */ 0x00be,
768 /* 213 agrave */ 0x00e0,
769 /* 214 aacute */ 0x00e1,
770 /* 215 acircumflex */ 0x00e2,
771 /* 216 atilde */ 0x00e3,
772 /* 217 adieresis */ 0x00e4,
773 /* 218 aring */ 0x00e5,
774 /* 219 ccedilla */ 0x00e7,
775 /* 220 egrave */ 0x00e8,
776 /* 221 eacute */ 0x00e9,
777 /* 222 ecircumflex */ 0x00ea,
778 /* 223 edieresis */ 0x00eb,
779 /* 224 igrave */ 0x00ec,
781 /* 226 iacute */ 0x00ed,
782 /* 227 ordfeminine */ 0x00aa,
783 /* 228 icircumflex */ 0x00ee,
784 /* 229 idieresis */ 0x00ef,
785 /* 230 eth */ 0x00f0,
786 /* 231 ntilde */ 0x00f1,
787 /* 232 Lslash */ 0x0141,
788 /* 233 Oslash */ 0x00d8,
790 /* 235 ordmasculine */ 0x00ba,
791 /* 236 ograve */ 0x00f2,
792 /* 237 oacute */ 0x00f3,
793 /* 238 ocircumflex */ 0x00f4,
794 /* 239 otilde */ 0x00f5,
795 /* 240 odieresis */ 0x00f6,
797 /* 242 ugrave */ 0x00f9,
798 /* 243 uacute */ 0x00fa,
799 /* 244 ucircumflex */ 0x00fb,
800 /* 245 dotlessi */ 0x0131,
801 /* 246 udieresis */ 0x00fc,
802 /* 247 yacute */ 0x00fd,
803 /* 248 lslash */ 0x0142,
804 /* 249 oslash */ 0x00f8,
806 /* 251 germandbls */ 0x00df,
807 /* 252 thorn */ 0x00fe,
808 /* 253 ydieresis */ 0x00ff,
809 /* 254 .notdef */ 0xFFFD,
810 /* 255 .notdef */ 0xFFFD
813 static Boolean
__CFFromNextStepLatin(UInt32 flags
, uint8_t byte
, UniChar
*character
) {
814 return ((*character
= (byte
< 0x80 ? (UniChar
)byte
: NSToPrecompUnicodeTable
[byte
- 0x80])) != 0xFFFD);
817 static UInt32
__CFToNextStepLatinPrecompose(UInt32 flags
, const UniChar
*character
, UInt32 numChars
, uint8_t *bytes
, UInt32 maxByteLen
, UInt32
*usedByteLen
) {
821 if (__CFToNextStepLatin(flags
, CFStringEncodingPrecomposeLatinCharacter(character
, numChars
, &usedCharLen
), &byte
) && byte
&& (usedCharLen
> 1)) {
822 if (maxByteLen
) *bytes
= byte
;
830 __private_extern__
const CFStringEncodingConverter __CFConverterNextStepLatin
= {
831 __CFToNextStepLatin
, __CFFromNextStepLatin
, 1, 1, kCFStringEncodingConverterCheapEightBit
,
832 NULL
, NULL
, NULL
, NULL
, __CFToNextStepLatinPrecompose
, CFStringEncodingIsValidCombiningCharacterForLatin1
,
837 * Copyright 2001 Unicode, Inc.
841 * This source code is provided as is by Unicode, Inc. No claims are
842 * made as to fitness for any particular purpose. No warranties of any
843 * kind are expressed or implied. The recipient agrees to determine
844 * applicability of information provided. If this file has been
845 * purchased on magnetic or optical media from Unicode, Inc., the
846 * sole remedy for any claim will be exchange of defective media
847 * within 90 days of receipt.
849 * Limitations on Rights to Redistribute This Code
851 * Unicode, Inc. hereby grants the right to freely use the information
852 * supplied in this file in the creation of products supporting the
853 * Unicode Standard, and to make copies of this file in any form
854 * for internal or external distribution as long as this notice
858 static const UInt32 kReplacementCharacter
= 0x0000FFFDUL
;
859 static const UInt32 kMaximumUCS2
= 0x0000FFFFUL
;
860 static const UInt32 kMaximumUTF16
= 0x0010FFFFUL
;
861 static const UInt32 kMaximumUCS4
= 0x7FFFFFFFUL
;
863 static const int halfShift
= 10;
864 static const UInt32 halfBase
= 0x0010000UL
;
865 static const UInt32 halfMask
= 0x3FFUL
;
866 static const UInt32 kSurrogateHighStart
= 0xD800UL
;
867 static const UInt32 kSurrogateHighEnd
= 0xDBFFUL
;
868 static const UInt32 kSurrogateLowStart
= 0xDC00UL
;
869 static const UInt32 kSurrogateLowEnd
= 0xDFFFUL
;
872 * Index into the table below with the first byte of a UTF-8 sequence to
873 * get the number of trailing bytes that are supposed to follow it.
/*
 * Indexed by the first byte of a UTF-8 sequence; yields how many trailing
 * bytes follow it (0 for 0x00..0xBF, 1 for 0xC0..0xDF, 2 for 0xE0..0xEF,
 * 3 for 0xF0..0xF7, 4 for 0xF8..0xFB, 5 for 0xFC..0xFF).
 */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
887 * Magic values subtracted from a buffer value during UTF8 conversion.
888 * This table contains as many values as there might be trailing bytes
889 * in a UTF-8 sequence.
891 static const UTF32Char offsetsFromUTF8
[6] = { 0x00000000UL
, 0x00003080UL
, 0x000E2080UL
,
892 0x03C82080UL
, 0xFA082080UL
, 0x82082080UL
};
/*
 * Indexed by the total length of a UTF-8 sequence (1..6; slot 0 is unused).
 * Holds the marker bits OR-ed into the first byte: nothing for one byte,
 * then 110xxxxx, 1110xxxx, 11110xxx, 111110xx and 1111110x.
 */
static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
896 /* This code is similar in effect to making successive calls on the mbtowc and wctomb routines in FSS-UTF. However, it is considerably different in code:
897 * it is adapted to be consistent with UTF16,
898 * constants have been gathered.
899 * loops & conditionals have been removed as much as possible for
900 * efficiency, in favor of drop-through switch statements.
903 CF_INLINE
uint16_t __CFUTF8BytesToWriteForCharacter(UInt32 ch
) {
904 if (ch
< 0x80) return 1;
905 else if (ch
< 0x800) return 2;
906 else if (ch
< 0x10000) return 3;
907 else if (ch
< 0x200000) return 4;
908 else if (ch
< 0x4000000) return 5;
909 else if (ch
<= kMaximumUCS4
) return 6;
913 CF_INLINE
uint16_t __CFToUTF8Core(UInt32 ch
, uint8_t *bytes
, UInt32 maxByteLen
) {
914 uint16_t bytesToWrite
= __CFUTF8BytesToWriteForCharacter(ch
);
915 const UInt32 byteMask
= 0xBF;
916 const UInt32 byteMark
= 0x80;
920 ch
= kReplacementCharacter
;
923 if (maxByteLen
< bytesToWrite
) return 0;
925 switch (bytesToWrite
) { /* note: code falls through cases! */
926 case 6: bytes
[5] = (ch
| byteMark
) & byteMask
; ch
>>= 6;
927 case 5: bytes
[4] = (ch
| byteMark
) & byteMask
; ch
>>= 6;
928 case 4: bytes
[3] = (ch
| byteMark
) & byteMask
; ch
>>= 6;
929 case 3: bytes
[2] = (ch
| byteMark
) & byteMask
; ch
>>= 6;
930 case 2: bytes
[1] = (ch
| byteMark
) & byteMask
; ch
>>= 6;
931 case 1: bytes
[0] = ch
| firstByteMark
[bytesToWrite
];
936 static UInt32
__CFToUTF8(UInt32 flags
, const UniChar
*characters
, UInt32 numChars
, uint8_t *bytes
, UInt32 maxByteLen
, UInt32
*usedByteLen
) {
937 uint16_t bytesWritten
;
939 const UniChar
*beginCharacter
= characters
;
940 const UniChar
*endCharacter
= characters
+ numChars
;
941 const uint8_t *beginBytes
= bytes
;
942 const uint8_t *endBytes
= bytes
+ maxByteLen
;
943 bool isStrict
= (flags
& kCFStringEncodingUseHFSPlusCanonical
? false : true);
945 while ((characters
< endCharacter
) && (!maxByteLen
|| (bytes
< endBytes
))) {
946 ch
= *(characters
++);
948 if (ch
< 0x80) { // ASCII
949 if (maxByteLen
) *bytes
= ch
;
952 if (ch
>= kSurrogateHighStart
) {
953 if (ch
<= kSurrogateHighEnd
) {
954 if ((characters
< endCharacter
) && ((*characters
>= kSurrogateLowStart
) && (*characters
<= kSurrogateLowEnd
))) {
955 ch
= ((ch
- kSurrogateHighStart
) << halfShift
) + (*(characters
++) - kSurrogateLowStart
) + halfBase
;
956 } else if (isStrict
) {
960 } else if (isStrict
&& (ch
<= kSurrogateLowEnd
)) {
966 if (!(bytesWritten
= (maxByteLen
? __CFToUTF8Core(ch
, bytes
, endBytes
- bytes
) : __CFUTF8BytesToWriteForCharacter(ch
)))) {
967 characters
-= (ch
< 0x10000 ? 1 : 2);
970 bytes
+= bytesWritten
;
974 if (usedByteLen
) *usedByteLen
= bytes
- beginBytes
;
975 return characters
- beginCharacter
;
979 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
980 * This must be called with the length pre-determined by the first byte.
981 * If not calling this from ConvertUTF8to*, then the length can be set by:
982 * length = trailingBytesForUTF8[*source]+1;
983 * and the sequence is illegal right away if there aren't that many bytes
985 * If presented with a length > 4, this returns false. The Unicode
986 * definition of UTF-8 goes up to 4-byte sequences.
989 CF_INLINE
bool __CFIsLegalUTF8(const uint8_t *source
, int length
) {
990 if (length
> 4) return false;
992 const uint8_t *srcptr
= source
+length
;
993 uint8_t head
= *source
;
995 while (--srcptr
> source
) if ((*srcptr
& 0xC0) != 0x80) return false;
997 if (((head
>= 0x80) && (head
< 0xC2)) || (head
> 0xF4)) return false;
999 if (((head
== 0xE0) && (*(source
+ 1) < 0xA0)) || ((head
== 0xED) && (*(source
+ 1) > 0x9F)) || ((head
== 0xF0) && (*(source
+ 1) < 0x90)) || ((head
== 0xF4) && (*(source
+ 1) > 0x8F))) return false;
1003 /* This version of the routine returns the length of the sequence,
1004 or 0 on illegal sequence. This version is correct according to
1005 the Unicode 4.0 spec. */
1006 #define ISLEGALUTF8_FAST 0
1007 static CFIndex
__CFIsLegalUTF8_2(const uint8_t *source
, CFIndex maxBytes
) {
1008 if (maxBytes
< 1) return 0;
1009 uint8_t first
= source
[0];
1010 if (first
<= 0x7F) return 1;
1011 if (first
< 0xC2) return 0;
1012 if (maxBytes
< 2) return 0;
1013 if (first
<= 0xDF) {
1014 #if ISLEGALUTF8_FAST
1015 if ((source
[1] & 0xC0) == 0x80) return 2;
1017 if (source
[1] < 0x80) return 0;
1018 if (source
[1] <= 0xBF) return 2;
1022 if (maxBytes
< 3) return 0;
1023 #if ISLEGALUTF8_FAST
1024 if (first
<= 0xEF) {
1025 uint32_t value
= (first
<< 24) | ((*(const uint16_t *)((const uint8_t *)source
+ 1)) << 8);
1026 uint32_t masked1
= (value
& 0xFFF0C000);
1028 // 0b 11100000 101{0,1}xxxx 10xxxxxx (0xE0)
1029 if (masked1
== 0xE0A08000) return 3;
1030 if (masked1
== 0xE0B08000) return 3;
1032 // 0b 11101101 100{0,1}xxxx 10xxxxxx (0xED)
1033 if (masked1
== 0xED808000) return 3;
1034 if (masked1
== 0xED908000) return 3;
1036 // 0b 1110{0001 - 1100} 10xxxxxx 10xxxxxx (0xE1 - 0xEC)
1037 // 0b 1110{1110 - 1111} 10xxxxxx 10xxxxxx (0xEE - 0xEF)
1038 if ((value
& 0x00C0C000) == 0x00808000) return 3;
1043 if (first
== 0xE0) {
1044 if (source
[1] < 0xA0 /* NOTE */) return 0;
1045 if (source
[1] <= 0xBF) {
1046 if (source
[2] < 0x80) return 0;
1047 if (source
[2] <= 0xBF) return 3;
1051 if (first
<= 0xEC) {
1052 if (source
[1] < 0x80) return 0;
1053 if (source
[1] <= 0xBF) {
1054 if (source
[2] < 0x80) return 0;
1055 if (source
[2] <= 0xBF) return 3;
1059 if (first
== 0xED) {
1060 if (source
[1] < 0x80) return 0;
1061 if (source
[1] <= 0x9F /* NOTE */) {
1062 if (source
[2] < 0x80) return 0;
1063 if (source
[2] <= 0xBF) return 3;
1067 if (first
<= 0xEF) {
1068 if (source
[1] < 0x80) return 0;
1069 if (source
[1] <= 0xBF) {
1070 if (source
[2] < 0x80) return 0;
1071 if (source
[2] <= 0xBF) return 3;
1076 if (maxBytes
< 4) return 0;
1077 #if ISLEGALUTF8_FAST
1078 if (first
<= 0xF4) {
1079 uint32_t value
= *(const uint32_t *)source
;
1080 uint32_t masked1
= (value
& 0xFFF0C0C0);
1082 // 0b 11110000 10{01,10,11}xxxx 10xxxxxx 10xxxxxx (0xF0)
1083 if (masked1
== 0xF0908080) return 4;
1084 if (masked1
== 0xF0A08080) return 4;
1085 if (masked1
== 0xF0B08080) return 4;
1087 // 0b 11110100 1000xxxx 10xxxxxx 10xxxxxx (0xF4)
1088 if (masked1
== 0xF4808080) return 4;
1090 // 0b 111100{01,10,11} 10xxxxxx 10xxxxxx 10xxxxxx (0xF1 - 0xF3)
1091 if ((value
& 0x00C0C0C0) == 0x00808080) return 4;
1096 if (first
== 0xF0) {
1097 if (source
[1] < 0x90 /* NOTE */) return 0;
1098 if (source
[1] <= 0xBF) {
1099 if (source
[2] < 0x80) return 0;
1100 if (source
[2] <= 0xBF) {
1101 if (source
[3] < 0x80) return 0;
1102 if (source
[3] <= 0xBF) return 4;
1107 if (first
<= 0xF3) {
1108 if (source
[1] < 0x80) return 0;
1109 if (source
[1] <= 0xBF) {
1110 if (source
[2] < 0x80) return 0;
1111 if (source
[2] <= 0xBF) {
1112 if (source
[3] < 0x80) return 0;
1113 if (source
[3] <= 0xBF) return 4;
1118 if (first
== 0xF4) {
1119 if (source
[1] < 0x80) return 0;
1120 if (source
[1] <= 0x8F /* NOTE */) {
1121 if (source
[2] < 0x80) return 0;
1122 if (source
[2] <= 0xBF) {
1123 if (source
[3] < 0x80) return 0;
1124 if (source
[3] <= 0xBF) return 4;
1133 static UInt32
__CFFromUTF8(UInt32 flags
, const uint8_t *bytes
, UInt32 numBytes
, UniChar
*characters
, UInt32 maxCharLen
, UInt32
*usedCharLen
) {
1134 const uint8_t *source
= bytes
;
1135 uint16_t extraBytesToRead
;
1136 UInt32 theUsedCharLen
= 0;
1138 Boolean isHFSPlus
= (flags
& kCFStringEncodingUseHFSPlusCanonical
? true : false);
1139 Boolean needsToDecompose
= (flags
& kCFStringEncodingUseCanonical
|| isHFSPlus
? true : false);
1140 Boolean strictUTF8
= (flags
& kCFStringEncodingLenientUTF8Conversion
? false : true);
1141 UTF32Char decomposed
[MAX_DECOMPOSED_LENGTH
];
1142 int32_t decompLength
;
1143 bool isStrict
= !isHFSPlus
;
1145 while (numBytes
&& (!maxCharLen
|| (theUsedCharLen
< maxCharLen
))) {
1146 extraBytesToRead
= trailingBytesForUTF8
[*source
];
1148 if (extraBytesToRead
> --numBytes
) break;
1149 numBytes
-= extraBytesToRead
;
1151 /* Do this check whether lenient or strict */
1152 // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
1153 // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
1154 if ((extraBytesToRead
> 3) || (strictUTF8
&& !__CFIsLegalUTF8(source
, extraBytesToRead
+ 1))) {
1155 if ((*source
== 0xA9) || (flags
& kCFStringEncodingAllowLossyConversion
)) {
1156 numBytes
+= extraBytesToRead
;
1158 if (maxCharLen
) *(characters
++) = (UTF16Char
)kReplacementCharacter
;
1168 * The cases all fall through. See "Note A" below.
1170 switch (extraBytesToRead
) {
1171 case 3: ch
+= *source
++; ch
<<= 6;
1172 case 2: ch
+= *source
++; ch
<<= 6;
1173 case 1: ch
+= *source
++; ch
<<= 6;
1174 case 0: ch
+= *source
++;
1176 ch
-= offsetsFromUTF8
[extraBytesToRead
];
1178 if (ch
<= kMaximumUCS2
) {
1179 if (isStrict
&& (ch
>= kSurrogateHighStart
&& ch
<= kSurrogateLowEnd
)) {
1180 source
-= (extraBytesToRead
+ 1);
1183 if (needsToDecompose
&& CFUniCharIsDecomposableCharacter(ch
, isHFSPlus
)) {
1184 decompLength
= CFUniCharDecomposeCharacter(ch
, decomposed
, MAX_DECOMPOSED_LENGTH
);
1187 if (!CFUniCharFillDestinationBuffer(decomposed
, decompLength
, (void **)&characters
, maxCharLen
, (uint32_t *)&theUsedCharLen
, kCFUniCharUTF16Format
)) break;
1189 theUsedCharLen
+= decompLength
;
1192 if (maxCharLen
) *(characters
++) = (UTF16Char
)ch
;
1195 } else if (ch
> kMaximumUTF16
) {
1197 source
-= (extraBytesToRead
+ 1);
1200 if (maxCharLen
) *(characters
++) = (UTF16Char
)kReplacementCharacter
;
1203 if (needsToDecompose
&& CFUniCharIsDecomposableCharacter(ch
, isHFSPlus
)) {
1204 decompLength
= CFUniCharDecomposeCharacter(ch
, decomposed
, MAX_DECOMPOSED_LENGTH
);
1207 if (!CFUniCharFillDestinationBuffer(decomposed
, decompLength
, (void **)&characters
, maxCharLen
, (uint32_t *)&theUsedCharLen
, kCFUniCharUTF16Format
)) break;
1209 while (--decompLength
>= 0) theUsedCharLen
+= (decomposed
[decompLength
] < 0x10000 ? 1 : 2);
1213 if ((theUsedCharLen
+ 2) > maxCharLen
) break;
1215 *(characters
++) = (ch
>> halfShift
) + kSurrogateHighStart
;
1216 *(characters
++) = (ch
& halfMask
) + kSurrogateLowStart
;
1218 theUsedCharLen
+= 2;
1223 if (usedCharLen
) *usedCharLen
= theUsedCharLen
;
1225 return source
- bytes
;
1228 static UInt32
__CFToUTF8Len(UInt32 flags
, const UniChar
*characters
, UInt32 numChars
) {
1229 UInt32 bytesToWrite
= 0;
1235 if ((ch
>= kSurrogateHighStart
&& ch
<= kSurrogateHighEnd
) && numChars
&& (*characters
>= kSurrogateLowStart
&& *characters
<= kSurrogateLowEnd
)) {
1236 ch
= ((ch
- kSurrogateHighStart
) << halfShift
) + (*characters
++ - kSurrogateLowStart
) + halfBase
;
1239 bytesToWrite
+= __CFUTF8BytesToWriteForCharacter(ch
);
1242 return bytesToWrite
;
1245 static UInt32
__CFFromUTF8Len(UInt32 flags
, const uint8_t *source
, UInt32 numBytes
) {
1246 uint16_t extraBytesToRead
;
1247 UInt32 theUsedCharLen
= 0;
1249 Boolean isHFSPlus
= (flags
& kCFStringEncodingUseHFSPlusCanonical
? true : false);
1250 Boolean needsToDecompose
= (flags
& kCFStringEncodingUseCanonical
|| isHFSPlus
? true : false);
1251 Boolean strictUTF8
= (flags
& kCFStringEncodingLenientUTF8Conversion
? false : true);
1252 UTF32Char decomposed
[MAX_DECOMPOSED_LENGTH
];
1253 int32_t decompLength
;
1254 bool isStrict
= !isHFSPlus
;
1257 extraBytesToRead
= trailingBytesForUTF8
[*source
];
1259 if (extraBytesToRead
> --numBytes
) break;
1260 numBytes
-= extraBytesToRead
;
1262 /* Do this check whether lenient or strict */
1263 // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
1264 // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
1265 if ((extraBytesToRead
> 3) || (strictUTF8
&& !__CFIsLegalUTF8(source
, extraBytesToRead
+ 1))) {
1266 if ((*source
== 0xA9) || (flags
& kCFStringEncodingAllowLossyConversion
)) {
1267 numBytes
+= extraBytesToRead
;
1279 * The cases all fall through. See "Note A" below.
1281 switch (extraBytesToRead
) {
1282 case 3: ch
+= *source
++; ch
<<= 6;
1283 case 2: ch
+= *source
++; ch
<<= 6;
1284 case 1: ch
+= *source
++; ch
<<= 6;
1285 case 0: ch
+= *source
++;
1287 ch
-= offsetsFromUTF8
[extraBytesToRead
];
1289 if (ch
<= kMaximumUCS2
) {
1290 if (isStrict
&& (ch
>= kSurrogateHighStart
&& ch
<= kSurrogateLowEnd
)) {
1293 if (needsToDecompose
&& CFUniCharIsDecomposableCharacter(ch
, isHFSPlus
)) {
1294 decompLength
= CFUniCharDecomposeCharacter(ch
, decomposed
, MAX_DECOMPOSED_LENGTH
);
1295 theUsedCharLen
+= decompLength
;
1299 } else if (ch
> kMaximumUTF16
) {
1302 if (needsToDecompose
&& CFUniCharIsDecomposableCharacter(ch
, isHFSPlus
)) {
1303 decompLength
= CFUniCharDecomposeCharacter(ch
, decomposed
, MAX_DECOMPOSED_LENGTH
);
1304 while (--decompLength
>= 0) theUsedCharLen
+= (decomposed
[decompLength
] < 0x10000 ? 1 : 2);
1306 theUsedCharLen
+= 2;
1311 return theUsedCharLen
;
1314 __private_extern__
const CFStringEncodingConverter __CFConverterUTF8
= {
1315 __CFToUTF8
, __CFFromUTF8
, 3, 2, kCFStringEncodingConverterStandard
,
1316 __CFToUTF8Len
, __CFFromUTF8Len
, NULL
, NULL
, NULL
, NULL
,