]> git.saurik.com Git - apple/libc.git/blob - locale/FreeBSD/utf8.c
Libc-997.1.1.tar.gz
[apple/libc.git] / locale / FreeBSD / utf8.c
1 /*-
2 * Copyright (c) 2002-2004 Tim J. Robbins
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/param.h>
28 __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $");
29
30 #include "xlocale_private.h"
31
32 #include <errno.h>
33 #include <limits.h>
34 #include <runetype.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <wchar.h>
38 #include "mblocal.h"
39
/*
 * 10952550: detect ill-formed UTF-8
 * Unicode 6.0, section D92, mandates specific byte sequences for well-
 * formed UTF-8.  UTF-8 sequences are now limited to 4 bytes, while the
 * FreeBSD code originally handled up to 6.  Illegal surrogate code point
 * sequences are now detected.  And while "non-shortest forms" were detected,
 * this only happened after completing the sequence.  Now, all ill-formed
 * sequences are detected at the earliest point.
 *
 * Table 3-7. Well-Formed UTF-8 Byte Sequences
 *
 *   Code Points           1st      2nd      3rd      4th Byte
 *   U+0000..U+007F        00..7F
 *   U+0080..U+07FF        C2..DF   80..BF
 *   U+0800..U+0FFF        E0       A0..BF   80..BF
 *   U+1000..U+CFFF        E1..EC   80..BF   80..BF
 *   U+D000..U+D7FF        ED       80..9F   80..BF
 *   U+E000..U+FFFF        EE..EF   80..BF   80..BF
 *   U+10000..U+3FFFF      F0       90..BF   80..BF   80..BF
 *   U+40000..U+FFFFF      F1..F3   80..BF   80..BF   80..BF
 *   U+100000..U+10FFFF    F4       80..8F   80..BF   80..BF
 *
 * Note that while any 3rd and 4th byte can be in the range 80..BF, the
 * second byte is often limited to a smaller range.
 */
65
/*
 * Inclusive range of values accepted for the second byte of a multibyte
 * sequence.  A lowerbound of zero means no special range is pending, in
 * which case _UTF8_mbrtowc() applies the generic continuation-byte test.
 */
typedef struct {
	unsigned char	lowerbound;
	unsigned char	upperbound;
} SecondByte;

/*
 * Range templates, named after their bounds.  They are only ever copied
 * from, so they are declared const to keep them out of writable data and
 * to catch accidental modification at compile time.
 */
static const SecondByte sb_00_00 = {0x00, 0x00};
static const SecondByte sb_80_8F = {0x80, 0x8F};
static const SecondByte sb_80_9F = {0x80, 0x9F};
static const SecondByte sb_80_BF = {0x80, 0xBF};
static const SecondByte sb_90_BF = {0x90, 0xBF};
static const SecondByte sb_A0_BF = {0xA0, 0xBF};
76
77 #define UTF8_MB_CUR_MAX 4
78
79 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
80 size_t, mbstate_t * __restrict, locale_t);
81 static int _UTF8_mbsinit(const mbstate_t *, locale_t);
82 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict,
83 const char ** __restrict, size_t, size_t,
84 mbstate_t * __restrict, locale_t);
85 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t,
86 mbstate_t * __restrict, locale_t);
87 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
88 size_t, size_t, mbstate_t * __restrict, locale_t);
89
90 typedef struct {
91 wchar_t ch;
92 int want;
93 SecondByte sb;
94 } _UTF8State;
95
96 __private_extern__ int
97 _UTF8_init(struct __xlocale_st_runelocale *xrl)
98 {
99
100 xrl->__mbrtowc = _UTF8_mbrtowc;
101 xrl->__wcrtomb = _UTF8_wcrtomb;
102 xrl->__mbsinit = _UTF8_mbsinit;
103 xrl->__mbsnrtowcs = _UTF8_mbsnrtowcs;
104 xrl->__wcsnrtombs = _UTF8_wcsnrtombs;
105 xrl->__mb_cur_max = UTF8_MB_CUR_MAX;
106 /*
107 * UCS-4 encoding used as the internal representation, so
108 * slots 0x0080-0x00FF are occuped and must be excluded
109 * from the single byte ctype by setting the limit.
110 */
111 xrl->__mb_sb_limit = 128;
112
113 return (0);
114 }
115
116 static int
117 _UTF8_mbsinit(const mbstate_t *ps, locale_t loc)
118 {
119
120 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
121 }
122
123 static size_t
124 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
125 mbstate_t * __restrict ps, locale_t loc)
126 {
127 _UTF8State *us;
128 int ch, i, mask, want;
129 wchar_t wch;
130 SecondByte sb;
131
132 us = (_UTF8State *)ps;
133
134 if (us->want < 0 || us->want > UTF8_MB_CUR_MAX) {
135 errno = EINVAL;
136 return ((size_t)-1);
137 }
138
139 if (s == NULL) {
140 s = "";
141 n = 1;
142 pwc = NULL;
143 }
144
145 if (n == 0)
146 /* Incomplete multibyte sequence */
147 return ((size_t)-2);
148
149 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
150 /* Fast path for plain ASCII characters. */
151 if (pwc != NULL)
152 *pwc = ch;
153 return (ch != '\0' ? 1 : 0);
154 }
155
156 if (us->want == 0) {
157 /*
158 * Determine the number of octets that make up this character
159 * from the first octet, and a mask that extracts the
160 * interesting bits of the first octet. We already know
161 * the character is at least two bytes long.
162 *
163 * We detect if the first byte is illegal, and set sb to
164 * the legal range of the second byte.
165 */
166 ch = (unsigned char)*s;
167 if ((ch & 0x80) == 0) {
168 mask = 0x7f;
169 want = 1;
170 sb = sb_00_00;
171 } else if ((ch & 0xe0) == 0xc0) {
172 if (ch < 0xc2) goto malformed;
173 mask = 0x1f;
174 want = 2;
175 sb = sb_80_BF;
176 } else if ((ch & 0xf0) == 0xe0) {
177 mask = 0x0f;
178 want = 3;
179 switch (ch) {
180 case 0xe0:
181 sb = sb_A0_BF;
182 break;
183 case 0xed:
184 sb = sb_80_9F;
185 break;
186 default:
187 sb = sb_80_BF;
188 break;
189 }
190 } else if ((ch & 0xf8) == 0xf0) {
191 if (ch > 0xf4) goto malformed;
192 mask = 0x07;
193 want = 4;
194 switch (ch) {
195 case 0xf0:
196 sb = sb_90_BF;
197 break;
198 case 0xf4:
199 sb = sb_80_8F;
200 break;
201 default:
202 sb = sb_80_BF;
203 break;
204 }
205 } else {
206 malformed:
207 /*
208 * Malformed input; input is not UTF-8.
209 */
210 errno = EILSEQ;
211 return ((size_t)-1);
212 }
213 } else {
214 want = us->want;
215 sb = us->sb;
216 }
217
218 /*
219 * Decode the octet sequence representing the character in chunks
220 * of 6 bits, most significant first.
221 */
222 if (us->want == 0)
223 wch = (unsigned char)*s++ & mask;
224 else
225 wch = us->ch;
226 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
227 if (sb.lowerbound) {
228 if ((unsigned char)*s < sb.lowerbound ||
229 (unsigned char)*s > sb.upperbound) goto malformed;
230 sb = sb_00_00;
231 } else if ((*s & 0xc0) != 0x80) goto malformed;
232 wch <<= 6;
233 wch |= *s++ & 0x3f;
234 }
235 if (i < want) {
236 /* Incomplete multibyte sequence. */
237 us->want = want - i;
238 us->sb = sb;
239 us->ch = wch;
240 return ((size_t)-2);
241 }
242 if (pwc != NULL)
243 *pwc = wch;
244 us->want = 0;
245 return (wch == L'\0' ? 0 : want);
246 }
247
248 static size_t
249 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
250 size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc)
251 {
252 _UTF8State *us;
253 const char *s;
254 size_t nchr;
255 wchar_t wc;
256 size_t nb;
257
258 us = (_UTF8State *)ps;
259
260 s = *src;
261 nchr = 0;
262
263 if (dst == NULL) {
264 /*
265 * The fast path in the loop below is not safe if an ASCII
266 * character appears as anything but the first byte of a
267 * multibyte sequence. Check now to avoid doing it in the loop.
268 */
269 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
270 errno = EILSEQ;
271 return ((size_t)-1);
272 }
273 for (;;) {
274 if (nms > 0 && (signed char)*s > 0)
275 /*
276 * Fast path for plain ASCII characters
277 * excluding NUL.
278 */
279 nb = 1;
280 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps, loc)) ==
281 (size_t)-1)
282 /* Invalid sequence - mbrtowc() sets errno. */
283 return ((size_t)-1);
284 else if (nb == 0 || nb == (size_t)-2)
285 return (nchr);
286 s += nb;
287 nms -= nb;
288 nchr++;
289 }
290 /*NOTREACHED*/
291 }
292
293 /*
294 * The fast path in the loop below is not safe if an ASCII
295 * character appears as anything but the first byte of a
296 * multibyte sequence. Check now to avoid doing it in the loop.
297 */
298 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
299 errno = EILSEQ;
300 return ((size_t)-1);
301 }
302 while (len-- > 0) {
303 if (nms > 0 && (signed char)*s > 0) {
304 /*
305 * Fast path for plain ASCII characters
306 * excluding NUL.
307 */
308 *dst = (wchar_t)*s;
309 nb = 1;
310 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, loc)) ==
311 (size_t)-1) {
312 *src = s;
313 return ((size_t)-1);
314 } else if (nb == (size_t)-2) {
315 *src = s + nms;
316 return (nchr);
317 } else if (nb == 0) {
318 *src = NULL;
319 return (nchr);
320 }
321 s += nb;
322 nms -= nb;
323 nchr++;
324 dst++;
325 }
326 *src = s;
327 return (nchr);
328 }
329
330 static size_t
331 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc)
332 {
333 _UTF8State *us;
334 unsigned char lead;
335 int i, len;
336
337 us = (_UTF8State *)ps;
338
339 if (us->want != 0) {
340 errno = EINVAL;
341 return ((size_t)-1);
342 }
343
344 if (s == NULL)
345 /* Reset to initial shift state (no-op) */
346 return (1);
347
348 if ((wc & ~0x7f) == 0) {
349 /* Fast path for plain ASCII characters. */
350 *s = (char)wc;
351 return (1);
352 }
353
354 /*
355 * Determine the number of octets needed to represent this character.
356 * We always output the shortest sequence possible. Also specify the
357 * first few bits of the first octet, which contains the information
358 * about the sequence length.
359 */
360 if ((wc & ~0x7f) == 0) {
361 lead = 0;
362 len = 1;
363 } else if ((wc & ~0x7ff) == 0) {
364 lead = 0xc0;
365 len = 2;
366 } else if ((wc & ~0xffff) == 0) {
367 if (wc >= 0xd800 && wc <= 0xdfff) goto illegal;
368 lead = 0xe0;
369 len = 3;
370 } else if ((wc & ~0x1fffff) == 0) {
371 if (wc > 0x10ffff) goto illegal;
372 lead = 0xf0;
373 len = 4;
374 } else {
375 illegal:
376 errno = EILSEQ;
377 return ((size_t)-1);
378 }
379
380 /*
381 * Output the octets representing the character in chunks
382 * of 6 bits, least significant last. The first octet is
383 * a special case because it contains the sequence length
384 * information.
385 */
386 for (i = len - 1; i > 0; i--) {
387 s[i] = (wc & 0x3f) | 0x80;
388 wc >>= 6;
389 }
390 *s = (wc & 0xff) | lead;
391
392 return (len);
393 }
394
395 static size_t
396 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
397 size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc)
398 {
399 _UTF8State *us;
400 char buf[MB_LEN_MAX];
401 const wchar_t *s;
402 size_t nbytes;
403 size_t nb;
404
405 us = (_UTF8State *)ps;
406
407 if (us->want != 0) {
408 errno = EINVAL;
409 return ((size_t)-1);
410 }
411
412 s = *src;
413 nbytes = 0;
414
415 if (dst == NULL) {
416 while (nwc-- > 0) {
417 if (0 <= *s && *s < 0x80)
418 /* Fast path for plain ASCII characters. */
419 nb = 1;
420 else if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) ==
421 (size_t)-1)
422 /* Invalid character - wcrtomb() sets errno. */
423 return ((size_t)-1);
424 if (*s == L'\0')
425 return (nbytes + nb - 1);
426 s++;
427 nbytes += nb;
428 }
429 return (nbytes);
430 }
431
432 while (len > 0 && nwc-- > 0) {
433 if (0 <= *s && *s < 0x80) {
434 /* Fast path for plain ASCII characters. */
435 nb = 1;
436 *dst = *s;
437 } else if (len > (size_t)UTF8_MB_CUR_MAX) {
438 /* Enough space to translate in-place. */
439 if ((nb = _UTF8_wcrtomb(dst, *s, ps, loc)) == (size_t)-1) {
440 *src = s;
441 return ((size_t)-1);
442 }
443 } else {
444 /*
445 * May not be enough space; use temp. buffer.
446 */
447 if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == (size_t)-1) {
448 *src = s;
449 return ((size_t)-1);
450 }
451 if (nb > (int)len)
452 /* MB sequence for character won't fit. */
453 break;
454 memcpy(dst, buf, nb);
455 }
456 if (*s == L'\0') {
457 *src = NULL;
458 return (nbytes + nb - 1);
459 }
460 s++;
461 dst += nb;
462 len -= nb;
463 nbytes += nb;
464 }
465 *src = s;
466 return (nbytes);
467 }