]>
Commit | Line | Data |
---|---|---|
9385eb3d | 1 | /*- |
3d9156a7 | 2 | * Copyright (c) 2002-2004 Tim J. Robbins |
9385eb3d A |
3 | * All rights reserved. |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * | |
14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
24 | * SUCH DAMAGE. | |
25 | */ | |
26 | ||
3d9156a7 | 27 | #include <sys/param.h> |
1f2f436a | 28 | __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $"); |
9385eb3d | 29 | |
ad3c9f2a A |
30 | #include "xlocale_private.h" |
31 | ||
3d9156a7 A |
32 | #include <errno.h> |
33 | #include <limits.h> | |
34 | #include <runetype.h> | |
9385eb3d | 35 | #include <stdlib.h> |
3d9156a7 A |
36 | #include <string.h> |
37 | #include <wchar.h> | |
38 | #include "mblocal.h" | |
9385eb3d | 39 | |
6465356a A |
40 | /* |
41 | * 10952550: detect ill-formed UTF-8 | |
42 | * Unicode 6.0, section D92, mandates specific byte sequences for well- | |
43 | * formed UTF-8. UTF-8 sequences are now limited to 4 bytes, while the | |
44 | * FreeBSD code originally handled up to 6. Illegal surrogate code point | |
45 | * sequences are now detected. And while "non-shortest forms" were detected, | |
46 | * this only happened after completing the sequence. Now, all ill-formed | |
47 | * sequences are detected at the earliest point. | |
48 | * | |
49 | * Table 3-7. Well-Formed UTF-8 Byte Sequences | |
50 | * | |
51 | * Code Points 1st 2nd 3rd 4th Byte | |
52 | * U+0000..U+007F 00..7F | |
53 | * U+0080..U+07FF C2..DF 80..BF | |
54 | * U+0800..U+0FFF E0 A0..BF 80..BF | |
55 | * U+1000..U+CFFF E1..EC 80..BF 80..BF | |
56 | * U+D000..U+D7FF ED 80..9F 80..BF | |
57 | * U+E000..U+FFFF EE..EF 80..BF 80..BF | |
58 | * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF | |
59 | * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF | |
60 | * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF | |
61 | * | |
62 | * Note that while any 3rd and 4th byte can be in the range 80..BF, the | |
63 | * second byte is often limited to a smaller range. | |
64 | */ | |
65 | ||
/*
 * Inclusive range of values permitted for the second byte of a multibyte
 * sequence (the "2nd" column of Table 3-7).  A lowerbound of zero is the
 * "no special range" marker: the decoder then only requires the byte to be
 * an ordinary 80..BF continuation byte.
 */
typedef struct {
	unsigned char	lowerbound;
	unsigned char	upperbound;
} SecondByte;

/*
 * The distinct second-byte ranges.  They are read-only lookup values that
 * are only ever copied from, hence const.
 */
static const SecondByte sb_00_00 = { 0x00, 0x00 };	/* no range check */
static const SecondByte sb_80_8F = { 0x80, 0x8F };	/* after lead F4 */
static const SecondByte sb_80_9F = { 0x80, 0x9F };	/* after lead ED */
static const SecondByte sb_80_BF = { 0x80, 0xBF };	/* general case */
static const SecondByte sb_90_BF = { 0x90, 0xBF };	/* after lead F0 */
static const SecondByte sb_A0_BF = { 0xA0, 0xBF };	/* after lead E0 */
76 | ||
77 | #define UTF8_MB_CUR_MAX 4 | |
1f2f436a A |
78 | |
79 | static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, | |
ad3c9f2a A |
80 | size_t, mbstate_t * __restrict, locale_t); |
81 | static int _UTF8_mbsinit(const mbstate_t *, locale_t); | |
1f2f436a A |
82 | static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, |
83 | const char ** __restrict, size_t, size_t, | |
ad3c9f2a | 84 | mbstate_t * __restrict, locale_t); |
1f2f436a | 85 | static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, |
ad3c9f2a | 86 | mbstate_t * __restrict, locale_t); |
1f2f436a | 87 | static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, |
ad3c9f2a | 88 | size_t, size_t, mbstate_t * __restrict, locale_t); |
3d9156a7 A |
89 | |
90 | typedef struct { | |
91 | wchar_t ch; | |
92 | int want; | |
6465356a | 93 | SecondByte sb; |
3d9156a7 | 94 | } _UTF8State; |
9385eb3d | 95 | |
23e20b00 | 96 | int |
ad3c9f2a | 97 | _UTF8_init(struct __xlocale_st_runelocale *xrl) |
9385eb3d A |
98 | { |
99 | ||
ad3c9f2a A |
100 | xrl->__mbrtowc = _UTF8_mbrtowc; |
101 | xrl->__wcrtomb = _UTF8_wcrtomb; | |
102 | xrl->__mbsinit = _UTF8_mbsinit; | |
103 | xrl->__mbsnrtowcs = _UTF8_mbsnrtowcs; | |
104 | xrl->__wcsnrtombs = _UTF8_wcsnrtombs; | |
105 | xrl->__mb_cur_max = UTF8_MB_CUR_MAX; | |
1f2f436a A |
106 | /* |
107 | * UCS-4 encoding used as the internal representation, so | |
108 | * slots 0x0080-0x00FF are occuped and must be excluded | |
109 | * from the single byte ctype by setting the limit. | |
110 | */ | |
ad3c9f2a | 111 | xrl->__mb_sb_limit = 128; |
9385eb3d A |
112 | |
113 | return (0); | |
114 | } | |
115 | ||
1f2f436a | 116 | static int |
ad3c9f2a | 117 | _UTF8_mbsinit(const mbstate_t *ps, locale_t loc) |
3d9156a7 A |
118 | { |
119 | ||
120 | return (ps == NULL || ((const _UTF8State *)ps)->want == 0); | |
121 | } | |
122 | ||
1f2f436a | 123 | static size_t |
3d9156a7 | 124 | _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, |
ad3c9f2a | 125 | mbstate_t * __restrict ps, locale_t loc) |
9385eb3d | 126 | { |
3d9156a7 A |
127 | _UTF8State *us; |
128 | int ch, i, mask, want; | |
6465356a A |
129 | wchar_t wch; |
130 | SecondByte sb; | |
3d9156a7 A |
131 | |
132 | us = (_UTF8State *)ps; | |
9385eb3d | 133 | |
6465356a | 134 | if (us->want < 0 || us->want > UTF8_MB_CUR_MAX) { |
3d9156a7 A |
135 | errno = EINVAL; |
136 | return ((size_t)-1); | |
9385eb3d A |
137 | } |
138 | ||
3d9156a7 A |
139 | if (s == NULL) { |
140 | s = ""; | |
141 | n = 1; | |
142 | pwc = NULL; | |
9385eb3d A |
143 | } |
144 | ||
3d9156a7 A |
145 | if (n == 0) |
146 | /* Incomplete multibyte sequence */ | |
147 | return ((size_t)-2); | |
148 | ||
149 | if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { | |
150 | /* Fast path for plain ASCII characters. */ | |
151 | if (pwc != NULL) | |
152 | *pwc = ch; | |
153 | return (ch != '\0' ? 1 : 0); | |
154 | } | |
155 | ||
156 | if (us->want == 0) { | |
9385eb3d | 157 | /* |
3d9156a7 A |
158 | * Determine the number of octets that make up this character |
159 | * from the first octet, and a mask that extracts the | |
160 | * interesting bits of the first octet. We already know | |
161 | * the character is at least two bytes long. | |
162 | * | |
6465356a A |
163 | * We detect if the first byte is illegal, and set sb to |
164 | * the legal range of the second byte. | |
9385eb3d | 165 | */ |
3d9156a7 A |
166 | ch = (unsigned char)*s; |
167 | if ((ch & 0x80) == 0) { | |
168 | mask = 0x7f; | |
169 | want = 1; | |
6465356a | 170 | sb = sb_00_00; |
3d9156a7 | 171 | } else if ((ch & 0xe0) == 0xc0) { |
6465356a | 172 | if (ch < 0xc2) goto malformed; |
3d9156a7 A |
173 | mask = 0x1f; |
174 | want = 2; | |
6465356a | 175 | sb = sb_80_BF; |
3d9156a7 A |
176 | } else if ((ch & 0xf0) == 0xe0) { |
177 | mask = 0x0f; | |
178 | want = 3; | |
6465356a A |
179 | switch (ch) { |
180 | case 0xe0: | |
181 | sb = sb_A0_BF; | |
182 | break; | |
183 | case 0xed: | |
184 | sb = sb_80_9F; | |
185 | break; | |
186 | default: | |
187 | sb = sb_80_BF; | |
188 | break; | |
189 | } | |
3d9156a7 | 190 | } else if ((ch & 0xf8) == 0xf0) { |
6465356a | 191 | if (ch > 0xf4) goto malformed; |
3d9156a7 A |
192 | mask = 0x07; |
193 | want = 4; | |
6465356a A |
194 | switch (ch) { |
195 | case 0xf0: | |
196 | sb = sb_90_BF; | |
197 | break; | |
198 | case 0xf4: | |
199 | sb = sb_80_8F; | |
200 | break; | |
201 | default: | |
202 | sb = sb_80_BF; | |
203 | break; | |
204 | } | |
3d9156a7 | 205 | } else { |
6465356a | 206 | malformed: |
3d9156a7 A |
207 | /* |
208 | * Malformed input; input is not UTF-8. | |
209 | */ | |
210 | errno = EILSEQ; | |
211 | return ((size_t)-1); | |
212 | } | |
213 | } else { | |
214 | want = us->want; | |
6465356a | 215 | sb = us->sb; |
9385eb3d A |
216 | } |
217 | ||
218 | /* | |
219 | * Decode the octet sequence representing the character in chunks | |
220 | * of 6 bits, most significant first. | |
221 | */ | |
3d9156a7 A |
222 | if (us->want == 0) |
223 | wch = (unsigned char)*s++ & mask; | |
224 | else | |
225 | wch = us->ch; | |
226 | for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { | |
6465356a A |
227 | if (sb.lowerbound) { |
228 | if ((unsigned char)*s < sb.lowerbound || | |
229 | (unsigned char)*s > sb.upperbound) goto malformed; | |
230 | sb = sb_00_00; | |
231 | } else if ((*s & 0xc0) != 0x80) goto malformed; | |
9385eb3d | 232 | wch <<= 6; |
3d9156a7 | 233 | wch |= *s++ & 0x3f; |
9385eb3d | 234 | } |
3d9156a7 A |
235 | if (i < want) { |
236 | /* Incomplete multibyte sequence. */ | |
237 | us->want = want - i; | |
6465356a | 238 | us->sb = sb; |
3d9156a7 A |
239 | us->ch = wch; |
240 | return ((size_t)-2); | |
241 | } | |
3d9156a7 A |
242 | if (pwc != NULL) |
243 | *pwc = wch; | |
244 | us->want = 0; | |
245 | return (wch == L'\0' ? 0 : want); | |
9385eb3d A |
246 | } |
247 | ||
1f2f436a | 248 | static size_t |
3d9156a7 | 249 | _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, |
ad3c9f2a | 250 | size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc) |
9385eb3d | 251 | { |
3d9156a7 A |
252 | _UTF8State *us; |
253 | const char *s; | |
254 | size_t nchr; | |
255 | wchar_t wc; | |
256 | size_t nb; | |
257 | ||
258 | us = (_UTF8State *)ps; | |
259 | ||
260 | s = *src; | |
261 | nchr = 0; | |
262 | ||
263 | if (dst == NULL) { | |
264 | /* | |
265 | * The fast path in the loop below is not safe if an ASCII | |
266 | * character appears as anything but the first byte of a | |
267 | * multibyte sequence. Check now to avoid doing it in the loop. | |
268 | */ | |
269 | if (nms > 0 && us->want > 0 && (signed char)*s > 0) { | |
270 | errno = EILSEQ; | |
271 | return ((size_t)-1); | |
272 | } | |
273 | for (;;) { | |
274 | if (nms > 0 && (signed char)*s > 0) | |
275 | /* | |
276 | * Fast path for plain ASCII characters | |
277 | * excluding NUL. | |
278 | */ | |
279 | nb = 1; | |
ad3c9f2a | 280 | else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps, loc)) == |
3d9156a7 A |
281 | (size_t)-1) |
282 | /* Invalid sequence - mbrtowc() sets errno. */ | |
283 | return ((size_t)-1); | |
284 | else if (nb == 0 || nb == (size_t)-2) | |
285 | return (nchr); | |
286 | s += nb; | |
287 | nms -= nb; | |
288 | nchr++; | |
289 | } | |
290 | /*NOTREACHED*/ | |
291 | } | |
292 | ||
293 | /* | |
294 | * The fast path in the loop below is not safe if an ASCII | |
295 | * character appears as anything but the first byte of a | |
296 | * multibyte sequence. Check now to avoid doing it in the loop. | |
297 | */ | |
298 | if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { | |
299 | errno = EILSEQ; | |
300 | return ((size_t)-1); | |
301 | } | |
302 | while (len-- > 0) { | |
303 | if (nms > 0 && (signed char)*s > 0) { | |
304 | /* | |
305 | * Fast path for plain ASCII characters | |
306 | * excluding NUL. | |
307 | */ | |
308 | *dst = (wchar_t)*s; | |
309 | nb = 1; | |
ad3c9f2a | 310 | } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, loc)) == |
3d9156a7 A |
311 | (size_t)-1) { |
312 | *src = s; | |
313 | return ((size_t)-1); | |
314 | } else if (nb == (size_t)-2) { | |
315 | *src = s + nms; | |
316 | return (nchr); | |
317 | } else if (nb == 0) { | |
318 | *src = NULL; | |
319 | return (nchr); | |
320 | } | |
321 | s += nb; | |
322 | nms -= nb; | |
323 | nchr++; | |
324 | dst++; | |
325 | } | |
326 | *src = s; | |
327 | return (nchr); | |
328 | } | |
329 | ||
1f2f436a | 330 | static size_t |
ad3c9f2a | 331 | _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc) |
3d9156a7 A |
332 | { |
333 | _UTF8State *us; | |
9385eb3d A |
334 | unsigned char lead; |
335 | int i, len; | |
336 | ||
3d9156a7 A |
337 | us = (_UTF8State *)ps; |
338 | ||
339 | if (us->want != 0) { | |
340 | errno = EINVAL; | |
341 | return ((size_t)-1); | |
342 | } | |
343 | ||
344 | if (s == NULL) | |
345 | /* Reset to initial shift state (no-op) */ | |
346 | return (1); | |
347 | ||
348 | if ((wc & ~0x7f) == 0) { | |
349 | /* Fast path for plain ASCII characters. */ | |
350 | *s = (char)wc; | |
351 | return (1); | |
352 | } | |
353 | ||
9385eb3d A |
354 | /* |
355 | * Determine the number of octets needed to represent this character. | |
356 | * We always output the shortest sequence possible. Also specify the | |
357 | * first few bits of the first octet, which contains the information | |
358 | * about the sequence length. | |
359 | */ | |
3d9156a7 | 360 | if ((wc & ~0x7f) == 0) { |
9385eb3d A |
361 | lead = 0; |
362 | len = 1; | |
3d9156a7 | 363 | } else if ((wc & ~0x7ff) == 0) { |
9385eb3d A |
364 | lead = 0xc0; |
365 | len = 2; | |
3d9156a7 | 366 | } else if ((wc & ~0xffff) == 0) { |
6465356a | 367 | if (wc >= 0xd800 && wc <= 0xdfff) goto illegal; |
9385eb3d A |
368 | lead = 0xe0; |
369 | len = 3; | |
3d9156a7 | 370 | } else if ((wc & ~0x1fffff) == 0) { |
6465356a | 371 | if (wc > 0x10ffff) goto illegal; |
9385eb3d A |
372 | lead = 0xf0; |
373 | len = 4; | |
9385eb3d | 374 | } else { |
6465356a | 375 | illegal: |
3d9156a7 A |
376 | errno = EILSEQ; |
377 | return ((size_t)-1); | |
9385eb3d A |
378 | } |
379 | ||
3d9156a7 A |
380 | /* |
381 | * Output the octets representing the character in chunks | |
382 | * of 6 bits, least significant last. The first octet is | |
383 | * a special case because it contains the sequence length | |
384 | * information. | |
385 | */ | |
386 | for (i = len - 1; i > 0; i--) { | |
387 | s[i] = (wc & 0x3f) | 0x80; | |
388 | wc >>= 6; | |
9385eb3d | 389 | } |
3d9156a7 | 390 | *s = (wc & 0xff) | lead; |
9385eb3d A |
391 | |
392 | return (len); | |
393 | } | |
3d9156a7 | 394 | |
1f2f436a | 395 | static size_t |
3d9156a7 | 396 | _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, |
ad3c9f2a | 397 | size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc) |
3d9156a7 A |
398 | { |
399 | _UTF8State *us; | |
400 | char buf[MB_LEN_MAX]; | |
401 | const wchar_t *s; | |
402 | size_t nbytes; | |
403 | size_t nb; | |
404 | ||
405 | us = (_UTF8State *)ps; | |
406 | ||
407 | if (us->want != 0) { | |
408 | errno = EINVAL; | |
409 | return ((size_t)-1); | |
410 | } | |
411 | ||
412 | s = *src; | |
413 | nbytes = 0; | |
414 | ||
415 | if (dst == NULL) { | |
416 | while (nwc-- > 0) { | |
417 | if (0 <= *s && *s < 0x80) | |
418 | /* Fast path for plain ASCII characters. */ | |
419 | nb = 1; | |
ad3c9f2a | 420 | else if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == |
3d9156a7 A |
421 | (size_t)-1) |
422 | /* Invalid character - wcrtomb() sets errno. */ | |
423 | return ((size_t)-1); | |
424 | if (*s == L'\0') | |
425 | return (nbytes + nb - 1); | |
426 | s++; | |
427 | nbytes += nb; | |
428 | } | |
429 | return (nbytes); | |
430 | } | |
431 | ||
432 | while (len > 0 && nwc-- > 0) { | |
433 | if (0 <= *s && *s < 0x80) { | |
434 | /* Fast path for plain ASCII characters. */ | |
435 | nb = 1; | |
436 | *dst = *s; | |
ad3c9f2a | 437 | } else if (len > (size_t)UTF8_MB_CUR_MAX) { |
3d9156a7 | 438 | /* Enough space to translate in-place. */ |
ad3c9f2a | 439 | if ((nb = _UTF8_wcrtomb(dst, *s, ps, loc)) == (size_t)-1) { |
3d9156a7 A |
440 | *src = s; |
441 | return ((size_t)-1); | |
442 | } | |
443 | } else { | |
444 | /* | |
445 | * May not be enough space; use temp. buffer. | |
446 | */ | |
ad3c9f2a | 447 | if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == (size_t)-1) { |
3d9156a7 A |
448 | *src = s; |
449 | return ((size_t)-1); | |
450 | } | |
451 | if (nb > (int)len) | |
452 | /* MB sequence for character won't fit. */ | |
453 | break; | |
454 | memcpy(dst, buf, nb); | |
455 | } | |
456 | if (*s == L'\0') { | |
457 | *src = NULL; | |
458 | return (nbytes + nb - 1); | |
459 | } | |
460 | s++; | |
461 | dst += nb; | |
462 | len -= nb; | |
463 | nbytes += nb; | |
464 | } | |
465 | *src = s; | |
466 | return (nbytes); | |
467 | } |