]>
Commit | Line | Data |
---|---|---|
1 | /*- | |
2 | * Copyright (c) 2002-2004 Tim J. Robbins | |
3 | * All rights reserved. | |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * | |
14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
24 | * SUCH DAMAGE. | |
25 | */ | |
26 | ||
27 | #include <sys/param.h> | |
28 | __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $"); | |
29 | ||
30 | #include "xlocale_private.h" | |
31 | ||
32 | #include <errno.h> | |
33 | #include <limits.h> | |
34 | #include <runetype.h> | |
35 | #include <stdlib.h> | |
36 | #include <string.h> | |
37 | #include <wchar.h> | |
38 | #include "mblocal.h" | |
39 | ||
40 | /* | |
41 | * 10952550: detect ill-formed UTF-8 | |
42 | * Unicode 6.0, section D92, mandates specific byte sequences for well- | |
43 | * formed UTF-8. UTF-8 sequences are now limited to 4 bytes, while the | |
44 | * FreeBSD code originally handled up to 6. Illegal surrogate code point | |
45 | * sequences are now detected. And while "non-shortest forms" were detected, | |
46 | * this only happened after completing the sequence. Now, all ill-formed | |
47 | * sequences are detected at the earliest point. | |
48 | * | |
49 | * Table 3-7. Well-Formed UTF-8 Byte Sequences | |
50 | * | |
51 | * Code Points 1st 2nd 3rd 4th Byte | |
52 | * U+0000..U+007F 00..7F | |
53 | * U+0080..U+07FF C2..DF 80..BF | |
54 | * U+0800..U+0FFF E0 A0..BF 80..BF | |
55 | * U+1000..U+CFFF E1..EC 80..BF 80..BF | |
56 | * U+D000..U+D7FF ED 80..9F 80..BF | |
57 | * U+E000..U+FFFF EE..EF 80..BF 80..BF | |
58 | * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF | |
59 | * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF | |
60 | * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF | |
61 | * | |
62 | * Note that while any 3rd and 4th byte can be in the range 80..BF, the | |
63 | * second byte is often limited to a smaller range. | |
64 | */ | |
65 | ||
/*
 * Inclusive range of values accepted for the byte that immediately follows
 * the lead byte of a multi-byte sequence.  A lowerbound of zero means "no
 * special restriction pending"; the decoder then applies the generic
 * 80..BF continuation-byte test instead.
 */
typedef struct {
	unsigned char lowerbound;
	unsigned char upperbound;
} SecondByte;

/*
 * Read-only range tables.  They are only ever copied from, so they are
 * declared const.
 */
static const SecondByte sb_00_00 = {0x00, 0x00};	/* no restriction pending */
static const SecondByte sb_80_8F = {0x80, 0x8F};	/* after lead byte F4 */
static const SecondByte sb_80_9F = {0x80, 0x9F};	/* after lead byte ED */
static const SecondByte sb_80_BF = {0x80, 0xBF};	/* any other lead byte */
static const SecondByte sb_90_BF = {0x90, 0xBF};	/* after lead byte F0 */
static const SecondByte sb_A0_BF = {0xA0, 0xBF};	/* after lead byte E0 */
76 | ||
/*
 * Largest number of bytes in one encoded character handled by this file;
 * also the value installed as the locale's __mb_cur_max.
 */
#define UTF8_MB_CUR_MAX 4
78 | ||
79 | static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, | |
80 | size_t, mbstate_t * __restrict, locale_t); | |
81 | static int _UTF8_mbsinit(const mbstate_t *, locale_t); | |
82 | static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, | |
83 | const char ** __restrict, size_t, size_t, | |
84 | mbstate_t * __restrict, locale_t); | |
85 | static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, | |
86 | mbstate_t * __restrict, locale_t); | |
87 | static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, | |
88 | size_t, size_t, mbstate_t * __restrict, locale_t); | |
89 | ||
/*
 * Conversion state kept inside the caller's mbstate_t between calls.
 * The functions in this file access it by casting the mbstate_t pointer,
 * so the member order must not change.
 */
typedef struct {
	wchar_t ch;		/* bits of the code point accumulated so far */
	int want;		/* bytes still needed; 0 means initial state */
	SecondByte sb;		/* range still pending for the second byte */
} _UTF8State;
95 | ||
96 | int | |
97 | _UTF8_init(struct __xlocale_st_runelocale *xrl) | |
98 | { | |
99 | ||
100 | xrl->__mbrtowc = _UTF8_mbrtowc; | |
101 | xrl->__wcrtomb = _UTF8_wcrtomb; | |
102 | xrl->__mbsinit = _UTF8_mbsinit; | |
103 | xrl->__mbsnrtowcs = _UTF8_mbsnrtowcs; | |
104 | xrl->__wcsnrtombs = _UTF8_wcsnrtombs; | |
105 | xrl->__mb_cur_max = UTF8_MB_CUR_MAX; | |
106 | /* | |
107 | * UCS-4 encoding used as the internal representation, so | |
108 | * slots 0x0080-0x00FF are occuped and must be excluded | |
109 | * from the single byte ctype by setting the limit. | |
110 | */ | |
111 | xrl->__mb_sb_limit = 128; | |
112 | ||
113 | return (0); | |
114 | } | |
115 | ||
116 | static int | |
117 | _UTF8_mbsinit(const mbstate_t *ps, locale_t loc) | |
118 | { | |
119 | ||
120 | return (ps == NULL || ((const _UTF8State *)ps)->want == 0); | |
121 | } | |
122 | ||
123 | static size_t | |
124 | _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, | |
125 | mbstate_t * __restrict ps, locale_t loc) | |
126 | { | |
127 | _UTF8State *us; | |
128 | int ch, i, mask, want; | |
129 | wchar_t wch; | |
130 | SecondByte sb; | |
131 | ||
132 | us = (_UTF8State *)ps; | |
133 | ||
134 | if (us->want < 0 || us->want > UTF8_MB_CUR_MAX) { | |
135 | errno = EINVAL; | |
136 | return ((size_t)-1); | |
137 | } | |
138 | ||
139 | if (s == NULL) { | |
140 | s = ""; | |
141 | n = 1; | |
142 | pwc = NULL; | |
143 | } | |
144 | ||
145 | if (n == 0) | |
146 | /* Incomplete multibyte sequence */ | |
147 | return ((size_t)-2); | |
148 | ||
149 | if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { | |
150 | /* Fast path for plain ASCII characters. */ | |
151 | if (pwc != NULL) | |
152 | *pwc = ch; | |
153 | return (ch != '\0' ? 1 : 0); | |
154 | } | |
155 | ||
156 | if (us->want == 0) { | |
157 | /* | |
158 | * Determine the number of octets that make up this character | |
159 | * from the first octet, and a mask that extracts the | |
160 | * interesting bits of the first octet. We already know | |
161 | * the character is at least two bytes long. | |
162 | * | |
163 | * We detect if the first byte is illegal, and set sb to | |
164 | * the legal range of the second byte. | |
165 | */ | |
166 | ch = (unsigned char)*s; | |
167 | if ((ch & 0x80) == 0) { | |
168 | mask = 0x7f; | |
169 | want = 1; | |
170 | sb = sb_00_00; | |
171 | } else if ((ch & 0xe0) == 0xc0) { | |
172 | if (ch < 0xc2) goto malformed; | |
173 | mask = 0x1f; | |
174 | want = 2; | |
175 | sb = sb_80_BF; | |
176 | } else if ((ch & 0xf0) == 0xe0) { | |
177 | mask = 0x0f; | |
178 | want = 3; | |
179 | switch (ch) { | |
180 | case 0xe0: | |
181 | sb = sb_A0_BF; | |
182 | break; | |
183 | case 0xed: | |
184 | sb = sb_80_9F; | |
185 | break; | |
186 | default: | |
187 | sb = sb_80_BF; | |
188 | break; | |
189 | } | |
190 | } else if ((ch & 0xf8) == 0xf0) { | |
191 | if (ch > 0xf4) goto malformed; | |
192 | mask = 0x07; | |
193 | want = 4; | |
194 | switch (ch) { | |
195 | case 0xf0: | |
196 | sb = sb_90_BF; | |
197 | break; | |
198 | case 0xf4: | |
199 | sb = sb_80_8F; | |
200 | break; | |
201 | default: | |
202 | sb = sb_80_BF; | |
203 | break; | |
204 | } | |
205 | } else { | |
206 | malformed: | |
207 | /* | |
208 | * Malformed input; input is not UTF-8. | |
209 | */ | |
210 | errno = EILSEQ; | |
211 | return ((size_t)-1); | |
212 | } | |
213 | } else { | |
214 | want = us->want; | |
215 | sb = us->sb; | |
216 | } | |
217 | ||
218 | /* | |
219 | * Decode the octet sequence representing the character in chunks | |
220 | * of 6 bits, most significant first. | |
221 | */ | |
222 | if (us->want == 0) | |
223 | wch = (unsigned char)*s++ & mask; | |
224 | else | |
225 | wch = us->ch; | |
226 | for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { | |
227 | if (sb.lowerbound) { | |
228 | if ((unsigned char)*s < sb.lowerbound || | |
229 | (unsigned char)*s > sb.upperbound) goto malformed; | |
230 | sb = sb_00_00; | |
231 | } else if ((*s & 0xc0) != 0x80) goto malformed; | |
232 | wch <<= 6; | |
233 | wch |= *s++ & 0x3f; | |
234 | } | |
235 | if (i < want) { | |
236 | /* Incomplete multibyte sequence. */ | |
237 | us->want = want - i; | |
238 | us->sb = sb; | |
239 | us->ch = wch; | |
240 | return ((size_t)-2); | |
241 | } | |
242 | if (pwc != NULL) | |
243 | *pwc = wch; | |
244 | us->want = 0; | |
245 | return (wch == L'\0' ? 0 : want); | |
246 | } | |
247 | ||
248 | static size_t | |
249 | _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, | |
250 | size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc) | |
251 | { | |
252 | _UTF8State *us; | |
253 | const char *s; | |
254 | size_t nchr; | |
255 | wchar_t wc; | |
256 | size_t nb; | |
257 | ||
258 | us = (_UTF8State *)ps; | |
259 | ||
260 | s = *src; | |
261 | nchr = 0; | |
262 | ||
263 | if (dst == NULL) { | |
264 | /* | |
265 | * The fast path in the loop below is not safe if an ASCII | |
266 | * character appears as anything but the first byte of a | |
267 | * multibyte sequence. Check now to avoid doing it in the loop. | |
268 | */ | |
269 | if (nms > 0 && us->want > 0 && (signed char)*s > 0) { | |
270 | errno = EILSEQ; | |
271 | return ((size_t)-1); | |
272 | } | |
273 | for (;;) { | |
274 | if (nms > 0 && (signed char)*s > 0) | |
275 | /* | |
276 | * Fast path for plain ASCII characters | |
277 | * excluding NUL. | |
278 | */ | |
279 | nb = 1; | |
280 | else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps, loc)) == | |
281 | (size_t)-1) | |
282 | /* Invalid sequence - mbrtowc() sets errno. */ | |
283 | return ((size_t)-1); | |
284 | else if (nb == 0 || nb == (size_t)-2) | |
285 | return (nchr); | |
286 | s += nb; | |
287 | nms -= nb; | |
288 | nchr++; | |
289 | } | |
290 | /*NOTREACHED*/ | |
291 | } | |
292 | ||
293 | /* | |
294 | * The fast path in the loop below is not safe if an ASCII | |
295 | * character appears as anything but the first byte of a | |
296 | * multibyte sequence. Check now to avoid doing it in the loop. | |
297 | */ | |
298 | if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { | |
299 | errno = EILSEQ; | |
300 | return ((size_t)-1); | |
301 | } | |
302 | while (len-- > 0) { | |
303 | if (nms > 0 && (signed char)*s > 0) { | |
304 | /* | |
305 | * Fast path for plain ASCII characters | |
306 | * excluding NUL. | |
307 | */ | |
308 | *dst = (wchar_t)*s; | |
309 | nb = 1; | |
310 | } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, loc)) == | |
311 | (size_t)-1) { | |
312 | *src = s; | |
313 | return ((size_t)-1); | |
314 | } else if (nb == (size_t)-2) { | |
315 | *src = s + nms; | |
316 | return (nchr); | |
317 | } else if (nb == 0) { | |
318 | *src = NULL; | |
319 | return (nchr); | |
320 | } | |
321 | s += nb; | |
322 | nms -= nb; | |
323 | nchr++; | |
324 | dst++; | |
325 | } | |
326 | *src = s; | |
327 | return (nchr); | |
328 | } | |
329 | ||
330 | static size_t | |
331 | _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc) | |
332 | { | |
333 | _UTF8State *us; | |
334 | unsigned char lead; | |
335 | int i, len; | |
336 | ||
337 | us = (_UTF8State *)ps; | |
338 | ||
339 | if (us->want != 0) { | |
340 | errno = EINVAL; | |
341 | return ((size_t)-1); | |
342 | } | |
343 | ||
344 | if (s == NULL) | |
345 | /* Reset to initial shift state (no-op) */ | |
346 | return (1); | |
347 | ||
348 | if ((wc & ~0x7f) == 0) { | |
349 | /* Fast path for plain ASCII characters. */ | |
350 | *s = (char)wc; | |
351 | return (1); | |
352 | } | |
353 | ||
354 | /* | |
355 | * Determine the number of octets needed to represent this character. | |
356 | * We always output the shortest sequence possible. Also specify the | |
357 | * first few bits of the first octet, which contains the information | |
358 | * about the sequence length. | |
359 | */ | |
360 | if ((wc & ~0x7f) == 0) { | |
361 | lead = 0; | |
362 | len = 1; | |
363 | } else if ((wc & ~0x7ff) == 0) { | |
364 | lead = 0xc0; | |
365 | len = 2; | |
366 | } else if ((wc & ~0xffff) == 0) { | |
367 | if (wc >= 0xd800 && wc <= 0xdfff) goto illegal; | |
368 | lead = 0xe0; | |
369 | len = 3; | |
370 | } else if ((wc & ~0x1fffff) == 0) { | |
371 | if (wc > 0x10ffff) goto illegal; | |
372 | lead = 0xf0; | |
373 | len = 4; | |
374 | } else { | |
375 | illegal: | |
376 | errno = EILSEQ; | |
377 | return ((size_t)-1); | |
378 | } | |
379 | ||
380 | /* | |
381 | * Output the octets representing the character in chunks | |
382 | * of 6 bits, least significant last. The first octet is | |
383 | * a special case because it contains the sequence length | |
384 | * information. | |
385 | */ | |
386 | for (i = len - 1; i > 0; i--) { | |
387 | s[i] = (wc & 0x3f) | 0x80; | |
388 | wc >>= 6; | |
389 | } | |
390 | *s = (wc & 0xff) | lead; | |
391 | ||
392 | return (len); | |
393 | } | |
394 | ||
395 | static size_t | |
396 | _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, | |
397 | size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc) | |
398 | { | |
399 | _UTF8State *us; | |
400 | char buf[MB_LEN_MAX]; | |
401 | const wchar_t *s; | |
402 | size_t nbytes; | |
403 | size_t nb; | |
404 | ||
405 | us = (_UTF8State *)ps; | |
406 | ||
407 | if (us->want != 0) { | |
408 | errno = EINVAL; | |
409 | return ((size_t)-1); | |
410 | } | |
411 | ||
412 | s = *src; | |
413 | nbytes = 0; | |
414 | ||
415 | if (dst == NULL) { | |
416 | while (nwc-- > 0) { | |
417 | if (0 <= *s && *s < 0x80) | |
418 | /* Fast path for plain ASCII characters. */ | |
419 | nb = 1; | |
420 | else if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == | |
421 | (size_t)-1) | |
422 | /* Invalid character - wcrtomb() sets errno. */ | |
423 | return ((size_t)-1); | |
424 | if (*s == L'\0') | |
425 | return (nbytes + nb - 1); | |
426 | s++; | |
427 | nbytes += nb; | |
428 | } | |
429 | return (nbytes); | |
430 | } | |
431 | ||
432 | while (len > 0 && nwc-- > 0) { | |
433 | if (0 <= *s && *s < 0x80) { | |
434 | /* Fast path for plain ASCII characters. */ | |
435 | nb = 1; | |
436 | *dst = *s; | |
437 | } else if (len > (size_t)UTF8_MB_CUR_MAX) { | |
438 | /* Enough space to translate in-place. */ | |
439 | if ((nb = _UTF8_wcrtomb(dst, *s, ps, loc)) == (size_t)-1) { | |
440 | *src = s; | |
441 | return ((size_t)-1); | |
442 | } | |
443 | } else { | |
444 | /* | |
445 | * May not be enough space; use temp. buffer. | |
446 | */ | |
447 | if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == (size_t)-1) { | |
448 | *src = s; | |
449 | return ((size_t)-1); | |
450 | } | |
451 | if (nb > (int)len) | |
452 | /* MB sequence for character won't fit. */ | |
453 | break; | |
454 | memcpy(dst, buf, nb); | |
455 | } | |
456 | if (*s == L'\0') { | |
457 | *src = NULL; | |
458 | return (nbytes + nb - 1); | |
459 | } | |
460 | s++; | |
461 | dst += nb; | |
462 | len -= nb; | |
463 | nbytes += nb; | |
464 | } | |
465 | *src = s; | |
466 | return (nbytes); | |
467 | } |