]>
Commit | Line | Data |
---|---|---|
9385eb3d | 1 | /*- |
3d9156a7 | 2 | * Copyright (c) 2002-2004 Tim J. Robbins |
9385eb3d A |
3 | * All rights reserved. |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * | |
14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
24 | * SUCH DAMAGE. | |
25 | */ | |
26 | ||
3d9156a7 | 27 | #include <sys/param.h> |
1f2f436a | 28 | __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $"); |
9385eb3d | 29 | |
ad3c9f2a A |
30 | #include "xlocale_private.h" |
31 | ||
3d9156a7 A |
32 | #include <errno.h> |
33 | #include <limits.h> | |
34 | #include <runetype.h> | |
9385eb3d | 35 | #include <stdlib.h> |
3d9156a7 A |
36 | #include <string.h> |
37 | #include <wchar.h> | |
38 | #include "mblocal.h" | |
9385eb3d | 39 | |
/*
 * Longest multibyte sequence, in bytes, that this codec reads or writes
 * for a single character; published to the locale as its MB_CUR_MAX.
 */
#define	UTF8_MB_CUR_MAX		6
1f2f436a A |
41 | |
42 | static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, | |
ad3c9f2a A |
43 | size_t, mbstate_t * __restrict, locale_t); |
44 | static int _UTF8_mbsinit(const mbstate_t *, locale_t); | |
1f2f436a A |
45 | static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, |
46 | const char ** __restrict, size_t, size_t, | |
ad3c9f2a | 47 | mbstate_t * __restrict, locale_t); |
1f2f436a | 48 | static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, |
ad3c9f2a | 49 | mbstate_t * __restrict, locale_t); |
1f2f436a | 50 | static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, |
ad3c9f2a | 51 | size_t, size_t, mbstate_t * __restrict, locale_t); |
3d9156a7 A |
52 | |
/*
 * Conversion state kept inside the caller's mbstate_t between calls
 * when a multibyte sequence is split across input buffers.
 */
typedef struct {
	wchar_t	ch;		/* Bits of the character decoded so far. */
	int	want;		/* Bytes still needed; 0 means initial state. */
	wchar_t	lbound;		/* Smallest value legal for this sequence length. */
} _UTF8State;
9385eb3d | 58 | |
ad3c9f2a A |
59 | __private_extern__ int |
60 | _UTF8_init(struct __xlocale_st_runelocale *xrl) | |
9385eb3d A |
61 | { |
62 | ||
ad3c9f2a A |
63 | xrl->__mbrtowc = _UTF8_mbrtowc; |
64 | xrl->__wcrtomb = _UTF8_wcrtomb; | |
65 | xrl->__mbsinit = _UTF8_mbsinit; | |
66 | xrl->__mbsnrtowcs = _UTF8_mbsnrtowcs; | |
67 | xrl->__wcsnrtombs = _UTF8_wcsnrtombs; | |
68 | xrl->__mb_cur_max = UTF8_MB_CUR_MAX; | |
1f2f436a A |
69 | /* |
70 | * UCS-4 encoding used as the internal representation, so | |
71 | * slots 0x0080-0x00FF are occuped and must be excluded | |
72 | * from the single byte ctype by setting the limit. | |
73 | */ | |
ad3c9f2a | 74 | xrl->__mb_sb_limit = 128; |
9385eb3d A |
75 | |
76 | return (0); | |
77 | } | |
78 | ||
1f2f436a | 79 | static int |
ad3c9f2a | 80 | _UTF8_mbsinit(const mbstate_t *ps, locale_t loc) |
3d9156a7 A |
81 | { |
82 | ||
83 | return (ps == NULL || ((const _UTF8State *)ps)->want == 0); | |
84 | } | |
85 | ||
1f2f436a | 86 | static size_t |
3d9156a7 | 87 | _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, |
ad3c9f2a | 88 | mbstate_t * __restrict ps, locale_t loc) |
9385eb3d | 89 | { |
3d9156a7 A |
90 | _UTF8State *us; |
91 | int ch, i, mask, want; | |
92 | wchar_t lbound, wch; | |
93 | ||
94 | us = (_UTF8State *)ps; | |
9385eb3d | 95 | |
3d9156a7 A |
96 | if (us->want < 0 || us->want > 6) { |
97 | errno = EINVAL; | |
98 | return ((size_t)-1); | |
9385eb3d A |
99 | } |
100 | ||
3d9156a7 A |
101 | if (s == NULL) { |
102 | s = ""; | |
103 | n = 1; | |
104 | pwc = NULL; | |
9385eb3d A |
105 | } |
106 | ||
3d9156a7 A |
107 | if (n == 0) |
108 | /* Incomplete multibyte sequence */ | |
109 | return ((size_t)-2); | |
110 | ||
111 | if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { | |
112 | /* Fast path for plain ASCII characters. */ | |
113 | if (pwc != NULL) | |
114 | *pwc = ch; | |
115 | return (ch != '\0' ? 1 : 0); | |
116 | } | |
117 | ||
118 | if (us->want == 0) { | |
9385eb3d | 119 | /* |
3d9156a7 A |
120 | * Determine the number of octets that make up this character |
121 | * from the first octet, and a mask that extracts the | |
122 | * interesting bits of the first octet. We already know | |
123 | * the character is at least two bytes long. | |
124 | * | |
125 | * We also specify a lower bound for the character code to | |
126 | * detect redundant, non-"shortest form" encodings. For | |
127 | * example, the sequence C0 80 is _not_ a legal representation | |
128 | * of the null character. This enforces a 1-to-1 mapping | |
129 | * between character codes and their multibyte representations. | |
9385eb3d | 130 | */ |
3d9156a7 A |
131 | ch = (unsigned char)*s; |
132 | if ((ch & 0x80) == 0) { | |
133 | mask = 0x7f; | |
134 | want = 1; | |
135 | lbound = 0; | |
136 | } else if ((ch & 0xe0) == 0xc0) { | |
137 | mask = 0x1f; | |
138 | want = 2; | |
139 | lbound = 0x80; | |
140 | } else if ((ch & 0xf0) == 0xe0) { | |
141 | mask = 0x0f; | |
142 | want = 3; | |
143 | lbound = 0x800; | |
144 | } else if ((ch & 0xf8) == 0xf0) { | |
145 | mask = 0x07; | |
146 | want = 4; | |
147 | lbound = 0x10000; | |
148 | } else if ((ch & 0xfc) == 0xf8) { | |
149 | mask = 0x03; | |
150 | want = 5; | |
151 | lbound = 0x200000; | |
1f2f436a | 152 | } else if ((ch & 0xfe) == 0xfc) { |
3d9156a7 A |
153 | mask = 0x01; |
154 | want = 6; | |
155 | lbound = 0x4000000; | |
156 | } else { | |
157 | /* | |
158 | * Malformed input; input is not UTF-8. | |
159 | */ | |
160 | errno = EILSEQ; | |
161 | return ((size_t)-1); | |
162 | } | |
163 | } else { | |
164 | want = us->want; | |
165 | lbound = us->lbound; | |
9385eb3d A |
166 | } |
167 | ||
168 | /* | |
169 | * Decode the octet sequence representing the character in chunks | |
170 | * of 6 bits, most significant first. | |
171 | */ | |
3d9156a7 A |
172 | if (us->want == 0) |
173 | wch = (unsigned char)*s++ & mask; | |
174 | else | |
175 | wch = us->ch; | |
176 | for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { | |
177 | if ((*s & 0xc0) != 0x80) { | |
9385eb3d A |
178 | /* |
179 | * Malformed input; bad characters in the middle | |
180 | * of a character. | |
181 | */ | |
3d9156a7 A |
182 | errno = EILSEQ; |
183 | return ((size_t)-1); | |
9385eb3d A |
184 | } |
185 | wch <<= 6; | |
3d9156a7 | 186 | wch |= *s++ & 0x3f; |
9385eb3d | 187 | } |
3d9156a7 A |
188 | if (i < want) { |
189 | /* Incomplete multibyte sequence. */ | |
190 | us->want = want - i; | |
191 | us->lbound = lbound; | |
192 | us->ch = wch; | |
193 | return ((size_t)-2); | |
194 | } | |
195 | if (wch < lbound) { | |
9385eb3d A |
196 | /* |
197 | * Malformed input; redundant encoding. | |
198 | */ | |
3d9156a7 A |
199 | errno = EILSEQ; |
200 | return ((size_t)-1); | |
201 | } | |
202 | if (pwc != NULL) | |
203 | *pwc = wch; | |
204 | us->want = 0; | |
205 | return (wch == L'\0' ? 0 : want); | |
9385eb3d A |
206 | } |
207 | ||
1f2f436a | 208 | static size_t |
3d9156a7 | 209 | _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, |
ad3c9f2a | 210 | size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc) |
9385eb3d | 211 | { |
3d9156a7 A |
212 | _UTF8State *us; |
213 | const char *s; | |
214 | size_t nchr; | |
215 | wchar_t wc; | |
216 | size_t nb; | |
217 | ||
218 | us = (_UTF8State *)ps; | |
219 | ||
220 | s = *src; | |
221 | nchr = 0; | |
222 | ||
223 | if (dst == NULL) { | |
224 | /* | |
225 | * The fast path in the loop below is not safe if an ASCII | |
226 | * character appears as anything but the first byte of a | |
227 | * multibyte sequence. Check now to avoid doing it in the loop. | |
228 | */ | |
229 | if (nms > 0 && us->want > 0 && (signed char)*s > 0) { | |
230 | errno = EILSEQ; | |
231 | return ((size_t)-1); | |
232 | } | |
233 | for (;;) { | |
234 | if (nms > 0 && (signed char)*s > 0) | |
235 | /* | |
236 | * Fast path for plain ASCII characters | |
237 | * excluding NUL. | |
238 | */ | |
239 | nb = 1; | |
ad3c9f2a | 240 | else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps, loc)) == |
3d9156a7 A |
241 | (size_t)-1) |
242 | /* Invalid sequence - mbrtowc() sets errno. */ | |
243 | return ((size_t)-1); | |
244 | else if (nb == 0 || nb == (size_t)-2) | |
245 | return (nchr); | |
246 | s += nb; | |
247 | nms -= nb; | |
248 | nchr++; | |
249 | } | |
250 | /*NOTREACHED*/ | |
251 | } | |
252 | ||
253 | /* | |
254 | * The fast path in the loop below is not safe if an ASCII | |
255 | * character appears as anything but the first byte of a | |
256 | * multibyte sequence. Check now to avoid doing it in the loop. | |
257 | */ | |
258 | if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { | |
259 | errno = EILSEQ; | |
260 | return ((size_t)-1); | |
261 | } | |
262 | while (len-- > 0) { | |
263 | if (nms > 0 && (signed char)*s > 0) { | |
264 | /* | |
265 | * Fast path for plain ASCII characters | |
266 | * excluding NUL. | |
267 | */ | |
268 | *dst = (wchar_t)*s; | |
269 | nb = 1; | |
ad3c9f2a | 270 | } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, loc)) == |
3d9156a7 A |
271 | (size_t)-1) { |
272 | *src = s; | |
273 | return ((size_t)-1); | |
274 | } else if (nb == (size_t)-2) { | |
275 | *src = s + nms; | |
276 | return (nchr); | |
277 | } else if (nb == 0) { | |
278 | *src = NULL; | |
279 | return (nchr); | |
280 | } | |
281 | s += nb; | |
282 | nms -= nb; | |
283 | nchr++; | |
284 | dst++; | |
285 | } | |
286 | *src = s; | |
287 | return (nchr); | |
288 | } | |
289 | ||
1f2f436a | 290 | static size_t |
ad3c9f2a | 291 | _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc) |
3d9156a7 A |
292 | { |
293 | _UTF8State *us; | |
9385eb3d A |
294 | unsigned char lead; |
295 | int i, len; | |
296 | ||
3d9156a7 A |
297 | us = (_UTF8State *)ps; |
298 | ||
299 | if (us->want != 0) { | |
300 | errno = EINVAL; | |
301 | return ((size_t)-1); | |
302 | } | |
303 | ||
304 | if (s == NULL) | |
305 | /* Reset to initial shift state (no-op) */ | |
306 | return (1); | |
307 | ||
308 | if ((wc & ~0x7f) == 0) { | |
309 | /* Fast path for plain ASCII characters. */ | |
310 | *s = (char)wc; | |
311 | return (1); | |
312 | } | |
313 | ||
9385eb3d A |
314 | /* |
315 | * Determine the number of octets needed to represent this character. | |
316 | * We always output the shortest sequence possible. Also specify the | |
317 | * first few bits of the first octet, which contains the information | |
318 | * about the sequence length. | |
319 | */ | |
3d9156a7 | 320 | if ((wc & ~0x7f) == 0) { |
9385eb3d A |
321 | lead = 0; |
322 | len = 1; | |
3d9156a7 | 323 | } else if ((wc & ~0x7ff) == 0) { |
9385eb3d A |
324 | lead = 0xc0; |
325 | len = 2; | |
3d9156a7 | 326 | } else if ((wc & ~0xffff) == 0) { |
9385eb3d A |
327 | lead = 0xe0; |
328 | len = 3; | |
3d9156a7 | 329 | } else if ((wc & ~0x1fffff) == 0) { |
9385eb3d A |
330 | lead = 0xf0; |
331 | len = 4; | |
3d9156a7 | 332 | } else if ((wc & ~0x3ffffff) == 0) { |
9385eb3d A |
333 | lead = 0xf8; |
334 | len = 5; | |
3d9156a7 | 335 | } else if ((wc & ~0x7fffffff) == 0) { |
9385eb3d A |
336 | lead = 0xfc; |
337 | len = 6; | |
338 | } else { | |
3d9156a7 A |
339 | errno = EILSEQ; |
340 | return ((size_t)-1); | |
9385eb3d A |
341 | } |
342 | ||
3d9156a7 A |
343 | /* |
344 | * Output the octets representing the character in chunks | |
345 | * of 6 bits, least significant last. The first octet is | |
346 | * a special case because it contains the sequence length | |
347 | * information. | |
348 | */ | |
349 | for (i = len - 1; i > 0; i--) { | |
350 | s[i] = (wc & 0x3f) | 0x80; | |
351 | wc >>= 6; | |
9385eb3d | 352 | } |
3d9156a7 | 353 | *s = (wc & 0xff) | lead; |
9385eb3d A |
354 | |
355 | return (len); | |
356 | } | |
3d9156a7 | 357 | |
1f2f436a | 358 | static size_t |
3d9156a7 | 359 | _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, |
ad3c9f2a | 360 | size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc) |
3d9156a7 A |
361 | { |
362 | _UTF8State *us; | |
363 | char buf[MB_LEN_MAX]; | |
364 | const wchar_t *s; | |
365 | size_t nbytes; | |
366 | size_t nb; | |
367 | ||
368 | us = (_UTF8State *)ps; | |
369 | ||
370 | if (us->want != 0) { | |
371 | errno = EINVAL; | |
372 | return ((size_t)-1); | |
373 | } | |
374 | ||
375 | s = *src; | |
376 | nbytes = 0; | |
377 | ||
378 | if (dst == NULL) { | |
379 | while (nwc-- > 0) { | |
380 | if (0 <= *s && *s < 0x80) | |
381 | /* Fast path for plain ASCII characters. */ | |
382 | nb = 1; | |
ad3c9f2a | 383 | else if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == |
3d9156a7 A |
384 | (size_t)-1) |
385 | /* Invalid character - wcrtomb() sets errno. */ | |
386 | return ((size_t)-1); | |
387 | if (*s == L'\0') | |
388 | return (nbytes + nb - 1); | |
389 | s++; | |
390 | nbytes += nb; | |
391 | } | |
392 | return (nbytes); | |
393 | } | |
394 | ||
395 | while (len > 0 && nwc-- > 0) { | |
396 | if (0 <= *s && *s < 0x80) { | |
397 | /* Fast path for plain ASCII characters. */ | |
398 | nb = 1; | |
399 | *dst = *s; | |
ad3c9f2a | 400 | } else if (len > (size_t)UTF8_MB_CUR_MAX) { |
3d9156a7 | 401 | /* Enough space to translate in-place. */ |
ad3c9f2a | 402 | if ((nb = _UTF8_wcrtomb(dst, *s, ps, loc)) == (size_t)-1) { |
3d9156a7 A |
403 | *src = s; |
404 | return ((size_t)-1); | |
405 | } | |
406 | } else { | |
407 | /* | |
408 | * May not be enough space; use temp. buffer. | |
409 | */ | |
ad3c9f2a | 410 | if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == (size_t)-1) { |
3d9156a7 A |
411 | *src = s; |
412 | return ((size_t)-1); | |
413 | } | |
414 | if (nb > (int)len) | |
415 | /* MB sequence for character won't fit. */ | |
416 | break; | |
417 | memcpy(dst, buf, nb); | |
418 | } | |
419 | if (*s == L'\0') { | |
420 | *src = NULL; | |
421 | return (nbytes + nb - 1); | |
422 | } | |
423 | s++; | |
424 | dst += nb; | |
425 | len -= nb; | |
426 | nbytes += nb; | |
427 | } | |
428 | *src = s; | |
429 | return (nbytes); | |
430 | } |