]>
Commit | Line | Data |
---|---|---|
9385eb3d | 1 | /*- |
3d9156a7 | 2 | * Copyright (c) 2002-2004 Tim J. Robbins |
9385eb3d A |
3 | * All rights reserved. |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * | |
14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
24 | * SUCH DAMAGE. | |
25 | */ | |
26 | ||
3d9156a7 | 27 | #include <sys/param.h> |
1f2f436a | 28 | __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $"); |
9385eb3d | 29 | |
3d9156a7 A |
30 | #include <errno.h> |
31 | #include <limits.h> | |
32 | #include <runetype.h> | |
9385eb3d | 33 | #include <stdlib.h> |
3d9156a7 A |
34 | #include <string.h> |
35 | #include <wchar.h> | |
36 | #include "mblocal.h" | |
9385eb3d | 37 | |
1f2f436a A |
/* Upper bound of the single-byte ctype range; set by _UTF8_init(). */
extern int __mb_sb_limit;

/* Forward declarations of the UTF-8 conversion routines defined below. */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static int	_UTF8_mbsinit(const mbstate_t *ps);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
3d9156a7 A |
50 | |
/*
 * Conversion state kept inside the caller's mbstate_t between calls to
 * _UTF8_mbrtowc() when a multibyte character is split across buffers.
 */
typedef struct {
	wchar_t	ch;	/* bits of the character decoded so far */
	int	want;	/* octets still missing; 0 means initial state */
	wchar_t	lbound;	/* smallest code legal for this sequence length */
} _UTF8State;
9385eb3d A |
56 | |
57 | int | |
58 | _UTF8_init(_RuneLocale *rl) | |
59 | { | |
60 | ||
3d9156a7 A |
61 | __mbrtowc = _UTF8_mbrtowc; |
62 | __wcrtomb = _UTF8_wcrtomb; | |
63 | __mbsinit = _UTF8_mbsinit; | |
64 | __mbsnrtowcs = _UTF8_mbsnrtowcs; | |
65 | __wcsnrtombs = _UTF8_wcsnrtombs; | |
9385eb3d A |
66 | _CurrentRuneLocale = rl; |
67 | __mb_cur_max = 6; | |
1f2f436a A |
68 | /* |
69 | * UCS-4 encoding used as the internal representation, so | |
70 | * slots 0x0080-0x00FF are occuped and must be excluded | |
71 | * from the single byte ctype by setting the limit. | |
72 | */ | |
73 | __mb_sb_limit = 128; | |
9385eb3d A |
74 | |
75 | return (0); | |
76 | } | |
77 | ||
1f2f436a | 78 | static int |
3d9156a7 A |
79 | _UTF8_mbsinit(const mbstate_t *ps) |
80 | { | |
81 | ||
82 | return (ps == NULL || ((const _UTF8State *)ps)->want == 0); | |
83 | } | |
84 | ||
1f2f436a | 85 | static size_t |
3d9156a7 A |
86 | _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, |
87 | mbstate_t * __restrict ps) | |
9385eb3d | 88 | { |
3d9156a7 A |
89 | _UTF8State *us; |
90 | int ch, i, mask, want; | |
91 | wchar_t lbound, wch; | |
92 | ||
93 | us = (_UTF8State *)ps; | |
9385eb3d | 94 | |
3d9156a7 A |
95 | if (us->want < 0 || us->want > 6) { |
96 | errno = EINVAL; | |
97 | return ((size_t)-1); | |
9385eb3d A |
98 | } |
99 | ||
3d9156a7 A |
100 | if (s == NULL) { |
101 | s = ""; | |
102 | n = 1; | |
103 | pwc = NULL; | |
9385eb3d A |
104 | } |
105 | ||
3d9156a7 A |
106 | if (n == 0) |
107 | /* Incomplete multibyte sequence */ | |
108 | return ((size_t)-2); | |
109 | ||
110 | if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { | |
111 | /* Fast path for plain ASCII characters. */ | |
112 | if (pwc != NULL) | |
113 | *pwc = ch; | |
114 | return (ch != '\0' ? 1 : 0); | |
115 | } | |
116 | ||
117 | if (us->want == 0) { | |
9385eb3d | 118 | /* |
3d9156a7 A |
119 | * Determine the number of octets that make up this character |
120 | * from the first octet, and a mask that extracts the | |
121 | * interesting bits of the first octet. We already know | |
122 | * the character is at least two bytes long. | |
123 | * | |
124 | * We also specify a lower bound for the character code to | |
125 | * detect redundant, non-"shortest form" encodings. For | |
126 | * example, the sequence C0 80 is _not_ a legal representation | |
127 | * of the null character. This enforces a 1-to-1 mapping | |
128 | * between character codes and their multibyte representations. | |
9385eb3d | 129 | */ |
3d9156a7 A |
130 | ch = (unsigned char)*s; |
131 | if ((ch & 0x80) == 0) { | |
132 | mask = 0x7f; | |
133 | want = 1; | |
134 | lbound = 0; | |
135 | } else if ((ch & 0xe0) == 0xc0) { | |
136 | mask = 0x1f; | |
137 | want = 2; | |
138 | lbound = 0x80; | |
139 | } else if ((ch & 0xf0) == 0xe0) { | |
140 | mask = 0x0f; | |
141 | want = 3; | |
142 | lbound = 0x800; | |
143 | } else if ((ch & 0xf8) == 0xf0) { | |
144 | mask = 0x07; | |
145 | want = 4; | |
146 | lbound = 0x10000; | |
147 | } else if ((ch & 0xfc) == 0xf8) { | |
148 | mask = 0x03; | |
149 | want = 5; | |
150 | lbound = 0x200000; | |
1f2f436a | 151 | } else if ((ch & 0xfe) == 0xfc) { |
3d9156a7 A |
152 | mask = 0x01; |
153 | want = 6; | |
154 | lbound = 0x4000000; | |
155 | } else { | |
156 | /* | |
157 | * Malformed input; input is not UTF-8. | |
158 | */ | |
159 | errno = EILSEQ; | |
160 | return ((size_t)-1); | |
161 | } | |
162 | } else { | |
163 | want = us->want; | |
164 | lbound = us->lbound; | |
9385eb3d A |
165 | } |
166 | ||
167 | /* | |
168 | * Decode the octet sequence representing the character in chunks | |
169 | * of 6 bits, most significant first. | |
170 | */ | |
3d9156a7 A |
171 | if (us->want == 0) |
172 | wch = (unsigned char)*s++ & mask; | |
173 | else | |
174 | wch = us->ch; | |
175 | for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { | |
176 | if ((*s & 0xc0) != 0x80) { | |
9385eb3d A |
177 | /* |
178 | * Malformed input; bad characters in the middle | |
179 | * of a character. | |
180 | */ | |
3d9156a7 A |
181 | errno = EILSEQ; |
182 | return ((size_t)-1); | |
9385eb3d A |
183 | } |
184 | wch <<= 6; | |
3d9156a7 | 185 | wch |= *s++ & 0x3f; |
9385eb3d | 186 | } |
3d9156a7 A |
187 | if (i < want) { |
188 | /* Incomplete multibyte sequence. */ | |
189 | us->want = want - i; | |
190 | us->lbound = lbound; | |
191 | us->ch = wch; | |
192 | return ((size_t)-2); | |
193 | } | |
194 | if (wch < lbound) { | |
9385eb3d A |
195 | /* |
196 | * Malformed input; redundant encoding. | |
197 | */ | |
3d9156a7 A |
198 | errno = EILSEQ; |
199 | return ((size_t)-1); | |
200 | } | |
201 | if (pwc != NULL) | |
202 | *pwc = wch; | |
203 | us->want = 0; | |
204 | return (wch == L'\0' ? 0 : want); | |
9385eb3d A |
205 | } |
206 | ||
1f2f436a | 207 | static size_t |
3d9156a7 A |
208 | _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, |
209 | size_t nms, size_t len, mbstate_t * __restrict ps) | |
9385eb3d | 210 | { |
3d9156a7 A |
211 | _UTF8State *us; |
212 | const char *s; | |
213 | size_t nchr; | |
214 | wchar_t wc; | |
215 | size_t nb; | |
216 | ||
217 | us = (_UTF8State *)ps; | |
218 | ||
219 | s = *src; | |
220 | nchr = 0; | |
221 | ||
222 | if (dst == NULL) { | |
223 | /* | |
224 | * The fast path in the loop below is not safe if an ASCII | |
225 | * character appears as anything but the first byte of a | |
226 | * multibyte sequence. Check now to avoid doing it in the loop. | |
227 | */ | |
228 | if (nms > 0 && us->want > 0 && (signed char)*s > 0) { | |
229 | errno = EILSEQ; | |
230 | return ((size_t)-1); | |
231 | } | |
232 | for (;;) { | |
233 | if (nms > 0 && (signed char)*s > 0) | |
234 | /* | |
235 | * Fast path for plain ASCII characters | |
236 | * excluding NUL. | |
237 | */ | |
238 | nb = 1; | |
239 | else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == | |
240 | (size_t)-1) | |
241 | /* Invalid sequence - mbrtowc() sets errno. */ | |
242 | return ((size_t)-1); | |
243 | else if (nb == 0 || nb == (size_t)-2) | |
244 | return (nchr); | |
245 | s += nb; | |
246 | nms -= nb; | |
247 | nchr++; | |
248 | } | |
249 | /*NOTREACHED*/ | |
250 | } | |
251 | ||
252 | /* | |
253 | * The fast path in the loop below is not safe if an ASCII | |
254 | * character appears as anything but the first byte of a | |
255 | * multibyte sequence. Check now to avoid doing it in the loop. | |
256 | */ | |
257 | if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { | |
258 | errno = EILSEQ; | |
259 | return ((size_t)-1); | |
260 | } | |
261 | while (len-- > 0) { | |
262 | if (nms > 0 && (signed char)*s > 0) { | |
263 | /* | |
264 | * Fast path for plain ASCII characters | |
265 | * excluding NUL. | |
266 | */ | |
267 | *dst = (wchar_t)*s; | |
268 | nb = 1; | |
269 | } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == | |
270 | (size_t)-1) { | |
271 | *src = s; | |
272 | return ((size_t)-1); | |
273 | } else if (nb == (size_t)-2) { | |
274 | *src = s + nms; | |
275 | return (nchr); | |
276 | } else if (nb == 0) { | |
277 | *src = NULL; | |
278 | return (nchr); | |
279 | } | |
280 | s += nb; | |
281 | nms -= nb; | |
282 | nchr++; | |
283 | dst++; | |
284 | } | |
285 | *src = s; | |
286 | return (nchr); | |
287 | } | |
288 | ||
1f2f436a | 289 | static size_t |
3d9156a7 A |
290 | _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) |
291 | { | |
292 | _UTF8State *us; | |
9385eb3d A |
293 | unsigned char lead; |
294 | int i, len; | |
295 | ||
3d9156a7 A |
296 | us = (_UTF8State *)ps; |
297 | ||
298 | if (us->want != 0) { | |
299 | errno = EINVAL; | |
300 | return ((size_t)-1); | |
301 | } | |
302 | ||
303 | if (s == NULL) | |
304 | /* Reset to initial shift state (no-op) */ | |
305 | return (1); | |
306 | ||
307 | if ((wc & ~0x7f) == 0) { | |
308 | /* Fast path for plain ASCII characters. */ | |
309 | *s = (char)wc; | |
310 | return (1); | |
311 | } | |
312 | ||
9385eb3d A |
313 | /* |
314 | * Determine the number of octets needed to represent this character. | |
315 | * We always output the shortest sequence possible. Also specify the | |
316 | * first few bits of the first octet, which contains the information | |
317 | * about the sequence length. | |
318 | */ | |
3d9156a7 | 319 | if ((wc & ~0x7f) == 0) { |
9385eb3d A |
320 | lead = 0; |
321 | len = 1; | |
3d9156a7 | 322 | } else if ((wc & ~0x7ff) == 0) { |
9385eb3d A |
323 | lead = 0xc0; |
324 | len = 2; | |
3d9156a7 | 325 | } else if ((wc & ~0xffff) == 0) { |
9385eb3d A |
326 | lead = 0xe0; |
327 | len = 3; | |
3d9156a7 | 328 | } else if ((wc & ~0x1fffff) == 0) { |
9385eb3d A |
329 | lead = 0xf0; |
330 | len = 4; | |
3d9156a7 | 331 | } else if ((wc & ~0x3ffffff) == 0) { |
9385eb3d A |
332 | lead = 0xf8; |
333 | len = 5; | |
3d9156a7 | 334 | } else if ((wc & ~0x7fffffff) == 0) { |
9385eb3d A |
335 | lead = 0xfc; |
336 | len = 6; | |
337 | } else { | |
3d9156a7 A |
338 | errno = EILSEQ; |
339 | return ((size_t)-1); | |
9385eb3d A |
340 | } |
341 | ||
3d9156a7 A |
342 | /* |
343 | * Output the octets representing the character in chunks | |
344 | * of 6 bits, least significant last. The first octet is | |
345 | * a special case because it contains the sequence length | |
346 | * information. | |
347 | */ | |
348 | for (i = len - 1; i > 0; i--) { | |
349 | s[i] = (wc & 0x3f) | 0x80; | |
350 | wc >>= 6; | |
9385eb3d | 351 | } |
3d9156a7 | 352 | *s = (wc & 0xff) | lead; |
9385eb3d A |
353 | |
354 | return (len); | |
355 | } | |
3d9156a7 | 356 | |
1f2f436a | 357 | static size_t |
3d9156a7 A |
358 | _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, |
359 | size_t nwc, size_t len, mbstate_t * __restrict ps) | |
360 | { | |
361 | _UTF8State *us; | |
362 | char buf[MB_LEN_MAX]; | |
363 | const wchar_t *s; | |
364 | size_t nbytes; | |
365 | size_t nb; | |
366 | ||
367 | us = (_UTF8State *)ps; | |
368 | ||
369 | if (us->want != 0) { | |
370 | errno = EINVAL; | |
371 | return ((size_t)-1); | |
372 | } | |
373 | ||
374 | s = *src; | |
375 | nbytes = 0; | |
376 | ||
377 | if (dst == NULL) { | |
378 | while (nwc-- > 0) { | |
379 | if (0 <= *s && *s < 0x80) | |
380 | /* Fast path for plain ASCII characters. */ | |
381 | nb = 1; | |
382 | else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == | |
383 | (size_t)-1) | |
384 | /* Invalid character - wcrtomb() sets errno. */ | |
385 | return ((size_t)-1); | |
386 | if (*s == L'\0') | |
387 | return (nbytes + nb - 1); | |
388 | s++; | |
389 | nbytes += nb; | |
390 | } | |
391 | return (nbytes); | |
392 | } | |
393 | ||
394 | while (len > 0 && nwc-- > 0) { | |
395 | if (0 <= *s && *s < 0x80) { | |
396 | /* Fast path for plain ASCII characters. */ | |
397 | nb = 1; | |
398 | *dst = *s; | |
399 | } else if (len > (size_t)MB_CUR_MAX) { | |
400 | /* Enough space to translate in-place. */ | |
1f2f436a | 401 | if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { |
3d9156a7 A |
402 | *src = s; |
403 | return ((size_t)-1); | |
404 | } | |
405 | } else { | |
406 | /* | |
407 | * May not be enough space; use temp. buffer. | |
408 | */ | |
1f2f436a | 409 | if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { |
3d9156a7 A |
410 | *src = s; |
411 | return ((size_t)-1); | |
412 | } | |
413 | if (nb > (int)len) | |
414 | /* MB sequence for character won't fit. */ | |
415 | break; | |
416 | memcpy(dst, buf, nb); | |
417 | } | |
418 | if (*s == L'\0') { | |
419 | *src = NULL; | |
420 | return (nbytes + nb - 1); | |
421 | } | |
422 | s++; | |
423 | dst += nb; | |
424 | len -= nb; | |
425 | nbytes += nb; | |
426 | } | |
427 | *src = s; | |
428 | return (nbytes); | |
429 | } |