]>
Commit | Line | Data |
---|---|---|
224c7076 A |
1 | /*- |
2 | * Copyright (c) 2002-2004 Tim J. Robbins | |
3 | * All rights reserved. | |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * | |
14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
24 | * SUCH DAMAGE. | |
25 | */ | |
26 | ||
27 | #include <sys/param.h> | |
28 | __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.11 2004/07/27 06:29:48 tjr Exp $"); | |
29 | ||
30 | #include "xlocale_private.h" | |
31 | ||
32 | #include <errno.h> | |
33 | #include <limits.h> | |
34 | #include <runetype.h> | |
35 | #include <stdlib.h> | |
36 | #include <string.h> | |
37 | #include <wchar.h> | |
38 | #include "mblocal.h" | |
39 | ||
40 | #define UTF8_MB_CUR_MAX 6 | |
41 | ||
42 | static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, | |
43 | mbstate_t * __restrict, locale_t); | |
44 | static int _UTF8_mbsinit(const mbstate_t *, locale_t); | |
45 | static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, const char ** __restrict, | |
46 | size_t, size_t, mbstate_t * __restrict, locale_t); | |
47 | static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict, locale_t); | |
48 | static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, | |
49 | size_t, size_t, mbstate_t * __restrict, locale_t); | |
50 | ||
/*
 * Conversion state kept inside an mbstate_t between calls, so that a
 * character split across several input buffers can be resumed.
 */
typedef struct {
	wchar_t	ch;		/* Bits accumulated so far for the pending character */
	int	want;		/* Octets still needed; 0 means initial state */
	wchar_t	lbound;		/* Smallest code allowed for the pending sequence length */
} _UTF8State;
56 | ||
57 | __private_extern__ int | |
58 | _UTF8_init(struct __xlocale_st_runelocale *xrl) | |
59 | { | |
60 | ||
61 | xrl->__mbrtowc = _UTF8_mbrtowc; | |
62 | xrl->__wcrtomb = _UTF8_wcrtomb; | |
63 | xrl->__mbsinit = _UTF8_mbsinit; | |
64 | xrl->__mbsnrtowcs = _UTF8_mbsnrtowcs; | |
65 | xrl->__wcsnrtombs = _UTF8_wcsnrtombs; | |
66 | xrl->__mb_cur_max = UTF8_MB_CUR_MAX; | |
67 | ||
68 | return (0); | |
69 | } | |
70 | ||
71 | static int | |
72 | _UTF8_mbsinit(const mbstate_t *ps, locale_t loc) | |
73 | { | |
74 | ||
75 | return (ps == NULL || ((const _UTF8State *)ps)->want == 0); | |
76 | } | |
77 | ||
78 | static size_t | |
79 | _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, | |
80 | mbstate_t * __restrict ps, locale_t loc) | |
81 | { | |
82 | _UTF8State *us; | |
83 | int ch, i, mask, want; | |
84 | wchar_t lbound, wch; | |
85 | ||
86 | us = (_UTF8State *)ps; | |
87 | ||
88 | if (us->want < 0 || us->want > 6) { | |
89 | errno = EINVAL; | |
90 | return ((size_t)-1); | |
91 | } | |
92 | ||
93 | if (s == NULL) { | |
94 | s = ""; | |
95 | n = 1; | |
96 | pwc = NULL; | |
97 | } | |
98 | ||
99 | if (n == 0) | |
100 | /* Incomplete multibyte sequence */ | |
101 | return ((size_t)-2); | |
102 | ||
103 | if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { | |
104 | /* Fast path for plain ASCII characters. */ | |
105 | if (pwc != NULL) | |
106 | *pwc = ch; | |
107 | return (ch != '\0' ? 1 : 0); | |
108 | } | |
109 | ||
110 | if (us->want == 0) { | |
111 | /* | |
112 | * Determine the number of octets that make up this character | |
113 | * from the first octet, and a mask that extracts the | |
114 | * interesting bits of the first octet. We already know | |
115 | * the character is at least two bytes long. | |
116 | * | |
117 | * We also specify a lower bound for the character code to | |
118 | * detect redundant, non-"shortest form" encodings. For | |
119 | * example, the sequence C0 80 is _not_ a legal representation | |
120 | * of the null character. This enforces a 1-to-1 mapping | |
121 | * between character codes and their multibyte representations. | |
122 | */ | |
123 | ch = (unsigned char)*s; | |
124 | if ((ch & 0x80) == 0) { | |
125 | mask = 0x7f; | |
126 | want = 1; | |
127 | lbound = 0; | |
128 | } else if ((ch & 0xe0) == 0xc0) { | |
129 | mask = 0x1f; | |
130 | want = 2; | |
131 | lbound = 0x80; | |
132 | } else if ((ch & 0xf0) == 0xe0) { | |
133 | mask = 0x0f; | |
134 | want = 3; | |
135 | lbound = 0x800; | |
136 | } else if ((ch & 0xf8) == 0xf0) { | |
137 | mask = 0x07; | |
138 | want = 4; | |
139 | lbound = 0x10000; | |
140 | } else if ((ch & 0xfc) == 0xf8) { | |
141 | mask = 0x03; | |
142 | want = 5; | |
143 | lbound = 0x200000; | |
144 | } else if ((ch & 0xfc) == 0xfc) { | |
145 | mask = 0x01; | |
146 | want = 6; | |
147 | lbound = 0x4000000; | |
148 | } else { | |
149 | /* | |
150 | * Malformed input; input is not UTF-8. | |
151 | */ | |
152 | errno = EILSEQ; | |
153 | return ((size_t)-1); | |
154 | } | |
155 | } else { | |
156 | want = us->want; | |
157 | lbound = us->lbound; | |
158 | } | |
159 | ||
160 | /* | |
161 | * Decode the octet sequence representing the character in chunks | |
162 | * of 6 bits, most significant first. | |
163 | */ | |
164 | if (us->want == 0) | |
165 | wch = (unsigned char)*s++ & mask; | |
166 | else | |
167 | wch = us->ch; | |
168 | for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { | |
169 | if ((*s & 0xc0) != 0x80) { | |
170 | /* | |
171 | * Malformed input; bad characters in the middle | |
172 | * of a character. | |
173 | */ | |
174 | errno = EILSEQ; | |
175 | return ((size_t)-1); | |
176 | } | |
177 | wch <<= 6; | |
178 | wch |= *s++ & 0x3f; | |
179 | } | |
180 | if (i < want) { | |
181 | /* Incomplete multibyte sequence. */ | |
182 | us->want = want - i; | |
183 | us->lbound = lbound; | |
184 | us->ch = wch; | |
185 | return ((size_t)-2); | |
186 | } | |
187 | if (wch < lbound) { | |
188 | /* | |
189 | * Malformed input; redundant encoding. | |
190 | */ | |
191 | errno = EILSEQ; | |
192 | return ((size_t)-1); | |
193 | } | |
194 | if (pwc != NULL) | |
195 | *pwc = wch; | |
196 | us->want = 0; | |
197 | return (wch == L'\0' ? 0 : want); | |
198 | } | |
199 | ||
200 | static size_t | |
201 | _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, | |
202 | size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc) | |
203 | { | |
204 | _UTF8State *us; | |
205 | const char *s; | |
206 | size_t nchr; | |
207 | wchar_t wc; | |
208 | size_t nb; | |
209 | ||
210 | us = (_UTF8State *)ps; | |
211 | ||
212 | s = *src; | |
213 | nchr = 0; | |
214 | ||
215 | if (dst == NULL) { | |
216 | /* | |
217 | * The fast path in the loop below is not safe if an ASCII | |
218 | * character appears as anything but the first byte of a | |
219 | * multibyte sequence. Check now to avoid doing it in the loop. | |
220 | */ | |
221 | if (nms > 0 && us->want > 0 && (signed char)*s > 0) { | |
222 | errno = EILSEQ; | |
223 | return ((size_t)-1); | |
224 | } | |
225 | for (;;) { | |
226 | if (nms > 0 && (signed char)*s > 0) | |
227 | /* | |
228 | * Fast path for plain ASCII characters | |
229 | * excluding NUL. | |
230 | */ | |
231 | nb = 1; | |
232 | else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps, loc)) == | |
233 | (size_t)-1) | |
234 | /* Invalid sequence - mbrtowc() sets errno. */ | |
235 | return ((size_t)-1); | |
236 | else if (nb == 0 || nb == (size_t)-2) | |
237 | return (nchr); | |
238 | s += nb; | |
239 | nms -= nb; | |
240 | nchr++; | |
241 | } | |
242 | /*NOTREACHED*/ | |
243 | } | |
244 | ||
245 | /* | |
246 | * The fast path in the loop below is not safe if an ASCII | |
247 | * character appears as anything but the first byte of a | |
248 | * multibyte sequence. Check now to avoid doing it in the loop. | |
249 | */ | |
250 | if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { | |
251 | errno = EILSEQ; | |
252 | return ((size_t)-1); | |
253 | } | |
254 | while (len-- > 0) { | |
255 | if (nms > 0 && (signed char)*s > 0) { | |
256 | /* | |
257 | * Fast path for plain ASCII characters | |
258 | * excluding NUL. | |
259 | */ | |
260 | *dst = (wchar_t)*s; | |
261 | nb = 1; | |
262 | } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, loc)) == | |
263 | (size_t)-1) { | |
264 | *src = s; | |
265 | return ((size_t)-1); | |
266 | } else if (nb == (size_t)-2) { | |
267 | *src = s + nms; | |
268 | return (nchr); | |
269 | } else if (nb == 0) { | |
270 | *src = NULL; | |
271 | return (nchr); | |
272 | } | |
273 | s += nb; | |
274 | nms -= nb; | |
275 | nchr++; | |
276 | dst++; | |
277 | } | |
278 | *src = s; | |
279 | return (nchr); | |
280 | } | |
281 | ||
282 | static size_t | |
283 | _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc) | |
284 | { | |
285 | _UTF8State *us; | |
286 | unsigned char lead; | |
287 | int i, len; | |
288 | ||
289 | us = (_UTF8State *)ps; | |
290 | ||
291 | if (us->want != 0) { | |
292 | errno = EINVAL; | |
293 | return ((size_t)-1); | |
294 | } | |
295 | ||
296 | if (s == NULL) | |
297 | /* Reset to initial shift state (no-op) */ | |
298 | return (1); | |
299 | ||
300 | if ((wc & ~0x7f) == 0) { | |
301 | /* Fast path for plain ASCII characters. */ | |
302 | *s = (char)wc; | |
303 | return (1); | |
304 | } | |
305 | ||
306 | /* | |
307 | * Determine the number of octets needed to represent this character. | |
308 | * We always output the shortest sequence possible. Also specify the | |
309 | * first few bits of the first octet, which contains the information | |
310 | * about the sequence length. | |
311 | */ | |
312 | if ((wc & ~0x7f) == 0) { | |
313 | lead = 0; | |
314 | len = 1; | |
315 | } else if ((wc & ~0x7ff) == 0) { | |
316 | lead = 0xc0; | |
317 | len = 2; | |
318 | } else if ((wc & ~0xffff) == 0) { | |
319 | lead = 0xe0; | |
320 | len = 3; | |
321 | } else if ((wc & ~0x1fffff) == 0) { | |
322 | lead = 0xf0; | |
323 | len = 4; | |
324 | } else if ((wc & ~0x3ffffff) == 0) { | |
325 | lead = 0xf8; | |
326 | len = 5; | |
327 | } else if ((wc & ~0x7fffffff) == 0) { | |
328 | lead = 0xfc; | |
329 | len = 6; | |
330 | } else { | |
331 | errno = EILSEQ; | |
332 | return ((size_t)-1); | |
333 | } | |
334 | ||
335 | /* | |
336 | * Output the octets representing the character in chunks | |
337 | * of 6 bits, least significant last. The first octet is | |
338 | * a special case because it contains the sequence length | |
339 | * information. | |
340 | */ | |
341 | for (i = len - 1; i > 0; i--) { | |
342 | s[i] = (wc & 0x3f) | 0x80; | |
343 | wc >>= 6; | |
344 | } | |
345 | *s = (wc & 0xff) | lead; | |
346 | ||
347 | return (len); | |
348 | } | |
349 | ||
350 | static size_t | |
351 | _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, | |
352 | size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc) | |
353 | { | |
354 | _UTF8State *us; | |
355 | char buf[MB_LEN_MAX]; | |
356 | const wchar_t *s; | |
357 | size_t nbytes; | |
358 | size_t nb; | |
359 | ||
360 | us = (_UTF8State *)ps; | |
361 | ||
362 | if (us->want != 0) { | |
363 | errno = EINVAL; | |
364 | return ((size_t)-1); | |
365 | } | |
366 | ||
367 | s = *src; | |
368 | nbytes = 0; | |
369 | ||
370 | if (dst == NULL) { | |
371 | while (nwc-- > 0) { | |
372 | if (0 <= *s && *s < 0x80) | |
373 | /* Fast path for plain ASCII characters. */ | |
374 | nb = 1; | |
375 | else if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == | |
376 | (size_t)-1) | |
377 | /* Invalid character - wcrtomb() sets errno. */ | |
378 | return ((size_t)-1); | |
379 | if (*s == L'\0') | |
380 | return (nbytes + nb - 1); | |
381 | s++; | |
382 | nbytes += nb; | |
383 | } | |
384 | return (nbytes); | |
385 | } | |
386 | ||
387 | while (len > 0 && nwc-- > 0) { | |
388 | if (0 <= *s && *s < 0x80) { | |
389 | /* Fast path for plain ASCII characters. */ | |
390 | nb = 1; | |
391 | *dst = *s; | |
392 | } else if (len > (size_t)UTF8_MB_CUR_MAX) { | |
393 | /* Enough space to translate in-place. */ | |
394 | if ((nb = (int)_UTF8_wcrtomb(dst, *s, ps, loc)) < 0) { | |
395 | *src = s; | |
396 | return ((size_t)-1); | |
397 | } | |
398 | } else { | |
399 | /* | |
400 | * May not be enough space; use temp. buffer. | |
401 | */ | |
402 | if ((nb = (int)_UTF8_wcrtomb(buf, *s, ps, loc)) < 0) { | |
403 | *src = s; | |
404 | return ((size_t)-1); | |
405 | } | |
406 | if (nb > (int)len) | |
407 | /* MB sequence for character won't fit. */ | |
408 | break; | |
409 | memcpy(dst, buf, nb); | |
410 | } | |
411 | if (*s == L'\0') { | |
412 | *src = NULL; | |
413 | return (nbytes + nb - 1); | |
414 | } | |
415 | s++; | |
416 | dst += nb; | |
417 | len -= nb; | |
418 | nbytes += nb; | |
419 | } | |
420 | *src = s; | |
421 | return (nbytes); | |
422 | } |