]> git.saurik.com Git - apple/libc.git/blob - locale/FreeBSD/utf8.c
Libc-391.tar.gz
[apple/libc.git] / locale / FreeBSD / utf8.c
1 /*-
2 * Copyright (c) 2002-2004 Tim J. Robbins
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/param.h>
28 __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.11 2004/07/27 06:29:48 tjr Exp $");
29
30 #include <errno.h>
31 #include <limits.h>
32 #include <runetype.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <wchar.h>
36 #include "mblocal.h"
37
/*
 * Prototypes for the UTF-8 conversion routines defined later in this
 * file.  _UTF8_init() refers to them before their definitions appear.
 */
38 size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
39 mbstate_t * __restrict);
40 int _UTF8_mbsinit(const mbstate_t *);
41 size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, const char ** __restrict,
42 size_t, size_t, mbstate_t * __restrict);
43 size_t _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
44 size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
45 size_t, size_t, mbstate_t * __restrict);
46
/*
 * Decoder state carried between calls.  The routines in this file access
 * it by casting the caller's mbstate_t pointer to this type.
 */
47 typedef struct {
48 wchar_t ch; /* bits of the character accumulated so far */
49 int want; /* continuation octets still expected; 0 means initial state */
50 wchar_t lbound; /* smallest value the finished character may have */
51 } _UTF8State;
52
53 int
54 _UTF8_init(_RuneLocale *rl)
55 {
56
57 __mbrtowc = _UTF8_mbrtowc;
58 __wcrtomb = _UTF8_wcrtomb;
59 __mbsinit = _UTF8_mbsinit;
60 __mbsnrtowcs = _UTF8_mbsnrtowcs;
61 __wcsnrtombs = _UTF8_wcsnrtombs;
62 _CurrentRuneLocale = rl;
63 __mb_cur_max = 6;
64
65 return (0);
66 }
67
68 int
69 _UTF8_mbsinit(const mbstate_t *ps)
70 {
71
72 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
73 }
74
75 size_t
76 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
77 mbstate_t * __restrict ps)
78 {
79 _UTF8State *us;
80 int ch, i, mask, want;
81 wchar_t lbound, wch;
82
83 us = (_UTF8State *)ps;
84
85 if (us->want < 0 || us->want > 6) {
86 errno = EINVAL;
87 return ((size_t)-1);
88 }
89
90 if (s == NULL) {
91 s = "";
92 n = 1;
93 pwc = NULL;
94 }
95
96 if (n == 0)
97 /* Incomplete multibyte sequence */
98 return ((size_t)-2);
99
100 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
101 /* Fast path for plain ASCII characters. */
102 if (pwc != NULL)
103 *pwc = ch;
104 return (ch != '\0' ? 1 : 0);
105 }
106
107 if (us->want == 0) {
108 /*
109 * Determine the number of octets that make up this character
110 * from the first octet, and a mask that extracts the
111 * interesting bits of the first octet. We already know
112 * the character is at least two bytes long.
113 *
114 * We also specify a lower bound for the character code to
115 * detect redundant, non-"shortest form" encodings. For
116 * example, the sequence C0 80 is _not_ a legal representation
117 * of the null character. This enforces a 1-to-1 mapping
118 * between character codes and their multibyte representations.
119 */
120 ch = (unsigned char)*s;
121 if ((ch & 0x80) == 0) {
122 mask = 0x7f;
123 want = 1;
124 lbound = 0;
125 } else if ((ch & 0xe0) == 0xc0) {
126 mask = 0x1f;
127 want = 2;
128 lbound = 0x80;
129 } else if ((ch & 0xf0) == 0xe0) {
130 mask = 0x0f;
131 want = 3;
132 lbound = 0x800;
133 } else if ((ch & 0xf8) == 0xf0) {
134 mask = 0x07;
135 want = 4;
136 lbound = 0x10000;
137 } else if ((ch & 0xfc) == 0xf8) {
138 mask = 0x03;
139 want = 5;
140 lbound = 0x200000;
141 } else if ((ch & 0xfc) == 0xfc) {
142 mask = 0x01;
143 want = 6;
144 lbound = 0x4000000;
145 } else {
146 /*
147 * Malformed input; input is not UTF-8.
148 */
149 errno = EILSEQ;
150 return ((size_t)-1);
151 }
152 } else {
153 want = us->want;
154 lbound = us->lbound;
155 }
156
157 /*
158 * Decode the octet sequence representing the character in chunks
159 * of 6 bits, most significant first.
160 */
161 if (us->want == 0)
162 wch = (unsigned char)*s++ & mask;
163 else
164 wch = us->ch;
165 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
166 if ((*s & 0xc0) != 0x80) {
167 /*
168 * Malformed input; bad characters in the middle
169 * of a character.
170 */
171 errno = EILSEQ;
172 return ((size_t)-1);
173 }
174 wch <<= 6;
175 wch |= *s++ & 0x3f;
176 }
177 if (i < want) {
178 /* Incomplete multibyte sequence. */
179 us->want = want - i;
180 us->lbound = lbound;
181 us->ch = wch;
182 return ((size_t)-2);
183 }
184 if (wch < lbound) {
185 /*
186 * Malformed input; redundant encoding.
187 */
188 errno = EILSEQ;
189 return ((size_t)-1);
190 }
191 if (pwc != NULL)
192 *pwc = wch;
193 us->want = 0;
194 return (wch == L'\0' ? 0 : want);
195 }
196
197 size_t
198 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
199 size_t nms, size_t len, mbstate_t * __restrict ps)
200 {
201 _UTF8State *us;
202 const char *s;
203 size_t nchr;
204 wchar_t wc;
205 size_t nb;
206
207 us = (_UTF8State *)ps;
208
209 s = *src;
210 nchr = 0;
211
212 if (dst == NULL) {
213 /*
214 * The fast path in the loop below is not safe if an ASCII
215 * character appears as anything but the first byte of a
216 * multibyte sequence. Check now to avoid doing it in the loop.
217 */
218 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
219 errno = EILSEQ;
220 return ((size_t)-1);
221 }
222 for (;;) {
223 if (nms > 0 && (signed char)*s > 0)
224 /*
225 * Fast path for plain ASCII characters
226 * excluding NUL.
227 */
228 nb = 1;
229 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
230 (size_t)-1)
231 /* Invalid sequence - mbrtowc() sets errno. */
232 return ((size_t)-1);
233 else if (nb == 0 || nb == (size_t)-2)
234 return (nchr);
235 s += nb;
236 nms -= nb;
237 nchr++;
238 }
239 /*NOTREACHED*/
240 }
241
242 /*
243 * The fast path in the loop below is not safe if an ASCII
244 * character appears as anything but the first byte of a
245 * multibyte sequence. Check now to avoid doing it in the loop.
246 */
247 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
248 errno = EILSEQ;
249 return ((size_t)-1);
250 }
251 while (len-- > 0) {
252 if (nms > 0 && (signed char)*s > 0) {
253 /*
254 * Fast path for plain ASCII characters
255 * excluding NUL.
256 */
257 *dst = (wchar_t)*s;
258 nb = 1;
259 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
260 (size_t)-1) {
261 *src = s;
262 return ((size_t)-1);
263 } else if (nb == (size_t)-2) {
264 *src = s + nms;
265 return (nchr);
266 } else if (nb == 0) {
267 *src = NULL;
268 return (nchr);
269 }
270 s += nb;
271 nms -= nb;
272 nchr++;
273 dst++;
274 }
275 *src = s;
276 return (nchr);
277 }
278
279 size_t
280 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
281 {
282 _UTF8State *us;
283 unsigned char lead;
284 int i, len;
285
286 us = (_UTF8State *)ps;
287
288 if (us->want != 0) {
289 errno = EINVAL;
290 return ((size_t)-1);
291 }
292
293 if (s == NULL)
294 /* Reset to initial shift state (no-op) */
295 return (1);
296
297 if ((wc & ~0x7f) == 0) {
298 /* Fast path for plain ASCII characters. */
299 *s = (char)wc;
300 return (1);
301 }
302
303 /*
304 * Determine the number of octets needed to represent this character.
305 * We always output the shortest sequence possible. Also specify the
306 * first few bits of the first octet, which contains the information
307 * about the sequence length.
308 */
309 if ((wc & ~0x7f) == 0) {
310 lead = 0;
311 len = 1;
312 } else if ((wc & ~0x7ff) == 0) {
313 lead = 0xc0;
314 len = 2;
315 } else if ((wc & ~0xffff) == 0) {
316 lead = 0xe0;
317 len = 3;
318 } else if ((wc & ~0x1fffff) == 0) {
319 lead = 0xf0;
320 len = 4;
321 } else if ((wc & ~0x3ffffff) == 0) {
322 lead = 0xf8;
323 len = 5;
324 } else if ((wc & ~0x7fffffff) == 0) {
325 lead = 0xfc;
326 len = 6;
327 } else {
328 errno = EILSEQ;
329 return ((size_t)-1);
330 }
331
332 /*
333 * Output the octets representing the character in chunks
334 * of 6 bits, least significant last. The first octet is
335 * a special case because it contains the sequence length
336 * information.
337 */
338 for (i = len - 1; i > 0; i--) {
339 s[i] = (wc & 0x3f) | 0x80;
340 wc >>= 6;
341 }
342 *s = (wc & 0xff) | lead;
343
344 return (len);
345 }
346
347 size_t
348 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
349 size_t nwc, size_t len, mbstate_t * __restrict ps)
350 {
351 _UTF8State *us;
352 char buf[MB_LEN_MAX];
353 const wchar_t *s;
354 size_t nbytes;
355 size_t nb;
356
357 us = (_UTF8State *)ps;
358
359 if (us->want != 0) {
360 errno = EINVAL;
361 return ((size_t)-1);
362 }
363
364 s = *src;
365 nbytes = 0;
366
367 if (dst == NULL) {
368 while (nwc-- > 0) {
369 if (0 <= *s && *s < 0x80)
370 /* Fast path for plain ASCII characters. */
371 nb = 1;
372 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
373 (size_t)-1)
374 /* Invalid character - wcrtomb() sets errno. */
375 return ((size_t)-1);
376 if (*s == L'\0')
377 return (nbytes + nb - 1);
378 s++;
379 nbytes += nb;
380 }
381 return (nbytes);
382 }
383
384 while (len > 0 && nwc-- > 0) {
385 if (0 <= *s && *s < 0x80) {
386 /* Fast path for plain ASCII characters. */
387 nb = 1;
388 *dst = *s;
389 } else if (len > (size_t)MB_CUR_MAX) {
390 /* Enough space to translate in-place. */
391 if ((nb = (int)_UTF8_wcrtomb(dst, *s, ps)) < 0) {
392 *src = s;
393 return ((size_t)-1);
394 }
395 } else {
396 /*
397 * May not be enough space; use temp. buffer.
398 */
399 if ((nb = (int)_UTF8_wcrtomb(buf, *s, ps)) < 0) {
400 *src = s;
401 return ((size_t)-1);
402 }
403 if (nb > (int)len)
404 /* MB sequence for character won't fit. */
405 break;
406 memcpy(dst, buf, nb);
407 }
408 if (*s == L'\0') {
409 *src = NULL;
410 return (nbytes + nb - 1);
411 }
412 s++;
413 dst += nb;
414 len -= nb;
415 nbytes += nb;
416 }
417 *src = s;
418 return (nbytes);
419 }