]> git.saurik.com Git - apple/libc.git/blame - locale/FreeBSD/utf8.c
Libc-763.13.tar.gz
[apple/libc.git] / locale / FreeBSD / utf8.c
CommitLineData
9385eb3d 1/*-
3d9156a7 2 * Copyright (c) 2002-2004 Tim J. Robbins
9385eb3d
A
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
3d9156a7 27#include <sys/param.h>
1f2f436a 28__FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $");
9385eb3d 29
3d9156a7
A
30#include <errno.h>
31#include <limits.h>
32#include <runetype.h>
9385eb3d 33#include <stdlib.h>
3d9156a7
A
34#include <string.h>
35#include <wchar.h>
36#include "mblocal.h"
9385eb3d 37
1f2f436a
A
38extern int __mb_sb_limit;
39
/*
 * Forward declarations of the UTF-8 conversion routines defined later in
 * this file; _UTF8_init() stores their addresses in the locale's
 * conversion hooks.
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static int	_UTF8_mbsinit(const mbstate_t *ps);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
3d9156a7
A
50
/*
 * Conversion state carried between calls.  The routines in this file
 * access it by casting the caller's mbstate_t pointer to this type.
 */
typedef struct {
	wchar_t	ch;	/* Bits of the character decoded so far. */
	int	want;	/* Octets still expected; 0 means initial state. */
	wchar_t	lbound;	/* Smallest value legal for this sequence length. */
} _UTF8State;
9385eb3d
A
56
57int
58_UTF8_init(_RuneLocale *rl)
59{
60
3d9156a7
A
61 __mbrtowc = _UTF8_mbrtowc;
62 __wcrtomb = _UTF8_wcrtomb;
63 __mbsinit = _UTF8_mbsinit;
64 __mbsnrtowcs = _UTF8_mbsnrtowcs;
65 __wcsnrtombs = _UTF8_wcsnrtombs;
9385eb3d
A
66 _CurrentRuneLocale = rl;
67 __mb_cur_max = 6;
1f2f436a
A
68 /*
69 * UCS-4 encoding used as the internal representation, so
70 * slots 0x0080-0x00FF are occuped and must be excluded
71 * from the single byte ctype by setting the limit.
72 */
73 __mb_sb_limit = 128;
9385eb3d
A
74
75 return (0);
76}
77
1f2f436a 78static int
3d9156a7
A
79_UTF8_mbsinit(const mbstate_t *ps)
80{
81
82 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
83}
84
1f2f436a 85static size_t
3d9156a7
A
86_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
87 mbstate_t * __restrict ps)
9385eb3d 88{
3d9156a7
A
89 _UTF8State *us;
90 int ch, i, mask, want;
91 wchar_t lbound, wch;
92
93 us = (_UTF8State *)ps;
9385eb3d 94
3d9156a7
A
95 if (us->want < 0 || us->want > 6) {
96 errno = EINVAL;
97 return ((size_t)-1);
9385eb3d
A
98 }
99
3d9156a7
A
100 if (s == NULL) {
101 s = "";
102 n = 1;
103 pwc = NULL;
9385eb3d
A
104 }
105
3d9156a7
A
106 if (n == 0)
107 /* Incomplete multibyte sequence */
108 return ((size_t)-2);
109
110 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
111 /* Fast path for plain ASCII characters. */
112 if (pwc != NULL)
113 *pwc = ch;
114 return (ch != '\0' ? 1 : 0);
115 }
116
117 if (us->want == 0) {
9385eb3d 118 /*
3d9156a7
A
119 * Determine the number of octets that make up this character
120 * from the first octet, and a mask that extracts the
121 * interesting bits of the first octet. We already know
122 * the character is at least two bytes long.
123 *
124 * We also specify a lower bound for the character code to
125 * detect redundant, non-"shortest form" encodings. For
126 * example, the sequence C0 80 is _not_ a legal representation
127 * of the null character. This enforces a 1-to-1 mapping
128 * between character codes and their multibyte representations.
9385eb3d 129 */
3d9156a7
A
130 ch = (unsigned char)*s;
131 if ((ch & 0x80) == 0) {
132 mask = 0x7f;
133 want = 1;
134 lbound = 0;
135 } else if ((ch & 0xe0) == 0xc0) {
136 mask = 0x1f;
137 want = 2;
138 lbound = 0x80;
139 } else if ((ch & 0xf0) == 0xe0) {
140 mask = 0x0f;
141 want = 3;
142 lbound = 0x800;
143 } else if ((ch & 0xf8) == 0xf0) {
144 mask = 0x07;
145 want = 4;
146 lbound = 0x10000;
147 } else if ((ch & 0xfc) == 0xf8) {
148 mask = 0x03;
149 want = 5;
150 lbound = 0x200000;
1f2f436a 151 } else if ((ch & 0xfe) == 0xfc) {
3d9156a7
A
152 mask = 0x01;
153 want = 6;
154 lbound = 0x4000000;
155 } else {
156 /*
157 * Malformed input; input is not UTF-8.
158 */
159 errno = EILSEQ;
160 return ((size_t)-1);
161 }
162 } else {
163 want = us->want;
164 lbound = us->lbound;
9385eb3d
A
165 }
166
167 /*
168 * Decode the octet sequence representing the character in chunks
169 * of 6 bits, most significant first.
170 */
3d9156a7
A
171 if (us->want == 0)
172 wch = (unsigned char)*s++ & mask;
173 else
174 wch = us->ch;
175 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
176 if ((*s & 0xc0) != 0x80) {
9385eb3d
A
177 /*
178 * Malformed input; bad characters in the middle
179 * of a character.
180 */
3d9156a7
A
181 errno = EILSEQ;
182 return ((size_t)-1);
9385eb3d
A
183 }
184 wch <<= 6;
3d9156a7 185 wch |= *s++ & 0x3f;
9385eb3d 186 }
3d9156a7
A
187 if (i < want) {
188 /* Incomplete multibyte sequence. */
189 us->want = want - i;
190 us->lbound = lbound;
191 us->ch = wch;
192 return ((size_t)-2);
193 }
194 if (wch < lbound) {
9385eb3d
A
195 /*
196 * Malformed input; redundant encoding.
197 */
3d9156a7
A
198 errno = EILSEQ;
199 return ((size_t)-1);
200 }
201 if (pwc != NULL)
202 *pwc = wch;
203 us->want = 0;
204 return (wch == L'\0' ? 0 : want);
9385eb3d
A
205}
206
1f2f436a 207static size_t
3d9156a7
A
208_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
209 size_t nms, size_t len, mbstate_t * __restrict ps)
9385eb3d 210{
3d9156a7
A
211 _UTF8State *us;
212 const char *s;
213 size_t nchr;
214 wchar_t wc;
215 size_t nb;
216
217 us = (_UTF8State *)ps;
218
219 s = *src;
220 nchr = 0;
221
222 if (dst == NULL) {
223 /*
224 * The fast path in the loop below is not safe if an ASCII
225 * character appears as anything but the first byte of a
226 * multibyte sequence. Check now to avoid doing it in the loop.
227 */
228 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
229 errno = EILSEQ;
230 return ((size_t)-1);
231 }
232 for (;;) {
233 if (nms > 0 && (signed char)*s > 0)
234 /*
235 * Fast path for plain ASCII characters
236 * excluding NUL.
237 */
238 nb = 1;
239 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
240 (size_t)-1)
241 /* Invalid sequence - mbrtowc() sets errno. */
242 return ((size_t)-1);
243 else if (nb == 0 || nb == (size_t)-2)
244 return (nchr);
245 s += nb;
246 nms -= nb;
247 nchr++;
248 }
249 /*NOTREACHED*/
250 }
251
252 /*
253 * The fast path in the loop below is not safe if an ASCII
254 * character appears as anything but the first byte of a
255 * multibyte sequence. Check now to avoid doing it in the loop.
256 */
257 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
258 errno = EILSEQ;
259 return ((size_t)-1);
260 }
261 while (len-- > 0) {
262 if (nms > 0 && (signed char)*s > 0) {
263 /*
264 * Fast path for plain ASCII characters
265 * excluding NUL.
266 */
267 *dst = (wchar_t)*s;
268 nb = 1;
269 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
270 (size_t)-1) {
271 *src = s;
272 return ((size_t)-1);
273 } else if (nb == (size_t)-2) {
274 *src = s + nms;
275 return (nchr);
276 } else if (nb == 0) {
277 *src = NULL;
278 return (nchr);
279 }
280 s += nb;
281 nms -= nb;
282 nchr++;
283 dst++;
284 }
285 *src = s;
286 return (nchr);
287}
288
1f2f436a 289static size_t
3d9156a7
A
290_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
291{
292 _UTF8State *us;
9385eb3d
A
293 unsigned char lead;
294 int i, len;
295
3d9156a7
A
296 us = (_UTF8State *)ps;
297
298 if (us->want != 0) {
299 errno = EINVAL;
300 return ((size_t)-1);
301 }
302
303 if (s == NULL)
304 /* Reset to initial shift state (no-op) */
305 return (1);
306
307 if ((wc & ~0x7f) == 0) {
308 /* Fast path for plain ASCII characters. */
309 *s = (char)wc;
310 return (1);
311 }
312
9385eb3d
A
313 /*
314 * Determine the number of octets needed to represent this character.
315 * We always output the shortest sequence possible. Also specify the
316 * first few bits of the first octet, which contains the information
317 * about the sequence length.
318 */
3d9156a7 319 if ((wc & ~0x7f) == 0) {
9385eb3d
A
320 lead = 0;
321 len = 1;
3d9156a7 322 } else if ((wc & ~0x7ff) == 0) {
9385eb3d
A
323 lead = 0xc0;
324 len = 2;
3d9156a7 325 } else if ((wc & ~0xffff) == 0) {
9385eb3d
A
326 lead = 0xe0;
327 len = 3;
3d9156a7 328 } else if ((wc & ~0x1fffff) == 0) {
9385eb3d
A
329 lead = 0xf0;
330 len = 4;
3d9156a7 331 } else if ((wc & ~0x3ffffff) == 0) {
9385eb3d
A
332 lead = 0xf8;
333 len = 5;
3d9156a7 334 } else if ((wc & ~0x7fffffff) == 0) {
9385eb3d
A
335 lead = 0xfc;
336 len = 6;
337 } else {
3d9156a7
A
338 errno = EILSEQ;
339 return ((size_t)-1);
9385eb3d
A
340 }
341
3d9156a7
A
342 /*
343 * Output the octets representing the character in chunks
344 * of 6 bits, least significant last. The first octet is
345 * a special case because it contains the sequence length
346 * information.
347 */
348 for (i = len - 1; i > 0; i--) {
349 s[i] = (wc & 0x3f) | 0x80;
350 wc >>= 6;
9385eb3d 351 }
3d9156a7 352 *s = (wc & 0xff) | lead;
9385eb3d
A
353
354 return (len);
355}
3d9156a7 356
1f2f436a 357static size_t
3d9156a7
A
358_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
359 size_t nwc, size_t len, mbstate_t * __restrict ps)
360{
361 _UTF8State *us;
362 char buf[MB_LEN_MAX];
363 const wchar_t *s;
364 size_t nbytes;
365 size_t nb;
366
367 us = (_UTF8State *)ps;
368
369 if (us->want != 0) {
370 errno = EINVAL;
371 return ((size_t)-1);
372 }
373
374 s = *src;
375 nbytes = 0;
376
377 if (dst == NULL) {
378 while (nwc-- > 0) {
379 if (0 <= *s && *s < 0x80)
380 /* Fast path for plain ASCII characters. */
381 nb = 1;
382 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
383 (size_t)-1)
384 /* Invalid character - wcrtomb() sets errno. */
385 return ((size_t)-1);
386 if (*s == L'\0')
387 return (nbytes + nb - 1);
388 s++;
389 nbytes += nb;
390 }
391 return (nbytes);
392 }
393
394 while (len > 0 && nwc-- > 0) {
395 if (0 <= *s && *s < 0x80) {
396 /* Fast path for plain ASCII characters. */
397 nb = 1;
398 *dst = *s;
399 } else if (len > (size_t)MB_CUR_MAX) {
400 /* Enough space to translate in-place. */
1f2f436a 401 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
3d9156a7
A
402 *src = s;
403 return ((size_t)-1);
404 }
405 } else {
406 /*
407 * May not be enough space; use temp. buffer.
408 */
1f2f436a 409 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
3d9156a7
A
410 *src = s;
411 return ((size_t)-1);
412 }
413 if (nb > (int)len)
414 /* MB sequence for character won't fit. */
415 break;
416 memcpy(dst, buf, nb);
417 }
418 if (*s == L'\0') {
419 *src = NULL;
420 return (nbytes + nb - 1);
421 }
422 s++;
423 dst += nb;
424 len -= nb;
425 nbytes += nb;
426 }
427 *src = s;
428 return (nbytes);
429}