]> git.saurik.com Git - apple/libc.git/blob - locale/utf2-fbsd.c
Libc-594.9.5.tar.gz
[apple/libc.git] / locale / utf2-fbsd.c
1 /*-
2 * Copyright (c) 2002-2004 Tim J. Robbins
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/param.h>
28 /* dumb down UTF-8 to do UTF2 */
29 __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.11 2004/07/27 06:29:48 tjr Exp $");
30
31 #include "xlocale_private.h"
32
33 #include <errno.h>
34 #include <limits.h>
35 #include <runetype.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <wchar.h>
39 #include "mblocal.h"
40
41 #define UTF2_MB_CUR_MAX 3
42
43 static size_t _UTF2_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
44 mbstate_t * __restrict, locale_t);
45 static int _UTF2_mbsinit(const mbstate_t *, locale_t);
46 static size_t _UTF2_mbsnrtowcs(wchar_t * __restrict, const char ** __restrict,
47 size_t, size_t, mbstate_t * __restrict, locale_t);
48 static size_t _UTF2_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict, locale_t);
49 static size_t _UTF2_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
50 size_t, size_t, mbstate_t * __restrict, locale_t);
51
/*
 * Conversion state carried between calls, overlaid on the caller's
 * mbstate_t by the functions in this file.
 */
typedef struct {
	wchar_t	ch;		/* bits accumulated so far for a partial character */
	int	want;		/* continuation octets still expected; 0 = initial state */
	wchar_t	lbound;		/* smallest code legal for the sequence in progress */
} _UTF2State;
57
58 __private_extern__ int
59 _UTF2_init(struct __xlocale_st_runelocale *xrl)
60 {
61
62 xrl->__mbrtowc = _UTF2_mbrtowc;
63 xrl->__wcrtomb = _UTF2_wcrtomb;
64 xrl->__mbsinit = _UTF2_mbsinit;
65 xrl->__mbsnrtowcs = _UTF2_mbsnrtowcs;
66 xrl->__wcsnrtombs = _UTF2_wcsnrtombs;
67 xrl->__mb_cur_max = UTF2_MB_CUR_MAX;
68
69 return (0);
70 }
71
72 static int
73 _UTF2_mbsinit(const mbstate_t *ps, locale_t loc)
74 {
75
76 return (ps == NULL || ((const _UTF2State *)ps)->want == 0);
77 }
78
79 static size_t
80 _UTF2_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
81 mbstate_t * __restrict ps, locale_t loc)
82 {
83 _UTF2State *us;
84 int ch, i, mask, want;
85 wchar_t lbound, wch;
86
87 us = (_UTF2State *)ps;
88
89 if (us->want < 0 || us->want > 3) {
90 errno = EINVAL;
91 return ((size_t)-1);
92 }
93
94 if (s == NULL) {
95 s = "";
96 n = 1;
97 pwc = NULL;
98 }
99
100 if (n == 0)
101 /* Incomplete multibyte sequence */
102 return ((size_t)-2);
103
104 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
105 /* Fast path for plain ASCII characters. */
106 if (pwc != NULL)
107 *pwc = ch;
108 return (ch != '\0' ? 1 : 0);
109 }
110
111 if (us->want == 0) {
112 /*
113 * Determine the number of octets that make up this character
114 * from the first octet, and a mask that extracts the
115 * interesting bits of the first octet. We already know
116 * the character is at least two bytes long.
117 *
118 * We also specify a lower bound for the character code to
119 * detect redundant, non-"shortest form" encodings. For
120 * example, the sequence C0 80 is _not_ a legal representation
121 * of the null character. This enforces a 1-to-1 mapping
122 * between character codes and their multibyte representations.
123 */
124 ch = (unsigned char)*s;
125 if ((ch & 0x80) == 0) {
126 mask = 0x7f;
127 want = 1;
128 lbound = 0;
129 } else if ((ch & 0xe0) == 0xc0) {
130 mask = 0x1f;
131 want = 2;
132 lbound = 0x80;
133 } else if ((ch & 0xf0) == 0xe0) {
134 mask = 0x0f;
135 want = 3;
136 lbound = 0x800;
137 } else {
138 /*
139 * Malformed input; input is not UTF2.
140 */
141 errno = EILSEQ;
142 return ((size_t)-1);
143 }
144 } else {
145 want = us->want;
146 lbound = us->lbound;
147 }
148
149 /*
150 * Decode the octet sequence representing the character in chunks
151 * of 6 bits, most significant first.
152 */
153 if (us->want == 0)
154 wch = (unsigned char)*s++ & mask;
155 else
156 wch = us->ch;
157 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
158 if ((*s & 0xc0) != 0x80) {
159 /*
160 * Malformed input; bad characters in the middle
161 * of a character.
162 */
163 errno = EILSEQ;
164 return ((size_t)-1);
165 }
166 wch <<= 6;
167 wch |= *s++ & 0x3f;
168 }
169 if (i < want) {
170 /* Incomplete multibyte sequence. */
171 us->want = want - i;
172 us->lbound = lbound;
173 us->ch = wch;
174 return ((size_t)-2);
175 }
176 if (wch < lbound) {
177 /*
178 * Malformed input; redundant encoding.
179 */
180 errno = EILSEQ;
181 return ((size_t)-1);
182 }
183 if (pwc != NULL)
184 *pwc = wch;
185 us->want = 0;
186 return (wch == L'\0' ? 0 : want);
187 }
188
189 static size_t
190 _UTF2_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
191 size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc)
192 {
193 _UTF2State *us;
194 const char *s;
195 size_t nchr;
196 wchar_t wc;
197 size_t nb;
198
199 us = (_UTF2State *)ps;
200
201 s = *src;
202 nchr = 0;
203
204 if (dst == NULL) {
205 /*
206 * The fast path in the loop below is not safe if an ASCII
207 * character appears as anything but the first byte of a
208 * multibyte sequence. Check now to avoid doing it in the loop.
209 */
210 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
211 errno = EILSEQ;
212 return ((size_t)-1);
213 }
214 for (;;) {
215 if (nms > 0 && (signed char)*s > 0)
216 /*
217 * Fast path for plain ASCII characters
218 * excluding NUL.
219 */
220 nb = 1;
221 else if ((nb = _UTF2_mbrtowc(&wc, s, nms, ps, loc)) ==
222 (size_t)-1)
223 /* Invalid sequence - mbrtowc() sets errno. */
224 return ((size_t)-1);
225 else if (nb == 0 || nb == (size_t)-2)
226 return (nchr);
227 s += nb;
228 nms -= nb;
229 nchr++;
230 }
231 /*NOTREACHED*/
232 }
233
234 /*
235 * The fast path in the loop below is not safe if an ASCII
236 * character appears as anything but the first byte of a
237 * multibyte sequence. Check now to avoid doing it in the loop.
238 */
239 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
240 errno = EILSEQ;
241 return ((size_t)-1);
242 }
243 while (len-- > 0) {
244 if (nms > 0 && (signed char)*s > 0) {
245 /*
246 * Fast path for plain ASCII characters
247 * excluding NUL.
248 */
249 *dst = (wchar_t)*s;
250 nb = 1;
251 } else if ((nb = _UTF2_mbrtowc(dst, s, nms, ps, loc)) ==
252 (size_t)-1) {
253 *src = s;
254 return ((size_t)-1);
255 } else if (nb == (size_t)-2) {
256 *src = s + nms;
257 return (nchr);
258 } else if (nb == 0) {
259 *src = NULL;
260 return (nchr);
261 }
262 s += nb;
263 nms -= nb;
264 nchr++;
265 dst++;
266 }
267 *src = s;
268 return (nchr);
269 }
270
271 static size_t
272 _UTF2_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc)
273 {
274 _UTF2State *us;
275 unsigned char lead;
276 int i, len;
277
278 us = (_UTF2State *)ps;
279
280 if (us->want != 0) {
281 errno = EINVAL;
282 return ((size_t)-1);
283 }
284
285 if (s == NULL)
286 /* Reset to initial shift state (no-op) */
287 return (1);
288
289 if ((wc & ~0x7f) == 0) {
290 /* Fast path for plain ASCII characters. */
291 *s = (char)wc;
292 return (1);
293 }
294
295 /*
296 * Determine the number of octets needed to represent this character.
297 * We always output the shortest sequence possible. Also specify the
298 * first few bits of the first octet, which contains the information
299 * about the sequence length.
300 */
301 if ((wc & ~0x7f) == 0) {
302 lead = 0;
303 len = 1;
304 } else if ((wc & ~0x7ff) == 0) {
305 lead = 0xc0;
306 len = 2;
307 } else if ((wc & ~0xffff) == 0) {
308 lead = 0xe0;
309 len = 3;
310 } else {
311 errno = EILSEQ;
312 return ((size_t)-1);
313 }
314
315 /*
316 * Output the octets representing the character in chunks
317 * of 6 bits, least significant last. The first octet is
318 * a special case because it contains the sequence length
319 * information.
320 */
321 for (i = len - 1; i > 0; i--) {
322 s[i] = (wc & 0x3f) | 0x80;
323 wc >>= 6;
324 }
325 *s = (wc & 0xff) | lead;
326
327 return (len);
328 }
329
330 static size_t
331 _UTF2_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
332 size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc)
333 {
334 _UTF2State *us;
335 char buf[MB_LEN_MAX];
336 const wchar_t *s;
337 size_t nbytes;
338 size_t nb;
339
340 us = (_UTF2State *)ps;
341
342 if (us->want != 0) {
343 errno = EINVAL;
344 return ((size_t)-1);
345 }
346
347 s = *src;
348 nbytes = 0;
349
350 if (dst == NULL) {
351 while (nwc-- > 0) {
352 if (0 <= *s && *s < 0x80)
353 /* Fast path for plain ASCII characters. */
354 nb = 1;
355 else if ((nb = _UTF2_wcrtomb(buf, *s, ps, loc)) ==
356 (size_t)-1)
357 /* Invalid character - wcrtomb() sets errno. */
358 return ((size_t)-1);
359 if (*s == L'\0')
360 return (nbytes + nb - 1);
361 s++;
362 nbytes += nb;
363 }
364 return (nbytes);
365 }
366
367 while (len > 0 && nwc-- > 0) {
368 if (0 <= *s && *s < 0x80) {
369 /* Fast path for plain ASCII characters. */
370 nb = 1;
371 *dst = *s;
372 } else if (len > (size_t)UTF2_MB_CUR_MAX) {
373 /* Enough space to translate in-place. */
374 if ((nb = (int)_UTF2_wcrtomb(dst, *s, ps, loc)) < 0) {
375 *src = s;
376 return ((size_t)-1);
377 }
378 } else {
379 /*
380 * May not be enough space; use temp. buffer.
381 */
382 if ((nb = (int)_UTF2_wcrtomb(buf, *s, ps, loc)) < 0) {
383 *src = s;
384 return ((size_t)-1);
385 }
386 if (nb > (int)len)
387 /* MB sequence for character won't fit. */
388 break;
389 memcpy(dst, buf, nb);
390 }
391 if (*s == L'\0') {
392 *src = NULL;
393 return (nbytes + nb - 1);
394 }
395 s++;
396 dst += nb;
397 len -= nb;
398 nbytes += nb;
399 }
400 *src = s;
401 return (nbytes);
402 }