]>
Commit | Line | Data |
---|---|---|
1 | /*- | |
2 | * Copyright (c) 2002-2004 Tim J. Robbins | |
3 | * All rights reserved. | |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * | |
14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
24 | * SUCH DAMAGE. | |
25 | */ | |
26 | ||
27 | // MWW: Generated by applying utf2.c.patch to utf8.c in the FreeBSD patch sets. | |
28 | ||
29 | #include <sys/param.h> | |
30 | __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $"); | |
31 | ||
32 | #include "xlocale_private.h" | |
33 | ||
34 | #include <errno.h> | |
35 | #include <limits.h> | |
36 | #include <runetype.h> | |
37 | #include <stdlib.h> | |
38 | #include <string.h> | |
39 | #include <wchar.h> | |
40 | #include "mblocal.h" | |
41 | ||
42 | #define UTF2_MB_CUR_MAX 3 | |
43 | ||
44 | static size_t _UTF2_mbrtowc(wchar_t * __restrict, const char * __restrict, | |
45 | size_t, mbstate_t * __restrict, locale_t); | |
46 | static int _UTF2_mbsinit(const mbstate_t *, locale_t); | |
47 | static size_t _UTF2_mbsnrtowcs(wchar_t * __restrict, | |
48 | const char ** __restrict, size_t, size_t, | |
49 | mbstate_t * __restrict, locale_t); | |
50 | static size_t _UTF2_wcrtomb(char * __restrict, wchar_t, | |
51 | mbstate_t * __restrict, locale_t); | |
52 | static size_t _UTF2_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, | |
53 | size_t, size_t, mbstate_t * __restrict, locale_t); | |
54 | ||
/*
 * Conversion state overlaid on the caller's mbstate_t so that a multibyte
 * character split across several input buffers can be resumed.
 */
typedef struct {
	wchar_t	ch;	/* Bits of the character accumulated so far. */
	int	want;	/* Octets still expected; 0 means initial state. */
	wchar_t	lbound;	/* Smallest value that is a shortest-form encoding. */
} _UTF2State;
60 | ||
61 | int | |
62 | _UTF2_init(struct __xlocale_st_runelocale *xrl) | |
63 | { | |
64 | ||
65 | xrl->__mbrtowc = _UTF2_mbrtowc; | |
66 | xrl->__wcrtomb = _UTF2_wcrtomb; | |
67 | xrl->__mbsinit = _UTF2_mbsinit; | |
68 | xrl->__mbsnrtowcs = _UTF2_mbsnrtowcs; | |
69 | xrl->__wcsnrtombs = _UTF2_wcsnrtombs; | |
70 | xrl->__mb_cur_max = UTF2_MB_CUR_MAX; | |
71 | /* | |
72 | * UCS-4 encoding used as the internal representation, so | |
73 | * slots 0x0080-0x00FF are occuped and must be excluded | |
74 | * from the single byte ctype by setting the limit. | |
75 | */ | |
76 | xrl->__mb_sb_limit = 128; | |
77 | ||
78 | return (0); | |
79 | } | |
80 | ||
81 | static int | |
82 | _UTF2_mbsinit(const mbstate_t *ps, locale_t loc) | |
83 | { | |
84 | ||
85 | return (ps == NULL || ((const _UTF2State *)ps)->want == 0); | |
86 | } | |
87 | ||
88 | static size_t | |
89 | _UTF2_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, | |
90 | mbstate_t * __restrict ps, locale_t loc) | |
91 | { | |
92 | _UTF2State *us; | |
93 | int ch, i, mask, want; | |
94 | wchar_t lbound, wch; | |
95 | ||
96 | us = (_UTF2State *)ps; | |
97 | ||
98 | if (us->want < 0 || us->want > 6) { | |
99 | errno = EINVAL; | |
100 | return ((size_t)-1); | |
101 | } | |
102 | ||
103 | if (s == NULL) { | |
104 | s = ""; | |
105 | n = 1; | |
106 | pwc = NULL; | |
107 | } | |
108 | ||
109 | if (n == 0) | |
110 | /* Incomplete multibyte sequence */ | |
111 | return ((size_t)-2); | |
112 | ||
113 | if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { | |
114 | /* Fast path for plain ASCII characters. */ | |
115 | if (pwc != NULL) | |
116 | *pwc = ch; | |
117 | return (ch != '\0' ? 1 : 0); | |
118 | } | |
119 | ||
120 | if (us->want == 0) { | |
121 | /* | |
122 | * Determine the number of octets that make up this character | |
123 | * from the first octet, and a mask that extracts the | |
124 | * interesting bits of the first octet. We already know | |
125 | * the character is at least two bytes long. | |
126 | * | |
127 | * We also specify a lower bound for the character code to | |
128 | * detect redundant, non-"shortest form" encodings. For | |
129 | * example, the sequence C0 80 is _not_ a legal representation | |
130 | * of the null character. This enforces a 1-to-1 mapping | |
131 | * between character codes and their multibyte representations. | |
132 | */ | |
133 | ch = (unsigned char)*s; | |
134 | if ((ch & 0x80) == 0) { | |
135 | mask = 0x7f; | |
136 | want = 1; | |
137 | lbound = 0; | |
138 | } else if ((ch & 0xe0) == 0xc0) { | |
139 | mask = 0x1f; | |
140 | want = 2; | |
141 | lbound = 0x80; | |
142 | } else if ((ch & 0xf0) == 0xe0) { | |
143 | mask = 0x0f; | |
144 | want = 3; | |
145 | lbound = 0x800; | |
146 | } else { | |
147 | /* | |
148 | * Malformed input; input is not UTF2. | |
149 | */ | |
150 | errno = EILSEQ; | |
151 | return ((size_t)-1); | |
152 | } | |
153 | } else { | |
154 | want = us->want; | |
155 | lbound = us->lbound; | |
156 | } | |
157 | ||
158 | /* | |
159 | * Decode the octet sequence representing the character in chunks | |
160 | * of 6 bits, most significant first. | |
161 | */ | |
162 | if (us->want == 0) | |
163 | wch = (unsigned char)*s++ & mask; | |
164 | else | |
165 | wch = us->ch; | |
166 | for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { | |
167 | if ((*s & 0xc0) != 0x80) { | |
168 | /* | |
169 | * Malformed input; bad characters in the middle | |
170 | * of a character. | |
171 | */ | |
172 | errno = EILSEQ; | |
173 | return ((size_t)-1); | |
174 | } | |
175 | wch <<= 6; | |
176 | wch |= *s++ & 0x3f; | |
177 | } | |
178 | if (i < want) { | |
179 | /* Incomplete multibyte sequence. */ | |
180 | us->want = want - i; | |
181 | us->lbound = lbound; | |
182 | us->ch = wch; | |
183 | return ((size_t)-2); | |
184 | } | |
185 | if (wch < lbound) { | |
186 | /* | |
187 | * Malformed input; redundant encoding. | |
188 | */ | |
189 | errno = EILSEQ; | |
190 | return ((size_t)-1); | |
191 | } | |
192 | if (pwc != NULL) | |
193 | *pwc = wch; | |
194 | us->want = 0; | |
195 | return (wch == L'\0' ? 0 : want); | |
196 | } | |
197 | ||
198 | static size_t | |
199 | _UTF2_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, | |
200 | size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc) | |
201 | { | |
202 | _UTF2State *us; | |
203 | const char *s; | |
204 | size_t nchr; | |
205 | wchar_t wc; | |
206 | size_t nb; | |
207 | ||
208 | us = (_UTF2State *)ps; | |
209 | ||
210 | s = *src; | |
211 | nchr = 0; | |
212 | ||
213 | if (dst == NULL) { | |
214 | /* | |
215 | * The fast path in the loop below is not safe if an ASCII | |
216 | * character appears as anything but the first byte of a | |
217 | * multibyte sequence. Check now to avoid doing it in the loop. | |
218 | */ | |
219 | if (nms > 0 && us->want > 0 && (signed char)*s > 0) { | |
220 | errno = EILSEQ; | |
221 | return ((size_t)-1); | |
222 | } | |
223 | for (;;) { | |
224 | if (nms > 0 && (signed char)*s > 0) | |
225 | /* | |
226 | * Fast path for plain ASCII characters | |
227 | * excluding NUL. | |
228 | */ | |
229 | nb = 1; | |
230 | else if ((nb = _UTF2_mbrtowc(&wc, s, nms, ps, loc)) == | |
231 | (size_t)-1) | |
232 | /* Invalid sequence - mbrtowc() sets errno. */ | |
233 | return ((size_t)-1); | |
234 | else if (nb == 0 || nb == (size_t)-2) | |
235 | return (nchr); | |
236 | s += nb; | |
237 | nms -= nb; | |
238 | nchr++; | |
239 | } | |
240 | /*NOTREACHED*/ | |
241 | } | |
242 | ||
243 | /* | |
244 | * The fast path in the loop below is not safe if an ASCII | |
245 | * character appears as anything but the first byte of a | |
246 | * multibyte sequence. Check now to avoid doing it in the loop. | |
247 | */ | |
248 | if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { | |
249 | errno = EILSEQ; | |
250 | return ((size_t)-1); | |
251 | } | |
252 | while (len-- > 0) { | |
253 | if (nms > 0 && (signed char)*s > 0) { | |
254 | /* | |
255 | * Fast path for plain ASCII characters | |
256 | * excluding NUL. | |
257 | */ | |
258 | *dst = (wchar_t)*s; | |
259 | nb = 1; | |
260 | } else if ((nb = _UTF2_mbrtowc(dst, s, nms, ps, loc)) == | |
261 | (size_t)-1) { | |
262 | *src = s; | |
263 | return ((size_t)-1); | |
264 | } else if (nb == (size_t)-2) { | |
265 | *src = s + nms; | |
266 | return (nchr); | |
267 | } else if (nb == 0) { | |
268 | *src = NULL; | |
269 | return (nchr); | |
270 | } | |
271 | s += nb; | |
272 | nms -= nb; | |
273 | nchr++; | |
274 | dst++; | |
275 | } | |
276 | *src = s; | |
277 | return (nchr); | |
278 | } | |
279 | ||
280 | static size_t | |
281 | _UTF2_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc) | |
282 | { | |
283 | _UTF2State *us; | |
284 | unsigned char lead; | |
285 | int i, len; | |
286 | ||
287 | us = (_UTF2State *)ps; | |
288 | ||
289 | if (us->want != 0) { | |
290 | errno = EINVAL; | |
291 | return ((size_t)-1); | |
292 | } | |
293 | ||
294 | if (s == NULL) | |
295 | /* Reset to initial shift state (no-op) */ | |
296 | return (1); | |
297 | ||
298 | if ((wc & ~0x7f) == 0) { | |
299 | /* Fast path for plain ASCII characters. */ | |
300 | *s = (char)wc; | |
301 | return (1); | |
302 | } | |
303 | ||
304 | /* | |
305 | * Determine the number of octets needed to represent this character. | |
306 | * We always output the shortest sequence possible. Also specify the | |
307 | * first few bits of the first octet, which contains the information | |
308 | * about the sequence length. | |
309 | */ | |
310 | if ((wc & ~0x7f) == 0) { | |
311 | lead = 0; | |
312 | len = 1; | |
313 | } else if ((wc & ~0x7ff) == 0) { | |
314 | lead = 0xc0; | |
315 | len = 2; | |
316 | } else if ((wc & ~0xffff) == 0) { | |
317 | lead = 0xe0; | |
318 | len = 3; | |
319 | } else { | |
320 | errno = EILSEQ; | |
321 | return ((size_t)-1); | |
322 | } | |
323 | ||
324 | /* | |
325 | * Output the octets representing the character in chunks | |
326 | * of 6 bits, least significant last. The first octet is | |
327 | * a special case because it contains the sequence length | |
328 | * information. | |
329 | */ | |
330 | for (i = len - 1; i > 0; i--) { | |
331 | s[i] = (wc & 0x3f) | 0x80; | |
332 | wc >>= 6; | |
333 | } | |
334 | *s = (wc & 0xff) | lead; | |
335 | ||
336 | return (len); | |
337 | } | |
338 | ||
339 | static size_t | |
340 | _UTF2_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, | |
341 | size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc) | |
342 | { | |
343 | _UTF2State *us; | |
344 | char buf[MB_LEN_MAX]; | |
345 | const wchar_t *s; | |
346 | size_t nbytes; | |
347 | size_t nb; | |
348 | ||
349 | us = (_UTF2State *)ps; | |
350 | ||
351 | if (us->want != 0) { | |
352 | errno = EINVAL; | |
353 | return ((size_t)-1); | |
354 | } | |
355 | ||
356 | s = *src; | |
357 | nbytes = 0; | |
358 | ||
359 | if (dst == NULL) { | |
360 | while (nwc-- > 0) { | |
361 | if (0 <= *s && *s < 0x80) | |
362 | /* Fast path for plain ASCII characters. */ | |
363 | nb = 1; | |
364 | else if ((nb = _UTF2_wcrtomb(buf, *s, ps, loc)) == | |
365 | (size_t)-1) | |
366 | /* Invalid character - wcrtomb() sets errno. */ | |
367 | return ((size_t)-1); | |
368 | if (*s == L'\0') | |
369 | return (nbytes + nb - 1); | |
370 | s++; | |
371 | nbytes += nb; | |
372 | } | |
373 | return (nbytes); | |
374 | } | |
375 | ||
376 | while (len > 0 && nwc-- > 0) { | |
377 | if (0 <= *s && *s < 0x80) { | |
378 | /* Fast path for plain ASCII characters. */ | |
379 | nb = 1; | |
380 | *dst = *s; | |
381 | } else if (len > (size_t)UTF2_MB_CUR_MAX) { | |
382 | /* Enough space to translate in-place. */ | |
383 | if ((nb = _UTF2_wcrtomb(dst, *s, ps, loc)) == (size_t)-1) { | |
384 | *src = s; | |
385 | return ((size_t)-1); | |
386 | } | |
387 | } else { | |
388 | /* | |
389 | * May not be enough space; use temp. buffer. | |
390 | */ | |
391 | if ((nb = _UTF2_wcrtomb(buf, *s, ps, loc)) == (size_t)-1) { | |
392 | *src = s; | |
393 | return ((size_t)-1); | |
394 | } | |
395 | if (nb > (int)len) | |
396 | /* MB sequence for character won't fit. */ | |
397 | break; | |
398 | memcpy(dst, buf, nb); | |
399 | } | |
400 | if (*s == L'\0') { | |
401 | *src = NULL; | |
402 | return (nbytes + nb - 1); | |
403 | } | |
404 | s++; | |
405 | dst += nb; | |
406 | len -= nb; | |
407 | nbytes += nb; | |
408 | } | |
409 | *src = s; | |
410 | return (nbytes); | |
411 | } |