X-Git-Url: https://git.saurik.com/apple/libc.git/blobdiff_plain/3d9156a7a519a5e3aa1b92e9d9d4b991f1aed7ff..2be56ee90c5c5bee77895b8787a43e894249002b:/locale/FreeBSD/utf8.c diff --git a/locale/FreeBSD/utf8.c b/locale/FreeBSD/utf8.c index 26f89f7..100573e 100644 --- a/locale/FreeBSD/utf8.c +++ b/locale/FreeBSD/utf8.c @@ -25,7 +25,9 @@ */ #include -__FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.11 2004/07/27 06:29:48 tjr Exp $"); +__FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $"); + +#include "xlocale_private.h" #include #include @@ -35,54 +37,101 @@ __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.11 2004/07/27 06:29:48 tjr Ex #include #include "mblocal.h" -size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, - mbstate_t * __restrict); -int _UTF8_mbsinit(const mbstate_t *); -size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, const char ** __restrict, - size_t, size_t, mbstate_t * __restrict); -size_t _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); -size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, - size_t, size_t, mbstate_t * __restrict); +/* + * 10952550: detect ill-formed UTF-8 + * Unicode 6.0, section D92, mandates specific byte sequences for well- + * formed UTF-8. UTF-8 sequences are now limited to 4 bytes, while the + * FreeBSD code originally handled up to 6. Illegal surrogate code point + * sequences are now detected. And while "non-shortest forms" were detected, + * this only happened after completing the sequence. Now, all ill-formed + * sequences are detected at the earliest point. + * + * Table 3-7. Well-Formed UTF-8 Byte Sequences + * + * Code Points 1st 2nd 3rd 4th Byte + * U+0000..U+007F 00..7F + * U+0080..U+07FF C2..DF 80..BF + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+CFFF E1..EC 80..BF 80..BF + * U+D000..U+D7FF ED 80..9F 80..BF + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + * + * Note that while any 3rd and 4th byte can be in the range 80..BF, the + * second byte is often limited to a smaller range. + */ + +typedef struct { + unsigned char lowerbound; + unsigned char upperbound; +} SecondByte; +static SecondByte sb_00_00 = {0x00, 0x00}; +static SecondByte sb_80_8F = {0x80, 0x8F}; +static SecondByte sb_80_9F = {0x80, 0x9F}; +static SecondByte sb_80_BF = {0x80, 0xBF}; +static SecondByte sb_90_BF = {0x90, 0xBF}; +static SecondByte sb_A0_BF = {0xA0, 0xBF}; + +#define UTF8_MB_CUR_MAX 4 + +static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, + size_t, mbstate_t * __restrict, locale_t); +static int _UTF8_mbsinit(const mbstate_t *, locale_t); +static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, + const char ** __restrict, size_t, size_t, + mbstate_t * __restrict, locale_t); +static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, + mbstate_t * __restrict, locale_t); +static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, + size_t, size_t, mbstate_t * __restrict, locale_t); typedef struct { wchar_t ch; int want; - wchar_t lbound; + SecondByte sb; } _UTF8State; int -_UTF8_init(_RuneLocale *rl) +_UTF8_init(struct __xlocale_st_runelocale *xrl) { - __mbrtowc = _UTF8_mbrtowc; - __wcrtomb = _UTF8_wcrtomb; - __mbsinit = _UTF8_mbsinit; - __mbsnrtowcs = _UTF8_mbsnrtowcs; - __wcsnrtombs = _UTF8_wcsnrtombs; - _CurrentRuneLocale = rl; - __mb_cur_max = 6; + xrl->__mbrtowc = _UTF8_mbrtowc; + xrl->__wcrtomb = _UTF8_wcrtomb; + xrl->__mbsinit = _UTF8_mbsinit; + xrl->__mbsnrtowcs = _UTF8_mbsnrtowcs; + xrl->__wcsnrtombs = _UTF8_wcsnrtombs; + xrl->__mb_cur_max = UTF8_MB_CUR_MAX; + /* + * UCS-4 encoding used as the internal representation, so + * slots 0x0080-0x00FF are occuped and must be excluded + * from the single byte ctype by setting the limit. + */ + xrl->__mb_sb_limit = 128; return (0); } -int -_UTF8_mbsinit(const mbstate_t *ps) +static int +_UTF8_mbsinit(const mbstate_t *ps, locale_t loc) { return (ps == NULL || ((const _UTF8State *)ps)->want == 0); } -size_t +static size_t _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, - mbstate_t * __restrict ps) + mbstate_t * __restrict ps, locale_t loc) { _UTF8State *us; int ch, i, mask, want; - wchar_t lbound, wch; + wchar_t wch; + SecondByte sb; us = (_UTF8State *)ps; - if (us->want < 0 || us->want > 6) { + if (us->want < 0 || us->want > UTF8_MB_CUR_MAX) { errno = EINVAL; return ((size_t)-1); } @@ -111,38 +160,50 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, * interesting bits of the first octet. We already know * the character is at least two bytes long. * - * We also specify a lower bound for the character code to - * detect redundant, non-"shortest form" encodings. For - * example, the sequence C0 80 is _not_ a legal representation - * of the null character. This enforces a 1-to-1 mapping - * between character codes and their multibyte representations. + * We detect if the first byte is illegal, and set sb to + * the legal range of the second byte. */ ch = (unsigned char)*s; if ((ch & 0x80) == 0) { mask = 0x7f; want = 1; - lbound = 0; + sb = sb_00_00; } else if ((ch & 0xe0) == 0xc0) { + if (ch < 0xc2) goto malformed; mask = 0x1f; want = 2; - lbound = 0x80; + sb = sb_80_BF; } else if ((ch & 0xf0) == 0xe0) { mask = 0x0f; want = 3; - lbound = 0x800; + switch (ch) { + case 0xe0: + sb = sb_A0_BF; + break; + case 0xed: + sb = sb_80_9F; + break; + default: + sb = sb_80_BF; + break; + } } else if ((ch & 0xf8) == 0xf0) { + if (ch > 0xf4) goto malformed; mask = 0x07; want = 4; - lbound = 0x10000; - } else if ((ch & 0xfc) == 0xf8) { - mask = 0x03; - want = 5; - lbound = 0x200000; - } else if ((ch & 0xfc) == 0xfc) { - mask = 0x01; - want = 6; - lbound = 0x4000000; + switch (ch) { + case 0xf0: + sb = sb_90_BF; + break; + case 0xf4: + sb = sb_80_8F; + break; + default: + sb = sb_80_BF; + break; + } } else { +malformed: /* * Malformed input; input is not UTF-8. */ @@ -151,7 +212,7 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, } } else { want = us->want; - lbound = us->lbound; + sb = us->sb; } /* @@ -163,40 +224,30 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, else wch = us->ch; for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { - if ((*s & 0xc0) != 0x80) { - /* - * Malformed input; bad characters in the middle - * of a character. - */ - errno = EILSEQ; - return ((size_t)-1); - } + if (sb.lowerbound) { + if ((unsigned char)*s < sb.lowerbound || + (unsigned char)*s > sb.upperbound) goto malformed; + sb = sb_00_00; + } else if ((*s & 0xc0) != 0x80) goto malformed; wch <<= 6; wch |= *s++ & 0x3f; } if (i < want) { /* Incomplete multibyte sequence. */ us->want = want - i; - us->lbound = lbound; + us->sb = sb; us->ch = wch; return ((size_t)-2); } - if (wch < lbound) { - /* - * Malformed input; redundant encoding. - */ - errno = EILSEQ; - return ((size_t)-1); - } if (pwc != NULL) *pwc = wch; us->want = 0; return (wch == L'\0' ? 0 : want); } -size_t +static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, - size_t nms, size_t len, mbstate_t * __restrict ps) + size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc) { _UTF8State *us; const char *s; @@ -226,7 +277,7 @@ _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, * excluding NUL. */ nb = 1; - else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == + else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps, loc)) == (size_t)-1) /* Invalid sequence - mbrtowc() sets errno. */ return ((size_t)-1); @@ -256,7 +307,7 @@ _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, */ *dst = (wchar_t)*s; nb = 1; - } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == + } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, loc)) == (size_t)-1) { *src = s; return ((size_t)-1); @@ -276,8 +327,8 @@ _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, return (nchr); } -size_t -_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) +static size_t +_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc) { _UTF8State *us; unsigned char lead; @@ -313,18 +364,15 @@ _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) lead = 0xc0; len = 2; } else if ((wc & ~0xffff) == 0) { + if (wc >= 0xd800 && wc <= 0xdfff) goto illegal; lead = 0xe0; len = 3; } else if ((wc & ~0x1fffff) == 0) { + if (wc > 0x10ffff) goto illegal; lead = 0xf0; len = 4; - } else if ((wc & ~0x3ffffff) == 0) { - lead = 0xf8; - len = 5; - } else if ((wc & ~0x7fffffff) == 0) { - lead = 0xfc; - len = 6; } else { +illegal: errno = EILSEQ; return ((size_t)-1); } @@ -344,9 +392,9 @@ _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) return (len); } -size_t +static size_t _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, - size_t nwc, size_t len, mbstate_t * __restrict ps) + size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc) { _UTF8State *us; char buf[MB_LEN_MAX]; @@ -369,7 +417,7 @@ _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, if (0 <= *s && *s < 0x80) /* Fast path for plain ASCII characters. */ nb = 1; - else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == + else if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == (size_t)-1) /* Invalid character - wcrtomb() sets errno. */ return ((size_t)-1); @@ -386,9 +434,9 @@ _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, /* Fast path for plain ASCII characters. */ nb = 1; *dst = *s; - } else if (len > (size_t)MB_CUR_MAX) { + } else if (len > (size_t)UTF8_MB_CUR_MAX) { /* Enough space to translate in-place. */ - if ((nb = (int)_UTF8_wcrtomb(dst, *s, ps)) < 0) { + if ((nb = _UTF8_wcrtomb(dst, *s, ps, loc)) == (size_t)-1) { *src = s; return ((size_t)-1); } @@ -396,7 +444,7 @@ _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, /* * May not be enough space; use temp. buffer. */ - if ((nb = (int)_UTF8_wcrtomb(buf, *s, ps)) < 0) { + if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == (size_t)-1) { *src = s; return ((size_t)-1); }