-size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
- mbstate_t * __restrict);
-int _UTF8_mbsinit(const mbstate_t *);
-size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, const char ** __restrict,
- size_t, size_t, mbstate_t * __restrict);
-size_t _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
-size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
- size_t, size_t, mbstate_t * __restrict);
+/*
+ * 10952550: detect ill-formed UTF-8
+ * Unicode 6.0, definition D92 (section 3.9), mandates specific byte
+ * sequences for well-formed UTF-8. UTF-8 sequences are now limited to 4
+ * bytes, while the FreeBSD code originally handled up to 6. Illegal
+ * surrogate code point sequences are now detected. And while "non-shortest
+ * forms" were already detected, this only happened after completing the
+ * sequence. Now, all ill-formed sequences are detected at the earliest point.
+ *
+ * Table 3-7. Well-Formed UTF-8 Byte Sequences
+ *
+ * Code Points 1st 2nd 3rd 4th Byte
+ * U+0000..U+007F 00..7F
+ * U+0080..U+07FF C2..DF 80..BF
+ * U+0800..U+0FFF E0 A0..BF 80..BF
+ * U+1000..U+CFFF E1..EC 80..BF 80..BF
+ * U+D000..U+D7FF ED 80..9F 80..BF
+ * U+E000..U+FFFF EE..EF 80..BF 80..BF
+ * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
+ * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
+ * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+ *
+ * Note that while any 3rd and 4th byte can be in the range 80..BF, the
+ * second byte is often limited to a smaller range.
+ */
+
+typedef struct { /* allowed range for the 2nd byte of a UTF-8 sequence */
+ unsigned char lowerbound; /* smallest accepted value (presumably inclusive) */
+ unsigned char upperbound; /* largest accepted value (presumably inclusive) */
+} SecondByte;
+static SecondByte sb_00_00 = {0x00, 0x00}; /* not a Table 3-7 row; NOTE(review): looks like a "no range" placeholder -- confirm */
+static SecondByte sb_80_8F = {0x80, 0x8F}; /* Table 3-7 row for lead byte F4 */
+static SecondByte sb_80_9F = {0x80, 0x9F}; /* Table 3-7 row for lead byte ED */
+static SecondByte sb_80_BF = {0x80, 0xBF}; /* Table 3-7 rows for C2..DF, E1..EC, EE..EF, F1..F3 */
+static SecondByte sb_90_BF = {0x90, 0xBF}; /* Table 3-7 row for lead byte F0 */
+static SecondByte sb_A0_BF = {0xA0, 0xBF}; /* Table 3-7 row for lead byte E0 */
+
+#define UTF8_MB_CUR_MAX 4 /* longest well-formed UTF-8 sequence, in bytes */
+/* Forward declarations of the file-local UTF-8 conversion routines; each takes a trailing locale_t. NOTE(review): names suggest the mbrtowc()-family backends -- confirm. */
+static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
+ size_t, mbstate_t * __restrict, locale_t);
+static int _UTF8_mbsinit(const mbstate_t *, locale_t);
+static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict,
+ const char ** __restrict, size_t, size_t,
+ mbstate_t * __restrict, locale_t);
+static size_t _UTF8_wcrtomb(char * __restrict, wchar_t,
+ mbstate_t * __restrict, locale_t);
+static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
+ size_t, size_t, mbstate_t * __restrict, locale_t);