locale/FreeBSD/utf8.c

   1 /*-
   2  * Copyright (c) 2002-2004 Tim J. Robbins
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 #include <sys/param.h>
  28 __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $");
  29
  30 #include "xlocale_private.h"
  31
  32 #include <errno.h>
  33 #include <limits.h>
  34 #include <runetype.h>
  35 #include <stdlib.h>
  36 #include <string.h>
  37 #include <wchar.h>
  38 #include "mblocal.h"
  39
  40 /*
  41  * 10952550: detect ill-formed UTF-8
  42  * Unicode 6.0, section D92, mandates specific byte sequences for well-
  43  * formed UTF-8.  UTF-8 sequences are now limited to 4 bytes, while the
  44  * FreeBSD code originally handled up to 6.  Illegal surrogate code point
  45  * sequences are now detected.  And while "non-shortest forms" were detected,
  46  * this only happened after completing the sequence.  Now, all ill-formed
  47  * sequences are detected at the earliest point.
  48  *
  49  *          Table 3-7.  Well-Formed UTF-8 Byte Sequences
  50  *
  51  *      Code Points         1st      2nd      3rd      4th Byte
  52  *    U+0000..U+007F       00..7F
  53  *    U+0080..U+07FF       C2..DF   80..BF
  54  *    U+0800..U+0FFF       E0       A0..BF   80..BF
  55  *    U+1000..U+CFFF       E1..EC   80..BF   80..BF
  56  *    U+D000..U+D7FF       ED       80..9F   80..BF
  57  *    U+E000..U+FFFF       EE..EF   80..BF   80..BF
  58  *    U+10000..U+3FFFF     F0       90..BF   80..BF   80..BF
  59  *    U+40000..U+FFFFF     F1..F3   80..BF   80..BF   80..BF
  60  *    U+100000..U+10FFFF   F4       80..8F   80..BF   80..BF
  61  *
  62  * Note that while any 3rd and 4th byte can be in the range 80..BF, the
  63  * second byte is often limited to a smaller range.
  64  */
  65
  66 typedef struct {
  67         unsigned char lowerbound;
  68         unsigned char upperbound;
  69 } SecondByte;
  70 static SecondByte sb_00_00 = {0x00, 0x00};
  71 static SecondByte sb_80_8F = {0x80, 0x8F};
  72 static SecondByte sb_80_9F = {0x80, 0x9F};
  73 static SecondByte sb_80_BF = {0x80, 0xBF};
  74 static SecondByte sb_90_BF = {0x90, 0xBF};
  75 static SecondByte sb_A0_BF = {0xA0, 0xBF};
  76
  77 #define UTF8_MB_CUR_MAX         4
  78
  79 static size_t   _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
  80                     size_t, mbstate_t * __restrict, locale_t);
  81 static int      _UTF8_mbsinit(const mbstate_t *, locale_t);
  82 static size_t   _UTF8_mbsnrtowcs(wchar_t * __restrict,
  83                     const char ** __restrict, size_t, size_t,
  84                     mbstate_t * __restrict, locale_t);
  85 static size_t   _UTF8_wcrtomb(char * __restrict, wchar_t,
  86                     mbstate_t * __restrict, locale_t);
  87 static size_t   _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
  88                     size_t, size_t, mbstate_t * __restrict, locale_t);
  89
  90 typedef struct {
  91         wchar_t ch;
  92         int     want;
  93         SecondByte sb;
  94 } _UTF8State;
  95
  96 int
  97 _UTF8_init(struct __xlocale_st_runelocale *xrl)
  98 {
  99
 100         xrl->__mbrtowc = _UTF8_mbrtowc;
 101         xrl->__wcrtomb = _UTF8_wcrtomb;
 102         xrl->__mbsinit = _UTF8_mbsinit;
 103         xrl->__mbsnrtowcs = _UTF8_mbsnrtowcs;
 104         xrl->__wcsnrtombs = _UTF8_wcsnrtombs;
 105         xrl->__mb_cur_max = UTF8_MB_CUR_MAX;
 106         /*
 107          * UCS-4 encoding used as the internal representation, so
 108          * slots 0x0080-0x00FF are occuped and must be excluded
 109          * from the single byte ctype by setting the limit.
 110          */
 111         xrl->__mb_sb_limit = 128;
 112
 113         return (0);
 114 }
 115
 116 static int
 117 _UTF8_mbsinit(const mbstate_t *ps, locale_t loc)
 118 {
 119
 120         return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
 121 }
 122
 123 static size_t
 124 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
 125     mbstate_t * __restrict ps, locale_t loc)
 126 {
 127         _UTF8State *us;
 128         int ch, i, mask, want;
 129         wchar_t wch;
 130         SecondByte sb;
 131
 132         us = (_UTF8State *)ps;
 133
 134         if (us->want < 0 || us->want > UTF8_MB_CUR_MAX) {
 135                 errno = EINVAL;
 136                 return ((size_t)-1);
 137         }
 138
 139         if (s == NULL) {
 140                 s = "";
 141                 n = 1;
 142                 pwc = NULL;
 143         }
 144
 145         if (n == 0)
 146                 /* Incomplete multibyte sequence */
 147                 return ((size_t)-2);
 148
 149         if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
 150                 /* Fast path for plain ASCII characters. */
 151                 if (pwc != NULL)
 152                         *pwc = ch;
 153                 return (ch != '\0' ? 1 : 0);
 154         }
 155
 156         if (us->want == 0) {
 157                 /*
 158                  * Determine the number of octets that make up this character
 159                  * from the first octet, and a mask that extracts the
 160                  * interesting bits of the first octet. We already know
 161                  * the character is at least two bytes long.
 162                  *
 163                  * We detect if the first byte is illegal, and set sb to
 164                  * the legal range of the second byte.
 165                  */
 166                 ch = (unsigned char)*s;
 167                 if ((ch & 0x80) == 0) {
 168                         mask = 0x7f;
 169                         want = 1;
 170                         sb = sb_00_00;
 171                 } else if ((ch & 0xe0) == 0xc0) {
 172                         if (ch < 0xc2) goto malformed;
 173                         mask = 0x1f;
 174                         want = 2;
 175                         sb = sb_80_BF;
 176                 } else if ((ch & 0xf0) == 0xe0) {
 177                         mask = 0x0f;
 178                         want = 3;
 179                         switch (ch) {
 180                         case 0xe0:
 181                                 sb = sb_A0_BF;
 182                                 break;
 183                         case 0xed:
 184                                 sb = sb_80_9F;
 185                                 break;
 186                         default:
 187                                 sb = sb_80_BF;
 188                                 break;
 189                         }
 190                 } else if ((ch & 0xf8) == 0xf0) {
 191                         if (ch > 0xf4) goto malformed;
 192                         mask = 0x07;
 193                         want = 4;
 194                         switch (ch) {
 195                         case 0xf0:
 196                                 sb = sb_90_BF;
 197                                 break;
 198                         case 0xf4:
 199                                 sb = sb_80_8F;
 200                                 break;
 201                         default:
 202                                 sb = sb_80_BF;
 203                                 break;
 204                         }
 205                 } else {
 206 malformed:
 207                         /*
 208                          * Malformed input; input is not UTF-8.
 209                          */
 210                         errno = EILSEQ;
 211                         return ((size_t)-1);
 212                 }
 213         } else {
 214                 want = us->want;
 215                 sb = us->sb;
 216         }
 217
 218         /*
 219          * Decode the octet sequence representing the character in chunks
 220          * of 6 bits, most significant first.
 221          */
 222         if (us->want == 0)
 223                 wch = (unsigned char)*s++ & mask;
 224         else
 225                 wch = us->ch;
 226         for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
 227                 if (sb.lowerbound) {
 228                         if ((unsigned char)*s < sb.lowerbound ||
 229                            (unsigned char)*s > sb.upperbound) goto malformed;
 230                         sb = sb_00_00;
 231                 } else if ((*s & 0xc0) != 0x80) goto malformed;
 232                 wch <<= 6;
 233                 wch |= *s++ & 0x3f;
 234         }
 235         if (i < want) {
 236                 /* Incomplete multibyte sequence. */
 237                 us->want = want - i;
 238                 us->sb = sb;
 239                 us->ch = wch;
 240                 return ((size_t)-2);
 241         }
 242         if (pwc != NULL)
 243                 *pwc = wch;
 244         us->want = 0;
 245         return (wch == L'\0' ? 0 : want);
 246 }
 247
 248 static size_t
 249 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
 250     size_t nms, size_t len, mbstate_t * __restrict ps, locale_t loc)
 251 {
 252         _UTF8State *us;
 253         const char *s;
 254         size_t nchr;
 255         wchar_t wc;
 256         size_t nb;
 257
 258         us = (_UTF8State *)ps;
 259
 260         s = *src;
 261         nchr = 0;
 262
 263         if (dst == NULL) {
 264                 /*
 265                  * The fast path in the loop below is not safe if an ASCII
 266                  * character appears as anything but the first byte of a
 267                  * multibyte sequence. Check now to avoid doing it in the loop.
 268                  */
 269                 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
 270                         errno = EILSEQ;
 271                         return ((size_t)-1);
 272                 }
 273                 for (;;) {
 274                         if (nms > 0 && (signed char)*s > 0)
 275                                 /*
 276                                  * Fast path for plain ASCII characters
 277                                  * excluding NUL.
 278                                  */
 279                                 nb = 1;
 280                         else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps, loc)) ==
 281                             (size_t)-1)
 282                                 /* Invalid sequence - mbrtowc() sets errno. */
 283                                 return ((size_t)-1);
 284                         else if (nb == 0 || nb == (size_t)-2)
 285                                 return (nchr);
 286                         s += nb;
 287                         nms -= nb;
 288                         nchr++;
 289                 }
 290                 /*NOTREACHED*/
 291         }
 292
 293         /*
 294          * The fast path in the loop below is not safe if an ASCII
 295          * character appears as anything but the first byte of a
 296          * multibyte sequence. Check now to avoid doing it in the loop.
 297          */
 298         if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
 299                 errno = EILSEQ;
 300                 return ((size_t)-1);
 301         }
 302         while (len-- > 0) {
 303                 if (nms > 0 && (signed char)*s > 0) {
 304                         /*
 305                          * Fast path for plain ASCII characters
 306                          * excluding NUL.
 307                          */
 308                         *dst = (wchar_t)*s;
 309                         nb = 1;
 310                 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, loc)) ==
 311                     (size_t)-1) {
 312                         *src = s;
 313                         return ((size_t)-1);
 314                 } else if (nb == (size_t)-2) {
 315                         *src = s + nms;
 316                         return (nchr);
 317                 } else if (nb == 0) {
 318                         *src = NULL;
 319                         return (nchr);
 320                 }
 321                 s += nb;
 322                 nms -= nb;
 323                 nchr++;
 324                 dst++;
 325         }
 326         *src = s;
 327         return (nchr);
 328 }
 329
 330 static size_t
 331 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps, locale_t loc)
 332 {
 333         _UTF8State *us;
 334         unsigned char lead;
 335         int i, len;
 336
 337         us = (_UTF8State *)ps;
 338
 339         if (us->want != 0) {
 340                 errno = EINVAL;
 341                 return ((size_t)-1);
 342         }
 343
 344         if (s == NULL)
 345                 /* Reset to initial shift state (no-op) */
 346                 return (1);
 347
 348         if ((wc & ~0x7f) == 0) {
 349                 /* Fast path for plain ASCII characters. */
 350                 *s = (char)wc;
 351                 return (1);
 352         }
 353
 354         /*
 355          * Determine the number of octets needed to represent this character.
 356          * We always output the shortest sequence possible. Also specify the
 357          * first few bits of the first octet, which contains the information
 358          * about the sequence length.
 359          */
 360         if ((wc & ~0x7f) == 0) {
 361                 lead = 0;
 362                 len = 1;
 363         } else if ((wc & ~0x7ff) == 0) {
 364                 lead = 0xc0;
 365                 len = 2;
 366         } else if ((wc & ~0xffff) == 0) {
 367                 if (wc >= 0xd800 && wc <= 0xdfff) goto illegal;
 368                 lead = 0xe0;
 369                 len = 3;
 370         } else if ((wc & ~0x1fffff) == 0) {
 371                 if (wc > 0x10ffff) goto illegal;
 372                 lead = 0xf0;
 373                 len = 4;
 374         } else {
 375 illegal:
 376                 errno = EILSEQ;
 377                 return ((size_t)-1);
 378         }
 379
 380         /*
 381          * Output the octets representing the character in chunks
 382          * of 6 bits, least significant last. The first octet is
 383          * a special case because it contains the sequence length
 384          * information.
 385          */
 386         for (i = len - 1; i > 0; i--) {
 387                 s[i] = (wc & 0x3f) | 0x80;
 388                 wc >>= 6;
 389         }
 390         *s = (wc & 0xff) | lead;
 391
 392         return (len);
 393 }
 394
 395 static size_t
 396 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
 397     size_t nwc, size_t len, mbstate_t * __restrict ps, locale_t loc)
 398 {
 399         _UTF8State *us;
 400         char buf[MB_LEN_MAX];
 401         const wchar_t *s;
 402         size_t nbytes;
 403         size_t nb;
 404
 405         us = (_UTF8State *)ps;
 406
 407         if (us->want != 0) {
 408                 errno = EINVAL;
 409                 return ((size_t)-1);
 410         }
 411
 412         s = *src;
 413         nbytes = 0;
 414
 415         if (dst == NULL) {
 416                 while (nwc-- > 0) {
 417                         if (0 <= *s && *s < 0x80)
 418                                 /* Fast path for plain ASCII characters. */
 419                                 nb = 1;
 420                         else if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) ==
 421                             (size_t)-1)
 422                                 /* Invalid character - wcrtomb() sets errno. */
 423                                 return ((size_t)-1);
 424                         if (*s == L'\0')
 425                                 return (nbytes + nb - 1);
 426                         s++;
 427                         nbytes += nb;
 428                 }
 429                 return (nbytes);
 430         }
 431
 432         while (len > 0 && nwc-- > 0) {
 433                 if (0 <= *s && *s < 0x80) {
 434                         /* Fast path for plain ASCII characters. */
 435                         nb = 1;
 436                         *dst = *s;
 437                 } else if (len > (size_t)UTF8_MB_CUR_MAX) {
 438                         /* Enough space to translate in-place. */
 439                         if ((nb = _UTF8_wcrtomb(dst, *s, ps, loc)) == (size_t)-1) {
 440                                 *src = s;
 441                                 return ((size_t)-1);
 442                         }
 443                 } else {
 444                         /*
 445                          * May not be enough space; use temp. buffer.
 446                          */
 447                         if ((nb = _UTF8_wcrtomb(buf, *s, ps, loc)) == (size_t)-1) {
 448                                 *src = s;
 449                                 return ((size_t)-1);
 450                         }
 451                         if (nb > (int)len)
 452                                 /* MB sequence for character won't fit. */
 453                                 break;
 454                         memcpy(dst, buf, nb);
 455                 }
 456                 if (*s == L'\0') {
 457                         *src = NULL;
 458                         return (nbytes + nb - 1);
 459                 }
 460                 s++;
 461                 dst += nb;
 462                 len -= nb;
 463                 nbytes += nb;
 464         }
 465         *src = s;
 466         return (nbytes);
 467 }