locale/FreeBSD/utf8.c

   1 /*-
   2  * Copyright (c) 2002 Tim J. Robbins
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 #include <sys/cdefs.h>
  28 __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.2 2003/02/18 13:39:51 nectar Exp $");
  29
  30 #include <rune.h>
  31 #include <stddef.h>
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34
  35 rune_t  _UTF8_sgetrune(const char *, size_t, char const **);
  36 int     _UTF8_sputrune(rune_t, char *, size_t, char **);
  37
  38 int
  39 _UTF8_init(_RuneLocale *rl)
  40 {
  41
  42         rl->sgetrune = _UTF8_sgetrune;
  43         rl->sputrune = _UTF8_sputrune;
  44         _CurrentRuneLocale = rl;
  45         __mb_cur_max = 6;
  46
  47         return (0);
  48 }
  49
  50 rune_t
  51 _UTF8_sgetrune(const char *string, size_t n, const char **result)
  52 {
  53         int ch, len, mask;
  54         rune_t lbound, wch;
  55
  56         if (n < 1) {
  57                 if (result != NULL)
  58                         *result = string;
  59                 return (_INVALID_RUNE);
  60         }
  61
  62         /*
  63          * Determine the number of octets that make up this character from
  64          * the first octet, and a mask that extracts the interesting bits of
  65          * the first octet.
  66          *
  67          * We also specify a lower bound for the character code to detect
  68          * redundant, non-"shortest form" encodings. For example, the
  69          * sequence C0 80 is _not_ a legal representation of the null
  70          * character. This enforces a 1-to-1 mapping between character
  71          * codes and their multibyte representations.
  72          */
  73         ch = (unsigned char)*string;
  74         if ((ch & 0x80) == 0) {
  75                 mask = 0x7f;
  76                 len = 1;
  77                 lbound = 0;
  78         } else if ((ch & 0xe0) == 0xc0) {
  79                 mask = 0x1f;
  80                 len = 2;
  81                 lbound = 0x80;
  82         } else if ((ch & 0xf0) == 0xe0) {
  83                 mask = 0x0f;
  84                 len = 3;
  85                 lbound = 0x800;
  86         } else if ((ch & 0xf8) == 0xf0) {
  87                 mask = 0x07;
  88                 len = 4;
  89                 lbound = 0x10000;
  90         } else if ((ch & 0xfc) == 0xf8) {
  91                 mask = 0x03;
  92                 len = 5;
  93                 lbound = 0x200000;
  94         } else if ((ch & 0xfc) == 0xfc) {
  95                 mask = 0x01;
  96                 len = 6;
  97                 lbound = 0x4000000;
  98         } else {
  99                 /*
 100                  * Malformed input; input is not UTF-8.
 101                  */
 102                 if (result != NULL)
 103                         *result = string + 1;
 104                 return (_INVALID_RUNE);
 105         }
 106
 107         if (n < len) {
 108                 /*
 109                  * Truncated or partial input.
 110                  */
 111                 if (result != NULL)
 112                         *result = string;
 113                 return (_INVALID_RUNE);
 114         }
 115
 116         /*
 117          * Decode the octet sequence representing the character in chunks
 118          * of 6 bits, most significant first.
 119          */
 120         wch = (unsigned char)*string++ & mask;
 121         while (--len != 0) {
 122                 if ((*string & 0xc0) != 0x80) {
 123                         /*
 124                          * Malformed input; bad characters in the middle
 125                          * of a character.
 126                          */
 127                         wch = _INVALID_RUNE;
 128                         if (result != NULL)
 129                                 *result = string + 1;
 130                         return (_INVALID_RUNE);
 131                 }
 132                 wch <<= 6;
 133                 wch |= *string++ & 0x3f;
 134         }
 135         if (wch != _INVALID_RUNE && wch < lbound)
 136                 /*
 137                  * Malformed input; redundant encoding.
 138                  */
 139                 wch = _INVALID_RUNE;
 140         if (result != NULL)
 141                 *result = string;
 142         return (wch);
 143 }
 144
 145 int
 146 _UTF8_sputrune(rune_t c, char *string, size_t n, char **result)
 147 {
 148         unsigned char lead;
 149         int i, len;
 150
 151         /*
 152          * Determine the number of octets needed to represent this character.
 153          * We always output the shortest sequence possible. Also specify the
 154          * first few bits of the first octet, which contains the information
 155          * about the sequence length.
 156          */
 157         if ((c & ~0x7f) == 0) {
 158                 lead = 0;
 159                 len = 1;
 160         } else if ((c & ~0x7ff) == 0) {
 161                 lead = 0xc0;
 162                 len = 2;
 163         } else if ((c & ~0xffff) == 0) {
 164                 lead = 0xe0;
 165                 len = 3;
 166         } else if ((c & ~0x1fffff) == 0) {
 167                 lead = 0xf0;
 168                 len = 4;
 169         } else if ((c & ~0x3ffffff) == 0) {
 170                 lead = 0xf8;
 171                 len = 5;
 172         } else if ((c & ~0x7fffffff) == 0) {
 173                 lead = 0xfc;
 174                 len = 6;
 175         } else {
 176                 /*
 177                  * Wide character code is out of range.
 178                  */
 179                 if (result != NULL)
 180                         *result = NULL;
 181                 return (0);
 182         }
 183
 184         if (n < len) {
 185                 if (result != NULL)
 186                         *result = NULL;
 187         } else {
 188                 /*
 189                  * Output the octets representing the character in chunks
 190                  * of 6 bits, least significant last. The first octet is
 191                  * a special case because it contains the sequence length
 192                  * information.
 193                  */
 194                 for (i = len - 1; i > 0; i--) {
 195                         string[i] = (c & 0x3f) | 0x80;
 196                         c >>= 6;
 197                 }
 198                 *string = (c & 0xff) | lead;
 199                 if (result != NULL)
 200                         *result = string + len;
 201         }
 202
 203         return (len);
 204 }