]> git.saurik.com Git - apple/libc.git/blob - locale/FreeBSD/utf8.c
95ddce9243994dbde05d1881711eb8309144d0ad
[apple/libc.git] / locale / FreeBSD / utf8.c
1 /*-
2 * Copyright (c) 2002 Tim J. Robbins
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.2 2003/02/18 13:39:51 nectar Exp $");
29
30 #include <rune.h>
31 #include <stddef.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34
35 rune_t _UTF8_sgetrune(const char *, size_t, char const **);
36 int _UTF8_sputrune(rune_t, char *, size_t, char **);
37
38 int
39 _UTF8_init(_RuneLocale *rl)
40 {
41
42 rl->sgetrune = _UTF8_sgetrune;
43 rl->sputrune = _UTF8_sputrune;
44 _CurrentRuneLocale = rl;
45 __mb_cur_max = 6;
46
47 return (0);
48 }
49
50 rune_t
51 _UTF8_sgetrune(const char *string, size_t n, const char **result)
52 {
53 int ch, len, mask;
54 rune_t lbound, wch;
55
56 if (n < 1) {
57 if (result != NULL)
58 *result = string;
59 return (_INVALID_RUNE);
60 }
61
62 /*
63 * Determine the number of octets that make up this character from
64 * the first octet, and a mask that extracts the interesting bits of
65 * the first octet.
66 *
67 * We also specify a lower bound for the character code to detect
68 * redundant, non-"shortest form" encodings. For example, the
69 * sequence C0 80 is _not_ a legal representation of the null
70 * character. This enforces a 1-to-1 mapping between character
71 * codes and their multibyte representations.
72 */
73 ch = (unsigned char)*string;
74 if ((ch & 0x80) == 0) {
75 mask = 0x7f;
76 len = 1;
77 lbound = 0;
78 } else if ((ch & 0xe0) == 0xc0) {
79 mask = 0x1f;
80 len = 2;
81 lbound = 0x80;
82 } else if ((ch & 0xf0) == 0xe0) {
83 mask = 0x0f;
84 len = 3;
85 lbound = 0x800;
86 } else if ((ch & 0xf8) == 0xf0) {
87 mask = 0x07;
88 len = 4;
89 lbound = 0x10000;
90 } else if ((ch & 0xfc) == 0xf8) {
91 mask = 0x03;
92 len = 5;
93 lbound = 0x200000;
94 } else if ((ch & 0xfc) == 0xfc) {
95 mask = 0x01;
96 len = 6;
97 lbound = 0x4000000;
98 } else {
99 /*
100 * Malformed input; input is not UTF-8.
101 */
102 if (result != NULL)
103 *result = string + 1;
104 return (_INVALID_RUNE);
105 }
106
107 if (n < len) {
108 /*
109 * Truncated or partial input.
110 */
111 if (result != NULL)
112 *result = string;
113 return (_INVALID_RUNE);
114 }
115
116 /*
117 * Decode the octet sequence representing the character in chunks
118 * of 6 bits, most significant first.
119 */
120 wch = (unsigned char)*string++ & mask;
121 while (--len != 0) {
122 if ((*string & 0xc0) != 0x80) {
123 /*
124 * Malformed input; bad characters in the middle
125 * of a character.
126 */
127 wch = _INVALID_RUNE;
128 if (result != NULL)
129 *result = string + 1;
130 return (_INVALID_RUNE);
131 }
132 wch <<= 6;
133 wch |= *string++ & 0x3f;
134 }
135 if (wch != _INVALID_RUNE && wch < lbound)
136 /*
137 * Malformed input; redundant encoding.
138 */
139 wch = _INVALID_RUNE;
140 if (result != NULL)
141 *result = string;
142 return (wch);
143 }
144
145 int
146 _UTF8_sputrune(rune_t c, char *string, size_t n, char **result)
147 {
148 unsigned char lead;
149 int i, len;
150
151 /*
152 * Determine the number of octets needed to represent this character.
153 * We always output the shortest sequence possible. Also specify the
154 * first few bits of the first octet, which contains the information
155 * about the sequence length.
156 */
157 if ((c & ~0x7f) == 0) {
158 lead = 0;
159 len = 1;
160 } else if ((c & ~0x7ff) == 0) {
161 lead = 0xc0;
162 len = 2;
163 } else if ((c & ~0xffff) == 0) {
164 lead = 0xe0;
165 len = 3;
166 } else if ((c & ~0x1fffff) == 0) {
167 lead = 0xf0;
168 len = 4;
169 } else if ((c & ~0x3ffffff) == 0) {
170 lead = 0xf8;
171 len = 5;
172 } else if ((c & ~0x7fffffff) == 0) {
173 lead = 0xfc;
174 len = 6;
175 } else {
176 /*
177 * Wide character code is out of range.
178 */
179 if (result != NULL)
180 *result = NULL;
181 return (0);
182 }
183
184 if (n < len) {
185 if (result != NULL)
186 *result = NULL;
187 } else {
188 /*
189 * Output the octets representing the character in chunks
190 * of 6 bits, least significant last. The first octet is
191 * a special case because it contains the sequence length
192 * information.
193 */
194 for (i = len - 1; i > 0; i--) {
195 string[i] = (c & 0x3f) | 0x80;
196 c >>= 6;
197 }
198 *string = (c & 0xff) | lead;
199 if (result != NULL)
200 *result = string + len;
201 }
202
203 return (len);
204 }