]>
git.saurik.com Git - apple/libc.git/blob - locale/FreeBSD/utf8.c
95ddce9243994dbde05d1881711eb8309144d0ad
2 * Copyright (c) 2002 Tim J. Robbins
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.2 2003/02/18 13:39:51 nectar Exp $");
35 rune_t
_UTF8_sgetrune(const char *, size_t, char const **);
36 int _UTF8_sputrune(rune_t
, char *, size_t, char **);
39 _UTF8_init(_RuneLocale
*rl
)
42 rl
->sgetrune
= _UTF8_sgetrune
;
43 rl
->sputrune
= _UTF8_sputrune
;
44 _CurrentRuneLocale
= rl
;
51 _UTF8_sgetrune(const char *string
, size_t n
, const char **result
)
59 return (_INVALID_RUNE
);
63 * Determine the number of octets that make up this character from
64 * the first octet, and a mask that extracts the interesting bits of
67 * We also specify a lower bound for the character code to detect
68 * redundant, non-"shortest form" encodings. For example, the
69 * sequence C0 80 is _not_ a legal representation of the null
70 * character. This enforces a 1-to-1 mapping between character
71 * codes and their multibyte representations.
73 ch
= (unsigned char)*string
;
74 if ((ch
& 0x80) == 0) {
78 } else if ((ch
& 0xe0) == 0xc0) {
82 } else if ((ch
& 0xf0) == 0xe0) {
86 } else if ((ch
& 0xf8) == 0xf0) {
90 } else if ((ch
& 0xfc) == 0xf8) {
94 } else if ((ch
& 0xfc) == 0xfc) {
100 * Malformed input; input is not UTF-8.
103 *result
= string
+ 1;
104 return (_INVALID_RUNE
);
109 * Truncated or partial input.
113 return (_INVALID_RUNE
);
117 * Decode the octet sequence representing the character in chunks
118 * of 6 bits, most significant first.
120 wch
= (unsigned char)*string
++ & mask
;
122 if ((*string
& 0xc0) != 0x80) {
124 * Malformed input; bad characters in the middle
129 *result
= string
+ 1;
130 return (_INVALID_RUNE
);
133 wch
|= *string
++ & 0x3f;
135 if (wch
!= _INVALID_RUNE
&& wch
< lbound
)
137 * Malformed input; redundant encoding.
146 _UTF8_sputrune(rune_t c
, char *string
, size_t n
, char **result
)
152 * Determine the number of octets needed to represent this character.
153 * We always output the shortest sequence possible. Also specify the
154 * first few bits of the first octet, which contains the information
155 * about the sequence length.
157 if ((c
& ~0x7f) == 0) {
160 } else if ((c
& ~0x7ff) == 0) {
163 } else if ((c
& ~0xffff) == 0) {
166 } else if ((c
& ~0x1fffff) == 0) {
169 } else if ((c
& ~0x3ffffff) == 0) {
172 } else if ((c
& ~0x7fffffff) == 0) {
177 * Wide character code is out of range.
189 * Output the octets representing the character in chunks
190 * of 6 bits, least significant last. The first octet is
191 * a special case because it contains the sequence length
194 for (i
= len
- 1; i
> 0; i
--) {
195 string
[i
] = (c
& 0x3f) | 0x80;
198 *string
= (c
& 0xff) | lead
;
200 *result
= string
+ len
;