]>
git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_utfconv.c
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
23 #include <sys/param.h>
24 #include <sys/utfconv.h>
25 #include <sys/errno.h>
26 #include <architecture/byte_order.h>
30 * UTF-8 (UCS Transformation Format)
32 * The following subset of UTF-8 is used to encode UCS-2 filenames. It
33 * requires a maximum of 3 bytes per UCS-2 character. Only the
34 * shortest encoding required to represent the significant UCS-2 bits
37 * UTF-8 Multibyte Codes
39 * Bytes Bits UCS-2 Min UCS-2 Max UTF-8 Byte Sequence (binary)
40 * -------------------------------------------------------------------
41 * 1 7 0x0000 0x007F 0xxxxxxx
42 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
43 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
44 * -------------------------------------------------------------------
/*
 * UCS_TO_UTF_LEN - number of UTF-8 bytes needed to encode UCS-2 value c:
 * 1 for values below 0x0080, 2 for values below 0x0800, otherwise 3.
 * The argument may be evaluated twice, so it must have no side effects.
 */
48 #define UCS_TO_UTF_LEN(c) ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : 3))
51 static u_int16_t ucs_decompose
__P((u_int16_t
, u_int16_t
*));
55 * utf8_encodelen - Calculates the UTF-8 encoding length for a UCS-2 filename
58 * If '/' chars are allowed on disk then an alternate
59 * (replacement) char must be provided in altslash.
62 * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
65 utf8_encodelen(ucsp
, ucslen
, altslash
, flags
)
66 const u_int16_t
* ucsp
;
73 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
79 while (charcnt
-- > 0) {
83 ucs_ch
= NXSwapShort(ucs_ch
);
84 if (altslash
&& ucs_ch
== '/')
89 len
+= UCS_TO_UTF_LEN(ucs_ch
);
97 * utf8_encodestr - Encodes a UCS-2 (Unicode) string to UTF-8
100 * The resulting UTF-8 string is null terminated unless UTF_NO_NULL_TERM is set.
102 * If '/' chars are allowed on disk then an alternate
103 * (replacement) char must be provided in altslash.
106 * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
107 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
109 int utf8_encodestr(ucsp
, ucslen
, utf8p
, utf8len
, buflen
, altslash
, flags
)
110 const u_int16_t
* ucsp
;
122 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
123 int nullterm
= ((flags
& UTF_NO_NULL_TERM
) == 0);
127 bufend
= bufstart
+ buflen
;
130 charcnt
= ucslen
/ 2;
132 while (charcnt
-- > 0) {
136 ucs_ch
= NXSwapShort(ucs_ch
);
137 if (altslash
&& ucs_ch
== '/')
140 if ((ucs_ch
< 0x0080) && (ucs_ch
!= '\0')) {
141 if (utf8p
>= bufend
) {
142 result
= ENAMETOOLONG
;
147 } else if (ucs_ch
< 0x800) {
148 if ((utf8p
+ 1) >= bufend
) {
149 result
= ENAMETOOLONG
;
152 /* NOTE: NULL maps to 0xC080 */
153 *utf8p
++ = (ucs_ch
>> 6) | 0xc0;
154 *utf8p
++ = (ucs_ch
& 0x3f) | 0x80;
157 if ((utf8p
+ 2) >= bufend
) {
158 result
= ENAMETOOLONG
;
161 *utf8p
++ = (ucs_ch
>> 12) | 0xe0;
162 *utf8p
++ = ((ucs_ch
>> 6) & 0x3f) | 0x80;
163 *utf8p
++ = ((ucs_ch
) & 0x3f) | 0x80;
167 *utf8len
= utf8p
- bufstart
;
176 * utf8_decodestr - Decodes a UTF-8 string back to UCS-2 (Unicode)
179 * The input UTF-8 string does not need to be null terminated
182 * If '/' chars are allowed on disk then an alternate
183 * (replacement) char must be provided in altslash.
186 * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
187 * UTF_DECOMPOSED: UCS-2 output string must be fully decomposed
190 utf8_decodestr(utf8p
, utf8len
, ucsp
, ucslen
, buflen
, altslash
, flags
)
191 const u_int8_t
* utf8p
;
204 int decompose
, swapbytes
;
206 decompose
= (flags
& UTF_DECOMPOSED
);
207 swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
210 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
212 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
213 if (ucsp
>= bufend
) {
214 result
= ENAMETOOLONG
;
218 /* check for ascii */
222 switch (byte
& 0xf0) {
226 /* extract bits 6 - 10 from first byte */
227 ucs_ch
= (byte
& 0x1F) << 6;
228 if ((ucs_ch
< 0x0080) && (*utf8p
!= 0x80)) {
229 result
= EINVAL
; /* seq not minimal */
235 /* extract bits 12 - 15 from first byte */
236 ucs_ch
= (byte
& 0x0F) << 6;
238 /* extract bits 6 - 11 from second byte */
239 if (((byte
= *utf8p
++) & 0xc0) != 0x80) {
245 ucs_ch
+= (byte
& 0x3F);
248 if (ucs_ch
< 0x0800) {
249 result
= EINVAL
; /* seq not minimal */
258 /* extract bits 0 - 5 from final byte */
259 if (((byte
= *utf8p
++) & 0xc0) != 0x80) {
264 ucs_ch
+= (byte
& 0x3F);
269 ucs_ch
= ucs_decompose(ucs_ch
, &comb_ch
);
273 *ucsp
++ = NXSwapShort(ucs_ch
);
277 if (ucsp
>= bufend
) {
278 result
= ENAMETOOLONG
;
287 if (ucs_ch
== altslash
)
290 ucs_ch
= NXSwapShort(ucs_ch
);
295 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
302 * Lookup tables for Unicode chars 0x00C0 thru 0x00FF
303 * primary_char yields first decomposed char. If this
304 * char is an alpha char then get the combining char
305 * from the combining_char table and add 0x0300 to it.
308 static unsigned char primary_char
[64] = {
309 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43,
311 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,
313 0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7,
315 0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF,
317 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63,
319 0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,
321 0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7,
323 0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79,
326 static unsigned char combining_char
[64] = {
327 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
329 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
331 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
333 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,
335 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
337 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
339 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
341 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08
345 /* CJK codepoints 0x3000 ~ 0x30FF */
346 static const unsigned long __CJKDecompBitmap
[] = {
347 0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C, /* 0x3000 */
348 0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2, /* 0x3080 */
/*
 * IS_DECOMPOSABLE - test one bit in a decomposition bitmap
 *
 * table is an array of bitmap words holding 32 flags each and
 * unicodeVal is the zero-based flag index.  Flags are numbered
 * most-significant first: index 0 is bit 31 of table[0], index 32
 * is bit 31 of table[1], and so on.  Evaluates to non-zero when the
 * flag is set, zero otherwise.
 *
 * The mask is built from an unsigned long constant because shifting
 * a plain int 1 left by 31 overflows a 32-bit signed int, which is
 * undefined behaviour.  unicodeVal is evaluated twice, so it must
 * have no side effects.
 */
#define IS_DECOMPOSABLE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
354 * ucs_decompose - decompose a composed UCS-2 char
356 * Composed Unicode characters are forbidden on
357 * HFS Plus volumes. ucs_decompose will convert a
358 * composed character into its correct decomposed
361 * Currently only MacRoman and MacJapanese chars
362 * are handled. Other composed characters are
366 ucs_decompose(register u_int16_t ch
, u_int16_t
*cmb
)
372 if ((ch
<= 0x00FF) && (ch
>= 0x00C0)) {
375 base
= (u_int16_t
) primary_char
[ch
];
378 *cmb
= (u_int16_t
)0x0300 + (u_int16_t
)combining_char
[ch
];
380 } else if ((ch
> 0x3000) && (ch
< 0x3100) &&
381 IS_DECOMPOSABLE(__CJKDecompBitmap
, ch
- 0x3000)) {
383 /* Handle HIRAGANA LETTERs */
385 case 0x3071: base
= 0x306F; *cmb
= 0x309A; break; /* PA */
386 case 0x3074: base
= 0x3072; *cmb
= 0x309A; break; /* PI */
387 case 0x3077: base
= 0x3075; *cmb
= 0x309A; break; /* PU */
388 case 0x307A: base
= 0x3078; *cmb
= 0x309A; break; /* PE */
390 case 0x307D: base
= 0x307B; *cmb
= 0x309A; break; /* PO */
391 case 0x3094: base
= 0x3046; *cmb
= 0x3099; break; /* VU */
392 case 0x30D1: base
= 0x30CF; *cmb
= 0x309A; break; /* PA */
393 case 0x30D4: base
= 0x30D2; *cmb
= 0x309A; break; /* PI */
395 case 0x30D7: base
= 0x30D5; *cmb
= 0x309A; break; /* PU */
396 case 0x30DA: base
= 0x30D8; *cmb
= 0x309A; break; /* PE */
397 case 0x30DD: base
= 0x30DB; *cmb
= 0x309A; break; /* PO */
398 case 0x30F4: base
= 0x30A6; *cmb
= 0x3099; break; /* VU */
400 case 0x30F7: base
= 0x30EF; *cmb
= 0x3099; break; /* VA */
401 case 0x30F8: base
= 0x30F0; *cmb
= 0x3099; break; /* VI */
402 case 0x30F9: base
= 0x30F1; *cmb
= 0x3099; break; /* VE */
403 case 0x30FA: base
= 0x30F2; *cmb
= 0x3099; break; /* VO */
406 /* the rest (41 of them) have a simple conversion */