]> git.saurik.com Git - apple/xnu.git/blame - bsd/vfs/vfs_utfconv.c
xnu-123.5.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
CommitLineData
1c79356b
A
1/*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23#include <sys/param.h>
24#include <sys/utfconv.h>
25#include <sys/errno.h>
26#include <architecture/byte_order.h>
27
28
/*
 * UTF-8 (UCS Transformation Format)
 *
 * The following subset of UTF-8 is used to encode UCS-2 filenames. It
 * requires a maximum of three (3) bytes per UCS-2 character. Only the
 * shortest encoding required to represent the significant UCS-2 bits
 * is legal. The one exception is NUL (0x0000), which is written as the
 * two-byte sequence 0xC0 0x80 so that an encoded name never contains a
 * zero byte.
 *
 * UTF-8 Multibyte Codes
 *
 * Bytes   Bits   UCS-2 Min   UCS-2 Max   UTF-8 Byte Sequence (binary)
 * -------------------------------------------------------------------
 *   1       7     0x0000      0x007F     0xxxxxxx
 *   2      11     0x0080      0x07FF     110xxxxx 10xxxxxx
 *   3      16     0x0800      0xFFFF     1110xxxx 10xxxxxx 10xxxxxx
 * -------------------------------------------------------------------
 */
46
47
/*
 * Number of UTF-8 bytes (1, 2 or 3) needed to encode the UCS-2 value c
 * according to its magnitude. The argument is evaluated up to twice,
 * so it must be free of side effects.
 */
#define UCS_TO_UTF_LEN(c)	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : 3))


/* Splits a composed UCS-2 char into a base char and a combining char. */
static u_int16_t ucs_decompose(u_int16_t ch, u_int16_t *cmb);
52
53
54/*
55 * utf8_encodelen - Calculates the UTF-8 encoding length for a UCS-2 filename
56 *
57 * NOTES:
58 * If '/' chars are allowed on disk then an alternate
59 * (replacement) char must be provided in altslash.
60 *
61 * input flags:
62 * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
63 */
64size_t
65utf8_encodelen(ucsp, ucslen, altslash, flags)
66 const u_int16_t * ucsp;
67 size_t ucslen;
68 u_int16_t altslash;
69 int flags;
70{
71 u_int16_t ucs_ch;
72 int charcnt;
73 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
74 size_t len;
75
76 charcnt = ucslen / 2;
77 len = 0;
78
79 while (charcnt-- > 0) {
80 ucs_ch = *ucsp++;
81
82 if (swapbytes)
83 ucs_ch = NXSwapShort(ucs_ch);
84 if (altslash && ucs_ch == '/')
85 ucs_ch = altslash;
86 if (ucs_ch == '\0')
87 ucs_ch = 0xc080;
88
89 len += UCS_TO_UTF_LEN(ucs_ch);
90 }
91
92 return (len);
93}
94
95
96/*
97 * utf8_encodestr - Encodes a UCS-2 (Unicode) string to UTF-8
98 *
99 * NOTES:
100 * The resulting UTF-8 string is not null terminated.
101 *
102 * If '/' chars are allowed on disk then an alternate
103 * (replacement) char must be provided in altslash.
104 *
105 * input flags:
106 * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
107 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
108 */
109int utf8_encodestr(ucsp, ucslen, utf8p, utf8len, buflen, altslash, flags)
110 const u_int16_t * ucsp;
111 size_t ucslen;
112 u_int8_t * utf8p;
113 size_t * utf8len;
114 size_t buflen;
115 u_int16_t altslash;
116 int flags;
117{
118 u_int8_t * bufstart;
119 u_int8_t * bufend;
120 u_int16_t ucs_ch;
121 int charcnt;
122 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
123 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
124 int result = 0;
125
126 bufstart = utf8p;
127 bufend = bufstart + buflen;
128 if (nullterm)
129 --bufend;
130 charcnt = ucslen / 2;
131
132 while (charcnt-- > 0) {
133 ucs_ch = *ucsp++;
134
135 if (swapbytes)
136 ucs_ch = NXSwapShort(ucs_ch);
137 if (altslash && ucs_ch == '/')
138 ucs_ch = altslash;
139
140 if ((ucs_ch < 0x0080) && (ucs_ch != '\0')) {
141 if (utf8p >= bufend) {
142 result = ENAMETOOLONG;
143 break;
144 }
145 *utf8p++ = ucs_ch;
146
147 } else if (ucs_ch < 0x800) {
148 if ((utf8p + 1) >= bufend) {
149 result = ENAMETOOLONG;
150 break;
151 }
152 /* NOTE: NULL maps to 0xC080 */
153 *utf8p++ = (ucs_ch >> 6) | 0xc0;
154 *utf8p++ = (ucs_ch & 0x3f) | 0x80;
155
156 } else {
157 if ((utf8p + 2) >= bufend) {
158 result = ENAMETOOLONG;
159 break;
160 }
161 *utf8p++ = (ucs_ch >> 12) | 0xe0;
162 *utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;
163 *utf8p++ = ((ucs_ch) & 0x3f) | 0x80;
164 }
165 }
166
167 *utf8len = utf8p - bufstart;
168 if (nullterm)
169 *utf8p++ = '\0';
170
171 return (result);
172}
173
174
175/*
176 * utf8_decodestr - Decodes a UTF-8 string back to UCS-2 (Unicode)
177 *
178 * NOTES:
179 * The input UTF-8 string does not need to be null terminated
180 * if utf8len is set.
181 *
182 * If '/' chars are allowed on disk then an alternate
183 * (replacement) char must be provided in altslash.
184 *
185 * input flags:
186 * UTF_REV_ENDIAN: UCS-2 byteorder is oposite current runtime
187 * UTF_DECOMPOSED: UCS-2 output string must be fully decompsed
188 */
189int
190utf8_decodestr(utf8p, utf8len, ucsp, ucslen, buflen, altslash, flags)
191 const u_int8_t* utf8p;
192 size_t utf8len;
193 u_int16_t* ucsp;
194 size_t *ucslen;
195 size_t buflen;
196 u_int16_t altslash;
197 int flags;
198{
199 u_int16_t* bufstart;
200 u_int16_t* bufend;
201 u_int16_t ucs_ch;
202 u_int8_t byte;
203 int result = 0;
204 int decompose, swapbytes;
205
206 decompose = (flags & UTF_DECOMPOSED);
207 swapbytes = (flags & UTF_REVERSE_ENDIAN);
208
209 bufstart = ucsp;
210 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
211
212 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
213 if (ucsp >= bufend) {
214 result = ENAMETOOLONG;
215 goto stop;
216 }
217
218 /* check for ascii */
219 if (byte < 0x80) {
220 ucs_ch = byte;
221 } else {
222 switch (byte & 0xf0) {
223 /* 2 byte sequence*/
224 case 0xc0:
225 case 0xd0:
226 /* extract bits 6 - 10 from first byte */
227 ucs_ch = (byte & 0x1F) << 6;
228 if ((ucs_ch < 0x0080) && (*utf8p != 0x80)) {
229 result = EINVAL; /* seq not minimal */
230 goto stop;
231 }
232 break;
233 /* 3 byte sequence*/
234 case 0xe0:
235 /* extract bits 12 - 15 from first byte */
236 ucs_ch = (byte & 0x0F) << 6;
237
238 /* extract bits 6 - 11 from second byte */
239 if (((byte = *utf8p++) & 0xc0) != 0x80) {
240 result = EINVAL;
241 goto stop;
242 }
243 utf8len--;
244
245 ucs_ch += (byte & 0x3F);
246 ucs_ch <<= 6;
247
248 if (ucs_ch < 0x0800) {
249 result = EINVAL; /* seq not minimal */
250 goto stop;
251 }
252 break;
253 default:
254 result = EINVAL;
255 goto stop;
256 }
257
258 /* extract bits 0 - 5 from final byte */
259 if (((byte = *utf8p++) & 0xc0) != 0x80) {
260 result = EINVAL;
261 goto stop;
262 }
263 utf8len--;
264 ucs_ch += (byte & 0x3F);
265
266 if (decompose) {
267 u_int16_t comb_ch;
268
269 ucs_ch = ucs_decompose(ucs_ch, &comb_ch);
270
271 if (comb_ch) {
272 if (swapbytes)
273 *ucsp++ = NXSwapShort(ucs_ch);
274 else
275 *ucsp++ = ucs_ch;
276
277 if (ucsp >= bufend) {
278 result = ENAMETOOLONG;
279 goto stop;
280 }
281
282 ucs_ch = comb_ch;
283 }
284 }
285 }
286
287 if (ucs_ch == altslash)
288 ucs_ch = '/';
289 if (swapbytes)
290 ucs_ch = NXSwapShort(ucs_ch);
291
292 *ucsp++ = ucs_ch;
293 }
294stop:
295 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
296
297 return (result);
298}
299
300
/*
 * Lookup tables for Unicode chars 0x00C0 thru 0x00FF
 * primary_char yields first decomposed char. If this
 * char is an alpha char then get the combining char
 * from the combining_char table and add 0x0300 to it.
 *
 * Both tables are indexed by (char - 0x00C0). Entries whose
 * primary_char is the character itself (e.g. 0xC6, 0xD7) do not
 * decompose; their combining_char entry is never read.
 */

static const unsigned char primary_char[64] = {
	0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43,	/* 0x00C0 */
	0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,	/* 0x00C8 */
	0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7,	/* 0x00D0 */
	0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF,	/* 0x00D8 */
	0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63,	/* 0x00E0 */
	0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,	/* 0x00E8 */
	0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7,	/* 0x00F0 */
	0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79,	/* 0x00F8 */
};

static const unsigned char combining_char[64] = {
	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,	/* 0x00C0 */
	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,	/* 0x00C8 */
	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,	/* 0x00D0 */
	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,	/* 0x00D8 */
	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,	/* 0x00E0 */
	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,	/* 0x00E8 */
	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,	/* 0x00F0 */
	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08	/* 0x00F8 */
};


/*
 * CJK codepoints 0x3000 ~ 0x30FF
 *
 * One bit per code point, most significant bit first within each
 * 32-bit word; a set bit marks a character that decomposes.
 */
static const u_int32_t __CJKDecompBitmap[] = {
	0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C,	/* 0x3000 */
	0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2,	/* 0x3080 */
};

/* The mask is built from an unsigned 1 so that selecting bit 31 is defined. */
#define IS_DECOMPOSABLE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & \
	    ((u_int32_t)1 << (31 - ((unicodeVal) % 32))))

/*
 * ucs_decompose - decompose a composed UCS-2 char
 *
 * Composed Unicode characters are forbidden on
 * HFS Plus volumes. ucs_decompose will convert a
 * composed character into its correct decomposed
 * sequence.
 *
 * Currently only MacRoman and MacJapanese chars
 * are handled. Other composed characters are
 * passed unchanged.
 *
 * ch   the character to decompose
 * cmb  receives the combining character, or 0 when ch does not
 *      decompose
 *
 * Returns the base character (ch itself when it does not decompose).
 */
static u_int16_t
ucs_decompose(u_int16_t ch, u_int16_t *cmb)
{
	u_int16_t base;

	*cmb = 0;

	if ((ch >= 0x00C0) && (ch <= 0x00FF)) {
		unsigned int idx = ch - 0x00C0;

		base = (u_int16_t)primary_char[idx];

		/* only entries that map to an ASCII letter decompose */
		if (base <= 'z')
			*cmb = (u_int16_t)(0x0300 + combining_char[idx]);

	} else if ((ch > 0x3000) && (ch < 0x3100) &&
	    IS_DECOMPOSABLE(__CJKDecompBitmap, ch - 0x3000)) {

		/* Handle HIRAGANA and KATAKANA LETTERs */
		switch (ch) {
		case 0x3071: base = 0x306F; *cmb = 0x309A; break; /* PA */
		case 0x3074: base = 0x3072; *cmb = 0x309A; break; /* PI */
		case 0x3077: base = 0x3075; *cmb = 0x309A; break; /* PU */
		case 0x307A: base = 0x3078; *cmb = 0x309A; break; /* PE */
		case 0x307D: base = 0x307B; *cmb = 0x309A; break; /* PO */
		case 0x3094: base = 0x3046; *cmb = 0x3099; break; /* VU */

		case 0x30D1: base = 0x30CF; *cmb = 0x309A; break; /* PA */
		case 0x30D4: base = 0x30D2; *cmb = 0x309A; break; /* PI */
		case 0x30D7: base = 0x30D5; *cmb = 0x309A; break; /* PU */
		case 0x30DA: base = 0x30D8; *cmb = 0x309A; break; /* PE */
		case 0x30DD: base = 0x30DB; *cmb = 0x309A; break; /* PO */
		case 0x30F4: base = 0x30A6; *cmb = 0x3099; break; /* VU */

		case 0x30F7: base = 0x30EF; *cmb = 0x3099; break; /* VA */
		case 0x30F8: base = 0x30F0; *cmb = 0x3099; break; /* VI */
		case 0x30F9: base = 0x30F1; *cmb = 0x3099; break; /* VE */
		case 0x30FA: base = 0x30F2; *cmb = 0x3099; break; /* VO */

		default:
			/* the rest have a simple conversion */
			base = ch - 1;
			*cmb = 0x3099;
			break;
		}
	} else {
		base = ch;
	}

	return (base);
}
416