]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_utfconv.c
xnu-517.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25
26 /*
27 Includes Unicode 3.2 decomposition code derived from Core Foundation
28 */
29
30 #include <sys/param.h>
31 #include <sys/utfconv.h>
32 #include <sys/errno.h>
33 #include <architecture/byte_order.h>
34
35 /*
36 * UTF-8 (Unicode Transformation Format)
37 *
38 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
39 * character as a sequence of one to four bytes. Only the shortest form
40 * required to represent the significant Unicode bits is legal.
41 *
42 * UTF-8 Multibyte Codes
43 *
44 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
45 * -----------------------------------------------------------------------------
46 * 1 7 0x0000 0x007F 0xxxxxxx
47 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
48 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
49 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
50 * -----------------------------------------------------------------------------
51 */
52
53
/*
 * UTF-8 byte count for a single UTF-16 code unit.
 *
 * Each half of a surrogate pair (0xD800 - 0xDFFF) is counted as two
 * bytes, so a complete pair adds up to four.
 */
#define UNICODE_TO_UTF8_LEN(c)                          \
	((c) < 0x0080 ? 1 :                             \
	 ((c) < 0x0800 ? 2 :                            \
	  (((c) & 0xf800) == 0xd800 ? 2 : 3)))

/* Code point written in place of an embedded NUL when encoding */
#define UCS_ALT_NULL	0x2400

/*
 * Surrogate Pair Constants
 *
 * SP_HALF_SHIFT / SP_HALF_MASK split the 20 bits above SP_HALF_BASE
 * into the two 10-bit halves carried by a surrogate pair.
 */
#define SP_HALF_SHIFT	10
#define SP_HALF_BASE	0x0010000UL
#define SP_HALF_MASK	0x3FFUL

/* High (leading) surrogate range */
#define SP_HIGH_FIRST	0xD800UL
#define SP_HIGH_LAST	0xDBFFUL
/* Low (trailing) surrogate range */
#define SP_LOW_FIRST	0xDC00UL
#define SP_LOW_LAST	0xDFFFUL
68
69
70 #include "vfs_utfconvdata.h"
71
72
73 /*
74 * Test for a combining character.
75 *
76 * Similar to __CFUniCharIsNonBaseCharacter except that
77 * unicode_combinable also includes Hangul Jamo characters.
78 */
79 static inline int
80 unicode_combinable(u_int16_t character)
81 {
82 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
83 u_int8_t value;
84
85 if (character < 0x0300)
86 return (0);
87
88 value = bitmap[(character >> 8) & 0xFF];
89
90 if (value == 0xFF) {
91 return (1);
92 } else if (value) {
93 bitmap = bitmap + ((value - 1) * 32) + 256;
94 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
95 }
96 return (0);
97 }
98
99 /*
100 * Test for a precomposed character.
101 *
102 * Similar to __CFUniCharIsDecomposableCharacter.
103 */
104 static inline int
105 unicode_decomposeable(u_int16_t character) {
106 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
107 u_int8_t value;
108
109 if (character < 0x00C0)
110 return (0);
111
112 value = bitmap[(character >> 8) & 0xFF];
113
114 if (value == 0xFF) {
115 return (1);
116 } else if (value) {
117 bitmap = bitmap + ((value - 1) * 32) + 256;
118 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
119 }
120 return (0);
121 }
122
123 static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
124
125 static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
126
127
/*
 * Number of bytes that follow a UTF-8 lead byte, indexed by the top
 * five bits of that byte (byte >> 3).  Entries of -1 are bytes that
 * utf8_decodestr rejects as the start of a sequence.
 */
char utf_extrabytes[32] = {
	 0,  0,  0,  0,  0,  0,  0,  0,		/* 0x00 - 0x3F */
	 0,  0,  0,  0,  0,  0,  0,  0,		/* 0x40 - 0x7F */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x80 - 0xBF */
	 1,  1,  1,  1,				/* 0xC0 - 0xDF */
	 2,  2,					/* 0xE0 - 0xEF */
	 3,					/* 0xF0 - 0xF7 */
	-1					/* 0xF8 - 0xFF */
};
132
133
134 /*
135 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
136 *
137 * NOTES:
138 * If '/' chars are allowed on disk then an alternate
139 * (replacement) char must be provided in altslash.
140 *
141 * input flags:
142 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
143 */
144 size_t
145 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
146 int flags)
147 {
148 u_int16_t ucs_ch;
149 int charcnt;
150 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
151 size_t len;
152
153 charcnt = ucslen / 2;
154 len = 0;
155
156 while (charcnt-- > 0) {
157 ucs_ch = *ucsp++;
158
159 if (swapbytes)
160 ucs_ch = NXSwapShort(ucs_ch);
161 if (ucs_ch == '/')
162 ucs_ch = altslash ? altslash : '_';
163 else if (ucs_ch == '\0')
164 ucs_ch = UCS_ALT_NULL;
165
166 len += UNICODE_TO_UTF8_LEN(ucs_ch);
167 }
168
169 return (len);
170 }
171
172
173 /*
174 * utf8_encodestr - Encodes a Unicode string to UTF-8
175 *
176 * NOTES:
177 * The resulting UTF-8 string is NULL terminated.
178 *
179 * If '/' chars are allowed on disk then an alternate
180 * (replacement) char must be provided in altslash.
181 *
182 * input flags:
183 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
184 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
185 *
186 * result:
187 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
188 * EINVAL: Illegal char found; char was replaced by an '_'.
189 */
190 int
191 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
192 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
193 {
194 u_int8_t * bufstart;
195 u_int8_t * bufend;
196 u_int16_t ucs_ch;
197 u_int16_t * chp = NULL;
198 u_int16_t sequence[8];
199 int extra = 0;
200 int charcnt;
201 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
202 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
203 int decompose = (flags & UTF_DECOMPOSED);
204 int result = 0;
205
206 bufstart = utf8p;
207 bufend = bufstart + buflen;
208 if (nullterm)
209 --bufend;
210 charcnt = ucslen / 2;
211
212 while (charcnt-- > 0) {
213 if (extra > 0) {
214 --extra;
215 ucs_ch = *chp++;
216 } else {
217 ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
218
219 if (decompose && unicode_decomposeable(ucs_ch)) {
220 extra = unicode_decompose(ucs_ch, sequence) - 1;
221 charcnt += extra;
222 ucs_ch = sequence[0];
223 chp = &sequence[1];
224 }
225 }
226
227 /* Slash and NULL are not permitted */
228 if (ucs_ch == '/') {
229 if (altslash)
230 ucs_ch = altslash;
231 else {
232 ucs_ch = '_';
233 result = EINVAL;
234 }
235 } else if (ucs_ch == '\0') {
236 ucs_ch = UCS_ALT_NULL;
237 }
238
239 if (ucs_ch < 0x0080) {
240 if (utf8p >= bufend) {
241 result = ENAMETOOLONG;
242 break;
243 }
244 *utf8p++ = ucs_ch;
245
246 } else if (ucs_ch < 0x800) {
247 if ((utf8p + 1) >= bufend) {
248 result = ENAMETOOLONG;
249 break;
250 }
251 *utf8p++ = 0xc0 | (ucs_ch >> 6);
252 *utf8p++ = 0x80 | (0x3f & ucs_ch);
253
254 } else {
255 /* Combine valid surrogate pairs */
256 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
257 && charcnt > 0) {
258 u_int16_t ch2;
259 u_int32_t pair;
260
261 ch2 = swapbytes ? NXSwapShort(*ucsp) : *ucsp;
262 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
263 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
264 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
265 if ((utf8p + 3) >= bufend) {
266 result = ENAMETOOLONG;
267 break;
268 }
269 --charcnt;
270 ++ucsp;
271 *utf8p++ = 0xf0 | (pair >> 18);
272 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
273 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
274 *utf8p++ = 0x80 | (0x3f & pair);
275 continue;
276 }
277 }
278 if ((utf8p + 2) >= bufend) {
279 result = ENAMETOOLONG;
280 break;
281 }
282 *utf8p++ = 0xe0 | (ucs_ch >> 12);
283 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
284 *utf8p++ = 0x80 | (0x3f & ucs_ch);
285 }
286 }
287
288 *utf8len = utf8p - bufstart;
289 if (nullterm)
290 *utf8p++ = '\0';
291
292 return (result);
293 }
294
295
296 /*
297 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
298 *
299 * NOTES:
300 * The input UTF-8 string does not need to be null terminated
301 * if utf8len is set.
302 *
303 * If '/' chars are allowed on disk then an alternate
304 * (replacement) char must be provided in altslash.
305 *
306 * input flags:
307 * UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime
308 * UTF_DECOMPOSED: Unicode output string must be fully decompsed
309 *
310 * result:
311 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
312 * EINVAL: Illegal UTF-8 sequence found.
313 */
314 int
315 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
316 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
317 {
318 u_int16_t* bufstart;
319 u_int16_t* bufend;
320 unsigned int ucs_ch;
321 unsigned int byte;
322 int result = 0;
323 int decompose, precompose, swapbytes;
324
325 decompose = (flags & UTF_DECOMPOSED);
326 precompose = (flags & UTF_PRECOMPOSED);
327 swapbytes = (flags & UTF_REVERSE_ENDIAN);
328
329 bufstart = ucsp;
330 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
331
332 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
333 if (ucsp >= bufend)
334 goto toolong;
335
336 /* check for ascii */
337 if (byte < 0x80) {
338 ucs_ch = byte; /* 1st byte */
339 } else {
340 u_int32_t ch;
341 int extrabytes = utf_extrabytes[byte >> 3];
342
343 if (utf8len < extrabytes)
344 goto invalid;
345 utf8len -= extrabytes;
346
347 switch (extrabytes) {
348 case 1:
349 ch = byte; ch <<= 6; /* 1st byte */
350 byte = *utf8p++; /* 2nd byte */
351 if ((byte >> 6) != 2)
352 goto invalid;
353 ch += byte;
354 ch -= 0x00003080UL;
355 if (ch < 0x0080)
356 goto invalid;
357 ucs_ch = ch;
358 break;
359 case 2:
360 ch = byte; ch <<= 6; /* 1st byte */
361 byte = *utf8p++; /* 2nd byte */
362 if ((byte >> 6) != 2)
363 goto invalid;
364 ch += byte; ch <<= 6;
365 byte = *utf8p++; /* 3rd byte */
366 if ((byte >> 6) != 2)
367 goto invalid;
368 ch += byte;
369 ch -= 0x000E2080UL;
370 if (ch < 0x0800)
371 goto invalid;
372 if (ch >= 0xD800) {
373 if (ch <= 0xDFFF)
374 goto invalid;
375 if (ch == 0xFFFE || ch == 0xFFFF)
376 goto invalid;
377 }
378 ucs_ch = ch;
379 break;
380 case 3:
381 ch = byte; ch <<= 6; /* 1st byte */
382 byte = *utf8p++; /* 2nd byte */
383 if ((byte >> 6) != 2)
384 goto invalid;
385 ch += byte; ch <<= 6;
386 byte = *utf8p++; /* 3rd byte */
387 if ((byte >> 6) != 2)
388 goto invalid;
389 ch += byte; ch <<= 6;
390 byte = *utf8p++; /* 4th byte */
391 if ((byte >> 6) != 2)
392 goto invalid;
393 ch += byte;
394 ch -= 0x03C82080UL + SP_HALF_BASE;
395 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
396 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
397 goto invalid;
398 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
399 if (ucsp >= bufend)
400 goto toolong;
401 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
402 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
403 goto invalid;
404 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
405 continue;
406 default:
407 goto invalid;
408 }
409 if (decompose) {
410 if (unicode_decomposeable(ucs_ch)) {
411 u_int16_t sequence[8];
412 int count, i;
413
414 count = unicode_decompose(ucs_ch, sequence);
415
416 for (i = 0; i < count; ++i) {
417 ucs_ch = sequence[i];
418 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
419 if (ucsp >= bufend)
420 goto toolong;
421 }
422 continue;
423 }
424 } else if (precompose && (ucsp != bufstart)) {
425 u_int16_t composite, base;
426
427 if (unicode_combinable(ucs_ch)) {
428 base = swapbytes ? NXSwapShort(*(ucsp - 1)) : *(ucsp - 1);
429 composite = unicode_combine(base, ucs_ch);
430 if (composite) {
431 --ucsp;
432 ucs_ch = composite;
433 }
434 }
435 }
436 if (ucs_ch == UCS_ALT_NULL)
437 ucs_ch = '\0';
438 }
439 if (ucs_ch == altslash)
440 ucs_ch = '/';
441
442 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
443 }
444
445 exit:
446 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
447
448 return (result);
449
450 invalid:
451 result = EINVAL;
452 goto exit;
453
454 toolong:
455 result = ENAMETOOLONG;
456 goto exit;
457 }
458
459
460 /*
461 * Unicode 3.2 decomposition code (derived from Core Foundation)
462 */
463
/* One entry of a key-sorted lookup table with 32-bit values */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * getmappedvalue32 - binary search a key-sorted unicode_mappings32 table
 *
 * theTable:  entries sorted by ascending _key
 * numElem:   number of entries in theTable (may be 0)
 * character: key to look up
 *
 * Returns the _value stored for character, or 0 when the table is
 * empty or has no entry for it.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings32 *p, *q, *divider;

	/* Without this, numElem - 1 wraps and the range test reads out of bounds */
	if (numElem == 0)
		return (0);

	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	/*
	 * character lies within [first key, last key] from here on, so
	 * the search never steps q below the start of the table.
	 */
	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key)
			q = divider - 1;
		else if (character > divider->_key)
			p = divider + 1;
		else
			return (divider->_value);
	}
	return (0);
}
488
/*
 * Layout of a decomposition table value as read by
 * unicode_recursive_decompose: bit 15 asks for the first mapped char
 * to be decomposed again, bits 12-14 hold the mapping length.
 */
#define RECURSIVE_DECOMPOSITION	(1 << 15)
#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)

/* One entry of a key-sorted lookup table with 16-bit values */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * getmappedvalue16 - binary search a key-sorted unicode_mappings16 table
 *
 * theTable:  entries sorted by ascending _key
 * numElem:   number of entries in theTable (may be 0)
 * character: key to look up
 *
 * Returns the _value stored for character, or 0 when the table is
 * empty or has no entry for it.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings16 *p, *q, *divider;

	/* Without this, numElem - 1 wraps and the range test reads out of bounds */
	if (numElem == 0)
		return (0);

	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	/*
	 * character lies within [first key, last key] from here on, so
	 * the search never steps q below the start of the table.
	 */
	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key)
			q = divider - 1;
		else if (character > divider->_key)
			p = divider + 1;
		else
			return (divider->_value);
	}
	return (0);
}
519
520
521 static u_int32_t
522 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
523 {
524 u_int16_t value;
525 u_int32_t length;
526 u_int16_t firstChar;
527 u_int16_t theChar;
528 const u_int16_t *bmpMappings;
529 u_int32_t usedLength;
530
531 value = getmappedvalue16(
532 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
533 __UniCharDecompositionTableLength, character);
534 length = EXTRACT_COUNT(value);
535 firstChar = value & 0x0FFF;
536 theChar = firstChar;
537 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
538 usedLength = 0;
539
540 if (value & RECURSIVE_DECOMPOSITION) {
541 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
542
543 --length; /* Decrement for the first char */
544 if (!usedLength)
545 return 0;
546 ++bmpMappings;
547 convertedChars += usedLength;
548 }
549
550 usedLength += length;
551
552 while (length--)
553 *(convertedChars++) = *(bmpMappings++);
554
555 return (usedLength);
556 }
557
/* Hangul syllable block and conjoining Jamo (L, V, T) bases */
#define HANGUL_SBASE 0xAC00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11A7

/* Number of syllables and of L, V, T Jamo used to build them */
#define HANGUL_SCOUNT 11172
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)

/*
 * unicode_decompose - decompose a composed Unicode char
 *
 * Composed Unicode characters are forbidden on
 * HFS Plus volumes.  unicode_decompose will convert a
 * composed character into its correct decomposed
 * sequence.
 *
 * Hangul syllables (HANGUL_SBASE up to, but not including,
 * HANGUL_SBASE + HANGUL_SCOUNT) are split arithmetically into two
 * or three Jamo; every other character goes through the
 * decomposition table.
 *
 * Returns the number of characters stored in convertedChars
 * (0 if the table lookup produced nothing).
 *
 * Similar to CFUniCharDecomposeCharacter
 */
static int
unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
{
	/*
	 * The upper bound is exclusive: HANGUL_SBASE + HANGUL_SCOUNT is
	 * the first code point after the syllable block.
	 */
	if ((character >= HANGUL_SBASE) &&
	    (character < (HANGUL_SBASE + HANGUL_SCOUNT))) {
		u_int32_t length;

		character -= HANGUL_SBASE;
		/* A syllable without a trailing consonant yields only L + V */
		length = (character % HANGUL_TCOUNT ? 3 : 2);

		*(convertedChars++) =
			character / HANGUL_NCOUNT + HANGUL_LBASE;
		*(convertedChars++) =
			(character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
		if (length > 2)
			*convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
		return (length);
	} else {
		return (unicode_recursive_decompose(character, convertedChars));
	}
}
600
601 /*
602 * unicode_combine - generate a precomposed Unicode char
603 *
604 * Precomposed Unicode characters are required for some volume
605 * formats and network protocols. unicode_combine will combine
606 * a decomposed character sequence into a single precomposed
607 * (composite) character.
608 *
609 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
610 * also handles Hangul Jamo characters.
611 */
612 static u_int16_t
613 unicode_combine(u_int16_t base, u_int16_t combining)
614 {
615 u_int32_t value;
616
617 /* Check HANGUL */
618 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
619 /* 2 char Hangul sequences */
620 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
621 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
622 return (HANGUL_SBASE +
623 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
624 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
625 }
626
627 /* 3 char Hangul sequences */
628 if ((combining > HANGUL_TBASE) &&
629 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
630 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
631 return (0);
632 else
633 return (base + (combining - HANGUL_TBASE));
634 }
635 }
636
637 value = getmappedvalue32(
638 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
639 __CFUniCharPrecompositionTableLength, combining);
640
641 if (value) {
642 value = getmappedvalue16(
643 (const unicode_mappings16 *)
644 ((u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
645 (value >> 16), base);
646 }
647 return (value);
648 }
649