]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_utfconv.c
d2623589b8fa2ea5bb06602d854d069938f1f6f2
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23 /*
24 Includes Unicode 3.2 decomposition code derived from Core Foundation
25 */
26
27 #include <sys/param.h>
28 #include <sys/utfconv.h>
29 #include <sys/errno.h>
30 #include <architecture/byte_order.h>
31
32 /*
33 * UTF-8 (Unicode Transformation Format)
34 *
35 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
36 * character as a sequence of one to four bytes. Only the shortest form
37 * required to represent the significant Unicode bits is legal.
38 *
39 * UTF-8 Multibyte Codes
40 *
41 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
42 * -----------------------------------------------------------------------------
43 * 1 7 0x0000 0x007F 0xxxxxxx
44 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
45 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
46 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
47 * -----------------------------------------------------------------------------
48 */
49
50
/*
 * Number of UTF-8 bytes accounted to a single 16-bit Unicode value.
 *
 * Any value in the surrogate range (0xD800 - 0xDFFF) is counted as 2 bytes,
 * so the two halves of a surrogate pair together add up to the 4 bytes of
 * the pair's UTF-8 encoding.
 */
#define UNICODE_TO_UTF8_LEN(c)  \
	((c) >= 0x0800 ? ((((c) & 0xf800) == 0xd800) ? 2 : 3) : ((c) >= 0x0080 ? 2 : 1))

/* Unicode value that stands in for an embedded NUL on the UTF-8 side */
#define UCS_ALT_NULL	0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT	10		/* payload bits carried by each half */
#define SP_HALF_BASE	0x0010000UL	/* first value that needs a pair */
#define SP_HALF_MASK	0x3FFUL		/* mask for one half's payload */

#define SP_HIGH_FIRST	0xD800UL	/* high (leading) surrogate range */
#define SP_HIGH_LAST	0xDBFFUL
#define SP_LOW_FIRST	0xDC00UL	/* low (trailing) surrogate range */
#define SP_LOW_LAST	0xDFFFUL
65
66
67 #include "vfs_utfconvdata.h"
68
69
70 /*
71 * Test for a combining character.
72 *
73 * Similar to __CFUniCharIsNonBaseCharacter except that
74 * unicode_combinable also includes Hangul Jamo characters.
75 */
76 static inline int
77 unicode_combinable(u_int16_t character)
78 {
79 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
80 u_int8_t value;
81
82 if (character < 0x0300)
83 return (0);
84
85 value = bitmap[(character >> 8) & 0xFF];
86
87 if (value == 0xFF) {
88 return (1);
89 } else if (value) {
90 bitmap = bitmap + ((value - 1) * 32) + 256;
91 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
92 }
93 return (0);
94 }
95
96 /*
97 * Test for a precomposed character.
98 *
99 * Similar to __CFUniCharIsDecomposableCharacter.
100 */
101 static inline int
102 unicode_decomposeable(u_int16_t character) {
103 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
104 u_int8_t value;
105
106 if (character < 0x00C0)
107 return (0);
108
109 value = bitmap[(character >> 8) & 0xFF];
110
111 if (value == 0xFF) {
112 return (1);
113 } else if (value) {
114 bitmap = bitmap + ((value - 1) * 32) + 256;
115 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
116 }
117 return (0);
118 }
119
/* Forward declarations; both routines are defined near the end of this file. */

/* Combines a base and a combining character; yields 0 when no composite exists. */
static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);

/* Stores the decomposition in convertedChars and returns its length. */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
123
124
/*
 * Number of bytes that follow a given UTF-8 lead byte, indexed by the
 * top five bits of that byte (byte >> 3).  An entry of -1 marks a byte
 * value that cannot start a sequence.
 */
char utf_extrabytes[32] = {
	 0,  0,  0,  0,  0,  0,  0,  0,		/* 0x00 - 0x3F */
	 0,  0,  0,  0,  0,  0,  0,  0,		/* 0x40 - 0x7F */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x80 - 0xBF */
	 1,  1,  1,  1,				/* 0xC0 - 0xDF */
	 2,  2,					/* 0xE0 - 0xEF */
	 3,					/* 0xF0 - 0xF7 */
	-1					/* 0xF8 - 0xFF */
};
129
130
131 /*
132 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
133 *
134 * NOTES:
135 * If '/' chars are allowed on disk then an alternate
136 * (replacement) char must be provided in altslash.
137 *
138 * input flags:
139 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
140 */
141 size_t
142 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
143 int flags)
144 {
145 u_int16_t ucs_ch;
146 int charcnt;
147 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
148 size_t len;
149
150 charcnt = ucslen / 2;
151 len = 0;
152
153 while (charcnt-- > 0) {
154 ucs_ch = *ucsp++;
155
156 if (swapbytes)
157 ucs_ch = NXSwapShort(ucs_ch);
158 if (ucs_ch == '/')
159 ucs_ch = altslash ? altslash : '_';
160 else if (ucs_ch == '\0')
161 ucs_ch = UCS_ALT_NULL;
162
163 len += UNICODE_TO_UTF8_LEN(ucs_ch);
164 }
165
166 return (len);
167 }
168
169
170 /*
171 * utf8_encodestr - Encodes a Unicode string to UTF-8
172 *
173 * NOTES:
174 * The resulting UTF-8 string is NULL terminated.
175 *
176 * If '/' chars are allowed on disk then an alternate
177 * (replacement) char must be provided in altslash.
178 *
179 * input flags:
180 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
181 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
182 *
183 * result:
184 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
185 * EINVAL: Illegal char found; char was replaced by an '_'.
186 */
187 int
188 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
189 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
190 {
191 u_int8_t * bufstart;
192 u_int8_t * bufend;
193 u_int16_t ucs_ch;
194 u_int16_t * chp = NULL;
195 u_int16_t sequence[8];
196 int extra = 0;
197 int charcnt;
198 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
199 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
200 int decompose = (flags & UTF_DECOMPOSED);
201 int result = 0;
202
203 bufstart = utf8p;
204 bufend = bufstart + buflen;
205 if (nullterm)
206 --bufend;
207 charcnt = ucslen / 2;
208
209 while (charcnt-- > 0) {
210 if (extra > 0) {
211 --extra;
212 ucs_ch = *chp++;
213 } else {
214 ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
215
216 if (decompose && unicode_decomposeable(ucs_ch)) {
217 extra = unicode_decompose(ucs_ch, sequence) - 1;
218 charcnt += extra;
219 ucs_ch = sequence[0];
220 chp = &sequence[1];
221 }
222 }
223
224 /* Slash and NULL are not permitted */
225 if (ucs_ch == '/') {
226 if (altslash)
227 ucs_ch = altslash;
228 else {
229 ucs_ch = '_';
230 result = EINVAL;
231 }
232 } else if (ucs_ch == '\0') {
233 ucs_ch = UCS_ALT_NULL;
234 }
235
236 if (ucs_ch < 0x0080) {
237 if (utf8p >= bufend) {
238 result = ENAMETOOLONG;
239 break;
240 }
241 *utf8p++ = ucs_ch;
242
243 } else if (ucs_ch < 0x800) {
244 if ((utf8p + 1) >= bufend) {
245 result = ENAMETOOLONG;
246 break;
247 }
248 *utf8p++ = 0xc0 | (ucs_ch >> 6);
249 *utf8p++ = 0x80 | (0x3f & ucs_ch);
250
251 } else {
252 /* Combine valid surrogate pairs */
253 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
254 && charcnt > 0) {
255 u_int16_t ch2;
256 u_int32_t pair;
257
258 ch2 = swapbytes ? NXSwapShort(*ucsp) : *ucsp;
259 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
260 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
261 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
262 if ((utf8p + 3) >= bufend) {
263 result = ENAMETOOLONG;
264 break;
265 }
266 --charcnt;
267 ++ucsp;
268 *utf8p++ = 0xf0 | (pair >> 18);
269 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
270 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
271 *utf8p++ = 0x80 | (0x3f & pair);
272 continue;
273 }
274 }
275 if ((utf8p + 2) >= bufend) {
276 result = ENAMETOOLONG;
277 break;
278 }
279 *utf8p++ = 0xe0 | (ucs_ch >> 12);
280 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
281 *utf8p++ = 0x80 | (0x3f & ucs_ch);
282 }
283 }
284
285 *utf8len = utf8p - bufstart;
286 if (nullterm)
287 *utf8p++ = '\0';
288
289 return (result);
290 }
291
292
293 /*
294 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
295 *
296 * NOTES:
297 * The input UTF-8 string does not need to be null terminated
298 * if utf8len is set.
299 *
300 * If '/' chars are allowed on disk then an alternate
301 * (replacement) char must be provided in altslash.
302 *
303 * input flags:
304 * UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime
305 * UTF_DECOMPOSED: Unicode output string must be fully decompsed
306 *
307 * result:
308 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
309 * EINVAL: Illegal UTF-8 sequence found.
310 */
311 int
312 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
313 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
314 {
315 u_int16_t* bufstart;
316 u_int16_t* bufend;
317 u_int16_t ucs_ch;
318 u_int8_t byte;
319 int result = 0;
320 int decompose, precompose, swapbytes;
321
322 decompose = (flags & UTF_DECOMPOSED);
323 precompose = (flags & UTF_PRECOMPOSED);
324 swapbytes = (flags & UTF_REVERSE_ENDIAN);
325
326 bufstart = ucsp;
327 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
328
329 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
330 if (ucsp >= bufend)
331 goto toolong;
332
333 /* check for ascii */
334 if (byte < 0x80) {
335 ucs_ch = byte; /* 1st byte */
336 } else {
337 u_int32_t ch;
338 int extrabytes = utf_extrabytes[byte >> 3];
339
340 if (utf8len < extrabytes)
341 goto invalid;
342 utf8len -= extrabytes;
343
344 switch (extrabytes) {
345 case 1: ch = byte; /* 1st byte */
346 ch <<= 6;
347 ch += *utf8p++; /* 2nd byte */
348 ch -= 0x00003080UL;
349 if (ch < 0x0080)
350 goto invalid;
351 ucs_ch = ch;
352 break;
353
354 case 2: ch = byte; /* 1st byte */
355 ch <<= 6;
356 ch += *utf8p++; /* 2nd byte */
357 ch <<= 6;
358 ch += *utf8p++; /* 3rd byte */
359 ch -= 0x000E2080UL;
360 if (ch < 0x0800)
361 goto invalid;
362 ucs_ch = ch;
363 break;
364
365 case 3: ch = byte; /* 1st byte */
366 ch <<= 6;
367 ch += *utf8p++; /* 2nd byte */
368 ch <<= 6;
369 ch += *utf8p++; /* 3rd byte */
370 ch <<= 6;
371 ch += *utf8p++; /* 4th byte */
372 ch -= 0x03C82080UL + SP_HALF_BASE;
373 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
374 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
375 if (ucsp >= bufend)
376 goto toolong;
377 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
378 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
379 continue;
380
381 default:
382 goto invalid;
383 }
384 if (decompose) {
385 if (unicode_decomposeable(ucs_ch)) {
386 u_int16_t sequence[8];
387 int count, i;
388
389 count = unicode_decompose(ucs_ch, sequence);
390
391 for (i = 0; i < count; ++i) {
392 ucs_ch = sequence[i];
393 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
394 if (ucsp >= bufend)
395 goto toolong;
396 }
397 continue;
398 }
399 } else if (precompose && (ucsp != bufstart)) {
400 u_int16_t composite, base;
401
402 if (unicode_combinable(ucs_ch)) {
403 base = swapbytes ? NXSwapShort(*(ucsp - 1)) : *(ucsp - 1);
404 composite = unicode_combine(base, ucs_ch);
405 if (composite) {
406 --ucsp;
407 ucs_ch = composite;
408 }
409 }
410 }
411 if (ucs_ch == UCS_ALT_NULL)
412 ucs_ch = '\0';
413 }
414 if (ucs_ch == altslash)
415 ucs_ch = '/';
416
417 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
418 }
419
420 exit:
421 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
422
423 return (result);
424
425 invalid:
426 result = EINVAL;
427 goto exit;
428
429 toolong:
430 result = ENAMETOOLONG;
431 goto exit;
432 }
433
434
435 /*
436 * Unicode 3.2 decomposition code (derived from Core Foundation)
437 */
438
/* One entry of a sorted lookup table that maps a key to a 32-bit value. */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * Binary search of a table sorted by ascending _key.
 *
 * theTable  - first entry of the table
 * numElem   - number of entries in the table
 * character - key to look for
 *
 * Returns the _value stored for the key, or 0 if the key is absent.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
		 u_int16_t character)
{
	const unicode_mappings32 *lo = theTable;
	const unicode_mappings32 *hi = theTable + (numElem - 1);
	const unicode_mappings32 *mid;

	/* Keys outside the range covered by the table cannot match. */
	if ((character < lo->_key) || (character > hi->_key))
		return (0);

	while (lo <= hi) {
		mid = lo + ((hi - lo) / 2);

		if (mid->_key == character)
			return (mid->_value);

		if (mid->_key < character)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return (0);
}
463
/*
 * Fields of a 16-bit decomposition table value: bit 15 marks an entry
 * whose first character must itself be decomposed, and bits 12-14 hold
 * the number of characters in the decomposition.
 */
#define RECURSIVE_DECOMPOSITION	(1 << 15)
#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)
466
/* One entry of a sorted lookup table that maps a key to a 16-bit value. */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * Binary search of a table sorted by ascending _key.
 *
 * theTable  - first entry of the table
 * numElem   - number of entries in the table
 * character - key to look for
 *
 * Returns the _value stored for the key, or 0 if the key is absent.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
		 u_int16_t character)
{
	const unicode_mappings16 *lo = theTable;
	const unicode_mappings16 *hi = theTable + (numElem - 1);
	const unicode_mappings16 *mid;

	/* Keys outside the range covered by the table cannot match. */
	if ((character < lo->_key) || (character > hi->_key))
		return (0);

	while (lo <= hi) {
		mid = lo + ((hi - lo) / 2);

		if (mid->_key == character)
			return (mid->_value);

		if (mid->_key < character)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return (0);
}
494
495
496 static u_int32_t
497 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
498 {
499 u_int16_t value;
500 u_int32_t length;
501 u_int16_t firstChar;
502 u_int16_t theChar;
503 const u_int16_t *bmpMappings;
504 u_int32_t usedLength;
505
506 value = getmappedvalue16(
507 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
508 __UniCharDecompositionTableLength, character);
509 length = EXTRACT_COUNT(value);
510 firstChar = value & 0x0FFF;
511 theChar = firstChar;
512 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
513 usedLength = 0;
514
515 if (value & RECURSIVE_DECOMPOSITION) {
516 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
517
518 --length; /* Decrement for the first char */
519 if (!usedLength)
520 return 0;
521 ++bmpMappings;
522 convertedChars += usedLength;
523 }
524
525 usedLength += length;
526
527 while (length--)
528 *(convertedChars++) = *(bmpMappings++);
529
530 return (usedLength);
531 }
532
/*
 * Constants for arithmetic Hangul composition and decomposition.
 * The *BASE values anchor the syllable (S), leading (L), vowel (V) and
 * trailing (T) ranges; the *COUNT values are the sizes used to split a
 * syllable index into its L, V and T parts.  A T index of 0 means the
 * syllable has no trailing part.
 */
#define HANGUL_SBASE 0xAC00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11A7

#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)	/* 588 */
#define HANGUL_SCOUNT (HANGUL_LCOUNT * HANGUL_NCOUNT)	/* 11172 */
543
544 /*
545 * unicode_decompose - decompose a composed Unicode char
546 *
547 * Composed Unicode characters are forbidden on
548 * HFS Plus volumes. ucs_decompose will convert a
549 * composed character into its correct decomposed
550 * sequence.
551 *
552 * Similar to CFUniCharDecomposeCharacter
553 */
554 static int
555 unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
556 {
557 if ((character >= HANGUL_SBASE) &&
558 (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
559 u_int32_t length;
560
561 character -= HANGUL_SBASE;
562 length = (character % HANGUL_TCOUNT ? 3 : 2);
563
564 *(convertedChars++) =
565 character / HANGUL_NCOUNT + HANGUL_LBASE;
566 *(convertedChars++) =
567 (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
568 if (length > 2)
569 *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
570 return (length);
571 } else {
572 return (unicode_recursive_decompose(character, convertedChars));
573 }
574 }
575
576 /*
577 * unicode_combine - generate a precomposed Unicode char
578 *
579 * Precomposed Unicode characters are required for some volume
580 * formats and network protocols. unicode_combine will combine
581 * a decomposed character sequence into a single precomposed
582 * (composite) character.
583 *
584 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
585 * also handles Hangul Jamo characters.
586 */
587 static u_int16_t
588 unicode_combine(u_int16_t base, u_int16_t combining)
589 {
590 u_int32_t value;
591
592 /* Check HANGUL */
593 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
594 /* 2 char Hangul sequences */
595 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
596 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
597 return (HANGUL_SBASE +
598 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
599 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
600 }
601
602 /* 3 char Hangul sequences */
603 if ((combining > HANGUL_TBASE) &&
604 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
605 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
606 return (0);
607 else
608 return (base + (combining - HANGUL_TBASE));
609 }
610 }
611
612 value = getmappedvalue32(
613 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
614 __CFUniCharPrecompositionTableLength, combining);
615
616 if (value) {
617 value = getmappedvalue16(
618 (const unicode_mappings16 *)
619 ((u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
620 (value >> 16), base);
621 }
622 return (value);
623 }
624