]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_utfconv.c
xnu-3248.30.4.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 Includes Unicode 3.2 decomposition code derived from Core Foundation
31 */
32
33 #include <sys/param.h>
34 #include <sys/utfconv.h>
35 #include <sys/errno.h>
36 #include <sys/malloc.h>
37 #include <libkern/OSByteOrder.h>
38
39 /*
40 * UTF-8 (Unicode Transformation Format)
41 *
42 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
43 * character as a sequence of one to four bytes. Only the shortest form
44 * required to represent the significant Unicode bits is legal.
45 *
46 * UTF-8 Multibyte Codes
47 *
48 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
49 * -----------------------------------------------------------------------------
50 * 1 7 0x0000 0x007F 0xxxxxxx
51 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
52 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
53 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
54 * -----------------------------------------------------------------------------
55 */
56
57
/*
 * Number of UTF-8 bytes counted for one 16-bit Unicode value: 1 below
 * 0x0080, 2 below 0x0800, otherwise 3.  Values in the surrogate range
 * (0xD800-0xDFFF) are counted as 2 each, so a high/low pair adds up to
 * the 4 bytes of its combined UTF-8 form.
 */
#define UNICODE_TO_UTF8_LEN(c) \
	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))

/* Unicode value stored in place of an embedded NUL ('\0') character. */
#define UCS_ALT_NULL 0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT 10		/* payload bits carried by each half */
#define SP_HALF_BASE 0x0010000u	/* first code point encoded by a pair */
#define SP_HALF_MASK 0x3FFu		/* mask for one half's payload bits */

#define SP_HIGH_FIRST 0xD800u	/* high (leading) surrogate range */
#define SP_HIGH_LAST 0xDBFFu
#define SP_LOW_FIRST 0xDC00u	/* low (trailing) surrogate range */
#define SP_LOW_LAST 0xDFFFu
72
73
74 #include "vfs_utfconvdata.h"
75
76
77 /*
78 * Test for a combining character.
79 *
80 * Similar to __CFUniCharIsNonBaseCharacter except that
81 * unicode_combinable also includes Hangul Jamo characters.
82 */
83 int
84 unicode_combinable(u_int16_t character)
85 {
86 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
87 u_int8_t value;
88
89 if (character < 0x0300)
90 return (0);
91
92 value = bitmap[(character >> 8) & 0xFF];
93
94 if (value == 0xFF) {
95 return (1);
96 } else if (value) {
97 bitmap = bitmap + ((value - 1) * 32) + 256;
98 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
99 }
100 return (0);
101 }
102
103 /*
104 * Test for a precomposed character.
105 *
106 * Similar to __CFUniCharIsDecomposableCharacter.
107 */
108 int
109 unicode_decomposeable(u_int16_t character) {
110 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
111 u_int8_t value;
112
113 if (character < 0x00C0)
114 return (0);
115
116 value = bitmap[(character >> 8) & 0xFF];
117
118 if (value == 0xFF) {
119 return (1);
120 } else if (value) {
121 bitmap = bitmap + ((value - 1) * 32) + 256;
122 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
123 }
124 return (0);
125 }
126
127
128 /*
129 * Get the combing class.
130 *
131 * Similar to CFUniCharGetCombiningPropertyForCharacter.
132 */
133 static inline u_int8_t
134 get_combining_class(u_int16_t character) {
135 const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
136
137 u_int8_t value = bitmap[(character >> 8)];
138
139 if (value) {
140 bitmap = bitmap + (value * 256);
141 return bitmap[character % 256];
142 }
143 return (0);
144 }
145
146
/*
 * Forward declarations for the file-local helpers that are defined
 * after their first use.
 */

/* Writes the decomposition of 'character' to convertedChars; returns the count. */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);

/* Returns the composite of base + combining, or 0 when there is none. */
static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);

/* Reorders a run of 'count' combining characters by combining class. */
static void prioritysort(u_int16_t* characters, int count);

/* Maps an ASCII character to its SFM private-use code when one applies. */
static u_int16_t ucs_to_sfm(u_int16_t ucs_ch, int lastchar);

/* Maps an SFM private-use code back to the character it stands for. */
static u_int16_t sfm_to_ucs(u_int16_t ucs_ch);
156
157
/*
 * Number of bytes that follow a UTF-8 lead byte, indexed by the lead
 * byte's top five bits (byte >> 3).  -1 marks a byte that cannot start
 * a sequence: 0x80-0xBF (indexes 16-23) and 0xF8-0xFF (index 31).
 *
 * NOTE(review): the element type is plain char, so the -1 entries
 * assume char is signed on the target -- confirm.
 */
char utf_extrabytes[32] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1
};

/*
 * Upper-case hexadecimal digits indexed by nibble value; used by
 * utf8_decodestr when it percent-escapes an illegal byte.
 */
const char hexdigits[16] = {
	'0', '1', '2', '3', '4', '5', '6', '7',
	'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
167
168 /*
169 * utf8_encodelen - Calculate the UTF-8 encoding length
170 *
171 * This function takes a Unicode input string, ucsp, of ucslen bytes
172 * and calculates the size of the UTF-8 output in bytes (not including
173 * a NULL termination byte). The string must reside in kernel memory.
174 *
175 * If '/' chars are possible in the Unicode input then an alternate
176 * (replacement) char should be provided in altslash.
177 *
178 * FLAGS
179 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
180 *
181 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
182 *
183 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
184 *
185 * UTF_DECOMPOSED: generate fully decomposed output
186 *
187 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
188 *
189 * ERRORS
190 * None
191 */
192 size_t
193 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
194 {
195 u_int16_t ucs_ch;
196 u_int16_t * chp = NULL;
197 u_int16_t sequence[8];
198 int extra = 0;
199 size_t charcnt;
200 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
201 int decompose = (flags & UTF_DECOMPOSED);
202 size_t len;
203
204 charcnt = ucslen / 2;
205 len = 0;
206
207 while (charcnt-- > 0) {
208 if (extra > 0) {
209 --extra;
210 ucs_ch = *chp++;
211 } else {
212 ucs_ch = *ucsp++;
213 if (swapbytes) {
214 ucs_ch = OSSwapInt16(ucs_ch);
215 }
216 if (ucs_ch == '/') {
217 ucs_ch = altslash ? altslash : '_';
218 } else if (ucs_ch == '\0') {
219 ucs_ch = UCS_ALT_NULL;
220 } else if (decompose && unicode_decomposeable(ucs_ch)) {
221 extra = unicode_decompose(ucs_ch, sequence) - 1;
222 charcnt += extra;
223 ucs_ch = sequence[0];
224 chp = &sequence[1];
225 }
226 }
227 len += UNICODE_TO_UTF8_LEN(ucs_ch);
228 }
229
230 return (len);
231 }
232
233
234 /*
235 * utf8_encodestr - Encodes a Unicode string to UTF-8
236 *
237 * NOTES:
238 * The resulting UTF-8 string is NULL terminated.
239 *
240 * If '/' chars are allowed on disk then an alternate
241 * (replacement) char must be provided in altslash.
242 *
243 * input flags:
244 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
245 *
246 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
247 *
248 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
249 *
250 * UTF_DECOMPOSED: generate fully decomposed output
251 *
252 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
253 *
254 * result:
255 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
256 *
257 * EINVAL: Illegal char found; char was replaced by an '_'.
258 */
259 int
260 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
261 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
262 {
263 u_int8_t * bufstart;
264 u_int8_t * bufend;
265 u_int16_t ucs_ch;
266 u_int16_t * chp = NULL;
267 u_int16_t sequence[8];
268 int extra = 0;
269 size_t charcnt;
270 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
271 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
272 int decompose = (flags & UTF_DECOMPOSED);
273 int sfmconv = (flags & UTF_SFM_CONVERSIONS);
274 int result = 0;
275
276 bufstart = utf8p;
277 bufend = bufstart + buflen;
278 if (nullterm)
279 --bufend;
280 charcnt = ucslen / 2;
281
282 while (charcnt-- > 0) {
283 if (extra > 0) {
284 --extra;
285 ucs_ch = *chp++;
286 } else {
287 ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
288
289 if (decompose && unicode_decomposeable(ucs_ch)) {
290 extra = unicode_decompose(ucs_ch, sequence) - 1;
291 charcnt += extra;
292 ucs_ch = sequence[0];
293 chp = &sequence[1];
294 }
295 }
296
297 /* Slash and NULL are not permitted */
298 if (ucs_ch == '/') {
299 if (altslash)
300 ucs_ch = altslash;
301 else {
302 ucs_ch = '_';
303 result = EINVAL;
304 }
305 } else if (ucs_ch == '\0') {
306 ucs_ch = UCS_ALT_NULL;
307 }
308
309 if (ucs_ch < 0x0080) {
310 if (utf8p >= bufend) {
311 result = ENAMETOOLONG;
312 break;
313 }
314 *utf8p++ = ucs_ch;
315
316 } else if (ucs_ch < 0x800) {
317 if ((utf8p + 1) >= bufend) {
318 result = ENAMETOOLONG;
319 break;
320 }
321 *utf8p++ = 0xc0 | (ucs_ch >> 6);
322 *utf8p++ = 0x80 | (0x3f & ucs_ch);
323
324 } else {
325 /* These chars never valid Unicode. */
326 if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) {
327 result = EINVAL;
328 break;
329 }
330
331 /* Combine valid surrogate pairs */
332 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
333 && charcnt > 0) {
334 u_int16_t ch2;
335 u_int32_t pair;
336
337 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
338 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
339 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
340 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
341 if ((utf8p + 3) >= bufend) {
342 result = ENAMETOOLONG;
343 break;
344 }
345 --charcnt;
346 ++ucsp;
347 *utf8p++ = 0xf0 | (pair >> 18);
348 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
349 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
350 *utf8p++ = 0x80 | (0x3f & pair);
351 continue;
352 }
353 } else if (sfmconv) {
354 ucs_ch = sfm_to_ucs(ucs_ch);
355 if (ucs_ch < 0x0080) {
356 if (utf8p >= bufend) {
357 result = ENAMETOOLONG;
358 break;
359 }
360 *utf8p++ = ucs_ch;
361 continue;
362 }
363 }
364 if ((utf8p + 2) >= bufend) {
365 result = ENAMETOOLONG;
366 break;
367 }
368 *utf8p++ = 0xe0 | (ucs_ch >> 12);
369 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
370 *utf8p++ = 0x80 | (0x3f & ucs_ch);
371 }
372 }
373
374 *utf8len = utf8p - bufstart;
375 if (nullterm)
376 *utf8p++ = '\0';
377
378 return (result);
379 }
380
/*
 * push - append one character to the output, keeping runs of combining
 * characters in canonical order.
 *
 * *combcharcnt counts the combining characters seen since the last
 * non-combining one.  When a non-combining character ends a run of two
 * or more, the run (which sits just before *ucsp) is sorted before the
 * new character is stored.  *ucsp is advanced past the stored character.
 */
static void push(uint16_t ucs_ch, int *combcharcnt, uint16_t **ucsp)
{
	if (unicode_combinable(ucs_ch)) {
		/* Another member of the current run. */
		*combcharcnt += 1;
	} else if (*combcharcnt != 0) {
		int run = *combcharcnt;

		if (run > 1) {
			prioritysort(*ucsp - run, run);
		}
		/* The run is finished; start counting afresh. */
		*combcharcnt = 0;
	}

	**ucsp = ucs_ch;
	*ucsp += 1;
}
398
399 /*
400 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
401 *
402 * NOTES:
403 * The input UTF-8 string does not need to be null terminated
404 * if utf8len is set.
405 *
406 * If '/' chars are allowed on disk then an alternate
407 * (replacement) char must be provided in altslash.
408 *
409 * input flags:
410 * UTF_REV_ENDIAN: Unicode byte order is opposite current runtime
411 *
412 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
413 *
414 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
415 *
416 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
417 *
418 * UTF_PRECOMPOSED: generate precomposed output (NFC)
419 *
420 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
421 *
422 * result:
423 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
424 *
425 * EINVAL: Illegal UTF-8 sequence found.
426 */
427 int
428 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
429 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
430 {
431 u_int16_t* bufstart;
432 u_int16_t* bufend;
433 unsigned int ucs_ch;
434 unsigned int byte;
435 int combcharcnt = 0;
436 int result = 0;
437 int decompose, precompose, escaping;
438 int sfmconv;
439 int extrabytes;
440
441 decompose = (flags & UTF_DECOMPOSED);
442 precompose = (flags & UTF_PRECOMPOSED);
443 escaping = (flags & UTF_ESCAPE_ILLEGAL);
444 sfmconv = (flags & UTF_SFM_CONVERSIONS);
445
446 bufstart = ucsp;
447 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
448
449 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
450 if (ucsp >= bufend)
451 goto toolong;
452
453 /* check for ascii */
454 if (byte < 0x80) {
455 ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
456 } else {
457 u_int32_t ch;
458
459 extrabytes = utf_extrabytes[byte >> 3];
460 if ((extrabytes < 0) || ((int)utf8len < extrabytes)) {
461 goto escape;
462 }
463 utf8len -= extrabytes;
464
465 switch (extrabytes) {
466 case 1:
467 ch = byte; ch <<= 6; /* 1st byte */
468 byte = *utf8p++; /* 2nd byte */
469 if ((byte >> 6) != 2)
470 goto escape2;
471 ch += byte;
472 ch -= 0x00003080UL;
473 if (ch < 0x0080)
474 goto escape2;
475 ucs_ch = ch;
476 break;
477 case 2:
478 ch = byte; ch <<= 6; /* 1st byte */
479 byte = *utf8p++; /* 2nd byte */
480 if ((byte >> 6) != 2)
481 goto escape2;
482 ch += byte; ch <<= 6;
483 byte = *utf8p++; /* 3rd byte */
484 if ((byte >> 6) != 2)
485 goto escape3;
486 ch += byte;
487 ch -= 0x000E2080UL;
488 if (ch < 0x0800)
489 goto escape3;
490 if (ch >= 0xD800) {
491 if (ch <= 0xDFFF)
492 goto escape3;
493 if (ch == 0xFFFE || ch == 0xFFFF)
494 goto escape3;
495 }
496 ucs_ch = ch;
497 break;
498 case 3:
499 ch = byte; ch <<= 6; /* 1st byte */
500 byte = *utf8p++; /* 2nd byte */
501 if ((byte >> 6) != 2)
502 goto escape2;
503 ch += byte; ch <<= 6;
504 byte = *utf8p++; /* 3rd byte */
505 if ((byte >> 6) != 2)
506 goto escape3;
507 ch += byte; ch <<= 6;
508 byte = *utf8p++; /* 4th byte */
509 if ((byte >> 6) != 2)
510 goto escape4;
511 ch += byte;
512 ch -= 0x03C82080UL + SP_HALF_BASE;
513 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
514 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
515 goto escape4;
516 push(ucs_ch, &combcharcnt, &ucsp);
517 if (ucsp >= bufend)
518 goto toolong;
519 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
520 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
521 --ucsp;
522 goto escape4;
523 }
524 *ucsp++ = ucs_ch;
525 continue;
526 default:
527 result = EINVAL;
528 goto exit;
529 }
530 if (decompose) {
531 if (unicode_decomposeable(ucs_ch)) {
532 u_int16_t sequence[8];
533 int count, i;
534
535 count = unicode_decompose(ucs_ch, sequence);
536
537 for (i = 0; i < count; ++i) {
538 if (ucsp >= bufend)
539 goto toolong;
540
541 push(sequence[i], &combcharcnt, &ucsp);
542 }
543
544 continue;
545 }
546 } else if (precompose && (ucsp != bufstart)) {
547 u_int16_t composite, base;
548
549 if (unicode_combinable(ucs_ch)) {
550 base = ucsp[-1];
551 composite = unicode_combine(base, ucs_ch);
552 if (composite) {
553 --ucsp;
554 ucs_ch = composite;
555 }
556 }
557 }
558 if (ucs_ch == UCS_ALT_NULL)
559 ucs_ch = '\0';
560 }
561 if (ucs_ch == altslash)
562 ucs_ch = '/';
563
564 push(ucs_ch, &combcharcnt, &ucsp);
565 continue;
566
567 /*
568 * Escape illegal UTF-8 into something legal.
569 */
570 escape4:
571 utf8p -= 3;
572 goto escape;
573 escape3:
574 utf8p -= 2;
575 goto escape;
576 escape2:
577 utf8p -= 1;
578 escape:
579 if (!escaping) {
580 result = EINVAL;
581 goto exit;
582 }
583 if (extrabytes > 0)
584 utf8len += extrabytes;
585 byte = *(utf8p - 1);
586
587 if ((ucsp + 2) >= bufend)
588 goto toolong;
589
590 /* Make a previous combining sequence canonical. */
591 if (combcharcnt > 1) {
592 prioritysort(ucsp - combcharcnt, combcharcnt);
593 }
594 combcharcnt = 0;
595
596 ucs_ch = '%';
597 *ucsp++ = ucs_ch;
598 ucs_ch = hexdigits[byte >> 4];
599 *ucsp++ = ucs_ch;
600 ucs_ch = hexdigits[byte & 0x0F];
601 *ucsp++ = ucs_ch;
602 }
603 /*
604 * Make a previous combining sequence canonical
605 */
606 if (combcharcnt > 1) {
607 prioritysort(ucsp - combcharcnt, combcharcnt);
608 }
609
610 if (flags & UTF_REVERSE_ENDIAN) {
611 uint16_t *p = bufstart;
612 while (p < ucsp) {
613 *p = OSSwapInt16(*p);
614 ++p;
615 }
616 }
617
618 exit:
619 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
620
621 return (result);
622
623 toolong:
624 result = ENAMETOOLONG;
625 goto exit;
626 }
627
628
629 /*
630 * utf8_validatestr - Check for a valid UTF-8 string.
631 */
632 int
633 utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
634 {
635 unsigned int byte;
636 u_int32_t ch;
637 unsigned int ucs_ch;
638 size_t extrabytes;
639
640 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
641 if (byte < 0x80)
642 continue; /* plain ascii */
643
644 extrabytes = utf_extrabytes[byte >> 3];
645
646 if (utf8len < extrabytes)
647 goto invalid;
648 utf8len -= extrabytes;
649
650 switch (extrabytes) {
651 case 1:
652 ch = byte; ch <<= 6; /* 1st byte */
653 byte = *utf8p++; /* 2nd byte */
654 if ((byte >> 6) != 2)
655 goto invalid;
656 ch += byte;
657 ch -= 0x00003080UL;
658 if (ch < 0x0080)
659 goto invalid;
660 break;
661 case 2:
662 ch = byte; ch <<= 6; /* 1st byte */
663 byte = *utf8p++; /* 2nd byte */
664 if ((byte >> 6) != 2)
665 goto invalid;
666 ch += byte; ch <<= 6;
667 byte = *utf8p++; /* 3rd byte */
668 if ((byte >> 6) != 2)
669 goto invalid;
670 ch += byte;
671 ch -= 0x000E2080UL;
672 if (ch < 0x0800)
673 goto invalid;
674 if (ch >= 0xD800) {
675 if (ch <= 0xDFFF)
676 goto invalid;
677 if (ch == 0xFFFE || ch == 0xFFFF)
678 goto invalid;
679 }
680 break;
681 case 3:
682 ch = byte; ch <<= 6; /* 1st byte */
683 byte = *utf8p++; /* 2nd byte */
684 if ((byte >> 6) != 2)
685 goto invalid;
686 ch += byte; ch <<= 6;
687 byte = *utf8p++; /* 3rd byte */
688 if ((byte >> 6) != 2)
689 goto invalid;
690 ch += byte; ch <<= 6;
691 byte = *utf8p++; /* 4th byte */
692 if ((byte >> 6) != 2)
693 goto invalid;
694 ch += byte;
695 ch -= 0x03C82080UL + SP_HALF_BASE;
696 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
697 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
698 goto invalid;
699 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
700 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
701 goto invalid;
702 break;
703 default:
704 goto invalid;
705 }
706
707 }
708 return (0);
709 invalid:
710 return (EINVAL);
711 }
712
713 /*
714 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
715 *
716 * This function takes an UTF-8 input string, instr, of inlen bytes
717 * and produces normalized UTF-8 output into a buffer of buflen bytes
718 * pointed to by outstr. The size of the output in bytes (not including
719 * a NULL termination byte) is returned in outlen. In-place conversions
720 * are not supported (i.e. instr != outstr).]
721
722 * FLAGS
723 * UTF_DECOMPOSED: output string will be fully decomposed (NFD)
724 *
725 * UTF_PRECOMPOSED: output string will be precomposed (NFC)
726 *
727 * UTF_NO_NULL_TERM: do not add null termination to output string
728 *
729 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
730 *
731 * ERRORS
732 * ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
733 *
734 * EINVAL: illegal UTF-8 sequence encountered or invalid flags
735 */
736 int
737 utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
738 size_t *outlen, size_t buflen, int flags)
739 {
740 u_int16_t unicodebuf[32];
741 u_int16_t* unistr = NULL;
742 size_t unicode_bytes;
743 size_t uft8_bytes;
744 size_t inbuflen;
745 u_int8_t *outbufstart, *outbufend;
746 const u_int8_t *inbufstart;
747 unsigned int byte;
748 int decompose, precompose;
749 int result = 0;
750
751 if (flags & ~(UTF_DECOMPOSED | UTF_PRECOMPOSED | UTF_NO_NULL_TERM | UTF_ESCAPE_ILLEGAL)) {
752 return (EINVAL);
753 }
754 decompose = (flags & UTF_DECOMPOSED);
755 precompose = (flags & UTF_PRECOMPOSED);
756 if ((decompose && precompose) || (!decompose && !precompose)) {
757 return (EINVAL);
758 }
759 outbufstart = outstr;
760 outbufend = outbufstart + buflen;
761 inbufstart = instr;
762 inbuflen = inlen;
763
764 while (inlen-- > 0 && (byte = *instr++) != '\0') {
765 if (outstr >= outbufend) {
766 result = ENAMETOOLONG;
767 goto exit;
768 }
769 if (byte >= 0x80) {
770 goto nonASCII;
771 }
772 /* ASCII is already normalized. */
773 *outstr++ = byte;
774 }
775 exit:
776 *outlen = outstr - outbufstart;
777 if (((flags & UTF_NO_NULL_TERM) == 0)) {
778 if (outstr < outbufend)
779 *outstr++ = '\0';
780 else
781 result = ENAMETOOLONG;
782 }
783 return (result);
784
785
786 /*
787 * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
788 * functions to perform the normalization. Since this will
789 * presumably be used to normalize filenames in the back-end
790 * (on disk or over-the-wire), it should be fast enough.
791 */
792 nonASCII:
793
794 /* Make sure the input size is reasonable. */
795 if (inbuflen > MAXPATHLEN) {
796 result = ENAMETOOLONG;
797 goto exit;
798 }
799 /*
800 * Compute worst case Unicode buffer size.
801 *
802 * For pre-composed output, every UTF-8 input byte will be at
803 * most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
804 * (smallest composite char sequence) may yield 6 Unicode bytes
805 * (1 base char + 2 combining chars).
806 */
807 unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
808
809 if (unicode_bytes <= sizeof(unicodebuf))
810 unistr = &unicodebuf[0];
811 else
812 MALLOC(unistr, uint16_t *, unicode_bytes, M_TEMP, M_WAITOK);
813
814 /* Normalize the string. */
815 result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
816 unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
817 if (result == 0) {
818 /* Put results back into UTF-8. */
819 result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
820 &uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
821 outstr = outbufstart + uft8_bytes;
822 }
823 if (unistr && unistr != &unicodebuf[0]) {
824 FREE(unistr, M_TEMP);
825 }
826 goto exit;
827 }
828
829
830 /*
831 * Unicode 3.2 decomposition code (derived from Core Foundation)
832 */
833
/* One entry of a sorted key -> 32-bit value lookup table. */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * getmappedvalue32 - binary search of a table sorted by ascending _key.
 *
 * Returns the _value stored for 'character', or 0 when the character is
 * outside the table's key range or has no entry.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
    u_int16_t character)
{
	const unicode_mappings32 *lo = theTable;
	const unicode_mappings32 *hi = theTable + (numElem - 1);

	/* Quick reject for anything outside [first key, last key]. */
	if ((character < lo->_key) || (character > hi->_key))
		return (0);

	while (lo <= hi) {
		const unicode_mappings32 *mid = lo + ((hi - lo) / 2);

		if (mid->_key == character)
			return (mid->_value);
		if (mid->_key < character)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return (0);
}
858
/*
 * Layout of a 16-bit decomposition table value, as read by
 * unicode_recursive_decompose: bit 15 marks an entry whose first
 * character must itself be decomposed, bits 12-14 hold the number of
 * characters in the decomposition.
 */
#define RECURSIVE_DECOMPOSITION (1 << 15)
#define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)
861
/* One entry of a sorted key -> 16-bit value lookup table. */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * getmappedvalue16 - binary search of a table sorted by ascending _key.
 *
 * Returns the _value stored for 'character', or 0 when the character is
 * outside the table's key range or has no entry.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
    u_int16_t character)
{
	const unicode_mappings16 *lo = theTable;
	const unicode_mappings16 *hi = theTable + (numElem - 1);

	/* Quick reject for anything outside [first key, last key]. */
	if ((character < lo->_key) || (character > hi->_key))
		return (0);

	while (lo <= hi) {
		const unicode_mappings16 *mid = lo + ((hi - lo) / 2);

		if (mid->_key == character) {
			return (mid->_value);
		}
		if (mid->_key < character) {
			lo = mid + 1;
		} else {
			hi = mid - 1;
		}
	}
	return (0);
}
889
890
891 static u_int32_t
892 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
893 {
894 u_int16_t value;
895 u_int32_t length;
896 u_int16_t firstChar;
897 u_int16_t theChar;
898 const u_int16_t *bmpMappings;
899 u_int32_t usedLength;
900
901 value = getmappedvalue16(
902 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
903 __UniCharDecompositionTableLength, character);
904 length = EXTRACT_COUNT(value);
905 firstChar = value & 0x0FFF;
906 theChar = firstChar;
907 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
908 usedLength = 0;
909
910 if (value & RECURSIVE_DECOMPOSITION) {
911 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
912
913 --length; /* Decrement for the first char */
914 if (!usedLength)
915 return 0;
916 ++bmpMappings;
917 convertedChars += usedLength;
918 }
919
920 usedLength += length;
921
922 while (length--)
923 *(convertedChars++) = *(bmpMappings++);
924
925 return (usedLength);
926 }
927
/*
 * Constants for the arithmetic Hangul syllable (de)composition used by
 * unicode_decompose and unicode_combine.
 */
#define HANGUL_SBASE 0xAC00	/* first precomposed syllable */
#define HANGUL_LBASE 0x1100	/* first leading consonant jamo */
#define HANGUL_VBASE 0x1161	/* first vowel jamo */
#define HANGUL_TBASE 0x11A7	/* trailing consonant base; index 0 means "none" */

#define HANGUL_SCOUNT 11172	/* number of precomposed syllables */
#define HANGUL_LCOUNT 19	/* number of leading consonants */
#define HANGUL_VCOUNT 21	/* number of vowels */
#define HANGUL_TCOUNT 28	/* number of trailing slots */
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)	/* syllables per leading consonant */
938
939 /*
940 * unicode_decompose - decompose a composed Unicode char
941 *
942 * Composed Unicode characters are forbidden on
943 * HFS Plus volumes. ucs_decompose will convert a
944 * composed character into its correct decomposed
945 * sequence.
946 *
947 * Similar to CFUniCharDecomposeCharacter
948 */
949 static int
950 unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
951 {
952 if ((character >= HANGUL_SBASE) &&
953 (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
954 u_int32_t length;
955
956 character -= HANGUL_SBASE;
957 length = (character % HANGUL_TCOUNT ? 3 : 2);
958
959 *(convertedChars++) =
960 character / HANGUL_NCOUNT + HANGUL_LBASE;
961 *(convertedChars++) =
962 (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
963 if (length > 2)
964 *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
965 return (length);
966 } else {
967 return (unicode_recursive_decompose(character, convertedChars));
968 }
969 }
970
971 /*
972 * unicode_combine - generate a precomposed Unicode char
973 *
974 * Precomposed Unicode characters are required for some volume
975 * formats and network protocols. unicode_combine will combine
976 * a decomposed character sequence into a single precomposed
977 * (composite) character.
978 *
979 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
980 * also handles Hangul Jamo characters.
981 */
982 static u_int16_t
983 unicode_combine(u_int16_t base, u_int16_t combining)
984 {
985 u_int32_t value;
986
987 /* Check HANGUL */
988 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
989 /* 2 char Hangul sequences */
990 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
991 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
992 return (HANGUL_SBASE +
993 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
994 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
995 }
996
997 /* 3 char Hangul sequences */
998 if ((combining > HANGUL_TBASE) &&
999 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
1000 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
1001 return (0);
1002 else
1003 return (base + (combining - HANGUL_TBASE));
1004 }
1005 }
1006
1007 value = getmappedvalue32(
1008 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
1009 __CFUniCharPrecompositionTableLength, combining);
1010
1011 if (value) {
1012 value = getmappedvalue16(
1013 (const unicode_mappings16 *)
1014 ((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
1015 (value >> 16), base);
1016 }
1017 return (value);
1018 }
1019
1020
/*
 * prioritysort - order combining chars into canonical order
 *
 * Repeatedly sweeps the 'count' characters, swapping neighbours whose
 * combining classes are out of order, until a sweep makes no swap.
 * A character whose class is 0 is never moved in front of its
 * predecessor.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
prioritysort(u_int16_t* characters, int count)
{
	u_int32_t prev_class, cur_class;
	int swapped;
	int i;

	do {
		swapped = 0;
		cur_class = get_combining_class(characters[0]);

		for (i = 1; i < count; i++) {
			prev_class = cur_class;
			cur_class = get_combining_class(characters[i]);

			if (cur_class != 0 && prev_class > cur_class) {
				u_int16_t held = characters[i - 1];

				characters[i - 1] = characters[i];
				characters[i] = held;
				swapped = 1;

				/*
				 * The character now at position i is the one
				 * that had prev_class.  Carrying its class
				 * forward is not needed for correctness, but
				 * it lets a high class "bubble past" several
				 * lower ones within a single sweep.
				 */
				cur_class = prev_class;
			}
		}
	} while (swapped);
}
1065
1066
1067 /*
1068 * Invalid NTFS filename characters are encoded using the
1069 * SFM (Services for Macintosh) private use Unicode characters.
1070 *
1071 * These should only be used for SMB, MSDOS or NTFS.
1072 *
1073 * Illegal NTFS Char SFM Unicode Char
1074 * ----------------------------------------
1075 * 0x01-0x1f 0xf001-0xf01f
1076 * '"' 0xf020
1077 * '*' 0xf021
1078 * '/' 0xf022
1079 * '<' 0xf023
1080 * '>' 0xf024
1081 * '?' 0xf025
1082 * '\' 0xf026
1083 * '|' 0xf027
1084 * ' ' 0xf028 (Only if last char of the name)
1085 * '.' 0xf029 (Only if last char of the name)
1086 * ----------------------------------------
1087 *
1088 * Reference: http://support.microsoft.com/kb/q117258/
1089 */
1090
/* Highest valid index into sfm2mac (SFM code 0xf029). */
#define MAX_SFM2MAC 0x29
/* Value of the upper bits (ucs_ch & 0xffC0) shared by all SFM codes. */
#define SFMCODE_PREFIX_MASK 0xf000
1093
/*
 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
 * SFM had no conversion for the colon. There is a conversion for the
 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
 * is a slash and a slash is a colon. So we can just replace the slash with the
 * colon in our tables and everything will just work.
 */

/*
 * SFM -> ASCII: indexed by the low six bits of an SFM private-use code
 * (0xf000 + index); each entry is the ASCII character it stands for.
 */
static u_int8_t
sfm2mac[42] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 00 - 07 */
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 08 - 0F */
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 10 - 17 */
	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 18 - 1F */
	0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c, /* 20 - 27 */
	0x20, 0x2e /* 28 - 29 */
};

/*
 * ASCII -> SFM: indexed by (ASCII character - 0x20).  An entry equal to
 * the character itself means "no conversion"; any other entry is the
 * low byte of the SFM code (0xf000 | entry).  Only the first 96 of the
 * 112 declared entries are initialized; the rest are zero.
 */
static u_int8_t
mac2sfm[112] = {
	0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27, /* 20 - 27 */
	0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22, /* 28 - 2f */
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 30 - 37 */
	0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25, /* 38 - 3f */
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 40 - 47 */
	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 48 - 4f */
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 50 - 57 */
	0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f, /* 58 - 5f */
	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 60 - 67 */
	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 68 - 6f */
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 70 - 77 */
	0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f /* 78 - 7f */
};
1126
1127
1128 /*
1129 * Encode illegal NTFS filename characters into SFM Private Unicode characters
1130 *
1131 * Assumes non-zero ASCII input.
1132 */
1133 static u_int16_t
1134 ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
1135 {
1136 /* The last character of filename cannot be a space or period. */
1137 if (lastchar) {
1138 if (ucs_ch == 0x20)
1139 return (0xf028);
1140 else if (ucs_ch == 0x2e)
1141 return (0xf029);
1142 }
1143 /* 0x01 - 0x1f is simple transformation. */
1144 if (ucs_ch <= 0x1f) {
1145 return (ucs_ch | 0xf000);
1146 } else /* 0x20 - 0x7f */ {
1147 u_int16_t lsb;
1148
1149 lsb = mac2sfm[ucs_ch - 0x0020];
1150 if (lsb != ucs_ch)
1151 return(0xf000 | lsb);
1152 }
1153 return (ucs_ch);
1154 }
1155
1156 /*
1157 * Decode any SFM Private Unicode characters
1158 */
1159 static u_int16_t
1160 sfm_to_ucs(u_int16_t ucs_ch)
1161 {
1162 if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
1163 ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
1164 ucs_ch = sfm2mac[ucs_ch & 0x003f];
1165 }
1166 return (ucs_ch);
1167 }
1168
1169