]> git.saurik.com Git - apple/xnu.git/blame - bsd/vfs/vfs_utfconv.c
xnu-4570.1.46.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
CommitLineData
1c79356b 1/*
2d21ac55 2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
5d5c5d0d 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b 27 */
9bccf70c
A
28
29 /*
30 Includes Unicode 3.2 decomposition code derived from Core Foundation
31 */
32
1c79356b
A
33#include <sys/param.h>
34#include <sys/utfconv.h>
35#include <sys/errno.h>
2d21ac55 36#include <sys/malloc.h>
0c530ab8 37#include <libkern/OSByteOrder.h>
1c79356b 38
39037602
A
39#if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST)
40#include <kern/assert.h>
41#else
42#include <assert.h>
43#endif
44
/*
 * UTF-8 (Unicode Transformation Format)
 *
 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
 * character as a sequence of one to four bytes. Only the shortest form
 * required to represent the significant Unicode bits is legal.
 *
 * UTF-8 Multibyte Codes
 *
 * Bytes   Bits   Unicode Min   Unicode Max   UTF-8 Byte Sequence (binary)
 * -----------------------------------------------------------------------------
 *   1       7     0x0000        0x007F       0xxxxxxx
 *   2      11     0x0080        0x07FF       110xxxxx 10xxxxxx
 *   3      16     0x0800        0xFFFF       1110xxxx 10xxxxxx 10xxxxxx
 *   4      21     0x10000       0x10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * -----------------------------------------------------------------------------
 */
62
63
765c9de3
A
/*
 * Number of UTF-8 bytes needed for one UTF-16 code unit.
 *
 * A surrogate half (0xD800-0xDFFF) is counted as 2 bytes so that the two
 * halves of a pair add up to the 4 bytes of the pair's UTF-8 encoding.
 */
#define UNICODE_TO_UTF8_LEN(c)  \
	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))

/* Code unit that the converters in this file use in place of NUL. */
#define UCS_ALT_NULL	0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT	10		/* payload bits carried by each half */
#define SP_HALF_BASE	0x0010000u	/* first code point encoded as a pair */
#define SP_HALF_MASK	0x3FFu		/* mask for one half's payload */

#define SP_HIGH_FIRST	0xD800u
#define SP_HIGH_LAST	0xDBFFu
#define SP_LOW_FIRST	0xDC00u
#define SP_LOW_LAST	0xDFFFu
9bccf70c 78
1c79356b 79
9bccf70c 80#include "vfs_utfconvdata.h"
765c9de3 81
1c79356b 82
9bccf70c
A
83/*
84 * Test for a combining character.
85 *
86 * Similar to __CFUniCharIsNonBaseCharacter except that
87 * unicode_combinable also includes Hangul Jamo characters.
88 */
6d2010ae 89int
9bccf70c
A
90unicode_combinable(u_int16_t character)
91{
92 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
93 u_int8_t value;
94
95 if (character < 0x0300)
96 return (0);
97
98 value = bitmap[(character >> 8) & 0xFF];
99
100 if (value == 0xFF) {
101 return (1);
102 } else if (value) {
103 bitmap = bitmap + ((value - 1) * 32) + 256;
104 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
105 }
106 return (0);
107}
108
109/*
110 * Test for a precomposed character.
111 *
112 * Similar to __CFUniCharIsDecomposableCharacter.
113 */
6d2010ae 114int
9bccf70c
A
115unicode_decomposeable(u_int16_t character) {
116 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
117 u_int8_t value;
118
119 if (character < 0x00C0)
120 return (0);
121
122 value = bitmap[(character >> 8) & 0xFF];
123
124 if (value == 0xFF) {
125 return (1);
126 } else if (value) {
127 bitmap = bitmap + ((value - 1) * 32) + 256;
128 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
129 }
130 return (0);
131}
132
13fec989
A
133
134/*
135 * Get the combing class.
136 *
137 * Similar to CFUniCharGetCombiningPropertyForCharacter.
138 */
139static inline u_int8_t
140get_combining_class(u_int16_t character) {
141 const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
142
143 u_int8_t value = bitmap[(character >> 8)];
144
145 if (value) {
146 bitmap = bitmap + (value * 256);
147 return bitmap[character % 256];
148 }
149 return (0);
150}
151
152
9bccf70c
A
/* Forward declarations for the file-local helpers defined further down. */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);

static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);

static void prioritysort(u_int16_t* characters, int count);

static u_int16_t ucs_to_sfm(u_int16_t ucs_ch, int lastchar);

static u_int16_t sfm_to_ucs(u_int16_t ucs_ch);
162
163
765c9de3
A
/*
 * Number of trailing bytes that follow a UTF-8 lead byte, indexed by the
 * top five bits of that byte (byte >> 3).
 *
 *   0  single-byte (ASCII) lead
 *  -1  not a valid lead byte (continuation byte or 0xF8-0xFF)
 *  1-3 two-, three- and four-byte sequences
 *
 * NOTE(review): the element type is plain char, so the -1 entries only
 * read back as negative where char is signed -- confirm for all targets.
 */
char utf_extrabytes[32] = {
	 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	-1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
};

/* Upper-case hexadecimal digits, used when percent-escaping bytes. */
const char hexdigits[16] = {
	 '0', '1', '2', '3', '4', '5', '6', '7',
	 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
765c9de3 173
1c79356b 174/*
2d21ac55 175 * utf8_encodelen - Calculate the UTF-8 encoding length
1c79356b 176 *
2d21ac55
A
177 * This function takes a Unicode input string, ucsp, of ucslen bytes
178 * and calculates the size of the UTF-8 output in bytes (not including
179 * a NULL termination byte). The string must reside in kernel memory.
1c79356b 180 *
2d21ac55
A
181 * If '/' chars are possible in the Unicode input then an alternate
182 * (replacement) char should be provided in altslash.
183 *
184 * FLAGS
185 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
186 *
187 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
188 *
189 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
190 *
191 * UTF_DECOMPOSED: generate fully decomposed output
192 *
193 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
194 *
195 * ERRORS
196 * None
1c79356b
A
197 */
198size_t
2d21ac55 199utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
1c79356b
A
200{
201 u_int16_t ucs_ch;
2d21ac55
A
202 u_int16_t * chp = NULL;
203 u_int16_t sequence[8];
204 int extra = 0;
3e170ce0 205 size_t charcnt;
1c79356b 206 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
2d21ac55 207 int decompose = (flags & UTF_DECOMPOSED);
1c79356b 208 size_t len;
2d21ac55 209
1c79356b
A
210 charcnt = ucslen / 2;
211 len = 0;
212
213 while (charcnt-- > 0) {
2d21ac55
A
214 if (extra > 0) {
215 --extra;
216 ucs_ch = *chp++;
217 } else {
218 ucs_ch = *ucsp++;
219 if (swapbytes) {
220 ucs_ch = OSSwapInt16(ucs_ch);
221 }
222 if (ucs_ch == '/') {
223 ucs_ch = altslash ? altslash : '_';
224 } else if (ucs_ch == '\0') {
225 ucs_ch = UCS_ALT_NULL;
226 } else if (decompose && unicode_decomposeable(ucs_ch)) {
227 extra = unicode_decompose(ucs_ch, sequence) - 1;
228 charcnt += extra;
229 ucs_ch = sequence[0];
230 chp = &sequence[1];
231 }
232 }
765c9de3 233 len += UNICODE_TO_UTF8_LEN(ucs_ch);
1c79356b
A
234 }
235
236 return (len);
237}
238
239
240/*
765c9de3 241 * utf8_encodestr - Encodes a Unicode string to UTF-8
1c79356b
A
242 *
243 * NOTES:
0b4e3aa0 244 * The resulting UTF-8 string is NULL terminated.
1c79356b
A
245 *
246 * If '/' chars are allowed on disk then an alternate
247 * (replacement) char must be provided in altslash.
248 *
249 * input flags:
765c9de3 250 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
2d21ac55
A
251 *
252 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
253 *
254 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
255 *
256 * UTF_DECOMPOSED: generate fully decomposed output
257 *
1c79356b 258 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
0b4e3aa0
A
259 *
260 * result:
261 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
2d21ac55 262 *
0b4e3aa0 263 * EINVAL: Illegal char found; char was replaced by an '_'.
1c79356b 264 */
765c9de3
A
265int
266utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
267 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
1c79356b
A
268{
269 u_int8_t * bufstart;
270 u_int8_t * bufend;
271 u_int16_t ucs_ch;
9bccf70c
A
272 u_int16_t * chp = NULL;
273 u_int16_t sequence[8];
274 int extra = 0;
3e170ce0 275 size_t charcnt;
1c79356b 276 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
0b4e3aa0
A
277 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
278 int decompose = (flags & UTF_DECOMPOSED);
2d21ac55 279 int sfmconv = (flags & UTF_SFM_CONVERSIONS);
1c79356b 280 int result = 0;
2d21ac55 281
1c79356b
A
282 bufstart = utf8p;
283 bufend = bufstart + buflen;
284 if (nullterm)
285 --bufend;
286 charcnt = ucslen / 2;
287
288 while (charcnt-- > 0) {
9bccf70c
A
289 if (extra > 0) {
290 --extra;
291 ucs_ch = *chp++;
0b4e3aa0 292 } else {
0c530ab8 293 ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
9bccf70c
A
294
295 if (decompose && unicode_decomposeable(ucs_ch)) {
296 extra = unicode_decompose(ucs_ch, sequence) - 1;
297 charcnt += extra;
298 ucs_ch = sequence[0];
299 chp = &sequence[1];
300 }
0b4e3aa0 301 }
1c79356b 302
0b4e3aa0
A
303 /* Slash and NULL are not permitted */
304 if (ucs_ch == '/') {
305 if (altslash)
306 ucs_ch = altslash;
307 else {
308 ucs_ch = '_';
309 result = EINVAL;
310 }
311 } else if (ucs_ch == '\0') {
312 ucs_ch = UCS_ALT_NULL;
313 }
1c79356b 314
0b4e3aa0 315 if (ucs_ch < 0x0080) {
1c79356b
A
316 if (utf8p >= bufend) {
317 result = ENAMETOOLONG;
318 break;
765c9de3 319 }
1c79356b
A
320 *utf8p++ = ucs_ch;
321
322 } else if (ucs_ch < 0x800) {
323 if ((utf8p + 1) >= bufend) {
324 result = ENAMETOOLONG;
325 break;
326 }
765c9de3
A
327 *utf8p++ = 0xc0 | (ucs_ch >> 6);
328 *utf8p++ = 0x80 | (0x3f & ucs_ch);
1c79356b
A
329
330 } else {
2d21ac55
A
331 /* These chars never valid Unicode. */
332 if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) {
333 result = EINVAL;
334 break;
335 }
336
765c9de3
A
337 /* Combine valid surrogate pairs */
338 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
339 && charcnt > 0) {
340 u_int16_t ch2;
341 u_int32_t pair;
342
0c530ab8 343 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
765c9de3
A
344 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
345 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
346 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
347 if ((utf8p + 3) >= bufend) {
348 result = ENAMETOOLONG;
349 break;
350 }
351 --charcnt;
352 ++ucsp;
353 *utf8p++ = 0xf0 | (pair >> 18);
354 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
355 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
356 *utf8p++ = 0x80 | (0x3f & pair);
357 continue;
358 }
2d21ac55
A
359 } else if (sfmconv) {
360 ucs_ch = sfm_to_ucs(ucs_ch);
361 if (ucs_ch < 0x0080) {
362 if (utf8p >= bufend) {
363 result = ENAMETOOLONG;
364 break;
365 }
366 *utf8p++ = ucs_ch;
367 continue;
368 }
765c9de3 369 }
1c79356b
A
370 if ((utf8p + 2) >= bufend) {
371 result = ENAMETOOLONG;
372 break;
373 }
765c9de3
A
374 *utf8p++ = 0xe0 | (ucs_ch >> 12);
375 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
376 *utf8p++ = 0x80 | (0x3f & ucs_ch);
1c79356b
A
377 }
378 }
379
380 *utf8len = utf8p - bufstart;
381 if (nullterm)
382 *utf8p++ = '\0';
383
384 return (result);
385}
386
3e170ce0
A
/*
 * Appends ucs_ch at *ucsp and advances *ucsp, taking account of
 * combining character sequences.
 *
 * *combcharcnt counts the combining characters stored consecutively so
 * far.  When a non-combining character arrives after such a run, a run of
 * two or more is first put into canonical order with prioritysort() and
 * the counter is reset.
 *
 * This function does not check for buffer space itself.
 */
static void push(uint16_t ucs_ch, int *combcharcnt, uint16_t **ucsp)
{
	/*
	 * Make multiple combining character sequences canonical
	 */
	if (unicode_combinable(ucs_ch)) {
		++*combcharcnt;		/* start tracking a run */
	} else if (*combcharcnt) {
		if (*combcharcnt > 1) {
			prioritysort(*ucsp - *combcharcnt, *combcharcnt);
		}
		*combcharcnt = 0;	/* start over */
	}

	*(*ucsp)++ = ucs_ch;
}
1c79356b
A
404
405/*
765c9de3 406 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
1c79356b
A
407 *
408 * NOTES:
409 * The input UTF-8 string does not need to be null terminated
410 * if utf8len is set.
411 *
412 * If '/' chars are allowed on disk then an alternate
413 * (replacement) char must be provided in altslash.
414 *
415 * input flags:
2d21ac55
A
416 * UTF_REV_ENDIAN: Unicode byte order is opposite current runtime
417 *
418 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
419 *
420 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
421 *
422 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
423 *
424 * UTF_PRECOMPOSED: generate precomposed output (NFC)
425 *
426 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
0b4e3aa0
A
427 *
428 * result:
429 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
2d21ac55 430 *
0b4e3aa0 431 * EINVAL: Illegal UTF-8 sequence found.
1c79356b
A
432 */
433int
765c9de3
A
434utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
435 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
1c79356b
A
436{
437 u_int16_t* bufstart;
438 u_int16_t* bufend;
55e303ae
A
439 unsigned int ucs_ch;
440 unsigned int byte;
13fec989 441 int combcharcnt = 0;
1c79356b 442 int result = 0;
3e170ce0 443 int decompose, precompose, escaping;
2d21ac55
A
444 int sfmconv;
445 int extrabytes;
1c79356b 446
2d21ac55 447 decompose = (flags & UTF_DECOMPOSED);
0b4e3aa0 448 precompose = (flags & UTF_PRECOMPOSED);
2d21ac55
A
449 escaping = (flags & UTF_ESCAPE_ILLEGAL);
450 sfmconv = (flags & UTF_SFM_CONVERSIONS);
1c79356b
A
451
452 bufstart = ucsp;
453 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
454
455 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
765c9de3
A
456 if (ucsp >= bufend)
457 goto toolong;
1c79356b
A
458
459 /* check for ascii */
460 if (byte < 0x80) {
2d21ac55 461 ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
1c79356b 462 } else {
765c9de3 463 u_int32_t ch;
765c9de3 464
2d21ac55
A
465 extrabytes = utf_extrabytes[byte >> 3];
466 if ((extrabytes < 0) || ((int)utf8len < extrabytes)) {
467 goto escape;
468 }
765c9de3
A
469 utf8len -= extrabytes;
470
471 switch (extrabytes) {
55e303ae
A
472 case 1:
473 ch = byte; ch <<= 6; /* 1st byte */
474 byte = *utf8p++; /* 2nd byte */
475 if ((byte >> 6) != 2)
2d21ac55 476 goto escape2;
55e303ae
A
477 ch += byte;
478 ch -= 0x00003080UL;
479 if (ch < 0x0080)
2d21ac55 480 goto escape2;
55e303ae 481 ucs_ch = ch;
765c9de3 482 break;
55e303ae
A
483 case 2:
484 ch = byte; ch <<= 6; /* 1st byte */
485 byte = *utf8p++; /* 2nd byte */
486 if ((byte >> 6) != 2)
2d21ac55 487 goto escape2;
55e303ae
A
488 ch += byte; ch <<= 6;
489 byte = *utf8p++; /* 3rd byte */
490 if ((byte >> 6) != 2)
2d21ac55 491 goto escape3;
55e303ae
A
492 ch += byte;
493 ch -= 0x000E2080UL;
494 if (ch < 0x0800)
2d21ac55 495 goto escape3;
55e303ae
A
496 if (ch >= 0xD800) {
497 if (ch <= 0xDFFF)
2d21ac55 498 goto escape3;
55e303ae 499 if (ch == 0xFFFE || ch == 0xFFFF)
2d21ac55 500 goto escape3;
55e303ae
A
501 }
502 ucs_ch = ch;
503 break;
504 case 3:
505 ch = byte; ch <<= 6; /* 1st byte */
506 byte = *utf8p++; /* 2nd byte */
507 if ((byte >> 6) != 2)
2d21ac55 508 goto escape2;
55e303ae
A
509 ch += byte; ch <<= 6;
510 byte = *utf8p++; /* 3rd byte */
511 if ((byte >> 6) != 2)
2d21ac55 512 goto escape3;
55e303ae
A
513 ch += byte; ch <<= 6;
514 byte = *utf8p++; /* 4th byte */
515 if ((byte >> 6) != 2)
2d21ac55 516 goto escape4;
55e303ae
A
517 ch += byte;
518 ch -= 0x03C82080UL + SP_HALF_BASE;
519 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
520 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
2d21ac55 521 goto escape4;
3e170ce0 522 push(ucs_ch, &combcharcnt, &ucsp);
55e303ae
A
523 if (ucsp >= bufend)
524 goto toolong;
525 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
2d21ac55
A
526 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
527 --ucsp;
528 goto escape4;
529 }
3e170ce0 530 *ucsp++ = ucs_ch;
765c9de3 531 continue;
1c79356b 532 default:
2d21ac55
A
533 result = EINVAL;
534 goto exit;
1c79356b 535 }
1c79356b 536 if (decompose) {
9bccf70c
A
537 if (unicode_decomposeable(ucs_ch)) {
538 u_int16_t sequence[8];
539 int count, i;
2d21ac55 540
2d21ac55 541 count = unicode_decompose(ucs_ch, sequence);
3e170ce0 542
9bccf70c 543 for (i = 0; i < count; ++i) {
765c9de3
A
544 if (ucsp >= bufend)
545 goto toolong;
3e170ce0
A
546
547 push(sequence[i], &combcharcnt, &ucsp);
0b4e3aa0 548 }
3e170ce0
A
549
550 continue;
0b4e3aa0
A
551 }
552 } else if (precompose && (ucsp != bufstart)) {
553 u_int16_t composite, base;
554
9bccf70c 555 if (unicode_combinable(ucs_ch)) {
3e170ce0 556 base = ucsp[-1];
9bccf70c
A
557 composite = unicode_combine(base, ucs_ch);
558 if (composite) {
559 --ucsp;
560 ucs_ch = composite;
561 }
1c79356b
A
562 }
563 }
0b4e3aa0
A
564 if (ucs_ch == UCS_ALT_NULL)
565 ucs_ch = '\0';
1c79356b 566 }
1c79356b
A
567 if (ucs_ch == altslash)
568 ucs_ch = '/';
1c79356b 569
3e170ce0 570 push(ucs_ch, &combcharcnt, &ucsp);
2d21ac55
A
571 continue;
572
573 /*
574 * Escape illegal UTF-8 into something legal.
575 */
576escape4:
577 utf8p -= 3;
578 goto escape;
579escape3:
580 utf8p -= 2;
581 goto escape;
582escape2:
583 utf8p -= 1;
584escape:
585 if (!escaping) {
586 result = EINVAL;
587 goto exit;
588 }
589 if (extrabytes > 0)
590 utf8len += extrabytes;
591 byte = *(utf8p - 1);
592
593 if ((ucsp + 2) >= bufend)
594 goto toolong;
595
b0d623f7
A
596 /* Make a previous combining sequence canonical. */
597 if (combcharcnt > 1) {
3e170ce0 598 prioritysort(ucsp - combcharcnt, combcharcnt);
b0d623f7
A
599 }
600 combcharcnt = 0;
601
2d21ac55 602 ucs_ch = '%';
3e170ce0 603 *ucsp++ = ucs_ch;
2d21ac55 604 ucs_ch = hexdigits[byte >> 4];
3e170ce0 605 *ucsp++ = ucs_ch;
2d21ac55 606 ucs_ch = hexdigits[byte & 0x0F];
3e170ce0 607 *ucsp++ = ucs_ch;
1c79356b 608 }
13fec989
A
609 /*
610 * Make a previous combining sequence canonical
611 */
612 if (combcharcnt > 1) {
3e170ce0 613 prioritysort(ucsp - combcharcnt, combcharcnt);
13fec989 614 }
3e170ce0
A
615
616 if (flags & UTF_REVERSE_ENDIAN) {
617 uint16_t *p = bufstart;
618 while (p < ucsp) {
619 *p = OSSwapInt16(*p);
620 ++p;
621 }
622 }
623
765c9de3 624exit:
1c79356b
A
625 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
626
627 return (result);
765c9de3 628
765c9de3
A
629toolong:
630 result = ENAMETOOLONG;
631 goto exit;
1c79356b
A
632}
633
634
91447636
A
635/*
636 * utf8_validatestr - Check for a valid UTF-8 string.
637 */
638int
639utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
640{
641 unsigned int byte;
642 u_int32_t ch;
643 unsigned int ucs_ch;
644 size_t extrabytes;
645
646 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
647 if (byte < 0x80)
648 continue; /* plain ascii */
649
650 extrabytes = utf_extrabytes[byte >> 3];
651
652 if (utf8len < extrabytes)
653 goto invalid;
654 utf8len -= extrabytes;
655
656 switch (extrabytes) {
657 case 1:
658 ch = byte; ch <<= 6; /* 1st byte */
659 byte = *utf8p++; /* 2nd byte */
660 if ((byte >> 6) != 2)
661 goto invalid;
662 ch += byte;
663 ch -= 0x00003080UL;
664 if (ch < 0x0080)
665 goto invalid;
666 break;
667 case 2:
668 ch = byte; ch <<= 6; /* 1st byte */
669 byte = *utf8p++; /* 2nd byte */
670 if ((byte >> 6) != 2)
671 goto invalid;
672 ch += byte; ch <<= 6;
673 byte = *utf8p++; /* 3rd byte */
674 if ((byte >> 6) != 2)
675 goto invalid;
676 ch += byte;
677 ch -= 0x000E2080UL;
678 if (ch < 0x0800)
679 goto invalid;
680 if (ch >= 0xD800) {
681 if (ch <= 0xDFFF)
682 goto invalid;
683 if (ch == 0xFFFE || ch == 0xFFFF)
684 goto invalid;
685 }
686 break;
687 case 3:
688 ch = byte; ch <<= 6; /* 1st byte */
689 byte = *utf8p++; /* 2nd byte */
690 if ((byte >> 6) != 2)
691 goto invalid;
692 ch += byte; ch <<= 6;
693 byte = *utf8p++; /* 3rd byte */
694 if ((byte >> 6) != 2)
695 goto invalid;
696 ch += byte; ch <<= 6;
697 byte = *utf8p++; /* 4th byte */
698 if ((byte >> 6) != 2)
699 goto invalid;
700 ch += byte;
701 ch -= 0x03C82080UL + SP_HALF_BASE;
702 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
703 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
704 goto invalid;
705 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
706 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
707 goto invalid;
708 break;
709 default:
710 goto invalid;
711 }
712
713 }
714 return (0);
715invalid:
716 return (EINVAL);
717}
718
2d21ac55
A
719/*
720 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
721 *
722 * This function takes an UTF-8 input string, instr, of inlen bytes
723 * and produces normalized UTF-8 output into a buffer of buflen bytes
724 * pointed to by outstr. The size of the output in bytes (not including
725 * a NULL termination byte) is returned in outlen. In-place conversions
726 * are not supported (i.e. instr != outstr).]
727
728 * FLAGS
729 * UTF_DECOMPOSED: output string will be fully decomposed (NFD)
730 *
731 * UTF_PRECOMPOSED: output string will be precomposed (NFC)
732 *
733 * UTF_NO_NULL_TERM: do not add null termination to output string
734 *
735 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
736 *
737 * ERRORS
738 * ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
739 *
740 * EINVAL: illegal UTF-8 sequence encountered or invalid flags
741 */
742int
743utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
744 size_t *outlen, size_t buflen, int flags)
745{
746 u_int16_t unicodebuf[32];
747 u_int16_t* unistr = NULL;
748 size_t unicode_bytes;
749 size_t uft8_bytes;
750 size_t inbuflen;
751 u_int8_t *outbufstart, *outbufend;
752 const u_int8_t *inbufstart;
753 unsigned int byte;
754 int decompose, precompose;
755 int result = 0;
756
757 if (flags & ~(UTF_DECOMPOSED | UTF_PRECOMPOSED | UTF_NO_NULL_TERM | UTF_ESCAPE_ILLEGAL)) {
758 return (EINVAL);
759 }
760 decompose = (flags & UTF_DECOMPOSED);
761 precompose = (flags & UTF_PRECOMPOSED);
762 if ((decompose && precompose) || (!decompose && !precompose)) {
763 return (EINVAL);
764 }
765 outbufstart = outstr;
766 outbufend = outbufstart + buflen;
767 inbufstart = instr;
768 inbuflen = inlen;
769
770 while (inlen-- > 0 && (byte = *instr++) != '\0') {
771 if (outstr >= outbufend) {
772 result = ENAMETOOLONG;
773 goto exit;
774 }
775 if (byte >= 0x80) {
776 goto nonASCII;
777 }
778 /* ASCII is already normalized. */
779 *outstr++ = byte;
780 }
781exit:
782 *outlen = outstr - outbufstart;
783 if (((flags & UTF_NO_NULL_TERM) == 0)) {
784 if (outstr < outbufend)
785 *outstr++ = '\0';
786 else
787 result = ENAMETOOLONG;
788 }
789 return (result);
790
791
792 /*
793 * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
794 * functions to perform the normalization. Since this will
795 * presumably be used to normalize filenames in the back-end
796 * (on disk or over-the-wire), it should be fast enough.
797 */
798nonASCII:
799
800 /* Make sure the input size is reasonable. */
801 if (inbuflen > MAXPATHLEN) {
802 result = ENAMETOOLONG;
803 goto exit;
804 }
805 /*
806 * Compute worst case Unicode buffer size.
807 *
808 * For pre-composed output, every UTF-8 input byte will be at
809 * most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
810 * (smallest composite char sequence) may yield 6 Unicode bytes
811 * (1 base char + 2 combining chars).
812 */
813 unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
814
815 if (unicode_bytes <= sizeof(unicodebuf))
816 unistr = &unicodebuf[0];
817 else
3e170ce0 818 MALLOC(unistr, uint16_t *, unicode_bytes, M_TEMP, M_WAITOK);
2d21ac55
A
819
820 /* Normalize the string. */
821 result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
822 unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
823 if (result == 0) {
824 /* Put results back into UTF-8. */
825 result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
826 &uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
827 outstr = outbufstart + uft8_bytes;
828 }
829 if (unistr && unistr != &unicodebuf[0]) {
830 FREE(unistr, M_TEMP);
831 }
832 goto exit;
833}
834
91447636 835
9bccf70c
A
836 /*
837 * Unicode 3.2 decomposition code (derived from Core Foundation)
838 */
1c79356b 839
9bccf70c
A
/* One entry of a key-sorted lookup table with 32-bit values. */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * Binary-search theTable (numElem entries, sorted by ascending _key) for
 * character.
 *
 * Returns the matching entry's _value, or 0 when the character is not
 * present or the table is empty.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings32 *p, *q, *divider;

	/* An empty table has no first/last entry to range-check against. */
	if (numElem == 0)
		return (0);

	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key) { q = divider - 1; }
		else if (character > divider->_key) { p = divider + 1; }
		else { return (divider->_value); }
	}
	return (0);
}
1c79356b 864
9bccf70c
A
/*
 * Layout of a 16-bit decomposition table value, as decoded by
 * unicode_recursive_decompose(): bit 15 flags a recursive decomposition,
 * bits 12-14 hold the number of mapped characters.
 */
#define RECURSIVE_DECOMPOSITION	(1 << 15)
#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)
1c79356b 867
9bccf70c
A
/* One entry of a key-sorted lookup table with 16-bit values. */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * Binary-search theTable (numElem entries, sorted by ascending _key) for
 * character.
 *
 * Returns the matching entry's _value, or 0 when the character is not
 * present or the table is empty.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings16 *p, *q, *divider;

	/* An empty table has no first/last entry to range-check against. */
	if (numElem == 0)
		return (0);

	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key)
			q = divider - 1;
		else if (character > divider->_key)
			p = divider + 1;
		else
			return (divider->_value);
	}
	return (0);
}
895
896
897static u_int32_t
898unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
899{
900 u_int16_t value;
901 u_int32_t length;
902 u_int16_t firstChar;
903 u_int16_t theChar;
904 const u_int16_t *bmpMappings;
905 u_int32_t usedLength;
906
907 value = getmappedvalue16(
908 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
909 __UniCharDecompositionTableLength, character);
910 length = EXTRACT_COUNT(value);
911 firstChar = value & 0x0FFF;
912 theChar = firstChar;
913 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
914 usedLength = 0;
915
916 if (value & RECURSIVE_DECOMPOSITION) {
917 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
918
919 --length; /* Decrement for the first char */
920 if (!usedLength)
921 return 0;
922 ++bmpMappings;
923 convertedChars += usedLength;
924 }
0b4e3aa0 925
9bccf70c 926 usedLength += length;
0b4e3aa0 927
9bccf70c
A
928 while (length--)
929 *(convertedChars++) = *(bmpMappings++);
0b4e3aa0 930
9bccf70c
A
931 return (usedLength);
932}
933
/*
 * Constants for arithmetic Hangul syllable composition/decomposition:
 * S = precomposed syllables, L = leading consonants, V = vowels,
 * T = trailing consonants.
 */
#define HANGUL_SBASE 0xAC00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11A7

#define HANGUL_SCOUNT 11172
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
1c79356b
A
944
945/*
9bccf70c 946 * unicode_decompose - decompose a composed Unicode char
1c79356b
A
947 *
948 * Composed Unicode characters are forbidden on
949 * HFS Plus volumes. ucs_decompose will convert a
950 * composed character into its correct decomposed
951 * sequence.
952 *
9bccf70c 953 * Similar to CFUniCharDecomposeCharacter
1c79356b 954 */
9bccf70c
A
955static int
956unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
1c79356b 957{
9bccf70c
A
958 if ((character >= HANGUL_SBASE) &&
959 (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
960 u_int32_t length;
961
962 character -= HANGUL_SBASE;
963 length = (character % HANGUL_TCOUNT ? 3 : 2);
964
965 *(convertedChars++) =
966 character / HANGUL_NCOUNT + HANGUL_LBASE;
967 *(convertedChars++) =
968 (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
969 if (length > 2)
970 *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
971 return (length);
1c79356b 972 } else {
9bccf70c 973 return (unicode_recursive_decompose(character, convertedChars));
1c79356b 974 }
1c79356b
A
975}
976
0b4e3aa0 977/*
9bccf70c 978 * unicode_combine - generate a precomposed Unicode char
0b4e3aa0
A
979 *
980 * Precomposed Unicode characters are required for some volume
9bccf70c
A
981 * formats and network protocols. unicode_combine will combine
982 * a decomposed character sequence into a single precomposed
0b4e3aa0
A
983 * (composite) character.
984 *
9bccf70c
A
985 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
986 * also handles Hangul Jamo characters.
0b4e3aa0
A
987 */
988static u_int16_t
9bccf70c 989unicode_combine(u_int16_t base, u_int16_t combining)
0b4e3aa0 990{
9bccf70c
A
991 u_int32_t value;
992
993 /* Check HANGUL */
994 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
995 /* 2 char Hangul sequences */
996 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
997 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
998 return (HANGUL_SBASE +
999 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
1000 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
0b4e3aa0 1001 }
9bccf70c
A
1002
1003 /* 3 char Hangul sequences */
1004 if ((combining > HANGUL_TBASE) &&
1005 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
1006 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
1007 return (0);
1008 else
1009 return (base + (combining - HANGUL_TBASE));
0b4e3aa0 1010 }
0b4e3aa0
A
1011 }
1012
9bccf70c
A
1013 value = getmappedvalue32(
1014 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
1015 __CFUniCharPrecompositionTableLength, combining);
0b4e3aa0 1016
9bccf70c
A
1017 if (value) {
1018 value = getmappedvalue16(
1019 (const unicode_mappings16 *)
2d21ac55 1020 ((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
9bccf70c 1021 (value >> 16), base);
0b4e3aa0 1022 }
9bccf70c 1023 return (value);
0b4e3aa0
A
1024}
1025
13fec989
A
1026
/*
 * prioritysort - order combining chars into canonical order
 *
 * Bubble-sorts the count UTF-16 units at characters by ascending
 * combining class.  A unit with class 0 is never moved in front of its
 * predecessor.  Fewer than two units are left untouched.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
prioritysort(u_int16_t* characters, int count)
{
	u_int32_t p1, p2;
	u_int16_t *ch1, *ch2;
	u_int16_t *end;
	int changes = 0;

	/* Nothing to order; also avoids reading characters[0] of an empty run. */
	if (count < 2)
		return;

	end = characters + count;
	do {
		changes = 0;
		ch1 = characters;
		ch2 = characters + 1;
		p2 = get_combining_class(*ch1);
		while (ch2 < end) {
			p1 = p2;
			p2 = get_combining_class(*ch2);
			if (p1 > p2 && p2 != 0) {
				u_int32_t tmp;

				tmp = *ch1;
				*ch1 = *ch2;
				*ch2 = tmp;
				changes = 1;

				/*
				 * Make sure that p2 contains the combining class for the
				 * character now stored at *ch2.  This isn't required for
				 * correctness, but it will be more efficient if a character
				 * with a large combining class has to "bubble past" several
				 * characters with lower combining classes.
				 */
				p2 = p1;
			}
			++ch1;
			++ch2;
		}
	} while (changes);
}
2d21ac55
A
1071
1072
/*
 * Invalid NTFS filename characters are encoded using the
 * SFM (Services for Macintosh) private use Unicode characters.
 *
 * These should only be used for SMB, MSDOS or NTFS.
 *
 *    Illegal NTFS Char   SFM Unicode Char
 *  ----------------------------------------
 *    0x01-0x1f           0xf001-0xf01f
 *    '"'                 0xf020
 *    '*'                 0xf021
 *    '/'                 0xf022
 *    '<'                 0xf023
 *    '>'                 0xf024
 *    '?'                 0xf025
 *    '\'                 0xf026
 *    '|'                 0xf027
 *    ' '                 0xf028  (Only if last char of the name)
 *    '.'                 0xf029  (Only if last char of the name)
 *  ----------------------------------------
 *
 *  Reference: http://support.microsoft.com/kb/q117258/
 */

#define MAX_SFM2MAC           0x29	/* highest SFM low-byte code that is mapped */
#define SFMCODE_PREFIX_MASK   0xf000	/* high bits shared by all SFM codes */

/*
 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
 * SFM had no conversion for the colon. There is a conversion for the
 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
 * is a slash and a slash is a colon. So we can just replace the slash with the
 * colon in our tables and everything will just work.
 */

/* SFM code (low 6 bits, 0x00-0x29) -> ASCII character. */
static const u_int8_t
sfm2mac[] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,	/* 00 - 07 */
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,	/* 08 - 0F */
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,	/* 10 - 17 */
	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,	/* 18 - 1F */
	0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,	/* 20 - 27 */
	0x20, 0x2e					/* 28 - 29 */
};
#define SFM2MAC_LEN	((sizeof(sfm2mac))/sizeof(sfm2mac[0]))

/*
 * ASCII character (0x20-0x7f, offset by 0x20) -> low byte of its SFM
 * code.  Entries equal to the character itself mean "no conversion".
 */
static const u_int8_t
mac2sfm[] = {
	0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,	/* 20 - 27 */
	0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,	/* 28 - 2f */
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,	/* 30 - 37 */
	0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,	/* 38 - 3f */
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,	/* 40 - 47 */
	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,	/* 48 - 4f */
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,	/* 50 - 57 */
	0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,	/* 58 - 5f */
	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,	/* 60 - 67 */
	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,	/* 68 - 6f */
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,	/* 70 - 77 */
	0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f	/* 78 - 7f */
};
#define MAC2SFM_LEN	((sizeof(mac2sfm))/sizeof(mac2sfm[0]))


/*
 * Encode illegal NTFS filename characters into SFM Private Unicode characters
 *
 * Assumes non-zero ASCII input.
 *
 * lastchar is non-zero when ucs_ch is the final character of the name;
 * only then are space and period converted.
 *
 * Returns the SFM code for the character, or ucs_ch unchanged when no
 * conversion applies.
 */
static u_int16_t
ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
{
	/* The last character of filename cannot be a space or period. */
	if (lastchar) {
		if (ucs_ch == 0x20)
			return (0xf028);
		else if (ucs_ch == 0x2e)
			return (0xf029);
	}
	/* 0x01 - 0x1f is simple transformation. */
	if (ucs_ch <= 0x1f) {
		return (ucs_ch | 0xf000);
	} else /* 0x20 - 0x7f */ {
		u_int16_t lsb;

		assert((ucs_ch - 0x0020) < MAC2SFM_LEN);
		lsb = mac2sfm[ucs_ch - 0x0020];
		if (lsb != ucs_ch)
			return(0xf000 | lsb);
	}
	return (ucs_ch);
}

/*
 * Decode any SFM Private Unicode characters
 *
 * Codes 0xf000-0xf029 are mapped back through sfm2mac; every other value
 * is returned unchanged.
 */
static u_int16_t
sfm_to_ucs(u_int16_t ucs_ch)
{
	if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
	    ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
		assert((ucs_ch & 0x003f) < SFM2MAC_LEN);
		ucs_ch = sfm2mac[ucs_ch & 0x003f];
	}
	return (ucs_ch);
}
1178
1179