]> git.saurik.com Git - apple/xnu.git/blame - bsd/vfs/vfs_utfconv.c
xnu-1228.3.13.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
CommitLineData
/*
 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
9bccf70c
A
28
/*
 * Includes Unicode 3.2 decomposition code derived from Core Foundation
 */
32
1c79356b
A
33#include <sys/param.h>
34#include <sys/utfconv.h>
35#include <sys/errno.h>
2d21ac55 36#include <sys/malloc.h>
0c530ab8 37#include <libkern/OSByteOrder.h>
1c79356b 38
/*
 * UTF-8 (Unicode Transformation Format)
 *
 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
 * character as a sequence of one to four bytes. Only the shortest form
 * required to represent the significant Unicode bits is legal.
 *
 * UTF-8 Multibyte Codes
 *
 * Bytes   Bits   Unicode Min   Unicode Max   UTF-8 Byte Sequence (binary)
 * -----------------------------------------------------------------------------
 *   1       7       0x0000        0x007F      0xxxxxxx
 *   2      11       0x0080        0x07FF      110xxxxx 10xxxxxx
 *   3      16       0x0800        0xFFFF      1110xxxx 10xxxxxx 10xxxxxx
 *   4      21      0x10000      0x10FFFF      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * -----------------------------------------------------------------------------
 */
56
57
/*
 * Number of UTF-8 bytes needed for one 16-bit Unicode code unit.
 *
 * A surrogate code unit (0xD800-0xDFFF) is counted as 2 bytes, so the two
 * halves of a surrogate pair add up to the 4 bytes of its UTF-8 encoding.
 */
#define UNICODE_TO_UTF8_LEN(c)  \
	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))

/* Unicode character substituted for an embedded NUL when encoding. */
#define UCS_ALT_NULL	0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT	10
#define SP_HALF_BASE	0x0010000UL
#define SP_HALF_MASK	0x3FFUL

#define SP_HIGH_FIRST	0xD800UL
#define SP_HIGH_LAST	0xDBFFUL
#define SP_LOW_FIRST	0xDC00UL
#define SP_LOW_LAST	0xDFFFUL
72
1c79356b 73
9bccf70c 74#include "vfs_utfconvdata.h"
765c9de3 75
1c79356b 76
9bccf70c
A
77/*
78 * Test for a combining character.
79 *
80 * Similar to __CFUniCharIsNonBaseCharacter except that
81 * unicode_combinable also includes Hangul Jamo characters.
82 */
83static inline int
84unicode_combinable(u_int16_t character)
85{
86 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
87 u_int8_t value;
88
89 if (character < 0x0300)
90 return (0);
91
92 value = bitmap[(character >> 8) & 0xFF];
93
94 if (value == 0xFF) {
95 return (1);
96 } else if (value) {
97 bitmap = bitmap + ((value - 1) * 32) + 256;
98 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
99 }
100 return (0);
101}
102
103/*
104 * Test for a precomposed character.
105 *
106 * Similar to __CFUniCharIsDecomposableCharacter.
107 */
108static inline int
109unicode_decomposeable(u_int16_t character) {
110 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
111 u_int8_t value;
112
113 if (character < 0x00C0)
114 return (0);
115
116 value = bitmap[(character >> 8) & 0xFF];
117
118 if (value == 0xFF) {
119 return (1);
120 } else if (value) {
121 bitmap = bitmap + ((value - 1) * 32) + 256;
122 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
123 }
124 return (0);
125}
126
13fec989
A
127
128/*
129 * Get the combing class.
130 *
131 * Similar to CFUniCharGetCombiningPropertyForCharacter.
132 */
133static inline u_int8_t
134get_combining_class(u_int16_t character) {
135 const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
136
137 u_int8_t value = bitmap[(character >> 8)];
138
139 if (value) {
140 bitmap = bitmap + (value * 256);
141 return bitmap[character % 256];
142 }
143 return (0);
144}
145
146
/* Forward declarations for the file-local helpers defined further down. */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);

static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);

static void priortysort(u_int16_t* characters, int count);

static u_int16_t  ucs_to_sfm(u_int16_t ucs_ch, int lastchar);

static u_int16_t  sfm_to_ucs(u_int16_t ucs_ch);
156
157
/*
 * Number of continuation bytes that follow a UTF-8 lead byte, indexed by
 * the top five bits of that byte (byte >> 3).  An entry of -1 marks a byte
 * that cannot start a sequence.
 *
 * NOTE(review): the -1 entries are stored in plain char; readers that test
 * for a negative value rely on char being signed on the target - confirm.
 */
char utf_extrabytes[32] = {
	 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	-1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
};

/* Upper-case hexadecimal digits, indexed by nibble value. */
const char hexdigits[16] = {
	 '0',  '1',  '2',  '3',  '4',  '5',  '6', '7',
	 '8',  '9',  'A',  'B',  'C',  'D',  'E', 'F'
};
765c9de3 167
1c79356b 168/*
2d21ac55 169 * utf8_encodelen - Calculate the UTF-8 encoding length
1c79356b 170 *
2d21ac55
A
171 * This function takes a Unicode input string, ucsp, of ucslen bytes
172 * and calculates the size of the UTF-8 output in bytes (not including
173 * a NULL termination byte). The string must reside in kernel memory.
1c79356b 174 *
2d21ac55
A
175 * If '/' chars are possible in the Unicode input then an alternate
176 * (replacement) char should be provided in altslash.
177 *
178 * FLAGS
179 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
180 *
181 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
182 *
183 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
184 *
185 * UTF_DECOMPOSED: generate fully decomposed output
186 *
187 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
188 *
189 * ERRORS
190 * None
1c79356b
A
191 */
192size_t
2d21ac55 193utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
1c79356b
A
194{
195 u_int16_t ucs_ch;
2d21ac55
A
196 u_int16_t * chp = NULL;
197 u_int16_t sequence[8];
198 int extra = 0;
1c79356b
A
199 int charcnt;
200 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
2d21ac55 201 int decompose = (flags & UTF_DECOMPOSED);
1c79356b 202 size_t len;
2d21ac55 203
1c79356b
A
204 charcnt = ucslen / 2;
205 len = 0;
206
207 while (charcnt-- > 0) {
2d21ac55
A
208 if (extra > 0) {
209 --extra;
210 ucs_ch = *chp++;
211 } else {
212 ucs_ch = *ucsp++;
213 if (swapbytes) {
214 ucs_ch = OSSwapInt16(ucs_ch);
215 }
216 if (ucs_ch == '/') {
217 ucs_ch = altslash ? altslash : '_';
218 } else if (ucs_ch == '\0') {
219 ucs_ch = UCS_ALT_NULL;
220 } else if (decompose && unicode_decomposeable(ucs_ch)) {
221 extra = unicode_decompose(ucs_ch, sequence) - 1;
222 charcnt += extra;
223 ucs_ch = sequence[0];
224 chp = &sequence[1];
225 }
226 }
765c9de3 227 len += UNICODE_TO_UTF8_LEN(ucs_ch);
1c79356b
A
228 }
229
230 return (len);
231}
232
233
234/*
765c9de3 235 * utf8_encodestr - Encodes a Unicode string to UTF-8
1c79356b
A
236 *
237 * NOTES:
0b4e3aa0 238 * The resulting UTF-8 string is NULL terminated.
1c79356b
A
239 *
240 * If '/' chars are allowed on disk then an alternate
241 * (replacement) char must be provided in altslash.
242 *
243 * input flags:
765c9de3 244 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
2d21ac55
A
245 *
246 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
247 *
248 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
249 *
250 * UTF_DECOMPOSED: generate fully decomposed output
251 *
1c79356b 252 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
0b4e3aa0
A
253 *
254 * result:
255 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
2d21ac55 256 *
0b4e3aa0 257 * EINVAL: Illegal char found; char was replaced by an '_'.
1c79356b 258 */
765c9de3
A
259int
260utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
261 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
1c79356b
A
262{
263 u_int8_t * bufstart;
264 u_int8_t * bufend;
265 u_int16_t ucs_ch;
9bccf70c
A
266 u_int16_t * chp = NULL;
267 u_int16_t sequence[8];
268 int extra = 0;
1c79356b
A
269 int charcnt;
270 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
0b4e3aa0
A
271 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
272 int decompose = (flags & UTF_DECOMPOSED);
2d21ac55 273 int sfmconv = (flags & UTF_SFM_CONVERSIONS);
1c79356b 274 int result = 0;
2d21ac55 275
1c79356b
A
276 bufstart = utf8p;
277 bufend = bufstart + buflen;
278 if (nullterm)
279 --bufend;
280 charcnt = ucslen / 2;
281
282 while (charcnt-- > 0) {
9bccf70c
A
283 if (extra > 0) {
284 --extra;
285 ucs_ch = *chp++;
0b4e3aa0 286 } else {
0c530ab8 287 ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
9bccf70c
A
288
289 if (decompose && unicode_decomposeable(ucs_ch)) {
290 extra = unicode_decompose(ucs_ch, sequence) - 1;
291 charcnt += extra;
292 ucs_ch = sequence[0];
293 chp = &sequence[1];
294 }
0b4e3aa0 295 }
1c79356b 296
0b4e3aa0
A
297 /* Slash and NULL are not permitted */
298 if (ucs_ch == '/') {
299 if (altslash)
300 ucs_ch = altslash;
301 else {
302 ucs_ch = '_';
303 result = EINVAL;
304 }
305 } else if (ucs_ch == '\0') {
306 ucs_ch = UCS_ALT_NULL;
307 }
1c79356b 308
0b4e3aa0 309 if (ucs_ch < 0x0080) {
1c79356b
A
310 if (utf8p >= bufend) {
311 result = ENAMETOOLONG;
312 break;
765c9de3 313 }
1c79356b
A
314 *utf8p++ = ucs_ch;
315
316 } else if (ucs_ch < 0x800) {
317 if ((utf8p + 1) >= bufend) {
318 result = ENAMETOOLONG;
319 break;
320 }
765c9de3
A
321 *utf8p++ = 0xc0 | (ucs_ch >> 6);
322 *utf8p++ = 0x80 | (0x3f & ucs_ch);
1c79356b
A
323
324 } else {
2d21ac55
A
325 /* These chars never valid Unicode. */
326 if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) {
327 result = EINVAL;
328 break;
329 }
330
765c9de3
A
331 /* Combine valid surrogate pairs */
332 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
333 && charcnt > 0) {
334 u_int16_t ch2;
335 u_int32_t pair;
336
0c530ab8 337 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
765c9de3
A
338 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
339 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
340 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
341 if ((utf8p + 3) >= bufend) {
342 result = ENAMETOOLONG;
343 break;
344 }
345 --charcnt;
346 ++ucsp;
347 *utf8p++ = 0xf0 | (pair >> 18);
348 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
349 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
350 *utf8p++ = 0x80 | (0x3f & pair);
351 continue;
352 }
2d21ac55
A
353 } else if (sfmconv) {
354 ucs_ch = sfm_to_ucs(ucs_ch);
355 if (ucs_ch < 0x0080) {
356 if (utf8p >= bufend) {
357 result = ENAMETOOLONG;
358 break;
359 }
360 *utf8p++ = ucs_ch;
361 continue;
362 }
765c9de3 363 }
1c79356b
A
364 if ((utf8p + 2) >= bufend) {
365 result = ENAMETOOLONG;
366 break;
367 }
765c9de3
A
368 *utf8p++ = 0xe0 | (ucs_ch >> 12);
369 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
370 *utf8p++ = 0x80 | (0x3f & ucs_ch);
1c79356b
A
371 }
372 }
373
374 *utf8len = utf8p - bufstart;
375 if (nullterm)
376 *utf8p++ = '\0';
377
378 return (result);
379}
380
381
382/*
765c9de3 383 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
1c79356b
A
384 *
385 * NOTES:
386 * The input UTF-8 string does not need to be null terminated
387 * if utf8len is set.
388 *
389 * If '/' chars are allowed on disk then an alternate
390 * (replacement) char must be provided in altslash.
391 *
392 * input flags:
2d21ac55
A
393 * UTF_REV_ENDIAN: Unicode byte order is opposite current runtime
394 *
395 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
396 *
397 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
398 *
399 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
400 *
401 * UTF_PRECOMPOSED: generate precomposed output (NFC)
402 *
403 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
0b4e3aa0
A
404 *
405 * result:
406 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
2d21ac55 407 *
0b4e3aa0 408 * EINVAL: Illegal UTF-8 sequence found.
1c79356b
A
409 */
410int
765c9de3
A
411utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
412 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
1c79356b
A
413{
414 u_int16_t* bufstart;
415 u_int16_t* bufend;
55e303ae
A
416 unsigned int ucs_ch;
417 unsigned int byte;
13fec989 418 int combcharcnt = 0;
1c79356b 419 int result = 0;
2d21ac55
A
420 int decompose, precompose, swapbytes, escaping;
421 int sfmconv;
422 int extrabytes;
1c79356b 423
2d21ac55 424 decompose = (flags & UTF_DECOMPOSED);
0b4e3aa0 425 precompose = (flags & UTF_PRECOMPOSED);
2d21ac55
A
426 swapbytes = (flags & UTF_REVERSE_ENDIAN);
427 escaping = (flags & UTF_ESCAPE_ILLEGAL);
428 sfmconv = (flags & UTF_SFM_CONVERSIONS);
1c79356b
A
429
430 bufstart = ucsp;
431 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
432
433 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
765c9de3
A
434 if (ucsp >= bufend)
435 goto toolong;
1c79356b
A
436
437 /* check for ascii */
438 if (byte < 0x80) {
2d21ac55 439 ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
1c79356b 440 } else {
765c9de3 441 u_int32_t ch;
765c9de3 442
2d21ac55
A
443 extrabytes = utf_extrabytes[byte >> 3];
444 if ((extrabytes < 0) || ((int)utf8len < extrabytes)) {
445 goto escape;
446 }
765c9de3
A
447 utf8len -= extrabytes;
448
449 switch (extrabytes) {
55e303ae
A
450 case 1:
451 ch = byte; ch <<= 6; /* 1st byte */
452 byte = *utf8p++; /* 2nd byte */
453 if ((byte >> 6) != 2)
2d21ac55 454 goto escape2;
55e303ae
A
455 ch += byte;
456 ch -= 0x00003080UL;
457 if (ch < 0x0080)
2d21ac55 458 goto escape2;
55e303ae 459 ucs_ch = ch;
765c9de3 460 break;
55e303ae
A
461 case 2:
462 ch = byte; ch <<= 6; /* 1st byte */
463 byte = *utf8p++; /* 2nd byte */
464 if ((byte >> 6) != 2)
2d21ac55 465 goto escape2;
55e303ae
A
466 ch += byte; ch <<= 6;
467 byte = *utf8p++; /* 3rd byte */
468 if ((byte >> 6) != 2)
2d21ac55 469 goto escape3;
55e303ae
A
470 ch += byte;
471 ch -= 0x000E2080UL;
472 if (ch < 0x0800)
2d21ac55 473 goto escape3;
55e303ae
A
474 if (ch >= 0xD800) {
475 if (ch <= 0xDFFF)
2d21ac55 476 goto escape3;
55e303ae 477 if (ch == 0xFFFE || ch == 0xFFFF)
2d21ac55 478 goto escape3;
55e303ae
A
479 }
480 ucs_ch = ch;
481 break;
482 case 3:
483 ch = byte; ch <<= 6; /* 1st byte */
484 byte = *utf8p++; /* 2nd byte */
485 if ((byte >> 6) != 2)
2d21ac55 486 goto escape2;
55e303ae
A
487 ch += byte; ch <<= 6;
488 byte = *utf8p++; /* 3rd byte */
489 if ((byte >> 6) != 2)
2d21ac55 490 goto escape3;
55e303ae
A
491 ch += byte; ch <<= 6;
492 byte = *utf8p++; /* 4th byte */
493 if ((byte >> 6) != 2)
2d21ac55 494 goto escape4;
55e303ae
A
495 ch += byte;
496 ch -= 0x03C82080UL + SP_HALF_BASE;
497 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
498 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
2d21ac55
A
499 goto escape4;
500 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
55e303ae
A
501 if (ucsp >= bufend)
502 goto toolong;
503 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
2d21ac55
A
504 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
505 --ucsp;
506 goto escape4;
507 }
508 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
765c9de3 509 continue;
1c79356b 510 default:
2d21ac55
A
511 result = EINVAL;
512 goto exit;
1c79356b 513 }
1c79356b 514 if (decompose) {
9bccf70c
A
515 if (unicode_decomposeable(ucs_ch)) {
516 u_int16_t sequence[8];
517 int count, i;
2d21ac55
A
518
519 /* Before decomposing a new unicode character, sort
520 * previous combining characters, if any, and reset
521 * the counter.
6601e61a 522 */
2d21ac55 523 if (combcharcnt > 1) {
6601e61a
A
524 priortysort(ucsp - combcharcnt, combcharcnt);
525 }
526 combcharcnt = 0;
1c79356b 527
2d21ac55 528 count = unicode_decompose(ucs_ch, sequence);
9bccf70c
A
529 for (i = 0; i < count; ++i) {
530 ucs_ch = sequence[i];
2d21ac55 531 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
765c9de3
A
532 if (ucsp >= bufend)
533 goto toolong;
0b4e3aa0 534 }
13fec989 535 combcharcnt += count - 1;
0c530ab8 536 continue;
0b4e3aa0
A
537 }
538 } else if (precompose && (ucsp != bufstart)) {
539 u_int16_t composite, base;
540
9bccf70c 541 if (unicode_combinable(ucs_ch)) {
0c530ab8 542 base = swapbytes ? OSSwapInt16(*(ucsp - 1)) : *(ucsp - 1);
9bccf70c
A
543 composite = unicode_combine(base, ucs_ch);
544 if (composite) {
545 --ucsp;
546 ucs_ch = composite;
547 }
1c79356b
A
548 }
549 }
0b4e3aa0
A
550 if (ucs_ch == UCS_ALT_NULL)
551 ucs_ch = '\0';
1c79356b 552 }
1c79356b
A
553 if (ucs_ch == altslash)
554 ucs_ch = '/';
1c79356b 555
13fec989
A
556 /*
557 * Make multiple combining character sequences canonical
558 */
559 if (unicode_combinable(ucs_ch)) {
560 ++combcharcnt; /* start tracking a run */
561 } else if (combcharcnt) {
562 if (combcharcnt > 1) {
563 priortysort(ucsp - combcharcnt, combcharcnt);
564 }
565 combcharcnt = 0; /* start over */
566 }
2d21ac55
A
567
568 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
569 continue;
570
571 /*
572 * Escape illegal UTF-8 into something legal.
573 */
574escape4:
575 utf8p -= 3;
576 goto escape;
577escape3:
578 utf8p -= 2;
579 goto escape;
580escape2:
581 utf8p -= 1;
582escape:
583 if (!escaping) {
584 result = EINVAL;
585 goto exit;
586 }
587 if (extrabytes > 0)
588 utf8len += extrabytes;
589 byte = *(utf8p - 1);
590
591 if ((ucsp + 2) >= bufend)
592 goto toolong;
593
594 ucs_ch = '%';
595 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
596 ucs_ch = hexdigits[byte >> 4];
597 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
598 ucs_ch = hexdigits[byte & 0x0F];
599 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
1c79356b 600 }
13fec989
A
601 /*
602 * Make a previous combining sequence canonical
603 */
604 if (combcharcnt > 1) {
605 priortysort(ucsp - combcharcnt, combcharcnt);
606 }
765c9de3 607exit:
1c79356b
A
608 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
609
610 return (result);
765c9de3 611
765c9de3
A
612toolong:
613 result = ENAMETOOLONG;
614 goto exit;
1c79356b
A
615}
616
617
91447636
A
618/*
619 * utf8_validatestr - Check for a valid UTF-8 string.
620 */
621int
622utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
623{
624 unsigned int byte;
625 u_int32_t ch;
626 unsigned int ucs_ch;
627 size_t extrabytes;
628
629 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
630 if (byte < 0x80)
631 continue; /* plain ascii */
632
633 extrabytes = utf_extrabytes[byte >> 3];
634
635 if (utf8len < extrabytes)
636 goto invalid;
637 utf8len -= extrabytes;
638
639 switch (extrabytes) {
640 case 1:
641 ch = byte; ch <<= 6; /* 1st byte */
642 byte = *utf8p++; /* 2nd byte */
643 if ((byte >> 6) != 2)
644 goto invalid;
645 ch += byte;
646 ch -= 0x00003080UL;
647 if (ch < 0x0080)
648 goto invalid;
649 break;
650 case 2:
651 ch = byte; ch <<= 6; /* 1st byte */
652 byte = *utf8p++; /* 2nd byte */
653 if ((byte >> 6) != 2)
654 goto invalid;
655 ch += byte; ch <<= 6;
656 byte = *utf8p++; /* 3rd byte */
657 if ((byte >> 6) != 2)
658 goto invalid;
659 ch += byte;
660 ch -= 0x000E2080UL;
661 if (ch < 0x0800)
662 goto invalid;
663 if (ch >= 0xD800) {
664 if (ch <= 0xDFFF)
665 goto invalid;
666 if (ch == 0xFFFE || ch == 0xFFFF)
667 goto invalid;
668 }
669 break;
670 case 3:
671 ch = byte; ch <<= 6; /* 1st byte */
672 byte = *utf8p++; /* 2nd byte */
673 if ((byte >> 6) != 2)
674 goto invalid;
675 ch += byte; ch <<= 6;
676 byte = *utf8p++; /* 3rd byte */
677 if ((byte >> 6) != 2)
678 goto invalid;
679 ch += byte; ch <<= 6;
680 byte = *utf8p++; /* 4th byte */
681 if ((byte >> 6) != 2)
682 goto invalid;
683 ch += byte;
684 ch -= 0x03C82080UL + SP_HALF_BASE;
685 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
686 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
687 goto invalid;
688 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
689 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
690 goto invalid;
691 break;
692 default:
693 goto invalid;
694 }
695
696 }
697 return (0);
698invalid:
699 return (EINVAL);
700}
701
2d21ac55
A
702/*
703 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
704 *
705 * This function takes an UTF-8 input string, instr, of inlen bytes
706 * and produces normalized UTF-8 output into a buffer of buflen bytes
707 * pointed to by outstr. The size of the output in bytes (not including
708 * a NULL termination byte) is returned in outlen. In-place conversions
709 * are not supported (i.e. instr != outstr).]
710
711 * FLAGS
712 * UTF_DECOMPOSED: output string will be fully decomposed (NFD)
713 *
714 * UTF_PRECOMPOSED: output string will be precomposed (NFC)
715 *
716 * UTF_NO_NULL_TERM: do not add null termination to output string
717 *
718 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
719 *
720 * ERRORS
721 * ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
722 *
723 * EINVAL: illegal UTF-8 sequence encountered or invalid flags
724 */
725int
726utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
727 size_t *outlen, size_t buflen, int flags)
728{
729 u_int16_t unicodebuf[32];
730 u_int16_t* unistr = NULL;
731 size_t unicode_bytes;
732 size_t uft8_bytes;
733 size_t inbuflen;
734 u_int8_t *outbufstart, *outbufend;
735 const u_int8_t *inbufstart;
736 unsigned int byte;
737 int decompose, precompose;
738 int result = 0;
739
740 if (flags & ~(UTF_DECOMPOSED | UTF_PRECOMPOSED | UTF_NO_NULL_TERM | UTF_ESCAPE_ILLEGAL)) {
741 return (EINVAL);
742 }
743 decompose = (flags & UTF_DECOMPOSED);
744 precompose = (flags & UTF_PRECOMPOSED);
745 if ((decompose && precompose) || (!decompose && !precompose)) {
746 return (EINVAL);
747 }
748 outbufstart = outstr;
749 outbufend = outbufstart + buflen;
750 inbufstart = instr;
751 inbuflen = inlen;
752
753 while (inlen-- > 0 && (byte = *instr++) != '\0') {
754 if (outstr >= outbufend) {
755 result = ENAMETOOLONG;
756 goto exit;
757 }
758 if (byte >= 0x80) {
759 goto nonASCII;
760 }
761 /* ASCII is already normalized. */
762 *outstr++ = byte;
763 }
764exit:
765 *outlen = outstr - outbufstart;
766 if (((flags & UTF_NO_NULL_TERM) == 0)) {
767 if (outstr < outbufend)
768 *outstr++ = '\0';
769 else
770 result = ENAMETOOLONG;
771 }
772 return (result);
773
774
775 /*
776 * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
777 * functions to perform the normalization. Since this will
778 * presumably be used to normalize filenames in the back-end
779 * (on disk or over-the-wire), it should be fast enough.
780 */
781nonASCII:
782
783 /* Make sure the input size is reasonable. */
784 if (inbuflen > MAXPATHLEN) {
785 result = ENAMETOOLONG;
786 goto exit;
787 }
788 /*
789 * Compute worst case Unicode buffer size.
790 *
791 * For pre-composed output, every UTF-8 input byte will be at
792 * most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
793 * (smallest composite char sequence) may yield 6 Unicode bytes
794 * (1 base char + 2 combining chars).
795 */
796 unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
797
798 if (unicode_bytes <= sizeof(unicodebuf))
799 unistr = &unicodebuf[0];
800 else
801 MALLOC(unistr, u_int16_t *, unicode_bytes, M_TEMP, M_WAITOK);
802
803 /* Normalize the string. */
804 result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
805 unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
806 if (result == 0) {
807 /* Put results back into UTF-8. */
808 result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
809 &uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
810 outstr = outbufstart + uft8_bytes;
811 }
812 if (unistr && unistr != &unicodebuf[0]) {
813 FREE(unistr, M_TEMP);
814 }
815 goto exit;
816}
817
91447636 818
/*
 * Unicode 3.2 decomposition code (derived from Core Foundation)
 */
1c79356b 822
/* One entry of a table mapping a key to a 32-bit value. */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * Binary-search theTable (numElem entries, sorted by ascending _key) for
 * character.
 *
 * Returns the matching entry's _value, or 0 when the table is empty or
 * holds no entry for character.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings32 *p, *q, *divider;

	/* An empty table has no first or last entry to range-check against. */
	if (numElem == 0)
		return (0);

	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key) { q = divider - 1; }
		else if (character > divider->_key) { p = divider + 1; }
		else { return (divider->_value); }
	}
	return (0);
}
1c79356b 847
/* Flag bit and length field of a 16-bit decomposition table value. */
#define RECURSIVE_DECOMPOSITION	(1 << 15)
#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)
1c79356b 850
/* One entry of a table mapping a 16-bit key to a 16-bit value. */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * Binary-search theTable (numElem entries, sorted by ascending _key) for
 * character.
 *
 * Returns the matching entry's _value, or 0 when the table is empty or
 * holds no entry for character.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings16 *p, *q, *divider;

	/* An empty table has no first or last entry to range-check against. */
	if (numElem == 0)
		return (0);

	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key)
			q = divider - 1;
		else if (character > divider->_key)
			p = divider + 1;
		else
			return (divider->_value);
	}
	return (0);
}
878
879
880static u_int32_t
881unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
882{
883 u_int16_t value;
884 u_int32_t length;
885 u_int16_t firstChar;
886 u_int16_t theChar;
887 const u_int16_t *bmpMappings;
888 u_int32_t usedLength;
889
890 value = getmappedvalue16(
891 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
892 __UniCharDecompositionTableLength, character);
893 length = EXTRACT_COUNT(value);
894 firstChar = value & 0x0FFF;
895 theChar = firstChar;
896 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
897 usedLength = 0;
898
899 if (value & RECURSIVE_DECOMPOSITION) {
900 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
901
902 --length; /* Decrement for the first char */
903 if (!usedLength)
904 return 0;
905 ++bmpMappings;
906 convertedChars += usedLength;
907 }
0b4e3aa0 908
9bccf70c 909 usedLength += length;
0b4e3aa0 910
9bccf70c
A
911 while (length--)
912 *(convertedChars++) = *(bmpMappings++);
0b4e3aa0 913
9bccf70c
A
914 return (usedLength);
915}
916
/* Hangul syllable (S), leading (L), vowel (V) and trailing (T) jamo bases. */
#define HANGUL_SBASE 0xAC00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11A7

/* Sizes of the ranges that start at the bases above. */
#define HANGUL_SCOUNT 11172
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
1c79356b
A
927
928/*
9bccf70c 929 * unicode_decompose - decompose a composed Unicode char
1c79356b
A
930 *
931 * Composed Unicode characters are forbidden on
932 * HFS Plus volumes. ucs_decompose will convert a
933 * composed character into its correct decomposed
934 * sequence.
935 *
9bccf70c 936 * Similar to CFUniCharDecomposeCharacter
1c79356b 937 */
9bccf70c
A
938static int
939unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
1c79356b 940{
9bccf70c
A
941 if ((character >= HANGUL_SBASE) &&
942 (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
943 u_int32_t length;
944
945 character -= HANGUL_SBASE;
946 length = (character % HANGUL_TCOUNT ? 3 : 2);
947
948 *(convertedChars++) =
949 character / HANGUL_NCOUNT + HANGUL_LBASE;
950 *(convertedChars++) =
951 (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
952 if (length > 2)
953 *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
954 return (length);
1c79356b 955 } else {
9bccf70c 956 return (unicode_recursive_decompose(character, convertedChars));
1c79356b 957 }
1c79356b
A
958}
959
0b4e3aa0 960/*
9bccf70c 961 * unicode_combine - generate a precomposed Unicode char
0b4e3aa0
A
962 *
963 * Precomposed Unicode characters are required for some volume
9bccf70c
A
964 * formats and network protocols. unicode_combine will combine
965 * a decomposed character sequence into a single precomposed
0b4e3aa0
A
966 * (composite) character.
967 *
9bccf70c
A
968 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
969 * also handles Hangul Jamo characters.
0b4e3aa0
A
970 */
971static u_int16_t
9bccf70c 972unicode_combine(u_int16_t base, u_int16_t combining)
0b4e3aa0 973{
9bccf70c
A
974 u_int32_t value;
975
976 /* Check HANGUL */
977 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
978 /* 2 char Hangul sequences */
979 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
980 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
981 return (HANGUL_SBASE +
982 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
983 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
0b4e3aa0 984 }
9bccf70c
A
985
986 /* 3 char Hangul sequences */
987 if ((combining > HANGUL_TBASE) &&
988 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
989 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
990 return (0);
991 else
992 return (base + (combining - HANGUL_TBASE));
0b4e3aa0 993 }
0b4e3aa0
A
994 }
995
9bccf70c
A
996 value = getmappedvalue32(
997 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
998 __CFUniCharPrecompositionTableLength, combining);
0b4e3aa0 999
9bccf70c
A
1000 if (value) {
1001 value = getmappedvalue16(
1002 (const unicode_mappings16 *)
2d21ac55 1003 ((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
9bccf70c 1004 (value >> 16), base);
0b4e3aa0 1005 }
9bccf70c 1006 return (value);
0b4e3aa0
A
1007}
1008

/*
 * priortysort - order combining chars into canonical order
 *
 * Bubble-sorts count characters in place by ascending combining class.
 * A character whose class is 0 is never moved ahead of its predecessor,
 * so marks are not reordered across it.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
priortysort(u_int16_t* characters, int count)
{
	u_int32_t p1, p2;
	u_int16_t *ch1, *ch2;
	u_int16_t *end;
	int changes = 1;

	/* Fewer than two characters are trivially ordered. */
	if (count < 2)
		return;

	end = characters + count;
	do {
		changes = 0;
		ch1 = characters;
		ch2 = characters + 1;
		p2 = get_combining_class(*ch1);
		while (ch2 < end) {
			p1 = p2;
			p2 = get_combining_class(*ch2);
			if (p1 > p2 && p2 != 0) {
				u_int32_t tmp;

				tmp = *ch1;
				*ch1 = *ch2;
				*ch2 = tmp;
				changes = 1;

				/*
				 * Keep p2 in step with the character now
				 * stored at *ch2, so that a high-class mark
				 * can keep moving right within this pass.
				 */
				p2 = p1;
			}
			++ch1;
			++ch2;
		}
	} while (changes);
}
2d21ac55
A
1045
1046
/*
 * Invalid NTFS filename characters are encoded using the
 * SFM (Services for Macintosh) private use Unicode characters.
 *
 * These should only be used for SMB, MSDOS or NTFS.
 *
 *    Illegal NTFS Char   SFM Unicode Char
 *  ----------------------------------------
 *    0x01-0x1f           0xf001-0xf01f
 *    '"'                 0xf020
 *    '*'                 0xf021
 *    '/'                 0xf022
 *    '<'                 0xf023
 *    '>'                 0xf024
 *    '?'                 0xf025
 *    '\'                 0xf026
 *    '|'                 0xf027
 *    ' '                 0xf028  (Only if last char of the name)
 *    '.'                 0xf029  (Only if last char of the name)
 *  ----------------------------------------
 *
 *  Reference: http://support.microsoft.com/kb/q117258/
 */
1070
#define MAX_SFM2MAC           0x29
#define SFMCODE_PREFIX_MASK   0xf000

/*
 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
 * SFM had no conversion for the colon. There is a conversion for the
 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
 * is a slash and a slash is a colon. So we can just replace the slash with the
 * colon in our tables and everything will just work.
 */

/* Maps the low 6 bits of an SFM code (0x00 - MAX_SFM2MAC) to its character. */
static const u_int8_t
sfm2mac[42] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 00 - 07 */
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 08 - 0F */
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,   /* 10 - 17 */
	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,   /* 18 - 1F */
	0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,   /* 20 - 27 */
	0x20, 0x2e                                        /* 28 - 29 */
};

/*
 * Indexed by (char - 0x20) for chars 0x20 - 0x7f.  An entry equal to the
 * char itself means "no conversion"; any other entry is the low byte of
 * the SFM code that replaces the char.
 */
static const u_int8_t
mac2sfm[96] = {
	0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,   /* 20 - 27 */
	0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,   /* 28 - 2f */
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,   /* 30 - 37 */
	0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,   /* 38 - 3f */
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,   /* 40 - 47 */
	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,   /* 48 - 4f */
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,   /* 50 - 57 */
	0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,   /* 58 - 5f */
	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,   /* 60 - 67 */
	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,   /* 68 - 6f */
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,   /* 70 - 77 */
	0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f    /* 78 - 7f */
};


/*
 * Encode illegal NTFS filename characters into SFM Private Unicode characters
 *
 * Assumes non-zero ASCII input.  Anything above 0x7f is outside the
 * conversion table and is returned unchanged.
 *
 * lastchar is non-zero when ucs_ch is the final character of the name.
 */
static u_int16_t
ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
{
	/* The last character of filename cannot be a space or period. */
	if (lastchar) {
		if (ucs_ch == 0x20)
			return (0xf028);
		else if (ucs_ch == 0x2e)
			return (0xf029);
	}
	/* Not ASCII: there is no table entry to consult. */
	if (ucs_ch > 0x7f)
		return (ucs_ch);

	/* 0x01 - 0x1f is simple transformation. */
	if (ucs_ch <= 0x1f) {
		return (ucs_ch | 0xf000);
	} else /* 0x20 - 0x7f */ {
		u_int16_t lsb;

		lsb = mac2sfm[ucs_ch - 0x0020];
		if (lsb != ucs_ch)
			return(0xf000 | lsb);
	}
	return (ucs_ch);
}

/*
 * Decode any SFM Private Unicode characters
 *
 * Characters in 0xf000 - 0xf029 are replaced by their sfm2mac entry;
 * everything else is returned unchanged.
 */
static u_int16_t
sfm_to_ucs(u_int16_t ucs_ch)
{
	if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
	    ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
		ucs_ch = sfm2mac[ucs_ch & 0x003f];
	}
	return (ucs_ch);
}
1148
1149