]> git.saurik.com Git - apple/xnu.git/blame_incremental - bsd/vfs/vfs_utfconv.c
xnu-344.49.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25
26 /*
27 Includes Unicode 3.2 decomposition code derived from Core Foundation
28 */
29
30#include <sys/param.h>
31#include <sys/utfconv.h>
32#include <sys/errno.h>
33#include <architecture/byte_order.h>
34
35/*
36 * UTF-8 (Unicode Transformation Format)
37 *
38 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
39 * character as a sequence of one to four bytes. Only the shortest form
40 * required to represent the significant Unicode bits is legal.
41 *
42 * UTF-8 Multibyte Codes
43 *
44 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
45 * -----------------------------------------------------------------------------
46 * 1 7 0x0000 0x007F 0xxxxxxx
47 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
48 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
49 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
50 * -----------------------------------------------------------------------------
51 */
52
53
54#define UNICODE_TO_UTF8_LEN(c) \
55 ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
56
57#define UCS_ALT_NULL 0x2400
58
59/* Surrogate Pair Constants */
60#define SP_HALF_SHIFT 10
61#define SP_HALF_BASE 0x0010000UL
62#define SP_HALF_MASK 0x3FFUL
63
64#define SP_HIGH_FIRST 0xD800UL
65#define SP_HIGH_LAST 0xDBFFUL
66#define SP_LOW_FIRST 0xDC00UL
67#define SP_LOW_LAST 0xDFFFUL
68
69
70#include "vfs_utfconvdata.h"
71
72
73/*
74 * Test for a combining character.
75 *
76 * Similar to __CFUniCharIsNonBaseCharacter except that
77 * unicode_combinable also includes Hangul Jamo characters.
78 */
79static inline int
80unicode_combinable(u_int16_t character)
81{
82 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
83 u_int8_t value;
84
85 if (character < 0x0300)
86 return (0);
87
88 value = bitmap[(character >> 8) & 0xFF];
89
90 if (value == 0xFF) {
91 return (1);
92 } else if (value) {
93 bitmap = bitmap + ((value - 1) * 32) + 256;
94 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
95 }
96 return (0);
97}
98
99/*
100 * Test for a precomposed character.
101 *
102 * Similar to __CFUniCharIsDecomposableCharacter.
103 */
104static inline int
105unicode_decomposeable(u_int16_t character) {
106 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
107 u_int8_t value;
108
109 if (character < 0x00C0)
110 return (0);
111
112 value = bitmap[(character >> 8) & 0xFF];
113
114 if (value == 0xFF) {
115 return (1);
116 } else if (value) {
117 bitmap = bitmap + ((value - 1) * 32) + 256;
118 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
119 }
120 return (0);
121}
122
123static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
124
125static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
126
127
/*
 * Number of bytes that follow a UTF-8 lead byte, indexed by the top
 * five bits of that byte (byte >> 3).  An entry of -1 marks a byte
 * that cannot start a sequence.
 */
char utf_extrabytes[32] = {
	/* 0x00-0x7F: 0xxxxxxx, single byte */
	 0,  0,  0,  0,  0,  0,  0,  0,
	 0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x80-0xBF: 10xxxxxx, continuation byte, not a lead byte */
	-1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0-0xDF: 110xxxxx, one byte follows */
	 1,  1,  1,  1,
	/* 0xE0-0xEF: 1110xxxx, two bytes follow */
	 2,  2,
	/* 0xF0-0xF7: 11110xxx, three bytes follow */
	 3,
	/* 0xF8-0xFF: not a valid lead byte */
	-1
};
132
133
134/*
135 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
136 *
137 * NOTES:
138 * If '/' chars are allowed on disk then an alternate
139 * (replacement) char must be provided in altslash.
140 *
141 * input flags:
142 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
143 */
144size_t
145utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
146 int flags)
147{
148 u_int16_t ucs_ch;
149 int charcnt;
150 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
151 size_t len;
152
153 charcnt = ucslen / 2;
154 len = 0;
155
156 while (charcnt-- > 0) {
157 ucs_ch = *ucsp++;
158
159 if (swapbytes)
160 ucs_ch = NXSwapShort(ucs_ch);
161 if (ucs_ch == '/')
162 ucs_ch = altslash ? altslash : '_';
163 else if (ucs_ch == '\0')
164 ucs_ch = UCS_ALT_NULL;
165
166 len += UNICODE_TO_UTF8_LEN(ucs_ch);
167 }
168
169 return (len);
170}
171
172
173/*
174 * utf8_encodestr - Encodes a Unicode string to UTF-8
175 *
176 * NOTES:
177 * The resulting UTF-8 string is NULL terminated.
178 *
179 * If '/' chars are allowed on disk then an alternate
180 * (replacement) char must be provided in altslash.
181 *
182 * input flags:
183 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
184 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
185 *
186 * result:
187 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
188 * EINVAL: Illegal char found; char was replaced by an '_'.
189 */
190int
191utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
192 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
193{
194 u_int8_t * bufstart;
195 u_int8_t * bufend;
196 u_int16_t ucs_ch;
197 u_int16_t * chp = NULL;
198 u_int16_t sequence[8];
199 int extra = 0;
200 int charcnt;
201 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
202 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
203 int decompose = (flags & UTF_DECOMPOSED);
204 int result = 0;
205
206 bufstart = utf8p;
207 bufend = bufstart + buflen;
208 if (nullterm)
209 --bufend;
210 charcnt = ucslen / 2;
211
212 while (charcnt-- > 0) {
213 if (extra > 0) {
214 --extra;
215 ucs_ch = *chp++;
216 } else {
217 ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
218
219 if (decompose && unicode_decomposeable(ucs_ch)) {
220 extra = unicode_decompose(ucs_ch, sequence) - 1;
221 charcnt += extra;
222 ucs_ch = sequence[0];
223 chp = &sequence[1];
224 }
225 }
226
227 /* Slash and NULL are not permitted */
228 if (ucs_ch == '/') {
229 if (altslash)
230 ucs_ch = altslash;
231 else {
232 ucs_ch = '_';
233 result = EINVAL;
234 }
235 } else if (ucs_ch == '\0') {
236 ucs_ch = UCS_ALT_NULL;
237 }
238
239 if (ucs_ch < 0x0080) {
240 if (utf8p >= bufend) {
241 result = ENAMETOOLONG;
242 break;
243 }
244 *utf8p++ = ucs_ch;
245
246 } else if (ucs_ch < 0x800) {
247 if ((utf8p + 1) >= bufend) {
248 result = ENAMETOOLONG;
249 break;
250 }
251 *utf8p++ = 0xc0 | (ucs_ch >> 6);
252 *utf8p++ = 0x80 | (0x3f & ucs_ch);
253
254 } else {
255 /* Combine valid surrogate pairs */
256 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
257 && charcnt > 0) {
258 u_int16_t ch2;
259 u_int32_t pair;
260
261 ch2 = swapbytes ? NXSwapShort(*ucsp) : *ucsp;
262 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
263 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
264 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
265 if ((utf8p + 3) >= bufend) {
266 result = ENAMETOOLONG;
267 break;
268 }
269 --charcnt;
270 ++ucsp;
271 *utf8p++ = 0xf0 | (pair >> 18);
272 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
273 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
274 *utf8p++ = 0x80 | (0x3f & pair);
275 continue;
276 }
277 }
278 if ((utf8p + 2) >= bufend) {
279 result = ENAMETOOLONG;
280 break;
281 }
282 *utf8p++ = 0xe0 | (ucs_ch >> 12);
283 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
284 *utf8p++ = 0x80 | (0x3f & ucs_ch);
285 }
286 }
287
288 *utf8len = utf8p - bufstart;
289 if (nullterm)
290 *utf8p++ = '\0';
291
292 return (result);
293}
294
295
296/*
297 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
298 *
299 * NOTES:
300 * The input UTF-8 string does not need to be null terminated
301 * if utf8len is set.
302 *
303 * If '/' chars are allowed on disk then an alternate
304 * (replacement) char must be provided in altslash.
305 *
306 * input flags:
307 * UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime
308 * UTF_DECOMPOSED: Unicode output string must be fully decompsed
309 *
310 * result:
311 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
312 * EINVAL: Illegal UTF-8 sequence found.
313 */
314int
315utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
316 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
317{
318 u_int16_t* bufstart;
319 u_int16_t* bufend;
320 u_int16_t ucs_ch;
321 u_int8_t byte;
322 int result = 0;
323 int decompose, precompose, swapbytes;
324
325 decompose = (flags & UTF_DECOMPOSED);
326 precompose = (flags & UTF_PRECOMPOSED);
327 swapbytes = (flags & UTF_REVERSE_ENDIAN);
328
329 bufstart = ucsp;
330 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
331
332 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
333 if (ucsp >= bufend)
334 goto toolong;
335
336 /* check for ascii */
337 if (byte < 0x80) {
338 ucs_ch = byte; /* 1st byte */
339 } else {
340 u_int32_t ch;
341 int extrabytes = utf_extrabytes[byte >> 3];
342
343 if (utf8len < extrabytes)
344 goto invalid;
345 utf8len -= extrabytes;
346
347 switch (extrabytes) {
348 case 1: ch = byte; /* 1st byte */
349 ch <<= 6;
350 ch += *utf8p++; /* 2nd byte */
351 ch -= 0x00003080UL;
352 if (ch < 0x0080)
353 goto invalid;
354 ucs_ch = ch;
355 break;
356
357 case 2: ch = byte; /* 1st byte */
358 ch <<= 6;
359 ch += *utf8p++; /* 2nd byte */
360 ch <<= 6;
361 ch += *utf8p++; /* 3rd byte */
362 ch -= 0x000E2080UL;
363 if (ch < 0x0800)
364 goto invalid;
365 ucs_ch = ch;
366 break;
367
368 case 3: ch = byte; /* 1st byte */
369 ch <<= 6;
370 ch += *utf8p++; /* 2nd byte */
371 ch <<= 6;
372 ch += *utf8p++; /* 3rd byte */
373 ch <<= 6;
374 ch += *utf8p++; /* 4th byte */
375 ch -= 0x03C82080UL + SP_HALF_BASE;
376 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
377 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
378 if (ucsp >= bufend)
379 goto toolong;
380 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
381 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
382 continue;
383
384 default:
385 goto invalid;
386 }
387 if (decompose) {
388 if (unicode_decomposeable(ucs_ch)) {
389 u_int16_t sequence[8];
390 int count, i;
391
392 count = unicode_decompose(ucs_ch, sequence);
393
394 for (i = 0; i < count; ++i) {
395 ucs_ch = sequence[i];
396 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
397 if (ucsp >= bufend)
398 goto toolong;
399 }
400 continue;
401 }
402 } else if (precompose && (ucsp != bufstart)) {
403 u_int16_t composite, base;
404
405 if (unicode_combinable(ucs_ch)) {
406 base = swapbytes ? NXSwapShort(*(ucsp - 1)) : *(ucsp - 1);
407 composite = unicode_combine(base, ucs_ch);
408 if (composite) {
409 --ucsp;
410 ucs_ch = composite;
411 }
412 }
413 }
414 if (ucs_ch == UCS_ALT_NULL)
415 ucs_ch = '\0';
416 }
417 if (ucs_ch == altslash)
418 ucs_ch = '/';
419
420 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
421 }
422
423exit:
424 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
425
426 return (result);
427
428invalid:
429 result = EINVAL;
430 goto exit;
431
432toolong:
433 result = ENAMETOOLONG;
434 goto exit;
435}
436
437
438 /*
439 * Unicode 3.2 decomposition code (derived from Core Foundation)
440 */
441
/* One entry of a table sorted by ascending _key. */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * getmappedvalue32 - binary search a sorted mapping table
 *
 * theTable must hold numElem entries sorted by ascending _key.
 *
 * Returns the _value stored for character, or 0 when the table is
 * empty or holds no entry for character.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	u_int32_t lo, hi, mid;

	/* An empty table has no first or last entry to compare against */
	if (numElem == 0)
		return (0);

	/* Quick reject for anything outside the range the table covers */
	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	/* Search the half-open range [lo, hi) */
	lo = 0;
	hi = numElem;
	while (lo < hi) {
		mid = lo + ((hi - lo) >> 1);
		if (character < theTable[mid]._key)
			hi = mid;
		else if (character > theTable[mid]._key)
			lo = mid + 1;
		else
			return (theTable[mid]._value);
	}
	return (0);
}
466
467#define RECURSIVE_DECOMPOSITION (1 << 15)
468#define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)
469
/* One entry of a table sorted by ascending _key. */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * getmappedvalue16 - binary search a sorted mapping table
 *
 * theTable must hold numElem entries sorted by ascending _key.
 *
 * Returns the _value stored for character, or 0 when the table is
 * empty or holds no entry for character.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	u_int32_t lo, hi, mid;

	/* An empty table has no first or last entry to compare against */
	if (numElem == 0)
		return (0);

	/* Quick reject for anything outside the range the table covers */
	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	/* Search the half-open range [lo, hi) */
	lo = 0;
	hi = numElem;
	while (lo < hi) {
		mid = lo + ((hi - lo) >> 1);
		if (character < theTable[mid]._key)
			hi = mid;
		else if (character > theTable[mid]._key)
			lo = mid + 1;
		else
			return (theTable[mid]._value);
	}
	return (0);
}
497
498
499static u_int32_t
500unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
501{
502 u_int16_t value;
503 u_int32_t length;
504 u_int16_t firstChar;
505 u_int16_t theChar;
506 const u_int16_t *bmpMappings;
507 u_int32_t usedLength;
508
509 value = getmappedvalue16(
510 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
511 __UniCharDecompositionTableLength, character);
512 length = EXTRACT_COUNT(value);
513 firstChar = value & 0x0FFF;
514 theChar = firstChar;
515 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
516 usedLength = 0;
517
518 if (value & RECURSIVE_DECOMPOSITION) {
519 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
520
521 --length; /* Decrement for the first char */
522 if (!usedLength)
523 return 0;
524 ++bmpMappings;
525 convertedChars += usedLength;
526 }
527
528 usedLength += length;
529
530 while (length--)
531 *(convertedChars++) = *(bmpMappings++);
532
533 return (usedLength);
534}
535
536#define HANGUL_SBASE 0xAC00
537#define HANGUL_LBASE 0x1100
538#define HANGUL_VBASE 0x1161
539#define HANGUL_TBASE 0x11A7
540
541#define HANGUL_SCOUNT 11172
542#define HANGUL_LCOUNT 19
543#define HANGUL_VCOUNT 21
544#define HANGUL_TCOUNT 28
545#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
546
547/*
548 * unicode_decompose - decompose a composed Unicode char
549 *
550 * Composed Unicode characters are forbidden on
551 * HFS Plus volumes. ucs_decompose will convert a
552 * composed character into its correct decomposed
553 * sequence.
554 *
555 * Similar to CFUniCharDecomposeCharacter
556 */
557static int
558unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
559{
560 if ((character >= HANGUL_SBASE) &&
561 (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
562 u_int32_t length;
563
564 character -= HANGUL_SBASE;
565 length = (character % HANGUL_TCOUNT ? 3 : 2);
566
567 *(convertedChars++) =
568 character / HANGUL_NCOUNT + HANGUL_LBASE;
569 *(convertedChars++) =
570 (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
571 if (length > 2)
572 *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
573 return (length);
574 } else {
575 return (unicode_recursive_decompose(character, convertedChars));
576 }
577}
578
579/*
580 * unicode_combine - generate a precomposed Unicode char
581 *
582 * Precomposed Unicode characters are required for some volume
583 * formats and network protocols. unicode_combine will combine
584 * a decomposed character sequence into a single precomposed
585 * (composite) character.
586 *
587 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
588 * also handles Hangul Jamo characters.
589 */
590static u_int16_t
591unicode_combine(u_int16_t base, u_int16_t combining)
592{
593 u_int32_t value;
594
595 /* Check HANGUL */
596 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
597 /* 2 char Hangul sequences */
598 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
599 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
600 return (HANGUL_SBASE +
601 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
602 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
603 }
604
605 /* 3 char Hangul sequences */
606 if ((combining > HANGUL_TBASE) &&
607 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
608 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
609 return (0);
610 else
611 return (base + (combining - HANGUL_TBASE));
612 }
613 }
614
615 value = getmappedvalue32(
616 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
617 __CFUniCharPrecompositionTableLength, combining);
618
619 if (value) {
620 value = getmappedvalue16(
621 (const unicode_mappings16 *)
622 ((u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
623 (value >> 16), base);
624 }
625 return (value);
626}
627