1 /* Copyright © 2017-2018 Apple Inc. All rights reserved.
6 * Created by Oded Shoshani on 31/1/18.
10 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
12 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
14 * This file contains Original Code and/or Modifications of Original Code
15 * as defined in and that are subject to the Apple Public Source License
16 * Version 2.0 (the 'License'). You may not use this file except in
17 * compliance with the License. The rights granted to you under the License
18 * may not be used to create, or enable the creation or redistribution of,
19 * unlawful or unlicensed copies of an Apple operating system, or to
20 * circumvent, violate, or enable the circumvention or violation of, any
21 * terms of an Apple operating system software license agreement.
23 * Please obtain a copy of the License at
24 * http://www.opensource.apple.com/apsl/ and read it before using this file.
26 * The Original Code and all software distributed under the License are
27 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
28 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
29 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
31 * Please see the License for the specific language governing rights and
32 * limitations under the License.
34 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
38 Includes Unicode 3.2 decomposition code derived from Core Foundation
41 #pragma clang diagnostic ignored "-Wsign-conversion"
42 #pragma clang diagnostic ignored "-Wconversion"
44 #include <sys/param.h>
45 #include <sys/errno.h>
46 #include <libkern/OSByteOrder.h>
49 #include "lf_hfs_sbunicode.h"
53 * UTF-8 (Unicode Transformation Format)
55 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
56 * character as a sequence of one to four bytes. Only the shortest form
57 * required to represent the significant Unicode bits is legal.
59 * UTF-8 Multibyte Codes
61 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
62 * -----------------------------------------------------------------------------
63 * 1 7 0x0000 0x007F 0xxxxxxx
64 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
65 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
66 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
67 * -----------------------------------------------------------------------------
71 #define UNICODE_TO_UTF8_LEN(c) \
72 ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
74 #define UCS_ALT_NULL 0x2400
77 /* Surrogate Pair Constants */
79 #define SP_HALF_SHIFT 10
80 #define SP_HALF_BASE 0x0010000u
81 #define SP_HALF_MASK 0x3FFu
82 #define SP_HIGH_FIRST 0xD800u
83 #define SP_HIGH_LAST 0xDBFFu
84 #define SP_LOW_FIRST 0xDC00u
85 #define SP_LOW_LAST 0xDFFFu
88 #include "lf_hfs_utfconvdata.h"
92 * Test for a combining character.
94 * Similar to __CFUniCharIsNonBaseCharacter except that
95 * unicode_combinable also includes Hangul Jamo characters.
98 unicode_combinable(u_int16_t character
)
100 const u_int8_t
*bitmap
= __CFUniCharCombiningBitmap
;
103 if (character
< 0x0300)
106 value
= bitmap
[(character
>> 8) & 0xFF];
111 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
112 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
118 * Test for a precomposed character.
120 * Similar to __CFUniCharIsDecomposableCharacter.
123 unicode_decomposeable(u_int16_t character
) {
124 const u_int8_t
*bitmap
= __CFUniCharDecomposableBitmap
;
127 if (character
< 0x00C0)
130 value
= bitmap
[(character
>> 8) & 0xFF];
135 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
136 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
143 * Get the combining class.
145 * Similar to CFUniCharGetCombiningPropertyForCharacter.
147 static inline u_int8_t
148 get_combining_class(u_int16_t character
) {
149 const u_int8_t
*bitmap
= __CFUniCharCombiningPropertyBitmap
;
151 u_int8_t value
= bitmap
[(character
>> 8)];
154 bitmap
= bitmap
+ (value
* 256);
155 return bitmap
[character
% 256];
160 static int unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
);
162 static u_int16_t
unicode_combine(u_int16_t base
, u_int16_t combining
);
164 static void priortysort(u_int16_t
* characters
, int count
);
166 static u_int16_t
ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
);
168 static u_int16_t
sfm_to_ucs(u_int16_t ucs_ch
);
170 char utf_extrabytes
[32] = {
171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
172 -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1
175 const char hexdigits
[16] = {
176 '0', '1', '2', '3', '4', '5', '6', '7',
177 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
181 * utf8_encodelen - Calculate the UTF-8 encoding length
183 * This function takes a Unicode input string, ucsp, of ucslen bytes
184 * and calculates the size of the UTF-8 output in bytes (not including
185 * a NULL termination byte). The string must reside in kernel memory.
187 * If '/' chars are possible in the Unicode input then an alternate
188 * (replacement) char should be provided in altslash.
191 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
193 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
195 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
197 * UTF_DECOMPOSED: generate fully decomposed output
199 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
205 utf8_encodelen(const u_int16_t
* ucsp
, size_t ucslen
, u_int16_t altslash
, int flags
)
208 u_int16_t
* chp
= NULL
;
209 u_int16_t sequence
[8];
212 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
213 int decompose
= (flags
& UTF_DECOMPOSED
);
216 charcnt
= ucslen
/ 2;
219 while (charcnt
-- > 0) {
226 ucs_ch
= OSSwapInt16(ucs_ch
);
229 ucs_ch
= altslash
? altslash
: '_';
230 } else if (ucs_ch
== '\0') {
231 ucs_ch
= UCS_ALT_NULL
;
232 } else if (decompose
&& unicode_decomposeable(ucs_ch
)) {
233 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
235 ucs_ch
= sequence
[0];
239 len
+= UNICODE_TO_UTF8_LEN(ucs_ch
);
247 * utf8_encodestr - Encodes a Unicode string to UTF-8
250 * The resulting UTF-8 string is NULL terminated.
252 * If '/' chars are allowed on disk then an alternate
253 * (replacement) char must be provided in altslash.
256 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
258 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
260 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
262 * UTF_DECOMPOSED: generate fully decomposed output
264 * UTF_ADD_NULL_TERM: add NULL termination to UTF-8 output
267 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
269 * EINVAL: Illegal char found; char was replaced by an '_'.
272 utf8_encodestr(const u_int16_t
* ucsp
, size_t ucslen
, u_int8_t
* utf8p
,
273 size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
)
278 u_int16_t
* chp
= NULL
;
279 u_int16_t sequence
[8];
282 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
283 int nullterm
= (flags
& UTF_ADD_NULL_TERM
);
284 int decompose
= (flags
& UTF_DECOMPOSED
);
285 int sfmconv
= (flags
& UTF_SFM_CONVERSIONS
);
289 bufend
= bufstart
+ buflen
;
292 charcnt
= ucslen
/ 2;
294 while (charcnt
-- > 0) {
299 ucs_ch
= swapbytes
? OSSwapInt16(*ucsp
++) : *ucsp
++;
301 if (decompose
&& unicode_decomposeable(ucs_ch
)) {
302 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
304 ucs_ch
= sequence
[0];
309 /* Slash and NULL are not permitted */
317 } else if (ucs_ch
== '\0') {
318 ucs_ch
= UCS_ALT_NULL
;
321 if (ucs_ch
< 0x0080) {
322 if (utf8p
>= bufend
) {
323 result
= ENAMETOOLONG
;
328 } else if (ucs_ch
< 0x800) {
329 if ((utf8p
+ 1) >= bufend
) {
330 result
= ENAMETOOLONG
;
333 *utf8p
++ = 0xc0 | (ucs_ch
>> 6);
334 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
337 /* These chars are never valid Unicode. */
338 if (ucs_ch
== 0xFFFE || ucs_ch
== 0xFFFF) {
343 /* Combine valid surrogate pairs */
344 if (ucs_ch
>= SP_HIGH_FIRST
&& ucs_ch
<= SP_HIGH_LAST
349 ch2
= swapbytes
? OSSwapInt16(*ucsp
) : *ucsp
;
350 if (ch2
>= SP_LOW_FIRST
&& ch2
<= SP_LOW_LAST
) {
351 pair
= (u_int32_t
)((ucs_ch
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
)
352 + (ch2
- SP_LOW_FIRST
) + SP_HALF_BASE
;
353 if ((utf8p
+ 3) >= bufend
) {
354 result
= ENAMETOOLONG
;
359 *utf8p
++ = 0xf0 | (pair
>> 18);
360 *utf8p
++ = 0x80 | (0x3f & (pair
>> 12));
361 *utf8p
++ = 0x80 | (0x3f & (pair
>> 6));
362 *utf8p
++ = 0x80 | (0x3f & pair
);
365 } else if (sfmconv
) {
366 ucs_ch
= sfm_to_ucs(ucs_ch
);
367 if (ucs_ch
< 0x0080) {
368 if (utf8p
>= bufend
) {
369 result
= ENAMETOOLONG
;
376 if ((utf8p
+ 2) >= bufend
) {
377 result
= ENAMETOOLONG
;
380 *utf8p
++ = 0xe0 | (ucs_ch
>> 12);
381 *utf8p
++ = 0x80 | (0x3f & (ucs_ch
>> 6));
382 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
386 *utf8len
= utf8p
- bufstart
;
395 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
398 * The input UTF-8 string does not need to be null terminated
401 * If '/' chars are allowed on disk then an alternate
402 * (replacement) char must be provided in altslash.
405 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
407 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
409 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
411 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
413 * UTF_PRECOMPOSED: generate precomposed output (NFC)
415 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
418 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
420 * EINVAL: Illegal UTF-8 sequence found.
423 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
,
424 size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
)
432 int decompose
, precompose
, swapbytes
, escaping
;
436 decompose
= (flags
& UTF_DECOMPOSED
);
437 precompose
= (flags
& UTF_PRECOMPOSED
);
438 swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
439 escaping
= (flags
& UTF_ESCAPE_ILLEGAL
);
440 sfmconv
= (flags
& UTF_SFM_CONVERSIONS
);
443 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
445 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
449 /* check for ascii */
451 ucs_ch
= sfmconv
? ucs_to_sfm(byte
, utf8len
== 0) : byte
;
455 extrabytes
= utf_extrabytes
[byte
>> 3];
456 if ((extrabytes
< 0) || ((int)utf8len
< extrabytes
)) {
459 utf8len
-= extrabytes
;
461 switch (extrabytes
) {
463 ch
= byte
; ch
<<= 6; /* 1st byte */
464 byte
= *utf8p
++; /* 2nd byte */
465 if ((byte
>> 6) != 2)
474 ch
= byte
; ch
<<= 6; /* 1st byte */
475 byte
= *utf8p
++; /* 2nd byte */
476 if ((byte
>> 6) != 2)
478 ch
+= byte
; ch
<<= 6;
479 byte
= *utf8p
++; /* 3rd byte */
480 if ((byte
>> 6) != 2)
489 if (ch
== 0xFFFE || ch
== 0xFFFF)
495 ch
= byte
; ch
<<= 6; /* 1st byte */
496 byte
= *utf8p
++; /* 2nd byte */
497 if ((byte
>> 6) != 2)
499 ch
+= byte
; ch
<<= 6;
500 byte
= *utf8p
++; /* 3rd byte */
501 if ((byte
>> 6) != 2)
503 ch
+= byte
; ch
<<= 6;
504 byte
= *utf8p
++; /* 4th byte */
505 if ((byte
>> 6) != 2)
508 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
509 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
510 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
512 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
515 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
516 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
) {
520 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
527 if (unicode_decomposeable(ucs_ch
)) {
528 u_int16_t sequence
[8] = {0};
531 /* Before decomposing a new unicode character, sort
532 * previous combining characters, if any, and reset
535 if (combcharcnt
> 1) {
536 priortysort(ucsp
- combcharcnt
, combcharcnt
);
540 count
= unicode_decompose(ucs_ch
, sequence
);
541 for (i
= 0; i
< count
; ++i
) {
542 ucs_ch
= sequence
[i
];
543 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
547 combcharcnt
+= count
- 1;
550 } else if (precompose
&& (ucsp
!= bufstart
)) {
551 u_int16_t composite
, base
;
553 if (unicode_combinable(ucs_ch
)) {
554 base
= swapbytes
? OSSwapInt16(*(ucsp
- 1)) : *(ucsp
- 1);
555 composite
= unicode_combine(base
, ucs_ch
);
562 if (ucs_ch
== UCS_ALT_NULL
)
565 if (ucs_ch
== altslash
)
569 * Make multiple combining character sequences canonical
571 if (unicode_combinable(ucs_ch
)) {
572 ++combcharcnt
; /* start tracking a run */
573 } else if (combcharcnt
) {
574 if (combcharcnt
> 1) {
575 priortysort(ucsp
- combcharcnt
, combcharcnt
);
577 combcharcnt
= 0; /* start over */
580 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
584 * Escape illegal UTF-8 into something legal.
600 utf8len
+= extrabytes
;
603 if ((ucsp
+ 2) >= bufend
)
606 /* Make a previous combining sequence canonical. */
607 if (combcharcnt
> 1) {
608 priortysort(ucsp
- combcharcnt
, combcharcnt
);
613 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
614 ucs_ch
= hexdigits
[byte
>> 4];
615 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
616 ucs_ch
= hexdigits
[byte
& 0x0F];
617 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
620 * Make a previous combining sequence canonical
622 if (combcharcnt
> 1) {
623 priortysort(ucsp
- combcharcnt
, combcharcnt
);
626 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
631 result
= ENAMETOOLONG
;
636 * Unicode 3.2 decomposition code (derived from Core Foundation)
639 #define HANGUL_SBASE 0xAC00
640 #define HANGUL_LBASE 0x1100
641 #define HANGUL_VBASE 0x1161
642 #define HANGUL_TBASE 0x11A7
644 #define HANGUL_SCOUNT 11172
645 #define HANGUL_LCOUNT 19
646 #define HANGUL_VCOUNT 21
647 #define HANGUL_TCOUNT 28
648 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
654 } unicode_mappings32
;
656 #define RECURSIVE_DECOMPOSITION (1 << 15)
657 #define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)
662 } unicode_mappings16
;
664 static inline u_int32_t
665 getmappedvalue32(const unicode_mappings32
*theTable
, u_int32_t numElem
,
668 const unicode_mappings32
*p
, *q
, *divider
;
670 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
676 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
677 if (character
< divider
->_key
) { q
= divider
- 1; }
678 else if (character
> divider
->_key
) { p
= divider
+ 1; }
679 else { return (divider
->_value
); }
684 static inline u_int16_t
685 getmappedvalue16(const unicode_mappings16
*theTable
, u_int32_t numElem
,
688 const unicode_mappings16
*p
, *q
, *divider
;
690 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
696 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
697 if (character
< divider
->_key
)
699 else if (character
> divider
->_key
)
702 return (divider
->_value
);
708 unicode_recursive_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
714 const u_int16_t
*bmpMappings
;
715 u_int32_t usedLength
;
717 value
= getmappedvalue16(
718 (const unicode_mappings16
*)__CFUniCharDecompositionTable
,
719 __UniCharDecompositionTableLength
, character
);
720 length
= EXTRACT_COUNT(value
);
721 firstChar
= value
& 0x0FFF;
723 bmpMappings
= (length
== 1 ? &theChar
: __CFUniCharMultipleDecompositionTable
+ firstChar
);
726 if (value
& RECURSIVE_DECOMPOSITION
) {
727 usedLength
= unicode_recursive_decompose((u_int16_t
)*bmpMappings
, convertedChars
);
729 --length
; /* Decrement for the first char */
733 convertedChars
+= usedLength
;
736 usedLength
+= length
;
739 *(convertedChars
++) = *(bmpMappings
++);
745 * unicode_decompose - decompose a composed Unicode char
747 * Composed Unicode characters are forbidden on
748 * HFS Plus volumes. unicode_decompose will convert a
749 * composed character into its correct decomposed
752 * Similar to CFUniCharDecomposeCharacter
755 unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
757 if ((character
>= HANGUL_SBASE
) &&
758 (character
<= (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
761 character
-= HANGUL_SBASE
;
762 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
764 *(convertedChars
++) =
765 character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
766 *(convertedChars
++) =
767 (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
769 *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
772 return (unicode_recursive_decompose(character
, convertedChars
));
777 * unicode_combine - generate a precomposed Unicode char
779 * Precomposed Unicode characters are required for some volume
780 * formats and network protocols. unicode_combine will combine
781 * a decomposed character sequence into a single precomposed
782 * (composite) character.
784 * Similar to CFUniCharPrecomposeCharacter but unicode_combine
785 * also handles Hangul Jamo characters.
788 unicode_combine(u_int16_t base
, u_int16_t combining
)
793 if ((combining
>= HANGUL_VBASE
) && (combining
< (HANGUL_TBASE
+ HANGUL_TCOUNT
))) {
794 /* 2 char Hangul sequences */
795 if ((combining
< (HANGUL_VBASE
+ HANGUL_VCOUNT
)) &&
796 (base
>= HANGUL_LBASE
&& base
< (HANGUL_LBASE
+ HANGUL_LCOUNT
))) {
797 return (HANGUL_SBASE
+
798 ((base
- HANGUL_LBASE
)*(HANGUL_VCOUNT
*HANGUL_TCOUNT
)) +
799 ((combining
- HANGUL_VBASE
)*HANGUL_TCOUNT
));
802 /* 3 char Hangul sequences */
803 if ((combining
> HANGUL_TBASE
) &&
804 (base
>= HANGUL_SBASE
&& base
< (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
805 if ((base
- HANGUL_SBASE
) % HANGUL_TCOUNT
)
808 return (base
+ (combining
- HANGUL_TBASE
));
812 value
= getmappedvalue32(
813 (const unicode_mappings32
*)__CFUniCharPrecompSourceTable
,
814 __CFUniCharPrecompositionTableLength
, combining
);
817 value
= getmappedvalue16(
818 (const unicode_mappings16
*)
819 ((const u_int32_t
*)__CFUniCharBMPPrecompDestinationTable
+ (value
& 0xFFFF)),
820 (value
>> 16), base
);
827 * priortysort - order combining chars into canonical order
829 * Similar to CFUniCharPrioritySort
832 priortysort(u_int16_t
* characters
, int count
)
835 u_int16_t
*ch1
, *ch2
;
839 end
= characters
+ count
;
843 ch2
= characters
+ 1;
844 p2
= get_combining_class(*ch1
);
847 p2
= get_combining_class(*ch2
);
848 if (p1
> p2
&& p2
!= 0) {
857 * Make sure that p2 contains the combining class for the
858 * character now stored at *ch2. This isn't required for
859 * correctness, but it will be more efficient if a character
860 * with a large combining class has to "bubble past" several
861 * characters with lower combining classes.
873 * Invalid NTFS filename characters are encoded using the
874 * SFM (Services for Macintosh) private use Unicode characters.
876 * These should only be used for SMB, MSDOS or NTFS.
878 * Illegal NTFS Char SFM Unicode Char
879 * ----------------------------------------
880 * 0x01-0x1f 0xf001-0xf01f
889 * ' ' 0xf028 (Only if last char of the name)
890 * '.' 0xf029 (Only if last char of the name)
891 * ----------------------------------------
893 * Reference: http://support.microsoft.com/kb/q117258/
896 #define MAX_SFM2MAC 0x29
897 #define SFMCODE_PREFIX_MASK 0xf000
900 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
901 * SFM had no conversion for the colon. There is a conversion for the
902 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
903 * is a slash and a slash is a colon. So we can just replace the slash with the
904 * colon in our tables and everything will just work.
908 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 00 - 07 */
909 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 08 - 0F */
910 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 10 - 17 */
911 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 18 - 1F */
912 0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c, /* 20 - 27 */
913 0x20, 0x2e /* 28 - 29 */
918 0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27, /* 20 - 27 */
919 0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22, /* 28 - 2f */
920 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 30 - 37 */
921 0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25, /* 38 - 3f */
922 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 40 - 47 */
923 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 48 - 4f */
924 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 50 - 57 */
925 0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f, /* 58 - 5f */
926 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 60 - 67 */
927 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 68 - 6f */
928 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 70 - 77 */
929 0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f /* 78 - 7f */
934 * Encode illegal NTFS filename characters into SFM Private Unicode characters
936 * Assumes non-zero ASCII input.
939 ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
)
941 /* The last character of filename cannot be a space or period. */
945 else if (ucs_ch
== 0x2e)
948 /* 0x01 - 0x1f is simple transformation. */
949 if (ucs_ch
<= 0x1f) {
950 return (ucs_ch
| 0xf000);
951 } else /* 0x20 - 0x7f */ {
954 lsb
= mac2sfm
[ucs_ch
- 0x0020];
956 return(0xf000 | lsb
);
962 * Decode any SFM Private Unicode characters
965 sfm_to_ucs(u_int16_t ucs_ch
)
967 if (((ucs_ch
& 0xffC0) == SFMCODE_PREFIX_MASK
) &&
968 ((ucs_ch
& 0x003f) <= MAX_SFM2MAC
)) {
969 ucs_ch
= sfm2mac
[ucs_ch
& 0x003f];