2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 Includes Unicode 3.2 decomposition code derived from Core Foundation
33 #include <sys/param.h>
34 #include <sys/utfconv.h>
35 #include <sys/errno.h>
36 #include <sys/malloc.h>
37 #include <libkern/OSByteOrder.h>
39 #if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST)
40 #include <kern/assert.h>
46 * UTF-8 (Unicode Transformation Format)
48 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
49 * character as a sequence of one to four bytes. Only the shortest form
50 * required to represent the significant Unicode bits is legal.
52 * UTF-8 Multibyte Codes
54 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
55 * -----------------------------------------------------------------------------
56 * 1 7 0x0000 0x007F 0xxxxxxx
57 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
58 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
59 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
60 * -----------------------------------------------------------------------------
64 #define UNICODE_TO_UTF8_LEN(c) \
65 ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
67 #define UCS_ALT_NULL 0x2400
69 /* Surrogate Pair Constants */
70 #define SP_HALF_SHIFT 10
71 #define SP_HALF_BASE 0x0010000u
72 #define SP_HALF_MASK 0x3FFu
74 #define SP_HIGH_FIRST 0xD800u
75 #define SP_HIGH_LAST 0xDBFFu
76 #define SP_LOW_FIRST 0xDC00u
77 #define SP_LOW_LAST 0xDFFFu
80 #include "vfs_utfconvdata.h"
84 * Test for a combining character.
86 * Similar to __CFUniCharIsNonBaseCharacter except that
87 * unicode_combinable also includes Hangul Jamo characters.
90 unicode_combinable(u_int16_t character
)
92 const u_int8_t
*bitmap
= __CFUniCharCombiningBitmap
;
95 if (character
< 0x0300)
98 value
= bitmap
[(character
>> 8) & 0xFF];
103 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
104 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
110 * Test for a precomposed character.
112 * Similar to __CFUniCharIsDecomposableCharacter.
115 unicode_decomposeable(u_int16_t character
) {
116 const u_int8_t
*bitmap
= __CFUniCharDecomposableBitmap
;
119 if (character
< 0x00C0)
122 value
= bitmap
[(character
>> 8) & 0xFF];
127 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
128 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
135 * Get the combining class.
137 * Similar to CFUniCharGetCombiningPropertyForCharacter.
139 static inline u_int8_t
140 get_combining_class(u_int16_t character
) {
141 const u_int8_t
*bitmap
= __CFUniCharCombiningPropertyBitmap
;
143 u_int8_t value
= bitmap
[(character
>> 8)];
146 bitmap
= bitmap
+ (value
* 256);
147 return bitmap
[character
% 256];
153 static int unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
);
155 static u_int16_t
unicode_combine(u_int16_t base
, u_int16_t combining
);
157 static void prioritysort(u_int16_t
* characters
, int count
);
159 static u_int16_t
ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
);
161 static u_int16_t
sfm_to_ucs(u_int16_t ucs_ch
);
164 char utf_extrabytes
[32] = {
165 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
166 -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1
169 const char hexdigits
[16] = {
170 '0', '1', '2', '3', '4', '5', '6', '7',
171 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
175 * utf8_encodelen - Calculate the UTF-8 encoding length
177 * This function takes a Unicode input string, ucsp, of ucslen bytes
178 * and calculates the size of the UTF-8 output in bytes (not including
179 * a NULL termination byte). The string must reside in kernel memory.
181 * If '/' chars are possible in the Unicode input then an alternate
182 * (replacement) char should be provided in altslash.
185 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
187 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
189 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
191 * UTF_DECOMPOSED: generate fully decomposed output
193 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
199 utf8_encodelen(const u_int16_t
* ucsp
, size_t ucslen
, u_int16_t altslash
, int flags
)
202 u_int16_t
* chp
= NULL
;
203 u_int16_t sequence
[8];
206 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
207 int decompose
= (flags
& UTF_DECOMPOSED
);
210 charcnt
= ucslen
/ 2;
213 while (charcnt
-- > 0) {
220 ucs_ch
= OSSwapInt16(ucs_ch
);
223 ucs_ch
= altslash
? altslash
: '_';
224 } else if (ucs_ch
== '\0') {
225 ucs_ch
= UCS_ALT_NULL
;
226 } else if (decompose
&& unicode_decomposeable(ucs_ch
)) {
227 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
229 ucs_ch
= sequence
[0];
233 len
+= UNICODE_TO_UTF8_LEN(ucs_ch
);
241 * utf8_encodestr - Encodes a Unicode string to UTF-8
244 * The resulting UTF-8 string is NULL terminated.
246 * If '/' chars are allowed on disk then an alternate
247 * (replacement) char must be provided in altslash.
250 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
252 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
254 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
256 * UTF_DECOMPOSED: generate fully decomposed output
258 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
261 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
263 * EINVAL: Illegal char found; char was replaced by an '_'.
266 utf8_encodestr(const u_int16_t
* ucsp
, size_t ucslen
, u_int8_t
* utf8p
,
267 size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
)
272 u_int16_t
* chp
= NULL
;
273 u_int16_t sequence
[8];
276 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
277 int nullterm
= ((flags
& UTF_NO_NULL_TERM
) == 0);
278 int decompose
= (flags
& UTF_DECOMPOSED
);
279 int sfmconv
= (flags
& UTF_SFM_CONVERSIONS
);
283 bufend
= bufstart
+ buflen
;
286 charcnt
= ucslen
/ 2;
288 while (charcnt
-- > 0) {
293 ucs_ch
= swapbytes
? OSSwapInt16(*ucsp
++) : *ucsp
++;
295 if (decompose
&& unicode_decomposeable(ucs_ch
)) {
296 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
298 ucs_ch
= sequence
[0];
303 /* Slash and NULL are not permitted */
311 } else if (ucs_ch
== '\0') {
312 ucs_ch
= UCS_ALT_NULL
;
315 if (ucs_ch
< 0x0080) {
316 if (utf8p
>= bufend
) {
317 result
= ENAMETOOLONG
;
322 } else if (ucs_ch
< 0x800) {
323 if ((utf8p
+ 1) >= bufend
) {
324 result
= ENAMETOOLONG
;
327 *utf8p
++ = 0xc0 | (ucs_ch
>> 6);
328 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
331 /* These chars never valid Unicode. */
332 if (ucs_ch
== 0xFFFE || ucs_ch
== 0xFFFF) {
337 /* Combine valid surrogate pairs */
338 if (ucs_ch
>= SP_HIGH_FIRST
&& ucs_ch
<= SP_HIGH_LAST
343 ch2
= swapbytes
? OSSwapInt16(*ucsp
) : *ucsp
;
344 if (ch2
>= SP_LOW_FIRST
&& ch2
<= SP_LOW_LAST
) {
345 pair
= ((ucs_ch
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
)
346 + (ch2
- SP_LOW_FIRST
) + SP_HALF_BASE
;
347 if ((utf8p
+ 3) >= bufend
) {
348 result
= ENAMETOOLONG
;
353 *utf8p
++ = 0xf0 | (pair
>> 18);
354 *utf8p
++ = 0x80 | (0x3f & (pair
>> 12));
355 *utf8p
++ = 0x80 | (0x3f & (pair
>> 6));
356 *utf8p
++ = 0x80 | (0x3f & pair
);
359 } else if (sfmconv
) {
360 ucs_ch
= sfm_to_ucs(ucs_ch
);
361 if (ucs_ch
< 0x0080) {
362 if (utf8p
>= bufend
) {
363 result
= ENAMETOOLONG
;
370 if ((utf8p
+ 2) >= bufend
) {
371 result
= ENAMETOOLONG
;
374 *utf8p
++ = 0xe0 | (ucs_ch
>> 12);
375 *utf8p
++ = 0x80 | (0x3f & (ucs_ch
>> 6));
376 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
380 *utf8len
= utf8p
- bufstart
;
387 // Pushes a character taking account of combining character sequences
388 static void push(uint16_t ucs_ch
, int *combcharcnt
, uint16_t **ucsp
)
391 * Make multiple combining character sequences canonical
393 if (unicode_combinable(ucs_ch
)) {
394 ++*combcharcnt
; /* start tracking a run */
395 } else if (*combcharcnt
) {
396 if (*combcharcnt
> 1) {
397 prioritysort(*ucsp
- *combcharcnt
, *combcharcnt
);
399 *combcharcnt
= 0; /* start over */
406 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
409 * The input UTF-8 string does not need to be null terminated
412 * If '/' chars are allowed on disk then an alternate
413 * (replacement) char must be provided in altslash.
416 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
418 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
420 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
422 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
424 * UTF_PRECOMPOSED: generate precomposed output (NFC)
426 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
429 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
431 * EINVAL: Illegal UTF-8 sequence found.
434 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
,
435 size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
)
443 int decompose
, precompose
, escaping
;
447 decompose
= (flags
& UTF_DECOMPOSED
);
448 precompose
= (flags
& UTF_PRECOMPOSED
);
449 escaping
= (flags
& UTF_ESCAPE_ILLEGAL
);
450 sfmconv
= (flags
& UTF_SFM_CONVERSIONS
);
453 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
455 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
459 /* check for ascii */
461 ucs_ch
= sfmconv
? ucs_to_sfm(byte
, utf8len
== 0) : byte
;
465 extrabytes
= utf_extrabytes
[byte
>> 3];
466 if ((extrabytes
< 0) || ((int)utf8len
< extrabytes
)) {
469 utf8len
-= extrabytes
;
471 switch (extrabytes
) {
473 ch
= byte
; ch
<<= 6; /* 1st byte */
474 byte
= *utf8p
++; /* 2nd byte */
475 if ((byte
>> 6) != 2)
484 ch
= byte
; ch
<<= 6; /* 1st byte */
485 byte
= *utf8p
++; /* 2nd byte */
486 if ((byte
>> 6) != 2)
488 ch
+= byte
; ch
<<= 6;
489 byte
= *utf8p
++; /* 3rd byte */
490 if ((byte
>> 6) != 2)
499 if (ch
== 0xFFFE || ch
== 0xFFFF)
505 ch
= byte
; ch
<<= 6; /* 1st byte */
506 byte
= *utf8p
++; /* 2nd byte */
507 if ((byte
>> 6) != 2)
509 ch
+= byte
; ch
<<= 6;
510 byte
= *utf8p
++; /* 3rd byte */
511 if ((byte
>> 6) != 2)
513 ch
+= byte
; ch
<<= 6;
514 byte
= *utf8p
++; /* 4th byte */
515 if ((byte
>> 6) != 2)
518 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
519 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
520 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
522 push(ucs_ch
, &combcharcnt
, &ucsp
);
525 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
526 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
) {
537 if (unicode_decomposeable(ucs_ch
)) {
538 u_int16_t sequence
[8];
541 count
= unicode_decompose(ucs_ch
, sequence
);
543 for (i
= 0; i
< count
; ++i
) {
547 push(sequence
[i
], &combcharcnt
, &ucsp
);
552 } else if (precompose
&& (ucsp
!= bufstart
)) {
553 u_int16_t composite
, base
;
555 if (unicode_combinable(ucs_ch
)) {
557 composite
= unicode_combine(base
, ucs_ch
);
564 if (ucs_ch
== UCS_ALT_NULL
)
567 if (ucs_ch
== altslash
)
570 push(ucs_ch
, &combcharcnt
, &ucsp
);
574 * Escape illegal UTF-8 into something legal.
590 utf8len
+= extrabytes
;
593 if ((ucsp
+ 2) >= bufend
)
596 /* Make a previous combining sequence canonical. */
597 if (combcharcnt
> 1) {
598 prioritysort(ucsp
- combcharcnt
, combcharcnt
);
604 ucs_ch
= hexdigits
[byte
>> 4];
606 ucs_ch
= hexdigits
[byte
& 0x0F];
610 * Make a previous combining sequence canonical
612 if (combcharcnt
> 1) {
613 prioritysort(ucsp
- combcharcnt
, combcharcnt
);
616 if (flags
& UTF_REVERSE_ENDIAN
) {
617 uint16_t *p
= bufstart
;
619 *p
= OSSwapInt16(*p
);
625 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
630 result
= ENAMETOOLONG
;
636 * utf8_validatestr - Check for a valid UTF-8 string.
639 utf8_validatestr(const u_int8_t
* utf8p
, size_t utf8len
)
646 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
648 continue; /* plain ascii */
650 extrabytes
= utf_extrabytes
[byte
>> 3];
652 if (utf8len
< extrabytes
)
654 utf8len
-= extrabytes
;
656 switch (extrabytes
) {
658 ch
= byte
; ch
<<= 6; /* 1st byte */
659 byte
= *utf8p
++; /* 2nd byte */
660 if ((byte
>> 6) != 2)
668 ch
= byte
; ch
<<= 6; /* 1st byte */
669 byte
= *utf8p
++; /* 2nd byte */
670 if ((byte
>> 6) != 2)
672 ch
+= byte
; ch
<<= 6;
673 byte
= *utf8p
++; /* 3rd byte */
674 if ((byte
>> 6) != 2)
683 if (ch
== 0xFFFE || ch
== 0xFFFF)
688 ch
= byte
; ch
<<= 6; /* 1st byte */
689 byte
= *utf8p
++; /* 2nd byte */
690 if ((byte
>> 6) != 2)
692 ch
+= byte
; ch
<<= 6;
693 byte
= *utf8p
++; /* 3rd byte */
694 if ((byte
>> 6) != 2)
696 ch
+= byte
; ch
<<= 6;
697 byte
= *utf8p
++; /* 4th byte */
698 if ((byte
>> 6) != 2)
701 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
702 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
703 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
705 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
706 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
)
720 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
722 * This function takes an UTF-8 input string, instr, of inlen bytes
723 * and produces normalized UTF-8 output into a buffer of buflen bytes
724 * pointed to by outstr. The size of the output in bytes (not including
725 * a NULL termination byte) is returned in outlen. In-place conversions
726 * are not supported (i.e. instr != outstr).
729 * UTF_DECOMPOSED: output string will be fully decomposed (NFD)
731 * UTF_PRECOMPOSED: output string will be precomposed (NFC)
733 * UTF_NO_NULL_TERM: do not add null termination to output string
735 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
738 * ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
740 * EINVAL: illegal UTF-8 sequence encountered or invalid flags
743 utf8_normalizestr(const u_int8_t
* instr
, size_t inlen
, u_int8_t
* outstr
,
744 size_t *outlen
, size_t buflen
, int flags
)
746 u_int16_t unicodebuf
[32];
747 u_int16_t
* unistr
= NULL
;
748 size_t unicode_bytes
;
751 u_int8_t
*outbufstart
, *outbufend
;
752 const u_int8_t
*inbufstart
;
754 int decompose
, precompose
;
757 if (flags
& ~(UTF_DECOMPOSED
| UTF_PRECOMPOSED
| UTF_NO_NULL_TERM
| UTF_ESCAPE_ILLEGAL
)) {
760 decompose
= (flags
& UTF_DECOMPOSED
);
761 precompose
= (flags
& UTF_PRECOMPOSED
);
762 if ((decompose
&& precompose
) || (!decompose
&& !precompose
)) {
765 outbufstart
= outstr
;
766 outbufend
= outbufstart
+ buflen
;
770 while (inlen
-- > 0 && (byte
= *instr
++) != '\0') {
771 if (outstr
>= outbufend
) {
772 result
= ENAMETOOLONG
;
778 /* ASCII is already normalized. */
782 *outlen
= outstr
- outbufstart
;
783 if (((flags
& UTF_NO_NULL_TERM
) == 0)) {
784 if (outstr
< outbufend
)
787 result
= ENAMETOOLONG
;
793 * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
794 * functions to perform the normalization. Since this will
795 * presumably be used to normalize filenames in the back-end
796 * (on disk or over-the-wire), it should be fast enough.
800 /* Make sure the input size is reasonable. */
801 if (inbuflen
> MAXPATHLEN
) {
802 result
= ENAMETOOLONG
;
806 * Compute worst case Unicode buffer size.
808 * For pre-composed output, every UTF-8 input byte will be at
809 * most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
810 * (smallest composite char sequence) may yield 6 Unicode bytes
811 * (1 base char + 2 combining chars).
813 unicode_bytes
= precompose
? (inbuflen
* 2) : (inbuflen
* 3);
815 if (unicode_bytes
<= sizeof(unicodebuf
))
816 unistr
= &unicodebuf
[0];
818 MALLOC(unistr
, uint16_t *, unicode_bytes
, M_TEMP
, M_WAITOK
);
820 /* Normalize the string. */
821 result
= utf8_decodestr(inbufstart
, inbuflen
, unistr
, &unicode_bytes
,
822 unicode_bytes
, 0, flags
& ~UTF_NO_NULL_TERM
);
824 /* Put results back into UTF-8. */
825 result
= utf8_encodestr(unistr
, unicode_bytes
, outbufstart
,
826 &uft8_bytes
, buflen
, 0, UTF_NO_NULL_TERM
);
827 outstr
= outbufstart
+ uft8_bytes
;
829 if (unistr
&& unistr
!= &unicodebuf
[0]) {
830 FREE(unistr
, M_TEMP
);
837 * Unicode 3.2 decomposition code (derived from Core Foundation)
843 } unicode_mappings32
;
845 static inline u_int32_t
846 getmappedvalue32(const unicode_mappings32
*theTable
, u_int32_t numElem
,
849 const unicode_mappings32
*p
, *q
, *divider
;
851 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
857 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
858 if (character
< divider
->_key
) { q
= divider
- 1; }
859 else if (character
> divider
->_key
) { p
= divider
+ 1; }
860 else { return (divider
->_value
); }
865 #define RECURSIVE_DECOMPOSITION (1 << 15)
866 #define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)
871 } unicode_mappings16
;
873 static inline u_int16_t
874 getmappedvalue16(const unicode_mappings16
*theTable
, u_int32_t numElem
,
877 const unicode_mappings16
*p
, *q
, *divider
;
879 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
885 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
886 if (character
< divider
->_key
)
888 else if (character
> divider
->_key
)
891 return (divider
->_value
);
898 unicode_recursive_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
904 const u_int16_t
*bmpMappings
;
905 u_int32_t usedLength
;
907 value
= getmappedvalue16(
908 (const unicode_mappings16
*)__CFUniCharDecompositionTable
,
909 __UniCharDecompositionTableLength
, character
);
910 length
= EXTRACT_COUNT(value
);
911 firstChar
= value
& 0x0FFF;
913 bmpMappings
= (length
== 1 ? &theChar
: __CFUniCharMultipleDecompositionTable
+ firstChar
);
916 if (value
& RECURSIVE_DECOMPOSITION
) {
917 usedLength
= unicode_recursive_decompose((u_int16_t
)*bmpMappings
, convertedChars
);
919 --length
; /* Decrement for the first char */
923 convertedChars
+= usedLength
;
926 usedLength
+= length
;
929 *(convertedChars
++) = *(bmpMappings
++);
934 #define HANGUL_SBASE 0xAC00
935 #define HANGUL_LBASE 0x1100
936 #define HANGUL_VBASE 0x1161
937 #define HANGUL_TBASE 0x11A7
939 #define HANGUL_SCOUNT 11172
940 #define HANGUL_LCOUNT 19
941 #define HANGUL_VCOUNT 21
942 #define HANGUL_TCOUNT 28
943 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
946 * unicode_decompose - decompose a composed Unicode char
948 * Composed Unicode characters are forbidden on
949 * HFS Plus volumes. unicode_decompose will convert a
950 * composed character into its correct decomposed
953 * Similar to CFUniCharDecomposeCharacter
956 unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
958 if ((character
>= HANGUL_SBASE
) &&
959 (character
<= (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
962 character
-= HANGUL_SBASE
;
963 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
965 *(convertedChars
++) =
966 character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
967 *(convertedChars
++) =
968 (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
970 *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
973 return (unicode_recursive_decompose(character
, convertedChars
));
978 * unicode_combine - generate a precomposed Unicode char
980 * Precomposed Unicode characters are required for some volume
981 * formats and network protocols. unicode_combine will combine
982 * a decomposed character sequence into a single precomposed
983 * (composite) character.
985 * Similar to CFUniCharPrecomposeCharacter but unicode_combine
986 * also handles Hangul Jamo characters.
989 unicode_combine(u_int16_t base
, u_int16_t combining
)
994 if ((combining
>= HANGUL_VBASE
) && (combining
< (HANGUL_TBASE
+ HANGUL_TCOUNT
))) {
995 /* 2 char Hangul sequences */
996 if ((combining
< (HANGUL_VBASE
+ HANGUL_VCOUNT
)) &&
997 (base
>= HANGUL_LBASE
&& base
< (HANGUL_LBASE
+ HANGUL_LCOUNT
))) {
998 return (HANGUL_SBASE
+
999 ((base
- HANGUL_LBASE
)*(HANGUL_VCOUNT
*HANGUL_TCOUNT
)) +
1000 ((combining
- HANGUL_VBASE
)*HANGUL_TCOUNT
));
1003 /* 3 char Hangul sequences */
1004 if ((combining
> HANGUL_TBASE
) &&
1005 (base
>= HANGUL_SBASE
&& base
< (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
1006 if ((base
- HANGUL_SBASE
) % HANGUL_TCOUNT
)
1009 return (base
+ (combining
- HANGUL_TBASE
));
1013 value
= getmappedvalue32(
1014 (const unicode_mappings32
*)__CFUniCharPrecompSourceTable
,
1015 __CFUniCharPrecompositionTableLength
, combining
);
1018 value
= getmappedvalue16(
1019 (const unicode_mappings16
*)
1020 ((const u_int32_t
*)__CFUniCharBMPPrecompDestinationTable
+ (value
& 0xFFFF)),
1021 (value
>> 16), base
);
1028 * prioritysort - order combining chars into canonical order
1030 * Similar to CFUniCharPrioritySort
1033 prioritysort(u_int16_t
* characters
, int count
)
1036 u_int16_t
*ch1
, *ch2
;
1040 end
= characters
+ count
;
1044 ch2
= characters
+ 1;
1045 p2
= get_combining_class(*ch1
);
1048 p2
= get_combining_class(*ch2
);
1049 if (p1
> p2
&& p2
!= 0) {
1058 * Make sure that p2 contains the combining class for the
1059 * character now stored at *ch2. This isn't required for
1060 * correctness, but it will be more efficient if a character
1061 * with a large combining class has to "bubble past" several
1062 * characters with lower combining classes.
1074 * Invalid NTFS filename characters are encoded using the
1075 * SFM (Services for Macintosh) private use Unicode characters.
1077 * These should only be used for SMB, MSDOS or NTFS.
1079 * Illegal NTFS Char SFM Unicode Char
1080 * ----------------------------------------
1081 * 0x01-0x1f 0xf001-0xf01f
1090 * ' ' 0xf028 (Only if last char of the name)
1091 * '.' 0xf029 (Only if last char of the name)
1092 * ----------------------------------------
1094 * Reference: http://support.microsoft.com/kb/q117258/
1097 #define MAX_SFM2MAC 0x29
1098 #define SFMCODE_PREFIX_MASK 0xf000
1101 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
1102 * SFM had no conversion for the colon. There is a conversion for the
1103 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
1104 * is a slash and a slash is a colon. So we can just replace the slash with the
1105 * colon in our tables and everything will just work.
1109 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 00 - 07 */
1110 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 08 - 0F */
1111 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 10 - 17 */
1112 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 18 - 1F */
1113 0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c, /* 20 - 27 */
1114 0x20, 0x2e /* 28 - 29 */
1116 #define SFM2MAC_LEN ((sizeof(sfm2mac))/sizeof(sfm2mac[0]))
1120 0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27, /* 20 - 27 */
1121 0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22, /* 28 - 2f */
1122 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 30 - 37 */
1123 0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25, /* 38 - 3f */
1124 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 40 - 47 */
1125 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 48 - 4f */
1126 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 50 - 57 */
1127 0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f, /* 58 - 5f */
1128 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 60 - 67 */
1129 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 68 - 6f */
1130 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 70 - 77 */
1131 0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f /* 78 - 7f */
1133 #define MAC2SFM_LEN ((sizeof(mac2sfm))/sizeof(mac2sfm[0]))
1137 * Encode illegal NTFS filename characters into SFM Private Unicode characters
1139 * Assumes non-zero ASCII input.
1142 ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
)
1144 /* The last character of filename cannot be a space or period. */
1148 else if (ucs_ch
== 0x2e)
1151 /* 0x01 - 0x1f is simple transformation. */
1152 if (ucs_ch
<= 0x1f) {
1153 return (ucs_ch
| 0xf000);
1154 } else /* 0x20 - 0x7f */ {
1157 assert((ucs_ch
- 0x0020) < MAC2SFM_LEN
);
1158 lsb
= mac2sfm
[ucs_ch
- 0x0020];
1160 return(0xf000 | lsb
);
1166 * Decode any SFM Private Unicode characters
1169 sfm_to_ucs(u_int16_t ucs_ch
)
1171 if (((ucs_ch
& 0xffC0) == SFMCODE_PREFIX_MASK
) &&
1172 ((ucs_ch
& 0x003f) <= MAX_SFM2MAC
)) {
1173 assert((ucs_ch
& 0x003f) < SFM2MAC_LEN
);
1174 ucs_ch
= sfm2mac
[ucs_ch
& 0x003f];