2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 * Includes Unicode 3.2 decomposition code derived from Core Foundation
33 #include <sys/param.h>
34 #include <sys/utfconv.h>
35 #include <sys/errno.h>
36 #include <sys/malloc.h>
37 #include <libkern/OSByteOrder.h>
39 #if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST)
40 #include <kern/assert.h>
46 * UTF-8 (Unicode Transformation Format)
48 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
49 * character as a sequence of one to four bytes. Only the shortest form
50 * required to represent the significant Unicode bits is legal.
52 * UTF-8 Multibyte Codes
54 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
55 * -----------------------------------------------------------------------------
56 * 1 7 0x0000 0x007F 0xxxxxxx
57 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
58 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
59 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
60 * -----------------------------------------------------------------------------
/*
 * UNICODE_TO_UTF8_LEN - UTF-8 byte count contributed by one UTF-16 unit
 *
 * Units below 0x0080 count as 1 byte and units below 0x0800 as 2.
 * Units in the surrogate range 0xD800-0xDFFF also count as 2, and
 * every other unit counts as 3.
 *
 * The argument is expanded more than once, so it must be free of
 * side effects.
 */
#define UNICODE_TO_UTF8_LEN(c) \
	((c) >= 0x0800 \
	    ? ((((c) & 0xf800) != 0xd800) ? 3 : 2) \
	    : ((c) >= 0x0080 ? 2 : 1))
67 #define UCS_ALT_NULL 0x2400
69 /* Surrogate Pair Constants */
70 #define SP_HALF_SHIFT 10
71 #define SP_HALF_BASE 0x0010000u
72 #define SP_HALF_MASK 0x3FFu
74 #define SP_HIGH_FIRST 0xD800u
75 #define SP_HIGH_LAST 0xDBFFu
76 #define SP_LOW_FIRST 0xDC00u
77 #define SP_LOW_LAST 0xDFFFu
80 #include "vfs_utfconvdata.h"
84 * Test for a combining character.
86 * Similar to __CFUniCharIsNonBaseCharacter except that
87 * unicode_combinable also includes Hangul Jamo characters.
90 unicode_combinable(u_int16_t character
)
92 const u_int8_t
*bitmap
= __CFUniCharCombiningBitmap
;
95 if (character
< 0x0300) {
99 value
= bitmap
[(character
>> 8) & 0xFF];
104 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
105 return bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0;
111 * Test for a precomposed character.
113 * Similar to __CFUniCharIsDecomposableCharacter.
116 unicode_decomposeable(u_int16_t character
)
118 const u_int8_t
*bitmap
= __CFUniCharDecomposableBitmap
;
121 if (character
< 0x00C0) {
125 value
= bitmap
[(character
>> 8) & 0xFF];
130 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
131 return bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0;
138 * Get the combining class.
140 * Similar to CFUniCharGetCombiningPropertyForCharacter.
142 static inline u_int8_t
143 get_combining_class(u_int16_t character
)
145 const u_int8_t
*bitmap
= __CFUniCharCombiningPropertyBitmap
;
147 u_int8_t value
= bitmap
[(character
>> 8)];
150 bitmap
= bitmap
+ (value
* 256);
151 return bitmap
[character
% 256];
157 static int unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
);
159 static u_int16_t
unicode_combine(u_int16_t base
, u_int16_t combining
);
161 static void prioritysort(u_int16_t
* characters
, int count
);
163 static u_int16_t
ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
);
165 static u_int16_t
sfm_to_ucs(u_int16_t ucs_ch
);
/*
 * Number of bytes that follow a UTF-8 lead byte, indexed by the top
 * five bits of that byte (byte >> 3).  Negative entries are rejected
 * by the decoders' "extrabytes < 0" check.
 */
char utf_extrabytes[32] = {
	/* 0x00 - 0x7F */
	 0,  0,  0,  0,  0,  0,  0,  0,
	 0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x80 - 0xBF */
	-1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 - 0xDF */
	 1,  1,  1,  1,
	/* 0xE0 - 0xEF */
	 2,  2,
	/* 0xF0 - 0xF7 */
	 3,
	/* 0xF8 - 0xFF */
	-1
};
/* Upper-case hexadecimal digits, indexed by nibble value (0 - 15). */
const char hexdigits[16] = {
	'0', '1', '2', '3',
	'4', '5', '6', '7',
	'8', '9', 'A', 'B',
	'C', 'D', 'E', 'F'
};
179 * utf8_encodelen - Calculate the UTF-8 encoding length
181 * This function takes a Unicode input string, ucsp, of ucslen bytes
182 * and calculates the size of the UTF-8 output in bytes (not including
183 * a NULL termination byte). The string must reside in kernel memory.
185 * If '/' chars are possible in the Unicode input then an alternate
186 * (replacement) char should be provided in altslash.
189 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
191 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
193 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
195 * UTF_DECOMPOSED: generate fully decomposed output
197 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
203 utf8_encodelen(const u_int16_t
* ucsp
, size_t ucslen
, u_int16_t altslash
, int flags
)
206 u_int16_t
* chp
= NULL
;
207 u_int16_t sequence
[8];
210 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
211 int decompose
= (flags
& UTF_DECOMPOSED
);
214 charcnt
= ucslen
/ 2;
217 while (charcnt
-- > 0) {
224 ucs_ch
= OSSwapInt16(ucs_ch
);
227 ucs_ch
= altslash
? altslash
: '_';
228 } else if (ucs_ch
== '\0') {
229 ucs_ch
= UCS_ALT_NULL
;
230 } else if (decompose
&& unicode_decomposeable(ucs_ch
)) {
231 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
233 ucs_ch
= sequence
[0];
237 len
+= UNICODE_TO_UTF8_LEN(ucs_ch
);
245 * utf8_encodestr - Encodes a Unicode string to UTF-8
248 * The resulting UTF-8 string is NULL terminated.
250 * If '/' chars are allowed on disk then an alternate
251 * (replacement) char must be provided in altslash.
254 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
256 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
258 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
260 * UTF_DECOMPOSED: generate fully decomposed output
262 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
265 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
267 * EINVAL: Illegal char found; char was replaced by an '_'.
270 utf8_encodestr(const u_int16_t
* ucsp
, size_t ucslen
, u_int8_t
* utf8p
,
271 size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
)
276 u_int16_t
* chp
= NULL
;
277 u_int16_t sequence
[8];
280 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
281 int nullterm
= ((flags
& UTF_NO_NULL_TERM
) == 0);
282 int decompose
= (flags
& UTF_DECOMPOSED
);
283 int sfmconv
= (flags
& UTF_SFM_CONVERSIONS
);
287 bufend
= bufstart
+ buflen
;
291 charcnt
= ucslen
/ 2;
293 while (charcnt
-- > 0) {
298 ucs_ch
= swapbytes
? OSSwapInt16(*ucsp
++) : *ucsp
++;
300 if (decompose
&& unicode_decomposeable(ucs_ch
)) {
301 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
303 ucs_ch
= sequence
[0];
308 /* Slash and NULL are not permitted */
316 } else if (ucs_ch
== '\0') {
317 ucs_ch
= UCS_ALT_NULL
;
320 if (ucs_ch
< 0x0080) {
321 if (utf8p
>= bufend
) {
322 result
= ENAMETOOLONG
;
326 } else if (ucs_ch
< 0x800) {
327 if ((utf8p
+ 1) >= bufend
) {
328 result
= ENAMETOOLONG
;
331 *utf8p
++ = 0xc0 | (ucs_ch
>> 6);
332 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
334 /* These chars are never valid Unicode. */
335 if (ucs_ch
== 0xFFFE || ucs_ch
== 0xFFFF) {
340 /* Combine valid surrogate pairs */
341 if (ucs_ch
>= SP_HIGH_FIRST
&& ucs_ch
<= SP_HIGH_LAST
346 ch2
= swapbytes
? OSSwapInt16(*ucsp
) : *ucsp
;
347 if (ch2
>= SP_LOW_FIRST
&& ch2
<= SP_LOW_LAST
) {
348 pair
= ((ucs_ch
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
)
349 + (ch2
- SP_LOW_FIRST
) + SP_HALF_BASE
;
350 if ((utf8p
+ 3) >= bufend
) {
351 result
= ENAMETOOLONG
;
356 *utf8p
++ = 0xf0 | (pair
>> 18);
357 *utf8p
++ = 0x80 | (0x3f & (pair
>> 12));
358 *utf8p
++ = 0x80 | (0x3f & (pair
>> 6));
359 *utf8p
++ = 0x80 | (0x3f & pair
);
362 } else if (sfmconv
) {
363 ucs_ch
= sfm_to_ucs(ucs_ch
);
364 if (ucs_ch
< 0x0080) {
365 if (utf8p
>= bufend
) {
366 result
= ENAMETOOLONG
;
373 if ((utf8p
+ 2) >= bufend
) {
374 result
= ENAMETOOLONG
;
377 *utf8p
++ = 0xe0 | (ucs_ch
>> 12);
378 *utf8p
++ = 0x80 | (0x3f & (ucs_ch
>> 6));
379 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
383 *utf8len
= utf8p
- bufstart
;
391 // Pushes a character taking account of combining character sequences
393 push(uint16_t ucs_ch
, int *combcharcnt
, uint16_t **ucsp
)
396 * Make multiple combining character sequences canonical
398 if (unicode_combinable(ucs_ch
)) {
399 ++*combcharcnt
; /* start tracking a run */
400 } else if (*combcharcnt
) {
401 if (*combcharcnt
> 1) {
402 prioritysort(*ucsp
- *combcharcnt
, *combcharcnt
);
404 *combcharcnt
= 0; /* start over */
411 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
414 * The input UTF-8 string does not need to be null terminated
417 * If '/' chars are allowed on disk then an alternate
418 * (replacement) char must be provided in altslash.
421 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
423 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
425 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
427 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
429 * UTF_PRECOMPOSED: generate precomposed output (NFC)
431 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
434 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
436 * EINVAL: Illegal UTF-8 sequence found.
439 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
,
440 size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
)
448 int decompose
, precompose
, escaping
;
452 decompose
= (flags
& UTF_DECOMPOSED
);
453 precompose
= (flags
& UTF_PRECOMPOSED
);
454 escaping
= (flags
& UTF_ESCAPE_ILLEGAL
);
455 sfmconv
= (flags
& UTF_SFM_CONVERSIONS
);
458 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
460 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
461 if (ucsp
>= bufend
) {
465 /* check for ascii */
467 ucs_ch
= sfmconv
? ucs_to_sfm(byte
, utf8len
== 0) : byte
;
471 extrabytes
= utf_extrabytes
[byte
>> 3];
472 if ((extrabytes
< 0) || ((int)utf8len
< extrabytes
)) {
475 utf8len
-= extrabytes
;
477 switch (extrabytes
) {
479 ch
= byte
; ch
<<= 6; /* 1st byte */
480 byte
= *utf8p
++; /* 2nd byte */
481 if ((byte
>> 6) != 2) {
492 ch
= byte
; ch
<<= 6; /* 1st byte */
493 byte
= *utf8p
++; /* 2nd byte */
494 if ((byte
>> 6) != 2) {
497 ch
+= byte
; ch
<<= 6;
498 byte
= *utf8p
++; /* 3rd byte */
499 if ((byte
>> 6) != 2) {
511 if (ch
== 0xFFFE || ch
== 0xFFFF) {
518 ch
= byte
; ch
<<= 6; /* 1st byte */
519 byte
= *utf8p
++; /* 2nd byte */
520 if ((byte
>> 6) != 2) {
523 ch
+= byte
; ch
<<= 6;
524 byte
= *utf8p
++; /* 3rd byte */
525 if ((byte
>> 6) != 2) {
528 ch
+= byte
; ch
<<= 6;
529 byte
= *utf8p
++; /* 4th byte */
530 if ((byte
>> 6) != 2) {
534 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
535 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
536 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
) {
539 push(ucs_ch
, &combcharcnt
, &ucsp
);
540 if (ucsp
>= bufend
) {
543 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
544 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
) {
555 if (unicode_decomposeable(ucs_ch
)) {
556 u_int16_t sequence
[8];
559 count
= unicode_decompose(ucs_ch
, sequence
);
561 for (i
= 0; i
< count
; ++i
) {
562 if (ucsp
>= bufend
) {
566 push(sequence
[i
], &combcharcnt
, &ucsp
);
571 } else if (precompose
&& (ucsp
!= bufstart
)) {
572 u_int16_t composite
, base
;
574 if (unicode_combinable(ucs_ch
)) {
576 composite
= unicode_combine(base
, ucs_ch
);
583 if (ucs_ch
== UCS_ALT_NULL
) {
587 if (ucs_ch
== altslash
) {
591 push(ucs_ch
, &combcharcnt
, &ucsp
);
595 * Escape illegal UTF-8 into something legal.
610 if (extrabytes
> 0) {
611 utf8len
+= extrabytes
;
615 if ((ucsp
+ 2) >= bufend
) {
619 /* Make a previous combining sequence canonical. */
620 if (combcharcnt
> 1) {
621 prioritysort(ucsp
- combcharcnt
, combcharcnt
);
627 ucs_ch
= hexdigits
[byte
>> 4];
629 ucs_ch
= hexdigits
[byte
& 0x0F];
633 * Make a previous combining sequence canonical
635 if (combcharcnt
> 1) {
636 prioritysort(ucsp
- combcharcnt
, combcharcnt
);
639 if (flags
& UTF_REVERSE_ENDIAN
) {
640 uint16_t *p
= bufstart
;
642 *p
= OSSwapInt16(*p
);
648 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
653 result
= ENAMETOOLONG
;
659 * utf8_validatestr - Check for a valid UTF-8 string.
662 utf8_validatestr(const u_int8_t
* utf8p
, size_t utf8len
)
669 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
671 continue; /* plain ascii */
673 extrabytes
= utf_extrabytes
[byte
>> 3];
675 if (utf8len
< extrabytes
) {
678 utf8len
-= extrabytes
;
680 switch (extrabytes
) {
682 ch
= byte
; ch
<<= 6; /* 1st byte */
683 byte
= *utf8p
++; /* 2nd byte */
684 if ((byte
>> 6) != 2) {
694 ch
= byte
; ch
<<= 6; /* 1st byte */
695 byte
= *utf8p
++; /* 2nd byte */
696 if ((byte
>> 6) != 2) {
699 ch
+= byte
; ch
<<= 6;
700 byte
= *utf8p
++; /* 3rd byte */
701 if ((byte
>> 6) != 2) {
713 if (ch
== 0xFFFE || ch
== 0xFFFF) {
719 ch
= byte
; ch
<<= 6; /* 1st byte */
720 byte
= *utf8p
++; /* 2nd byte */
721 if ((byte
>> 6) != 2) {
724 ch
+= byte
; ch
<<= 6;
725 byte
= *utf8p
++; /* 3rd byte */
726 if ((byte
>> 6) != 2) {
729 ch
+= byte
; ch
<<= 6;
730 byte
= *utf8p
++; /* 4th byte */
731 if ((byte
>> 6) != 2) {
735 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
736 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
737 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
) {
740 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
741 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
) {
755 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
757 * This function takes an UTF-8 input string, instr, of inlen bytes
758 * and produces normalized UTF-8 output into a buffer of buflen bytes
759 * pointed to by outstr. The size of the output in bytes (not including
760 * a NULL termination byte) is returned in outlen. In-place conversions
761 * are not supported (i.e. instr != outstr).
764 * UTF_DECOMPOSED: output string will be fully decomposed (NFD)
766 * UTF_PRECOMPOSED: output string will be precomposed (NFC)
768 * UTF_NO_NULL_TERM: do not add null termination to output string
770 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
773 * ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
775 * EINVAL: illegal UTF-8 sequence encountered or invalid flags
778 utf8_normalizestr(const u_int8_t
* instr
, size_t inlen
, u_int8_t
* outstr
,
779 size_t *outlen
, size_t buflen
, int flags
)
781 u_int16_t unicodebuf
[32];
782 u_int16_t
* unistr
= NULL
;
783 size_t unicode_bytes
;
786 u_int8_t
*outbufstart
, *outbufend
;
787 const u_int8_t
*inbufstart
;
789 int decompose
, precompose
;
792 if (flags
& ~(UTF_DECOMPOSED
| UTF_PRECOMPOSED
| UTF_NO_NULL_TERM
| UTF_ESCAPE_ILLEGAL
)) {
795 decompose
= (flags
& UTF_DECOMPOSED
);
796 precompose
= (flags
& UTF_PRECOMPOSED
);
797 if ((decompose
&& precompose
) || (!decompose
&& !precompose
)) {
800 outbufstart
= outstr
;
801 outbufend
= outbufstart
+ buflen
;
805 while (inlen
-- > 0 && (byte
= *instr
++) != '\0') {
806 if (outstr
>= outbufend
) {
807 result
= ENAMETOOLONG
;
813 /* ASCII is already normalized. */
817 *outlen
= outstr
- outbufstart
;
818 if (((flags
& UTF_NO_NULL_TERM
) == 0)) {
819 if (outstr
< outbufend
) {
822 result
= ENAMETOOLONG
;
829 * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
830 * functions to perform the normalization. Since this will
831 * presumably be used to normalize filenames in the back-end
832 * (on disk or over-the-wire), it should be fast enough.
836 /* Make sure the input size is reasonable. */
837 if (inbuflen
> MAXPATHLEN
) {
838 result
= ENAMETOOLONG
;
842 * Compute worst case Unicode buffer size.
844 * For pre-composed output, every UTF-8 input byte will be at
845 * most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
846 * (smallest composite char sequence) may yield 6 Unicode bytes
847 * (1 base char + 2 combining chars).
849 unicode_bytes
= precompose
? (inbuflen
* 2) : (inbuflen
* 3);
851 if (unicode_bytes
<= sizeof(unicodebuf
)) {
852 unistr
= &unicodebuf
[0];
854 MALLOC(unistr
, uint16_t *, unicode_bytes
, M_TEMP
, M_WAITOK
);
857 /* Normalize the string. */
858 result
= utf8_decodestr(inbufstart
, inbuflen
, unistr
, &unicode_bytes
,
859 unicode_bytes
, 0, flags
& ~UTF_NO_NULL_TERM
);
861 /* Put results back into UTF-8. */
862 result
= utf8_encodestr(unistr
, unicode_bytes
, outbufstart
,
863 &uft8_bytes
, buflen
, 0, UTF_NO_NULL_TERM
);
864 outstr
= outbufstart
+ uft8_bytes
;
866 if (unistr
&& unistr
!= &unicodebuf
[0]) {
867 FREE(unistr
, M_TEMP
);
874 * Unicode 3.2 decomposition code (derived from Core Foundation)
880 } unicode_mappings32
;
882 static inline u_int32_t
883 getmappedvalue32(const unicode_mappings32
*theTable
, u_int32_t numElem
,
886 const unicode_mappings32
*p
, *q
, *divider
;
888 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
- 1]._key
)) {
893 q
= p
+ (numElem
- 1);
895 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
896 if (character
< divider
->_key
) {
898 } else if (character
> divider
->_key
) {
901 return divider
->_value
;
907 #define RECURSIVE_DECOMPOSITION (1 << 15)
908 #define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)
913 } unicode_mappings16
;
915 static inline u_int16_t
916 getmappedvalue16(const unicode_mappings16
*theTable
, u_int32_t numElem
,
919 const unicode_mappings16
*p
, *q
, *divider
;
921 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
- 1]._key
)) {
926 q
= p
+ (numElem
- 1);
928 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
929 if (character
< divider
->_key
) {
931 } else if (character
> divider
->_key
) {
934 return divider
->_value
;
942 unicode_recursive_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
948 const u_int16_t
*bmpMappings
;
949 u_int32_t usedLength
;
951 value
= getmappedvalue16(
952 (const unicode_mappings16
*)__CFUniCharDecompositionTable
,
953 __UniCharDecompositionTableLength
, character
);
954 length
= EXTRACT_COUNT(value
);
955 firstChar
= value
& 0x0FFF;
957 bmpMappings
= (length
== 1 ? &theChar
: __CFUniCharMultipleDecompositionTable
+ firstChar
);
960 if (value
& RECURSIVE_DECOMPOSITION
) {
961 usedLength
= unicode_recursive_decompose((u_int16_t
)*bmpMappings
, convertedChars
);
963 --length
; /* Decrement for the first char */
968 convertedChars
+= usedLength
;
971 usedLength
+= length
;
974 *(convertedChars
++) = *(bmpMappings
++);
980 #define HANGUL_SBASE 0xAC00
981 #define HANGUL_LBASE 0x1100
982 #define HANGUL_VBASE 0x1161
983 #define HANGUL_TBASE 0x11A7
985 #define HANGUL_SCOUNT 11172
986 #define HANGUL_LCOUNT 19
987 #define HANGUL_VCOUNT 21
988 #define HANGUL_TCOUNT 28
989 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
992 * unicode_decompose - decompose a composed Unicode char
994 * Composed Unicode characters are forbidden on
995 * HFS Plus volumes. unicode_decompose will convert a
996 * composed character into its correct decomposed
999 * Similar to CFUniCharDecomposeCharacter
1002 unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
1004 if ((character
>= HANGUL_SBASE
) &&
1005 (character
<= (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
1008 character
-= HANGUL_SBASE
;
1009 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
1011 *(convertedChars
++) =
1012 character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
1013 *(convertedChars
++) =
1014 (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
1016 *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
1020 return unicode_recursive_decompose(character
, convertedChars
);
1025 * unicode_combine - generate a precomposed Unicode char
1027 * Precomposed Unicode characters are required for some volume
1028 * formats and network protocols. unicode_combine will combine
1029 * a decomposed character sequence into a single precomposed
1030 * (composite) character.
1032 * Similar to CFUniCharPrecomposeCharacter but unicode_combine
1033 * also handles Hangul Jamo characters.
1036 unicode_combine(u_int16_t base
, u_int16_t combining
)
1041 if ((combining
>= HANGUL_VBASE
) && (combining
< (HANGUL_TBASE
+ HANGUL_TCOUNT
))) {
1042 /* 2 char Hangul sequences */
1043 if ((combining
< (HANGUL_VBASE
+ HANGUL_VCOUNT
)) &&
1044 (base
>= HANGUL_LBASE
&& base
< (HANGUL_LBASE
+ HANGUL_LCOUNT
))) {
1045 return HANGUL_SBASE
+
1046 ((base
- HANGUL_LBASE
) * (HANGUL_VCOUNT
* HANGUL_TCOUNT
)) +
1047 ((combining
- HANGUL_VBASE
) * HANGUL_TCOUNT
);
1050 /* 3 char Hangul sequences */
1051 if ((combining
> HANGUL_TBASE
) &&
1052 (base
>= HANGUL_SBASE
&& base
< (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
1053 if ((base
- HANGUL_SBASE
) % HANGUL_TCOUNT
) {
1056 return base
+ (combining
- HANGUL_TBASE
);
1061 value
= getmappedvalue32(
1062 (const unicode_mappings32
*)__CFUniCharPrecompSourceTable
,
1063 __CFUniCharPrecompositionTableLength
, combining
);
1066 value
= getmappedvalue16(
1067 (const unicode_mappings16
*)
1068 ((const u_int32_t
*)__CFUniCharBMPPrecompDestinationTable
+ (value
& 0xFFFF)),
1069 (value
>> 16), base
);
1076 * prioritysort - order combining chars into canonical order
1078 * Similar to CFUniCharPrioritySort
1081 prioritysort(u_int16_t
* characters
, int count
)
1084 u_int16_t
*ch1
, *ch2
;
1088 end
= characters
+ count
;
1092 ch2
= characters
+ 1;
1093 p2
= get_combining_class(*ch1
);
1096 p2
= get_combining_class(*ch2
);
1097 if (p1
> p2
&& p2
!= 0) {
1106 * Make sure that p2 contains the combining class for the
1107 * character now stored at *ch2. This isn't required for
1108 * correctness, but it will be more efficient if a character
1109 * with a large combining class has to "bubble past" several
1110 * characters with lower combining classes.
1122 * Invalid NTFS filename characters are encoded using the
1123 * SFM (Services for Macintosh) private use Unicode characters.
1125 * These should only be used for SMB, MSDOS or NTFS.
1127 * Illegal NTFS Char SFM Unicode Char
1128 * ----------------------------------------
1129 * 0x01-0x1f 0xf001-0xf01f
1138 * ' ' 0xf028 (Only if last char of the name)
1139 * '.' 0xf029 (Only if last char of the name)
1140 * ----------------------------------------
1142 * Reference: http://support.microsoft.com/kb/q117258/
1145 #define MAX_SFM2MAC 0x29
1146 #define SFMCODE_PREFIX_MASK 0xf000
1149 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
1150 * SFM had no conversion for the colon. There is a conversion for the
1151 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
1152 * is a slash and a slash is a colon. So we can just replace the slash with the
1153 * colon in our tables and everything will just work.
1157 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 00 - 07 */
1158 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 08 - 0F */
1159 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 10 - 17 */
1160 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 18 - 1F */
1161 0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c, /* 20 - 27 */
1162 0x20, 0x2e /* 28 - 29 */
1164 #define SFM2MAC_LEN ((sizeof(sfm2mac))/sizeof(sfm2mac[0]))
1168 0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27, /* 20 - 27 */
1169 0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22, /* 28 - 2f */
1170 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 30 - 37 */
1171 0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25, /* 38 - 3f */
1172 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 40 - 47 */
1173 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 48 - 4f */
1174 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 50 - 57 */
1175 0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f, /* 58 - 5f */
1176 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 60 - 67 */
1177 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 68 - 6f */
1178 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 70 - 77 */
1179 0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f /* 78 - 7f */
1181 #define MAC2SFM_LEN ((sizeof(mac2sfm))/sizeof(mac2sfm[0]))
1185 * Encode illegal NTFS filename characters into SFM Private Unicode characters
1187 * Assumes non-zero ASCII input.
1190 ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
)
1192 /* The last character of filename cannot be a space or period. */
1194 if (ucs_ch
== 0x20) {
1196 } else if (ucs_ch
== 0x2e) {
1200 /* 0x01 - 0x1f is simple transformation. */
1201 if (ucs_ch
<= 0x1f) {
1202 return ucs_ch
| 0xf000;
1203 } else { /* 0x20 - 0x7f */
1206 assert((ucs_ch
- 0x0020) < MAC2SFM_LEN
);
1207 lsb
= mac2sfm
[ucs_ch
- 0x0020];
1208 if (lsb
!= ucs_ch
) {
1209 return 0xf000 | lsb
;
1216 * Decode any SFM Private Unicode characters
1219 sfm_to_ucs(u_int16_t ucs_ch
)
1221 if (((ucs_ch
& 0xffC0) == SFMCODE_PREFIX_MASK
) &&
1222 ((ucs_ch
& 0x003f) <= MAX_SFM2MAC
)) {
1223 assert((ucs_ch
& 0x003f) < SFM2MAC_LEN
);
1224 ucs_ch
= sfm2mac
[ucs_ch
& 0x003f];