2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 Includes Unicode 3.2 decomposition code derived from Core Foundation
33 #include <sys/param.h>
34 #include <sys/utfconv.h>
35 #include <sys/errno.h>
36 #include <sys/malloc.h>
37 #include <libkern/OSByteOrder.h>
40 * UTF-8 (Unicode Transformation Format)
42 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
43 * character as a sequence of one to four bytes. Only the shortest form
44 * required to represent the significant Unicode bits is legal.
46 * UTF-8 Multibyte Codes
48 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
49 * -----------------------------------------------------------------------------
50 * 1 7 0x0000 0x007F 0xxxxxxx
51 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
52 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
53 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
54 * -----------------------------------------------------------------------------
58 #define UNICODE_TO_UTF8_LEN(c) \
59 ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
61 #define UCS_ALT_NULL 0x2400
63 /* Surrogate Pair Constants */
64 #define SP_HALF_SHIFT 10
65 #define SP_HALF_BASE 0x0010000UL
66 #define SP_HALF_MASK 0x3FFUL
68 #define SP_HIGH_FIRST 0xD800UL
69 #define SP_HIGH_LAST 0xDBFFUL
70 #define SP_LOW_FIRST 0xDC00UL
71 #define SP_LOW_LAST 0xDFFFUL
74 #include "vfs_utfconvdata.h"
78 * Test for a combining character.
80 * Similar to __CFUniCharIsNonBaseCharacter except that
81 * unicode_combinable also includes Hangul Jamo characters.
84 unicode_combinable(u_int16_t character
)
86 const u_int8_t
*bitmap
= __CFUniCharCombiningBitmap
;
89 if (character
< 0x0300)
92 value
= bitmap
[(character
>> 8) & 0xFF];
97 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
98 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
104 * Test for a precomposed character.
106 * Similar to __CFUniCharIsDecomposableCharacter.
109 unicode_decomposeable(u_int16_t character
) {
110 const u_int8_t
*bitmap
= __CFUniCharDecomposableBitmap
;
113 if (character
< 0x00C0)
116 value
= bitmap
[(character
>> 8) & 0xFF];
121 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
122 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
129 * Get the combining class.
131 * Similar to CFUniCharGetCombiningPropertyForCharacter.
133 static inline u_int8_t
134 get_combining_class(u_int16_t character
) {
135 const u_int8_t
*bitmap
= __CFUniCharCombiningPropertyBitmap
;
137 u_int8_t value
= bitmap
[(character
>> 8)];
140 bitmap
= bitmap
+ (value
* 256);
141 return bitmap
[character
% 256];
/*
 * Forward declarations for the file-local helpers whose definitions
 * appear later in this file.
 */
static int
unicode_decompose(u_int16_t character, u_int16_t *convertedChars);

static u_int16_t
unicode_combine(u_int16_t base, u_int16_t combining);

static void
priortysort(u_int16_t *characters, int count);

static u_int16_t
ucs_to_sfm(u_int16_t ucs_ch, int lastchar);

static u_int16_t
sfm_to_ucs(u_int16_t ucs_ch);
158 char utf_extrabytes
[32] = {
159 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
160 -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1
163 const char hexdigits
[16] = {
164 '0', '1', '2', '3', '4', '5', '6', '7',
165 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
169 * utf8_encodelen - Calculate the UTF-8 encoding length
171 * This function takes a Unicode input string, ucsp, of ucslen bytes
172 * and calculates the size of the UTF-8 output in bytes (not including
173 * a NULL termination byte). The string must reside in kernel memory.
175 * If '/' chars are possible in the Unicode input then an alternate
176 * (replacement) char should be provided in altslash.
179 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
181 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
183 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
185 * UTF_DECOMPOSED: generate fully decomposed output
187 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
193 utf8_encodelen(const u_int16_t
* ucsp
, size_t ucslen
, u_int16_t altslash
, int flags
)
196 u_int16_t
* chp
= NULL
;
197 u_int16_t sequence
[8];
200 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
201 int decompose
= (flags
& UTF_DECOMPOSED
);
204 charcnt
= ucslen
/ 2;
207 while (charcnt
-- > 0) {
214 ucs_ch
= OSSwapInt16(ucs_ch
);
217 ucs_ch
= altslash
? altslash
: '_';
218 } else if (ucs_ch
== '\0') {
219 ucs_ch
= UCS_ALT_NULL
;
220 } else if (decompose
&& unicode_decomposeable(ucs_ch
)) {
221 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
223 ucs_ch
= sequence
[0];
227 len
+= UNICODE_TO_UTF8_LEN(ucs_ch
);
235 * utf8_encodestr - Encodes a Unicode string to UTF-8
238 * The resulting UTF-8 string is NULL terminated.
240 * If '/' chars are allowed on disk then an alternate
241 * (replacement) char must be provided in altslash.
244 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
246 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
248 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
250 * UTF_DECOMPOSED: generate fully decomposed output
252 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
255 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
257 * EINVAL: Illegal char found; char was replaced by an '_'.
260 utf8_encodestr(const u_int16_t
* ucsp
, size_t ucslen
, u_int8_t
* utf8p
,
261 size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
)
266 u_int16_t
* chp
= NULL
;
267 u_int16_t sequence
[8];
270 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
271 int nullterm
= ((flags
& UTF_NO_NULL_TERM
) == 0);
272 int decompose
= (flags
& UTF_DECOMPOSED
);
273 int sfmconv
= (flags
& UTF_SFM_CONVERSIONS
);
277 bufend
= bufstart
+ buflen
;
280 charcnt
= ucslen
/ 2;
282 while (charcnt
-- > 0) {
287 ucs_ch
= swapbytes
? OSSwapInt16(*ucsp
++) : *ucsp
++;
289 if (decompose
&& unicode_decomposeable(ucs_ch
)) {
290 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
292 ucs_ch
= sequence
[0];
297 /* Slash and NULL are not permitted */
305 } else if (ucs_ch
== '\0') {
306 ucs_ch
= UCS_ALT_NULL
;
309 if (ucs_ch
< 0x0080) {
310 if (utf8p
>= bufend
) {
311 result
= ENAMETOOLONG
;
316 } else if (ucs_ch
< 0x800) {
317 if ((utf8p
+ 1) >= bufend
) {
318 result
= ENAMETOOLONG
;
321 *utf8p
++ = 0xc0 | (ucs_ch
>> 6);
322 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
325 /* These chars never valid Unicode. */
326 if (ucs_ch
== 0xFFFE || ucs_ch
== 0xFFFF) {
331 /* Combine valid surrogate pairs */
332 if (ucs_ch
>= SP_HIGH_FIRST
&& ucs_ch
<= SP_HIGH_LAST
337 ch2
= swapbytes
? OSSwapInt16(*ucsp
) : *ucsp
;
338 if (ch2
>= SP_LOW_FIRST
&& ch2
<= SP_LOW_LAST
) {
339 pair
= ((ucs_ch
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
)
340 + (ch2
- SP_LOW_FIRST
) + SP_HALF_BASE
;
341 if ((utf8p
+ 3) >= bufend
) {
342 result
= ENAMETOOLONG
;
347 *utf8p
++ = 0xf0 | (pair
>> 18);
348 *utf8p
++ = 0x80 | (0x3f & (pair
>> 12));
349 *utf8p
++ = 0x80 | (0x3f & (pair
>> 6));
350 *utf8p
++ = 0x80 | (0x3f & pair
);
353 } else if (sfmconv
) {
354 ucs_ch
= sfm_to_ucs(ucs_ch
);
355 if (ucs_ch
< 0x0080) {
356 if (utf8p
>= bufend
) {
357 result
= ENAMETOOLONG
;
364 if ((utf8p
+ 2) >= bufend
) {
365 result
= ENAMETOOLONG
;
368 *utf8p
++ = 0xe0 | (ucs_ch
>> 12);
369 *utf8p
++ = 0x80 | (0x3f & (ucs_ch
>> 6));
370 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
374 *utf8len
= utf8p
- bufstart
;
383 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
386 * The input UTF-8 string does not need to be null terminated
389 * If '/' chars are allowed on disk then an alternate
390 * (replacement) char must be provided in altslash.
393 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
395 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
397 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
399 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
401 * UTF_PRECOMPOSED: generate precomposed output (NFC)
403 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
406 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
408 * EINVAL: Illegal UTF-8 sequence found.
411 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
,
412 size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
)
420 int decompose
, precompose
, swapbytes
, escaping
;
424 decompose
= (flags
& UTF_DECOMPOSED
);
425 precompose
= (flags
& UTF_PRECOMPOSED
);
426 swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
427 escaping
= (flags
& UTF_ESCAPE_ILLEGAL
);
428 sfmconv
= (flags
& UTF_SFM_CONVERSIONS
);
431 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
433 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
437 /* check for ascii */
439 ucs_ch
= sfmconv
? ucs_to_sfm(byte
, utf8len
== 0) : byte
;
443 extrabytes
= utf_extrabytes
[byte
>> 3];
444 if ((extrabytes
< 0) || ((int)utf8len
< extrabytes
)) {
447 utf8len
-= extrabytes
;
449 switch (extrabytes
) {
451 ch
= byte
; ch
<<= 6; /* 1st byte */
452 byte
= *utf8p
++; /* 2nd byte */
453 if ((byte
>> 6) != 2)
462 ch
= byte
; ch
<<= 6; /* 1st byte */
463 byte
= *utf8p
++; /* 2nd byte */
464 if ((byte
>> 6) != 2)
466 ch
+= byte
; ch
<<= 6;
467 byte
= *utf8p
++; /* 3rd byte */
468 if ((byte
>> 6) != 2)
477 if (ch
== 0xFFFE || ch
== 0xFFFF)
483 ch
= byte
; ch
<<= 6; /* 1st byte */
484 byte
= *utf8p
++; /* 2nd byte */
485 if ((byte
>> 6) != 2)
487 ch
+= byte
; ch
<<= 6;
488 byte
= *utf8p
++; /* 3rd byte */
489 if ((byte
>> 6) != 2)
491 ch
+= byte
; ch
<<= 6;
492 byte
= *utf8p
++; /* 4th byte */
493 if ((byte
>> 6) != 2)
496 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
497 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
498 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
500 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
503 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
504 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
) {
508 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
515 if (unicode_decomposeable(ucs_ch
)) {
516 u_int16_t sequence
[8];
519 /* Before decomposing a new unicode character, sort
520 * previous combining characters, if any, and reset
523 if (combcharcnt
> 1) {
524 priortysort(ucsp
- combcharcnt
, combcharcnt
);
528 count
= unicode_decompose(ucs_ch
, sequence
);
529 for (i
= 0; i
< count
; ++i
) {
530 ucs_ch
= sequence
[i
];
531 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
535 combcharcnt
+= count
- 1;
538 } else if (precompose
&& (ucsp
!= bufstart
)) {
539 u_int16_t composite
, base
;
541 if (unicode_combinable(ucs_ch
)) {
542 base
= swapbytes
? OSSwapInt16(*(ucsp
- 1)) : *(ucsp
- 1);
543 composite
= unicode_combine(base
, ucs_ch
);
550 if (ucs_ch
== UCS_ALT_NULL
)
553 if (ucs_ch
== altslash
)
557 * Make multiple combining character sequences canonical
559 if (unicode_combinable(ucs_ch
)) {
560 ++combcharcnt
; /* start tracking a run */
561 } else if (combcharcnt
) {
562 if (combcharcnt
> 1) {
563 priortysort(ucsp
- combcharcnt
, combcharcnt
);
565 combcharcnt
= 0; /* start over */
568 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
572 * Escape illegal UTF-8 into something legal.
588 utf8len
+= extrabytes
;
591 if ((ucsp
+ 2) >= bufend
)
594 /* Make a previous combining sequence canonical. */
595 if (combcharcnt
> 1) {
596 priortysort(ucsp
- combcharcnt
, combcharcnt
);
601 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
602 ucs_ch
= hexdigits
[byte
>> 4];
603 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
604 ucs_ch
= hexdigits
[byte
& 0x0F];
605 *ucsp
++ = swapbytes
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
;
608 * Make a previous combining sequence canonical
610 if (combcharcnt
> 1) {
611 priortysort(ucsp
- combcharcnt
, combcharcnt
);
614 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
619 result
= ENAMETOOLONG
;
625 * utf8_validatestr - Check for a valid UTF-8 string.
628 utf8_validatestr(const u_int8_t
* utf8p
, size_t utf8len
)
635 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
637 continue; /* plain ascii */
639 extrabytes
= utf_extrabytes
[byte
>> 3];
641 if (utf8len
< extrabytes
)
643 utf8len
-= extrabytes
;
645 switch (extrabytes
) {
647 ch
= byte
; ch
<<= 6; /* 1st byte */
648 byte
= *utf8p
++; /* 2nd byte */
649 if ((byte
>> 6) != 2)
657 ch
= byte
; ch
<<= 6; /* 1st byte */
658 byte
= *utf8p
++; /* 2nd byte */
659 if ((byte
>> 6) != 2)
661 ch
+= byte
; ch
<<= 6;
662 byte
= *utf8p
++; /* 3rd byte */
663 if ((byte
>> 6) != 2)
672 if (ch
== 0xFFFE || ch
== 0xFFFF)
677 ch
= byte
; ch
<<= 6; /* 1st byte */
678 byte
= *utf8p
++; /* 2nd byte */
679 if ((byte
>> 6) != 2)
681 ch
+= byte
; ch
<<= 6;
682 byte
= *utf8p
++; /* 3rd byte */
683 if ((byte
>> 6) != 2)
685 ch
+= byte
; ch
<<= 6;
686 byte
= *utf8p
++; /* 4th byte */
687 if ((byte
>> 6) != 2)
690 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
691 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
692 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
694 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
695 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
)
709 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
711 * This function takes an UTF-8 input string, instr, of inlen bytes
712 * and produces normalized UTF-8 output into a buffer of buflen bytes
713 * pointed to by outstr. The size of the output in bytes (not including
714 * a NULL termination byte) is returned in outlen. In-place conversions
715 * are not supported (i.e. instr != outstr).
718 * UTF_DECOMPOSED: output string will be fully decomposed (NFD)
720 * UTF_PRECOMPOSED: output string will be precomposed (NFC)
722 * UTF_NO_NULL_TERM: do not add null termination to output string
724 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
727 * ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
729 * EINVAL: illegal UTF-8 sequence encountered or invalid flags
732 utf8_normalizestr(const u_int8_t
* instr
, size_t inlen
, u_int8_t
* outstr
,
733 size_t *outlen
, size_t buflen
, int flags
)
735 u_int16_t unicodebuf
[32];
736 u_int16_t
* unistr
= NULL
;
737 size_t unicode_bytes
;
740 u_int8_t
*outbufstart
, *outbufend
;
741 const u_int8_t
*inbufstart
;
743 int decompose
, precompose
;
746 if (flags
& ~(UTF_DECOMPOSED
| UTF_PRECOMPOSED
| UTF_NO_NULL_TERM
| UTF_ESCAPE_ILLEGAL
)) {
749 decompose
= (flags
& UTF_DECOMPOSED
);
750 precompose
= (flags
& UTF_PRECOMPOSED
);
751 if ((decompose
&& precompose
) || (!decompose
&& !precompose
)) {
754 outbufstart
= outstr
;
755 outbufend
= outbufstart
+ buflen
;
759 while (inlen
-- > 0 && (byte
= *instr
++) != '\0') {
760 if (outstr
>= outbufend
) {
761 result
= ENAMETOOLONG
;
767 /* ASCII is already normalized. */
771 *outlen
= outstr
- outbufstart
;
772 if (((flags
& UTF_NO_NULL_TERM
) == 0)) {
773 if (outstr
< outbufend
)
776 result
= ENAMETOOLONG
;
782 * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
783 * functions to perform the normalization. Since this will
784 * presumably be used to normalize filenames in the back-end
785 * (on disk or over-the-wire), it should be fast enough.
789 /* Make sure the input size is reasonable. */
790 if (inbuflen
> MAXPATHLEN
) {
791 result
= ENAMETOOLONG
;
795 * Compute worst case Unicode buffer size.
797 * For pre-composed output, every UTF-8 input byte will be at
798 * most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
799 * (smallest composite char sequence) may yield 6 Unicode bytes
800 * (1 base char + 2 combining chars).
802 unicode_bytes
= precompose
? (inbuflen
* 2) : (inbuflen
* 3);
804 if (unicode_bytes
<= sizeof(unicodebuf
))
805 unistr
= &unicodebuf
[0];
807 MALLOC(unistr
, u_int16_t
*, unicode_bytes
, M_TEMP
, M_WAITOK
);
809 /* Normalize the string. */
810 result
= utf8_decodestr(inbufstart
, inbuflen
, unistr
, &unicode_bytes
,
811 unicode_bytes
, 0, flags
& ~UTF_NO_NULL_TERM
);
813 /* Put results back into UTF-8. */
814 result
= utf8_encodestr(unistr
, unicode_bytes
, outbufstart
,
815 &uft8_bytes
, buflen
, 0, UTF_NO_NULL_TERM
);
816 outstr
= outbufstart
+ uft8_bytes
;
818 if (unistr
&& unistr
!= &unicodebuf
[0]) {
819 FREE(unistr
, M_TEMP
);
826 * Unicode 3.2 decomposition code (derived from Core Foundation)
832 } unicode_mappings32
;
834 static inline u_int32_t
835 getmappedvalue32(const unicode_mappings32
*theTable
, u_int32_t numElem
,
838 const unicode_mappings32
*p
, *q
, *divider
;
840 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
846 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
847 if (character
< divider
->_key
) { q
= divider
- 1; }
848 else if (character
> divider
->_key
) { p
= divider
+ 1; }
849 else { return (divider
->_value
); }
854 #define RECURSIVE_DECOMPOSITION (1 << 15)
855 #define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)
860 } unicode_mappings16
;
862 static inline u_int16_t
863 getmappedvalue16(const unicode_mappings16
*theTable
, u_int32_t numElem
,
866 const unicode_mappings16
*p
, *q
, *divider
;
868 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
874 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
875 if (character
< divider
->_key
)
877 else if (character
> divider
->_key
)
880 return (divider
->_value
);
887 unicode_recursive_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
893 const u_int16_t
*bmpMappings
;
894 u_int32_t usedLength
;
896 value
= getmappedvalue16(
897 (const unicode_mappings16
*)__CFUniCharDecompositionTable
,
898 __UniCharDecompositionTableLength
, character
);
899 length
= EXTRACT_COUNT(value
);
900 firstChar
= value
& 0x0FFF;
902 bmpMappings
= (length
== 1 ? &theChar
: __CFUniCharMultipleDecompositionTable
+ firstChar
);
905 if (value
& RECURSIVE_DECOMPOSITION
) {
906 usedLength
= unicode_recursive_decompose((u_int16_t
)*bmpMappings
, convertedChars
);
908 --length
; /* Decrement for the first char */
912 convertedChars
+= usedLength
;
915 usedLength
+= length
;
918 *(convertedChars
++) = *(bmpMappings
++);
923 #define HANGUL_SBASE 0xAC00
924 #define HANGUL_LBASE 0x1100
925 #define HANGUL_VBASE 0x1161
926 #define HANGUL_TBASE 0x11A7
928 #define HANGUL_SCOUNT 11172
929 #define HANGUL_LCOUNT 19
930 #define HANGUL_VCOUNT 21
931 #define HANGUL_TCOUNT 28
932 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
935 * unicode_decompose - decompose a composed Unicode char
937 * Composed Unicode characters are forbidden on
938 * HFS Plus volumes. ucs_decompose will convert a
939 * composed character into its correct decomposed
942 * Similar to CFUniCharDecomposeCharacter
945 unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
947 if ((character
>= HANGUL_SBASE
) &&
948 (character
<= (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
951 character
-= HANGUL_SBASE
;
952 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
954 *(convertedChars
++) =
955 character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
956 *(convertedChars
++) =
957 (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
959 *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
962 return (unicode_recursive_decompose(character
, convertedChars
));
967 * unicode_combine - generate a precomposed Unicode char
969 * Precomposed Unicode characters are required for some volume
970 * formats and network protocols. unicode_combine will combine
971 * a decomposed character sequence into a single precomposed
972 * (composite) character.
974 * Similar to CFUniCharPrecomposeCharacter but unicode_combine
975 * also handles Hangul Jamo characters.
978 unicode_combine(u_int16_t base
, u_int16_t combining
)
983 if ((combining
>= HANGUL_VBASE
) && (combining
< (HANGUL_TBASE
+ HANGUL_TCOUNT
))) {
984 /* 2 char Hangul sequences */
985 if ((combining
< (HANGUL_VBASE
+ HANGUL_VCOUNT
)) &&
986 (base
>= HANGUL_LBASE
&& base
< (HANGUL_LBASE
+ HANGUL_LCOUNT
))) {
987 return (HANGUL_SBASE
+
988 ((base
- HANGUL_LBASE
)*(HANGUL_VCOUNT
*HANGUL_TCOUNT
)) +
989 ((combining
- HANGUL_VBASE
)*HANGUL_TCOUNT
));
992 /* 3 char Hangul sequences */
993 if ((combining
> HANGUL_TBASE
) &&
994 (base
>= HANGUL_SBASE
&& base
< (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
995 if ((base
- HANGUL_SBASE
) % HANGUL_TCOUNT
)
998 return (base
+ (combining
- HANGUL_TBASE
));
1002 value
= getmappedvalue32(
1003 (const unicode_mappings32
*)__CFUniCharPrecompSourceTable
,
1004 __CFUniCharPrecompositionTableLength
, combining
);
1007 value
= getmappedvalue16(
1008 (const unicode_mappings16
*)
1009 ((const u_int32_t
*)__CFUniCharBMPPrecompDestinationTable
+ (value
& 0xFFFF)),
1010 (value
>> 16), base
);
1017 * priortysort - order combining chars into canonical order
1019 * Similar to CFUniCharPrioritySort
1022 priortysort(u_int16_t
* characters
, int count
)
1025 u_int16_t
*ch1
, *ch2
;
1029 end
= characters
+ count
;
1033 ch2
= characters
+ 1;
1034 p2
= get_combining_class(*ch1
);
1037 p2
= get_combining_class(*ch2
);
1038 if (p1
> p2
&& p2
!= 0) {
1047 * Make sure that p2 contains the combining class for the
1048 * character now stored at *ch2. This isn't required for
1049 * correctness, but it will be more efficient if a character
1050 * with a large combining class has to "bubble past" several
1051 * characters with lower combining classes.
1063 * Invalid NTFS filename characters are encoded using the
1064 * SFM (Services for Macintosh) private use Unicode characters.
1066 * These should only be used for SMB, MSDOS or NTFS.
1068 * Illegal NTFS Char SFM Unicode Char
1069 * ----------------------------------------
1070 * 0x01-0x1f 0xf001-0xf01f
1079 * ' ' 0xf028 (Only if last char of the name)
1080 * '.' 0xf029 (Only if last char of the name)
1081 * ----------------------------------------
1083 * Reference: http://support.microsoft.com/kb/q117258/
1086 #define MAX_SFM2MAC 0x29
1087 #define SFMCODE_PREFIX_MASK 0xf000
1090 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
1091 * SFM had no conversion for the colon. There is a conversion for the
1092 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
1093 * is a slash and a slash is a colon. So we can just replace the slash with the
1094 * colon in our tables and everything will just work.
1098 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 00 - 07 */
1099 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 08 - 0F */
1100 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 10 - 17 */
1101 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 18 - 1F */
1102 0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c, /* 20 - 27 */
1103 0x20, 0x2e /* 28 - 29 */
1108 0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27, /* 20 - 27 */
1109 0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22, /* 28 - 2f */
1110 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 30 - 37 */
1111 0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25, /* 38 - 3f */
1112 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 40 - 47 */
1113 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 48 - 4f */
1114 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 50 - 57 */
1115 0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f, /* 58 - 5f */
1116 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 60 - 67 */
1117 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 68 - 6f */
1118 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 70 - 77 */
1119 0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f /* 78 - 7f */
1124 * Encode illegal NTFS filename characters into SFM Private Unicode characters
1126 * Assumes non-zero ASCII input.
1129 ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
)
1131 /* The last character of filename cannot be a space or period. */
1135 else if (ucs_ch
== 0x2e)
1138 /* 0x01 - 0x1f is simple transformation. */
1139 if (ucs_ch
<= 0x1f) {
1140 return (ucs_ch
| 0xf000);
1141 } else /* 0x20 - 0x7f */ {
1144 lsb
= mac2sfm
[ucs_ch
- 0x0020];
1146 return(0xf000 | lsb
);
1152 * Decode any SFM Private Unicode characters
1155 sfm_to_ucs(u_int16_t ucs_ch
)
1157 if (((ucs_ch
& 0xffC0) == SFMCODE_PREFIX_MASK
) &&
1158 ((ucs_ch
& 0x003f) <= MAX_SFM2MAC
)) {
1159 ucs_ch
= sfm2mac
[ucs_ch
& 0x003f];