2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 Includes Unicode 3.2 decomposition code derived from Core Foundation
33 #include <sys/param.h>
34 #include <sys/utfconv.h>
35 #include <sys/errno.h>
36 #include <sys/malloc.h>
37 #include <libkern/OSByteOrder.h>
40 * UTF-8 (Unicode Transformation Format)
42 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
43 * character as a sequence of one to four bytes. Only the shortest form
44 * required to represent the significant Unicode bits is legal.
46 * UTF-8 Multibyte Codes
48 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
49 * -----------------------------------------------------------------------------
50 * 1 7 0x0000 0x007F 0xxxxxxx
51 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
52 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
53 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
54 * -----------------------------------------------------------------------------
/*
 * Number of UTF-8 bytes produced for one 16-bit code unit: 1 below 0x0080,
 * 2 below 0x0800, otherwise 3 -- except that a unit whose top five bits
 * match 0xD800 (the surrogate range 0xD800-0xDFFF) is counted as 2, so the
 * two halves of a surrogate pair add up to a 4-byte sequence.
 * The argument is evaluated more than once; do not pass an expression with
 * side effects.
 */
58 #define UNICODE_TO_UTF8_LEN(c) \
59 ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
/* Code unit substituted for a NUL ('\0') character by the encoders in this file. */
61 #define UCS_ALT_NULL 0x2400
63 /* Surrogate Pair Constants */
/*
 * Each surrogate half carries SP_HALF_SHIFT (10) payload bits, masked by
 * SP_HALF_MASK; SP_HALF_BASE (0x10000) is added to / removed from the
 * combined 20-bit payload when converting to or from a code point.
 */
64 #define SP_HALF_SHIFT 10
65 #define SP_HALF_BASE 0x0010000u
66 #define SP_HALF_MASK 0x3FFu
/* Inclusive bounds of the high (leading) and low (trailing) surrogate ranges. */
68 #define SP_HIGH_FIRST 0xD800u
69 #define SP_HIGH_LAST 0xDBFFu
70 #define SP_LOW_FIRST 0xDC00u
71 #define SP_LOW_LAST 0xDFFFu
74 #include "vfs_utfconvdata.h"
78 * Test for a combining character.
80 * Similar to __CFUniCharIsNonBaseCharacter except that
81 * unicode_combinable also includes Hangul Jamo characters.
/*
 * unicode_combinable: bitmap lookup in __CFUniCharCombiningBitmap that
 * yields 1 or 0 for a 16-bit character.
 * NOTE(review): several lines of this function (return type, the
 * declaration of `value`, the statements controlled by the guards) are
 * absent from this listing; the comments describe the visible lines only.
 */
84 unicode_combinable(u_int16_t character
)
86 const u_int8_t
*bitmap
= __CFUniCharCombiningBitmap
;
/* Guard for characters below 0x0300; the statement it controls is not visible here. */
89 if (character
< 0x0300)
/* The first 256 bytes of the bitmap are an index keyed by the character's high byte. */
92 value
= bitmap
[(character
>> 8) & 0xFF];
/* Index value N selects the N-th 32-byte (256-bit) plane stored after the 256-byte index. */
97 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
/* Within the plane, the low byte picks the byte (low / 8) and the bit (character % 8). */
98 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
104 * Test for a precomposed character.
106 * Similar to __CFUniCharIsDecomposableCharacter.
/*
 * unicode_decomposeable: bitmap lookup in __CFUniCharDecomposableBitmap
 * that yields 1 or 0 for a 16-bit character. Same layout as the lookup in
 * unicode_combinable: a 256-byte index followed by 32-byte planes.
 * NOTE(review): the declaration of `value` and the statements controlled by
 * the guards are absent from this listing.
 */
109 unicode_decomposeable(u_int16_t character
) {
110 const u_int8_t
*bitmap
= __CFUniCharDecomposableBitmap
;
/* Guard for characters below 0x00C0; the statement it controls is not visible here. */
113 if (character
< 0x00C0)
/* Index byte for the character's high byte. */
116 value
= bitmap
[(character
>> 8) & 0xFF];
/* Step to the 32-byte plane selected by the index value (planes start after the 256-byte index). */
121 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
/* Test the bit for the character's low byte. */
122 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
129 * Get the combining class.
131 * Similar to CFUniCharGetCombiningPropertyForCharacter.
/*
 * get_combining_class: two-level table lookup in
 * __CFUniCharCombiningPropertyBitmap returning a u_int8_t property value
 * for a 16-bit character.
 * NOTE(review): lines between the statements below are absent from this
 * listing, so any condition on `value` is not visible.
 */
133 static inline u_int8_t
134 get_combining_class(u_int16_t character
) {
135 const u_int8_t
*bitmap
= __CFUniCharCombiningPropertyBitmap
;
/* First level: one entry per high byte. */
137 u_int8_t value
= bitmap
[(character
>> 8)];
/* Second level: `value` selects a 256-byte block, indexed by the low byte. */
140 bitmap
= bitmap
+ (value
* 256);
141 return bitmap
[character
% 256];
/* Forward declarations of file-local helpers that are used before their definitions. */
/* Writes the decomposition of `character` into convertedChars; callers use the int result as a count. */
147 static int unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
);
/* Combines a base character with a combining character into one 16-bit result. */
149 static u_int16_t
unicode_combine(u_int16_t base
, u_int16_t combining
);
/* Reorders a run of `count` characters in place (called on runs of combining characters). */
151 static void prioritysort(u_int16_t
* characters
, int count
);
/* Maps a character to its SFM private-use form; `lastchar` is non-zero for the final character. */
153 static u_int16_t
ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
);
/* Maps an SFM private-use character back through the sfm2mac table. */
155 static u_int16_t
sfm_to_ucs(u_int16_t ucs_ch
);
/*
 * Indexed by the top five bits of a UTF-8 byte (byte >> 3). Gives the
 * number of bytes that follow a lead byte: 1 for 0xC0-0xDF, 2 for
 * 0xE0-0xEF, 3 for 0xF0-0xF7. Entries for 0x80-0xBF and 0xF8-0xFF are -1.
 * The first 16 entries (0x00-0x7F) are 0.
 * NOTE(review): the element type is plain `char`, whose signedness is
 * implementation-defined; the -1 entries rely on it being signed.
 */
158 char utf_extrabytes
[32] = {
159 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
160 -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1
/* Upper-case hexadecimal digit for a 4-bit value; utf8_decodestr indexes it with byte >> 4 and byte & 0x0F. */
163 const char hexdigits
[16] = {
164 '0', '1', '2', '3', '4', '5', '6', '7',
165 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
169 * utf8_encodelen - Calculate the UTF-8 encoding length
171 * This function takes a Unicode input string, ucsp, of ucslen bytes
172 * and calculates the size of the UTF-8 output in bytes (not including
173 * a NULL termination byte). The string must reside in kernel memory.
175 * If '/' chars are possible in the Unicode input then an alternate
176 * (replacement) char should be provided in altslash.
179 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
181 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
183 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
185 * UTF_DECOMPOSED: generate fully decomposed output
187 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
/*
 * utf8_encodelen: walks a buffer of 16-bit code units and accumulates, in
 * `len`, the UTF-8 size given by UNICODE_TO_UTF8_LEN for each unit.
 * ucslen is a byte count (it is halved to get the unit count).
 * NOTE(review): many lines of the body (local declarations, the fetch of
 * each unit, the '/' test, the handling of `extra`, the return) are absent
 * from this listing; the comments describe the visible lines only.
 */
193 utf8_encodelen(const u_int16_t
* ucsp
, size_t ucslen
, u_int16_t altslash
, int flags
)
196 u_int16_t
* chp
= NULL
;
/* Scratch buffer handed to unicode_decompose. */
197 u_int16_t sequence
[8];
200 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
201 int decompose
= (flags
& UTF_DECOMPOSED
);
/* Two bytes per 16-bit code unit. */
204 charcnt
= ucslen
/ 2;
207 while (charcnt
-- > 0) {
/* Byte-swap the unit (the condition guarding this line is not visible here). */
214 ucs_ch
= OSSwapInt16(ucs_ch
);
/* Replacement: altslash if non-zero, otherwise '_'. */
217 ucs_ch
= altslash
? altslash
: '_';
218 } else if (ucs_ch
== '\0') {
/* NUL is replaced by UCS_ALT_NULL. */
219 ucs_ch
= UCS_ALT_NULL
;
220 } else if (decompose
&& unicode_decomposeable(ucs_ch
)) {
/* `extra` = number of units in the decomposition beyond the first. */
221 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
223 ucs_ch
= sequence
[0];
/* Add this unit's UTF-8 size to the running total. */
227 len
+= UNICODE_TO_UTF8_LEN(ucs_ch
);
235 * utf8_encodestr - Encodes a Unicode string to UTF-8
238 * The resulting UTF-8 string is NULL terminated.
240 * If '/' chars are allowed on disk then an alternate
241 * (replacement) char must be provided in altslash.
244 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
246 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
248 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
250 * UTF_DECOMPOSED: generate fully decomposed output
252 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
255 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
257 * EINVAL: Illegal char found; char was replaced by an '_'.
/*
 * utf8_encodestr: converts 16-bit code units from ucsp into UTF-8 bytes at
 * utf8p, bounded by buflen, and stores the number of bytes written in
 * *utf8len. ucslen is a byte count (it is halved to get the unit count).
 * NOTE(review): many lines of the body (local declarations, loop exits,
 * the '/' handling, the terminator and the return) are absent from this
 * listing; the comments describe the visible lines only.
 */
260 utf8_encodestr(const u_int16_t
* ucsp
, size_t ucslen
, u_int8_t
* utf8p
,
261 size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
)
266 u_int16_t
* chp
= NULL
;
/* Scratch buffer handed to unicode_decompose. */
267 u_int16_t sequence
[8];
270 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
/* Non-zero unless the caller passed UTF_NO_NULL_TERM. */
271 int nullterm
= ((flags
& UTF_NO_NULL_TERM
) == 0);
272 int decompose
= (flags
& UTF_DECOMPOSED
);
273 int sfmconv
= (flags
& UTF_SFM_CONVERSIONS
);
/* One past the last writable output byte. */
277 bufend
= bufstart
+ buflen
;
/* Two bytes per 16-bit code unit. */
280 charcnt
= ucslen
/ 2;
282 while (charcnt
-- > 0) {
/* Fetch the next unit, byte-swapping it when UTF_REVERSE_ENDIAN is set. */
287 ucs_ch
= swapbytes
? OSSwapInt16(*ucsp
++) : *ucsp
++;
289 if (decompose
&& unicode_decomposeable(ucs_ch
)) {
/* `extra` = number of units in the decomposition beyond the first. */
290 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
292 ucs_ch
= sequence
[0];
297 /* Slash and NULL are not permitted */
305 } else if (ucs_ch
== '\0') {
306 ucs_ch
= UCS_ALT_NULL
;
/* One-byte form: needs 1 free output byte. */
309 if (ucs_ch
< 0x0080) {
310 if (utf8p
>= bufend
) {
311 result
= ENAMETOOLONG
;
/* Two-byte form: needs 2 free output bytes. */
316 } else if (ucs_ch
< 0x800) {
317 if ((utf8p
+ 1) >= bufend
) {
318 result
= ENAMETOOLONG
;
321 *utf8p
++ = 0xc0 | (ucs_ch
>> 6);
322 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
325 /* These chars never valid Unicode. */
326 if (ucs_ch
== 0xFFFE || ucs_ch
== 0xFFFF) {
331 /* Combine valid surrogate pairs */
332 if (ucs_ch
>= SP_HIGH_FIRST
&& ucs_ch
<= SP_HIGH_LAST
/* Peek at the following unit without consuming it. */
337 ch2
= swapbytes
? OSSwapInt16(*ucsp
) : *ucsp
;
338 if (ch2
>= SP_LOW_FIRST
&& ch2
<= SP_LOW_LAST
) {
/* Code point = (high payload << 10) + low payload + 0x10000. */
339 pair
= ((ucs_ch
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
)
340 + (ch2
- SP_LOW_FIRST
) + SP_HALF_BASE
;
/* Four-byte form: needs 4 free output bytes. */
341 if ((utf8p
+ 3) >= bufend
) {
342 result
= ENAMETOOLONG
;
347 *utf8p
++ = 0xf0 | (pair
>> 18);
348 *utf8p
++ = 0x80 | (0x3f & (pair
>> 12));
349 *utf8p
++ = 0x80 | (0x3f & (pair
>> 6));
350 *utf8p
++ = 0x80 | (0x3f & pair
);
353 } else if (sfmconv
) {
/* Map SFM private-use characters back before encoding. */
354 ucs_ch
= sfm_to_ucs(ucs_ch
);
355 if (ucs_ch
< 0x0080) {
356 if (utf8p
>= bufend
) {
357 result
= ENAMETOOLONG
;
/* Three-byte form: needs 3 free output bytes. */
364 if ((utf8p
+ 2) >= bufend
) {
365 result
= ENAMETOOLONG
;
368 *utf8p
++ = 0xe0 | (ucs_ch
>> 12);
369 *utf8p
++ = 0x80 | (0x3f & (ucs_ch
>> 6));
370 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
/* Report the number of bytes written. */
374 *utf8len
= utf8p
- bufstart
;
381 // Pushes a character taking account of combining character sequences
/*
 * push: tracks runs of combining characters through *combcharcnt while
 * characters are appended via *ucsp. When a non-combining character ends
 * a run longer than one, the run (the *combcharcnt units just before
 * *ucsp) is passed to prioritysort.
 * NOTE(review): the line that stores ucs_ch through *ucsp is absent from
 * this listing.
 */
382 static void push(uint16_t ucs_ch
, int *combcharcnt
, uint16_t **ucsp
)
385 * Make multiple combining character sequences canonical
387 if (unicode_combinable(ucs_ch
)) {
388 ++*combcharcnt
; /* start tracking a run */
389 } else if (*combcharcnt
) {
/* A single combining character needs no reordering. */
390 if (*combcharcnt
> 1) {
391 prioritysort(*ucsp
- *combcharcnt
, *combcharcnt
);
393 *combcharcnt
= 0; /* start over */
400 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
403 * The input UTF-8 string does not need to be null terminated
406 * If '/' chars are allowed on disk then an alternate
407 * (replacement) char must be provided in altslash.
410 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
412 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
414 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
416 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
418 * UTF_PRECOMPOSED: generate precomposed output (NFC)
420 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
423 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
425 * EINVAL: Illegal UTF-8 sequence found.
/*
 * utf8_decodestr: converts UTF-8 bytes from utf8p into 16-bit code units
 * at ucsp, bounded by buflen bytes, and stores the number of output bytes
 * in *ucslen.
 * NOTE(review): many lines of the body (local declarations, case labels,
 * gotos and their targets, several assignments, the return) are absent
 * from this listing; the comments describe the visible lines only.
 */
428 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
,
429 size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
)
437 int decompose
, precompose
, escaping
;
441 decompose
= (flags
& UTF_DECOMPOSED
);
442 precompose
= (flags
& UTF_PRECOMPOSED
);
443 escaping
= (flags
& UTF_ESCAPE_ILLEGAL
);
444 sfmconv
= (flags
& UTF_SFM_CONVERSIONS
);
/* buflen is in bytes; bufend points one past the last output unit. */
447 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
/* Stop after utf8len bytes or at the first NUL byte, whichever comes first. */
449 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
453 /* check for ascii */
/* utf8len was already decremented, so utf8len == 0 marks the final input byte. */
455 ucs_ch
= sfmconv
? ucs_to_sfm(byte
, utf8len
== 0) : byte
;
/* Number of bytes expected after this lead byte (-1 for bytes with no valid count). */
459 extrabytes
= utf_extrabytes
[byte
>> 3];
/* Guard: no valid count, or fewer bytes remain than the lead byte calls for. */
460 if ((extrabytes
< 0) || ((int)utf8len
< extrabytes
)) {
463 utf8len
-= extrabytes
;
465 switch (extrabytes
) {
/* Two-byte sequence: raw bytes are accumulated in ch, six bits per step. */
467 ch
= byte
; ch
<<= 6; /* 1st byte */
468 byte
= *utf8p
++; /* 2nd byte */
/* A trailing byte must have the form 10xxxxxx. */
469 if ((byte
>> 6) != 2)
/* Three-byte sequence. */
478 ch
= byte
; ch
<<= 6; /* 1st byte */
479 byte
= *utf8p
++; /* 2nd byte */
480 if ((byte
>> 6) != 2)
482 ch
+= byte
; ch
<<= 6;
483 byte
= *utf8p
++; /* 3rd byte */
484 if ((byte
>> 6) != 2)
493 if (ch
== 0xFFFE || ch
== 0xFFFF)
/* Four-byte sequence; the result is emitted as a surrogate pair. */
499 ch
= byte
; ch
<<= 6; /* 1st byte */
500 byte
= *utf8p
++; /* 2nd byte */
501 if ((byte
>> 6) != 2)
503 ch
+= byte
; ch
<<= 6;
504 byte
= *utf8p
++; /* 3rd byte */
505 if ((byte
>> 6) != 2)
507 ch
+= byte
; ch
<<= 6;
508 byte
= *utf8p
++; /* 4th byte */
509 if ((byte
>> 6) != 2)
/*
 * 0x03C82080 equals (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80, i.e.
 * the marker bits of the four raw bytes summed into ch (assuming the 4th
 * byte is added on a line absent from this listing). Removing SP_HALF_BASE
 * too leaves the 20-bit surrogate payload.
 */
512 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
/* High half: upper 10 payload bits. */
513 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
514 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
516 push(ucs_ch
, &combcharcnt
, &ucsp
);
/* Low half: lower 10 payload bits. */
519 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
520 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
) {
/* Decomposition path: emit every unit of the decomposition through push(). */
531 if (unicode_decomposeable(ucs_ch
)) {
532 u_int16_t sequence
[8];
535 count
= unicode_decompose(ucs_ch
, sequence
);
537 for (i
= 0; i
< count
; ++i
) {
541 push(sequence
[i
], &combcharcnt
, &ucsp
);
/* Precomposition path: only possible once at least one unit has been written. */
546 } else if (precompose
&& (ucsp
!= bufstart
)) {
547 u_int16_t composite
, base
;
549 if (unicode_combinable(ucs_ch
)) {
551 composite
= unicode_combine(base
, ucs_ch
);
558 if (ucs_ch
== UCS_ALT_NULL
)
561 if (ucs_ch
== altslash
)
564 push(ucs_ch
, &combcharcnt
, &ucsp
);
568 * Escape illegal UTF-8 into something legal.
/* Give back the bytes that were deducted for the rejected sequence. */
584 utf8len
+= extrabytes
;
587 if ((ucsp
+ 2) >= bufend
)
590 /* Make a previous combining sequence canonical. */
591 if (combcharcnt
> 1) {
592 prioritysort(ucsp
- combcharcnt
, combcharcnt
);
/* Hex digits of the offending byte: high nibble, then low nibble. */
598 ucs_ch
= hexdigits
[byte
>> 4];
600 ucs_ch
= hexdigits
[byte
& 0x0F];
604 * Make a previous combining sequence canonical
606 if (combcharcnt
> 1) {
607 prioritysort(ucsp
- combcharcnt
, combcharcnt
);
/* Output was built in host order; swap each unit when the caller asked for the opposite order. */
610 if (flags
& UTF_REVERSE_ENDIAN
) {
611 uint16_t *p
= bufstart
;
613 *p
= OSSwapInt16(*p
);
/* Output length in bytes. */
619 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
624 result
= ENAMETOOLONG
;
630 * utf8_validatestr - Check for a valid UTF-8 string.
/*
 * utf8_validatestr: scans up to utf8len bytes (stopping at a NUL byte) and
 * applies the same per-sequence checks as utf8_decodestr without
 * producing output.
 * NOTE(review): local declarations, case labels, the statements controlled
 * by each check and the returns are absent from this listing; the comments
 * describe the visible lines only.
 */
633 utf8_validatestr(const u_int8_t
* utf8p
, size_t utf8len
)
640 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
642 continue; /* plain ascii */
/* Number of bytes expected after this lead byte. */
644 extrabytes
= utf_extrabytes
[byte
>> 3];
/* Guard: fewer bytes remain than the lead byte calls for. */
646 if (utf8len
< extrabytes
)
648 utf8len
-= extrabytes
;
650 switch (extrabytes
) {
/* Two-byte sequence. */
652 ch
= byte
; ch
<<= 6; /* 1st byte */
653 byte
= *utf8p
++; /* 2nd byte */
/* A trailing byte must have the form 10xxxxxx. */
654 if ((byte
>> 6) != 2)
/* Three-byte sequence. */
662 ch
= byte
; ch
<<= 6; /* 1st byte */
663 byte
= *utf8p
++; /* 2nd byte */
664 if ((byte
>> 6) != 2)
666 ch
+= byte
; ch
<<= 6;
667 byte
= *utf8p
++; /* 3rd byte */
668 if ((byte
>> 6) != 2)
677 if (ch
== 0xFFFE || ch
== 0xFFFF)
/* Four-byte sequence. */
682 ch
= byte
; ch
<<= 6; /* 1st byte */
683 byte
= *utf8p
++; /* 2nd byte */
684 if ((byte
>> 6) != 2)
686 ch
+= byte
; ch
<<= 6;
687 byte
= *utf8p
++; /* 3rd byte */
688 if ((byte
>> 6) != 2)
690 ch
+= byte
; ch
<<= 6;
691 byte
= *utf8p
++; /* 4th byte */
692 if ((byte
>> 6) != 2)
/* 0x03C82080 is the sum of the four bytes' marker bits: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80. */
695 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
/* The value must split into a valid high and low surrogate. */
696 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
697 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
699 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
700 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
)
714 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
716 * This function takes an UTF-8 input string, instr, of inlen bytes
717 * and produces normalized UTF-8 output into a buffer of buflen bytes
718 * pointed to by outstr. The size of the output in bytes (not including
719 * a NULL termination byte) is returned in outlen. In-place conversions
720 * are not supported (i.e. instr != outstr).
723 * UTF_DECOMPOSED: output string will be fully decomposed (NFD)
725 * UTF_PRECOMPOSED: output string will be precomposed (NFC)
727 * UTF_NO_NULL_TERM: do not add null termination to output string
729 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
732 * ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
734 * EINVAL: illegal UTF-8 sequence encountered or invalid flags
/*
 * utf8_normalizestr: normalizes UTF-8 from instr into outstr by decoding
 * to 16-bit units with utf8_decodestr and re-encoding with utf8_encodestr.
 * A 32-unit stack buffer is used when it is large enough; otherwise a
 * temporary buffer is obtained with MALLOC and released with FREE.
 * NOTE(review): several lines of the body (local declarations, the ASCII
 * copy, gotos/labels, returns) are absent from this listing; the comments
 * describe the visible lines only.
 */
737 utf8_normalizestr(const u_int8_t
* instr
, size_t inlen
, u_int8_t
* outstr
,
738 size_t *outlen
, size_t buflen
, int flags
)
/* On-stack scratch space: 32 units = 64 bytes. */
740 u_int16_t unicodebuf
[32];
741 u_int16_t
* unistr
= NULL
;
742 size_t unicode_bytes
;
745 u_int8_t
*outbufstart
, *outbufend
;
746 const u_int8_t
*inbufstart
;
748 int decompose
, precompose
;
/* Guard: any flag outside the four supported ones. */
751 if (flags
& ~(UTF_DECOMPOSED
| UTF_PRECOMPOSED
| UTF_NO_NULL_TERM
| UTF_ESCAPE_ILLEGAL
)) {
754 decompose
= (flags
& UTF_DECOMPOSED
);
755 precompose
= (flags
& UTF_PRECOMPOSED
);
/* Guard: exactly one of the two normalization forms must be requested. */
756 if ((decompose
&& precompose
) || (!decompose
&& !precompose
)) {
759 outbufstart
= outstr
;
760 outbufend
= outbufstart
+ buflen
;
/* Scan up to inlen bytes, stopping at a NUL byte. */
764 while (inlen
-- > 0 && (byte
= *instr
++) != '\0') {
765 if (outstr
>= outbufend
) {
766 result
= ENAMETOOLONG
;
772 /* ASCII is already normalized. */
776 *outlen
= outstr
- outbufstart
;
/* A terminator is wanted unless UTF_NO_NULL_TERM was passed; it needs one free byte. */
777 if (((flags
& UTF_NO_NULL_TERM
) == 0)) {
778 if (outstr
< outbufend
)
781 result
= ENAMETOOLONG
;
787 * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
788 * functions to perform the normalization. Since this will
789 * presumably be used to normalize filenames in the back-end
790 * (on disk or over-the-wire), it should be fast enough.
794 /* Make sure the input size is reasonable. */
795 if (inbuflen
> MAXPATHLEN
) {
796 result
= ENAMETOOLONG
;
800 * Compute worst case Unicode buffer size.
802 * For pre-composed output, every UTF-8 input byte will be at
803 * most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
804 * (smallest composite char sequence) may yield 6 Unicode bytes
805 * (1 base char + 2 combining chars).
807 unicode_bytes
= precompose
? (inbuflen
* 2) : (inbuflen
* 3);
/* Use the stack buffer when the worst case fits, otherwise allocate. */
809 if (unicode_bytes
<= sizeof(unicodebuf
))
810 unistr
= &unicodebuf
[0];
812 MALLOC(unistr
, uint16_t *, unicode_bytes
, M_TEMP
, M_WAITOK
);
814 /* Normalize the string. */
/* unicode_bytes is both the buffer size passed in and the decoded size returned. */
815 result
= utf8_decodestr(inbufstart
, inbuflen
, unistr
, &unicode_bytes
,
816 unicode_bytes
, 0, flags
& ~UTF_NO_NULL_TERM
);
818 /* Put results back into UTF-8. */
819 result
= utf8_encodestr(unistr
, unicode_bytes
, outbufstart
,
820 &uft8_bytes
, buflen
, 0, UTF_NO_NULL_TERM
);
821 outstr
= outbufstart
+ uft8_bytes
;
/* Release the scratch buffer only when it is not the stack array. */
823 if (unistr
&& unistr
!= &unicodebuf
[0]) {
824 FREE(unistr
, M_TEMP
);
831 * Unicode 3.2 decomposition code (derived from Core Foundation)
837 } unicode_mappings32
;
/*
 * getmappedvalue32: bisection search of theTable (numElem entries) for an
 * entry whose _key equals `character`; returns that entry's _value.
 * Assumes the table is ordered ascending by _key, as the range check on
 * the first/last entries and the bisection both require.
 * NOTE(review): the last parameter's declaration, the loop header and the
 * not-found return are absent from this listing.
 */
839 static inline u_int32_t
840 getmappedvalue32(const unicode_mappings32
*theTable
, u_int32_t numElem
,
843 const unicode_mappings32
*p
, *q
, *divider
;
/* Guard: key lies outside the range covered by the table. */
845 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
/* Midpoint of the current [p, q] window. */
851 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
852 if (character
< divider
->_key
) { q
= divider
- 1; }
853 else if (character
> divider
->_key
) { p
= divider
+ 1; }
854 else { return (divider
->_value
); }
/*
 * Layout of a 16-bit decomposition table value as used by
 * unicode_recursive_decompose: bit 15 flags a recursive decomposition,
 * bits 12-14 hold the length, and the low 12 bits (value & 0x0FFF) hold
 * either the single mapped character or an offset into the multiple-
 * decomposition table.
 */
859 #define RECURSIVE_DECOMPOSITION (1 << 15)
860 #define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)
865 } unicode_mappings16
;
/*
 * getmappedvalue16: bisection search of theTable (numElem entries) for an
 * entry whose _key equals `character`; returns that entry's _value.
 * Assumes the table is ordered ascending by _key.
 * NOTE(review): the last parameter's declaration, the loop header, the
 * window updates and the not-found return are absent from this listing.
 */
867 static inline u_int16_t
868 getmappedvalue16(const unicode_mappings16
*theTable
, u_int32_t numElem
,
871 const unicode_mappings16
*p
, *q
, *divider
;
/* Guard: key lies outside the range covered by the table. */
873 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
/* Midpoint of the current [p, q] window. */
879 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
880 if (character
< divider
->_key
)
882 else if (character
> divider
->_key
)
885 return (divider
->_value
);
/*
 * unicode_recursive_decompose: looks `character` up in
 * __CFUniCharDecompositionTable and copies its decomposition into
 * convertedChars. When the table value carries RECURSIVE_DECOMPOSITION,
 * the first mapped character is decomposed first by a recursive call.
 * NOTE(review): local declarations, the copy loop header and the return
 * are absent from this listing.
 */
892 unicode_recursive_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
898 const u_int16_t
*bmpMappings
;
899 u_int32_t usedLength
;
901 value
= getmappedvalue16(
902 (const unicode_mappings16
*)__CFUniCharDecompositionTable
,
903 __UniCharDecompositionTableLength
, character
);
/* Bits 12-14: number of characters in the mapping. */
904 length
= EXTRACT_COUNT(value
);
/* Low 12 bits: the mapped character itself, or an offset into the multiple-decomposition table. */
905 firstChar
= value
& 0x0FFF;
/* A length-1 mapping is read from the local theChar; longer ones from the shared table. */
907 bmpMappings
= (length
== 1 ? &theChar
: __CFUniCharMultipleDecompositionTable
+ firstChar
);
910 if (value
& RECURSIVE_DECOMPOSITION
) {
/* Decompose the first mapped character in place at the start of the output. */
911 usedLength
= unicode_recursive_decompose((u_int16_t
)*bmpMappings
, convertedChars
);
913 --length
; /* Decrement for the first char */
/* Continue writing after what the recursive call produced. */
917 convertedChars
+= usedLength
;
920 usedLength
+= length
;
/* Copy one mapped character to the output. */
923 *(convertedChars
++) = *(bmpMappings
++);
/*
 * Constants for arithmetic Hangul composition/decomposition.
 * *BASE values are the first code units of the precomposed syllable block
 * (S) and of the three jamo ranges (L, V, T) used by unicode_decompose and
 * unicode_combine; *COUNT values are the sizes of those ranges.
 * HANGUL_SCOUNT equals HANGUL_LCOUNT * HANGUL_VCOUNT * HANGUL_TCOUNT
 * (19 * 21 * 28 = 11172).
 */
928 #define HANGUL_SBASE 0xAC00
929 #define HANGUL_LBASE 0x1100
930 #define HANGUL_VBASE 0x1161
931 #define HANGUL_TBASE 0x11A7
933 #define HANGUL_SCOUNT 11172
934 #define HANGUL_LCOUNT 19
935 #define HANGUL_VCOUNT 21
936 #define HANGUL_TCOUNT 28
/* Number of syllables sharing one leading jamo. */
937 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
940 * unicode_decompose - decompose a composed Unicode char
942 * Composed Unicode characters are forbidden on
943 * HFS Plus volumes. unicode_decompose will convert a
944 * composed character into its correct decomposed
947 * Similar to CFUniCharDecomposeCharacter
/*
 * unicode_decompose: writes the decomposition of `character` into
 * convertedChars. Characters in the Hangul syllable range are decomposed
 * arithmetically into two or three jamo; everything else is delegated to
 * unicode_recursive_decompose.
 * NOTE(review): the declaration of `length`, the condition guarding the
 * third jamo and the Hangul-branch return are absent from this listing.
 */
950 unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
/*
 * NOTE(review): the upper bound is inclusive, but
 * HANGUL_SBASE + HANGUL_SCOUNT is one past the last of the HANGUL_SCOUNT
 * values that start at HANGUL_SBASE; `<` looks intended -- confirm.
 */
952 if ((character
>= HANGUL_SBASE
) &&
953 (character
<= (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
/* Work with the zero-based syllable index from here on. */
956 character
-= HANGUL_SBASE
;
/* A non-zero trailing index means three jamo, otherwise two. */
957 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
/* Leading jamo. */
959 *(convertedChars
++) =
960 character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
/* Vowel jamo. */
961 *(convertedChars
++) =
962 (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
/* Trailing jamo. */
964 *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
967 return (unicode_recursive_decompose(character
, convertedChars
));
972 * unicode_combine - generate a precomposed Unicode char
974 * Precomposed Unicode characters are required for some volume
975 * formats and network protocols. unicode_combine will combine
976 * a decomposed character sequence into a single precomposed
977 * (composite) character.
979 * Similar to CFUniCharPrecomposeCharacter but unicode_combine
980 * also handles Hangul Jamo characters.
/*
 * unicode_combine: combines `base` with `combining` into one 16-bit
 * character. Hangul jamo are composed arithmetically; other pairs go
 * through a two-stage table lookup (by combining character, then by base).
 * NOTE(review): local declarations, closing braces, the statements
 * controlled by some checks and the final return are absent from this
 * listing.
 */
983 unicode_combine(u_int16_t base
, u_int16_t combining
)
/* `combining` is a vowel or trailing jamo. */
988 if ((combining
>= HANGUL_VBASE
) && (combining
< (HANGUL_TBASE
+ HANGUL_TCOUNT
))) {
989 /* 2 char Hangul sequences */
/* Leading jamo + vowel jamo -> syllable without a trailing jamo. */
990 if ((combining
< (HANGUL_VBASE
+ HANGUL_VCOUNT
)) &&
991 (base
>= HANGUL_LBASE
&& base
< (HANGUL_LBASE
+ HANGUL_LCOUNT
))) {
992 return (HANGUL_SBASE
+
993 ((base
- HANGUL_LBASE
)*(HANGUL_VCOUNT
*HANGUL_TCOUNT
)) +
994 ((combining
- HANGUL_VBASE
)*HANGUL_TCOUNT
));
997 /* 3 char Hangul sequences */
/* Strictly greater than HANGUL_TBASE: combining == HANGUL_TBASE would add zero below. */
998 if ((combining
> HANGUL_TBASE
) &&
999 (base
>= HANGUL_SBASE
&& base
< (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
/* Non-zero when `base` already has a trailing jamo; the controlled statement is not visible here. */
1000 if ((base
- HANGUL_SBASE
) % HANGUL_TCOUNT
)
1003 return (base
+ (combining
- HANGUL_TBASE
));
/* Stage 1: look up the combining character. */
1007 value
= getmappedvalue32(
1008 (const unicode_mappings32
*)__CFUniCharPrecompSourceTable
,
1009 __CFUniCharPrecompositionTableLength
, combining
);
/* Stage 2: low 16 bits of `value` are an offset (in u_int32_t units) into the destination table, high 16 bits the entry count; search it for `base`. */
1012 value
= getmappedvalue16(
1013 (const unicode_mappings16
*)
1014 ((const u_int32_t
*)__CFUniCharBMPPrecompDestinationTable
+ (value
& 0xFFFF)),
1015 (value
>> 16), base
);
1022 * prioritysort - order combining chars into canonical order
1024 * Similar to CFUniCharPrioritySort
/*
 * prioritysort: reorders `count` characters in place by the value
 * get_combining_class returns for each, comparing adjacent pairs.
 * NOTE(review): local declarations, the loop headers, the swap itself and
 * the end of the function are absent from this listing.
 */
1027 prioritysort(u_int16_t
* characters
, int count
)
1030 u_int16_t
*ch1
, *ch2
;
/* One past the last character of the run. */
1034 end
= characters
+ count
;
1038 ch2
= characters
+ 1;
1039 p2
= get_combining_class(*ch1
);
1042 p2
= get_combining_class(*ch2
);
/* Out-of-order pair: the earlier class is higher than the later one, and the later one is not class 0. */
1043 if (p1
> p2
&& p2
!= 0) {
1052 * Make sure that p2 contains the combining class for the
1053 * character now stored at *ch2. This isn't required for
1054 * correctness, but it will be more efficient if a character
1055 * with a large combining class has to "bubble past" several
1056 * characters with lower combining classes.
1068 * Invalid NTFS filename characters are encoded using the
1069 * SFM (Services for Macintosh) private use Unicode characters.
1071 * These should only be used for SMB, MSDOS or NTFS.
1073 * Illegal NTFS Char SFM Unicode Char
1074 * ----------------------------------------
1075 * 0x01-0x1f 0xf001-0xf01f
1084 * ' ' 0xf028 (Only if last char of the name)
1085 * '.' 0xf029 (Only if last char of the name)
1086 * ----------------------------------------
1088 * Reference: http://support.microsoft.com/kb/q117258/
/* Highest low-six-bit value accepted by sfm_to_ucs before it indexes the sfm2mac table. */
1091 #define MAX_SFM2MAC 0x29
/* Value that (ucs_ch & 0xffC0) must equal for sfm_to_ucs to treat a character as an SFM code. */
1092 #define SFMCODE_PREFIX_MASK 0xf000
1095 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
1096 * SFM had no conversion for the colon. There is a conversion for the
1097 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
1098 * is a slash and a slash is a colon. So we can just replace the slash with the
1099 * colon in our tables and everything will just work.
/*
 * NOTE(review): the declarator line for this table is absent from this
 * listing. With 42 entries (indices 0x00-0x29) it is presumably the
 * sfm2mac table that sfm_to_ucs indexes with (ucs_ch & 0x003f) -- confirm.
 */
1103 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 00 - 07 */
1104 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 08 - 0F */
1105 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 10 - 17 */
1106 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 18 - 1F */
1107 0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c, /* 20 - 27 */
1108 0x20, 0x2e /* 28 - 29 */
/*
 * NOTE(review): the declarator line for this table is absent from this
 * listing. With 96 entries labelled 0x20-0x7f it is presumably the mac2sfm
 * table that ucs_to_sfm indexes with (ucs_ch - 0x0020) -- confirm.
 * Most entries equal their own character code; the entries that differ
 * hold a value in the range 0x20-0x27.
 */
1113 0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27, /* 20 - 27 */
1114 0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22, /* 28 - 2f */
1115 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 30 - 37 */
1116 0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25, /* 38 - 3f */
1117 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 40 - 47 */
1118 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 48 - 4f */
1119 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 50 - 57 */
1120 0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f, /* 58 - 5f */
1121 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 60 - 67 */
1122 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 68 - 6f */
1123 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 70 - 77 */
1124 0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f /* 78 - 7f */
1129 * Encode illegal NTFS filename characters into SFM Private Unicode characters
1131 * Assumes non-zero ASCII input.
/*
 * ucs_to_sfm: maps a character to an SFM private-use code (0xf000 | low
 * byte). Characters up to 0x1f map directly; others are looked up in
 * mac2sfm. `lastchar` is non-zero when the character is the final one.
 * NOTE(review): the last-character handling, the declaration of `lsb`,
 * the condition guarding the final return and the fall-through return are
 * absent from this listing.
 */
1134 ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
)
1136 /* The last character of filename cannot be a space or period. */
/* 0x2e is '.'. */
1140 else if (ucs_ch
== 0x2e)
1143 /* 0x01 - 0x1f is simple transformation. */
1144 if (ucs_ch
<= 0x1f) {
1145 return (ucs_ch
| 0xf000);
1146 } else /* 0x20 - 0x7f */ {
/* The table starts at character 0x20. */
1149 lsb
= mac2sfm
[ucs_ch
- 0x0020];
1151 return(0xf000 | lsb
);
1157 * Decode any SFM Private Unicode characters
/*
 * sfm_to_ucs: when ucs_ch lies in 0xf000-0xf029 (upper ten bits equal to
 * SFMCODE_PREFIX_MASK and low six bits no greater than MAX_SFM2MAC), it is
 * replaced by the sfm2mac entry for its low six bits.
 * NOTE(review): the closing brace and the return are absent from this
 * listing.
 */
1160 sfm_to_ucs(u_int16_t ucs_ch
)
1162 if (((ucs_ch
& 0xffC0) == SFMCODE_PREFIX_MASK
) &&
1163 ((ucs_ch
& 0x003f) <= MAX_SFM2MAC
)) {
1164 ucs_ch
= sfm2mac
[ucs_ch
& 0x003f];