2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 Includes Unicode 3.2 decomposition code derived from Core Foundation
33 #include <sys/param.h>
34 #include <sys/utfconv.h>
35 #include <sys/errno.h>
36 #include <architecture/byte_order.h>
39 * UTF-8 (Unicode Transformation Format)
41 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
42 * character as a sequence of one to four bytes. Only the shortest form
43 * required to represent the significant Unicode bits is legal.
45 * UTF-8 Multibyte Codes
47 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
48 * -----------------------------------------------------------------------------
49 * 1 7 0x0000 0x007F 0xxxxxxx
50 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
51 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
52 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
53 * -----------------------------------------------------------------------------
/*
 * UNICODE_TO_UTF8_LEN - UTF-8 byte count contributed by one 16-bit code unit.
 *
 * Yields 1 for values below 0x0080 and 2 for values below 0x0800.  A value
 * in the surrogate range (((c) & 0xf800) == 0xd800) also yields 2, so the
 * two halves of a surrogate pair add up to 4; every other value yields 3.
 * The argument is expanded several times, so it must have no side effects.
 */
57 #define UNICODE_TO_UTF8_LEN(c) \
58 ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
/* Code unit substituted for '\0' by utf8_encodelen and utf8_encodestr. */
60 #define UCS_ALT_NULL 0x2400
62 /* Surrogate Pair Constants */
/*
 * Each surrogate half carries SP_HALF_SHIFT (10) bits, selected by
 * SP_HALF_MASK; SP_HALF_BASE is added when a pair is combined into a
 * single value and subtracted again when that value is split.
 */
63 #define SP_HALF_SHIFT 10
64 #define SP_HALF_BASE 0x0010000UL
65 #define SP_HALF_MASK 0x3FFUL
/* Inclusive bounds of the high (first) and low (second) surrogate ranges. */
67 #define SP_HIGH_FIRST 0xD800UL
68 #define SP_HIGH_LAST 0xDBFFUL
69 #define SP_LOW_FIRST 0xDC00UL
70 #define SP_LOW_LAST 0xDFFFUL
73 #include "vfs_utfconvdata.h"
77 * Test for a combining character.
79 * Similar to __CFUniCharIsNonBaseCharacter except that
80 * unicode_combinable also includes Hangul Jamo characters.
/*
 * unicode_combinable - look `character` up in __CFUniCharCombiningBitmap.
 *
 * The result of the final lookup is 1 when the bit for `character` is set
 * and 0 otherwise.
 * NOTE(review): the statements taken for character < 0x0300 and for the
 * special values of `value` are not visible in this listing.
 */
83 unicode_combinable(u_int16_t character
)
85 const u_int8_t
*bitmap
= __CFUniCharCombiningBitmap
;
88 if (character
< 0x0300)
/* The table starts with an index keyed by the high byte of the character. */
91 value
= bitmap
[(character
>> 8) & 0xFF];
/*
 * Skip the 256-byte index, then step to the 32-byte (256-bit) plane
 * selected by value - 1.  Inside that plane the low byte picks the byte
 * ((character & 0xFF) / 8) and the bit (character % 8).
 */
96 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
97 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
103 * Test for a precomposed character.
105 * Similar to __CFUniCharIsDecomposableCharacter.
/*
 * unicode_decomposeable - look `character` up in
 * __CFUniCharDecomposableBitmap.
 *
 * The result of the final lookup is 1 when the bit for `character` is set
 * and 0 otherwise.
 * NOTE(review): the statements taken for character < 0x00C0 and for the
 * special values of `value` are not visible in this listing.
 */
108 unicode_decomposeable(u_int16_t character
) {
109 const u_int8_t
*bitmap
= __CFUniCharDecomposableBitmap
;
112 if (character
< 0x00C0)
/* The table starts with an index keyed by the high byte of the character. */
115 value
= bitmap
[(character
>> 8) & 0xFF];
/*
 * Skip the 256-byte index, then step to the 32-byte (256-bit) plane
 * selected by value - 1.  Inside that plane the low byte picks the byte
 * ((character & 0xFF) / 8) and the bit (character % 8).
 */
120 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
121 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
128 * Get the combining class.
130 * Similar to CFUniCharGetCombiningPropertyForCharacter.
/*
 * get_combining_class - fetch the byte stored for `character` in
 * __CFUniCharCombiningPropertyBitmap.
 *
 * The high byte of the character indexes the start of the table; the value
 * found there selects a 256-byte page (value * 256 from the table start)
 * and the low byte (character % 256) indexes into that page.
 * NOTE(review): the condition guarding the page lookup and the fallback
 * return are not visible in this listing.
 */
132 static inline u_int8_t
133 get_combining_class(u_int16_t character
) {
134 const u_int8_t
*bitmap
= __CFUniCharCombiningPropertyBitmap
;
136 u_int8_t value
= bitmap
[(character
>> 8)];
139 bitmap
= bitmap
+ (value
* 256);
140 return bitmap
[character
% 256];
/*
 * Forward declarations of the normalization helpers defined later in this
 * file; the UTF-8 encode/decode routines call them before their
 * definitions appear.
 */
146 static int unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
);
148 static u_int16_t
unicode_combine(u_int16_t base
, u_int16_t combining
);
150 static void priortysort(u_int16_t
* characters
, int count
);
/*
 * utf_extrabytes - indexed with (byte >> 3) by utf8_decodestr and
 * utf8_validatestr; the result drives their `switch (extrabytes)`.
 *
 * Entries 0-15 are 0, entries 24-27 are 1, 28-29 are 2 and 30 is 3;
 * entries 16-23 and 31 are -1.
 * NOTE(review): the element type is plain `char`, whose signedness is
 * implementation-defined; where `char` is unsigned the -1 entries read
 * back as 255.  Confirm the target ABI or consider `signed char`.
 */
152 char utf_extrabytes
[32] = {
153 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
154 -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1
159 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
162 * If '/' chars are allowed on disk then an alternate
163 * (replacement) char must be provided in altslash.
166 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
/*
 * utf8_encodelen - sum UNICODE_TO_UTF8_LEN over the code units of a
 * 16-bit Unicode string.
 *
 * ucsp     - source code units
 * ucslen   - source length in bytes (halved to get the unit count)
 * altslash - replacement for one rejected character; '_' is used when 0
 * flags    - UTF_REVERSE_ENDIAN requests a byte swap of each unit
 */
169 utf8_encodelen(const u_int16_t
* ucsp
, size_t ucslen
, u_int16_t altslash
,
174 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
/* ucslen counts bytes; each code unit is two bytes wide. */
177 charcnt
= ucslen
/ 2;
180 while (charcnt
-- > 0) {
184 ucs_ch
= NXSwapShort(ucs_ch
);
/*
 * Apply the same substitutions the encoder makes so the length matches.
 * NOTE(review): the condition selecting the altslash substitution is not
 * visible in this listing.
 */
186 ucs_ch
= altslash
? altslash
: '_';
187 else if (ucs_ch
== '\0')
188 ucs_ch
= UCS_ALT_NULL
;
190 len
+= UNICODE_TO_UTF8_LEN(ucs_ch
);
198 * utf8_encodestr - Encodes a Unicode string to UTF-8
201 * The resulting UTF-8 string is NULL terminated.
203 * If '/' chars are allowed on disk then an alternate
204 * (replacement) char must be provided in altslash.
207 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
208 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
211 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
212 * EINVAL: Illegal char found; char was replaced by an '_'.
/*
 * utf8_encodestr - encode a 16-bit Unicode string as UTF-8.
 *
 * ucsp     - source code units
 * ucslen   - source length in bytes (halved to get the unit count)
 * utf8p    - destination buffer
 * utf8len  - out: number of bytes written (utf8p - bufstart)
 * buflen   - destination capacity in bytes
 * altslash - replacement code unit, see the file header comment
 * flags    - UTF_REVERSE_ENDIAN, UTF_NO_NULL_TERM, UTF_DECOMPOSED
 *
 * `result` is set to ENAMETOOLONG whenever the next sequence would not
 * fit before bufend.
 */
215 utf8_encodestr(const u_int16_t
* ucsp
, size_t ucslen
, u_int8_t
* utf8p
,
216 size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
)
221 u_int16_t
* chp
= NULL
;
/* Scratch buffer receiving the output of unicode_decompose. */
222 u_int16_t sequence
[8];
225 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
226 int nullterm
= ((flags
& UTF_NO_NULL_TERM
) == 0);
227 int decompose
= (flags
& UTF_DECOMPOSED
);
231 bufend
= bufstart
+ buflen
;
/* ucslen counts bytes; each code unit is two bytes wide. */
234 charcnt
= ucslen
/ 2;
236 while (charcnt
-- > 0) {
241 ucs_ch
= swapbytes
? NXSwapShort(*ucsp
++) : *ucsp
++;
/*
 * With UTF_DECOMPOSED, a decomposable unit is expanded into `sequence`;
 * the first element is encoded now and `extra` counts the remaining ones.
 */
243 if (decompose
&& unicode_decomposeable(ucs_ch
)) {
244 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
246 ucs_ch
= sequence
[0];
251 /* Slash and NULL are not permitted */
259 } else if (ucs_ch
== '\0') {
260 ucs_ch
= UCS_ALT_NULL
;
/* One-byte form: needs one free byte before bufend. */
263 if (ucs_ch
< 0x0080) {
264 if (utf8p
>= bufend
) {
265 result
= ENAMETOOLONG
;
/* Two-byte form: 110xxxxx 10xxxxxx. */
270 } else if (ucs_ch
< 0x800) {
271 if ((utf8p
+ 1) >= bufend
) {
272 result
= ENAMETOOLONG
;
275 *utf8p
++ = 0xc0 | (ucs_ch
>> 6);
276 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
279 /* Combine valid surrogate pairs */
280 if (ucs_ch
>= SP_HIGH_FIRST
&& ucs_ch
<= SP_HIGH_LAST
/* Peek at the following unit without consuming it yet. */
285 ch2
= swapbytes
? NXSwapShort(*ucsp
) : *ucsp
;
286 if (ch2
>= SP_LOW_FIRST
&& ch2
<= SP_LOW_LAST
) {
/* Rebuild the full value from the two 10-bit halves plus SP_HALF_BASE. */
287 pair
= ((ucs_ch
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
)
288 + (ch2
- SP_LOW_FIRST
) + SP_HALF_BASE
;
289 if ((utf8p
+ 3) >= bufend
) {
290 result
= ENAMETOOLONG
;
/* Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
295 *utf8p
++ = 0xf0 | (pair
>> 18);
296 *utf8p
++ = 0x80 | (0x3f & (pair
>> 12));
297 *utf8p
++ = 0x80 | (0x3f & (pair
>> 6));
298 *utf8p
++ = 0x80 | (0x3f & pair
);
/* Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx. */
302 if ((utf8p
+ 2) >= bufend
) {
303 result
= ENAMETOOLONG
;
306 *utf8p
++ = 0xe0 | (ucs_ch
>> 12);
307 *utf8p
++ = 0x80 | (0x3f & (ucs_ch
>> 6));
308 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
/* Report how many bytes were produced. */
312 *utf8len
= utf8p
- bufstart
;
321 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
324 * The input UTF-8 string does not need to be null terminated
327 * If '/' chars are allowed on disk then an alternate
328 * (replacement) char must be provided in altslash.
331 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
332 * UTF_DECOMPOSED: Unicode output string must be fully decomposed
335 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
336 * EINVAL: Illegal UTF-8 sequence found.
/*
 * utf8_decodestr - decode a UTF-8 string into 16-bit Unicode code units.
 *
 * utf8p    - source bytes; decoding stops at utf8len bytes or at a '\0'
 * utf8len  - source length in bytes
 * ucsp     - destination buffer
 * ucslen   - out: number of BYTES written to the destination
 * buflen   - destination capacity in bytes
 * altslash - code unit compared against each decoded unit, see the file
 *            header comment
 * flags    - UTF_DECOMPOSED, UTF_PRECOMPOSED, UTF_REVERSE_ENDIAN
 *
 * NOTE(review): many branch bodies (error exits, buffer-full checks) are
 * not visible in this listing; the comments below describe only the
 * statements that are.
 */
339 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
,
340 size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
)
348 int decompose
, precompose
, swapbytes
;
350 decompose
= (flags
& UTF_DECOMPOSED
);
351 precompose
= (flags
& UTF_PRECOMPOSED
);
352 swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
/* buflen is a byte count, so the end pointer is computed in bytes. */
355 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
357 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
361 /* check for ascii */
363 ucs_ch
= byte
; /* 1st byte */
/* Number of bytes expected to follow this lead byte. */
366 int extrabytes
= utf_extrabytes
[byte
>> 3];
/*
 * utf8len is size_t and extrabytes is int, so extrabytes is converted to
 * size_t for this comparison; a negative entry becomes a very large
 * value and the test is true for it.
 */
368 if (utf8len
< extrabytes
)
370 utf8len
-= extrabytes
;
/*
 * Each case accumulates the raw bytes into ch, shifting by 6 per byte.
 * A following byte must have its top two bits equal to binary 10
 * ((byte >> 6) == 2).
 */
372 switch (extrabytes
) {
374 ch
= byte
; ch
<<= 6; /* 1st byte */
375 byte
= *utf8p
++; /* 2nd byte */
376 if ((byte
>> 6) != 2)
385 ch
= byte
; ch
<<= 6; /* 1st byte */
386 byte
= *utf8p
++; /* 2nd byte */
387 if ((byte
>> 6) != 2)
389 ch
+= byte
; ch
<<= 6;
390 byte
= *utf8p
++; /* 3rd byte */
391 if ((byte
>> 6) != 2)
/* 0xFFFE and 0xFFFF are singled out; the action taken is not visible here. */
400 if (ch
== 0xFFFE || ch
== 0xFFFF)
406 ch
= byte
; ch
<<= 6; /* 1st byte */
407 byte
= *utf8p
++; /* 2nd byte */
408 if ((byte
>> 6) != 2)
410 ch
+= byte
; ch
<<= 6;
411 byte
= *utf8p
++; /* 3rd byte */
412 if ((byte
>> 6) != 2)
414 ch
+= byte
; ch
<<= 6;
415 byte
= *utf8p
++; /* 4th byte */
416 if ((byte
>> 6) != 2)
/*
 * 0x03C82080 equals (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80, the
 * marker bits of a four-byte sequence; SP_HALF_BASE is removed in the same
 * step so the remainder splits directly into two surrogate halves.
 */
419 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
420 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
421 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
423 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
426 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
427 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
)
429 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
/* Decomposition path: expand the unit and emit every resulting unit. */
435 if (unicode_decomposeable(ucs_ch
)) {
436 u_int16_t sequence
[8];
439 count
= unicode_decompose(ucs_ch
, sequence
);
441 for (i
= 0; i
< count
; ++i
) {
442 ucs_ch
= sequence
[i
];
443 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
447 combcharcnt
+= count
- 1;
/*
 * Precomposition path: try to merge a combinable unit with the unit
 * already written just before ucsp (un-swapped first when needed).
 */
450 } else if (precompose
&& (ucsp
!= bufstart
)) {
451 u_int16_t composite
, base
;
453 if (unicode_combinable(ucs_ch
)) {
454 base
= swapbytes
? NXSwapShort(*(ucsp
- 1)) : *(ucsp
- 1);
455 composite
= unicode_combine(base
, ucs_ch
);
462 if (ucs_ch
== UCS_ALT_NULL
)
465 if (ucs_ch
== altslash
)
469 * Make multiple combining character sequences canonical
/*
 * NOTE(review): priortysort reads the stored units as-is, while the
 * buffer holds byte-swapped units when swapbytes is set - confirm that
 * combination is handled as intended.
 */
471 if (unicode_combinable(ucs_ch
)) {
472 ++combcharcnt
; /* start tracking a run */
473 } else if (combcharcnt
) {
474 if (combcharcnt
> 1) {
475 priortysort(ucsp
- combcharcnt
, combcharcnt
);
477 combcharcnt
= 0; /* start over */
479 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
482 * Make a previous combining sequence canonical
484 if (combcharcnt
> 1) {
485 priortysort(ucsp
- combcharcnt
, combcharcnt
);
/* The output length is reported in bytes, not in code units. */
489 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
498 result
= ENAMETOOLONG
;
504 * utf8_validatestr - Check for a valid UTF-8 string.
/*
 * utf8_validatestr - walk a UTF-8 string and check each sequence.
 *
 * utf8p   - bytes to check; scanning stops at utf8len bytes or at a '\0'
 * utf8len - length in bytes
 *
 * The checks mirror the decoding steps of utf8_decodestr, but nothing is
 * written anywhere.
 * NOTE(review): the statements taken when a check fails, and the return
 * values, are not visible in this listing.
 */
507 utf8_validatestr(const u_int8_t
* utf8p
, size_t utf8len
)
514 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
516 continue; /* plain ascii */
/* Number of bytes expected to follow this lead byte. */
518 extrabytes
= utf_extrabytes
[byte
>> 3];
520 if (utf8len
< extrabytes
)
522 utf8len
-= extrabytes
;
/*
 * Each case accumulates the raw bytes into ch, shifting by 6 per byte.
 * A following byte must have its top two bits equal to binary 10
 * ((byte >> 6) == 2).
 */
524 switch (extrabytes
) {
526 ch
= byte
; ch
<<= 6; /* 1st byte */
527 byte
= *utf8p
++; /* 2nd byte */
528 if ((byte
>> 6) != 2)
536 ch
= byte
; ch
<<= 6; /* 1st byte */
537 byte
= *utf8p
++; /* 2nd byte */
538 if ((byte
>> 6) != 2)
540 ch
+= byte
; ch
<<= 6;
541 byte
= *utf8p
++; /* 3rd byte */
542 if ((byte
>> 6) != 2)
/* 0xFFFE and 0xFFFF are singled out; the action taken is not visible here. */
551 if (ch
== 0xFFFE || ch
== 0xFFFF)
556 ch
= byte
; ch
<<= 6; /* 1st byte */
557 byte
= *utf8p
++; /* 2nd byte */
558 if ((byte
>> 6) != 2)
560 ch
+= byte
; ch
<<= 6;
561 byte
= *utf8p
++; /* 3rd byte */
562 if ((byte
>> 6) != 2)
564 ch
+= byte
; ch
<<= 6;
565 byte
= *utf8p
++; /* 4th byte */
566 if ((byte
>> 6) != 2)
/*
 * 0x03C82080 equals (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80, the
 * marker bits of a four-byte sequence; after removing them and
 * SP_HALF_BASE both derived halves must land in the surrogate ranges.
 */
569 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
570 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
571 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
573 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
574 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
)
589 * Unicode 3.2 decomposition code (derived from Core Foundation)
595 } unicode_mappings32
;
/*
 * getmappedvalue32 - binary search of a unicode_mappings32 table.
 *
 * theTable - entries searched by _key; the search assumes ascending keys
 * numElem  - number of entries in theTable
 *
 * Returns the _value of the entry whose _key equals `character`.
 * NOTE(review): the result for a missing key, and the initialisation of
 * p and q, are not visible in this listing.
 */
597 static inline u_int32_t
598 getmappedvalue32(const unicode_mappings32
*theTable
, u_int32_t numElem
,
601 const unicode_mappings32
*p
, *q
, *divider
;
/* Quick reject: the key lies outside the range covered by the table. */
603 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
/* Probe the midpoint of [p, q] and narrow to the half that can match. */
609 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
610 if (character
< divider
->_key
) { q
= divider
- 1; }
611 else if (character
> divider
->_key
) { p
= divider
+ 1; }
612 else { return (divider
->_value
); }
/*
 * Layout of a 16-bit decomposition table value as used by
 * unicode_recursive_decompose: bit 15 is the RECURSIVE_DECOMPOSITION flag,
 * bits 12-14 hold the length (EXTRACT_COUNT) and the low 12 bits
 * (value & 0x0FFF) hold the character or table offset.
 */
617 #define RECURSIVE_DECOMPOSITION (1 << 15)
618 #define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)
623 } unicode_mappings16
;
/*
 * getmappedvalue16 - binary search of a unicode_mappings16 table.
 *
 * theTable - entries searched by _key; the search assumes ascending keys
 * numElem  - number of entries in theTable
 *
 * Returns the _value of the entry whose _key equals `character`.
 * NOTE(review): the result for a missing key, the initialisation of p and
 * q, and the bodies of the two narrowing branches are not visible in this
 * listing.
 */
625 static inline u_int16_t
626 getmappedvalue16(const unicode_mappings16
*theTable
, u_int32_t numElem
,
629 const unicode_mappings16
*p
, *q
, *divider
;
/* Quick reject: the key lies outside the range covered by the table. */
631 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
/* Probe the midpoint of [p, q] and compare its key with the target. */
637 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
638 if (character
< divider
->_key
)
640 else if (character
> divider
->_key
)
643 return (divider
->_value
);
/*
 * unicode_recursive_decompose - expand `character` through
 * __CFUniCharDecompositionTable into convertedChars.
 *
 * character      - code unit to expand
 * convertedChars - output buffer; callers in this file pass an array of
 *                  eight u_int16_t
 *
 * NOTE(review): the return statements are not visible in this listing;
 * usedLength is the running count of units produced.
 */
650 unicode_recursive_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
656 const u_int16_t
*bmpMappings
;
657 u_int32_t usedLength
;
/* Look up the packed table value and split it into its fields. */
659 value
= getmappedvalue16(
660 (const unicode_mappings16
*)__CFUniCharDecompositionTable
,
661 __UniCharDecompositionTableLength
, character
);
662 length
= EXTRACT_COUNT(value
);
663 firstChar
= value
& 0x0FFF;
/*
 * A length of 1 uses the local theChar as the mapping; otherwise the low
 * 12 bits are an offset into __CFUniCharMultipleDecompositionTable.
 */
665 bmpMappings
= (length
== 1 ? &theChar
: __CFUniCharMultipleDecompositionTable
+ firstChar
);
/*
 * When flagged, the first mapped unit is itself expanded first; its
 * output goes to the front of convertedChars and the remaining units
 * are appended after it.
 */
668 if (value
& RECURSIVE_DECOMPOSITION
) {
669 usedLength
= unicode_recursive_decompose((u_int16_t
)*bmpMappings
, convertedChars
);
671 --length
; /* Decrement for the first char */
675 convertedChars
+= usedLength
;
678 usedLength
+= length
;
/* Copy one mapped unit to the output. */
681 *(convertedChars
++) = *(bmpMappings
++);
/*
 * Hangul composition constants used by unicode_decompose and
 * unicode_combine.  The *BASE values are the first code units of the
 * precomposed syllable block (S) and of the three jamo ranges (L, V, T).
 * Note that HANGUL_TBASE is one below the first trailing jamo, so a T
 * index of 0 means "no trailing jamo".
 */
686 #define HANGUL_SBASE 0xAC00
687 #define HANGUL_LBASE 0x1100
688 #define HANGUL_VBASE 0x1161
689 #define HANGUL_TBASE 0x11A7
/* Sizes of the ranges; 19 * 21 * 28 == 11172 and HANGUL_NCOUNT == 588. */
691 #define HANGUL_SCOUNT 11172
692 #define HANGUL_LCOUNT 19
693 #define HANGUL_VCOUNT 21
694 #define HANGUL_TCOUNT 28
695 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
698 * unicode_decompose - decompose a composed Unicode char
700 * Composed Unicode characters are forbidden on
701 * HFS Plus volumes. unicode_decompose will convert a
702 * composed character into its correct decomposed
705 * Similar to CFUniCharDecomposeCharacter
/*
 * unicode_decompose - expand one code unit into convertedChars.
 *
 * character      - code unit to expand
 * convertedChars - output buffer; callers in this file pass an array of
 *                  eight u_int16_t
 *
 * Hangul syllables are split arithmetically into two or three jamo;
 * everything else is handed to unicode_recursive_decompose.
 */
708 unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
/*
 * NOTE(review): the upper bound is inclusive (<=), which also admits
 * HANGUL_SBASE + HANGUL_SCOUNT itself, one past the SCOUNT syllables that
 * start at HANGUL_SBASE.  unicode_combine tests the same range with '<'.
 * Confirm which bound is intended.
 */
710 if ((character
>= HANGUL_SBASE
) &&
711 (character
<= (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
/* Work with the zero-based syllable index from here on. */
714 character
-= HANGUL_SBASE
;
/* A non-zero remainder modulo TCOUNT means a trailing jamo is present. */
715 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
/* Leading jamo. */
717 *(convertedChars
++) =
718 character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
/* Vowel jamo. */
719 *(convertedChars
++) =
720 (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
/* Trailing jamo; the guarding condition is not visible in this listing. */
722 *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
725 return (unicode_recursive_decompose(character
, convertedChars
));
730 * unicode_combine - generate a precomposed Unicode char
732 * Precomposed Unicode characters are required for some volume
733 * formats and network protocols. unicode_combine will combine
734 * a decomposed character sequence into a single precomposed
735 * (composite) character.
737 * Similar to CFUniCharPrecomposeCharacter but unicode_combine
738 * also handles Hangul Jamo characters.
/*
 * unicode_combine - try to merge `base` and `combining` into a single
 * precomposed code unit.
 *
 * base      - the previously emitted code unit
 * combining - the code unit that follows it
 *
 * Hangul jamo are combined arithmetically; other pairs go through the
 * precomposition tables.
 * NOTE(review): the value returned when no composition exists is not
 * visible in this listing.
 */
741 unicode_combine(u_int16_t base
, u_int16_t combining
)
/* `combining` lies somewhere in the vowel or trailing jamo ranges. */
746 if ((combining
>= HANGUL_VBASE
) && (combining
< (HANGUL_TBASE
+ HANGUL_TCOUNT
))) {
747 /* 2 char Hangul sequences */
/* Leading jamo followed by a vowel jamo: compute the syllable directly. */
748 if ((combining
< (HANGUL_VBASE
+ HANGUL_VCOUNT
)) &&
749 (base
>= HANGUL_LBASE
&& base
< (HANGUL_LBASE
+ HANGUL_LCOUNT
))) {
750 return (HANGUL_SBASE
+
751 ((base
- HANGUL_LBASE
)*(HANGUL_VCOUNT
*HANGUL_TCOUNT
)) +
752 ((combining
- HANGUL_VBASE
)*HANGUL_TCOUNT
));
755 /* 3 char Hangul sequences */
/* Existing syllable followed by a trailing jamo. */
756 if ((combining
> HANGUL_TBASE
) &&
757 (base
>= HANGUL_SBASE
&& base
< (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
/* A non-zero remainder means the syllable already has a trailing jamo. */
758 if ((base
- HANGUL_SBASE
) % HANGUL_TCOUNT
)
761 return (base
+ (combining
- HANGUL_TBASE
));
/*
 * Table path: the first lookup is keyed by `combining`.  Its 32-bit result
 * is used below as an offset (low 16 bits) and an entry count (high 16
 * bits) for a second table, which is then searched by `base`.
 */
765 value
= getmappedvalue32(
766 (const unicode_mappings32
*)__CFUniCharPrecompSourceTable
,
767 __CFUniCharPrecompositionTableLength
, combining
);
770 value
= getmappedvalue16(
771 (const unicode_mappings16
*)
772 ((u_int32_t
*)__CFUniCharBMPPrecompDestinationTable
+ (value
& 0xFFFF)),
773 (value
>> 16), base
);
780 * priortysort - order combining chars into canonical order
782 * Similar to CFUniCharPrioritySort
/*
 * priortysort - reorder a run of code units using the values returned by
 * get_combining_class.
 *
 * characters - first code unit of the run (modified in place)
 * count      - number of code units in the run
 *
 * NOTE(review): the loop structure and the swap itself are not visible in
 * this listing; only the pointer setup and class lookups are.
 */
785 priortysort(u_int16_t
* characters
, int count
)
788 u_int16_t
*ch1
, *ch2
;
/* One past the last code unit of the run. */
792 end
= characters
+ count
;
/* ch2 starts at the second code unit of the run. */
796 ch2
= characters
+ 1;
797 p2
= get_combining_class(*ch1
);
800 p2
= get_combining_class(*ch2
);