2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
24 Includes Unicode 3.2 decomposition code derived from Core Foundation
27 #include <sys/param.h>
28 #include <sys/utfconv.h>
29 #include <sys/errno.h>
30 #include <architecture/byte_order.h>
/*
 * UTF-8 (Unicode Transformation Format)
 *
 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
 * character as a sequence of one to four bytes. Only the shortest form
 * required to represent the significant Unicode bits is legal.
 *
 * UTF-8 Multibyte Codes
 *
 * Bytes   Bits   Unicode Min   Unicode Max   UTF-8 Byte Sequence (binary)
 * -----------------------------------------------------------------------------
 *   1       7       0x0000        0x007F     0xxxxxxx
 *   2      11       0x0080        0x07FF     110xxxxx 10xxxxxx
 *   3      16       0x0800        0xFFFF     1110xxxx 10xxxxxx 10xxxxxx
 *   4      21      0x10000      0x10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * -----------------------------------------------------------------------------
 */
/*
 * UNICODE_TO_UTF8_LEN - UTF-8 byte count contributed by one 16-bit
 * Unicode code unit.
 *
 * Code units below 0x0080 count as 1 byte, below 0x0800 as 2 bytes and
 * everything else as 3 bytes, except that a surrogate half
 * (0xD800-0xDFFF) counts as 2 bytes so that the two halves of a pair
 * add up to the 4 bytes of the combined UTF-8 sequence.
 *
 * The argument is evaluated more than once; do not pass an expression
 * with side effects.
 */
#define UNICODE_TO_UTF8_LEN(c)  \
	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
/* Code unit substituted for an embedded NUL when encoding (and mapped back when decoding) */
#define UCS_ALT_NULL	0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT	10		/* payload bits carried by each half */
#define SP_HALF_BASE	0x0010000UL	/* first code point encoded with a pair */
#define SP_HALF_MASK	0x3FFUL		/* mask for one half's payload */

#define SP_HIGH_FIRST	0xD800UL
#define SP_HIGH_LAST	0xDBFFUL
#define SP_LOW_FIRST	0xDC00UL
#define SP_LOW_LAST	0xDFFFUL
67 #include "vfs_utfconvdata.h"
71 * Test for a combining character.
73 * Similar to __CFUniCharIsNonBaseCharacter except that
74 * unicode_combinable also includes Hangul Jamo characters.
77 unicode_combinable(u_int16_t character
)
79 const u_int8_t
*bitmap
= __CFUniCharCombiningBitmap
;
82 if (character
< 0x0300)
85 value
= bitmap
[(character
>> 8) & 0xFF];
90 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
91 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
97 * Test for a precomposed character.
99 * Similar to __CFUniCharIsDecomposableCharacter.
102 unicode_decomposeable(u_int16_t character
) {
103 const u_int8_t
*bitmap
= __CFUniCharDecomposableBitmap
;
106 if (character
< 0x00C0)
109 value
= bitmap
[(character
>> 8) & 0xFF];
114 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
115 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
/* Forward declarations; both are defined later in this file. */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);

static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
/*
 * Number of continuation bytes that follow a UTF-8 lead byte, indexed
 * by the lead byte's upper five bits (byte >> 3).  -1 marks a byte that
 * cannot start a sequence.
 *
 * NOTE: the element type is plain char.  Where char is unsigned the -1
 * entries read back as 255, so consumers should treat any value other
 * than 1, 2 or 3 as invalid for a non-ASCII lead byte.
 */
char utf_extrabytes[32] = {
	 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	-1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
};
132 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
135 * If '/' chars are allowed on disk then an alternate
136 * (replacement) char must be provided in altslash.
139 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
142 utf8_encodelen(const u_int16_t
* ucsp
, size_t ucslen
, u_int16_t altslash
,
147 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
150 charcnt
= ucslen
/ 2;
153 while (charcnt
-- > 0) {
157 ucs_ch
= NXSwapShort(ucs_ch
);
159 ucs_ch
= altslash
? altslash
: '_';
160 else if (ucs_ch
== '\0')
161 ucs_ch
= UCS_ALT_NULL
;
163 len
+= UNICODE_TO_UTF8_LEN(ucs_ch
);
171 * utf8_encodestr - Encodes a Unicode string to UTF-8
174 * The resulting UTF-8 string is NULL terminated.
176 * If '/' chars are allowed on disk then an alternate
177 * (replacement) char must be provided in altslash.
180 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
181 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
184 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
185 * EINVAL: Illegal char found; char was replaced by an '_'.
188 utf8_encodestr(const u_int16_t
* ucsp
, size_t ucslen
, u_int8_t
* utf8p
,
189 size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
)
194 u_int16_t
* chp
= NULL
;
195 u_int16_t sequence
[8];
198 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
199 int nullterm
= ((flags
& UTF_NO_NULL_TERM
) == 0);
200 int decompose
= (flags
& UTF_DECOMPOSED
);
204 bufend
= bufstart
+ buflen
;
207 charcnt
= ucslen
/ 2;
209 while (charcnt
-- > 0) {
214 ucs_ch
= swapbytes
? NXSwapShort(*ucsp
++) : *ucsp
++;
216 if (decompose
&& unicode_decomposeable(ucs_ch
)) {
217 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
219 ucs_ch
= sequence
[0];
224 /* Slash and NULL are not permitted */
232 } else if (ucs_ch
== '\0') {
233 ucs_ch
= UCS_ALT_NULL
;
236 if (ucs_ch
< 0x0080) {
237 if (utf8p
>= bufend
) {
238 result
= ENAMETOOLONG
;
243 } else if (ucs_ch
< 0x800) {
244 if ((utf8p
+ 1) >= bufend
) {
245 result
= ENAMETOOLONG
;
248 *utf8p
++ = 0xc0 | (ucs_ch
>> 6);
249 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
252 /* Combine valid surrogate pairs */
253 if (ucs_ch
>= SP_HIGH_FIRST
&& ucs_ch
<= SP_HIGH_LAST
258 ch2
= swapbytes
? NXSwapShort(*ucsp
) : *ucsp
;
259 if (ch2
>= SP_LOW_FIRST
&& ch2
<= SP_LOW_LAST
) {
260 pair
= ((ucs_ch
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
)
261 + (ch2
- SP_LOW_FIRST
) + SP_HALF_BASE
;
262 if ((utf8p
+ 3) >= bufend
) {
263 result
= ENAMETOOLONG
;
268 *utf8p
++ = 0xf0 | (pair
>> 18);
269 *utf8p
++ = 0x80 | (0x3f & (pair
>> 12));
270 *utf8p
++ = 0x80 | (0x3f & (pair
>> 6));
271 *utf8p
++ = 0x80 | (0x3f & pair
);
275 if ((utf8p
+ 2) >= bufend
) {
276 result
= ENAMETOOLONG
;
279 *utf8p
++ = 0xe0 | (ucs_ch
>> 12);
280 *utf8p
++ = 0x80 | (0x3f & (ucs_ch
>> 6));
281 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
285 *utf8len
= utf8p
- bufstart
;
294 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
297 * The input UTF-8 string does not need to be null terminated
300 * If '/' chars are allowed on disk then an alternate
301 * (replacement) char must be provided in altslash.
304 * UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime
305 * UTF_DECOMPOSED: Unicode output string must be fully decompsed
308 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
309 * EINVAL: Illegal UTF-8 sequence found.
312 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
,
313 size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
)
320 int decompose
, precompose
, swapbytes
;
322 decompose
= (flags
& UTF_DECOMPOSED
);
323 precompose
= (flags
& UTF_PRECOMPOSED
);
324 swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
327 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
329 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
333 /* check for ascii */
335 ucs_ch
= byte
; /* 1st byte */
338 int extrabytes
= utf_extrabytes
[byte
>> 3];
340 if (utf8len
< extrabytes
)
342 utf8len
-= extrabytes
;
344 switch (extrabytes
) {
346 ch
= byte
; ch
<<= 6; /* 1st byte */
347 byte
= *utf8p
++; /* 2nd byte */
348 if ((byte
>> 6) != 2)
357 ch
= byte
; ch
<<= 6; /* 1st byte */
358 byte
= *utf8p
++; /* 2nd byte */
359 if ((byte
>> 6) != 2)
361 ch
+= byte
; ch
<<= 6;
362 byte
= *utf8p
++; /* 3rd byte */
363 if ((byte
>> 6) != 2)
372 if (ch
== 0xFFFE || ch
== 0xFFFF)
378 ch
= byte
; ch
<<= 6; /* 1st byte */
379 byte
= *utf8p
++; /* 2nd byte */
380 if ((byte
>> 6) != 2)
382 ch
+= byte
; ch
<<= 6;
383 byte
= *utf8p
++; /* 3rd byte */
384 if ((byte
>> 6) != 2)
386 ch
+= byte
; ch
<<= 6;
387 byte
= *utf8p
++; /* 4th byte */
388 if ((byte
>> 6) != 2)
391 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
392 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
393 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
395 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
398 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
399 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
)
401 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
407 if (unicode_decomposeable(ucs_ch
)) {
408 u_int16_t sequence
[8];
411 count
= unicode_decompose(ucs_ch
, sequence
);
413 for (i
= 0; i
< count
; ++i
) {
414 ucs_ch
= sequence
[i
];
415 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
421 } else if (precompose
&& (ucsp
!= bufstart
)) {
422 u_int16_t composite
, base
;
424 if (unicode_combinable(ucs_ch
)) {
425 base
= swapbytes
? NXSwapShort(*(ucsp
- 1)) : *(ucsp
- 1);
426 composite
= unicode_combine(base
, ucs_ch
);
433 if (ucs_ch
== UCS_ALT_NULL
)
436 if (ucs_ch
== altslash
)
439 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
443 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
452 result
= ENAMETOOLONG
;
458 * utf8_validatestr - Check for a valid UTF-8 string.
461 utf8_validatestr(const u_int8_t
* utf8p
, size_t utf8len
)
468 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
470 continue; /* plain ascii */
472 extrabytes
= utf_extrabytes
[byte
>> 3];
474 if (utf8len
< extrabytes
)
476 utf8len
-= extrabytes
;
478 switch (extrabytes
) {
480 ch
= byte
; ch
<<= 6; /* 1st byte */
481 byte
= *utf8p
++; /* 2nd byte */
482 if ((byte
>> 6) != 2)
490 ch
= byte
; ch
<<= 6; /* 1st byte */
491 byte
= *utf8p
++; /* 2nd byte */
492 if ((byte
>> 6) != 2)
494 ch
+= byte
; ch
<<= 6;
495 byte
= *utf8p
++; /* 3rd byte */
496 if ((byte
>> 6) != 2)
505 if (ch
== 0xFFFE || ch
== 0xFFFF)
510 ch
= byte
; ch
<<= 6; /* 1st byte */
511 byte
= *utf8p
++; /* 2nd byte */
512 if ((byte
>> 6) != 2)
514 ch
+= byte
; ch
<<= 6;
515 byte
= *utf8p
++; /* 3rd byte */
516 if ((byte
>> 6) != 2)
518 ch
+= byte
; ch
<<= 6;
519 byte
= *utf8p
++; /* 4th byte */
520 if ((byte
>> 6) != 2)
523 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
524 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
525 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
527 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
528 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
)
/*
 * Unicode 3.2 decomposition code (derived from Core Foundation)
 */
/* One entry of a key-sorted lookup table with 32-bit keys and values. */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * getmappedvalue32 - binary search of a table sorted by ascending _key.
 *
 * Returns the _value stored for 'character', or 0 when the table is
 * empty or has no entry for it.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings32 *p, *q, *divider;

	/* an empty table has no first/last entry to range-check against */
	if (numElem == 0)
		return (0);

	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key) { q = divider - 1; }
		else if (character > divider->_key) { p = divider + 1; }
		else { return (divider->_value); }
	}
	return (0);
}
/*
 * Layout of a 16-bit decomposition table value as used by
 * unicode_recursive_decompose: bit 15 flags a recursive decomposition,
 * bits 12-14 hold the character count and the low 12 bits hold either
 * the single replacement character or an index into the
 * multiple-decomposition table.
 */
#define RECURSIVE_DECOMPOSITION	(1 << 15)
#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)
/* One entry of a key-sorted lookup table with 16-bit keys and values. */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * getmappedvalue16 - binary search of a table sorted by ascending _key.
 *
 * Returns the _value stored for 'character', or 0 when the table is
 * empty or has no entry for it.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings16 *p, *q, *divider;

	/* an empty table has no first/last entry to range-check against */
	if (numElem == 0)
		return (0);

	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key)
			q = divider - 1;
		else if (character > divider->_key)
			p = divider + 1;
		else
			return (divider->_value);
	}
	return (0);
}
604 unicode_recursive_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
610 const u_int16_t
*bmpMappings
;
611 u_int32_t usedLength
;
613 value
= getmappedvalue16(
614 (const unicode_mappings16
*)__CFUniCharDecompositionTable
,
615 __UniCharDecompositionTableLength
, character
);
616 length
= EXTRACT_COUNT(value
);
617 firstChar
= value
& 0x0FFF;
619 bmpMappings
= (length
== 1 ? &theChar
: __CFUniCharMultipleDecompositionTable
+ firstChar
);
622 if (value
& RECURSIVE_DECOMPOSITION
) {
623 usedLength
= unicode_recursive_decompose((u_int16_t
)*bmpMappings
, convertedChars
);
625 --length
; /* Decrement for the first char */
629 convertedChars
+= usedLength
;
632 usedLength
+= length
;
635 *(convertedChars
++) = *(bmpMappings
++);
/* Hangul composition constants: first code point of each range ... */
#define HANGUL_SBASE 0xAC00	/* precomposed syllables */
#define HANGUL_LBASE 0x1100	/* leading consonants */
#define HANGUL_VBASE 0x1161	/* vowels */
#define HANGUL_TBASE 0x11A7	/* trailing consonants (TBASE itself means "none") */

/* ... and the number of code points in each range */
#define HANGUL_SCOUNT 11172
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
652 * unicode_decompose - decompose a composed Unicode char
654 * Composed Unicode characters are forbidden on
655 * HFS Plus volumes. ucs_decompose will convert a
656 * composed character into its correct decomposed
659 * Similar to CFUniCharDecomposeCharacter
662 unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
664 if ((character
>= HANGUL_SBASE
) &&
665 (character
<= (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
668 character
-= HANGUL_SBASE
;
669 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
671 *(convertedChars
++) =
672 character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
673 *(convertedChars
++) =
674 (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
676 *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
679 return (unicode_recursive_decompose(character
, convertedChars
));
/*
 * unicode_combine - generate a precomposed Unicode char
 *
 * Precomposed Unicode characters are required for some volume
 * formats and network protocols.  unicode_combine will combine
 * a decomposed character sequence into a single precomposed
 * (composite) character.
 *
 * Similar to CFUniCharPrecomposeCharacter but unicode_combine
 * also handles Hangul Jamo characters.
 */
695 unicode_combine(u_int16_t base
, u_int16_t combining
)
700 if ((combining
>= HANGUL_VBASE
) && (combining
< (HANGUL_TBASE
+ HANGUL_TCOUNT
))) {
701 /* 2 char Hangul sequences */
702 if ((combining
< (HANGUL_VBASE
+ HANGUL_VCOUNT
)) &&
703 (base
>= HANGUL_LBASE
&& base
< (HANGUL_LBASE
+ HANGUL_LCOUNT
))) {
704 return (HANGUL_SBASE
+
705 ((base
- HANGUL_LBASE
)*(HANGUL_VCOUNT
*HANGUL_TCOUNT
)) +
706 ((combining
- HANGUL_VBASE
)*HANGUL_TCOUNT
));
709 /* 3 char Hangul sequences */
710 if ((combining
> HANGUL_TBASE
) &&
711 (base
>= HANGUL_SBASE
&& base
< (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
712 if ((base
- HANGUL_SBASE
) % HANGUL_TCOUNT
)
715 return (base
+ (combining
- HANGUL_TBASE
));
719 value
= getmappedvalue32(
720 (const unicode_mappings32
*)__CFUniCharPrecompSourceTable
,
721 __CFUniCharPrecompositionTableLength
, combining
);
724 value
= getmappedvalue16(
725 (const unicode_mappings16
*)
726 ((u_int32_t
*)__CFUniCharBMPPrecompDestinationTable
+ (value
& 0xFFFF)),
727 (value
>> 16), base
);