2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
23 * @APPLE_LICENSE_HEADER_END@
27 Includes Unicode 3.2 decomposition code derived from Core Foundation
30 #include <sys/param.h>
31 #include <sys/utfconv.h>
32 #include <sys/errno.h>
33 #include <architecture/byte_order.h>
36 * UTF-8 (Unicode Transformation Format)
38 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
39 * character as a sequence of one to four bytes. Only the shortest form
40 * required to represent the significant Unicode bits is legal.
42 * UTF-8 Multibyte Codes
44 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
45 * -----------------------------------------------------------------------------
46 * 1 7 0x0000 0x007F 0xxxxxxx
47 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
48 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
49 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
50 * -----------------------------------------------------------------------------
/*
 * UNICODE_TO_UTF8_LEN - UTF-8 byte count charged to one UTF-16 unit.
 *
 * Yields 1 for values below 0x0080, 2 for values below 0x0800 and for
 * either half of a surrogate pair (0xD800-0xDFFF), and 3 otherwise.
 * The argument is expanded more than once, so it must not have side
 * effects.
 */
#define UNICODE_TO_UTF8_LEN(c)						\
	((c) >= 0x0800							\
	    ? ((((c) & 0xf800) != 0xd800) ? 3 : 2)			\
	    : (((c) >= 0x0080) ? 2 : 1))
/*
 * Stand-in UTF-16 value substituted for an embedded '\0' by the
 * encoders in this file; the decoder tests for the same value.
 */
57 #define UCS_ALT_NULL 0x2400
/*
 * Surrogate Pair Constants
 *
 * Each half of a pair carries SP_HALF_SHIFT payload bits; the two
 * ranges are adjacent and each spans SP_HALF_MASK + 1 values.
 * SP_HALF_BASE is the offset added when a pair is combined.
 */
#define SP_HALF_SHIFT	10
#define SP_HALF_BASE	(1UL << 16)
#define SP_HALF_MASK	((1UL << SP_HALF_SHIFT) - 1UL)

#define SP_HIGH_FIRST	0xD800UL
#define SP_HIGH_LAST	(SP_HIGH_FIRST + SP_HALF_MASK)
#define SP_LOW_FIRST	(SP_HIGH_LAST + 1UL)
#define SP_LOW_LAST	(SP_LOW_FIRST + SP_HALF_MASK)
70 #include "vfs_utfconvdata.h"
74 * Test for a combining character.
76 * Similar to __CFUniCharIsNonBaseCharacter except that
77 * unicode_combinable also includes Hangul Jamo characters.
/*
 * Tests `character` against __CFUniCharCombiningBitmap.
 *
 * `character` is first compared with 0x0300.  The high byte of
 * `character` then indexes the start of the bitmap to obtain `value`;
 * the bitmap pointer is advanced by (value - 1) * 32 + 256 and bit
 * (character % 8) of byte ((character & 0xFF) / 8) is returned as
 * 1 or 0.
 *
 * NOTE(review): the return type, the braces, the declaration of
 * `value`, and the statements guarded by the 0x0300 test and by the
 * looked-up `value` are not present in this copy of the file - restore
 * them from a pristine source before relying on this block.
 */
80 unicode_combinable(u_int16_t character
)
82 const u_int8_t
*bitmap
= __CFUniCharCombiningBitmap
;
85 if (character
< 0x0300)
88 value
= bitmap
[(character
>> 8) & 0xFF];
93 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
94 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
100 * Test for a precomposed character.
102 * Similar to __CFUniCharIsDecomposableCharacter.
/*
 * Tests `character` against __CFUniCharDecomposableBitmap.
 *
 * `character` is first compared with 0x00C0.  The high byte of
 * `character` then indexes the start of the bitmap to obtain `value`;
 * the bitmap pointer is advanced by (value - 1) * 32 + 256 and bit
 * (character % 8) of byte ((character & 0xFF) / 8) is returned as
 * 1 or 0.
 *
 * NOTE(review): the return type, the declaration of `value`, the
 * closing brace, and the statements guarded by the 0x00C0 test and by
 * the looked-up `value` are not present in this copy of the file -
 * restore them from a pristine source before relying on this block.
 */
105 unicode_decomposeable(u_int16_t character
) {
106 const u_int8_t
*bitmap
= __CFUniCharDecomposableBitmap
;
109 if (character
< 0x00C0)
112 value
= bitmap
[(character
>> 8) & 0xFF];
117 bitmap
= bitmap
+ ((value
- 1) * 32) + 256;
118 return (bitmap
[(character
& 0xFF) / 8] & (1 << (character
% 8)) ? 1 : 0);
/*
 * Forward declarations: both functions are defined further down in
 * this file and are called by the conversion routines before that
 * point.
 */
123 static int unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
);
125 static u_int16_t
unicode_combine(u_int16_t base
, u_int16_t combining
);
/*
 * Lookup table read by utf8_decodestr as utf_extrabytes[byte >> 3],
 * i.e. indexed by the top five bits of a UTF-8 byte.
 *
 *   index  0-15 (0x00-0x7F):  0
 *   index 16-23 (0x80-0xBF): -1
 *   index 24-27 (0xC0-0xDF):  1
 *   index 28-29 (0xE0-0xEF):  2
 *   index 30    (0xF0-0xF7):  3
 *   index 31    (0xF8-0xFF): -1
 *
 * NOTE(review): the element type is plain `char`, whose signedness is
 * implementation-defined; the -1 entries only read back as -1 where
 * `char` is signed.  The closing "};" of the initializer is not
 * present in this copy of the file.
 */
128 char utf_extrabytes
[32] = {
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130 -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1
135 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
138 * If '/' chars are allowed on disk then an alternate
139 * (replacement) char must be provided in altslash.
142 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
/*
 * Accumulates UNICODE_TO_UTF8_LEN() over the UTF-16 units at ucsp.
 *
 * ucslen is a byte count: the loop runs ucslen / 2 times.  Units are
 * passed through NXSwapShort (the swapbytes flag is derived from
 * UTF_REVERSE_ENDIAN).  A '\0' unit is counted as UCS_ALT_NULL, and
 * one branch replaces the unit with altslash, or with '_' when
 * altslash is 0, before it is measured.
 *
 * NOTE(review): the return type, the `flags` parameter, the local
 * declarations, the load of each unit, the condition selecting the
 * altslash replacement (presumably a '/' test, per the header comment)
 * and the return statement are not present in this copy of the file.
 */
145 utf8_encodelen(const u_int16_t
* ucsp
, size_t ucslen
, u_int16_t altslash
,
150 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
153 charcnt
= ucslen
/ 2;
156 while (charcnt
-- > 0) {
160 ucs_ch
= NXSwapShort(ucs_ch
);
162 ucs_ch
= altslash
? altslash
: '_';
163 else if (ucs_ch
== '\0')
164 ucs_ch
= UCS_ALT_NULL
;
166 len
+= UNICODE_TO_UTF8_LEN(ucs_ch
);
174 * utf8_encodestr - Encodes a Unicode string to UTF-8
177 * The resulting UTF-8 string is NULL terminated.
179 * If '/' chars are allowed on disk then an alternate
180 * (replacement) char must be provided in altslash.
183 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
184 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
187 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
188 * EINVAL: Illegal char found; char was replaced by an '_'.
/*
 * Encodes the UTF-16 units at ucsp into UTF-8 at utf8p.
 *
 * Inputs as used by the visible code:
 *   ucslen   - input size in bytes (ucslen / 2 units are consumed)
 *   buflen   - output capacity; bufend is bufstart + buflen
 *   flags    - UTF_REVERSE_ENDIAN selects NXSwapShort on each unit read,
 *              UTF_DECOMPOSED enables decomposition, and nullterm is
 *              set when UTF_NO_NULL_TERM is clear
 *   utf8len  - receives utf8p - bufstart, the number of bytes written
 *
 * Per unit: when decomposition is enabled and unicode_decomposeable()
 * accepts the unit, unicode_decompose() fills sequence[] and the first
 * element becomes the current unit (`extra` holds the count minus 1).
 * A '\0' unit is replaced by UCS_ALT_NULL.  The unit is then written
 * as 1 byte (< 0x0080), 2 bytes (< 0x800), 4 bytes (a high surrogate
 * whose following unit is a low surrogate), or 3 bytes.  Every form
 * first checks the remaining room against bufend and records
 * ENAMETOOLONG in `result` when it does not fit.
 *
 * NOTE(review): the return type, several local declarations, the
 * slash handling, the statements that follow each ENAMETOOLONG
 * assignment, the handling of `extra`/`chp`/`nullterm`, and the return
 * statement are not present in this copy of the file.
 */
191 utf8_encodestr(const u_int16_t
* ucsp
, size_t ucslen
, u_int8_t
* utf8p
,
192 size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
)
197 u_int16_t
* chp
= NULL
;
198 u_int16_t sequence
[8];
201 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
202 int nullterm
= ((flags
& UTF_NO_NULL_TERM
) == 0);
203 int decompose
= (flags
& UTF_DECOMPOSED
);
207 bufend
= bufstart
+ buflen
;
/* ucslen is in bytes; each UTF-16 unit occupies two of them */
210 charcnt
= ucslen
/ 2;
212 while (charcnt
-- > 0) {
217 ucs_ch
= swapbytes
? NXSwapShort(*ucsp
++) : *ucsp
++;
219 if (decompose
&& unicode_decomposeable(ucs_ch
)) {
220 extra
= unicode_decompose(ucs_ch
, sequence
) - 1;
222 ucs_ch
= sequence
[0];
227 /* Slash and NULL are not permitted */
235 } else if (ucs_ch
== '\0') {
236 ucs_ch
= UCS_ALT_NULL
;
/* 1-byte form: needs one free byte */
239 if (ucs_ch
< 0x0080) {
240 if (utf8p
>= bufend
) {
241 result
= ENAMETOOLONG
;
/* 2-byte form: 110xxxxx 10xxxxxx */
246 } else if (ucs_ch
< 0x800) {
247 if ((utf8p
+ 1) >= bufend
) {
248 result
= ENAMETOOLONG
;
251 *utf8p
++ = 0xc0 | (ucs_ch
>> 6);
252 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
255 /* Combine valid surrogate pairs */
256 if (ucs_ch
>= SP_HIGH_FIRST
&& ucs_ch
<= SP_HIGH_LAST
261 ch2
= swapbytes
? NXSwapShort(*ucsp
) : *ucsp
;
262 if (ch2
>= SP_LOW_FIRST
&& ch2
<= SP_LOW_LAST
) {
263 pair
= ((ucs_ch
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
)
264 + (ch2
- SP_LOW_FIRST
) + SP_HALF_BASE
;
265 if ((utf8p
+ 3) >= bufend
) {
266 result
= ENAMETOOLONG
;
/* 4-byte form: 11110xxx followed by three 10xxxxxx bytes */
271 *utf8p
++ = 0xf0 | (pair
>> 18);
272 *utf8p
++ = 0x80 | (0x3f & (pair
>> 12));
273 *utf8p
++ = 0x80 | (0x3f & (pair
>> 6));
274 *utf8p
++ = 0x80 | (0x3f & pair
);
/* 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx */
278 if ((utf8p
+ 2) >= bufend
) {
279 result
= ENAMETOOLONG
;
282 *utf8p
++ = 0xe0 | (ucs_ch
>> 12);
283 *utf8p
++ = 0x80 | (0x3f & (ucs_ch
>> 6));
284 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
/* report how many bytes were produced */
288 *utf8len
= utf8p
- bufstart
;
297 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
300 * The input UTF-8 string does not need to be null terminated
303 * If '/' chars are allowed on disk then an alternate
304 * (replacement) char must be provided in altslash.
307 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
308 * UTF_DECOMPOSED: Unicode output string must be fully decomposed
311 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
312 * EINVAL: Illegal UTF-8 sequence found.
/*
 * Decodes the UTF-8 bytes at utf8p into UTF-16 units at ucsp.
 *
 * Inputs as used by the visible code:
 *   utf8len  - input size in bytes; decoding also stops at the first
 *              '\0' byte
 *   buflen   - output capacity in bytes (bufend is ucsp + buflen bytes)
 *   flags    - UTF_DECOMPOSED, UTF_PRECOMPOSED and UTF_REVERSE_ENDIAN
 *              are read; the last selects NXSwapShort on every unit
 *              stored
 *   ucslen   - receives the number of bytes written to the output
 *
 * For a multi-byte lead byte, utf_extrabytes[byte >> 3] gives the
 * number of continuation bytes; each continuation byte must have the
 * form 10xxxxxx ((byte >> 6) == 2).  Three-byte results are tested
 * against 0xFFFE and 0xFFFF.  Four-byte results are split into a high
 * and a low surrogate, each range-checked before it is stored.
 *
 * A decoded unit accepted by unicode_decomposeable() is expanded with
 * unicode_decompose() and every element is stored.  Otherwise, when
 * precomposition is requested and at least one unit has already been
 * stored, a unit accepted by unicode_combinable() is combined with the
 * previously stored unit through unicode_combine().  The unit is also
 * compared with UCS_ALT_NULL and with altslash before the final store.
 *
 * NOTE(review): the return type, several local declarations, the case
 * labels, the targets of the validity tests, the output bounds checks,
 * the handling of `composite`, and the return statement are not
 * present in this copy of the file.  The test guarding the
 * decomposition branch is also incomplete here (presumably it checks
 * `decompose` first - confirm against a pristine source).
 */
315 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
,
316 size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
)
323 int decompose
, precompose
, swapbytes
;
325 decompose
= (flags
& UTF_DECOMPOSED
);
326 precompose
= (flags
& UTF_PRECOMPOSED
);
327 swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
/* buflen counts bytes, so the end pointer is computed on a byte pointer */
330 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
332 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
336 /* check for ascii */
338 ucs_ch
= byte
; /* 1st byte */
/* the top five bits of the lead byte select the continuation count */
341 int extrabytes
= utf_extrabytes
[byte
>> 3];
343 if (utf8len
< extrabytes
)
345 utf8len
-= extrabytes
;
347 switch (extrabytes
) {
349 ch
= byte
; ch
<<= 6; /* 1st byte */
350 byte
= *utf8p
++; /* 2nd byte */
351 if ((byte
>> 6) != 2)
360 ch
= byte
; ch
<<= 6; /* 1st byte */
361 byte
= *utf8p
++; /* 2nd byte */
362 if ((byte
>> 6) != 2)
364 ch
+= byte
; ch
<<= 6;
365 byte
= *utf8p
++; /* 3rd byte */
366 if ((byte
>> 6) != 2)
375 if (ch
== 0xFFFE || ch
== 0xFFFF)
381 ch
= byte
; ch
<<= 6; /* 1st byte */
382 byte
= *utf8p
++; /* 2nd byte */
383 if ((byte
>> 6) != 2)
385 ch
+= byte
; ch
<<= 6;
386 byte
= *utf8p
++; /* 3rd byte */
387 if ((byte
>> 6) != 2)
389 ch
+= byte
; ch
<<= 6;
390 byte
= *utf8p
++; /* 4th byte */
391 if ((byte
>> 6) != 2)
/*
 * 0x03C82080 == (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80: the
 * marker bits of a 4-byte sequence as accumulated by the shifts above.
 * SP_HALF_BASE is removed as well, leaving the 20-bit surrogate payload.
 */
394 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
395 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
396 if (ucs_ch
< SP_HIGH_FIRST
|| ucs_ch
> SP_HIGH_LAST
)
398 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
401 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
402 if (ucs_ch
< SP_LOW_FIRST
|| ucs_ch
> SP_LOW_LAST
)
404 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
410 if (unicode_decomposeable(ucs_ch
)) {
411 u_int16_t sequence
[8];
414 count
= unicode_decompose(ucs_ch
, sequence
);
416 for (i
= 0; i
< count
; ++i
) {
417 ucs_ch
= sequence
[i
];
418 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
424 } else if (precompose
&& (ucsp
!= bufstart
)) {
425 u_int16_t composite
, base
;
427 if (unicode_combinable(ucs_ch
)) {
/* the candidate base is the unit most recently written to the output */
428 base
= swapbytes
? NXSwapShort(*(ucsp
- 1)) : *(ucsp
- 1);
429 composite
= unicode_combine(base
, ucs_ch
);
436 if (ucs_ch
== UCS_ALT_NULL
)
439 if (ucs_ch
== altslash
)
442 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
/* output length is reported in bytes, not UTF-16 units */
446 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
455 result
= ENAMETOOLONG
;
461 * Unicode 3.2 decomposition code (derived from Core Foundation)
/*
 * Table entry type searched by getmappedvalue32(), which reads its
 * `_key` and `_value` members.
 *
 * NOTE(review): the opening of this type definition and its member
 * declarations are not present in this copy of the file.
 */
467 } unicode_mappings32
;
/*
 * Looks `character` up in theTable (numElem entries) by `_key` and
 * returns the matching entry's `_value`.
 *
 * A character below the first key or above the last key is rejected up
 * front.  The search then halves the [p, q] range around `divider`,
 * which presumes the table is sorted by ascending `_key`.
 *
 * NOTE(review): the `character` parameter, the opening brace, the
 * initialisation of p and q, the loop header, and the values returned
 * on the out-of-range and not-found paths are not present in this copy
 * of the file.
 */
469 static inline u_int32_t
470 getmappedvalue32(const unicode_mappings32
*theTable
, u_int32_t numElem
,
473 const unicode_mappings32
*p
, *q
, *divider
;
475 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
481 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
482 if (character
< divider
->_key
) { q
= divider
- 1; }
483 else if (character
> divider
->_key
) { p
= divider
+ 1; }
484 else { return (divider
->_value
); }
/*
 * Fields of a 16-bit decomposition-table value:
 *   bit 15      - RECURSIVE_DECOMPOSITION flag
 *   bits 12-14  - element count, read with EXTRACT_COUNT()
 */
#define RECURSIVE_DECOMPOSITION	0x8000
#define EXTRACT_COUNT(value)	(((value) & 0x7000) >> 12)
/*
 * Table entry type searched by getmappedvalue16(), which reads its
 * `_key` and `_value` members.
 *
 * NOTE(review): the opening of this type definition and its member
 * declarations are not present in this copy of the file.
 */
495 } unicode_mappings16
;
/*
 * Looks `character` up in theTable (numElem entries) by `_key` and
 * returns the matching entry's `_value`.
 *
 * A character below the first key or above the last key is rejected up
 * front.  The search then halves the [p, q] range around `divider`,
 * which presumes the table is sorted by ascending `_key`.
 *
 * NOTE(review): the `character` parameter, the opening brace, the
 * initialisation of p and q, the loop header, the bodies of the two
 * comparison branches, and the values returned on the out-of-range and
 * not-found paths are not present in this copy of the file.
 */
497 static inline u_int16_t
498 getmappedvalue16(const unicode_mappings16
*theTable
, u_int32_t numElem
,
501 const unicode_mappings16
*p
, *q
, *divider
;
503 if ((character
< theTable
[0]._key
) || (character
> theTable
[numElem
-1]._key
))
509 divider
= p
+ ((q
- p
) >> 1); /* divide by 2 */
510 if (character
< divider
->_key
)
512 else if (character
> divider
->_key
)
515 return (divider
->_value
);
/*
 * Table-driven decomposition of `character` into convertedChars.
 *
 * The character is looked up in __CFUniCharDecompositionTable with
 * getmappedvalue16().  From the returned value, EXTRACT_COUNT() gives
 * the element count and the low 12 bits give `firstChar`.  When the
 * count is 1 the single result is taken from `theChar`; otherwise
 * `firstChar` is an offset into __CFUniCharMultipleDecompositionTable.
 *
 * When RECURSIVE_DECOMPOSITION is set, the first mapped element is
 * itself decomposed by a recursive call, the output pointer is
 * advanced by the count that call returned, and `length` is reduced by
 * one for that element.  The remaining elements are then copied to the
 * output and `usedLength` accumulates the total.
 *
 * NOTE(review): the return type, several local declarations, the
 * assignment to `theChar`, the initialisation of `usedLength`, the
 * statements between the recursive call and the pointer advance, the
 * copy loop header and the return statement are not present in this
 * copy of the file.
 */
522 unicode_recursive_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
528 const u_int16_t
*bmpMappings
;
529 u_int32_t usedLength
;
531 value
= getmappedvalue16(
532 (const unicode_mappings16
*)__CFUniCharDecompositionTable
,
533 __UniCharDecompositionTableLength
, character
);
534 length
= EXTRACT_COUNT(value
);
535 firstChar
= value
& 0x0FFF;
537 bmpMappings
= (length
== 1 ? &theChar
: __CFUniCharMultipleDecompositionTable
+ firstChar
);
540 if (value
& RECURSIVE_DECOMPOSITION
) {
541 usedLength
= unicode_recursive_decompose((u_int16_t
)*bmpMappings
, convertedChars
);
543 --length
; /* Decrement for the first char */
547 convertedChars
+= usedLength
;
550 usedLength
+= length
;
553 *(convertedChars
++) = *(bmpMappings
++);
/*
 * Constants for the arithmetic Hangul (de)composition used by
 * unicode_decompose() and unicode_combine().
 *
 * A syllable offset s = syllable - HANGUL_SBASE splits into
 *   L index: s / HANGUL_NCOUNT
 *   V index: (s % HANGUL_NCOUNT) / HANGUL_TCOUNT
 *   T index: s % HANGUL_TCOUNT   (0 means no trailing element)
 * and each index is added to the matching *BASE value.
 */
#define HANGUL_SBASE	0xAC00
#define HANGUL_LBASE	0x1100
#define HANGUL_VBASE	0x1161
#define HANGUL_TBASE	0x11A7

#define HANGUL_LCOUNT	19
#define HANGUL_VCOUNT	21
#define HANGUL_TCOUNT	28
#define HANGUL_NCOUNT	(HANGUL_VCOUNT * HANGUL_TCOUNT)
#define HANGUL_SCOUNT	(HANGUL_LCOUNT * HANGUL_NCOUNT)
570 * unicode_decompose - decompose a composed Unicode char
572 * Composed Unicode characters are forbidden on
573 * HFS Plus volumes. unicode_decompose will convert a
574 * composed character into its correct decomposed sequence.
577 * Similar to CFUniCharDecomposeCharacter
580 unicode_decompose(u_int16_t character
, u_int16_t
*convertedChars
)
582 if ((character
>= HANGUL_SBASE
) &&
583 (character
<= (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
586 character
-= HANGUL_SBASE
;
587 length
= (character
% HANGUL_TCOUNT
? 3 : 2);
589 *(convertedChars
++) =
590 character
/ HANGUL_NCOUNT
+ HANGUL_LBASE
;
591 *(convertedChars
++) =
592 (character
% HANGUL_NCOUNT
) / HANGUL_TCOUNT
+ HANGUL_VBASE
;
594 *convertedChars
= (character
% HANGUL_TCOUNT
) + HANGUL_TBASE
;
597 return (unicode_recursive_decompose(character
, convertedChars
));
602 * unicode_combine - generate a precomposed Unicode char
604 * Precomposed Unicode characters are required for some volume
605 * formats and network protocols. unicode_combine will combine
606 * a decomposed character sequence into a single precomposed
607 * (composite) character.
609 * Similar to CFUniCharPrecomposeCharacter but unicode_combine
610 * also handles Hangul Jamo characters.
613 unicode_combine(u_int16_t base
, u_int16_t combining
)
618 if ((combining
>= HANGUL_VBASE
) && (combining
< (HANGUL_TBASE
+ HANGUL_TCOUNT
))) {
619 /* 2 char Hangul sequences */
620 if ((combining
< (HANGUL_VBASE
+ HANGUL_VCOUNT
)) &&
621 (base
>= HANGUL_LBASE
&& base
< (HANGUL_LBASE
+ HANGUL_LCOUNT
))) {
622 return (HANGUL_SBASE
+
623 ((base
- HANGUL_LBASE
)*(HANGUL_VCOUNT
*HANGUL_TCOUNT
)) +
624 ((combining
- HANGUL_VBASE
)*HANGUL_TCOUNT
));
627 /* 3 char Hangul sequences */
628 if ((combining
> HANGUL_TBASE
) &&
629 (base
>= HANGUL_SBASE
&& base
< (HANGUL_SBASE
+ HANGUL_SCOUNT
))) {
630 if ((base
- HANGUL_SBASE
) % HANGUL_TCOUNT
)
633 return (base
+ (combining
- HANGUL_TBASE
));
637 value
= getmappedvalue32(
638 (const unicode_mappings32
*)__CFUniCharPrecompSourceTable
,
639 __CFUniCharPrecompositionTableLength
, combining
);
642 value
= getmappedvalue16(
643 (const unicode_mappings16
*)
644 ((u_int32_t
*)__CFUniCharBMPPrecompDestinationTable
+ (value
& 0xFFFF)),
645 (value
>> 16), base
);