2  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 
   6  * This file contains Original Code and/or Modifications of Original Code 
   7  * as defined in and that are subject to the Apple Public Source License 
   8  * Version 2.0 (the 'License'). You may not use this file except in 
   9  * compliance with the License. The rights granted to you under the License 
  10  * may not be used to create, or enable the creation or redistribution of, 
  11  * unlawful or unlicensed copies of an Apple operating system, or to 
  12  * circumvent, violate, or enable the circumvention or violation of, any 
  13  * terms of an Apple operating system software license agreement. 
  15  * Please obtain a copy of the License at 
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file. 
  18  * The Original Code and all software distributed under the License are 
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
  23  * Please see the License for the specific language governing rights and 
  24  * limitations under the License. 
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 
  30         Includes Unicode 3.2 decomposition code derived from Core Foundation 
  33 #include <sys/param.h> 
  34 #include <sys/utfconv.h> 
  35 #include <sys/errno.h> 
  36 #include <sys/malloc.h> 
  37 #include <libkern/OSByteOrder.h> 
  40  * UTF-8 (Unicode Transformation Format) 
  42  * UTF-8 is the Unicode Transformation Format that serializes a Unicode 
  43  * character as a sequence of one to four bytes. Only the shortest form 
  44  * required to represent the significant Unicode bits is legal. 
  46  * UTF-8 Multibyte Codes 
  48  * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary) 
  49  * ----------------------------------------------------------------------------- 
  50  *   1       7       0x0000        0x007F    0xxxxxxx 
  51  *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx 
  52  *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx 
  53  *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
  54  * ----------------------------------------------------------------------------- 
  58 #define UNICODE_TO_UTF8_LEN(c)  \ 
  59         ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3))) 
  61 #define UCS_ALT_NULL    0x2400 
  63 /* Surrogate Pair Constants */ 
  64 #define SP_HALF_SHIFT   10 
  65 #define SP_HALF_BASE    0x0010000UL 
  66 #define SP_HALF_MASK    0x3FFUL 
  68 #define SP_HIGH_FIRST   0xD800UL 
  69 #define SP_HIGH_LAST    0xDBFFUL 
  70 #define SP_LOW_FIRST    0xDC00UL 
  71 #define SP_LOW_LAST     0xDFFFUL 
  74 #include "vfs_utfconvdata.h" 
  78  * Test for a combining character. 
  80  * Similar to __CFUniCharIsNonBaseCharacter except that 
  81  * unicode_combinable also includes Hangul Jamo characters. 
  84 unicode_combinable(u_int16_t character
) 
  86         const u_int8_t 
*bitmap 
= __CFUniCharCombiningBitmap
; 
  89         if (character 
< 0x0300) 
  92         value 
= bitmap
[(character 
>> 8) & 0xFF]; 
  97                 bitmap 
= bitmap 
+ ((value 
- 1) * 32) + 256; 
  98                 return (bitmap
[(character 
& 0xFF) / 8] & (1 << (character 
% 8)) ? 1 : 0); 
 104  * Test for a precomposed character. 
 106  * Similar to __CFUniCharIsDecomposableCharacter. 
 109 unicode_decomposeable(u_int16_t character
) { 
 110         const u_int8_t 
*bitmap 
= __CFUniCharDecomposableBitmap
; 
 113         if (character 
< 0x00C0) 
 116         value 
= bitmap
[(character 
>> 8) & 0xFF]; 
 121                 bitmap 
= bitmap 
+ ((value 
- 1) * 32) + 256; 
 122                 return (bitmap
[(character 
& 0xFF) / 8] & (1 << (character 
% 8)) ? 1 : 0); 
 129  * Get the combining class. 
 131  * Similar to CFUniCharGetCombiningPropertyForCharacter. 
 133 static inline u_int8_t
 
 134 get_combining_class(u_int16_t character
) { 
 135         const u_int8_t 
*bitmap 
= __CFUniCharCombiningPropertyBitmap
; 
 137         u_int8_t value 
= bitmap
[(character 
>> 8)]; 
 140                 bitmap 
= bitmap 
+ (value 
* 256); 
 141                 return bitmap
[character 
% 256]; 
 147 static int unicode_decompose(u_int16_t character
, u_int16_t 
*convertedChars
); 
 149 static u_int16_t 
unicode_combine(u_int16_t base
, u_int16_t combining
); 
 151 static void priortysort(u_int16_t
* characters
, int count
); 
 153 static u_int16_t  
ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
); 
 155 static u_int16_t  
sfm_to_ucs(u_int16_t ucs_ch
); 
 158 char utf_extrabytes
[32] = { 
 159          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 
 160         -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1 
 163 const char hexdigits
[16] = { 
 164          '0',  '1',  '2',  '3',  '4',  '5',  '6', '7', 
 165          '8',  '9',  'A',  'B',  'C',  'D',  'E', 'F' 
 169  * utf8_encodelen - Calculate the UTF-8 encoding length 
 171  * This function takes a Unicode input string, ucsp, of ucslen bytes 
 172  * and calculates the size of the UTF-8 output in bytes (not including 
 173  * a NULL termination byte). The string must reside in kernel memory. 
 175  * If '/' chars are possible in the Unicode input then an alternate 
 176  * (replacement) char should be provided in altslash. 
 179  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime 
 181  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian 
 183  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian 
 185  *    UTF_DECOMPOSED:  generate fully decomposed output 
 187  *    UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it 
 193 utf8_encodelen(const u_int16_t 
* ucsp
, size_t ucslen
, u_int16_t altslash
, int flags
) 
 196         u_int16_t 
* chp 
= NULL
; 
 197         u_int16_t sequence
[8]; 
 200         int swapbytes 
= (flags 
& UTF_REVERSE_ENDIAN
); 
 201         int decompose 
= (flags 
& UTF_DECOMPOSED
); 
 204         charcnt 
= ucslen 
/ 2; 
 207         while (charcnt
-- > 0) { 
 214                                 ucs_ch 
= OSSwapInt16(ucs_ch
); 
 217                                 ucs_ch 
= altslash 
? altslash 
: '_'; 
 218                         } else if (ucs_ch 
== '\0') { 
 219                                 ucs_ch 
= UCS_ALT_NULL
; 
 220                         } else if (decompose 
&& unicode_decomposeable(ucs_ch
)) { 
 221                                 extra 
= unicode_decompose(ucs_ch
, sequence
) - 1; 
 223                                 ucs_ch 
= sequence
[0]; 
 227                 len 
+= UNICODE_TO_UTF8_LEN(ucs_ch
); 
 235  * utf8_encodestr - Encodes a Unicode string to UTF-8 
 238  *    The resulting UTF-8 string is NULL terminated. 
 240  *    If '/' chars are allowed on disk then an alternate 
 241  *    (replacement) char must be provided in altslash. 
 244  *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime 
 246  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian 
 248  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian 
 250  *    UTF_DECOMPOSED:  generate fully decomposed output 
 252  *    UTF_NO_NULL_TERM:  don't add NULL termination to UTF-8 output 
 255  *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded 
 257  *    EINVAL: Illegal char found; char was replaced by an '_'. 
 260 utf8_encodestr(const u_int16_t 
* ucsp
, size_t ucslen
, u_int8_t 
* utf8p
, 
 261                size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
) 
 266         u_int16_t 
* chp 
= NULL
; 
 267         u_int16_t sequence
[8]; 
 270         int swapbytes 
= (flags 
& UTF_REVERSE_ENDIAN
); 
 271         int nullterm  
= ((flags 
& UTF_NO_NULL_TERM
) == 0); 
 272         int decompose 
= (flags 
& UTF_DECOMPOSED
); 
 273         int sfmconv 
= (flags 
& UTF_SFM_CONVERSIONS
); 
 277         bufend 
= bufstart 
+ buflen
; 
 280         charcnt 
= ucslen 
/ 2; 
 282         while (charcnt
-- > 0) { 
 287                         ucs_ch 
= swapbytes 
? OSSwapInt16(*ucsp
++) : *ucsp
++; 
 289                         if (decompose 
&& unicode_decomposeable(ucs_ch
)) { 
 290                                 extra 
= unicode_decompose(ucs_ch
, sequence
) - 1; 
 292                                 ucs_ch 
= sequence
[0]; 
 297                 /* Slash and NULL are not permitted */ 
 305                 } else if (ucs_ch 
== '\0') { 
 306                         ucs_ch 
= UCS_ALT_NULL
; 
 309                 if (ucs_ch 
< 0x0080) { 
 310                         if (utf8p 
>= bufend
) { 
 311                                 result 
= ENAMETOOLONG
; 
 316                 } else if (ucs_ch 
< 0x800) { 
 317                         if ((utf8p 
+ 1) >= bufend
) { 
 318                                 result 
= ENAMETOOLONG
; 
 321                         *utf8p
++ = 0xc0 | (ucs_ch 
>> 6); 
 322                         *utf8p
++ = 0x80 | (0x3f & ucs_ch
); 
 325                         /* These chars never valid Unicode. */ 
 326                         if (ucs_ch 
== 0xFFFE || ucs_ch 
== 0xFFFF) { 
 331                         /* Combine valid surrogate pairs */ 
 332                         if (ucs_ch 
>= SP_HIGH_FIRST 
&& ucs_ch 
<= SP_HIGH_LAST
 
 337                                 ch2 
= swapbytes 
? OSSwapInt16(*ucsp
) : *ucsp
; 
 338                                 if (ch2 
>= SP_LOW_FIRST 
&& ch2 
<= SP_LOW_LAST
) { 
 339                                         pair 
= ((ucs_ch 
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
) 
 340                                                 + (ch2 
- SP_LOW_FIRST
) + SP_HALF_BASE
; 
 341                                         if ((utf8p 
+ 3) >= bufend
) { 
 342                                                 result 
= ENAMETOOLONG
; 
 347                                         *utf8p
++ = 0xf0 | (pair 
>> 18); 
 348                                         *utf8p
++ = 0x80 | (0x3f & (pair 
>> 12)); 
 349                                         *utf8p
++ = 0x80 | (0x3f & (pair 
>> 6)); 
 350                                         *utf8p
++ = 0x80 | (0x3f & pair
); 
 353                         } else if (sfmconv
) { 
 354                                 ucs_ch 
= sfm_to_ucs(ucs_ch
); 
 355                                 if (ucs_ch 
< 0x0080) { 
 356                                         if (utf8p 
>= bufend
) { 
 357                                                 result 
= ENAMETOOLONG
; 
 364                         if ((utf8p 
+ 2) >= bufend
) { 
 365                                 result 
= ENAMETOOLONG
; 
 368                         *utf8p
++ = 0xe0 | (ucs_ch 
>> 12); 
 369                         *utf8p
++ = 0x80 | (0x3f & (ucs_ch 
>> 6)); 
 370                         *utf8p
++ = 0x80 | (0x3f & ucs_ch
); 
 374         *utf8len 
= utf8p 
- bufstart
; 
 383  * utf8_decodestr - Decodes a UTF-8 string back to Unicode 
 386  *    The input UTF-8 string does not need to be null terminated 
 389  *    If '/' chars are allowed on disk then an alternate 
 390  *    (replacement) char must be provided in altslash. 
 393  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime 
 395  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian 
 397  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian 
 399  *    UTF_DECOMPOSED:  generate fully decomposed output (NFD) 
 401  *    UTF_PRECOMPOSED:  generate precomposed output (NFC) 
 403  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input 
 406  *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded. 
 408  *    EINVAL: Illegal UTF-8 sequence found. 
 411 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
, 
 412                size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
) 
 420         int decompose
, precompose
, swapbytes
, escaping
; 
 424         decompose  
= (flags 
& UTF_DECOMPOSED
); 
 425         precompose 
= (flags 
& UTF_PRECOMPOSED
); 
 426         swapbytes  
= (flags 
& UTF_REVERSE_ENDIAN
); 
 427         escaping   
= (flags 
& UTF_ESCAPE_ILLEGAL
); 
 428         sfmconv    
= (flags 
& UTF_SFM_CONVERSIONS
); 
 431         bufend 
= (u_int16_t 
*)((u_int8_t 
*)ucsp 
+ buflen
); 
 433         while (utf8len
-- > 0 && (byte 
= *utf8p
++) != '\0') { 
 437                 /* check for ascii */ 
 439                         ucs_ch 
= sfmconv 
? ucs_to_sfm(byte
, utf8len 
== 0) : byte
; 
 443                         extrabytes 
= utf_extrabytes
[byte 
>> 3]; 
 444                         if ((extrabytes 
< 0) || ((int)utf8len 
< extrabytes
)) { 
 447                         utf8len 
-= extrabytes
; 
 449                         switch (extrabytes
) { 
 451                                 ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 452                                 byte 
= *utf8p
++;       /* 2nd byte */ 
 453                                 if ((byte 
>> 6) != 2) 
 462                                 ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 463                                 byte 
= *utf8p
++;       /* 2nd byte */ 
 464                                 if ((byte 
>> 6) != 2) 
 466                                 ch 
+= byte
; ch 
<<= 6; 
 467                                 byte 
= *utf8p
++;       /* 3rd byte */ 
 468                                 if ((byte 
>> 6) != 2) 
 477                                         if (ch 
== 0xFFFE || ch 
== 0xFFFF) 
 483                                 ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 484                                 byte 
= *utf8p
++;       /* 2nd byte */ 
 485                                 if ((byte 
>> 6) != 2) 
 487                                 ch 
+= byte
; ch 
<<= 6; 
 488                                 byte 
= *utf8p
++;       /* 3rd byte */ 
 489                                 if ((byte 
>> 6) != 2) 
 491                                 ch 
+= byte
; ch 
<<= 6; 
 492                                 byte 
= *utf8p
++;       /* 4th byte */ 
 493                                 if ((byte 
>> 6) != 2) 
 496                                 ch 
-= 0x03C82080UL 
+ SP_HALF_BASE
; 
 497                                 ucs_ch 
= (ch 
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
; 
 498                                 if (ucs_ch 
< SP_HIGH_FIRST 
|| ucs_ch 
> SP_HIGH_LAST
) 
 500                                 *ucsp
++ = swapbytes 
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
; 
 503                                 ucs_ch 
= (ch 
& SP_HALF_MASK
) + SP_LOW_FIRST
; 
 504                                 if (ucs_ch 
< SP_LOW_FIRST 
|| ucs_ch 
> SP_LOW_LAST
) { 
 508                                 *ucsp
++ = swapbytes 
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
; 
 515                                 if (unicode_decomposeable(ucs_ch
)) { 
 516                                         u_int16_t sequence
[8]; 
 519                                         /* Before decomposing a new unicode character, sort  
 520                                          * previous combining characters, if any, and reset 
 523                                         if (combcharcnt 
> 1) { 
 524                                                 priortysort(ucsp 
- combcharcnt
, combcharcnt
); 
 528                                         count 
= unicode_decompose(ucs_ch
, sequence
); 
 529                                         for (i 
= 0; i 
< count
; ++i
) { 
 530                                                 ucs_ch 
= sequence
[i
]; 
 531                                                 *ucsp
++ = swapbytes 
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
; 
 535                                         combcharcnt 
+= count 
- 1; 
 538                         } else if (precompose 
&& (ucsp 
!= bufstart
)) { 
 539                                 u_int16_t composite
, base
; 
 541                                 if (unicode_combinable(ucs_ch
)) { 
 542                                         base 
= swapbytes 
? OSSwapInt16(*(ucsp 
- 1)) : *(ucsp 
- 1); 
 543                                         composite 
= unicode_combine(base
, ucs_ch
); 
 550                         if (ucs_ch 
== UCS_ALT_NULL
) 
 553                 if (ucs_ch 
== altslash
) 
 557                  * Make multiple combining character sequences canonical 
 559                 if (unicode_combinable(ucs_ch
)) { 
 560                         ++combcharcnt
;   /* start tracking a run */ 
 561                 } else if (combcharcnt
) { 
 562                         if (combcharcnt 
> 1) { 
 563                                 priortysort(ucsp 
- combcharcnt
, combcharcnt
); 
 565                         combcharcnt 
= 0;  /* start over */ 
 568                 *ucsp
++ = swapbytes 
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
; 
 572                  * Escape illegal UTF-8 into something legal. 
 588                         utf8len 
+= extrabytes
; 
 591                 if ((ucsp 
+ 2) >= bufend
) 
 595                 *ucsp
++ = swapbytes 
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
; 
 596                 ucs_ch 
=  hexdigits
[byte 
>> 4]; 
 597                 *ucsp
++ = swapbytes 
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
; 
 598                 ucs_ch 
=  hexdigits
[byte 
& 0x0F]; 
 599                 *ucsp
++ = swapbytes 
? OSSwapInt16(ucs_ch
) : (u_int16_t
)ucs_ch
; 
 602          * Make a previous combining sequence canonical 
 604         if (combcharcnt 
> 1) { 
 605                 priortysort(ucsp 
- combcharcnt
, combcharcnt
); 
 608         *ucslen 
= (u_int8_t
*)ucsp 
- (u_int8_t
*)bufstart
; 
 613         result 
= ENAMETOOLONG
; 
 619  * utf8_validatestr - Check for a valid UTF-8 string. 
 622 utf8_validatestr(const u_int8_t
* utf8p
, size_t utf8len
) 
 629         while (utf8len
-- > 0 && (byte 
= *utf8p
++) != '\0') { 
 631                         continue;  /* plain ascii */ 
 633                 extrabytes 
= utf_extrabytes
[byte 
>> 3]; 
 635                 if (utf8len 
< extrabytes
) 
 637                 utf8len 
-= extrabytes
; 
 639                 switch (extrabytes
) { 
 641                         ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 642                         byte 
= *utf8p
++;       /* 2nd byte */ 
 643                         if ((byte 
>> 6) != 2) 
 651                         ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 652                         byte 
= *utf8p
++;       /* 2nd byte */ 
 653                         if ((byte 
>> 6) != 2) 
 655                         ch 
+= byte
; ch 
<<= 6; 
 656                         byte 
= *utf8p
++;       /* 3rd byte */ 
 657                         if ((byte 
>> 6) != 2) 
 666                                 if (ch 
== 0xFFFE || ch 
== 0xFFFF) 
 671                         ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 672                         byte 
= *utf8p
++;       /* 2nd byte */ 
 673                         if ((byte 
>> 6) != 2) 
 675                         ch 
+= byte
; ch 
<<= 6; 
 676                         byte 
= *utf8p
++;       /* 3rd byte */ 
 677                         if ((byte 
>> 6) != 2) 
 679                         ch 
+= byte
; ch 
<<= 6; 
 680                         byte 
= *utf8p
++;       /* 4th byte */ 
 681                         if ((byte 
>> 6) != 2) 
 684                         ch 
-= 0x03C82080UL 
+ SP_HALF_BASE
; 
 685                         ucs_ch 
= (ch 
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
; 
 686                         if (ucs_ch 
< SP_HIGH_FIRST 
|| ucs_ch 
> SP_HIGH_LAST
) 
 688                         ucs_ch 
= (ch 
& SP_HALF_MASK
) + SP_LOW_FIRST
; 
 689                         if (ucs_ch 
< SP_LOW_FIRST 
|| ucs_ch 
> SP_LOW_LAST
) 
 703  * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD) 
 705  * This function takes an UTF-8 input string, instr, of inlen bytes 
 706  * and produces normalized UTF-8 output into a buffer of buflen bytes 
 707  * pointed to by outstr. The size of the output in bytes (not including 
 708  * a NULL termination byte) is returned in outlen. In-place conversions 
 709  * are not supported (i.e. instr != outstr). 
 712  *    UTF_DECOMPOSED:  output string will be fully decomposed (NFD) 
 714  *    UTF_PRECOMPOSED:  output string will be precomposed (NFC) 
 716  *    UTF_NO_NULL_TERM:  do not add null termination to output string 
 718  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input 
 721  *    ENAMETOOLONG:  output did not fit or input exceeded MAXPATHLEN bytes 
 723  *    EINVAL:  illegal UTF-8 sequence encountered or invalid flags 
 726 utf8_normalizestr(const u_int8_t
* instr
, size_t inlen
, u_int8_t
* outstr
, 
 727                   size_t *outlen
, size_t buflen
, int flags
) 
 729         u_int16_t unicodebuf
[32]; 
 730         u_int16_t
* unistr 
= NULL
; 
 731         size_t unicode_bytes
; 
 734         u_int8_t 
*outbufstart
, *outbufend
; 
 735         const u_int8_t 
*inbufstart
; 
 737         int decompose
, precompose
; 
 740         if (flags 
& ~(UTF_DECOMPOSED 
| UTF_PRECOMPOSED 
| UTF_NO_NULL_TERM 
| UTF_ESCAPE_ILLEGAL
)) { 
 743         decompose 
= (flags 
& UTF_DECOMPOSED
); 
 744         precompose 
= (flags 
& UTF_PRECOMPOSED
); 
 745         if ((decompose 
&& precompose
) || (!decompose 
&& !precompose
)) { 
 748         outbufstart 
= outstr
; 
 749         outbufend 
= outbufstart 
+ buflen
; 
 753         while (inlen
-- > 0 && (byte 
= *instr
++) != '\0') { 
 754                 if (outstr 
>= outbufend
) { 
 755                         result 
= ENAMETOOLONG
; 
 761                 /* ASCII is already normalized. */ 
 765         *outlen 
= outstr 
- outbufstart
; 
 766         if (((flags 
& UTF_NO_NULL_TERM
) == 0)) { 
 767                 if (outstr 
< outbufend
) 
 770                         result 
= ENAMETOOLONG
; 
 776          * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr 
 777          * functions to perform the normalization.  Since this will 
 778          * presumably be used to normalize filenames in the back-end 
 779          * (on disk or over-the-wire), it should be fast enough. 
 783         /* Make sure the input size is reasonable. */ 
 784         if (inbuflen 
> MAXPATHLEN
) { 
 785                 result 
= ENAMETOOLONG
; 
 789          * Compute worst case Unicode buffer size. 
 791          * For pre-composed output, every UTF-8 input byte will be at 
 792          * most 2 Unicode bytes.  For decomposed output, 2 UTF-8 bytes 
 793          * (smallest composite char sequence) may yield 6 Unicode bytes 
 794          * (1 base char + 2 combining chars). 
 796         unicode_bytes 
= precompose 
? (inbuflen 
* 2) : (inbuflen 
* 3); 
 798         if (unicode_bytes 
<= sizeof(unicodebuf
)) 
 799                 unistr 
= &unicodebuf
[0]; 
 801                 MALLOC(unistr
, u_int16_t 
*, unicode_bytes
, M_TEMP
, M_WAITOK
); 
 803         /* Normalize the string. */ 
 804         result 
= utf8_decodestr(inbufstart
, inbuflen
, unistr
, &unicode_bytes
, 
 805                                 unicode_bytes
, 0, flags 
& ~UTF_NO_NULL_TERM
); 
 807                 /* Put results back into UTF-8. */ 
 808                 result 
= utf8_encodestr(unistr
, unicode_bytes
, outbufstart
, 
 809                                         &uft8_bytes
, buflen
, 0, UTF_NO_NULL_TERM
); 
 810                 outstr 
= outbufstart 
+ uft8_bytes
; 
 812         if (unistr 
&& unistr 
!= &unicodebuf
[0]) { 
 813                 FREE(unistr
, M_TEMP
); 
 820   * Unicode 3.2 decomposition code (derived from Core Foundation) 
 826 } unicode_mappings32
; 
 828 static inline u_int32_t
 
 829 getmappedvalue32(const unicode_mappings32 
*theTable
, u_int32_t numElem
, 
 832         const unicode_mappings32 
*p
, *q
, *divider
; 
 834         if ((character 
< theTable
[0]._key
) || (character 
> theTable
[numElem
-1]._key
)) 
 840                 divider 
= p 
+ ((q 
- p
) >> 1);   /* divide by 2 */ 
 841                 if (character 
< divider
->_key
) { q 
= divider 
- 1; } 
 842                 else if (character 
> divider
->_key
) { p 
= divider 
+ 1; } 
 843                 else { return (divider
->_value
); } 
 848 #define RECURSIVE_DECOMPOSITION (1 << 15) 
 849 #define EXTRACT_COUNT(value)    (((value) >> 12) & 0x0007) 
 854 } unicode_mappings16
; 
 856 static inline u_int16_t
 
 857 getmappedvalue16(const unicode_mappings16 
*theTable
, u_int32_t numElem
, 
 860         const unicode_mappings16 
*p
, *q
, *divider
; 
 862         if ((character 
< theTable
[0]._key
) || (character 
> theTable
[numElem
-1]._key
)) 
 868                 divider 
= p 
+ ((q 
- p
) >> 1);   /* divide by 2 */ 
 869                 if (character 
< divider
->_key
) 
 871                 else if (character 
> divider
->_key
) 
 874                         return (divider
->_value
); 
 881 unicode_recursive_decompose(u_int16_t character
, u_int16_t 
*convertedChars
) 
 887         const u_int16_t 
*bmpMappings
; 
 888         u_int32_t usedLength
; 
 890         value 
= getmappedvalue16( 
 891                 (const unicode_mappings16 
*)__CFUniCharDecompositionTable
, 
 892                 __UniCharDecompositionTableLength
, character
); 
 893         length 
= EXTRACT_COUNT(value
); 
 894         firstChar 
= value 
& 0x0FFF; 
 896         bmpMappings 
= (length 
== 1 ? &theChar 
: __CFUniCharMultipleDecompositionTable 
+ firstChar
); 
 899         if (value 
& RECURSIVE_DECOMPOSITION
) { 
 900             usedLength 
= unicode_recursive_decompose((u_int16_t
)*bmpMappings
, convertedChars
); 
 902             --length
;   /* Decrement for the first char */ 
 906             convertedChars 
+= usedLength
; 
 909         usedLength 
+= length
; 
 912                 *(convertedChars
++) = *(bmpMappings
++); 
 917 #define HANGUL_SBASE 0xAC00 
 918 #define HANGUL_LBASE 0x1100 
 919 #define HANGUL_VBASE 0x1161 
 920 #define HANGUL_TBASE 0x11A7 
 922 #define HANGUL_SCOUNT 11172 
 923 #define HANGUL_LCOUNT 19 
 924 #define HANGUL_VCOUNT 21 
 925 #define HANGUL_TCOUNT 28 
 926 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) 
 929  * unicode_decompose - decompose a composed Unicode char 
 931  * Composed Unicode characters are forbidden on 
 932  * HFS Plus volumes. unicode_decompose will convert a 
 933  * composed character into its correct decomposed 
 936  * Similar to CFUniCharDecomposeCharacter 
 939 unicode_decompose(u_int16_t character
, u_int16_t 
*convertedChars
) 
 941         if ((character 
>= HANGUL_SBASE
) && 
 942             (character 
<= (HANGUL_SBASE 
+ HANGUL_SCOUNT
))) { 
 945                 character 
-= HANGUL_SBASE
; 
 946                 length 
= (character 
% HANGUL_TCOUNT 
? 3 : 2); 
 948                 *(convertedChars
++) = 
 949                         character 
/ HANGUL_NCOUNT 
+ HANGUL_LBASE
; 
 950                 *(convertedChars
++) = 
 951                         (character 
% HANGUL_NCOUNT
) / HANGUL_TCOUNT 
+ HANGUL_VBASE
; 
 953                         *convertedChars 
= (character 
% HANGUL_TCOUNT
) + HANGUL_TBASE
; 
 956                 return (unicode_recursive_decompose(character
, convertedChars
)); 
 961  * unicode_combine - generate a precomposed Unicode char 
 963  * Precomposed Unicode characters are required for some volume 
 964  * formats and network protocols.  unicode_combine will combine 
 965  * a decomposed character sequence into a single precomposed 
 966  * (composite) character. 
 968  * Similar to CFUniCharPrecomposeCharacter but unicode_combine 
 969  * also handles Hangul Jamo characters. 
 972 unicode_combine(u_int16_t base
, u_int16_t combining
) 
 977         if ((combining 
>= HANGUL_VBASE
) && (combining 
< (HANGUL_TBASE 
+ HANGUL_TCOUNT
))) { 
 978                 /* 2 char Hangul sequences */ 
 979                 if ((combining 
< (HANGUL_VBASE 
+ HANGUL_VCOUNT
)) && 
 980                     (base 
>= HANGUL_LBASE 
&& base 
< (HANGUL_LBASE 
+ HANGUL_LCOUNT
))) { 
 981                     return (HANGUL_SBASE 
+ 
 982                             ((base 
- HANGUL_LBASE
)*(HANGUL_VCOUNT
*HANGUL_TCOUNT
)) + 
 983                             ((combining  
- HANGUL_VBASE
)*HANGUL_TCOUNT
)); 
 986                 /* 3 char Hangul sequences */ 
 987                 if ((combining 
> HANGUL_TBASE
) && 
 988                     (base 
>= HANGUL_SBASE 
&& base 
< (HANGUL_SBASE 
+ HANGUL_SCOUNT
))) { 
 989                         if ((base 
- HANGUL_SBASE
) % HANGUL_TCOUNT
) 
 992                                 return (base 
+ (combining 
- HANGUL_TBASE
)); 
 996         value 
= getmappedvalue32( 
 997                 (const unicode_mappings32 
*)__CFUniCharPrecompSourceTable
, 
 998                 __CFUniCharPrecompositionTableLength
, combining
); 
1001                 value 
= getmappedvalue16( 
1002                         (const unicode_mappings16 
*) 
1003                         ((const u_int32_t 
*)__CFUniCharBMPPrecompDestinationTable 
+ (value 
& 0xFFFF)), 
1004                         (value 
>> 16), base
); 
1011  * priortysort - order combining chars into canonical order 
1013  * Similar to CFUniCharPrioritySort 
/*
 * Bubble-sort 'count' characters in place by ascending combining class.
 *
 * A character whose combining class is 0 is never moved ahead of its
 * predecessor: canonical ordering only reorders marks with a non-zero
 * class, and class-0 characters act as fixed boundaries.  Marks of
 * equal class keep their relative order.
 */
static void
priortysort(u_int16_t* characters, int count)
{
	u_int32_t p1, p2;
	u_int16_t *ch1, *ch2;
	u_int16_t *end;
	int changes;

	/* Fewer than two characters: nothing to order, nothing to read. */
	if (count < 2)
		return;

	end = characters + count;
	do {
		changes = 0;
		ch1 = characters;
		ch2 = characters + 1;
		p2 = get_combining_class(*ch1);
		while (ch2 < end) {
			p1 = p2;
			p2 = get_combining_class(*ch2);
			if (p1 > p2 && p2 != 0) {
				u_int16_t tmp;

				tmp = *ch1;
				*ch1 = *ch2;
				*ch2 = tmp;
				changes = 1;

				/*
				 * *ch2 now holds the character whose class
				 * is p1; keep p2 in step so the next
				 * comparison uses the right class.
				 */
				p2 = p1;
			}
			++ch1;
			++ch2;
		}
	} while (changes);
}
1048  * Invalid NTFS filename characters are encoded using the 
1049  * SFM (Services for Macintosh) private use Unicode characters. 
1051  * These should only be used for SMB, MSDOS or NTFS. 
1053  *    Illegal NTFS Char   SFM Unicode Char 
1054  *  ---------------------------------------- 
1055  *    0x01-0x1f           0xf001-0xf01f 
1064  *    ' '                 0xf028  (Only if last char of the name) 
1065  *    '.'                 0xf029  (Only if last char of the name) 
1066  *  ---------------------------------------- 
1068  *  Reference: http://support.microsoft.com/kb/q117258/ 
1071 #define MAX_SFM2MAC           0x29 
1072 #define SFMCODE_PREFIX_MASK   0xf000  
1075  * In the Mac OS 9 days the colon was illegal in a file name. For that reason 
1076  * SFM had no conversion for the colon. There is a conversion for the 
1077  * slash. In Mac OS X the slash is illegal in a file name. So for us the colon 
1078  * is a slash and a slash is a colon. So we can just replace the slash with the 
1079  * colon in our tables and everything will just work.  
1083         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 00 - 07 */ 
1084         0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 08 - 0F */ 
1085         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,   /* 10 - 17 */ 
1086         0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,   /* 18 - 1F */ 
1087         0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,   /* 20 - 27 */ 
1088         0x20, 0x2e                                        /* 28 - 29 */ 
1093         0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,   /* 20 - 27 */ 
1094         0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,   /* 28 - 2f */ 
1095         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,   /* 30 - 37 */ 
1096         0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,   /* 38 - 3f */ 
1097         0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,   /* 40 - 47 */ 
1098         0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,   /* 48 - 4f */ 
1099         0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,   /* 50 - 57 */ 
1100         0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,   /* 58 - 5f */ 
1101         0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,   /* 60 - 67 */ 
1102         0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,   /* 68 - 6f */ 
1103         0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,   /* 70 - 77 */ 
1104         0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f    /* 78 - 7f */ 
1109  * Encode illegal NTFS filename characters into SFM Private Unicode characters 
1111  * Assumes non-zero ASCII input. 
1114 ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
) 
1116         /* The last character of filename cannot be a space or period. */ 
1120                 else if (ucs_ch 
== 0x2e) 
1123         /* 0x01 - 0x1f is simple transformation. */ 
1124         if (ucs_ch 
<= 0x1f) { 
1125                 return (ucs_ch 
| 0xf000); 
1126         } else /* 0x20 - 0x7f */ { 
1129                 lsb 
= mac2sfm
[ucs_ch 
- 0x0020]; 
1131                         return(0xf000 | lsb
);  
1137  * Decode any SFM Private Unicode characters 
1140 sfm_to_ucs(u_int16_t ucs_ch
) 
1142         if (((ucs_ch 
& 0xffC0) == SFMCODE_PREFIX_MASK
) &&  
1143             ((ucs_ch 
& 0x003f) <= MAX_SFM2MAC
)) { 
1144                 ucs_ch 
= sfm2mac
[ucs_ch 
& 0x003f];