2  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 
   6  * This file contains Original Code and/or Modifications of Original Code 
   7  * as defined in and that are subject to the Apple Public Source License 
   8  * Version 2.0 (the 'License'). You may not use this file except in 
   9  * compliance with the License. The rights granted to you under the License 
  10  * may not be used to create, or enable the creation or redistribution of, 
  11  * unlawful or unlicensed copies of an Apple operating system, or to 
  12  * circumvent, violate, or enable the circumvention or violation of, any 
  13  * terms of an Apple operating system software license agreement. 
  15  * Please obtain a copy of the License at 
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file. 
  18  * The Original Code and all software distributed under the License are 
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
  23  * Please see the License for the specific language governing rights and 
  24  * limitations under the License. 
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 
  30         Includes Unicode 3.2 decomposition code derived from Core Foundation 
  33 #include <sys/param.h> 
  34 #include <sys/utfconv.h> 
  35 #include <sys/errno.h> 
  36 #include <sys/malloc.h> 
  37 #include <libkern/OSByteOrder.h> 
  39 #if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST) 
  40 #include <kern/assert.h> 
  46  * UTF-8 (Unicode Transformation Format) 
  48  * UTF-8 is the Unicode Transformation Format that serializes a Unicode 
  49  * character as a sequence of one to four bytes. Only the shortest form 
  50  * required to represent the significant Unicode bits is legal. 
  52  * UTF-8 Multibyte Codes 
  54  * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary) 
  55  * ----------------------------------------------------------------------------- 
  56  *   1       7       0x0000        0x007F    0xxxxxxx 
  57  *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx 
  58  *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx 
  59  *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
  60  * ----------------------------------------------------------------------------- 
  64 #define UNICODE_TO_UTF8_LEN(c)  \ 
  65         ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3))) 
  67 #define UCS_ALT_NULL    0x2400 
  69 /* Surrogate Pair Constants */ 
  70 #define SP_HALF_SHIFT   10 
  71 #define SP_HALF_BASE    0x0010000u 
  72 #define SP_HALF_MASK    0x3FFu 
  74 #define SP_HIGH_FIRST   0xD800u 
  75 #define SP_HIGH_LAST    0xDBFFu 
  76 #define SP_LOW_FIRST    0xDC00u 
  77 #define SP_LOW_LAST             0xDFFFu 
  80 #include "vfs_utfconvdata.h" 
  84  * Test for a combining character. 
  86  * Similar to __CFUniCharIsNonBaseCharacter except that 
  87  * unicode_combinable also includes Hangul Jamo characters. 
  90 unicode_combinable(u_int16_t character
) 
  92         const u_int8_t 
*bitmap 
= __CFUniCharCombiningBitmap
; 
  95         if (character 
< 0x0300) 
  98         value 
= bitmap
[(character 
>> 8) & 0xFF]; 
 103                 bitmap 
= bitmap 
+ ((value 
- 1) * 32) + 256; 
 104                 return (bitmap
[(character 
& 0xFF) / 8] & (1 << (character 
% 8)) ? 1 : 0); 
 110  * Test for a precomposed character. 
 112  * Similar to __CFUniCharIsDecomposableCharacter. 
 115 unicode_decomposeable(u_int16_t character
) { 
 116         const u_int8_t 
*bitmap 
= __CFUniCharDecomposableBitmap
; 
 119         if (character 
< 0x00C0) 
 122         value 
= bitmap
[(character 
>> 8) & 0xFF]; 
 127                 bitmap 
= bitmap 
+ ((value 
- 1) * 32) + 256; 
 128                 return (bitmap
[(character 
& 0xFF) / 8] & (1 << (character 
% 8)) ? 1 : 0); 
 135  * Get the combining class. 
 137  * Similar to CFUniCharGetCombiningPropertyForCharacter. 
 139 static inline u_int8_t
 
 140 get_combining_class(u_int16_t character
) { 
 141         const u_int8_t 
*bitmap 
= __CFUniCharCombiningPropertyBitmap
; 
 143         u_int8_t value 
= bitmap
[(character 
>> 8)]; 
 146                 bitmap 
= bitmap 
+ (value 
* 256); 
 147                 return bitmap
[character 
% 256]; 
 153 static int unicode_decompose(u_int16_t character
, u_int16_t 
*convertedChars
); 
 155 static u_int16_t 
unicode_combine(u_int16_t base
, u_int16_t combining
); 
 157 static void prioritysort(u_int16_t
* characters
, int count
); 
 159 static u_int16_t  
ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
); 
 161 static u_int16_t  
sfm_to_ucs(u_int16_t ucs_ch
); 
 164 char utf_extrabytes
[32] = { 
 165          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 
 166         -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1 
 169 const char hexdigits
[16] = { 
 170          '0',  '1',  '2',  '3',  '4',  '5',  '6', '7', 
 171          '8',  '9',  'A',  'B',  'C',  'D',  'E', 'F' 
 175  * utf8_encodelen - Calculate the UTF-8 encoding length 
 177  * This function takes a Unicode input string, ucsp, of ucslen bytes 
 178  * and calculates the size of the UTF-8 output in bytes (not including 
 179  * a NULL termination byte). The string must reside in kernel memory. 
 181  * If '/' chars are possible in the Unicode input then an alternate 
 182  * (replacement) char should be provided in altslash. 
 185  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime 
 187  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian 
 189  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian 
 191  *    UTF_DECOMPOSED:  generate fully decomposed output 
 193  *    UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it 
 199 utf8_encodelen(const u_int16_t 
* ucsp
, size_t ucslen
, u_int16_t altslash
, int flags
) 
 202         u_int16_t 
* chp 
= NULL
; 
 203         u_int16_t sequence
[8]; 
 206         int swapbytes 
= (flags 
& UTF_REVERSE_ENDIAN
); 
 207         int decompose 
= (flags 
& UTF_DECOMPOSED
); 
 210         charcnt 
= ucslen 
/ 2; 
 213         while (charcnt
-- > 0) { 
 220                                 ucs_ch 
= OSSwapInt16(ucs_ch
); 
 223                                 ucs_ch 
= altslash 
? altslash 
: '_'; 
 224                         } else if (ucs_ch 
== '\0') { 
 225                                 ucs_ch 
= UCS_ALT_NULL
; 
 226                         } else if (decompose 
&& unicode_decomposeable(ucs_ch
)) { 
 227                                 extra 
= unicode_decompose(ucs_ch
, sequence
) - 1; 
 229                                 ucs_ch 
= sequence
[0]; 
 233                 len 
+= UNICODE_TO_UTF8_LEN(ucs_ch
); 
 241  * utf8_encodestr - Encodes a Unicode string to UTF-8 
 244  *    The resulting UTF-8 string is NULL terminated. 
 246  *    If '/' chars are allowed on disk then an alternate 
 247  *    (replacement) char must be provided in altslash. 
 250  *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime 
 252  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian 
 254  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian 
 256  *    UTF_DECOMPOSED:  generate fully decomposed output 
 258  *    UTF_NO_NULL_TERM:  don't add NULL termination to UTF-8 output 
 261  *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded 
 263  *    EINVAL: Illegal char found; char was replaced by an '_'. 
 266 utf8_encodestr(const u_int16_t 
* ucsp
, size_t ucslen
, u_int8_t 
* utf8p
, 
 267                size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
) 
 272         u_int16_t 
* chp 
= NULL
; 
 273         u_int16_t sequence
[8]; 
 276         int swapbytes 
= (flags 
& UTF_REVERSE_ENDIAN
); 
 277         int nullterm  
= ((flags 
& UTF_NO_NULL_TERM
) == 0); 
 278         int decompose 
= (flags 
& UTF_DECOMPOSED
); 
 279         int sfmconv 
= (flags 
& UTF_SFM_CONVERSIONS
); 
 283         bufend 
= bufstart 
+ buflen
; 
 286         charcnt 
= ucslen 
/ 2; 
 288         while (charcnt
-- > 0) { 
 293                         ucs_ch 
= swapbytes 
? OSSwapInt16(*ucsp
++) : *ucsp
++; 
 295                         if (decompose 
&& unicode_decomposeable(ucs_ch
)) { 
 296                                 extra 
= unicode_decompose(ucs_ch
, sequence
) - 1; 
 298                                 ucs_ch 
= sequence
[0]; 
 303                 /* Slash and NULL are not permitted */ 
 311                 } else if (ucs_ch 
== '\0') { 
 312                         ucs_ch 
= UCS_ALT_NULL
; 
 315                 if (ucs_ch 
< 0x0080) { 
 316                         if (utf8p 
>= bufend
) { 
 317                                 result 
= ENAMETOOLONG
; 
 322                 } else if (ucs_ch 
< 0x800) { 
 323                         if ((utf8p 
+ 1) >= bufend
) { 
 324                                 result 
= ENAMETOOLONG
; 
 327                         *utf8p
++ = 0xc0 | (ucs_ch 
>> 6); 
 328                         *utf8p
++ = 0x80 | (0x3f & ucs_ch
); 
 331                         /* These chars are never valid Unicode. */ 
 332                         if (ucs_ch 
== 0xFFFE || ucs_ch 
== 0xFFFF) { 
 337                         /* Combine valid surrogate pairs */ 
 338                         if (ucs_ch 
>= SP_HIGH_FIRST 
&& ucs_ch 
<= SP_HIGH_LAST
 
 343                                 ch2 
= swapbytes 
? OSSwapInt16(*ucsp
) : *ucsp
; 
 344                                 if (ch2 
>= SP_LOW_FIRST 
&& ch2 
<= SP_LOW_LAST
) { 
 345                                         pair 
= ((ucs_ch 
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
) 
 346                                                 + (ch2 
- SP_LOW_FIRST
) + SP_HALF_BASE
; 
 347                                         if ((utf8p 
+ 3) >= bufend
) { 
 348                                                 result 
= ENAMETOOLONG
; 
 353                                         *utf8p
++ = 0xf0 | (pair 
>> 18); 
 354                                         *utf8p
++ = 0x80 | (0x3f & (pair 
>> 12)); 
 355                                         *utf8p
++ = 0x80 | (0x3f & (pair 
>> 6)); 
 356                                         *utf8p
++ = 0x80 | (0x3f & pair
); 
 359                         } else if (sfmconv
) { 
 360                                 ucs_ch 
= sfm_to_ucs(ucs_ch
); 
 361                                 if (ucs_ch 
< 0x0080) { 
 362                                         if (utf8p 
>= bufend
) { 
 363                                                 result 
= ENAMETOOLONG
; 
 370                         if ((utf8p 
+ 2) >= bufend
) { 
 371                                 result 
= ENAMETOOLONG
; 
 374                         *utf8p
++ = 0xe0 | (ucs_ch 
>> 12); 
 375                         *utf8p
++ = 0x80 | (0x3f & (ucs_ch 
>> 6)); 
 376                         *utf8p
++ = 0x80 | (0x3f & ucs_ch
); 
 380         *utf8len 
= utf8p 
- bufstart
; 
 387 // Pushes a character taking account of combining character sequences 
 388 static void push(uint16_t ucs_ch
, int *combcharcnt
, uint16_t **ucsp
) 
 391          * Make multiple combining character sequences canonical 
 393         if (unicode_combinable(ucs_ch
)) { 
 394                 ++*combcharcnt
;         /* start tracking a run */ 
 395         } else if (*combcharcnt
) { 
 396                 if (*combcharcnt 
> 1) { 
 397                         prioritysort(*ucsp 
- *combcharcnt
, *combcharcnt
); 
 399                 *combcharcnt 
= 0;       /* start over */ 
 406  * utf8_decodestr - Decodes a UTF-8 string back to Unicode 
 409  *    The input UTF-8 string does not need to be null terminated 
 412  *    If '/' chars are allowed on disk then an alternate 
 413  *    (replacement) char must be provided in altslash. 
 416  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime 
 418  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian 
 420  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian 
 422  *    UTF_DECOMPOSED:  generate fully decomposed output (NFD) 
 424  *    UTF_PRECOMPOSED:  generate precomposed output (NFC) 
 426  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input 
 429  *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded. 
 431  *    EINVAL: Illegal UTF-8 sequence found. 
 434 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
, 
 435                size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
) 
 443         int decompose
, precompose
, escaping
; 
 447         decompose  
= (flags 
& UTF_DECOMPOSED
); 
 448         precompose 
= (flags 
& UTF_PRECOMPOSED
); 
 449         escaping   
= (flags 
& UTF_ESCAPE_ILLEGAL
); 
 450         sfmconv    
= (flags 
& UTF_SFM_CONVERSIONS
); 
 453         bufend 
= (u_int16_t 
*)((u_int8_t 
*)ucsp 
+ buflen
); 
 455         while (utf8len
-- > 0 && (byte 
= *utf8p
++) != '\0') { 
 459                 /* check for ascii */ 
 461                         ucs_ch 
= sfmconv 
? ucs_to_sfm(byte
, utf8len 
== 0) : byte
; 
 465                         extrabytes 
= utf_extrabytes
[byte 
>> 3]; 
 466                         if ((extrabytes 
< 0) || ((int)utf8len 
< extrabytes
)) { 
 469                         utf8len 
-= extrabytes
; 
 471                         switch (extrabytes
) { 
 473                                 ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 474                                 byte 
= *utf8p
++;       /* 2nd byte */ 
 475                                 if ((byte 
>> 6) != 2) 
 484                                 ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 485                                 byte 
= *utf8p
++;       /* 2nd byte */ 
 486                                 if ((byte 
>> 6) != 2) 
 488                                 ch 
+= byte
; ch 
<<= 6; 
 489                                 byte 
= *utf8p
++;       /* 3rd byte */ 
 490                                 if ((byte 
>> 6) != 2) 
 499                                         if (ch 
== 0xFFFE || ch 
== 0xFFFF) 
 505                                 ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 506                                 byte 
= *utf8p
++;       /* 2nd byte */ 
 507                                 if ((byte 
>> 6) != 2) 
 509                                 ch 
+= byte
; ch 
<<= 6; 
 510                                 byte 
= *utf8p
++;       /* 3rd byte */ 
 511                                 if ((byte 
>> 6) != 2) 
 513                                 ch 
+= byte
; ch 
<<= 6; 
 514                                 byte 
= *utf8p
++;       /* 4th byte */ 
 515                                 if ((byte 
>> 6) != 2) 
 518                                 ch 
-= 0x03C82080UL 
+ SP_HALF_BASE
; 
 519                                 ucs_ch 
= (ch 
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
; 
 520                                 if (ucs_ch 
< SP_HIGH_FIRST 
|| ucs_ch 
> SP_HIGH_LAST
) 
 522                                 push(ucs_ch
, &combcharcnt
, &ucsp
); 
 525                                 ucs_ch 
= (ch 
& SP_HALF_MASK
) + SP_LOW_FIRST
; 
 526                                 if (ucs_ch 
< SP_LOW_FIRST 
|| ucs_ch 
> SP_LOW_LAST
) { 
 537                                 if (unicode_decomposeable(ucs_ch
)) { 
 538                                         u_int16_t sequence
[8]; 
 541                                         count 
= unicode_decompose(ucs_ch
, sequence
); 
 543                                         for (i 
= 0; i 
< count
; ++i
) { 
 547                                                 push(sequence
[i
], &combcharcnt
, &ucsp
); 
 552                         } else if (precompose 
&& (ucsp 
!= bufstart
)) { 
 553                                 u_int16_t composite
, base
; 
 555                                 if (unicode_combinable(ucs_ch
)) { 
 557                                         composite 
= unicode_combine(base
, ucs_ch
); 
 564                         if (ucs_ch 
== UCS_ALT_NULL
) 
 567                 if (ucs_ch 
== altslash
) 
 570                 push(ucs_ch
, &combcharcnt
, &ucsp
); 
 574                  * Escape illegal UTF-8 into something legal. 
 590                         utf8len 
+= extrabytes
; 
 593                 if ((ucsp 
+ 2) >= bufend
) 
 596                 /* Make a previous combining sequence canonical. */ 
 597                 if (combcharcnt 
> 1) { 
 598                         prioritysort(ucsp 
- combcharcnt
, combcharcnt
); 
 604                 ucs_ch 
=  hexdigits
[byte 
>> 4]; 
 606                 ucs_ch 
=  hexdigits
[byte 
& 0x0F]; 
 610          * Make a previous combining sequence canonical 
 612         if (combcharcnt 
> 1) { 
 613                 prioritysort(ucsp 
- combcharcnt
, combcharcnt
); 
 616         if (flags 
& UTF_REVERSE_ENDIAN
) { 
 617                 uint16_t *p 
= bufstart
; 
 619                         *p 
= OSSwapInt16(*p
); 
 625         *ucslen 
= (u_int8_t
*)ucsp 
- (u_int8_t
*)bufstart
; 
 630         result 
= ENAMETOOLONG
; 
 636  * utf8_validatestr - Check for a valid UTF-8 string. 
 639 utf8_validatestr(const u_int8_t
* utf8p
, size_t utf8len
) 
 646         while (utf8len
-- > 0 && (byte 
= *utf8p
++) != '\0') { 
 648                         continue;  /* plain ascii */ 
 650                 extrabytes 
= utf_extrabytes
[byte 
>> 3]; 
 652                 if (utf8len 
< extrabytes
) 
 654                 utf8len 
-= extrabytes
; 
 656                 switch (extrabytes
) { 
 658                         ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 659                         byte 
= *utf8p
++;       /* 2nd byte */ 
 660                         if ((byte 
>> 6) != 2) 
 668                         ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 669                         byte 
= *utf8p
++;       /* 2nd byte */ 
 670                         if ((byte 
>> 6) != 2) 
 672                         ch 
+= byte
; ch 
<<= 6; 
 673                         byte 
= *utf8p
++;       /* 3rd byte */ 
 674                         if ((byte 
>> 6) != 2) 
 683                                 if (ch 
== 0xFFFE || ch 
== 0xFFFF) 
 688                         ch 
= byte
; ch 
<<= 6;   /* 1st byte */ 
 689                         byte 
= *utf8p
++;       /* 2nd byte */ 
 690                         if ((byte 
>> 6) != 2) 
 692                         ch 
+= byte
; ch 
<<= 6; 
 693                         byte 
= *utf8p
++;       /* 3rd byte */ 
 694                         if ((byte 
>> 6) != 2) 
 696                         ch 
+= byte
; ch 
<<= 6; 
 697                         byte 
= *utf8p
++;       /* 4th byte */ 
 698                         if ((byte 
>> 6) != 2) 
 701                         ch 
-= 0x03C82080UL 
+ SP_HALF_BASE
; 
 702                         ucs_ch 
= (ch 
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
; 
 703                         if (ucs_ch 
< SP_HIGH_FIRST 
|| ucs_ch 
> SP_HIGH_LAST
) 
 705                         ucs_ch 
= (ch 
& SP_HALF_MASK
) + SP_LOW_FIRST
; 
 706                         if (ucs_ch 
< SP_LOW_FIRST 
|| ucs_ch 
> SP_LOW_LAST
) 
 720  * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD) 
 722  * This function takes an UTF-8 input string, instr, of inlen bytes 
 723  * and produces normalized UTF-8 output into a buffer of buflen bytes 
 724  * pointed to by outstr. The size of the output in bytes (not including 
 725  * a NULL termination byte) is returned in outlen. In-place conversions 
 726  * are not supported (i.e. instr != outstr). 
 729  *    UTF_DECOMPOSED:  output string will be fully decomposed (NFD) 
 731  *    UTF_PRECOMPOSED:  output string will be precomposed (NFC) 
 733  *    UTF_NO_NULL_TERM:  do not add null termination to output string 
 735  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input 
 738  *    ENAMETOOLONG:  output did not fit or input exceeded MAXPATHLEN bytes 
 740  *    EINVAL:  illegal UTF-8 sequence encountered or invalid flags 
 743 utf8_normalizestr(const u_int8_t
* instr
, size_t inlen
, u_int8_t
* outstr
, 
 744                   size_t *outlen
, size_t buflen
, int flags
) 
 746         u_int16_t unicodebuf
[32]; 
 747         u_int16_t
* unistr 
= NULL
; 
 748         size_t unicode_bytes
; 
 751         u_int8_t 
*outbufstart
, *outbufend
; 
 752         const u_int8_t 
*inbufstart
; 
 754         int decompose
, precompose
; 
 757         if (flags 
& ~(UTF_DECOMPOSED 
| UTF_PRECOMPOSED 
| UTF_NO_NULL_TERM 
| UTF_ESCAPE_ILLEGAL
)) { 
 760         decompose 
= (flags 
& UTF_DECOMPOSED
); 
 761         precompose 
= (flags 
& UTF_PRECOMPOSED
); 
 762         if ((decompose 
&& precompose
) || (!decompose 
&& !precompose
)) { 
 765         outbufstart 
= outstr
; 
 766         outbufend 
= outbufstart 
+ buflen
; 
 770         while (inlen
-- > 0 && (byte 
= *instr
++) != '\0') { 
 771                 if (outstr 
>= outbufend
) { 
 772                         result 
= ENAMETOOLONG
; 
 778                 /* ASCII is already normalized. */ 
 782         *outlen 
= outstr 
- outbufstart
; 
 783         if (((flags 
& UTF_NO_NULL_TERM
) == 0)) { 
 784                 if (outstr 
< outbufend
) 
 787                         result 
= ENAMETOOLONG
; 
 793          * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr 
 794          * functions to perform the normalization.  Since this will 
 795          * presumably be used to normalize filenames in the back-end 
 796          * (on disk or over-the-wire), it should be fast enough. 
 800         /* Make sure the input size is reasonable. */ 
 801         if (inbuflen 
> MAXPATHLEN
) { 
 802                 result 
= ENAMETOOLONG
; 
 806          * Compute worst case Unicode buffer size. 
 808          * For pre-composed output, every UTF-8 input byte will be at 
 809          * most 2 Unicode bytes.  For decomposed output, 2 UTF-8 bytes 
 810          * (smallest composite char sequence) may yield 6 Unicode bytes 
 811          * (1 base char + 2 combining chars). 
 813         unicode_bytes 
= precompose 
? (inbuflen 
* 2) : (inbuflen 
* 3); 
 815         if (unicode_bytes 
<= sizeof(unicodebuf
)) 
 816                 unistr 
= &unicodebuf
[0]; 
 818                 MALLOC(unistr
, uint16_t *, unicode_bytes
, M_TEMP
, M_WAITOK
); 
 820         /* Normalize the string. */ 
 821         result 
= utf8_decodestr(inbufstart
, inbuflen
, unistr
, &unicode_bytes
, 
 822                                 unicode_bytes
, 0, flags 
& ~UTF_NO_NULL_TERM
); 
 824                 /* Put results back into UTF-8. */ 
 825                 result 
= utf8_encodestr(unistr
, unicode_bytes
, outbufstart
, 
 826                                         &uft8_bytes
, buflen
, 0, UTF_NO_NULL_TERM
); 
 827                 outstr 
= outbufstart 
+ uft8_bytes
; 
 829         if (unistr 
&& unistr 
!= &unicodebuf
[0]) { 
 830                 FREE(unistr
, M_TEMP
); 
 837   * Unicode 3.2 decomposition code (derived from Core Foundation) 
 843 } unicode_mappings32
; 
 845 static inline u_int32_t
 
 846 getmappedvalue32(const unicode_mappings32 
*theTable
, u_int32_t numElem
, 
 849         const unicode_mappings32 
*p
, *q
, *divider
; 
 851         if ((character 
< theTable
[0]._key
) || (character 
> theTable
[numElem
-1]._key
)) 
 857                 divider 
= p 
+ ((q 
- p
) >> 1);   /* divide by 2 */ 
 858                 if (character 
< divider
->_key
) { q 
= divider 
- 1; } 
 859                 else if (character 
> divider
->_key
) { p 
= divider 
+ 1; } 
 860                 else { return (divider
->_value
); } 
 865 #define RECURSIVE_DECOMPOSITION (1 << 15) 
 866 #define EXTRACT_COUNT(value)    (((value) >> 12) & 0x0007) 
 871 } unicode_mappings16
; 
 873 static inline u_int16_t
 
 874 getmappedvalue16(const unicode_mappings16 
*theTable
, u_int32_t numElem
, 
 877         const unicode_mappings16 
*p
, *q
, *divider
; 
 879         if ((character 
< theTable
[0]._key
) || (character 
> theTable
[numElem
-1]._key
)) 
 885                 divider 
= p 
+ ((q 
- p
) >> 1);   /* divide by 2 */ 
 886                 if (character 
< divider
->_key
) 
 888                 else if (character 
> divider
->_key
) 
 891                         return (divider
->_value
); 
 898 unicode_recursive_decompose(u_int16_t character
, u_int16_t 
*convertedChars
) 
 904         const u_int16_t 
*bmpMappings
; 
 905         u_int32_t usedLength
; 
 907         value 
= getmappedvalue16( 
 908                 (const unicode_mappings16 
*)__CFUniCharDecompositionTable
, 
 909                 __UniCharDecompositionTableLength
, character
); 
 910         length 
= EXTRACT_COUNT(value
); 
 911         firstChar 
= value 
& 0x0FFF; 
 913         bmpMappings 
= (length 
== 1 ? &theChar 
: __CFUniCharMultipleDecompositionTable 
+ firstChar
); 
 916         if (value 
& RECURSIVE_DECOMPOSITION
) { 
 917             usedLength 
= unicode_recursive_decompose((u_int16_t
)*bmpMappings
, convertedChars
); 
 919             --length
;   /* Decrement for the first char */ 
 923             convertedChars 
+= usedLength
; 
 926         usedLength 
+= length
; 
 929                 *(convertedChars
++) = *(bmpMappings
++); 
 934 #define HANGUL_SBASE 0xAC00 
 935 #define HANGUL_LBASE 0x1100 
 936 #define HANGUL_VBASE 0x1161 
 937 #define HANGUL_TBASE 0x11A7 
 939 #define HANGUL_SCOUNT 11172 
 940 #define HANGUL_LCOUNT 19 
 941 #define HANGUL_VCOUNT 21 
 942 #define HANGUL_TCOUNT 28 
 943 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) 
 946  * unicode_decompose - decompose a composed Unicode char 
 948  * Composed Unicode characters are forbidden on 
 949  * HFS Plus volumes. unicode_decompose will convert a 
 950  * composed character into its correct decomposed sequence. 
 953  * Similar to CFUniCharDecomposeCharacter 
 956 unicode_decompose(u_int16_t character
, u_int16_t 
*convertedChars
) 
 958         if ((character 
>= HANGUL_SBASE
) && 
 959             (character 
<= (HANGUL_SBASE 
+ HANGUL_SCOUNT
))) { 
 962                 character 
-= HANGUL_SBASE
; 
 963                 length 
= (character 
% HANGUL_TCOUNT 
? 3 : 2); 
 965                 *(convertedChars
++) = 
 966                         character 
/ HANGUL_NCOUNT 
+ HANGUL_LBASE
; 
 967                 *(convertedChars
++) = 
 968                         (character 
% HANGUL_NCOUNT
) / HANGUL_TCOUNT 
+ HANGUL_VBASE
; 
 970                         *convertedChars 
= (character 
% HANGUL_TCOUNT
) + HANGUL_TBASE
; 
 973                 return (unicode_recursive_decompose(character
, convertedChars
)); 
 978  * unicode_combine - generate a precomposed Unicode char 
 980  * Precomposed Unicode characters are required for some volume 
 981  * formats and network protocols.  unicode_combine will combine 
 982  * a decomposed character sequence into a single precomposed 
 983  * (composite) character. 
 985  * Similar to CFUniCharPrecomposeCharacter but unicode_combine 
 986  * also handles Hangul Jamo characters. 
 989 unicode_combine(u_int16_t base
, u_int16_t combining
) 
 994         if ((combining 
>= HANGUL_VBASE
) && (combining 
< (HANGUL_TBASE 
+ HANGUL_TCOUNT
))) { 
 995                 /* 2 char Hangul sequences */ 
 996                 if ((combining 
< (HANGUL_VBASE 
+ HANGUL_VCOUNT
)) && 
 997                     (base 
>= HANGUL_LBASE 
&& base 
< (HANGUL_LBASE 
+ HANGUL_LCOUNT
))) { 
 998                     return (HANGUL_SBASE 
+ 
 999                             ((base 
- HANGUL_LBASE
)*(HANGUL_VCOUNT
*HANGUL_TCOUNT
)) + 
1000                             ((combining  
- HANGUL_VBASE
)*HANGUL_TCOUNT
)); 
1003                 /* 3 char Hangul sequences */ 
1004                 if ((combining 
> HANGUL_TBASE
) && 
1005                     (base 
>= HANGUL_SBASE 
&& base 
< (HANGUL_SBASE 
+ HANGUL_SCOUNT
))) { 
1006                         if ((base 
- HANGUL_SBASE
) % HANGUL_TCOUNT
) 
1009                                 return (base 
+ (combining 
- HANGUL_TBASE
)); 
1013         value 
= getmappedvalue32( 
1014                 (const unicode_mappings32 
*)__CFUniCharPrecompSourceTable
, 
1015                 __CFUniCharPrecompositionTableLength
, combining
); 
1018                 value 
= getmappedvalue16( 
1019                         (const unicode_mappings16 
*) 
1020                         ((const u_int32_t 
*)__CFUniCharBMPPrecompDestinationTable 
+ (value 
& 0xFFFF)), 
1021                         (value 
>> 16), base
); 
1028  * prioritysort - order combining chars into canonical order 
1030  * Similar to CFUniCharPrioritySort 
1033 prioritysort(u_int16_t
* characters
, int count
) 
1036         u_int16_t 
*ch1
, *ch2
; 
1040         end 
= characters 
+ count
; 
1044                 ch2 
= characters 
+ 1; 
1045                 p2 
= get_combining_class(*ch1
); 
1048                         p2 
= get_combining_class(*ch2
); 
1049                         if (p1 
> p2 
&& p2 
!= 0) { 
1058                                  * Make sure that p2 contains the combining class for the 
1059                                  * character now stored at *ch2.  This isn't required for 
1060                                  * correctness, but it will be more efficient if a character 
1061                                  * with a large combining class has to "bubble past" several 
1062                                  * characters with lower combining classes. 
1074  * Invalid NTFS filename characters are encoded using the 
1075  * SFM (Services for Macintosh) private use Unicode characters. 
1077  * These should only be used for SMB, MSDOS or NTFS. 
1079  *    Illegal NTFS Char   SFM Unicode Char 
1080  *  ---------------------------------------- 
1081  *    0x01-0x1f           0xf001-0xf01f 
1090  *    ' '                 0xf028  (Only if last char of the name) 
1091  *    '.'                 0xf029  (Only if last char of the name) 
1092  *  ---------------------------------------- 
1094  *  Reference: http://support.microsoft.com/kb/q117258/ 
1097 #define MAX_SFM2MAC           0x29 
1098 #define SFMCODE_PREFIX_MASK   0xf000  
1101  * In the Mac OS 9 days the colon was illegal in a file name. For that reason 
1102  * SFM had no conversion for the colon. There is a conversion for the 
1103  * slash. In Mac OS X the slash is illegal in a file name. So for us the colon 
1104  * is a slash and a slash is a colon. So we can just replace the slash with the 
1105  * colon in our tables and everything will just work.  
1109         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 00 - 07 */ 
1110         0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 08 - 0F */ 
1111         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,   /* 10 - 17 */ 
1112         0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,   /* 18 - 1F */ 
1113         0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,   /* 20 - 27 */ 
1114         0x20, 0x2e                                        /* 28 - 29 */ 
1116 #define SFM2MAC_LEN     ((sizeof(sfm2mac))/sizeof(sfm2mac[0])) 
1120         0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,   /* 20 - 27 */ 
1121         0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,   /* 28 - 2f */ 
1122         0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,   /* 30 - 37 */ 
1123         0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,   /* 38 - 3f */ 
1124         0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,   /* 40 - 47 */ 
1125         0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,   /* 48 - 4f */ 
1126         0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,   /* 50 - 57 */ 
1127         0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,   /* 58 - 5f */ 
1128         0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,   /* 60 - 67 */ 
1129         0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,   /* 68 - 6f */ 
1130         0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,   /* 70 - 77 */ 
1131         0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f    /* 78 - 7f */ 
1133 #define MAC2SFM_LEN     ((sizeof(mac2sfm))/sizeof(mac2sfm[0])) 
1137  * Encode illegal NTFS filename characters into SFM Private Unicode characters 
1139  * Assumes non-zero ASCII input. 
1142 ucs_to_sfm(u_int16_t ucs_ch
, int lastchar
) 
1144         /* The last character of filename cannot be a space or period. */ 
1148                 else if (ucs_ch 
== 0x2e) 
1151         /* 0x01 - 0x1f is simple transformation. */ 
1152         if (ucs_ch 
<= 0x1f) { 
1153                 return (ucs_ch 
| 0xf000); 
1154         } else /* 0x20 - 0x7f */ { 
1157                 assert((ucs_ch 
- 0x0020) < MAC2SFM_LEN
); 
1158                 lsb 
= mac2sfm
[ucs_ch 
- 0x0020]; 
1160                         return(0xf000 | lsb
);  
1166  * Decode any SFM Private Unicode characters 
1169 sfm_to_ucs(u_int16_t ucs_ch
) 
1171         if (((ucs_ch 
& 0xffC0) == SFMCODE_PREFIX_MASK
) &&  
1172             ((ucs_ch 
& 0x003f) <= MAX_SFM2MAC
)) { 
1173                 assert((ucs_ch 
& 0x003f) < SFM2MAC_LEN
); 
1174                 ucs_ch 
= sfm2mac
[ucs_ch 
& 0x003f];