bsd/vfs/vfs_utfconv.c

   1 /*
   2  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29  /*
  30         Includes Unicode 3.2 decomposition code derived from Core Foundation
  31  */
  32
  33 #include <sys/param.h>
  34 #include <sys/utfconv.h>
  35 #include <sys/errno.h>
  36 #include <sys/malloc.h>
  37 #include <libkern/OSByteOrder.h>
  38
  39 /*
  40  * UTF-8 (Unicode Transformation Format)
  41  *
  42  * UTF-8 is the Unicode Transformation Format that serializes a Unicode
  43  * character as a sequence of one to four bytes. Only the shortest form
  44  * required to represent the significant Unicode bits is legal.
  45  *
  46  * UTF-8 Multibyte Codes
  47  *
  48  * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary)
  49  * -----------------------------------------------------------------------------
  50  *   1       7       0x0000        0x007F    0xxxxxxx
  51  *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
  52  *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
  53  *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  54  * -----------------------------------------------------------------------------
  55  */
  56
  57
  58 #define UNICODE_TO_UTF8_LEN(c)  \
  59         ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
  60
  61 #define UCS_ALT_NULL    0x2400
  62
  63 /* Surrogate Pair Constants */
  64 #define SP_HALF_SHIFT   10
  65 #define SP_HALF_BASE    0x0010000UL
  66 #define SP_HALF_MASK    0x3FFUL
  67
  68 #define SP_HIGH_FIRST   0xD800UL
  69 #define SP_HIGH_LAST    0xDBFFUL
  70 #define SP_LOW_FIRST    0xDC00UL
  71 #define SP_LOW_LAST     0xDFFFUL
  72
  73
  74 #include "vfs_utfconvdata.h"
  75
  76
  77 /*
  78  * Test for a combining character.
  79  *
  80  * Similar to __CFUniCharIsNonBaseCharacter except that
  81  * unicode_combinable also includes Hangul Jamo characters.
  82  */
  83 int
  84 unicode_combinable(u_int16_t character)
  85 {
  86         const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
  87         u_int8_t value;
  88
  89         if (character < 0x0300)
  90                 return (0);
  91
  92         value = bitmap[(character >> 8) & 0xFF];
  93
  94         if (value == 0xFF) {
  95                 return (1);
  96         } else if (value) {
  97                 bitmap = bitmap + ((value - 1) * 32) + 256;
  98                 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
  99         }
 100         return (0);
 101 }
 102
 103 /*
 104  * Test for a precomposed character.
 105  *
 106  * Similar to __CFUniCharIsDecomposableCharacter.
 107  */
 108 int
 109 unicode_decomposeable(u_int16_t character) {
 110         const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
 111         u_int8_t value;
 112
 113         if (character < 0x00C0)
 114                 return (0);
 115
 116         value = bitmap[(character >> 8) & 0xFF];
 117
 118         if (value == 0xFF) {
 119                 return (1);
 120         } else if (value) {
 121                 bitmap = bitmap + ((value - 1) * 32) + 256;
 122                 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
 123         }
 124         return (0);
 125 }
 126
 127
 128 /*
 129  * Get the combing class.
 130  *
 131  * Similar to CFUniCharGetCombiningPropertyForCharacter.
 132  */
 133 static inline u_int8_t
 134 get_combining_class(u_int16_t character) {
 135         const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
 136
 137         u_int8_t value = bitmap[(character >> 8)];
 138
 139         if (value) {
 140                 bitmap = bitmap + (value * 256);
 141                 return bitmap[character % 256];
 142         }
 143         return (0);
 144 }
 145
 146
 147 static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
 148
 149 static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
 150
 151 static void priortysort(u_int16_t* characters, int count);
 152
 153 static u_int16_t  ucs_to_sfm(u_int16_t ucs_ch, int lastchar);
 154
 155 static u_int16_t  sfm_to_ucs(u_int16_t ucs_ch);
 156
 157
 158 char utf_extrabytes[32] = {
 159          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 160         -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
 161 };
 162
 163 const char hexdigits[16] = {
 164          '0',  '1',  '2',  '3',  '4',  '5',  '6', '7',
 165          '8',  '9',  'A',  'B',  'C',  'D',  'E', 'F'
 166 };
 167
 168 /*
 169  * utf8_encodelen - Calculate the UTF-8 encoding length
 170  *
 171  * This function takes a Unicode input string, ucsp, of ucslen bytes
 172  * and calculates the size of the UTF-8 output in bytes (not including
 173  * a NULL termination byte). The string must reside in kernel memory.
 174  *
 175  * If '/' chars are possible in the Unicode input then an alternate
 176  * (replacement) char should be provided in altslash.
 177  *
 178  * FLAGS
 179  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 180  *
 181  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 182  *
 183  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 184  *
 185  *    UTF_DECOMPOSED:  generate fully decomposed output
 186  *
 187  *    UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
 188  *
 189  * ERRORS
 190  *    None
 191  */
 192 size_t
 193 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
 194 {
 195         u_int16_t ucs_ch;
 196         u_int16_t * chp = NULL;
 197         u_int16_t sequence[8];
 198         int extra = 0;
 199         int charcnt;
 200         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 201         int decompose = (flags & UTF_DECOMPOSED);
 202         size_t len;
 203
 204         charcnt = ucslen / 2;
 205         len = 0;
 206
 207         while (charcnt-- > 0) {
 208                 if (extra > 0) {
 209                         --extra;
 210                         ucs_ch = *chp++;
 211                 } else {
 212                         ucs_ch = *ucsp++;
 213                         if (swapbytes) {
 214                                 ucs_ch = OSSwapInt16(ucs_ch);
 215                         }
 216                         if (ucs_ch == '/') {
 217                                 ucs_ch = altslash ? altslash : '_';
 218                         } else if (ucs_ch == '\0') {
 219                                 ucs_ch = UCS_ALT_NULL;
 220                         } else if (decompose && unicode_decomposeable(ucs_ch)) {
 221                                 extra = unicode_decompose(ucs_ch, sequence) - 1;
 222                                 charcnt += extra;
 223                                 ucs_ch = sequence[0];
 224                                 chp = &sequence[1];
 225                         }
 226                 }
 227                 len += UNICODE_TO_UTF8_LEN(ucs_ch);
 228         }
 229
 230         return (len);
 231 }
 232
 233
 234 /*
 235  * utf8_encodestr - Encodes a Unicode string to UTF-8
 236  *
 237  * NOTES:
 238  *    The resulting UTF-8 string is NULL terminated.
 239  *
 240  *    If '/' chars are allowed on disk then an alternate
 241  *    (replacement) char must be provided in altslash.
 242  *
 243  * input flags:
 244  *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
 245  *
 246  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 247  *
 248  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 249  *
 250  *    UTF_DECOMPOSED:  generate fully decomposed output
 251  *
 252  *    UTF_NO_NULL_TERM:  don't add NULL termination to UTF-8 output
 253  *
 254  * result:
 255  *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
 256  *
 257  *    EINVAL: Illegal char found; char was replaced by an '_'.
 258  */
 259 int
 260 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
 261                size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
 262 {
 263         u_int8_t * bufstart;
 264         u_int8_t * bufend;
 265         u_int16_t ucs_ch;
 266         u_int16_t * chp = NULL;
 267         u_int16_t sequence[8];
 268         int extra = 0;
 269         int charcnt;
 270         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 271         int nullterm  = ((flags & UTF_NO_NULL_TERM) == 0);
 272         int decompose = (flags & UTF_DECOMPOSED);
 273         int sfmconv = (flags & UTF_SFM_CONVERSIONS);
 274         int result = 0;
 275
 276         bufstart = utf8p;
 277         bufend = bufstart + buflen;
 278         if (nullterm)
 279                 --bufend;
 280         charcnt = ucslen / 2;
 281
 282         while (charcnt-- > 0) {
 283                 if (extra > 0) {
 284                         --extra;
 285                         ucs_ch = *chp++;
 286                 } else {
 287                         ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
 288
 289                         if (decompose && unicode_decomposeable(ucs_ch)) {
 290                                 extra = unicode_decompose(ucs_ch, sequence) - 1;
 291                                 charcnt += extra;
 292                                 ucs_ch = sequence[0];
 293                                 chp = &sequence[1];
 294                         }
 295                 }
 296
 297                 /* Slash and NULL are not permitted */
 298                 if (ucs_ch == '/') {
 299                         if (altslash)
 300                                 ucs_ch = altslash;
 301                         else {
 302                                 ucs_ch = '_';
 303                                 result = EINVAL;
 304                         }
 305                 } else if (ucs_ch == '\0') {
 306                         ucs_ch = UCS_ALT_NULL;
 307                 }
 308
 309                 if (ucs_ch < 0x0080) {
 310                         if (utf8p >= bufend) {
 311                                 result = ENAMETOOLONG;
 312                                 break;
 313                         }
 314                         *utf8p++ = ucs_ch;
 315
 316                 } else if (ucs_ch < 0x800) {
 317                         if ((utf8p + 1) >= bufend) {
 318                                 result = ENAMETOOLONG;
 319                                 break;
 320                         }
 321                         *utf8p++ = 0xc0 | (ucs_ch >> 6);
 322                         *utf8p++ = 0x80 | (0x3f & ucs_ch);
 323
 324                 } else {
 325                         /* These chars never valid Unicode. */
 326                         if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) {
 327                                 result = EINVAL;
 328                                 break;
 329                         }
 330
 331                         /* Combine valid surrogate pairs */
 332                         if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
 333                                 && charcnt > 0) {
 334                                 u_int16_t ch2;
 335                                 u_int32_t pair;
 336
 337                                 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
 338                                 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
 339                                         pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
 340                                                 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
 341                                         if ((utf8p + 3) >= bufend) {
 342                                                 result = ENAMETOOLONG;
 343                                                 break;
 344                                         }
 345                                         --charcnt;
 346                                         ++ucsp;
 347                                         *utf8p++ = 0xf0 | (pair >> 18);
 348                                         *utf8p++ = 0x80 | (0x3f & (pair >> 12));
 349                                         *utf8p++ = 0x80 | (0x3f & (pair >> 6));
 350                                         *utf8p++ = 0x80 | (0x3f & pair);
 351                                         continue;
 352                                 }
 353                         } else if (sfmconv) {
 354                                 ucs_ch = sfm_to_ucs(ucs_ch);
 355                                 if (ucs_ch < 0x0080) {
 356                                         if (utf8p >= bufend) {
 357                                                 result = ENAMETOOLONG;
 358                                                 break;
 359                                         }
 360                                         *utf8p++ = ucs_ch;
 361                                         continue;
 362                                 }
 363                         }
 364                         if ((utf8p + 2) >= bufend) {
 365                                 result = ENAMETOOLONG;
 366                                 break;
 367                         }
 368                         *utf8p++ = 0xe0 | (ucs_ch >> 12);
 369                         *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
 370                         *utf8p++ = 0x80 | (0x3f & ucs_ch);
 371                 }
 372         }
 373
 374         *utf8len = utf8p - bufstart;
 375         if (nullterm)
 376                 *utf8p++ = '\0';
 377
 378         return (result);
 379 }
 380
 381
 382 /*
 383  * utf8_decodestr - Decodes a UTF-8 string back to Unicode
 384  *
 385  * NOTES:
 386  *    The input UTF-8 string does not need to be null terminated
 387  *    if utf8len is set.
 388  *
 389  *    If '/' chars are allowed on disk then an alternate
 390  *    (replacement) char must be provided in altslash.
 391  *
 392  * input flags:
 393  *    UTF_REV_ENDIAN:  Unicode byte order is opposite current runtime
 394  *
 395  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 396  *
 397  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 398  *
 399  *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
 400  *
 401  *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
 402  *
 403  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 404  *
 405  * result:
 406  *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
 407  *
 408  *    EINVAL: Illegal UTF-8 sequence found.
 409  */
 410 int
 411 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
 412                size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
 413 {
 414         u_int16_t* bufstart;
 415         u_int16_t* bufend;
 416         unsigned int ucs_ch;
 417         unsigned int byte;
 418         int combcharcnt = 0;
 419         int result = 0;
 420         int decompose, precompose, swapbytes, escaping;
 421         int sfmconv;
 422         int extrabytes;
 423
 424         decompose  = (flags & UTF_DECOMPOSED);
 425         precompose = (flags & UTF_PRECOMPOSED);
 426         swapbytes  = (flags & UTF_REVERSE_ENDIAN);
 427         escaping   = (flags & UTF_ESCAPE_ILLEGAL);
 428         sfmconv    = (flags & UTF_SFM_CONVERSIONS);
 429
 430         bufstart = ucsp;
 431         bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
 432
 433         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 434                 if (ucsp >= bufend)
 435                         goto toolong;
 436
 437                 /* check for ascii */
 438                 if (byte < 0x80) {
 439                         ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
 440                 } else {
 441                         u_int32_t ch;
 442
 443                         extrabytes = utf_extrabytes[byte >> 3];
 444                         if ((extrabytes < 0) || ((int)utf8len < extrabytes)) {
 445                                 goto escape;
 446                         }
 447                         utf8len -= extrabytes;
 448
 449                         switch (extrabytes) {
 450                         case 1:
 451                                 ch = byte; ch <<= 6;   /* 1st byte */
 452                                 byte = *utf8p++;       /* 2nd byte */
 453                                 if ((byte >> 6) != 2)
 454                                         goto escape2;
 455                                 ch += byte;
 456                                 ch -= 0x00003080UL;
 457                                 if (ch < 0x0080)
 458                                         goto escape2;
 459                                 ucs_ch = ch;
 460                                 break;
 461                         case 2:
 462                                 ch = byte; ch <<= 6;   /* 1st byte */
 463                                 byte = *utf8p++;       /* 2nd byte */
 464                                 if ((byte >> 6) != 2)
 465                                         goto escape2;
 466                                 ch += byte; ch <<= 6;
 467                                 byte = *utf8p++;       /* 3rd byte */
 468                                 if ((byte >> 6) != 2)
 469                                         goto escape3;
 470                                 ch += byte;
 471                                 ch -= 0x000E2080UL;
 472                                 if (ch < 0x0800)
 473                                         goto escape3;
 474                                 if (ch >= 0xD800) {
 475                                         if (ch <= 0xDFFF)
 476                                                 goto escape3;
 477                                         if (ch == 0xFFFE || ch == 0xFFFF)
 478                                                 goto escape3;
 479                                 }
 480                                 ucs_ch = ch;
 481                                 break;
 482                         case 3:
 483                                 ch = byte; ch <<= 6;   /* 1st byte */
 484                                 byte = *utf8p++;       /* 2nd byte */
 485                                 if ((byte >> 6) != 2)
 486                                         goto escape2;
 487                                 ch += byte; ch <<= 6;
 488                                 byte = *utf8p++;       /* 3rd byte */
 489                                 if ((byte >> 6) != 2)
 490                                         goto escape3;
 491                                 ch += byte; ch <<= 6;
 492                                 byte = *utf8p++;       /* 4th byte */
 493                                 if ((byte >> 6) != 2)
 494                                         goto escape4;
 495                                 ch += byte;
 496                                 ch -= 0x03C82080UL + SP_HALF_BASE;
 497                                 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 498                                 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
 499                                         goto escape4;
 500                                 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 501                                 if (ucsp >= bufend)
 502                                         goto toolong;
 503                                 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 504                                 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
 505                                         --ucsp;
 506                                         goto escape4;
 507                                 }
 508                                 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 509                                 continue;
 510                         default:
 511                                 result = EINVAL;
 512                                 goto exit;
 513                         }
 514                         if (decompose) {
 515                                 if (unicode_decomposeable(ucs_ch)) {
 516                                         u_int16_t sequence[8];
 517                                         int count, i;
 518
 519                                         /* Before decomposing a new unicode character, sort
 520                                          * previous combining characters, if any, and reset
 521                                          * the counter.
 522                                          */
 523                                         if (combcharcnt > 1) {
 524                                                 priortysort(ucsp - combcharcnt, combcharcnt);
 525                                         }
 526                                         combcharcnt = 0;
 527
 528                                         count = unicode_decompose(ucs_ch, sequence);
 529                                         for (i = 0; i < count; ++i) {
 530                                                 ucs_ch = sequence[i];
 531                                                 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 532                                                 if (ucsp >= bufend)
 533                                                         goto toolong;
 534                                         }
 535                                         combcharcnt += count - 1;
 536                                         continue;
 537                                 }
 538                         } else if (precompose && (ucsp != bufstart)) {
 539                                 u_int16_t composite, base;
 540
 541                                 if (unicode_combinable(ucs_ch)) {
 542                                         base = swapbytes ? OSSwapInt16(*(ucsp - 1)) : *(ucsp - 1);
 543                                         composite = unicode_combine(base, ucs_ch);
 544                                         if (composite) {
 545                                                 --ucsp;
 546                                                 ucs_ch = composite;
 547                                         }
 548                                 }
 549                         }
 550                         if (ucs_ch == UCS_ALT_NULL)
 551                                 ucs_ch = '\0';
 552                 }
 553                 if (ucs_ch == altslash)
 554                         ucs_ch = '/';
 555
 556                 /*
 557                  * Make multiple combining character sequences canonical
 558                  */
 559                 if (unicode_combinable(ucs_ch)) {
 560                         ++combcharcnt;   /* start tracking a run */
 561                 } else if (combcharcnt) {
 562                         if (combcharcnt > 1) {
 563                                 priortysort(ucsp - combcharcnt, combcharcnt);
 564                         }
 565                         combcharcnt = 0;  /* start over */
 566                 }
 567
 568                 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 569                 continue;
 570
 571                 /*
 572                  * Escape illegal UTF-8 into something legal.
 573                  */
 574 escape4:
 575                 utf8p -= 3;
 576                 goto escape;
 577 escape3:
 578                 utf8p -= 2;
 579                 goto escape;
 580 escape2:
 581                 utf8p -= 1;
 582 escape:
 583                 if (!escaping) {
 584                         result = EINVAL;
 585                         goto exit;
 586                 }
 587                 if (extrabytes > 0)
 588                         utf8len += extrabytes;
 589                 byte = *(utf8p - 1);
 590
 591                 if ((ucsp + 2) >= bufend)
 592                         goto toolong;
 593
 594                 /* Make a previous combining sequence canonical. */
 595                 if (combcharcnt > 1) {
 596                         priortysort(ucsp - combcharcnt, combcharcnt);
 597                 }
 598                 combcharcnt = 0;
 599
 600                 ucs_ch = '%';
 601                 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 602                 ucs_ch =  hexdigits[byte >> 4];
 603                 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 604                 ucs_ch =  hexdigits[byte & 0x0F];
 605                 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 606         }
 607         /*
 608          * Make a previous combining sequence canonical
 609          */
 610         if (combcharcnt > 1) {
 611                 priortysort(ucsp - combcharcnt, combcharcnt);
 612         }
 613 exit:
 614         *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
 615
 616         return (result);
 617
 618 toolong:
 619         result = ENAMETOOLONG;
 620         goto exit;
 621 }
 622
 623
 624 /*
 625  * utf8_validatestr - Check for a valid UTF-8 string.
 626  */
 627 int
 628 utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
 629 {
 630         unsigned int byte;
 631         u_int32_t ch;
 632         unsigned int ucs_ch;
 633         size_t extrabytes;
 634
 635         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 636                 if (byte < 0x80)
 637                         continue;  /* plain ascii */
 638
 639                 extrabytes = utf_extrabytes[byte >> 3];
 640
 641                 if (utf8len < extrabytes)
 642                         goto invalid;
 643                 utf8len -= extrabytes;
 644
 645                 switch (extrabytes) {
 646                 case 1:
 647                         ch = byte; ch <<= 6;   /* 1st byte */
 648                         byte = *utf8p++;       /* 2nd byte */
 649                         if ((byte >> 6) != 2)
 650                                 goto invalid;
 651                         ch += byte;
 652                         ch -= 0x00003080UL;
 653                         if (ch < 0x0080)
 654                                 goto invalid;
 655                         break;
 656                 case 2:
 657                         ch = byte; ch <<= 6;   /* 1st byte */
 658                         byte = *utf8p++;       /* 2nd byte */
 659                         if ((byte >> 6) != 2)
 660                                 goto invalid;
 661                         ch += byte; ch <<= 6;
 662                         byte = *utf8p++;       /* 3rd byte */
 663                         if ((byte >> 6) != 2)
 664                                 goto invalid;
 665                         ch += byte;
 666                         ch -= 0x000E2080UL;
 667                         if (ch < 0x0800)
 668                                 goto invalid;
 669                         if (ch >= 0xD800) {
 670                                 if (ch <= 0xDFFF)
 671                                         goto invalid;
 672                                 if (ch == 0xFFFE || ch == 0xFFFF)
 673                                         goto invalid;
 674                         }
 675                         break;
 676                 case 3:
 677                         ch = byte; ch <<= 6;   /* 1st byte */
 678                         byte = *utf8p++;       /* 2nd byte */
 679                         if ((byte >> 6) != 2)
 680                                 goto invalid;
 681                         ch += byte; ch <<= 6;
 682                         byte = *utf8p++;       /* 3rd byte */
 683                         if ((byte >> 6) != 2)
 684                                 goto invalid;
 685                         ch += byte; ch <<= 6;
 686                         byte = *utf8p++;       /* 4th byte */
 687                         if ((byte >> 6) != 2)
 688                                 goto invalid;
 689                         ch += byte;
 690                         ch -= 0x03C82080UL + SP_HALF_BASE;
 691                         ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 692                         if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
 693                                 goto invalid;
 694                         ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 695                         if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
 696                                 goto invalid;
 697                         break;
 698                 default:
 699                         goto invalid;
 700                 }
 701
 702         }
 703         return (0);
 704 invalid:
 705         return (EINVAL);
 706 }
 707
 708 /*
 709  * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
 710  *
 711  * This function takes an UTF-8 input string, instr, of inlen bytes
 712  * and produces normalized UTF-8 output into a buffer of buflen bytes
 713  * pointed to by outstr. The size of the output in bytes (not including
 714  * a NULL termination byte) is returned in outlen. In-place conversions
  715  * are not supported (i.e. instr != outstr).
  716  *
 717  * FLAGS
 718  *    UTF_DECOMPOSED:  output string will be fully decomposed (NFD)
 719  *
 720  *    UTF_PRECOMPOSED:  output string will be precomposed (NFC)
 721  *
 722  *    UTF_NO_NULL_TERM:  do not add null termination to output string
 723  *
 724  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 725  *
 726  * ERRORS
 727  *    ENAMETOOLONG:  output did not fit or input exceeded MAXPATHLEN bytes
 728  *
 729  *    EINVAL:  illegal UTF-8 sequence encountered or invalid flags
 730  */
 731 int
 732 utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
 733                   size_t *outlen, size_t buflen, int flags)
 734 {
 735         u_int16_t unicodebuf[32];
 736         u_int16_t* unistr = NULL;
 737         size_t unicode_bytes;
 738         size_t uft8_bytes;
 739         size_t inbuflen;
 740         u_int8_t *outbufstart, *outbufend;
 741         const u_int8_t *inbufstart;
 742         unsigned int byte;
 743         int decompose, precompose;
 744         int result = 0;
 745
 746         if (flags & ~(UTF_DECOMPOSED | UTF_PRECOMPOSED | UTF_NO_NULL_TERM | UTF_ESCAPE_ILLEGAL)) {
 747                 return (EINVAL);
 748         }
 749         decompose = (flags & UTF_DECOMPOSED);
 750         precompose = (flags & UTF_PRECOMPOSED);
 751         if ((decompose && precompose) || (!decompose && !precompose)) {
 752                 return (EINVAL);
 753         }
 754         outbufstart = outstr;
 755         outbufend = outbufstart + buflen;
 756         inbufstart = instr;
 757         inbuflen = inlen;
 758
 759         while (inlen-- > 0 && (byte = *instr++) != '\0') {
 760                 if (outstr >= outbufend) {
 761                         result = ENAMETOOLONG;
 762                         goto exit;
 763                 }
 764                 if (byte >= 0x80) {
 765                         goto nonASCII;
 766                 }
 767                 /* ASCII is already normalized. */
 768                 *outstr++ = byte;
 769         }
 770 exit:
 771         *outlen = outstr - outbufstart;
 772         if (((flags & UTF_NO_NULL_TERM) == 0)) {
 773                 if (outstr < outbufend)
 774                         *outstr++ = '\0';
 775                 else
 776                         result = ENAMETOOLONG;
 777         }
 778         return (result);
 779
 780
 781         /*
 782          * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
 783          * functions to perform the normalization.  Since this will
 784          * presumably be used to normalize filenames in the back-end
 785          * (on disk or over-the-wire), it should be fast enough.
 786          */
 787 nonASCII:
 788
 789         /* Make sure the input size is reasonable. */
 790         if (inbuflen > MAXPATHLEN) {
 791                 result = ENAMETOOLONG;
 792                 goto exit;
 793         }
 794         /*
 795          * Compute worst case Unicode buffer size.
 796          *
 797          * For pre-composed output, every UTF-8 input byte will be at
 798          * most 2 Unicode bytes.  For decomposed output, 2 UTF-8 bytes
 799          * (smallest composite char sequence) may yield 6 Unicode bytes
 800          * (1 base char + 2 combining chars).
 801          */
 802         unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
 803
 804         if (unicode_bytes <= sizeof(unicodebuf))
 805                 unistr = &unicodebuf[0];
 806         else
 807                 MALLOC(unistr, u_int16_t *, unicode_bytes, M_TEMP, M_WAITOK);
 808
 809         /* Normalize the string. */
 810         result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
 811                                 unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
 812         if (result == 0) {
 813                 /* Put results back into UTF-8. */
 814                 result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
 815                                         &uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
 816                 outstr = outbufstart + uft8_bytes;
 817         }
 818         if (unistr && unistr != &unicodebuf[0]) {
 819                 FREE(unistr, M_TEMP);
 820         }
 821         goto exit;
 822 }
 823
 824
 825  /*
 826   * Unicode 3.2 decomposition code (derived from Core Foundation)
 827   */
 828
 829 typedef struct {
 830         u_int32_t _key;
 831         u_int32_t _value;
 832 } unicode_mappings32;
 833
 834 static inline u_int32_t
 835 getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
 836                 u_int16_t character)
 837 {
 838         const unicode_mappings32 *p, *q, *divider;
 839
 840         if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
 841                 return (0);
 842
 843         p = theTable;
 844         q = p + (numElem-1);
 845         while (p <= q) {
 846                 divider = p + ((q - p) >> 1);   /* divide by 2 */
 847                 if (character < divider->_key) { q = divider - 1; }
 848                 else if (character > divider->_key) { p = divider + 1; }
 849                 else { return (divider->_value); }
 850         }
 851         return (0);
 852 }
 853
 854 #define RECURSIVE_DECOMPOSITION (1 << 15)
 855 #define EXTRACT_COUNT(value)    (((value) >> 12) & 0x0007)
 856
 857 typedef struct {
 858         u_int16_t _key;
 859         u_int16_t _value;
 860 } unicode_mappings16;
 861
 862 static inline u_int16_t
 863 getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
 864                 u_int16_t character)
 865 {
 866         const unicode_mappings16 *p, *q, *divider;
 867
 868         if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
 869                 return (0);
 870
 871         p = theTable;
 872         q = p + (numElem-1);
 873         while (p <= q) {
 874                 divider = p + ((q - p) >> 1);   /* divide by 2 */
 875                 if (character < divider->_key)
 876                         q = divider - 1;
 877                 else if (character > divider->_key)
 878                         p = divider + 1;
 879                 else
 880                         return (divider->_value);
 881         }
 882         return (0);
 883 }
 884
 885
 886 static u_int32_t
 887 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
 888 {
 889         u_int16_t value;
 890         u_int32_t length;
 891         u_int16_t firstChar;
 892         u_int16_t theChar;
 893         const u_int16_t *bmpMappings;
 894         u_int32_t usedLength;
 895
 896         value = getmappedvalue16(
 897                 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
 898                 __UniCharDecompositionTableLength, character);
 899         length = EXTRACT_COUNT(value);
 900         firstChar = value & 0x0FFF;
 901         theChar = firstChar;
 902         bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
 903         usedLength = 0;
 904
 905         if (value & RECURSIVE_DECOMPOSITION) {
 906             usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
 907
 908             --length;   /* Decrement for the first char */
 909             if (!usedLength)
 910                 return 0;
 911             ++bmpMappings;
 912             convertedChars += usedLength;
 913         }
 914
 915         usedLength += length;
 916
 917         while (length--)
 918                 *(convertedChars++) = *(bmpMappings++);
 919
 920         return (usedLength);
 921 }
 922
 923 #define HANGUL_SBASE 0xAC00
 924 #define HANGUL_LBASE 0x1100
 925 #define HANGUL_VBASE 0x1161
 926 #define HANGUL_TBASE 0x11A7
 927
 928 #define HANGUL_SCOUNT 11172
 929 #define HANGUL_LCOUNT 19
 930 #define HANGUL_VCOUNT 21
 931 #define HANGUL_TCOUNT 28
 932 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
 933
 934 /*
 935  * unicode_decompose - decompose a composed Unicode char
 936  *
 937  * Composed Unicode characters are forbidden on
 938  * HFS Plus volumes. ucs_decompose will convert a
 939  * composed character into its correct decomposed
 940  * sequence.
 941  *
 942  * Similar to CFUniCharDecomposeCharacter
 943  */
 944 static int
 945 unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
 946 {
 947         if ((character >= HANGUL_SBASE) &&
 948             (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
 949                 u_int32_t length;
 950
 951                 character -= HANGUL_SBASE;
 952                 length = (character % HANGUL_TCOUNT ? 3 : 2);
 953
 954                 *(convertedChars++) =
 955                         character / HANGUL_NCOUNT + HANGUL_LBASE;
 956                 *(convertedChars++) =
 957                         (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
 958                 if (length > 2)
 959                         *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
 960                 return (length);
 961         } else {
 962                 return (unicode_recursive_decompose(character, convertedChars));
 963         }
 964 }
 965
 966 /*
 967  * unicode_combine - generate a precomposed Unicode char
 968  *
 969  * Precomposed Unicode characters are required for some volume
 970  * formats and network protocols.  unicode_combine will combine
 971  * a decomposed character sequence into a single precomposed
 972  * (composite) character.
 973  *
 974  * Similar toCFUniCharPrecomposeCharacter but unicode_combine
 975  * also handles Hangul Jamo characters.
 976  */
 977 static u_int16_t
 978 unicode_combine(u_int16_t base, u_int16_t combining)
 979 {
 980         u_int32_t value;
 981
 982         /* Check HANGUL */
 983         if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
 984                 /* 2 char Hangul sequences */
 985                 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
 986                     (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
 987                     return (HANGUL_SBASE +
 988                             ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
 989                             ((combining  - HANGUL_VBASE)*HANGUL_TCOUNT));
 990                 }
 991
 992                 /* 3 char Hangul sequences */
 993                 if ((combining > HANGUL_TBASE) &&
 994                     (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
 995                         if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
 996                                 return (0);
 997                         else
 998                                 return (base + (combining - HANGUL_TBASE));
 999                 }
1000         }
1001
1002         value = getmappedvalue32(
1003                 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
1004                 __CFUniCharPrecompositionTableLength, combining);
1005
1006         if (value) {
1007                 value = getmappedvalue16(
1008                         (const unicode_mappings16 *)
1009                         ((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
1010                         (value >> 16), base);
1011         }
1012         return (value);
1013 }
1014
1015
/*
 * priortysort - order combining chars into canonical order
 *
 * Repeatedly sweeps the count characters at characters, swapping any
 * adjacent pair whose left combining class is greater than a non-zero
 * right combining class, until a sweep makes no swap.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
priortysort(u_int16_t* characters, int count)
{
        u_int16_t *left, *right;
        u_int16_t *limit;
        u_int32_t left_class, right_class;
        int swapped;

        limit = characters + count;
        do {
                swapped = 0;
                right_class = get_combining_class(characters[0]);
                for (left = characters, right = characters + 1;
                     right < limit;
                     ++left, ++right) {
                        left_class = right_class;
                        right_class = get_combining_class(*right);

                        /* Already ordered, or right has class 0 and must not move. */
                        if (right_class == 0 || left_class <= right_class)
                                continue;

                        {
                                u_int32_t held;

                                held = *left;
                                *left = *right;
                                *right = held;
                        }
                        swapped = 1;

                        /*
                         * The character now at *right is the one that had
                         * left_class.  Carrying its class forward isn't
                         * required for correctness, but avoids a stale value
                         * when a character with a large combining class has
                         * to "bubble past" several lower-class characters.
                         */
                        right_class = left_class;
                }
        } while (swapped);
}
1060
1061
/*
 * Invalid NTFS filename characters are encoded using the
 * SFM (Services for Macintosh) private use Unicode characters.
 *
 * These should only be used for SMB, MSDOS or NTFS.
 *
 *    Illegal NTFS Char   SFM Unicode Char
 *  ----------------------------------------
 *    0x01-0x1f           0xf001-0xf01f
 *    '"'                 0xf020
 *    '*'                 0xf021
 *    '/'                 0xf022
 *    '<'                 0xf023
 *    '>'                 0xf024
 *    '?'                 0xf025
 *    '\'                 0xf026
 *    '|'                 0xf027
 *    ' '                 0xf028  (Only if last char of the name)
 *    '.'                 0xf029  (Only if last char of the name)
 *  ----------------------------------------
 *
 *  Reference: http://support.microsoft.com/kb/q117258/
 */

/* Largest low-order value that has an entry in sfm2mac. */
#define MAX_SFM2MAC           0x29
/* High-order bits shared by every SFM private-use code. */
#define SFMCODE_PREFIX_MASK   0xf000

/*
 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
 * SFM had no conversion for the colon. There is a conversion for the
 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
 * is a slash and a slash is a colon. So we can just replace the slash with the
 * colon in our tables and everything will just work.
 *
 * sfm2mac is indexed by the low-order bits of an SFM code (0x00 through
 * MAX_SFM2MAC) and yields the character it stands for.  The table is
 * only ever read, so it is declared const.
 */
static const u_int8_t
sfm2mac[42] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 00 - 07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 08 - 0F */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,   /* 10 - 17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,   /* 18 - 1F */
        0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,   /* 20 - 27 */
        0x20, 0x2e                                        /* 28 - 29 */
};
1105
/*
 * mac2sfm is indexed by (ASCII character - 0x20).  An entry equal to the
 * character itself means the character is legal and needs no conversion;
 * any other entry is the low-order byte of the SFM code that replaces it.
 * Only the 96 entries for 0x20 - 0x7f are initialized explicitly; the
 * remaining entries of the declared 112 are zero and are never consulted.
 * The table is only ever read, so it is declared const.
 */
static const u_int8_t
mac2sfm[112] = {
        0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,   /* 20 - 27 */
        0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,   /* 28 - 2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,   /* 30 - 37 */
        0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,   /* 38 - 3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,   /* 40 - 47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,   /* 48 - 4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,   /* 50 - 57 */
        0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,   /* 58 - 5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,   /* 60 - 67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,   /* 68 - 6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,   /* 70 - 77 */
        0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f    /* 78 - 7f */
};


/*
 * Encode illegal NTFS filename characters into SFM Private Unicode characters
 *
 * ucs_ch is the character to encode; lastchar is non-zero when it is the
 * final character of the name (a trailing space or period gets its own
 * SFM code).  Returns the SFM code for an illegal character, or ucs_ch
 * unchanged when no conversion applies.
 *
 * Intended for non-zero ASCII input.  Characters above 0x7f have no SFM
 * mapping and are returned unchanged rather than being looked up beyond
 * the initialized part of mac2sfm.
 */
static u_int16_t
ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
{
        /* The last character of filename cannot be a space or period. */
        if (lastchar) {
                if (ucs_ch == 0x20)
                        return (0xf028);
                else if (ucs_ch == 0x2e)
                        return (0xf029);
        }
        /* Non-ASCII is never an illegal NTFS character; leave it alone. */
        if (ucs_ch > 0x7f)
                return (ucs_ch);

        /* 0x01 - 0x1f is simple transformation. */
        if (ucs_ch <= 0x1f) {
                return (ucs_ch | 0xf000);
        } else /* 0x20 - 0x7f */ {
                u_int16_t lsb;

                lsb = mac2sfm[ucs_ch - 0x0020];
                /* A differing table entry marks an illegal character. */
                if (lsb != ucs_ch)
                        return(0xf000 | lsb);
        }
        return (ucs_ch);
}
1150
1151 /*
1152  * Decode any SFM Private Unicode characters
1153  */
1154 static u_int16_t
1155 sfm_to_ucs(u_int16_t ucs_ch)
1156 {
1157         if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
1158             ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
1159                 ucs_ch = sfm2mac[ucs_ch & 0x003f];
1160         }
1161         return (ucs_ch);
1162 }
1163
1164