bsd/vfs/vfs_utfconv.c

   1 /*
   2  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  *      Includes Unicode 3.2 decomposition code derived from Core Foundation
  31  */
  32
  33 #include <sys/param.h>
  34 #include <sys/utfconv.h>
  35 #include <sys/errno.h>
  36 #include <sys/malloc.h>
  37 #include <libkern/OSByteOrder.h>
  38
  39 #if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST)
  40 #include <kern/assert.h>
  41 #else
  42 #include <assert.h>
  43 #endif
  44
  45 /*
  46  * UTF-8 (Unicode Transformation Format)
  47  *
  48  * UTF-8 is the Unicode Transformation Format that serializes a Unicode
  49  * character as a sequence of one to four bytes. Only the shortest form
  50  * required to represent the significant Unicode bits is legal.
  51  *
  52  * UTF-8 Multibyte Codes
  53  *
  54  * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary)
  55  * -----------------------------------------------------------------------------
  56  *   1       7       0x0000        0x007F    0xxxxxxx
  57  *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
  58  *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
  59  *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  60  * -----------------------------------------------------------------------------
  61  */
  62
  63
  64 #define UNICODE_TO_UTF8_LEN(c)  \
  65         ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
  66
  67 #define UCS_ALT_NULL    0x2400
  68
  69 /* Surrogate Pair Constants */
  70 #define SP_HALF_SHIFT   10
  71 #define SP_HALF_BASE    0x0010000u
  72 #define SP_HALF_MASK    0x3FFu
  73
  74 #define SP_HIGH_FIRST   0xD800u
  75 #define SP_HIGH_LAST    0xDBFFu
  76 #define SP_LOW_FIRST    0xDC00u
  77 #define SP_LOW_LAST             0xDFFFu
  78
  79
  80 #include "vfs_utfconvdata.h"
  81
  82
  83 /*
  84  * Test for a combining character.
  85  *
  86  * Similar to __CFUniCharIsNonBaseCharacter except that
  87  * unicode_combinable also includes Hangul Jamo characters.
  88  */
  89 int
  90 unicode_combinable(u_int16_t character)
  91 {
  92         const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
  93         u_int8_t value;
  94
  95         if (character < 0x0300) {
  96                 return 0;
  97         }
  98
  99         value = bitmap[(character >> 8) & 0xFF];
 100
 101         if (value == 0xFF) {
 102                 return 1;
 103         } else if (value) {
 104                 bitmap = bitmap + ((value - 1) * 32) + 256;
 105                 return bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0;
 106         }
 107         return 0;
 108 }
 109
 110 /*
 111  * Test for a precomposed character.
 112  *
 113  * Similar to __CFUniCharIsDecomposableCharacter.
 114  */
 115 int
 116 unicode_decomposeable(u_int16_t character)
 117 {
 118         const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
 119         u_int8_t value;
 120
 121         if (character < 0x00C0) {
 122                 return 0;
 123         }
 124
 125         value = bitmap[(character >> 8) & 0xFF];
 126
 127         if (value == 0xFF) {
 128                 return 1;
 129         } else if (value) {
 130                 bitmap = bitmap + ((value - 1) * 32) + 256;
 131                 return bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0;
 132         }
 133         return 0;
 134 }
 135
 136
 137 /*
 138  * Get the combing class.
 139  *
 140  * Similar to CFUniCharGetCombiningPropertyForCharacter.
 141  */
 142 static inline u_int8_t
 143 get_combining_class(u_int16_t character)
 144 {
 145         const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
 146
 147         u_int8_t value = bitmap[(character >> 8)];
 148
 149         if (value) {
 150                 bitmap = bitmap + (value * 256);
 151                 return bitmap[character % 256];
 152         }
 153         return 0;
 154 }
 155
 156
 157 static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
 158
 159 static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
 160
 161 static void prioritysort(u_int16_t* characters, int count);
 162
 163 static u_int16_t  ucs_to_sfm(u_int16_t ucs_ch, int lastchar);
 164
 165 static u_int16_t  sfm_to_ucs(u_int16_t ucs_ch);
 166
 167
 168 char utf_extrabytes[32] = {
 169         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 170         -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1
 171 };
 172
 173 const char hexdigits[16] = {
 174         '0', '1', '2', '3', '4', '5', '6', '7',
 175         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
 176 };
 177
 178 /*
 179  * utf8_encodelen - Calculate the UTF-8 encoding length
 180  *
 181  * This function takes a Unicode input string, ucsp, of ucslen bytes
 182  * and calculates the size of the UTF-8 output in bytes (not including
 183  * a NULL termination byte). The string must reside in kernel memory.
 184  *
 185  * If '/' chars are possible in the Unicode input then an alternate
 186  * (replacement) char should be provided in altslash.
 187  *
 188  * FLAGS
 189  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 190  *
 191  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 192  *
 193  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 194  *
 195  *    UTF_DECOMPOSED:  generate fully decomposed output
 196  *
 197  *    UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
 198  *
 199  * ERRORS
 200  *    None
 201  */
 202 size_t
 203 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
 204 {
 205         u_int16_t ucs_ch;
 206         u_int16_t * chp = NULL;
 207         u_int16_t sequence[8];
 208         int extra = 0;
 209         size_t charcnt;
 210         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 211         int decompose = (flags & UTF_DECOMPOSED);
 212         size_t len;
 213
 214         charcnt = ucslen / 2;
 215         len = 0;
 216
 217         while (charcnt-- > 0) {
 218                 if (extra > 0) {
 219                         --extra;
 220                         ucs_ch = *chp++;
 221                 } else {
 222                         ucs_ch = *ucsp++;
 223                         if (swapbytes) {
 224                                 ucs_ch = OSSwapInt16(ucs_ch);
 225                         }
 226                         if (ucs_ch == '/') {
 227                                 ucs_ch = altslash ? altslash : '_';
 228                         } else if (ucs_ch == '\0') {
 229                                 ucs_ch = UCS_ALT_NULL;
 230                         } else if (decompose && unicode_decomposeable(ucs_ch)) {
 231                                 extra = unicode_decompose(ucs_ch, sequence) - 1;
 232                                 charcnt += extra;
 233                                 ucs_ch = sequence[0];
 234                                 chp = &sequence[1];
 235                         }
 236                 }
 237                 len += UNICODE_TO_UTF8_LEN(ucs_ch);
 238         }
 239
 240         return len;
 241 }
 242
 243
 244 /*
 245  * utf8_encodestr - Encodes a Unicode string to UTF-8
 246  *
 247  * NOTES:
 248  *    The resulting UTF-8 string is NULL terminated.
 249  *
 250  *    If '/' chars are allowed on disk then an alternate
 251  *    (replacement) char must be provided in altslash.
 252  *
 253  * input flags:
 254  *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
 255  *
 256  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 257  *
 258  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 259  *
 260  *    UTF_DECOMPOSED:  generate fully decomposed output
 261  *
 262  *    UTF_NO_NULL_TERM:  don't add NULL termination to UTF-8 output
 263  *
 264  * result:
 265  *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
 266  *
 267  *    EINVAL: Illegal char found; char was replaced by an '_'.
 268  */
 269 int
 270 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
 271     size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
 272 {
 273         u_int8_t * bufstart;
 274         u_int8_t * bufend;
 275         u_int16_t ucs_ch;
 276         u_int16_t * chp = NULL;
 277         u_int16_t sequence[8];
 278         int extra = 0;
 279         size_t charcnt;
 280         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 281         int nullterm  = ((flags & UTF_NO_NULL_TERM) == 0);
 282         int decompose = (flags & UTF_DECOMPOSED);
 283         int sfmconv = (flags & UTF_SFM_CONVERSIONS);
 284         int result = 0;
 285
 286         bufstart = utf8p;
 287         bufend = bufstart + buflen;
 288         if (nullterm) {
 289                 --bufend;
 290         }
 291         charcnt = ucslen / 2;
 292
 293         while (charcnt-- > 0) {
 294                 if (extra > 0) {
 295                         --extra;
 296                         ucs_ch = *chp++;
 297                 } else {
 298                         ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
 299
 300                         if (decompose && unicode_decomposeable(ucs_ch)) {
 301                                 extra = unicode_decompose(ucs_ch, sequence) - 1;
 302                                 charcnt += extra;
 303                                 ucs_ch = sequence[0];
 304                                 chp = &sequence[1];
 305                         }
 306                 }
 307
 308                 /* Slash and NULL are not permitted */
 309                 if (ucs_ch == '/') {
 310                         if (altslash) {
 311                                 ucs_ch = altslash;
 312                         } else {
 313                                 ucs_ch = '_';
 314                                 result = EINVAL;
 315                         }
 316                 } else if (ucs_ch == '\0') {
 317                         ucs_ch = UCS_ALT_NULL;
 318                 }
 319
 320                 if (ucs_ch < 0x0080) {
 321                         if (utf8p >= bufend) {
 322                                 result = ENAMETOOLONG;
 323                                 break;
 324                         }
 325                         *utf8p++ = ucs_ch;
 326                 } else if (ucs_ch < 0x800) {
 327                         if ((utf8p + 1) >= bufend) {
 328                                 result = ENAMETOOLONG;
 329                                 break;
 330                         }
 331                         *utf8p++ = 0xc0 | (ucs_ch >> 6);
 332                         *utf8p++ = 0x80 | (0x3f & ucs_ch);
 333                 } else {
 334                         /* These chars never valid Unicode. */
 335                         if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) {
 336                                 result = EINVAL;
 337                                 break;
 338                         }
 339
 340                         /* Combine valid surrogate pairs */
 341                         if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
 342                             && charcnt > 0) {
 343                                 u_int16_t ch2;
 344                                 u_int32_t pair;
 345
 346                                 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
 347                                 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
 348                                         pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
 349                                             + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
 350                                         if ((utf8p + 3) >= bufend) {
 351                                                 result = ENAMETOOLONG;
 352                                                 break;
 353                                         }
 354                                         --charcnt;
 355                                         ++ucsp;
 356                                         *utf8p++ = 0xf0 | (pair >> 18);
 357                                         *utf8p++ = 0x80 | (0x3f & (pair >> 12));
 358                                         *utf8p++ = 0x80 | (0x3f & (pair >> 6));
 359                                         *utf8p++ = 0x80 | (0x3f & pair);
 360                                         continue;
 361                                 }
 362                         } else if (sfmconv) {
 363                                 ucs_ch = sfm_to_ucs(ucs_ch);
 364                                 if (ucs_ch < 0x0080) {
 365                                         if (utf8p >= bufend) {
 366                                                 result = ENAMETOOLONG;
 367                                                 break;
 368                                         }
 369                                         *utf8p++ = ucs_ch;
 370                                         continue;
 371                                 }
 372                         }
 373                         if ((utf8p + 2) >= bufend) {
 374                                 result = ENAMETOOLONG;
 375                                 break;
 376                         }
 377                         *utf8p++ = 0xe0 | (ucs_ch >> 12);
 378                         *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
 379                         *utf8p++ = 0x80 | (0x3f & ucs_ch);
 380                 }
 381         }
 382
 383         *utf8len = utf8p - bufstart;
 384         if (nullterm) {
 385                 *utf8p++ = '\0';
 386         }
 387
 388         return result;
 389 }
 390
 391 // Pushes a character taking account of combining character sequences
 392 static void
 393 push(uint16_t ucs_ch, int *combcharcnt, uint16_t **ucsp)
 394 {
 395         /*
 396          * Make multiple combining character sequences canonical
 397          */
 398         if (unicode_combinable(ucs_ch)) {
 399                 ++*combcharcnt;         /* start tracking a run */
 400         } else if (*combcharcnt) {
 401                 if (*combcharcnt > 1) {
 402                         prioritysort(*ucsp - *combcharcnt, *combcharcnt);
 403                 }
 404                 *combcharcnt = 0;       /* start over */
 405         }
 406
 407         *(*ucsp)++ = ucs_ch;
 408 }
 409
 410 /*
 411  * utf8_decodestr - Decodes a UTF-8 string back to Unicode
 412  *
 413  * NOTES:
 414  *    The input UTF-8 string does not need to be null terminated
 415  *    if utf8len is set.
 416  *
 417  *    If '/' chars are allowed on disk then an alternate
 418  *    (replacement) char must be provided in altslash.
 419  *
 420  * input flags:
 421  *    UTF_REV_ENDIAN:  Unicode byte order is opposite current runtime
 422  *
 423  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 424  *
 425  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 426  *
 427  *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
 428  *
 429  *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
 430  *
 431  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 432  *
 433  * result:
 434  *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
 435  *
 436  *    EINVAL: Illegal UTF-8 sequence found.
 437  */
 438 int
 439 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
 440     size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
 441 {
 442         u_int16_t* bufstart;
 443         u_int16_t* bufend;
 444         unsigned int ucs_ch;
 445         unsigned int byte;
 446         int combcharcnt = 0;
 447         int result = 0;
 448         int decompose, precompose, escaping;
 449         int sfmconv;
 450         int extrabytes;
 451
 452         decompose  = (flags & UTF_DECOMPOSED);
 453         precompose = (flags & UTF_PRECOMPOSED);
 454         escaping   = (flags & UTF_ESCAPE_ILLEGAL);
 455         sfmconv    = (flags & UTF_SFM_CONVERSIONS);
 456
 457         bufstart = ucsp;
 458         bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
 459
 460         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 461                 if (ucsp >= bufend) {
 462                         goto toolong;
 463                 }
 464
 465                 /* check for ascii */
 466                 if (byte < 0x80) {
 467                         ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
 468                 } else {
 469                         u_int32_t ch;
 470
 471                         extrabytes = utf_extrabytes[byte >> 3];
 472                         if ((extrabytes < 0) || ((int)utf8len < extrabytes)) {
 473                                 goto escape;
 474                         }
 475                         utf8len -= extrabytes;
 476
 477                         switch (extrabytes) {
 478                         case 1:
 479                                 ch = byte; ch <<= 6;   /* 1st byte */
 480                                 byte = *utf8p++;       /* 2nd byte */
 481                                 if ((byte >> 6) != 2) {
 482                                         goto escape2;
 483                                 }
 484                                 ch += byte;
 485                                 ch -= 0x00003080UL;
 486                                 if (ch < 0x0080) {
 487                                         goto escape2;
 488                                 }
 489                                 ucs_ch = ch;
 490                                 break;
 491                         case 2:
 492                                 ch = byte; ch <<= 6;   /* 1st byte */
 493                                 byte = *utf8p++;       /* 2nd byte */
 494                                 if ((byte >> 6) != 2) {
 495                                         goto escape2;
 496                                 }
 497                                 ch += byte; ch <<= 6;
 498                                 byte = *utf8p++;       /* 3rd byte */
 499                                 if ((byte >> 6) != 2) {
 500                                         goto escape3;
 501                                 }
 502                                 ch += byte;
 503                                 ch -= 0x000E2080UL;
 504                                 if (ch < 0x0800) {
 505                                         goto escape3;
 506                                 }
 507                                 if (ch >= 0xD800) {
 508                                         if (ch <= 0xDFFF) {
 509                                                 goto escape3;
 510                                         }
 511                                         if (ch == 0xFFFE || ch == 0xFFFF) {
 512                                                 goto escape3;
 513                                         }
 514                                 }
 515                                 ucs_ch = ch;
 516                                 break;
 517                         case 3:
 518                                 ch = byte; ch <<= 6;   /* 1st byte */
 519                                 byte = *utf8p++;       /* 2nd byte */
 520                                 if ((byte >> 6) != 2) {
 521                                         goto escape2;
 522                                 }
 523                                 ch += byte; ch <<= 6;
 524                                 byte = *utf8p++;       /* 3rd byte */
 525                                 if ((byte >> 6) != 2) {
 526                                         goto escape3;
 527                                 }
 528                                 ch += byte; ch <<= 6;
 529                                 byte = *utf8p++;       /* 4th byte */
 530                                 if ((byte >> 6) != 2) {
 531                                         goto escape4;
 532                                 }
 533                                 ch += byte;
 534                                 ch -= 0x03C82080UL + SP_HALF_BASE;
 535                                 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 536                                 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST) {
 537                                         goto escape4;
 538                                 }
 539                                 push(ucs_ch, &combcharcnt, &ucsp);
 540                                 if (ucsp >= bufend) {
 541                                         goto toolong;
 542                                 }
 543                                 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 544                                 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
 545                                         --ucsp;
 546                                         goto escape4;
 547                                 }
 548                                 *ucsp++ = ucs_ch;
 549                                 continue;
 550                         default:
 551                                 result = EINVAL;
 552                                 goto exit;
 553                         }
 554                         if (decompose) {
 555                                 if (unicode_decomposeable(ucs_ch)) {
 556                                         u_int16_t sequence[8];
 557                                         int count, i;
 558
 559                                         count = unicode_decompose(ucs_ch, sequence);
 560
 561                                         for (i = 0; i < count; ++i) {
 562                                                 if (ucsp >= bufend) {
 563                                                         goto toolong;
 564                                                 }
 565
 566                                                 push(sequence[i], &combcharcnt, &ucsp);
 567                                         }
 568
 569                                         continue;
 570                                 }
 571                         } else if (precompose && (ucsp != bufstart)) {
 572                                 u_int16_t composite, base;
 573
 574                                 if (unicode_combinable(ucs_ch)) {
 575                                         base = ucsp[-1];
 576                                         composite = unicode_combine(base, ucs_ch);
 577                                         if (composite) {
 578                                                 --ucsp;
 579                                                 ucs_ch = composite;
 580                                         }
 581                                 }
 582                         }
 583                         if (ucs_ch == UCS_ALT_NULL) {
 584                                 ucs_ch = '\0';
 585                         }
 586                 }
 587                 if (ucs_ch == altslash) {
 588                         ucs_ch = '/';
 589                 }
 590
 591                 push(ucs_ch, &combcharcnt, &ucsp);
 592                 continue;
 593
 594                 /*
 595                  * Escape illegal UTF-8 into something legal.
 596                  */
 597 escape4:
 598                 utf8p -= 3;
 599                 goto escape;
 600 escape3:
 601                 utf8p -= 2;
 602                 goto escape;
 603 escape2:
 604                 utf8p -= 1;
 605 escape:
 606                 if (!escaping) {
 607                         result = EINVAL;
 608                         goto exit;
 609                 }
 610                 if (extrabytes > 0) {
 611                         utf8len += extrabytes;
 612                 }
 613                 byte = *(utf8p - 1);
 614
 615                 if ((ucsp + 2) >= bufend) {
 616                         goto toolong;
 617                 }
 618
 619                 /* Make a previous combining sequence canonical. */
 620                 if (combcharcnt > 1) {
 621                         prioritysort(ucsp - combcharcnt, combcharcnt);
 622                 }
 623                 combcharcnt = 0;
 624
 625                 ucs_ch = '%';
 626                 *ucsp++ = ucs_ch;
 627                 ucs_ch =  hexdigits[byte >> 4];
 628                 *ucsp++ = ucs_ch;
 629                 ucs_ch =  hexdigits[byte & 0x0F];
 630                 *ucsp++ = ucs_ch;
 631         }
 632         /*
 633          * Make a previous combining sequence canonical
 634          */
 635         if (combcharcnt > 1) {
 636                 prioritysort(ucsp - combcharcnt, combcharcnt);
 637         }
 638
 639         if (flags & UTF_REVERSE_ENDIAN) {
 640                 uint16_t *p = bufstart;
 641                 while (p < ucsp) {
 642                         *p = OSSwapInt16(*p);
 643                         ++p;
 644                 }
 645         }
 646
 647 exit:
 648         *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
 649
 650         return result;
 651
 652 toolong:
 653         result = ENAMETOOLONG;
 654         goto exit;
 655 }
 656
 657
 658 /*
 659  * utf8_validatestr - Check for a valid UTF-8 string.
 660  */
 661 int
 662 utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
 663 {
 664         unsigned int byte;
 665         u_int32_t ch;
 666         unsigned int ucs_ch;
 667         size_t extrabytes;
 668
 669         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 670                 if (byte < 0x80) {
 671                         continue;  /* plain ascii */
 672                 }
 673                 extrabytes = utf_extrabytes[byte >> 3];
 674
 675                 if (utf8len < extrabytes) {
 676                         goto invalid;
 677                 }
 678                 utf8len -= extrabytes;
 679
 680                 switch (extrabytes) {
 681                 case 1:
 682                         ch = byte; ch <<= 6;   /* 1st byte */
 683                         byte = *utf8p++;       /* 2nd byte */
 684                         if ((byte >> 6) != 2) {
 685                                 goto invalid;
 686                         }
 687                         ch += byte;
 688                         ch -= 0x00003080UL;
 689                         if (ch < 0x0080) {
 690                                 goto invalid;
 691                         }
 692                         break;
 693                 case 2:
 694                         ch = byte; ch <<= 6;   /* 1st byte */
 695                         byte = *utf8p++;       /* 2nd byte */
 696                         if ((byte >> 6) != 2) {
 697                                 goto invalid;
 698                         }
 699                         ch += byte; ch <<= 6;
 700                         byte = *utf8p++;       /* 3rd byte */
 701                         if ((byte >> 6) != 2) {
 702                                 goto invalid;
 703                         }
 704                         ch += byte;
 705                         ch -= 0x000E2080UL;
 706                         if (ch < 0x0800) {
 707                                 goto invalid;
 708                         }
 709                         if (ch >= 0xD800) {
 710                                 if (ch <= 0xDFFF) {
 711                                         goto invalid;
 712                                 }
 713                                 if (ch == 0xFFFE || ch == 0xFFFF) {
 714                                         goto invalid;
 715                                 }
 716                         }
 717                         break;
 718                 case 3:
 719                         ch = byte; ch <<= 6;   /* 1st byte */
 720                         byte = *utf8p++;       /* 2nd byte */
 721                         if ((byte >> 6) != 2) {
 722                                 goto invalid;
 723                         }
 724                         ch += byte; ch <<= 6;
 725                         byte = *utf8p++;       /* 3rd byte */
 726                         if ((byte >> 6) != 2) {
 727                                 goto invalid;
 728                         }
 729                         ch += byte; ch <<= 6;
 730                         byte = *utf8p++;       /* 4th byte */
 731                         if ((byte >> 6) != 2) {
 732                                 goto invalid;
 733                         }
 734                         ch += byte;
 735                         ch -= 0x03C82080UL + SP_HALF_BASE;
 736                         ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 737                         if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST) {
 738                                 goto invalid;
 739                         }
 740                         ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 741                         if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
 742                                 goto invalid;
 743                         }
 744                         break;
 745                 default:
 746                         goto invalid;
 747                 }
 748         }
 749         return 0;
 750 invalid:
 751         return EINVAL;
 752 }
 753
 754 /*
 755  * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
 756  *
 757  * This function takes an UTF-8 input string, instr, of inlen bytes
 758  * and produces normalized UTF-8 output into a buffer of buflen bytes
 759  * pointed to by outstr. The size of the output in bytes (not including
 760  * a NULL termination byte) is returned in outlen. In-place conversions
  761  * are not supported (i.e. instr != outstr).
 762  *
 763  * FLAGS
 764  *    UTF_DECOMPOSED:  output string will be fully decomposed (NFD)
 765  *
 766  *    UTF_PRECOMPOSED:  output string will be precomposed (NFC)
 767  *
 768  *    UTF_NO_NULL_TERM:  do not add null termination to output string
 769  *
 770  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 771  *
 772  * ERRORS
 773  *    ENAMETOOLONG:  output did not fit or input exceeded MAXPATHLEN bytes
 774  *
 775  *    EINVAL:  illegal UTF-8 sequence encountered or invalid flags
 776  */
 777 int
 778 utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
 779     size_t *outlen, size_t buflen, int flags)
 780 {
 781         u_int16_t unicodebuf[32];
 782         u_int16_t* unistr = NULL;
 783         size_t unicode_bytes;
 784         size_t uft8_bytes;
 785         size_t inbuflen;
 786         u_int8_t *outbufstart, *outbufend;
 787         const u_int8_t *inbufstart;
 788         unsigned int byte;
 789         int decompose, precompose;
 790         int result = 0;
 791
 792         if (flags & ~(UTF_DECOMPOSED | UTF_PRECOMPOSED | UTF_NO_NULL_TERM | UTF_ESCAPE_ILLEGAL)) {
 793                 return EINVAL;
 794         }
 795         decompose = (flags & UTF_DECOMPOSED);
 796         precompose = (flags & UTF_PRECOMPOSED);
 797         if ((decompose && precompose) || (!decompose && !precompose)) {
 798                 return EINVAL;
 799         }
 800         outbufstart = outstr;
 801         outbufend = outbufstart + buflen;
 802         inbufstart = instr;
 803         inbuflen = inlen;
 804
 805         while (inlen-- > 0 && (byte = *instr++) != '\0') {
 806                 if (outstr >= outbufend) {
 807                         result = ENAMETOOLONG;
 808                         goto exit;
 809                 }
 810                 if (byte >= 0x80) {
 811                         goto nonASCII;
 812                 }
 813                 /* ASCII is already normalized. */
 814                 *outstr++ = byte;
 815         }
 816 exit:
 817         *outlen = outstr - outbufstart;
 818         if (((flags & UTF_NO_NULL_TERM) == 0)) {
 819                 if (outstr < outbufend) {
 820                         *outstr++ = '\0';
 821                 } else {
 822                         result = ENAMETOOLONG;
 823                 }
 824         }
 825         return result;
 826
 827
 828         /*
 829          * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
 830          * functions to perform the normalization.  Since this will
 831          * presumably be used to normalize filenames in the back-end
 832          * (on disk or over-the-wire), it should be fast enough.
 833          */
 834 nonASCII:
 835
 836         /* Make sure the input size is reasonable. */
 837         if (inbuflen > MAXPATHLEN) {
 838                 result = ENAMETOOLONG;
 839                 goto exit;
 840         }
 841         /*
 842          * Compute worst case Unicode buffer size.
 843          *
 844          * For pre-composed output, every UTF-8 input byte will be at
 845          * most 2 Unicode bytes.  For decomposed output, 2 UTF-8 bytes
 846          * (smallest composite char sequence) may yield 6 Unicode bytes
 847          * (1 base char + 2 combining chars).
 848          */
 849         unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
 850
 851         if (unicode_bytes <= sizeof(unicodebuf)) {
 852                 unistr = &unicodebuf[0];
 853         } else {
 854                 MALLOC(unistr, uint16_t *, unicode_bytes, M_TEMP, M_WAITOK);
 855         }
 856
 857         /* Normalize the string. */
 858         result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
 859             unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
 860         if (result == 0) {
 861                 /* Put results back into UTF-8. */
 862                 result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
 863                     &uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
 864                 outstr = outbufstart + uft8_bytes;
 865         }
 866         if (unistr && unistr != &unicodebuf[0]) {
 867                 FREE(unistr, M_TEMP);
 868         }
 869         goto exit;
 870 }
 871
 872
 873 /*
 874  * Unicode 3.2 decomposition code (derived from Core Foundation)
 875  */
 876
 877 typedef struct {
 878         u_int32_t _key;
 879         u_int32_t _value;
 880 } unicode_mappings32;
 881
 882 static inline u_int32_t
 883 getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
 884     u_int16_t character)
 885 {
 886         const unicode_mappings32 *p, *q, *divider;
 887
 888         if ((character < theTable[0]._key) || (character > theTable[numElem - 1]._key)) {
 889                 return 0;
 890         }
 891
 892         p = theTable;
 893         q = p + (numElem - 1);
 894         while (p <= q) {
 895                 divider = p + ((q - p) >> 1);   /* divide by 2 */
 896                 if (character < divider->_key) {
 897                         q = divider - 1;
 898                 } else if (character > divider->_key) {
 899                         p = divider + 1;
 900                 } else {
 901                         return divider->_value;
 902                 }
 903         }
 904         return 0;
 905 }
 906
 907 #define RECURSIVE_DECOMPOSITION (1 << 15)
 908 #define EXTRACT_COUNT(value)    (((value) >> 12) & 0x0007)
 909
 910 typedef struct {
 911         u_int16_t _key;
 912         u_int16_t _value;
 913 } unicode_mappings16;
 914
 915 static inline u_int16_t
 916 getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
 917     u_int16_t character)
 918 {
 919         const unicode_mappings16 *p, *q, *divider;
 920
 921         if ((character < theTable[0]._key) || (character > theTable[numElem - 1]._key)) {
 922                 return 0;
 923         }
 924
 925         p = theTable;
 926         q = p + (numElem - 1);
 927         while (p <= q) {
 928                 divider = p + ((q - p) >> 1);   /* divide by 2 */
 929                 if (character < divider->_key) {
 930                         q = divider - 1;
 931                 } else if (character > divider->_key) {
 932                         p = divider + 1;
 933                 } else {
 934                         return divider->_value;
 935                 }
 936         }
 937         return 0;
 938 }
 939
 940
 941 static u_int32_t
 942 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
 943 {
 944         u_int16_t value;
 945         u_int32_t length;
 946         u_int16_t firstChar;
 947         u_int16_t theChar;
 948         const u_int16_t *bmpMappings;
 949         u_int32_t usedLength;
 950
 951         value = getmappedvalue16(
 952                 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
 953                 __UniCharDecompositionTableLength, character);
 954         length = EXTRACT_COUNT(value);
 955         firstChar = value & 0x0FFF;
 956         theChar = firstChar;
 957         bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
 958         usedLength = 0;
 959
 960         if (value & RECURSIVE_DECOMPOSITION) {
 961                 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
 962
 963                 --length; /* Decrement for the first char */
 964                 if (!usedLength) {
 965                         return 0;
 966                 }
 967                 ++bmpMappings;
 968                 convertedChars += usedLength;
 969         }
 970
 971         usedLength += length;
 972
 973         while (length--) {
 974                 *(convertedChars++) = *(bmpMappings++);
 975         }
 976
 977         return usedLength;
 978 }
 979
 980 #define HANGUL_SBASE 0xAC00
 981 #define HANGUL_LBASE 0x1100
 982 #define HANGUL_VBASE 0x1161
 983 #define HANGUL_TBASE 0x11A7
 984
 985 #define HANGUL_SCOUNT 11172
 986 #define HANGUL_LCOUNT 19
 987 #define HANGUL_VCOUNT 21
 988 #define HANGUL_TCOUNT 28
 989 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
 990
 991 /*
 992  * unicode_decompose - decompose a composed Unicode char
 993  *
 994  * Composed Unicode characters are forbidden on
 995  * HFS Plus volumes. ucs_decompose will convert a
 996  * composed character into its correct decomposed
 997  * sequence.
 998  *
 999  * Similar to CFUniCharDecomposeCharacter
1000  */
1001 static int
1002 unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
1003 {
1004         if ((character >= HANGUL_SBASE) &&
1005             (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
1006                 u_int32_t length;
1007
1008                 character -= HANGUL_SBASE;
1009                 length = (character % HANGUL_TCOUNT ? 3 : 2);
1010
1011                 *(convertedChars++) =
1012                     character / HANGUL_NCOUNT + HANGUL_LBASE;
1013                 *(convertedChars++) =
1014                     (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
1015                 if (length > 2) {
1016                         *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
1017                 }
1018                 return length;
1019         } else {
1020                 return unicode_recursive_decompose(character, convertedChars);
1021         }
1022 }
1023
1024 /*
1025  * unicode_combine - generate a precomposed Unicode char
1026  *
1027  * Precomposed Unicode characters are required for some volume
1028  * formats and network protocols.  unicode_combine will combine
1029  * a decomposed character sequence into a single precomposed
1030  * (composite) character.
1031  *
1032  * Similar toCFUniCharPrecomposeCharacter but unicode_combine
1033  * also handles Hangul Jamo characters.
1034  */
1035 static u_int16_t
1036 unicode_combine(u_int16_t base, u_int16_t combining)
1037 {
1038         u_int32_t value;
1039
1040         /* Check HANGUL */
1041         if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
1042                 /* 2 char Hangul sequences */
1043                 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
1044                     (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
1045                         return HANGUL_SBASE +
1046                                ((base - HANGUL_LBASE) * (HANGUL_VCOUNT * HANGUL_TCOUNT)) +
1047                                ((combining  - HANGUL_VBASE) * HANGUL_TCOUNT);
1048                 }
1049
1050                 /* 3 char Hangul sequences */
1051                 if ((combining > HANGUL_TBASE) &&
1052                     (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
1053                         if ((base - HANGUL_SBASE) % HANGUL_TCOUNT) {
1054                                 return 0;
1055                         } else {
1056                                 return base + (combining - HANGUL_TBASE);
1057                         }
1058                 }
1059         }
1060
1061         value = getmappedvalue32(
1062                 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
1063                 __CFUniCharPrecompositionTableLength, combining);
1064
1065         if (value) {
1066                 value = getmappedvalue16(
1067                         (const unicode_mappings16 *)
1068                         ((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
1069                         (value >> 16), base);
1070         }
1071         return value;
1072 }
1073
1074
/*
 * prioritysort - order combining chars into canonical order
 *
 * Bubble sorts the count characters at characters in place by combining
 * class (get_combining_class).  Two neighbours are exchanged only when the
 * left one has the larger class and the right one has a non-zero class, so
 * characters never move past a class 0 character.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
prioritysort(u_int16_t* characters, int count)
{
        u_int16_t *const limit = characters + count;
        int swapped;

        do {
                u_int16_t *left = characters;
                u_int16_t *right = characters + 1;
                u_int32_t right_class = get_combining_class(*left);

                swapped = 0;
                for (; right < limit; ++left, ++right) {
                        u_int32_t left_class = right_class;
                        u_int16_t held;

                        right_class = get_combining_class(*right);
                        if (right_class == 0 || left_class <= right_class) {
                                continue;       /* already in order */
                        }

                        held = *left;
                        *left = *right;
                        *right = held;
                        swapped = 1;

                        /*
                         * The character now at *right has class left_class.
                         * Carrying that forward isn't required for
                         * correctness, but it lets a character with a large
                         * combining class "bubble past" several characters
                         * with lower classes in a single pass.
                         */
                        right_class = left_class;
                }
        } while (swapped);
}
1119
1120
/*
 * Invalid NTFS filename characters are encoded using the
 * SFM (Services for Macintosh) private use Unicode characters.
 *
 * These should only be used for SMB, MSDOS or NTFS.
 *
 *    Illegal NTFS Char   SFM Unicode Char
 *  ----------------------------------------
 *    0x01-0x1f           0xf001-0xf01f
 *    '"'                 0xf020
 *    '*'                 0xf021
 *    '/'                 0xf022
 *    '<'                 0xf023
 *    '>'                 0xf024
 *    '?'                 0xf025
 *    '\'                 0xf026
 *    '|'                 0xf027
 *    ' '                 0xf028  (Only if last char of the name)
 *    '.'                 0xf029  (Only if last char of the name)
 *  ----------------------------------------
 *
 *  Reference: http://support.microsoft.com/kb/q117258/
 */

/* Largest low-order SFM code value that sfm_to_ucs() translates. */
#define MAX_SFM2MAC           0x29
/* Value of the high-order bits shared by all SFM private use characters. */
#define SFMCODE_PREFIX_MASK   0xf000
1147
/*
 * sfm2mac - maps the low-order bits of an SFM private use character
 * (0x00 - 0x29) back to the ASCII character it stands for.
 *
 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
 * SFM had no conversion for the colon. There is a conversion for the
 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
 * is a slash and a slash is a colon. So we can just replace the slash with the
 * colon in our tables and everything will just work.
 *
 * The table is only ever read, so it is declared const.
 */
static const u_int8_t
    sfm2mac[] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 00 - 07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 08 - 0F */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,   /* 10 - 17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,   /* 18 - 1F */
        0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,   /* 20 - 27 */
        0x20, 0x2e                                        /* 28 - 29 */
};
/* Number of entries in sfm2mac. */
#define SFM2MAC_LEN     ((sizeof(sfm2mac))/sizeof(sfm2mac[0]))
1165
/*
 * mac2sfm - maps an ASCII character in the range 0x20 - 0x7f (table index
 * is the character minus 0x20) to the low-order byte of its SFM private use
 * character.  An entry equal to the character itself means the character
 * needs no encoding (see ucs_to_sfm).
 *
 * The table is only ever read, so it is declared const.
 */
static const u_int8_t
    mac2sfm[] = {
        0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,   /* 20 - 27 */
        0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,   /* 28 - 2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,   /* 30 - 37 */
        0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,   /* 38 - 3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,   /* 40 - 47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,   /* 48 - 4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,   /* 50 - 57 */
        0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,   /* 58 - 5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,   /* 60 - 67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,   /* 68 - 6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,   /* 70 - 77 */
        0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f    /* 78 - 7f */
};
/* Number of entries in mac2sfm. */
#define MAC2SFM_LEN     ((sizeof(mac2sfm))/sizeof(mac2sfm[0]))
1182
1183
1184 /*
1185  * Encode illegal NTFS filename characters into SFM Private Unicode characters
1186  *
1187  * Assumes non-zero ASCII input.
1188  */
1189 static u_int16_t
1190 ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
1191 {
1192         /* The last character of filename cannot be a space or period. */
1193         if (lastchar) {
1194                 if (ucs_ch == 0x20) {
1195                         return 0xf028;
1196                 } else if (ucs_ch == 0x2e) {
1197                         return 0xf029;
1198                 }
1199         }
1200         /* 0x01 - 0x1f is simple transformation. */
1201         if (ucs_ch <= 0x1f) {
1202                 return ucs_ch | 0xf000;
1203         } else { /* 0x20 - 0x7f */
1204                 u_int16_t lsb;
1205
1206                 assert((ucs_ch - 0x0020) < MAC2SFM_LEN);
1207                 lsb = mac2sfm[ucs_ch - 0x0020];
1208                 if (lsb != ucs_ch) {
1209                         return 0xf000 | lsb;
1210                 }
1211         }
1212         return ucs_ch;
1213 }
1214
1215 /*
1216  * Decode any SFM Private Unicode characters
1217  */
1218 static u_int16_t
1219 sfm_to_ucs(u_int16_t ucs_ch)
1220 {
1221         if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
1222             ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
1223                 assert((ucs_ch & 0x003f) < SFM2MAC_LEN);
1224                 ucs_ch = sfm2mac[ucs_ch & 0x003f];
1225         }
1226         return ucs_ch;
1227 }