bsd/vfs/vfs_utfconv.c

   1 /*
   2  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29  /*
  30         Includes Unicode 3.2 decomposition code derived from Core Foundation
  31  */
  32
  33 #include <sys/param.h>
  34 #include <sys/utfconv.h>
  35 #include <sys/errno.h>
  36 #include <sys/malloc.h>
  37 #include <libkern/OSByteOrder.h>
  38
  39 #if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST)
  40 #include <kern/assert.h>
  41 #else
  42 #include <assert.h>
  43 #endif
  44
  45 /*
  46  * UTF-8 (Unicode Transformation Format)
  47  *
  48  * UTF-8 is the Unicode Transformation Format that serializes a Unicode
  49  * character as a sequence of one to four bytes. Only the shortest form
  50  * required to represent the significant Unicode bits is legal.
  51  *
  52  * UTF-8 Multibyte Codes
  53  *
  54  * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary)
  55  * -----------------------------------------------------------------------------
  56  *   1       7       0x0000        0x007F    0xxxxxxx
  57  *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
  58  *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
  59  *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  60  * -----------------------------------------------------------------------------
  61  */
  62
  63
  64 #define UNICODE_TO_UTF8_LEN(c)  \
  65         ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
  66
  67 #define UCS_ALT_NULL    0x2400
  68
  69 /* Surrogate Pair Constants */
  70 #define SP_HALF_SHIFT   10
  71 #define SP_HALF_BASE    0x0010000u
  72 #define SP_HALF_MASK    0x3FFu
  73
  74 #define SP_HIGH_FIRST   0xD800u
  75 #define SP_HIGH_LAST    0xDBFFu
  76 #define SP_LOW_FIRST    0xDC00u
  77 #define SP_LOW_LAST             0xDFFFu
  78
  79
  80 #include "vfs_utfconvdata.h"
  81
  82
  83 /*
  84  * Test for a combining character.
  85  *
  86  * Similar to __CFUniCharIsNonBaseCharacter except that
  87  * unicode_combinable also includes Hangul Jamo characters.
  88  */
  89 int
  90 unicode_combinable(u_int16_t character)
  91 {
  92         const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
  93         u_int8_t value;
  94
  95         if (character < 0x0300)
  96                 return (0);
  97
  98         value = bitmap[(character >> 8) & 0xFF];
  99
 100         if (value == 0xFF) {
 101                 return (1);
 102         } else if (value) {
 103                 bitmap = bitmap + ((value - 1) * 32) + 256;
 104                 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
 105         }
 106         return (0);
 107 }
 108
 109 /*
 110  * Test for a precomposed character.
 111  *
 112  * Similar to __CFUniCharIsDecomposableCharacter.
 113  */
 114 int
 115 unicode_decomposeable(u_int16_t character) {
 116         const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
 117         u_int8_t value;
 118
 119         if (character < 0x00C0)
 120                 return (0);
 121
 122         value = bitmap[(character >> 8) & 0xFF];
 123
 124         if (value == 0xFF) {
 125                 return (1);
 126         } else if (value) {
 127                 bitmap = bitmap + ((value - 1) * 32) + 256;
 128                 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
 129         }
 130         return (0);
 131 }
 132
 133
 134 /*
 135  * Get the combing class.
 136  *
 137  * Similar to CFUniCharGetCombiningPropertyForCharacter.
 138  */
 139 static inline u_int8_t
 140 get_combining_class(u_int16_t character) {
 141         const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
 142
 143         u_int8_t value = bitmap[(character >> 8)];
 144
 145         if (value) {
 146                 bitmap = bitmap + (value * 256);
 147                 return bitmap[character % 256];
 148         }
 149         return (0);
 150 }
 151
 152
 153 static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
 154
 155 static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
 156
 157 static void prioritysort(u_int16_t* characters, int count);
 158
 159 static u_int16_t  ucs_to_sfm(u_int16_t ucs_ch, int lastchar);
 160
 161 static u_int16_t  sfm_to_ucs(u_int16_t ucs_ch);
 162
 163
 164 char utf_extrabytes[32] = {
 165          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 166         -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
 167 };
 168
 169 const char hexdigits[16] = {
 170          '0',  '1',  '2',  '3',  '4',  '5',  '6', '7',
 171          '8',  '9',  'A',  'B',  'C',  'D',  'E', 'F'
 172 };
 173
 174 /*
 175  * utf8_encodelen - Calculate the UTF-8 encoding length
 176  *
 177  * This function takes a Unicode input string, ucsp, of ucslen bytes
 178  * and calculates the size of the UTF-8 output in bytes (not including
 179  * a NULL termination byte). The string must reside in kernel memory.
 180  *
 181  * If '/' chars are possible in the Unicode input then an alternate
 182  * (replacement) char should be provided in altslash.
 183  *
 184  * FLAGS
 185  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 186  *
 187  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 188  *
 189  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 190  *
 191  *    UTF_DECOMPOSED:  generate fully decomposed output
 192  *
 193  *    UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
 194  *
 195  * ERRORS
 196  *    None
 197  */
 198 size_t
 199 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
 200 {
 201         u_int16_t ucs_ch;
 202         u_int16_t * chp = NULL;
 203         u_int16_t sequence[8];
 204         int extra = 0;
 205         size_t charcnt;
 206         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 207         int decompose = (flags & UTF_DECOMPOSED);
 208         size_t len;
 209
 210         charcnt = ucslen / 2;
 211         len = 0;
 212
 213         while (charcnt-- > 0) {
 214                 if (extra > 0) {
 215                         --extra;
 216                         ucs_ch = *chp++;
 217                 } else {
 218                         ucs_ch = *ucsp++;
 219                         if (swapbytes) {
 220                                 ucs_ch = OSSwapInt16(ucs_ch);
 221                         }
 222                         if (ucs_ch == '/') {
 223                                 ucs_ch = altslash ? altslash : '_';
 224                         } else if (ucs_ch == '\0') {
 225                                 ucs_ch = UCS_ALT_NULL;
 226                         } else if (decompose && unicode_decomposeable(ucs_ch)) {
 227                                 extra = unicode_decompose(ucs_ch, sequence) - 1;
 228                                 charcnt += extra;
 229                                 ucs_ch = sequence[0];
 230                                 chp = &sequence[1];
 231                         }
 232                 }
 233                 len += UNICODE_TO_UTF8_LEN(ucs_ch);
 234         }
 235
 236         return (len);
 237 }
 238
 239
 240 /*
 241  * utf8_encodestr - Encodes a Unicode string to UTF-8
 242  *
 243  * NOTES:
 244  *    The resulting UTF-8 string is NULL terminated.
 245  *
 246  *    If '/' chars are allowed on disk then an alternate
 247  *    (replacement) char must be provided in altslash.
 248  *
 249  * input flags:
 250  *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
 251  *
 252  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 253  *
 254  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 255  *
 256  *    UTF_DECOMPOSED:  generate fully decomposed output
 257  *
 258  *    UTF_NO_NULL_TERM:  don't add NULL termination to UTF-8 output
 259  *
 260  * result:
 261  *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
 262  *
 263  *    EINVAL: Illegal char found; char was replaced by an '_'.
 264  */
 265 int
 266 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
 267                size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
 268 {
 269         u_int8_t * bufstart;
 270         u_int8_t * bufend;
 271         u_int16_t ucs_ch;
 272         u_int16_t * chp = NULL;
 273         u_int16_t sequence[8];
 274         int extra = 0;
 275         size_t charcnt;
 276         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 277         int nullterm  = ((flags & UTF_NO_NULL_TERM) == 0);
 278         int decompose = (flags & UTF_DECOMPOSED);
 279         int sfmconv = (flags & UTF_SFM_CONVERSIONS);
 280         int result = 0;
 281
 282         bufstart = utf8p;
 283         bufend = bufstart + buflen;
 284         if (nullterm)
 285                 --bufend;
 286         charcnt = ucslen / 2;
 287
 288         while (charcnt-- > 0) {
 289                 if (extra > 0) {
 290                         --extra;
 291                         ucs_ch = *chp++;
 292                 } else {
 293                         ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
 294
 295                         if (decompose && unicode_decomposeable(ucs_ch)) {
 296                                 extra = unicode_decompose(ucs_ch, sequence) - 1;
 297                                 charcnt += extra;
 298                                 ucs_ch = sequence[0];
 299                                 chp = &sequence[1];
 300                         }
 301                 }
 302
 303                 /* Slash and NULL are not permitted */
 304                 if (ucs_ch == '/') {
 305                         if (altslash)
 306                                 ucs_ch = altslash;
 307                         else {
 308                                 ucs_ch = '_';
 309                                 result = EINVAL;
 310                         }
 311                 } else if (ucs_ch == '\0') {
 312                         ucs_ch = UCS_ALT_NULL;
 313                 }
 314
 315                 if (ucs_ch < 0x0080) {
 316                         if (utf8p >= bufend) {
 317                                 result = ENAMETOOLONG;
 318                                 break;
 319                         }
 320                         *utf8p++ = ucs_ch;
 321
 322                 } else if (ucs_ch < 0x800) {
 323                         if ((utf8p + 1) >= bufend) {
 324                                 result = ENAMETOOLONG;
 325                                 break;
 326                         }
 327                         *utf8p++ = 0xc0 | (ucs_ch >> 6);
 328                         *utf8p++ = 0x80 | (0x3f & ucs_ch);
 329
 330                 } else {
 331                         /* These chars never valid Unicode. */
 332                         if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) {
 333                                 result = EINVAL;
 334                                 break;
 335                         }
 336
 337                         /* Combine valid surrogate pairs */
 338                         if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
 339                                 && charcnt > 0) {
 340                                 u_int16_t ch2;
 341                                 u_int32_t pair;
 342
 343                                 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
 344                                 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
 345                                         pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
 346                                                 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
 347                                         if ((utf8p + 3) >= bufend) {
 348                                                 result = ENAMETOOLONG;
 349                                                 break;
 350                                         }
 351                                         --charcnt;
 352                                         ++ucsp;
 353                                         *utf8p++ = 0xf0 | (pair >> 18);
 354                                         *utf8p++ = 0x80 | (0x3f & (pair >> 12));
 355                                         *utf8p++ = 0x80 | (0x3f & (pair >> 6));
 356                                         *utf8p++ = 0x80 | (0x3f & pair);
 357                                         continue;
 358                                 }
 359                         } else if (sfmconv) {
 360                                 ucs_ch = sfm_to_ucs(ucs_ch);
 361                                 if (ucs_ch < 0x0080) {
 362                                         if (utf8p >= bufend) {
 363                                                 result = ENAMETOOLONG;
 364                                                 break;
 365                                         }
 366                                         *utf8p++ = ucs_ch;
 367                                         continue;
 368                                 }
 369                         }
 370                         if ((utf8p + 2) >= bufend) {
 371                                 result = ENAMETOOLONG;
 372                                 break;
 373                         }
 374                         *utf8p++ = 0xe0 | (ucs_ch >> 12);
 375                         *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
 376                         *utf8p++ = 0x80 | (0x3f & ucs_ch);
 377                 }
 378         }
 379
 380         *utf8len = utf8p - bufstart;
 381         if (nullterm)
 382                 *utf8p++ = '\0';
 383
 384         return (result);
 385 }
 386
 387 // Pushes a character taking account of combining character sequences
 388 static void push(uint16_t ucs_ch, int *combcharcnt, uint16_t **ucsp)
 389 {
 390         /*
 391          * Make multiple combining character sequences canonical
 392          */
 393         if (unicode_combinable(ucs_ch)) {
 394                 ++*combcharcnt;         /* start tracking a run */
 395         } else if (*combcharcnt) {
 396                 if (*combcharcnt > 1) {
 397                         prioritysort(*ucsp - *combcharcnt, *combcharcnt);
 398                 }
 399                 *combcharcnt = 0;       /* start over */
 400         }
 401
 402         *(*ucsp)++ = ucs_ch;
 403 }
 404
 405 /*
 406  * utf8_decodestr - Decodes a UTF-8 string back to Unicode
 407  *
 408  * NOTES:
 409  *    The input UTF-8 string does not need to be null terminated
 410  *    if utf8len is set.
 411  *
 412  *    If '/' chars are allowed on disk then an alternate
 413  *    (replacement) char must be provided in altslash.
 414  *
 415  * input flags:
 416  *    UTF_REV_ENDIAN:  Unicode byte order is opposite current runtime
 417  *
 418  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 419  *
 420  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 421  *
 422  *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
 423  *
 424  *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
 425  *
 426  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 427  *
 428  * result:
 429  *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
 430  *
 431  *    EINVAL: Illegal UTF-8 sequence found.
 432  */
 433 int
 434 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
 435                size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
 436 {
 437         u_int16_t* bufstart;
 438         u_int16_t* bufend;
 439         unsigned int ucs_ch;
 440         unsigned int byte;
 441         int combcharcnt = 0;
 442         int result = 0;
 443         int decompose, precompose, escaping;
 444         int sfmconv;
 445         int extrabytes;
 446
 447         decompose  = (flags & UTF_DECOMPOSED);
 448         precompose = (flags & UTF_PRECOMPOSED);
 449         escaping   = (flags & UTF_ESCAPE_ILLEGAL);
 450         sfmconv    = (flags & UTF_SFM_CONVERSIONS);
 451
 452         bufstart = ucsp;
 453         bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
 454
 455         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 456                 if (ucsp >= bufend)
 457                         goto toolong;
 458
 459                 /* check for ascii */
 460                 if (byte < 0x80) {
 461                         ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
 462                 } else {
 463                         u_int32_t ch;
 464
 465                         extrabytes = utf_extrabytes[byte >> 3];
 466                         if ((extrabytes < 0) || ((int)utf8len < extrabytes)) {
 467                                 goto escape;
 468                         }
 469                         utf8len -= extrabytes;
 470
 471                         switch (extrabytes) {
 472                         case 1:
 473                                 ch = byte; ch <<= 6;   /* 1st byte */
 474                                 byte = *utf8p++;       /* 2nd byte */
 475                                 if ((byte >> 6) != 2)
 476                                         goto escape2;
 477                                 ch += byte;
 478                                 ch -= 0x00003080UL;
 479                                 if (ch < 0x0080)
 480                                         goto escape2;
 481                                 ucs_ch = ch;
 482                                 break;
 483                         case 2:
 484                                 ch = byte; ch <<= 6;   /* 1st byte */
 485                                 byte = *utf8p++;       /* 2nd byte */
 486                                 if ((byte >> 6) != 2)
 487                                         goto escape2;
 488                                 ch += byte; ch <<= 6;
 489                                 byte = *utf8p++;       /* 3rd byte */
 490                                 if ((byte >> 6) != 2)
 491                                         goto escape3;
 492                                 ch += byte;
 493                                 ch -= 0x000E2080UL;
 494                                 if (ch < 0x0800)
 495                                         goto escape3;
 496                                 if (ch >= 0xD800) {
 497                                         if (ch <= 0xDFFF)
 498                                                 goto escape3;
 499                                         if (ch == 0xFFFE || ch == 0xFFFF)
 500                                                 goto escape3;
 501                                 }
 502                                 ucs_ch = ch;
 503                                 break;
 504                         case 3:
 505                                 ch = byte; ch <<= 6;   /* 1st byte */
 506                                 byte = *utf8p++;       /* 2nd byte */
 507                                 if ((byte >> 6) != 2)
 508                                         goto escape2;
 509                                 ch += byte; ch <<= 6;
 510                                 byte = *utf8p++;       /* 3rd byte */
 511                                 if ((byte >> 6) != 2)
 512                                         goto escape3;
 513                                 ch += byte; ch <<= 6;
 514                                 byte = *utf8p++;       /* 4th byte */
 515                                 if ((byte >> 6) != 2)
 516                                         goto escape4;
 517                                 ch += byte;
 518                                 ch -= 0x03C82080UL + SP_HALF_BASE;
 519                                 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 520                                 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
 521                                         goto escape4;
 522                                 push(ucs_ch, &combcharcnt, &ucsp);
 523                                 if (ucsp >= bufend)
 524                                         goto toolong;
 525                                 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 526                                 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
 527                                         --ucsp;
 528                                         goto escape4;
 529                                 }
 530                                 *ucsp++ = ucs_ch;
 531                                 continue;
 532                         default:
 533                                 result = EINVAL;
 534                                 goto exit;
 535                         }
 536                         if (decompose) {
 537                                 if (unicode_decomposeable(ucs_ch)) {
 538                                         u_int16_t sequence[8];
 539                                         int count, i;
 540
 541                                         count = unicode_decompose(ucs_ch, sequence);
 542
 543                                         for (i = 0; i < count; ++i) {
 544                                                 if (ucsp >= bufend)
 545                                                         goto toolong;
 546
 547                                                 push(sequence[i], &combcharcnt, &ucsp);
 548                                         }
 549
 550                                         continue;
 551                                 }
 552                         } else if (precompose && (ucsp != bufstart)) {
 553                                 u_int16_t composite, base;
 554
 555                                 if (unicode_combinable(ucs_ch)) {
 556                                         base = ucsp[-1];
 557                                         composite = unicode_combine(base, ucs_ch);
 558                                         if (composite) {
 559                                                 --ucsp;
 560                                                 ucs_ch = composite;
 561                                         }
 562                                 }
 563                         }
 564                         if (ucs_ch == UCS_ALT_NULL)
 565                                 ucs_ch = '\0';
 566                 }
 567                 if (ucs_ch == altslash)
 568                         ucs_ch = '/';
 569
 570                 push(ucs_ch, &combcharcnt, &ucsp);
 571                 continue;
 572
 573                 /*
 574                  * Escape illegal UTF-8 into something legal.
 575                  */
 576 escape4:
 577                 utf8p -= 3;
 578                 goto escape;
 579 escape3:
 580                 utf8p -= 2;
 581                 goto escape;
 582 escape2:
 583                 utf8p -= 1;
 584 escape:
 585                 if (!escaping) {
 586                         result = EINVAL;
 587                         goto exit;
 588                 }
 589                 if (extrabytes > 0)
 590                         utf8len += extrabytes;
 591                 byte = *(utf8p - 1);
 592
 593                 if ((ucsp + 2) >= bufend)
 594                         goto toolong;
 595
 596                 /* Make a previous combining sequence canonical. */
 597                 if (combcharcnt > 1) {
 598                         prioritysort(ucsp - combcharcnt, combcharcnt);
 599                 }
 600                 combcharcnt = 0;
 601
 602                 ucs_ch = '%';
 603                 *ucsp++ = ucs_ch;
 604                 ucs_ch =  hexdigits[byte >> 4];
 605                 *ucsp++ = ucs_ch;
 606                 ucs_ch =  hexdigits[byte & 0x0F];
 607                 *ucsp++ = ucs_ch;
 608         }
 609         /*
 610          * Make a previous combining sequence canonical
 611          */
 612         if (combcharcnt > 1) {
 613                 prioritysort(ucsp - combcharcnt, combcharcnt);
 614         }
 615
 616         if (flags & UTF_REVERSE_ENDIAN) {
 617                 uint16_t *p = bufstart;
 618                 while (p < ucsp) {
 619                         *p = OSSwapInt16(*p);
 620                         ++p;
 621                 }
 622         }
 623
 624 exit:
 625         *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
 626
 627         return (result);
 628
 629 toolong:
 630         result = ENAMETOOLONG;
 631         goto exit;
 632 }
 633
 634
 635 /*
 636  * utf8_validatestr - Check for a valid UTF-8 string.
 637  */
 638 int
 639 utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
 640 {
 641         unsigned int byte;
 642         u_int32_t ch;
 643         unsigned int ucs_ch;
 644         size_t extrabytes;
 645
 646         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 647                 if (byte < 0x80)
 648                         continue;  /* plain ascii */
 649
 650                 extrabytes = utf_extrabytes[byte >> 3];
 651
 652                 if (utf8len < extrabytes)
 653                         goto invalid;
 654                 utf8len -= extrabytes;
 655
 656                 switch (extrabytes) {
 657                 case 1:
 658                         ch = byte; ch <<= 6;   /* 1st byte */
 659                         byte = *utf8p++;       /* 2nd byte */
 660                         if ((byte >> 6) != 2)
 661                                 goto invalid;
 662                         ch += byte;
 663                         ch -= 0x00003080UL;
 664                         if (ch < 0x0080)
 665                                 goto invalid;
 666                         break;
 667                 case 2:
 668                         ch = byte; ch <<= 6;   /* 1st byte */
 669                         byte = *utf8p++;       /* 2nd byte */
 670                         if ((byte >> 6) != 2)
 671                                 goto invalid;
 672                         ch += byte; ch <<= 6;
 673                         byte = *utf8p++;       /* 3rd byte */
 674                         if ((byte >> 6) != 2)
 675                                 goto invalid;
 676                         ch += byte;
 677                         ch -= 0x000E2080UL;
 678                         if (ch < 0x0800)
 679                                 goto invalid;
 680                         if (ch >= 0xD800) {
 681                                 if (ch <= 0xDFFF)
 682                                         goto invalid;
 683                                 if (ch == 0xFFFE || ch == 0xFFFF)
 684                                         goto invalid;
 685                         }
 686                         break;
 687                 case 3:
 688                         ch = byte; ch <<= 6;   /* 1st byte */
 689                         byte = *utf8p++;       /* 2nd byte */
 690                         if ((byte >> 6) != 2)
 691                                 goto invalid;
 692                         ch += byte; ch <<= 6;
 693                         byte = *utf8p++;       /* 3rd byte */
 694                         if ((byte >> 6) != 2)
 695                                 goto invalid;
 696                         ch += byte; ch <<= 6;
 697                         byte = *utf8p++;       /* 4th byte */
 698                         if ((byte >> 6) != 2)
 699                                 goto invalid;
 700                         ch += byte;
 701                         ch -= 0x03C82080UL + SP_HALF_BASE;
 702                         ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 703                         if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
 704                                 goto invalid;
 705                         ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 706                         if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
 707                                 goto invalid;
 708                         break;
 709                 default:
 710                         goto invalid;
 711                 }
 712
 713         }
 714         return (0);
 715 invalid:
 716         return (EINVAL);
 717 }
 718
 719 /*
 720  * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
 721  *
 722  * This function takes an UTF-8 input string, instr, of inlen bytes
 723  * and produces normalized UTF-8 output into a buffer of buflen bytes
 724  * pointed to by outstr. The size of the output in bytes (not including
 725  * a NULL termination byte) is returned in outlen. In-place conversions
  726  * are not supported (i.e. instr != outstr).
  727  *
 728  * FLAGS
 729  *    UTF_DECOMPOSED:  output string will be fully decomposed (NFD)
 730  *
 731  *    UTF_PRECOMPOSED:  output string will be precomposed (NFC)
 732  *
 733  *    UTF_NO_NULL_TERM:  do not add null termination to output string
 734  *
 735  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 736  *
 737  * ERRORS
 738  *    ENAMETOOLONG:  output did not fit or input exceeded MAXPATHLEN bytes
 739  *
 740  *    EINVAL:  illegal UTF-8 sequence encountered or invalid flags
 741  */
 742 int
 743 utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
 744                   size_t *outlen, size_t buflen, int flags)
 745 {
 746         u_int16_t unicodebuf[32];
 747         u_int16_t* unistr = NULL;
 748         size_t unicode_bytes;
 749         size_t uft8_bytes;
 750         size_t inbuflen;
 751         u_int8_t *outbufstart, *outbufend;
 752         const u_int8_t *inbufstart;
 753         unsigned int byte;
 754         int decompose, precompose;
 755         int result = 0;
 756
 757         if (flags & ~(UTF_DECOMPOSED | UTF_PRECOMPOSED | UTF_NO_NULL_TERM | UTF_ESCAPE_ILLEGAL)) {
 758                 return (EINVAL);
 759         }
 760         decompose = (flags & UTF_DECOMPOSED);
 761         precompose = (flags & UTF_PRECOMPOSED);
 762         if ((decompose && precompose) || (!decompose && !precompose)) {
 763                 return (EINVAL);
 764         }
 765         outbufstart = outstr;
 766         outbufend = outbufstart + buflen;
 767         inbufstart = instr;
 768         inbuflen = inlen;
 769
 770         while (inlen-- > 0 && (byte = *instr++) != '\0') {
 771                 if (outstr >= outbufend) {
 772                         result = ENAMETOOLONG;
 773                         goto exit;
 774                 }
 775                 if (byte >= 0x80) {
 776                         goto nonASCII;
 777                 }
 778                 /* ASCII is already normalized. */
 779                 *outstr++ = byte;
 780         }
 781 exit:
 782         *outlen = outstr - outbufstart;
 783         if (((flags & UTF_NO_NULL_TERM) == 0)) {
 784                 if (outstr < outbufend)
 785                         *outstr++ = '\0';
 786                 else
 787                         result = ENAMETOOLONG;
 788         }
 789         return (result);
 790
 791
 792         /*
 793          * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
 794          * functions to perform the normalization.  Since this will
 795          * presumably be used to normalize filenames in the back-end
 796          * (on disk or over-the-wire), it should be fast enough.
 797          */
 798 nonASCII:
 799
 800         /* Make sure the input size is reasonable. */
 801         if (inbuflen > MAXPATHLEN) {
 802                 result = ENAMETOOLONG;
 803                 goto exit;
 804         }
 805         /*
 806          * Compute worst case Unicode buffer size.
 807          *
 808          * For pre-composed output, every UTF-8 input byte will be at
 809          * most 2 Unicode bytes.  For decomposed output, 2 UTF-8 bytes
 810          * (smallest composite char sequence) may yield 6 Unicode bytes
 811          * (1 base char + 2 combining chars).
 812          */
 813         unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
 814
 815         if (unicode_bytes <= sizeof(unicodebuf))
 816                 unistr = &unicodebuf[0];
 817         else
 818                 MALLOC(unistr, uint16_t *, unicode_bytes, M_TEMP, M_WAITOK);
 819
 820         /* Normalize the string. */
 821         result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
 822                                 unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
 823         if (result == 0) {
 824                 /* Put results back into UTF-8. */
 825                 result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
 826                                         &uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
 827                 outstr = outbufstart + uft8_bytes;
 828         }
 829         if (unistr && unistr != &unicodebuf[0]) {
 830                 FREE(unistr, M_TEMP);
 831         }
 832         goto exit;
 833 }
 834
 835
 836  /*
 837   * Unicode 3.2 decomposition code (derived from Core Foundation)
 838   */
 839
 840 typedef struct {
 841         u_int32_t _key;
 842         u_int32_t _value;
 843 } unicode_mappings32;
 844
 845 static inline u_int32_t
 846 getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
 847                 u_int16_t character)
 848 {
 849         const unicode_mappings32 *p, *q, *divider;
 850
 851         if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
 852                 return (0);
 853
 854         p = theTable;
 855         q = p + (numElem-1);
 856         while (p <= q) {
 857                 divider = p + ((q - p) >> 1);   /* divide by 2 */
 858                 if (character < divider->_key) { q = divider - 1; }
 859                 else if (character > divider->_key) { p = divider + 1; }
 860                 else { return (divider->_value); }
 861         }
 862         return (0);
 863 }
 864
/*
 * Fields of a 16-bit decomposition table value as read by
 * unicode_recursive_decompose(): bit 15 is tested to decide whether
 * the first mapped character is decomposed again, and bits 12-14 are
 * extracted as the length of the mapping.
 */
#define RECURSIVE_DECOMPOSITION (1 << 15)
#define EXTRACT_COUNT(value)    (((value) >> 12) & 0x0007)
 867
 868 typedef struct {
 869         u_int16_t _key;
 870         u_int16_t _value;
 871 } unicode_mappings16;
 872
 873 static inline u_int16_t
 874 getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
 875                 u_int16_t character)
 876 {
 877         const unicode_mappings16 *p, *q, *divider;
 878
 879         if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
 880                 return (0);
 881
 882         p = theTable;
 883         q = p + (numElem-1);
 884         while (p <= q) {
 885                 divider = p + ((q - p) >> 1);   /* divide by 2 */
 886                 if (character < divider->_key)
 887                         q = divider - 1;
 888                 else if (character > divider->_key)
 889                         p = divider + 1;
 890                 else
 891                         return (divider->_value);
 892         }
 893         return (0);
 894 }
 895
 896
 897 static u_int32_t
 898 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
 899 {
 900         u_int16_t value;
 901         u_int32_t length;
 902         u_int16_t firstChar;
 903         u_int16_t theChar;
 904         const u_int16_t *bmpMappings;
 905         u_int32_t usedLength;
 906
 907         value = getmappedvalue16(
 908                 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
 909                 __UniCharDecompositionTableLength, character);
 910         length = EXTRACT_COUNT(value);
 911         firstChar = value & 0x0FFF;
 912         theChar = firstChar;
 913         bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
 914         usedLength = 0;
 915
 916         if (value & RECURSIVE_DECOMPOSITION) {
 917             usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
 918
 919             --length;   /* Decrement for the first char */
 920             if (!usedLength)
 921                 return 0;
 922             ++bmpMappings;
 923             convertedChars += usedLength;
 924         }
 925
 926         usedLength += length;
 927
 928         while (length--)
 929                 *(convertedChars++) = *(bmpMappings++);
 930
 931         return (usedLength);
 932 }
 933
/*
 * Base code points used by the arithmetic Hangul conversions in
 * unicode_decompose() and unicode_combine(): precomposed syllables
 * start at SBASE; LBASE, VBASE and TBASE are added to the leading,
 * vowel and trailing indices respectively.  A trailing index of 0
 * means "no trailing jamo", so TBASE itself is never emitted.
 */
#define HANGUL_SBASE 0xAC00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11A7

/* Sizes of the syllable, leading, vowel and trailing index ranges. */
#define HANGUL_SCOUNT 11172
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
/* Number of syllables sharing one leading jamo (vowels * trailing slots). */
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
 944
 945 /*
 946  * unicode_decompose - decompose a composed Unicode char
 947  *
 948  * Composed Unicode characters are forbidden on
 949  * HFS Plus volumes. ucs_decompose will convert a
 950  * composed character into its correct decomposed
 951  * sequence.
 952  *
 953  * Similar to CFUniCharDecomposeCharacter
 954  */
 955 static int
 956 unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
 957 {
 958         if ((character >= HANGUL_SBASE) &&
 959             (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
 960                 u_int32_t length;
 961
 962                 character -= HANGUL_SBASE;
 963                 length = (character % HANGUL_TCOUNT ? 3 : 2);
 964
 965                 *(convertedChars++) =
 966                         character / HANGUL_NCOUNT + HANGUL_LBASE;
 967                 *(convertedChars++) =
 968                         (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
 969                 if (length > 2)
 970                         *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
 971                 return (length);
 972         } else {
 973                 return (unicode_recursive_decompose(character, convertedChars));
 974         }
 975 }
 976
 977 /*
 978  * unicode_combine - generate a precomposed Unicode char
 979  *
 980  * Precomposed Unicode characters are required for some volume
 981  * formats and network protocols.  unicode_combine will combine
 982  * a decomposed character sequence into a single precomposed
 983  * (composite) character.
 984  *
 985  * Similar toCFUniCharPrecomposeCharacter but unicode_combine
 986  * also handles Hangul Jamo characters.
 987  */
 988 static u_int16_t
 989 unicode_combine(u_int16_t base, u_int16_t combining)
 990 {
 991         u_int32_t value;
 992
 993         /* Check HANGUL */
 994         if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
 995                 /* 2 char Hangul sequences */
 996                 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
 997                     (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
 998                     return (HANGUL_SBASE +
 999                             ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
1000                             ((combining  - HANGUL_VBASE)*HANGUL_TCOUNT));
1001                 }
1002
1003                 /* 3 char Hangul sequences */
1004                 if ((combining > HANGUL_TBASE) &&
1005                     (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
1006                         if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
1007                                 return (0);
1008                         else
1009                                 return (base + (combining - HANGUL_TBASE));
1010                 }
1011         }
1012
1013         value = getmappedvalue32(
1014                 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
1015                 __CFUniCharPrecompositionTableLength, combining);
1016
1017         if (value) {
1018                 value = getmappedvalue16(
1019                         (const unicode_mappings16 *)
1020                         ((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
1021                         (value >> 16), base);
1022         }
1023         return (value);
1024 }
1025
1026
/*
 * prioritysort - order combining chars into canonical order
 *
 * Repeatedly sweeps over the count characters, exchanging adjacent
 * pairs whose combining classes are out of order, until a sweep makes
 * no exchange.  A character whose combining class is 0 is never moved
 * ahead of its predecessor.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
prioritysort(u_int16_t* characters, int count)
{
        u_int32_t prev_class, cur_class;
        int swapped;
        int i;

        do {
                swapped = 0;
                cur_class = get_combining_class(characters[0]);
                for (i = 1; i < count; i++) {
                        prev_class = cur_class;
                        cur_class = get_combining_class(characters[i]);
                        if (prev_class > cur_class && cur_class != 0) {
                                u_int16_t moved = characters[i - 1];

                                characters[i - 1] = characters[i];
                                characters[i] = moved;
                                swapped = 1;

                                /*
                                 * characters[i] now holds the character whose
                                 * class was prev_class.  Carrying that class
                                 * forward is not needed for correctness, but
                                 * lets a high-class character keep bubbling
                                 * past lower-class ones within this sweep.
                                 */
                                cur_class = prev_class;
                        }
                }
        } while (swapped);
}
1071
1072
1073 /*
 * Invalid NTFS filename characters are encoded using the
1075  * SFM (Services for Macintosh) private use Unicode characters.
1076  *
1077  * These should only be used for SMB, MSDOS or NTFS.
1078  *
1079  *    Illegal NTFS Char   SFM Unicode Char
1080  *  ----------------------------------------
1081  *    0x01-0x1f           0xf001-0xf01f
1082  *    '"'                 0xf020
1083  *    '*'                 0xf021
1084  *    '/'                 0xf022
1085  *    '<'                 0xf023
1086  *    '>'                 0xf024
1087  *    '?'                 0xf025
1088  *    '\'                 0xf026
1089  *    '|'                 0xf027
1090  *    ' '                 0xf028  (Only if last char of the name)
1091  *    '.'                 0xf029  (Only if last char of the name)
1092  *  ----------------------------------------
1093  *
1094  *  Reference: http://support.microsoft.com/kb/q117258/
1095  */
1096
/*
 * MAX_SFM2MAC is the largest low-6-bit code that sfm_to_ucs() will
 * translate through the sfm2mac table.  SFMCODE_PREFIX_MASK is the
 * value sfm_to_ucs() expects in the upper bits (ucs_ch & 0xffC0) of
 * an SFM private use character.
 */
#define MAX_SFM2MAC           0x29
#define SFMCODE_PREFIX_MASK   0xf000
1099
/*
 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
 * SFM had no conversion for the colon. There is a conversion for the
 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
 * is a slash and a slash is a colon. So we can just replace the slash with the
 * colon in our tables and everything will just work.
 *
 * sfm2mac is indexed by the low 6 bits of an SFM private use character
 * and yields the character it stands for.  The table is read-only
 * lookup data, hence const.
 */
static const u_int8_t
sfm2mac[] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 00 - 07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 08 - 0F */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,   /* 10 - 17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,   /* 18 - 1F */
        0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,   /* 20 - 27 */
        0x20, 0x2e                                        /* 28 - 29 */
};
#define SFM2MAC_LEN     ((sizeof(sfm2mac))/sizeof(sfm2mac[0]))
1117
/*
 * mac2sfm is indexed by (character - 0x20) for characters 0x20 - 0x7f.
 * An entry equal to its own character means "no translation"; any
 * other entry is the low byte of the SFM private use character that
 * ucs_to_sfm() returns.  The table is read-only lookup data, hence const.
 */
static const u_int8_t
mac2sfm[] = {
        0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,   /* 20 - 27 */
        0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,   /* 28 - 2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,   /* 30 - 37 */
        0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,   /* 38 - 3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,   /* 40 - 47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,   /* 48 - 4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,   /* 50 - 57 */
        0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,   /* 58 - 5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,   /* 60 - 67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,   /* 68 - 6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,   /* 70 - 77 */
        0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f    /* 78 - 7f */
};
#define MAC2SFM_LEN     ((sizeof(mac2sfm))/sizeof(mac2sfm[0]))
1134
1135
1136 /*
1137  * Encode illegal NTFS filename characters into SFM Private Unicode characters
1138  *
1139  * Assumes non-zero ASCII input.
1140  */
1141 static u_int16_t
1142 ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
1143 {
1144         /* The last character of filename cannot be a space or period. */
1145         if (lastchar) {
1146                 if (ucs_ch == 0x20)
1147                         return (0xf028);
1148                 else if (ucs_ch == 0x2e)
1149                         return (0xf029);
1150         }
1151         /* 0x01 - 0x1f is simple transformation. */
1152         if (ucs_ch <= 0x1f) {
1153                 return (ucs_ch | 0xf000);
1154         } else /* 0x20 - 0x7f */ {
1155                 u_int16_t lsb;
1156
1157                 assert((ucs_ch - 0x0020) < MAC2SFM_LEN);
1158                 lsb = mac2sfm[ucs_ch - 0x0020];
1159                 if (lsb != ucs_ch)
1160                         return(0xf000 | lsb);
1161         }
1162         return (ucs_ch);
1163 }
1164
1165 /*
1166  * Decode any SFM Private Unicode characters
1167  */
1168 static u_int16_t
1169 sfm_to_ucs(u_int16_t ucs_ch)
1170 {
1171         if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
1172             ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
1173                 assert((ucs_ch & 0x003f) < SFM2MAC_LEN);
1174                 ucs_ch = sfm2mac[ucs_ch & 0x003f];
1175         }
1176         return (ucs_ch);
1177 }
1178
1179