livefiles_hfs_plugin/lf_hfs_sbunicode.c

   1 /* Copyright © 2017-2018 Apple Inc. All rights reserved.
   2  *
   3  *  lf_hfs_sbunicode.c
   4  *  livefiles_hfs
   5  *
   6  *  Created by Oded Shoshani on 31/1/18.
   7  */
   8
   9 /*
  10  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
  11  *
  12  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  13  *
  14  * This file contains Original Code and/or Modifications of Original Code
  15  * as defined in and that are subject to the Apple Public Source License
  16  * Version 2.0 (the 'License'). You may not use this file except in
  17  * compliance with the License. The rights granted to you under the License
  18  * may not be used to create, or enable the creation or redistribution of,
  19  * unlawful or unlicensed copies of an Apple operating system, or to
  20  * circumvent, violate, or enable the circumvention or violation of, any
  21  * terms of an Apple operating system software license agreement.
  22  *
  23  * Please obtain a copy of the License at
  24  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  25  *
  26  * The Original Code and all software distributed under the License are
  27  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  28  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  29  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  30  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  31  * Please see the License for the specific language governing rights and
  32  * limitations under the License.
  33  *
  34  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  35  */
  36
  37 /*
  38  Includes Unicode 3.2 decomposition code derived from Core Foundation
  39  */
  40
  41 #pragma clang diagnostic ignored "-Wsign-conversion"
  42 #pragma clang diagnostic ignored "-Wconversion"
  43
  44 #include <sys/param.h>
  45 #include <sys/errno.h>
  46 #include <libkern/OSByteOrder.h>
  47 #include <stdio.h>
  48 #include <stdlib.h>
  49 #include "lf_hfs_sbunicode.h"
  50
  51
  52 /*
  53  * UTF-8 (Unicode Transformation Format)
  54  *
  55  * UTF-8 is the Unicode Transformation Format that serializes a Unicode
  56  * character as a sequence of one to four bytes. Only the shortest form
  57  * required to represent the significant Unicode bits is legal.
  58  *
  59  * UTF-8 Multibyte Codes
  60  *
  61  * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary)
  62  * -----------------------------------------------------------------------------
  63  *   1       7       0x0000        0x007F    0xxxxxxx
  64  *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
  65  *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
  66  *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  67  * -----------------------------------------------------------------------------
  68  */
  69
  70
  71 #define UNICODE_TO_UTF8_LEN(c)  \
  72 ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
  73
  74 #define UCS_ALT_NULL    0x2400
  75
  76
  77 /* Surrogate Pair Constants */
  78
  79 #define SP_HALF_SHIFT   10
  80 #define SP_HALF_BASE    0x0010000u
  81 #define SP_HALF_MASK    0x3FFu
  82 #define SP_HIGH_FIRST   0xD800u
  83 #define SP_HIGH_LAST    0xDBFFu
  84 #define SP_LOW_FIRST    0xDC00u
  85 #define SP_LOW_LAST     0xDFFFu
  86
  87
  88 #include "lf_hfs_utfconvdata.h"
  89
  90
  91 /*
  92  * Test for a combining character.
  93  *
  94  * Similar to __CFUniCharIsNonBaseCharacter except that
  95  * unicode_combinable also includes Hangul Jamo characters.
  96  */
  97 static int
  98 unicode_combinable(u_int16_t character)
  99 {
 100     const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
 101     u_int8_t value;
 102
 103     if (character < 0x0300)
 104         return (0);
 105
 106     value = bitmap[(character >> 8) & 0xFF];
 107
 108     if (value == 0xFF) {
 109         return (1);
 110     } else if (value) {
 111         bitmap = bitmap + ((value - 1) * 32) + 256;
 112         return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
 113     }
 114     return (0);
 115 }
 116
 117 /*
 118  * Test for a precomposed character.
 119  *
 120  * Similar to __CFUniCharIsDecomposableCharacter.
 121  */
 122 static int
 123 unicode_decomposeable(u_int16_t character) {
 124     const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
 125     u_int8_t value;
 126
 127     if (character < 0x00C0)
 128         return (0);
 129
 130     value = bitmap[(character >> 8) & 0xFF];
 131
 132     if (value == 0xFF) {
 133         return (1);
 134     } else if (value) {
 135         bitmap = bitmap + ((value - 1) * 32) + 256;
 136         return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
 137     }
 138     return (0);
 139 }
 140
 141
 142 /*
 143  * Get the combing class.
 144  *
 145  * Similar to CFUniCharGetCombiningPropertyForCharacter.
 146  */
 147 static inline u_int8_t
 148 get_combining_class(u_int16_t character) {
 149     const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
 150
 151     u_int8_t value = bitmap[(character >> 8)];
 152
 153     if (value) {
 154         bitmap = bitmap + (value * 256);
 155         return bitmap[character % 256];
 156     }
 157     return (0);
 158 }
 159
 160 static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
 161
 162 static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
 163
 164 static void priortysort(u_int16_t* characters, int count);
 165
 166 static u_int16_t  ucs_to_sfm(u_int16_t ucs_ch, int lastchar);
 167
 168 static u_int16_t sfm_to_ucs(u_int16_t ucs_ch);
 169
 170 char utf_extrabytes[32] = {
 171     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 172     -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
 173 };
 174
 175 const char hexdigits[16] = {
 176     '0',  '1',  '2',  '3',  '4',  '5',  '6', '7',
 177     '8',  '9',  'A',  'B',  'C',  'D',  'E', 'F'
 178 };
 179
 180 /*
 181  * utf8_encodelen - Calculate the UTF-8 encoding length
 182  *
 183  * This function takes a Unicode input string, ucsp, of ucslen bytes
 184  * and calculates the size of the UTF-8 output in bytes (not including
 185  * a NULL termination byte). The string must reside in kernel memory.
 186  *
 187  * If '/' chars are possible in the Unicode input then an alternate
 188  * (replacement) char should be provided in altslash.
 189  *
 190  * FLAGS
 191  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 192  *
 193  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 194  *
 195  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 196  *
 197  *    UTF_DECOMPOSED:  generate fully decomposed output
 198  *
 199  *    UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
 200  *
 201  * ERRORS
 202  *    None
 203  */
 204 size_t
 205 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
 206 {
 207     u_int16_t ucs_ch;
 208     u_int16_t * chp = NULL;
 209     u_int16_t sequence[8];
 210     int extra = 0;
 211     size_t charcnt;
 212     int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 213     int decompose = (flags & UTF_DECOMPOSED);
 214     size_t len;
 215
 216     charcnt = ucslen / 2;
 217     len = 0;
 218
 219     while (charcnt-- > 0) {
 220         if (extra > 0) {
 221             --extra;
 222             ucs_ch = *chp++;
 223         } else {
 224             ucs_ch = *ucsp++;
 225             if (swapbytes) {
 226                 ucs_ch = OSSwapInt16(ucs_ch);
 227             }
 228             if (ucs_ch == '/') {
 229                 ucs_ch = altslash ? altslash : '_';
 230             } else if (ucs_ch == '\0') {
 231                 ucs_ch = UCS_ALT_NULL;
 232             } else if (decompose && unicode_decomposeable(ucs_ch)) {
 233                 extra = unicode_decompose(ucs_ch, sequence) - 1;
 234                 charcnt += extra;
 235                 ucs_ch = sequence[0];
 236                 chp = &sequence[1];
 237             }
 238         }
 239         len += UNICODE_TO_UTF8_LEN(ucs_ch);
 240     }
 241
 242     return (len);
 243 }
 244
 245
 246 /*
 247  * utf8_encodestr - Encodes a Unicode string to UTF-8
 248  *
 249  * NOTES:
 250  *    The resulting UTF-8 string is NULL terminated.
 251  *
 252  *    If '/' chars are allowed on disk then an alternate
 253  *    (replacement) char must be provided in altslash.
 254  *
 255  * input flags:
 256  *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
 257  *
 258  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 259  *
 260  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 261  *
 262  *    UTF_DECOMPOSED:  generate fully decomposed output
 263  *
 264  *    UTF_ADD_NULL_TERM:  add NULL termination to UTF-8 output
 265  *
 266  * result:
 267  *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
 268  *
 269  *    EINVAL: Illegal char found; char was replaced by an '_'.
 270  */
 271 extern int
 272 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
 273                size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
 274 {
 275     u_int8_t * bufstart;
 276     u_int8_t * bufend;
 277     u_int16_t ucs_ch;
 278     u_int16_t * chp = NULL;
 279     u_int16_t sequence[8];
 280     int extra = 0;
 281     size_t charcnt;
 282     int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 283     int nullterm  = (flags & UTF_ADD_NULL_TERM);
 284     int decompose = (flags & UTF_DECOMPOSED);
 285     int sfmconv = (flags & UTF_SFM_CONVERSIONS);
 286     int result = 0;
 287
 288     bufstart = utf8p;
 289     bufend = bufstart + buflen;
 290     if (nullterm)
 291         --bufend;
 292     charcnt = ucslen / 2;
 293
 294     while (charcnt-- > 0) {
 295         if (extra > 0) {
 296             --extra;
 297             ucs_ch = *chp++;
 298         } else {
 299             ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
 300
 301             if (decompose && unicode_decomposeable(ucs_ch)) {
 302                 extra = unicode_decompose(ucs_ch, sequence) - 1;
 303                 charcnt += extra;
 304                 ucs_ch = sequence[0];
 305                 chp = &sequence[1];
 306             }
 307         }
 308
 309         /* Slash and NULL are not permitted */
 310         if (ucs_ch == '/') {
 311             if (altslash)
 312                 ucs_ch = altslash;
 313             else {
 314                 ucs_ch = '_';
 315                 result = EINVAL;
 316             }
 317         } else if (ucs_ch == '\0') {
 318             ucs_ch = UCS_ALT_NULL;
 319         }
 320
 321         if (ucs_ch < 0x0080) {
 322             if (utf8p >= bufend) {
 323                 result = ENAMETOOLONG;
 324                 break;
 325             }
 326             *utf8p++ = ucs_ch;
 327
 328         } else if (ucs_ch < 0x800) {
 329             if ((utf8p + 1) >= bufend) {
 330                 result = ENAMETOOLONG;
 331                 break;
 332             }
 333             *utf8p++ = 0xc0 | (ucs_ch >> 6);
 334             *utf8p++ = 0x80 | (0x3f & ucs_ch);
 335
 336         } else {
 337             /* These chars never valid Unicode. */
 338             if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) {
 339                 result = EINVAL;
 340                 break;
 341             }
 342
 343             /* Combine valid surrogate pairs */
 344             if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
 345                 && charcnt > 0) {
 346                 u_int16_t ch2;
 347                 u_int32_t pair;
 348
 349                 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
 350                 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
 351                     pair = (u_int32_t)((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
 352                     + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
 353                     if ((utf8p + 3) >= bufend) {
 354                         result = ENAMETOOLONG;
 355                         break;
 356                     }
 357                     --charcnt;
 358                     ++ucsp;
 359                     *utf8p++ = 0xf0 | (pair >> 18);
 360                     *utf8p++ = 0x80 | (0x3f & (pair >> 12));
 361                     *utf8p++ = 0x80 | (0x3f & (pair >> 6));
 362                     *utf8p++ = 0x80 | (0x3f & pair);
 363                     continue;
 364                 }
 365             } else if (sfmconv) {
 366                 ucs_ch = sfm_to_ucs(ucs_ch);
 367                 if (ucs_ch < 0x0080) {
 368                     if (utf8p >= bufend) {
 369                         result = ENAMETOOLONG;
 370                         break;
 371                     }
 372                     *utf8p++ = ucs_ch;
 373                     continue;
 374                 }
 375             }
 376             if ((utf8p + 2) >= bufend) {
 377                 result = ENAMETOOLONG;
 378                 break;
 379             }
 380             *utf8p++ = 0xe0 | (ucs_ch >> 12);
 381             *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
 382             *utf8p++ = 0x80 | (0x3f & ucs_ch);
 383         }
 384     }
 385
 386     *utf8len = utf8p - bufstart;
 387     if (nullterm)
 388         *utf8p++ = '\0';
 389
 390     return (result);
 391 }
 392
 393
 394 /*
 395  * utf8_decodestr - Decodes a UTF-8 string back to Unicode
 396  *
 397  * NOTES:
 398  *    The input UTF-8 string does not need to be null terminated
 399  *    if utf8len is set.
 400  *
 401  *    If '/' chars are allowed on disk then an alternate
 402  *    (replacement) char must be provided in altslash.
 403  *
 404  * input flags:
 405  *    UTF_REV_ENDIAN:  Unicode byte order is opposite current runtime
 406  *
 407  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 408  *
 409  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 410  *
 411  *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
 412  *
 413  *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
 414  *
 415  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 416  *
 417  * result:
 418  *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
 419  *
 420  *    EINVAL: Illegal UTF-8 sequence found.
 421  */
 422 int
 423 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
 424                size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
 425 {
 426     u_int16_t* bufstart;
 427     u_int16_t* bufend;
 428     unsigned int ucs_ch;
 429     unsigned int byte;
 430     int combcharcnt = 0;
 431     int result = 0;
 432     int decompose, precompose, swapbytes, escaping;
 433     int sfmconv;
 434     int extrabytes;
 435
 436     decompose  = (flags & UTF_DECOMPOSED);
 437     precompose = (flags & UTF_PRECOMPOSED);
 438     swapbytes  = (flags & UTF_REVERSE_ENDIAN);
 439     escaping   = (flags & UTF_ESCAPE_ILLEGAL);
 440     sfmconv    = (flags & UTF_SFM_CONVERSIONS);
 441
 442     bufstart = ucsp;
 443     bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
 444
 445     while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 446         if (ucsp >= bufend)
 447             goto toolong;
 448
 449         /* check for ascii */
 450         if (byte < 0x80) {
 451             ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
 452         } else {
 453             u_int32_t ch;
 454
 455             extrabytes = utf_extrabytes[byte >> 3];
 456             if ((extrabytes < 0) || ((int)utf8len < extrabytes)) {
 457                 goto escape;
 458             }
 459             utf8len -= extrabytes;
 460
 461             switch (extrabytes) {
 462                 case 1:
 463                     ch = byte; ch <<= 6;   /* 1st byte */
 464                     byte = *utf8p++;       /* 2nd byte */
 465                     if ((byte >> 6) != 2)
 466                         goto escape2;
 467                     ch += byte;
 468                     ch -= 0x00003080UL;
 469                     if (ch < 0x0080)
 470                         goto escape2;
 471                     ucs_ch = ch;
 472                     break;
 473                 case 2:
 474                     ch = byte; ch <<= 6;   /* 1st byte */
 475                     byte = *utf8p++;       /* 2nd byte */
 476                     if ((byte >> 6) != 2)
 477                         goto escape2;
 478                     ch += byte; ch <<= 6;
 479                     byte = *utf8p++;       /* 3rd byte */
 480                     if ((byte >> 6) != 2)
 481                         goto escape3;
 482                     ch += byte;
 483                     ch -= 0x000E2080UL;
 484                     if (ch < 0x0800)
 485                         goto escape3;
 486                     if (ch >= 0xD800) {
 487                         if (ch <= 0xDFFF)
 488                             goto escape3;
 489                         if (ch == 0xFFFE || ch == 0xFFFF)
 490                             goto escape3;
 491                     }
 492                     ucs_ch = ch;
 493                     break;
 494                 case 3:
 495                     ch = byte; ch <<= 6;   /* 1st byte */
 496                     byte = *utf8p++;       /* 2nd byte */
 497                     if ((byte >> 6) != 2)
 498                         goto escape2;
 499                     ch += byte; ch <<= 6;
 500                     byte = *utf8p++;       /* 3rd byte */
 501                     if ((byte >> 6) != 2)
 502                         goto escape3;
 503                     ch += byte; ch <<= 6;
 504                     byte = *utf8p++;       /* 4th byte */
 505                     if ((byte >> 6) != 2)
 506                         goto escape4;
 507                     ch += byte;
 508                     ch -= 0x03C82080UL + SP_HALF_BASE;
 509                     ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 510                     if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
 511                         goto escape4;
 512                     *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 513                     if (ucsp >= bufend)
 514                         goto toolong;
 515                     ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 516                     if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
 517                         --ucsp;
 518                         goto escape4;
 519                     }
 520                     *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 521                     continue;
 522                 default:
 523                     result = EINVAL;
 524                     goto exit;
 525             }
 526             if (decompose) {
 527                 if (unicode_decomposeable(ucs_ch)) {
 528                     u_int16_t sequence[8] = {0};
 529                     int count, i;
 530
 531                     /* Before decomposing a new unicode character, sort
 532                      * previous combining characters, if any, and reset
 533                      * the counter.
 534                      */
 535                     if (combcharcnt > 1) {
 536                         priortysort(ucsp - combcharcnt, combcharcnt);
 537                     }
 538                     combcharcnt = 0;
 539
 540                     count = unicode_decompose(ucs_ch, sequence);
 541                     for (i = 0; i < count; ++i) {
 542                         ucs_ch = sequence[i];
 543                         *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 544                         if (ucsp >= bufend)
 545                             goto toolong;
 546                     }
 547                     combcharcnt += count - 1;
 548                     continue;
 549                 }
 550             } else if (precompose && (ucsp != bufstart)) {
 551                 u_int16_t composite, base;
 552
 553                 if (unicode_combinable(ucs_ch)) {
 554                     base = swapbytes ? OSSwapInt16(*(ucsp - 1)) : *(ucsp - 1);
 555                     composite = unicode_combine(base, ucs_ch);
 556                     if (composite) {
 557                         --ucsp;
 558                         ucs_ch = composite;
 559                     }
 560                 }
 561             }
 562             if (ucs_ch == UCS_ALT_NULL)
 563                 ucs_ch = '\0';
 564         }
 565         if (ucs_ch == altslash)
 566             ucs_ch = '/';
 567
 568         /*
 569          * Make multiple combining character sequences canonical
 570          */
 571         if (unicode_combinable(ucs_ch)) {
 572             ++combcharcnt;   /* start tracking a run */
 573         } else if (combcharcnt) {
 574             if (combcharcnt > 1) {
 575                 priortysort(ucsp - combcharcnt, combcharcnt);
 576             }
 577             combcharcnt = 0;  /* start over */
 578         }
 579
 580         *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 581         continue;
 582
 583         /*
 584          * Escape illegal UTF-8 into something legal.
 585          */
 586     escape4:
 587         utf8p -= 3;
 588         goto escape;
 589     escape3:
 590         utf8p -= 2;
 591         goto escape;
 592     escape2:
 593         utf8p -= 1;
 594     escape:
 595         if (!escaping) {
 596             result = EINVAL;
 597             goto exit;
 598         }
 599         if (extrabytes > 0)
 600             utf8len += extrabytes;
 601         byte = *(utf8p - 1);
 602
 603         if ((ucsp + 2) >= bufend)
 604             goto toolong;
 605
 606         /* Make a previous combining sequence canonical. */
 607         if (combcharcnt > 1) {
 608             priortysort(ucsp - combcharcnt, combcharcnt);
 609         }
 610         combcharcnt = 0;
 611
 612         ucs_ch = '%';
 613         *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 614         ucs_ch =  hexdigits[byte >> 4];
 615         *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 616         ucs_ch =  hexdigits[byte & 0x0F];
 617         *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
 618     }
 619     /*
 620      * Make a previous combining sequence canonical
 621      */
 622     if (combcharcnt > 1) {
 623         priortysort(ucsp - combcharcnt, combcharcnt);
 624     }
 625 exit:
 626     *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
 627
 628     return (result);
 629
 630 toolong:
 631     result = ENAMETOOLONG;
 632     goto exit;
 633 }
 634
 635 /*
 636  * Unicode 3.2 decomposition code (derived from Core Foundation)
 637  */
 638
 639 #define HANGUL_SBASE 0xAC00
 640 #define HANGUL_LBASE 0x1100
 641 #define HANGUL_VBASE 0x1161
 642 #define HANGUL_TBASE 0x11A7
 643
 644 #define HANGUL_SCOUNT 11172
 645 #define HANGUL_LCOUNT 19
 646 #define HANGUL_VCOUNT 21
 647 #define HANGUL_TCOUNT 28
 648 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
 649
 650
 651 typedef struct {
 652     u_int32_t _key;
 653     u_int32_t _value;
 654 } unicode_mappings32;
 655
 656 #define RECURSIVE_DECOMPOSITION    (1 << 15)
 657 #define EXTRACT_COUNT(value)    (((value) >> 12) & 0x0007)
 658
 659 typedef struct {
 660     u_int16_t _key;
 661     u_int16_t _value;
 662 } unicode_mappings16;
 663
 664 static inline u_int32_t
 665 getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
 666                  u_int16_t character)
 667 {
 668     const unicode_mappings32 *p, *q, *divider;
 669
 670     if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
 671         return (0);
 672
 673     p = theTable;
 674     q = p + (numElem-1);
 675     while (p <= q) {
 676         divider = p + ((q - p) >> 1);    /* divide by 2 */
 677         if (character < divider->_key) { q = divider - 1; }
 678         else if (character > divider->_key) { p = divider + 1; }
 679         else { return (divider->_value); }
 680     }
 681     return (0);
 682 }
 683
 684 static inline u_int16_t
 685 getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
 686                  u_int16_t character)
 687 {
 688     const unicode_mappings16 *p, *q, *divider;
 689
 690     if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
 691         return (0);
 692
 693     p = theTable;
 694     q = p + (numElem-1);
 695     while (p <= q) {
 696         divider = p + ((q - p) >> 1);    /* divide by 2 */
 697         if (character < divider->_key)
 698             q = divider - 1;
 699         else if (character > divider->_key)
 700             p = divider + 1;
 701         else
 702             return (divider->_value);
 703     }
 704     return (0);
 705 }
 706
 707 static u_int32_t
 708 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
 709 {
 710     u_int16_t value;
 711     u_int32_t length;
 712     u_int16_t firstChar;
 713     u_int16_t theChar;
 714     const u_int16_t *bmpMappings;
 715     u_int32_t usedLength;
 716
 717     value = getmappedvalue16(
 718                              (const unicode_mappings16 *)__CFUniCharDecompositionTable,
 719                              __UniCharDecompositionTableLength, character);
 720     length = EXTRACT_COUNT(value);
 721     firstChar = value & 0x0FFF;
 722     theChar = firstChar;
 723     bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
 724     usedLength = 0;
 725
 726     if (value & RECURSIVE_DECOMPOSITION) {
 727         usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
 728
 729         --length;    /* Decrement for the first char */
 730         if (!usedLength)
 731             return 0;
 732         ++bmpMappings;
 733         convertedChars += usedLength;
 734     }
 735
 736     usedLength += length;
 737
 738     while (length--)
 739         *(convertedChars++) = *(bmpMappings++);
 740
 741     return (usedLength);
 742 }
 743
 744 /*
 745  * unicode_decompose - decompose a composed Unicode char
 746  *
 747  * Composed Unicode characters are forbidden on
 748  * HFS Plus volumes. ucs_decompose will convert a
 749  * composed character into its correct decomposed
 750  * sequence.
 751  *
 752  * Similar to CFUniCharDecomposeCharacter
 753  */
 754 static int
 755 unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
 756 {
 757     if ((character >= HANGUL_SBASE) &&
 758         (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
 759         u_int32_t length;
 760
 761         character -= HANGUL_SBASE;
 762         length = (character % HANGUL_TCOUNT ? 3 : 2);
 763
 764         *(convertedChars++) =
 765         character / HANGUL_NCOUNT + HANGUL_LBASE;
 766         *(convertedChars++) =
 767         (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
 768         if (length > 2)
 769             *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
 770         return (length);
 771     } else {
 772         return (unicode_recursive_decompose(character, convertedChars));
 773     }
 774 }
 775
 776 /*
 777  * unicode_combine - generate a precomposed Unicode char
 778  *
 779  * Precomposed Unicode characters are required for some volume
 780  * formats and network protocols.  unicode_combine will combine
 781  * a decomposed character sequence into a single precomposed
 782  * (composite) character.
 783  *
 784  * Similar toCFUniCharPrecomposeCharacter but unicode_combine
 785  * also handles Hangul Jamo characters.
 786  */
 787 static u_int16_t
 788 unicode_combine(u_int16_t base, u_int16_t combining)
 789 {
 790     u_int32_t value;
 791
 792     /* Check HANGUL */
 793     if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
 794         /* 2 char Hangul sequences */
 795         if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
 796             (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
 797             return (HANGUL_SBASE +
 798                     ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
 799                     ((combining  - HANGUL_VBASE)*HANGUL_TCOUNT));
 800         }
 801
 802         /* 3 char Hangul sequences */
 803         if ((combining > HANGUL_TBASE) &&
 804             (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
 805             if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
 806                 return (0);
 807             else
 808                 return (base + (combining - HANGUL_TBASE));
 809         }
 810     }
 811
 812     value = getmappedvalue32(
 813                              (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
 814                              __CFUniCharPrecompositionTableLength, combining);
 815
 816     if (value) {
 817         value = getmappedvalue16(
 818                                  (const unicode_mappings16 *)
 819                                  ((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
 820                                  (value >> 16), base);
 821     }
 822     return (value);
 823 }
 824
 825
 826 /*
 827  * priortysort - order combining chars into canonical order
 828  *
 829  * Similar to CFUniCharPrioritySort
 830  */
 831 static void
 832 priortysort(u_int16_t* characters, int count)
 833 {
 834     u_int32_t p1, p2;
 835     u_int16_t *ch1, *ch2;
 836     u_int16_t *end;
 837     int changes = 0;
 838
 839     end = characters + count;
 840     do {
 841         changes = 0;
 842         ch1 = characters;
 843         ch2 = characters + 1;
 844         p2 = get_combining_class(*ch1);
 845         while (ch2 < end) {
 846             p1 = p2;
 847             p2 = get_combining_class(*ch2);
 848             if (p1 > p2 && p2 != 0) {
 849                 u_int32_t tmp;
 850
 851                 tmp = *ch1;
 852                 *ch1 = *ch2;
 853                 *ch2 = tmp;
 854                 changes = 1;
 855
 856                 /*
 857                  * Make sure that p2 contains the combining class for the
 858                  * character now stored at *ch2.  This isn't required for
 859                  * correctness, but it will be more efficient if a character
 860                  * with a large combining class has to "bubble past" several
 861                  * characters with lower combining classes.
 862                  */
 863                 p2 = p1;
 864             }
 865             ++ch1;
 866             ++ch2;
 867         }
 868     } while (changes);
 869 }
 870
 871
 872 /*
 873  * Invalid NTFS filename characters are encodeded using the
 874  * SFM (Services for Macintosh) private use Unicode characters.
 875  *
 876  * These should only be used for SMB, MSDOS or NTFS.
 877  *
 878  *    Illegal NTFS Char   SFM Unicode Char
 879  *  ----------------------------------------
 880  *    0x01-0x1f           0xf001-0xf01f
 881  *    '"'                 0xf020
 882  *    '*'                 0xf021
 883  *    '/'                 0xf022
 884  *    '<'                 0xf023
 885  *    '>'                 0xf024
 886  *    '?'                 0xf025
 887  *    '\'                 0xf026
 888  *    '|'                 0xf027
 889  *    ' '                 0xf028  (Only if last char of the name)
 890  *    '.'                 0xf029  (Only if last char of the name)
 891  *  ----------------------------------------
 892  *
 893  *  Reference: http://support.microsoft.com/kb/q117258/
 894  */
 895
 896 #define MAX_SFM2MAC           0x29
 897 #define SFMCODE_PREFIX_MASK   0xf000
 898
 899 /*
 900  * In the Mac OS 9 days the colon was illegal in a file name. For that reason
 901  * SFM had no conversion for the colon. There is a conversion for the
 902  * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
 903  * is a slash and a slash is a colon. So we can just replace the slash with the
 904  * colon in our tables and everything will just work.
 905  */
 906 static u_int8_t
 907 sfm2mac[42] = {
 908     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* 00 - 07 */
 909     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 08 - 0F */
 910     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,   /* 10 - 17 */
 911     0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,   /* 18 - 1F */
 912     0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,   /* 20 - 27 */
 913     0x20, 0x2e                                        /* 28 - 29 */
 914 };
 915
 916 static u_int8_t
 917 mac2sfm[112] = {
 918     0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,      /* 20 - 27 */
 919     0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,   /* 28 - 2f */
 920     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,   /* 30 - 37 */
 921     0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,   /* 38 - 3f */
 922     0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,   /* 40 - 47 */
 923     0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,   /* 48 - 4f */
 924     0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,   /* 50 - 57 */
 925     0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,   /* 58 - 5f */
 926     0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,   /* 60 - 67 */
 927     0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,   /* 68 - 6f */
 928     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,   /* 70 - 77 */
 929     0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f    /* 78 - 7f */
 930 };
 931
 932
 933 /*
 934  * Encode illegal NTFS filename characters into SFM Private Unicode characters
 935  *
 936  * Assumes non-zero ASCII input.
 937  */
 938 static u_int16_t
 939 ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
 940 {
 941     /* The last character of filename cannot be a space or period. */
 942     if (lastchar) {
 943         if (ucs_ch == 0x20)
 944             return (0xf028);
 945         else if (ucs_ch == 0x2e)
 946             return (0xf029);
 947     }
 948     /* 0x01 - 0x1f is simple transformation. */
 949     if (ucs_ch <= 0x1f) {
 950         return (ucs_ch | 0xf000);
 951     } else /* 0x20 - 0x7f */ {
 952         u_int16_t lsb;
 953
 954         lsb = mac2sfm[ucs_ch - 0x0020];
 955         if (lsb != ucs_ch)
 956             return(0xf000 | lsb);
 957     }
 958     return (ucs_ch);
 959 }
 960
 961 /*
 962  * Decode any SFM Private Unicode characters
 963  */
 964 static u_int16_t
 965 sfm_to_ucs(u_int16_t ucs_ch)
 966 {
 967     if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
 968         ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
 969         ucs_ch = sfm2mac[ucs_ch & 0x003f];
 970     }
 971     return (ucs_ch);
 972 }
 973