bsd/vfs/vfs_utfconv.c

   1 /*
   2  * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23
  24  /*
  25         Includes Unicode 3.2 decomposition code derived from Core Foundation
  26  */
  27
  28 #include <sys/param.h>
  29 #include <sys/utfconv.h>
  30 #include <sys/errno.h>
  31 #include <architecture/byte_order.h>
  32
  33 /*
  34  * UTF-8 (Unicode Transformation Format)
  35  *
  36  * UTF-8 is the Unicode Transformation Format that serializes a Unicode
  37  * character as a sequence of one to four bytes. Only the shortest form
  38  * required to represent the significant Unicode bits is legal.
  39  *
  40  * UTF-8 Multibyte Codes
  41  *
  42  * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary)
  43  * -----------------------------------------------------------------------------
  44  *   1       7       0x0000        0x007F    0xxxxxxx
  45  *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
  46  *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
  47  *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  48  * -----------------------------------------------------------------------------
  49  */
  50
  51
  52 #define UNICODE_TO_UTF8_LEN(c)  \
  53         ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
  54
  55 #define UCS_ALT_NULL    0x2400
  56
  57 /* Surrogate Pair Constants */
  58 #define SP_HALF_SHIFT   10
  59 #define SP_HALF_BASE    0x0010000UL
  60 #define SP_HALF_MASK    0x3FFUL
  61
  62 #define SP_HIGH_FIRST   0xD800UL
  63 #define SP_HIGH_LAST    0xDBFFUL
  64 #define SP_LOW_FIRST    0xDC00UL
  65 #define SP_LOW_LAST     0xDFFFUL
  66
  67
  68 #include "vfs_utfconvdata.h"
  69
  70
  71 /*
  72  * Test for a combining character.
  73  *
  74  * Similar to __CFUniCharIsNonBaseCharacter except that
  75  * unicode_combinable also includes Hangul Jamo characters.
  76  */
  77 static inline int
  78 unicode_combinable(u_int16_t character)
  79 {
  80         const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
  81         u_int8_t value;
  82
  83         if (character < 0x0300)
  84                 return (0);
  85
  86         value = bitmap[(character >> 8) & 0xFF];
  87
  88         if (value == 0xFF) {
  89                 return (1);
  90         } else if (value) {
  91                 bitmap = bitmap + ((value - 1) * 32) + 256;
  92                 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
  93         }
  94         return (0);
  95 }
  96
  97 /*
  98  * Test for a precomposed character.
  99  *
 100  * Similar to __CFUniCharIsDecomposableCharacter.
 101  */
 102 static inline int
 103 unicode_decomposeable(u_int16_t character) {
 104         const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
 105         u_int8_t value;
 106
 107         if (character < 0x00C0)
 108                 return (0);
 109
 110         value = bitmap[(character >> 8) & 0xFF];
 111
 112         if (value == 0xFF) {
 113                 return (1);
 114         } else if (value) {
 115                 bitmap = bitmap + ((value - 1) * 32) + 256;
 116                 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
 117         }
 118         return (0);
 119 }
 120
 121 static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
 122
 123 static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
 124
 125
 126 char utf_extrabytes[32] = {
 127          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 128         -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
 129 };
 130
 131
 132 /*
 133  * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
 134  *
 135  * NOTES:
 136  *    If '/' chars are allowed on disk then an alternate
 137  *    (replacement) char must be provided in altslash.
 138  *
 139  * input flags:
 140  *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
 141  */
 142 size_t
 143 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
 144                int flags)
 145 {
 146         u_int16_t ucs_ch;
 147         int charcnt;
 148         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 149         size_t len;
 150
 151         charcnt = ucslen / 2;
 152         len = 0;
 153
 154         while (charcnt-- > 0) {
 155                 ucs_ch = *ucsp++;
 156
 157                 if (swapbytes)
 158                         ucs_ch = NXSwapShort(ucs_ch);
 159                 if (ucs_ch == '/')
 160                         ucs_ch = altslash ? altslash : '_';
 161                 else if (ucs_ch == '\0')
 162                         ucs_ch = UCS_ALT_NULL;
 163
 164                 len += UNICODE_TO_UTF8_LEN(ucs_ch);
 165         }
 166
 167         return (len);
 168 }
 169
 170
 171 /*
 172  * utf8_encodestr - Encodes a Unicode string to UTF-8
 173  *
 174  * NOTES:
 175  *    The resulting UTF-8 string is NULL terminated.
 176  *
 177  *    If '/' chars are allowed on disk then an alternate
 178  *    (replacement) char must be provided in altslash.
 179  *
 180  * input flags:
 181  *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
 182  *    UTF_NO_NULL_TERM:  don't add NULL termination to UTF-8 output
 183  *
 184  * result:
 185  *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
 186  *    EINVAL: Illegal char found; char was replaced by an '_'.
 187  */
 188 int
 189 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
 190                size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
 191 {
 192         u_int8_t * bufstart;
 193         u_int8_t * bufend;
 194         u_int16_t ucs_ch;
 195         u_int16_t * chp = NULL;
 196         u_int16_t sequence[8];
 197         int extra = 0;
 198         int charcnt;
 199         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 200         int nullterm  = ((flags & UTF_NO_NULL_TERM) == 0);
 201         int decompose = (flags & UTF_DECOMPOSED);
 202         int result = 0;
 203
 204         bufstart = utf8p;
 205         bufend = bufstart + buflen;
 206         if (nullterm)
 207                 --bufend;
 208         charcnt = ucslen / 2;
 209
 210         while (charcnt-- > 0) {
 211                 if (extra > 0) {
 212                         --extra;
 213                         ucs_ch = *chp++;
 214                 } else {
 215                         ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
 216
 217                         if (decompose && unicode_decomposeable(ucs_ch)) {
 218                                 extra = unicode_decompose(ucs_ch, sequence) - 1;
 219                                 charcnt += extra;
 220                                 ucs_ch = sequence[0];
 221                                 chp = &sequence[1];
 222                         }
 223                 }
 224
 225                 /* Slash and NULL are not permitted */
 226                 if (ucs_ch == '/') {
 227                         if (altslash)
 228                                 ucs_ch = altslash;
 229                         else {
 230                                 ucs_ch = '_';
 231                                 result = EINVAL;
 232                         }
 233                 } else if (ucs_ch == '\0') {
 234                         ucs_ch = UCS_ALT_NULL;
 235                 }
 236
 237                 if (ucs_ch < 0x0080) {
 238                         if (utf8p >= bufend) {
 239                                 result = ENAMETOOLONG;
 240                                 break;
 241                         }
 242                         *utf8p++ = ucs_ch;
 243
 244                 } else if (ucs_ch < 0x800) {
 245                         if ((utf8p + 1) >= bufend) {
 246                                 result = ENAMETOOLONG;
 247                                 break;
 248                         }
 249                         *utf8p++ = 0xc0 | (ucs_ch >> 6);
 250                         *utf8p++ = 0x80 | (0x3f & ucs_ch);
 251
 252                 } else {
 253                         /* Combine valid surrogate pairs */
 254                         if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
 255                                 && charcnt > 0) {
 256                                 u_int16_t ch2;
 257                                 u_int32_t pair;
 258
 259                                 ch2 = swapbytes ? NXSwapShort(*ucsp) : *ucsp;
 260                                 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
 261                                         pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
 262                                                 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
 263                                         if ((utf8p + 3) >= bufend) {
 264                                                 result = ENAMETOOLONG;
 265                                                 break;
 266                                         }
 267                                         --charcnt;
 268                                         ++ucsp;
 269                                         *utf8p++ = 0xf0 | (pair >> 18);
 270                                         *utf8p++ = 0x80 | (0x3f & (pair >> 12));
 271                                         *utf8p++ = 0x80 | (0x3f & (pair >> 6));
 272                                         *utf8p++ = 0x80 | (0x3f & pair);
 273                                         continue;
 274                                 }
 275                         }
 276                         if ((utf8p + 2) >= bufend) {
 277                                 result = ENAMETOOLONG;
 278                                 break;
 279                         }
 280                         *utf8p++ = 0xe0 | (ucs_ch >> 12);
 281                         *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
 282                         *utf8p++ = 0x80 | (0x3f & ucs_ch);
 283                 }
 284         }
 285
 286         *utf8len = utf8p - bufstart;
 287         if (nullterm)
 288                 *utf8p++ = '\0';
 289
 290         return (result);
 291 }
 292
 293
 294 /*
 295  * utf8_decodestr - Decodes a UTF-8 string back to Unicode
 296  *
 297  * NOTES:
 298  *    The input UTF-8 string does not need to be null terminated
 299  *    if utf8len is set.
 300  *
 301  *    If '/' chars are allowed on disk then an alternate
 302  *    (replacement) char must be provided in altslash.
 303  *
 304  * input flags:
 305  *    UTF_REV_ENDIAN:   Unicode byteorder is oposite current runtime
 306  *    UTF_DECOMPOSED:   Unicode output string must be fully decompsed
 307  *
 308  * result:
 309  *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
 310  *    EINVAL: Illegal UTF-8 sequence found.
 311  */
 312 int
 313 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
 314                size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
 315 {
 316         u_int16_t* bufstart;
 317         u_int16_t* bufend;
 318         unsigned int ucs_ch;
 319         unsigned int byte;
 320         int result = 0;
 321         int decompose, precompose, swapbytes;
 322
 323         decompose =  (flags & UTF_DECOMPOSED);
 324         precompose = (flags & UTF_PRECOMPOSED);
 325         swapbytes =  (flags & UTF_REVERSE_ENDIAN);
 326
 327         bufstart = ucsp;
 328         bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
 329
 330         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 331                 if (ucsp >= bufend)
 332                         goto toolong;
 333
 334                 /* check for ascii */
 335                 if (byte < 0x80) {
 336                         ucs_ch = byte;                 /* 1st byte */
 337                 } else {
 338                         u_int32_t ch;
 339                         int extrabytes = utf_extrabytes[byte >> 3];
 340
 341                         if (utf8len < extrabytes)
 342                                 goto invalid;
 343                         utf8len -= extrabytes;
 344
 345                         switch (extrabytes) {
 346                         case 1:
 347                                 ch = byte; ch <<= 6;   /* 1st byte */
 348                                 byte = *utf8p++;       /* 2nd byte */
 349                                 if ((byte >> 6) != 2)
 350                                         goto invalid;
 351                                 ch += byte;
 352                                 ch -= 0x00003080UL;
 353                                 if (ch < 0x0080)
 354                                         goto invalid;
 355                                 ucs_ch = ch;
 356                                 break;
 357                         case 2:
 358                                 ch = byte; ch <<= 6;   /* 1st byte */
 359                                 byte = *utf8p++;       /* 2nd byte */
 360                                 if ((byte >> 6) != 2)
 361                                         goto invalid;
 362                                 ch += byte; ch <<= 6;
 363                                 byte = *utf8p++;       /* 3rd byte */
 364                                 if ((byte >> 6) != 2)
 365                                         goto invalid;
 366                                 ch += byte;
 367                                 ch -= 0x000E2080UL;
 368                                 if (ch < 0x0800)
 369                                         goto invalid;
 370                                 if (ch >= 0xD800) {
 371                                         if (ch <= 0xDFFF)
 372                                                 goto invalid;
 373                                         if (ch == 0xFFFE || ch == 0xFFFF)
 374                                                 goto invalid;
 375                                 }
 376                                 ucs_ch = ch;
 377                                 break;
 378                         case 3:
 379                                 ch = byte; ch <<= 6;   /* 1st byte */
 380                                 byte = *utf8p++;       /* 2nd byte */
 381                                 if ((byte >> 6) != 2)
 382                                         goto invalid;
 383                                 ch += byte; ch <<= 6;
 384                                 byte = *utf8p++;       /* 3rd byte */
 385                                 if ((byte >> 6) != 2)
 386                                         goto invalid;
 387                                 ch += byte; ch <<= 6;
 388                                 byte = *utf8p++;       /* 4th byte */
 389                                 if ((byte >> 6) != 2)
 390                                         goto invalid;
 391                                 ch += byte;
 392                                 ch -= 0x03C82080UL + SP_HALF_BASE;
 393                                 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 394                                 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
 395                                         goto invalid;
 396                                 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
 397                                 if (ucsp >= bufend)
 398                                         goto toolong;
 399                                 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 400                                 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
 401                                         goto invalid;
 402                                 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
 403                                 continue;
 404                         default:
 405                                 goto invalid;
 406                         }
 407                         if (decompose) {
 408                                 if (unicode_decomposeable(ucs_ch)) {
 409                                         u_int16_t sequence[8];
 410                                         int count, i;
 411
 412                                         count = unicode_decompose(ucs_ch, sequence);
 413
 414                                         for (i = 0; i < count; ++i) {
 415                                                 ucs_ch = sequence[i];
 416                                                 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
 417                                                 if (ucsp >= bufend)
 418                                                         goto toolong;
 419                                         }
 420                                         continue;
 421                                 }
 422                         } else if (precompose && (ucsp != bufstart)) {
 423                                 u_int16_t composite, base;
 424
 425                                 if (unicode_combinable(ucs_ch)) {
 426                                         base = swapbytes ? NXSwapShort(*(ucsp - 1)) : *(ucsp - 1);
 427                                         composite = unicode_combine(base, ucs_ch);
 428                                         if (composite) {
 429                                                 --ucsp;
 430                                                 ucs_ch = composite;
 431                                         }
 432                                 }
 433                         }
 434                         if (ucs_ch == UCS_ALT_NULL)
 435                                 ucs_ch = '\0';
 436                 }
 437                 if (ucs_ch == altslash)
 438                         ucs_ch = '/';
 439
 440                 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
 441         }
 442
 443 exit:
 444         *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
 445
 446         return (result);
 447
 448 invalid:
 449         result = EINVAL;
 450         goto exit;
 451
 452 toolong:
 453         result = ENAMETOOLONG;
 454         goto exit;
 455 }
 456
 457
 458 /*
 459  * utf8_validatestr - Check for a valid UTF-8 string.
 460  */
 461 int
 462 utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
 463 {
 464         unsigned int byte;
 465         u_int32_t ch;
 466         unsigned int ucs_ch;
 467         size_t extrabytes;
 468
 469         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 470                 if (byte < 0x80)
 471                         continue;  /* plain ascii */
 472
 473                 extrabytes = utf_extrabytes[byte >> 3];
 474
 475                 if (utf8len < extrabytes)
 476                         goto invalid;
 477                 utf8len -= extrabytes;
 478
 479                 switch (extrabytes) {
 480                 case 1:
 481                         ch = byte; ch <<= 6;   /* 1st byte */
 482                         byte = *utf8p++;       /* 2nd byte */
 483                         if ((byte >> 6) != 2)
 484                                 goto invalid;
 485                         ch += byte;
 486                         ch -= 0x00003080UL;
 487                         if (ch < 0x0080)
 488                                 goto invalid;
 489                         break;
 490                 case 2:
 491                         ch = byte; ch <<= 6;   /* 1st byte */
 492                         byte = *utf8p++;       /* 2nd byte */
 493                         if ((byte >> 6) != 2)
 494                                 goto invalid;
 495                         ch += byte; ch <<= 6;
 496                         byte = *utf8p++;       /* 3rd byte */
 497                         if ((byte >> 6) != 2)
 498                                 goto invalid;
 499                         ch += byte;
 500                         ch -= 0x000E2080UL;
 501                         if (ch < 0x0800)
 502                                 goto invalid;
 503                         if (ch >= 0xD800) {
 504                                 if (ch <= 0xDFFF)
 505                                         goto invalid;
 506                                 if (ch == 0xFFFE || ch == 0xFFFF)
 507                                         goto invalid;
 508                         }
 509                         break;
 510                 case 3:
 511                         ch = byte; ch <<= 6;   /* 1st byte */
 512                         byte = *utf8p++;       /* 2nd byte */
 513                         if ((byte >> 6) != 2)
 514                                 goto invalid;
 515                         ch += byte; ch <<= 6;
 516                         byte = *utf8p++;       /* 3rd byte */
 517                         if ((byte >> 6) != 2)
 518                                 goto invalid;
 519                         ch += byte; ch <<= 6;
 520                         byte = *utf8p++;       /* 4th byte */
 521                         if ((byte >> 6) != 2)
 522                                 goto invalid;
 523                         ch += byte;
 524                         ch -= 0x03C82080UL + SP_HALF_BASE;
 525                         ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 526                         if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
 527                                 goto invalid;
 528                         ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 529                         if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
 530                                 goto invalid;
 531                         break;
 532                 default:
 533                         goto invalid;
 534                 }
 535
 536         }
 537         return (0);
 538 invalid:
 539         return (EINVAL);
 540 }
 541
 542
 543  /*
 544   * Unicode 3.2 decomposition code (derived from Core Foundation)
 545   */
 546
 547 typedef struct {
 548         u_int32_t _key;
 549         u_int32_t _value;
 550 } unicode_mappings32;
 551
 552 static inline u_int32_t
 553 getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
 554                 u_int16_t character)
 555 {
 556         const unicode_mappings32 *p, *q, *divider;
 557
 558         if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
 559                 return (0);
 560
 561         p = theTable;
 562         q = p + (numElem-1);
 563         while (p <= q) {
 564                 divider = p + ((q - p) >> 1);   /* divide by 2 */
 565                 if (character < divider->_key) { q = divider - 1; }
 566                 else if (character > divider->_key) { p = divider + 1; }
 567                 else { return (divider->_value); }
 568         }
 569         return (0);
 570 }
 571
 572 #define RECURSIVE_DECOMPOSITION (1 << 15)
 573 #define EXTRACT_COUNT(value)    (((value) >> 12) & 0x0007)
 574
 575 typedef struct {
 576         u_int16_t _key;
 577         u_int16_t _value;
 578 } unicode_mappings16;
 579
 580 static inline u_int16_t
 581 getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
 582                 u_int16_t character)
 583 {
 584         const unicode_mappings16 *p, *q, *divider;
 585
 586         if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
 587                 return (0);
 588
 589         p = theTable;
 590         q = p + (numElem-1);
 591         while (p <= q) {
 592                 divider = p + ((q - p) >> 1);   /* divide by 2 */
 593                 if (character < divider->_key)
 594                         q = divider - 1;
 595                 else if (character > divider->_key)
 596                         p = divider + 1;
 597                 else
 598                         return (divider->_value);
 599         }
 600         return (0);
 601 }
 602
 603
 604 static u_int32_t
 605 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
 606 {
 607         u_int16_t value;
 608         u_int32_t length;
 609         u_int16_t firstChar;
 610         u_int16_t theChar;
 611         const u_int16_t *bmpMappings;
 612         u_int32_t usedLength;
 613
 614         value = getmappedvalue16(
 615                 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
 616                 __UniCharDecompositionTableLength, character);
 617         length = EXTRACT_COUNT(value);
 618         firstChar = value & 0x0FFF;
 619         theChar = firstChar;
 620         bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
 621         usedLength = 0;
 622
 623         if (value & RECURSIVE_DECOMPOSITION) {
 624             usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
 625
 626             --length;   /* Decrement for the first char */
 627             if (!usedLength)
 628                 return 0;
 629             ++bmpMappings;
 630             convertedChars += usedLength;
 631         }
 632
 633         usedLength += length;
 634
 635         while (length--)
 636                 *(convertedChars++) = *(bmpMappings++);
 637
 638         return (usedLength);
 639 }
 640
 641 #define HANGUL_SBASE 0xAC00
 642 #define HANGUL_LBASE 0x1100
 643 #define HANGUL_VBASE 0x1161
 644 #define HANGUL_TBASE 0x11A7
 645
 646 #define HANGUL_SCOUNT 11172
 647 #define HANGUL_LCOUNT 19
 648 #define HANGUL_VCOUNT 21
 649 #define HANGUL_TCOUNT 28
 650 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
 651
 652 /*
 653  * unicode_decompose - decompose a composed Unicode char
 654  *
 655  * Composed Unicode characters are forbidden on
 656  * HFS Plus volumes. ucs_decompose will convert a
 657  * composed character into its correct decomposed
 658  * sequence.
 659  *
 660  * Similar to CFUniCharDecomposeCharacter
 661  */
 662 static int
 663 unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
 664 {
 665         if ((character >= HANGUL_SBASE) &&
 666             (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
 667                 u_int32_t length;
 668
 669                 character -= HANGUL_SBASE;
 670                 length = (character % HANGUL_TCOUNT ? 3 : 2);
 671
 672                 *(convertedChars++) =
 673                         character / HANGUL_NCOUNT + HANGUL_LBASE;
 674                 *(convertedChars++) =
 675                         (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
 676                 if (length > 2)
 677                         *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
 678                 return (length);
 679         } else {
 680                 return (unicode_recursive_decompose(character, convertedChars));
 681         }
 682 }
 683
 684 /*
 685  * unicode_combine - generate a precomposed Unicode char
 686  *
 687  * Precomposed Unicode characters are required for some volume
 688  * formats and network protocols.  unicode_combine will combine
 689  * a decomposed character sequence into a single precomposed
 690  * (composite) character.
 691  *
 692  * Similar toCFUniCharPrecomposeCharacter but unicode_combine
 693  * also handles Hangul Jamo characters.
 694  */
 695 static u_int16_t
 696 unicode_combine(u_int16_t base, u_int16_t combining)
 697 {
 698         u_int32_t value;
 699
 700         /* Check HANGUL */
 701         if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
 702                 /* 2 char Hangul sequences */
 703                 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
 704                     (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
 705                     return (HANGUL_SBASE +
 706                             ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
 707                             ((combining  - HANGUL_VBASE)*HANGUL_TCOUNT));
 708                 }
 709
 710                 /* 3 char Hangul sequences */
 711                 if ((combining > HANGUL_TBASE) &&
 712                     (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
 713                         if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
 714                                 return (0);
 715                         else
 716                                 return (base + (combining - HANGUL_TBASE));
 717                 }
 718         }
 719
 720         value = getmappedvalue32(
 721                 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
 722                 __CFUniCharPrecompositionTableLength, combining);
 723
 724         if (value) {
 725                 value = getmappedvalue16(
 726                         (const unicode_mappings16 *)
 727                         ((u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
 728                         (value >> 16), base);
 729         }
 730         return (value);
 731 }
 732