bsd/vfs/vfs_utfconv.c

   1 /*
   2  * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29  /*
  30         Includes Unicode 3.2 decomposition code derived from Core Foundation
  31  */
  32
  33 #include <sys/param.h>
  34 #include <sys/utfconv.h>
  35 #include <sys/errno.h>
  36 #include <architecture/byte_order.h>
  37
  38 /*
  39  * UTF-8 (Unicode Transformation Format)
  40  *
  41  * UTF-8 is the Unicode Transformation Format that serializes a Unicode
  42  * character as a sequence of one to four bytes. Only the shortest form
  43  * required to represent the significant Unicode bits is legal.
  44  *
  45  * UTF-8 Multibyte Codes
  46  *
  47  * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary)
  48  * -----------------------------------------------------------------------------
  49  *   1       7       0x0000        0x007F    0xxxxxxx
  50  *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
  51  *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
  52  *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  53  * -----------------------------------------------------------------------------
  54  */
  55
  56
  /*
   * Number of UTF-8 bytes needed for the 16-bit code unit c.
   *
   * Code units in the surrogate range (0xD800-0xDFFF) count as 2 bytes
   * each, so that the two halves of a pair add up to 4 bytes.
   * The argument is evaluated more than once; avoid side effects.
   */
  #define UNICODE_TO_UTF8_LEN(c)                          \
          ((c) < 0x0080 ? 1 :                             \
           (c) < 0x0800 ? 2 :                             \
           ((c) & 0xf800) == 0xd800 ? 2 : 3)

  /* Code unit substituted for an embedded NUL when encoding */
  #define UCS_ALT_NULL    0x2400

  /* Surrogate pair arithmetic */
  #define SP_HALF_SHIFT   10
  #define SP_HALF_BASE    0x0010000UL
  #define SP_HALF_MASK    0x3FFUL

  /* Surrogate code unit ranges (high half first, then low half) */
  #define SP_HIGH_FIRST   0xD800UL
  #define SP_HIGH_LAST    0xDBFFUL
  #define SP_LOW_FIRST    0xDC00UL
  #define SP_LOW_LAST     0xDFFFUL
  71
  72
  73 #include "vfs_utfconvdata.h"
  74
  75
  76 /*
  77  * Test for a combining character.
  78  *
  79  * Similar to __CFUniCharIsNonBaseCharacter except that
  80  * unicode_combinable also includes Hangul Jamo characters.
  81  */
  82 static inline int
  83 unicode_combinable(u_int16_t character)
  84 {
  85         const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
  86         u_int8_t value;
  87
  88         if (character < 0x0300)
  89                 return (0);
  90
  91         value = bitmap[(character >> 8) & 0xFF];
  92
  93         if (value == 0xFF) {
  94                 return (1);
  95         } else if (value) {
  96                 bitmap = bitmap + ((value - 1) * 32) + 256;
  97                 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
  98         }
  99         return (0);
 100 }
 101
 102 /*
 103  * Test for a precomposed character.
 104  *
 105  * Similar to __CFUniCharIsDecomposableCharacter.
 106  */
 107 static inline int
 108 unicode_decomposeable(u_int16_t character) {
 109         const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
 110         u_int8_t value;
 111
 112         if (character < 0x00C0)
 113                 return (0);
 114
 115         value = bitmap[(character >> 8) & 0xFF];
 116
 117         if (value == 0xFF) {
 118                 return (1);
 119         } else if (value) {
 120                 bitmap = bitmap + ((value - 1) * 32) + 256;
 121                 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
 122         }
 123         return (0);
 124 }
 125
 126
 127 /*
 128  * Get the combing class.
 129  *
 130  * Similar to CFUniCharGetCombiningPropertyForCharacter.
 131  */
 132 static inline u_int8_t
 133 get_combining_class(u_int16_t character) {
 134         const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
 135
 136         u_int8_t value = bitmap[(character >> 8)];
 137
 138         if (value) {
 139                 bitmap = bitmap + (value * 256);
 140                 return bitmap[character % 256];
 141         }
 142         return (0);
 143 }
 144
 145
 /* Forward declarations of helpers defined further down in this file */
 static int       unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
 static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
 static void      priortysort(u_int16_t *characters, int count);
 151
 /*
  * Number of continuation bytes that follow a UTF-8 lead byte, indexed
  * by the top five bits of that byte (byte >> 3).  A value of -1 marks a
  * byte that cannot start a sequence.
  *
  * NOTE(review): the -1 entries are stored in a plain char, so the value
  * read back depends on whether char is signed on the target.
  */
 char utf_extrabytes[32] = {
         /* 0x00-0x7F: single byte */
          0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,
         /* 0x80-0xBF: continuation bytes, not valid as a lead byte */
         -1, -1, -1, -1, -1, -1, -1, -1,
         /* 0xC0-0xDF: one continuation byte */
          1,  1,  1,  1,
         /* 0xE0-0xEF: two continuation bytes */
          2,  2,
         /* 0xF0-0xF7: three continuation bytes */
          3,
         /* 0xF8-0xFF: not a valid lead byte */
         -1
 };
 156
 157
 158 /*
 159  * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
 160  *
 161  * NOTES:
 162  *    If '/' chars are allowed on disk then an alternate
 163  *    (replacement) char must be provided in altslash.
 164  *
 165  * input flags:
 166  *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
 167  */
 168 size_t
 169 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
 170                int flags)
 171 {
 172         u_int16_t ucs_ch;
 173         int charcnt;
 174         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 175         size_t len;
 176
 177         charcnt = ucslen / 2;
 178         len = 0;
 179
 180         while (charcnt-- > 0) {
 181                 ucs_ch = *ucsp++;
 182
 183                 if (swapbytes)
 184                         ucs_ch = NXSwapShort(ucs_ch);
 185                 if (ucs_ch == '/')
 186                         ucs_ch = altslash ? altslash : '_';
 187                 else if (ucs_ch == '\0')
 188                         ucs_ch = UCS_ALT_NULL;
 189
 190                 len += UNICODE_TO_UTF8_LEN(ucs_ch);
 191         }
 192
 193         return (len);
 194 }
 195
 196
 197 /*
 198  * utf8_encodestr - Encodes a Unicode string to UTF-8
 199  *
 200  * NOTES:
 201  *    The resulting UTF-8 string is NULL terminated.
 202  *
 203  *    If '/' chars are allowed on disk then an alternate
 204  *    (replacement) char must be provided in altslash.
 205  *
 206  * input flags:
 207  *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
 208  *    UTF_NO_NULL_TERM:  don't add NULL termination to UTF-8 output
 209  *
 210  * result:
 211  *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
 212  *    EINVAL: Illegal char found; char was replaced by an '_'.
 213  */
 214 int
 215 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
 216                size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
 217 {
 218         u_int8_t * bufstart;
 219         u_int8_t * bufend;
 220         u_int16_t ucs_ch;
 221         u_int16_t * chp = NULL;
 222         u_int16_t sequence[8];
 223         int extra = 0;
 224         int charcnt;
 225         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 226         int nullterm  = ((flags & UTF_NO_NULL_TERM) == 0);
 227         int decompose = (flags & UTF_DECOMPOSED);
 228         int result = 0;
 229
 230         bufstart = utf8p;
 231         bufend = bufstart + buflen;
 232         if (nullterm)
 233                 --bufend;
 234         charcnt = ucslen / 2;
 235
 236         while (charcnt-- > 0) {
 237                 if (extra > 0) {
 238                         --extra;
 239                         ucs_ch = *chp++;
 240                 } else {
 241                         ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
 242
 243                         if (decompose && unicode_decomposeable(ucs_ch)) {
 244                                 extra = unicode_decompose(ucs_ch, sequence) - 1;
 245                                 charcnt += extra;
 246                                 ucs_ch = sequence[0];
 247                                 chp = &sequence[1];
 248                         }
 249                 }
 250
 251                 /* Slash and NULL are not permitted */
 252                 if (ucs_ch == '/') {
 253                         if (altslash)
 254                                 ucs_ch = altslash;
 255                         else {
 256                                 ucs_ch = '_';
 257                                 result = EINVAL;
 258                         }
 259                 } else if (ucs_ch == '\0') {
 260                         ucs_ch = UCS_ALT_NULL;
 261                 }
 262
 263                 if (ucs_ch < 0x0080) {
 264                         if (utf8p >= bufend) {
 265                                 result = ENAMETOOLONG;
 266                                 break;
 267                         }
 268                         *utf8p++ = ucs_ch;
 269
 270                 } else if (ucs_ch < 0x800) {
 271                         if ((utf8p + 1) >= bufend) {
 272                                 result = ENAMETOOLONG;
 273                                 break;
 274                         }
 275                         *utf8p++ = 0xc0 | (ucs_ch >> 6);
 276                         *utf8p++ = 0x80 | (0x3f & ucs_ch);
 277
 278                 } else {
 279                         /* Combine valid surrogate pairs */
 280                         if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
 281                                 && charcnt > 0) {
 282                                 u_int16_t ch2;
 283                                 u_int32_t pair;
 284
 285                                 ch2 = swapbytes ? NXSwapShort(*ucsp) : *ucsp;
 286                                 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
 287                                         pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
 288                                                 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
 289                                         if ((utf8p + 3) >= bufend) {
 290                                                 result = ENAMETOOLONG;
 291                                                 break;
 292                                         }
 293                                         --charcnt;
 294                                         ++ucsp;
 295                                         *utf8p++ = 0xf0 | (pair >> 18);
 296                                         *utf8p++ = 0x80 | (0x3f & (pair >> 12));
 297                                         *utf8p++ = 0x80 | (0x3f & (pair >> 6));
 298                                         *utf8p++ = 0x80 | (0x3f & pair);
 299                                         continue;
 300                                 }
 301                         }
 302                         if ((utf8p + 2) >= bufend) {
 303                                 result = ENAMETOOLONG;
 304                                 break;
 305                         }
 306                         *utf8p++ = 0xe0 | (ucs_ch >> 12);
 307                         *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
 308                         *utf8p++ = 0x80 | (0x3f & ucs_ch);
 309                 }
 310         }
 311
 312         *utf8len = utf8p - bufstart;
 313         if (nullterm)
 314                 *utf8p++ = '\0';
 315
 316         return (result);
 317 }
 318
 319
 320 /*
 321  * utf8_decodestr - Decodes a UTF-8 string back to Unicode
 322  *
 323  * NOTES:
 324  *    The input UTF-8 string does not need to be null terminated
 325  *    if utf8len is set.
 326  *
 327  *    If '/' chars are allowed on disk then an alternate
 328  *    (replacement) char must be provided in altslash.
 329  *
 330  * input flags:
 331  *    UTF_REV_ENDIAN:   Unicode byteorder is oposite current runtime
 332  *    UTF_DECOMPOSED:   Unicode output string must be fully decompsed
 333  *
 334  * result:
 335  *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
 336  *    EINVAL: Illegal UTF-8 sequence found.
 337  */
 338 int
 339 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
 340                size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
 341 {
 342         u_int16_t* bufstart;
 343         u_int16_t* bufend;
 344         unsigned int ucs_ch;
 345         unsigned int byte;
 346         int combcharcnt = 0;
 347         int result = 0;
 348         int decompose, precompose, swapbytes;
 349
 350         decompose =  (flags & UTF_DECOMPOSED);
 351         precompose = (flags & UTF_PRECOMPOSED);
 352         swapbytes =  (flags & UTF_REVERSE_ENDIAN);
 353
 354         bufstart = ucsp;
 355         bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
 356
 357         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 358                 if (ucsp >= bufend)
 359                         goto toolong;
 360
 361                 /* check for ascii */
 362                 if (byte < 0x80) {
 363                         ucs_ch = byte;                 /* 1st byte */
 364                 } else {
 365                         u_int32_t ch;
 366                         int extrabytes = utf_extrabytes[byte >> 3];
 367
 368                         if (utf8len < extrabytes)
 369                                 goto invalid;
 370                         utf8len -= extrabytes;
 371
 372                         switch (extrabytes) {
 373                         case 1:
 374                                 ch = byte; ch <<= 6;   /* 1st byte */
 375                                 byte = *utf8p++;       /* 2nd byte */
 376                                 if ((byte >> 6) != 2)
 377                                         goto invalid;
 378                                 ch += byte;
 379                                 ch -= 0x00003080UL;
 380                                 if (ch < 0x0080)
 381                                         goto invalid;
 382                                 ucs_ch = ch;
 383                                 break;
 384                         case 2:
 385                                 ch = byte; ch <<= 6;   /* 1st byte */
 386                                 byte = *utf8p++;       /* 2nd byte */
 387                                 if ((byte >> 6) != 2)
 388                                         goto invalid;
 389                                 ch += byte; ch <<= 6;
 390                                 byte = *utf8p++;       /* 3rd byte */
 391                                 if ((byte >> 6) != 2)
 392                                         goto invalid;
 393                                 ch += byte;
 394                                 ch -= 0x000E2080UL;
 395                                 if (ch < 0x0800)
 396                                         goto invalid;
 397                                 if (ch >= 0xD800) {
 398                                         if (ch <= 0xDFFF)
 399                                                 goto invalid;
 400                                         if (ch == 0xFFFE || ch == 0xFFFF)
 401                                                 goto invalid;
 402                                 }
 403                                 ucs_ch = ch;
 404                                 break;
 405                         case 3:
 406                                 ch = byte; ch <<= 6;   /* 1st byte */
 407                                 byte = *utf8p++;       /* 2nd byte */
 408                                 if ((byte >> 6) != 2)
 409                                         goto invalid;
 410                                 ch += byte; ch <<= 6;
 411                                 byte = *utf8p++;       /* 3rd byte */
 412                                 if ((byte >> 6) != 2)
 413                                         goto invalid;
 414                                 ch += byte; ch <<= 6;
 415                                 byte = *utf8p++;       /* 4th byte */
 416                                 if ((byte >> 6) != 2)
 417                                         goto invalid;
 418                                 ch += byte;
 419                                 ch -= 0x03C82080UL + SP_HALF_BASE;
 420                                 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 421                                 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
 422                                         goto invalid;
 423                                 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
 424                                 if (ucsp >= bufend)
 425                                         goto toolong;
 426                                 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 427                                 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
 428                                         goto invalid;
 429                                 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
 430                                 continue;
 431                         default:
 432                                 goto invalid;
 433                         }
 434                         if (decompose) {
 435                                 if (unicode_decomposeable(ucs_ch)) {
 436                                         u_int16_t sequence[8];
 437                                         int count, i;
 438
 439                                         count = unicode_decompose(ucs_ch, sequence);
 440
 441                                         for (i = 0; i < count; ++i) {
 442                                                 ucs_ch = sequence[i];
 443                                                 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
 444                                                 if (ucsp >= bufend)
 445                                                         goto toolong;
 446                                         }
 447                                         combcharcnt += count - 1;
 448                                         continue;
 449                                 }
 450                         } else if (precompose && (ucsp != bufstart)) {
 451                                 u_int16_t composite, base;
 452
 453                                 if (unicode_combinable(ucs_ch)) {
 454                                         base = swapbytes ? NXSwapShort(*(ucsp - 1)) : *(ucsp - 1);
 455                                         composite = unicode_combine(base, ucs_ch);
 456                                         if (composite) {
 457                                                 --ucsp;
 458                                                 ucs_ch = composite;
 459                                         }
 460                                 }
 461                         }
 462                         if (ucs_ch == UCS_ALT_NULL)
 463                                 ucs_ch = '\0';
 464                 }
 465                 if (ucs_ch == altslash)
 466                         ucs_ch = '/';
 467
 468                 /*
 469                  * Make multiple combining character sequences canonical
 470                  */
 471                 if (unicode_combinable(ucs_ch)) {
 472                         ++combcharcnt;   /* start tracking a run */
 473                 } else if (combcharcnt) {
 474                         if (combcharcnt > 1) {
 475                                 priortysort(ucsp - combcharcnt, combcharcnt);
 476                         }
 477                         combcharcnt = 0;  /* start over */
 478                 }
 479                 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
 480         }
 481         /*
 482          * Make a previous combining sequence canonical
 483          */
 484         if (combcharcnt > 1) {
 485                 priortysort(ucsp - combcharcnt, combcharcnt);
 486         }
 487
 488 exit:
 489         *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
 490
 491         return (result);
 492
 493 invalid:
 494         result = EINVAL;
 495         goto exit;
 496
 497 toolong:
 498         result = ENAMETOOLONG;
 499         goto exit;
 500 }
 501
 502
 503 /*
 504  * utf8_validatestr - Check for a valid UTF-8 string.
 505  */
 506 int
 507 utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
 508 {
 509         unsigned int byte;
 510         u_int32_t ch;
 511         unsigned int ucs_ch;
 512         size_t extrabytes;
 513
 514         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 515                 if (byte < 0x80)
 516                         continue;  /* plain ascii */
 517
 518                 extrabytes = utf_extrabytes[byte >> 3];
 519
 520                 if (utf8len < extrabytes)
 521                         goto invalid;
 522                 utf8len -= extrabytes;
 523
 524                 switch (extrabytes) {
 525                 case 1:
 526                         ch = byte; ch <<= 6;   /* 1st byte */
 527                         byte = *utf8p++;       /* 2nd byte */
 528                         if ((byte >> 6) != 2)
 529                                 goto invalid;
 530                         ch += byte;
 531                         ch -= 0x00003080UL;
 532                         if (ch < 0x0080)
 533                                 goto invalid;
 534                         break;
 535                 case 2:
 536                         ch = byte; ch <<= 6;   /* 1st byte */
 537                         byte = *utf8p++;       /* 2nd byte */
 538                         if ((byte >> 6) != 2)
 539                                 goto invalid;
 540                         ch += byte; ch <<= 6;
 541                         byte = *utf8p++;       /* 3rd byte */
 542                         if ((byte >> 6) != 2)
 543                                 goto invalid;
 544                         ch += byte;
 545                         ch -= 0x000E2080UL;
 546                         if (ch < 0x0800)
 547                                 goto invalid;
 548                         if (ch >= 0xD800) {
 549                                 if (ch <= 0xDFFF)
 550                                         goto invalid;
 551                                 if (ch == 0xFFFE || ch == 0xFFFF)
 552                                         goto invalid;
 553                         }
 554                         break;
 555                 case 3:
 556                         ch = byte; ch <<= 6;   /* 1st byte */
 557                         byte = *utf8p++;       /* 2nd byte */
 558                         if ((byte >> 6) != 2)
 559                                 goto invalid;
 560                         ch += byte; ch <<= 6;
 561                         byte = *utf8p++;       /* 3rd byte */
 562                         if ((byte >> 6) != 2)
 563                                 goto invalid;
 564                         ch += byte; ch <<= 6;
 565                         byte = *utf8p++;       /* 4th byte */
 566                         if ((byte >> 6) != 2)
 567                                 goto invalid;
 568                         ch += byte;
 569                         ch -= 0x03C82080UL + SP_HALF_BASE;
 570                         ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
 571                         if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
 572                                 goto invalid;
 573                         ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
 574                         if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
 575                                 goto invalid;
 576                         break;
 577                 default:
 578                         goto invalid;
 579                 }
 580
 581         }
 582         return (0);
 583 invalid:
 584         return (EINVAL);
 585 }
 586
 587
 588  /*
 589   * Unicode 3.2 decomposition code (derived from Core Foundation)
 590   */
 591
 /* One entry of a sorted table that maps a character to a 32-bit value */
 typedef struct {
         u_int32_t _key;
         u_int32_t _value;
 } unicode_mappings32;

 /*
  * getmappedvalue32 - binary search of a unicode_mappings32 table
  *
  * theTable must hold numElem entries sorted by ascending _key.
  *
  * Returns the _value stored for character, or 0 if the table is empty
  * or has no entry for it.
  */
 static inline u_int32_t
 getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
                 u_int16_t character)
 {
         u_int32_t low, high, middle;

         /* numElem - 1 below would wrap around for an empty table */
         if (numElem == 0)
                 return (0);

         /* Quick rejection of anything outside the range of keys */
         if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
                 return (0);

         /* Search the half-open interval [low, high) */
         low = 0;
         high = numElem;
         while (low < high) {
                 middle = low + ((high - low) >> 1);   /* divide by 2 */
                 if (character < theTable[middle]._key)
                         high = middle;
                 else if (character > theTable[middle]._key)
                         low = middle + 1;
                 else
                         return (theTable[middle]._value);
         }
         return (0);
 }
 616
 /*
  * Fields of a 16-bit decomposition table value:
  *    bit 15       the first mapped character must itself be decomposed
  *    bits 12-14   number of characters in the decomposition
  */
 #define RECURSIVE_DECOMPOSITION (1 << 15)
 #define EXTRACT_COUNT(value)    (((value) >> 12) & 0x0007)
 619
 /* One entry of a sorted table that maps a character to a 16-bit value */
 typedef struct {
         u_int16_t _key;
         u_int16_t _value;
 } unicode_mappings16;

 /*
  * getmappedvalue16 - binary search of a unicode_mappings16 table
  *
  * theTable must hold numElem entries sorted by ascending _key.
  *
  * Returns the _value stored for character, or 0 if the table is empty
  * or has no entry for it.
  */
 static inline u_int16_t
 getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
                 u_int16_t character)
 {
         u_int32_t low, high, middle;

         /* numElem - 1 below would wrap around for an empty table */
         if (numElem == 0)
                 return (0);

         /* Quick rejection of anything outside the range of keys */
         if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
                 return (0);

         /* Search the half-open interval [low, high) */
         low = 0;
         high = numElem;
         while (low < high) {
                 middle = low + ((high - low) >> 1);   /* divide by 2 */
                 if (character < theTable[middle]._key)
                         high = middle;
                 else if (character > theTable[middle]._key)
                         low = middle + 1;
                 else
                         return (theTable[middle]._value);
         }
         return (0);
 }
 647
 648
 649 static u_int32_t
 650 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
 651 {
 652         u_int16_t value;
 653         u_int32_t length;
 654         u_int16_t firstChar;
 655         u_int16_t theChar;
 656         const u_int16_t *bmpMappings;
 657         u_int32_t usedLength;
 658
 659         value = getmappedvalue16(
 660                 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
 661                 __UniCharDecompositionTableLength, character);
 662         length = EXTRACT_COUNT(value);
 663         firstChar = value & 0x0FFF;
 664         theChar = firstChar;
 665         bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
 666         usedLength = 0;
 667
 668         if (value & RECURSIVE_DECOMPOSITION) {
 669             usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
 670
 671             --length;   /* Decrement for the first char */
 672             if (!usedLength)
 673                 return 0;
 674             ++bmpMappings;
 675             convertedChars += usedLength;
 676         }
 677
 678         usedLength += length;
 679
 680         while (length--)
 681                 *(convertedChars++) = *(bmpMappings++);
 682
 683         return (usedLength);
 684 }
 685
 /* First precomposed syllable and the bases used to (de)compose syllables */
 #define HANGUL_SBASE 0xAC00
 #define HANGUL_LBASE 0x1100
 #define HANGUL_VBASE 0x1161
 #define HANGUL_TBASE 0x11A7

 /* Size of each range; NCOUNT is the number of syllables per leading Jamo */
 #define HANGUL_SCOUNT 11172
 #define HANGUL_LCOUNT 19
 #define HANGUL_VCOUNT 21
 #define HANGUL_TCOUNT 28
 #define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
 696
 697 /*
 698  * unicode_decompose - decompose a composed Unicode char
 699  *
 700  * Composed Unicode characters are forbidden on
 701  * HFS Plus volumes. ucs_decompose will convert a
 702  * composed character into its correct decomposed
 703  * sequence.
 704  *
 705  * Similar to CFUniCharDecomposeCharacter
 706  */
 707 static int
 708 unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
 709 {
 710         if ((character >= HANGUL_SBASE) &&
 711             (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
 712                 u_int32_t length;
 713
 714                 character -= HANGUL_SBASE;
 715                 length = (character % HANGUL_TCOUNT ? 3 : 2);
 716
 717                 *(convertedChars++) =
 718                         character / HANGUL_NCOUNT + HANGUL_LBASE;
 719                 *(convertedChars++) =
 720                         (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
 721                 if (length > 2)
 722                         *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
 723                 return (length);
 724         } else {
 725                 return (unicode_recursive_decompose(character, convertedChars));
 726         }
 727 }
 728
 729 /*
 730  * unicode_combine - generate a precomposed Unicode char
 731  *
 732  * Precomposed Unicode characters are required for some volume
 733  * formats and network protocols.  unicode_combine will combine
 734  * a decomposed character sequence into a single precomposed
 735  * (composite) character.
 736  *
 737  * Similar toCFUniCharPrecomposeCharacter but unicode_combine
 738  * also handles Hangul Jamo characters.
 739  */
 740 static u_int16_t
 741 unicode_combine(u_int16_t base, u_int16_t combining)
 742 {
 743         u_int32_t value;
 744
 745         /* Check HANGUL */
 746         if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
 747                 /* 2 char Hangul sequences */
 748                 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
 749                     (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
 750                     return (HANGUL_SBASE +
 751                             ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
 752                             ((combining  - HANGUL_VBASE)*HANGUL_TCOUNT));
 753                 }
 754
 755                 /* 3 char Hangul sequences */
 756                 if ((combining > HANGUL_TBASE) &&
 757                     (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
 758                         if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
 759                                 return (0);
 760                         else
 761                                 return (base + (combining - HANGUL_TBASE));
 762                 }
 763         }
 764
 765         value = getmappedvalue32(
 766                 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
 767                 __CFUniCharPrecompositionTableLength, combining);
 768
 769         if (value) {
 770                 value = getmappedvalue16(
 771                         (const unicode_mappings16 *)
 772                         ((u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
 773                         (value >> 16), base);
 774         }
 775         return (value);
 776 }
 777
 778
 /*
  * priortysort - order combining chars into canonical order
  *
  * Sorts the count characters at characters in place by ascending
  * combining class.  Only adjacent characters whose classes are strictly
  * out of order are exchanged, so characters of equal class keep their
  * relative order.  Nothing is done when count is less than 2.
  *
  * Similar to CFUniCharPrioritySort
  */
 static void
 priortysort(u_int16_t* characters, int count)
 {
         u_int32_t p1, p2;
         u_int16_t *ch1, *ch2;
         u_int16_t *end;
         int changes;

         /* Nothing to order, and characters[0] may not even exist */
         if (count < 2)
                 return;

         end = characters + count;
         do {
                 changes = 0;
                 ch1 = characters;
                 ch2 = characters + 1;
                 p2 = get_combining_class(*ch1);
                 while (ch2 < end) {
                         p1 = p2;
                         p2 = get_combining_class(*ch2);
                         if (p1 > p2) {
                                 u_int16_t tmp;

                                 tmp = *ch1;
                                 *ch1 = *ch2;
                                 *ch2 = tmp;
                                 changes = 1;

                                 /*
                                  * Keep p2 in step with the character now
                                  * at *ch2 so it can keep moving up in this
                                  * pass instead of needing another one.
                                  */
                                 p2 = p1;
                         }
                         ++ch1;
                         ++ch2;
                 }
         } while (changes);
 }