bsd/vfs/vfs_utfconv.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22
  23 #include <sys/param.h>
  24 #include <sys/utfconv.h>
  25 #include <sys/errno.h>
  26 #include <architecture/byte_order.h>
  27
  28
  29 /*
  30  * UTF-8 (UCS Transformation Format)
  31  *
  32  * The following subset of UTF-8 is used to encode UCS-2 filenames. It
   33  * requires a maximum of three bytes per UCS-2 character.  Only the
  34  * shortest encoding required to represent the significant UCS-2 bits
  35  * is legal.
  36  *
  37  * UTF-8 Multibyte Codes
  38  *
  39  * Bytes   Bits   UCS-2 Min   UCS-2 Max   UTF-8 Byte Sequence (binary)
  40  * -------------------------------------------------------------------
  41  *   1       7     0x0000      0x007F      0xxxxxxx
  42  *   2      11     0x0080      0x07FF      110xxxxx 10xxxxxx
  43  *   3      16     0x0800      0xFFFF      1110xxxx 10xxxxxx 10xxxxxx
  44  * -------------------------------------------------------------------
  45  */
  46
  47
  48 #define UCS_TO_UTF_LEN(c)       ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : 3))
  49
  50
  51 static u_int16_t ucs_decompose __P((u_int16_t, u_int16_t *));
  52
  53
  54 /*
  55  * utf8_encodelen - Calculates the UTF-8 encoding length for a UCS-2 filename
  56  *
  57  * NOTES:
  58  *    If '/' chars are allowed on disk then an alternate
  59  *    (replacement) char must be provided in altslash.
  60  *
  61  * input flags:
  62  *    UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
  63  */
  64 size_t
  65 utf8_encodelen(ucsp, ucslen, altslash, flags)
  66         const u_int16_t * ucsp;
  67         size_t ucslen;
  68         u_int16_t altslash;
  69         int flags;
  70 {
  71         u_int16_t ucs_ch;
  72         int charcnt;
  73         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
  74         size_t len;
  75
  76         charcnt = ucslen / 2;
  77         len = 0;
  78
  79         while (charcnt-- > 0) {
  80                 ucs_ch = *ucsp++;
  81
  82                 if (swapbytes)
  83                         ucs_ch = NXSwapShort(ucs_ch);
  84                 if (altslash && ucs_ch == '/')
  85                         ucs_ch = altslash;
  86                 if (ucs_ch == '\0')
  87                         ucs_ch = 0xc080;
  88
  89                 len += UCS_TO_UTF_LEN(ucs_ch);
  90         }
  91
  92         return (len);
  93 }
  94
  95
  96 /*
  97  * utf8_encodestr - Encodes a UCS-2 (Unicode) string to UTF-8
  98  *
  99  * NOTES:
 100  *    The resulting UTF-8 string is not null terminated.
 101  *
 102  *    If '/' chars are allowed on disk then an alternate
 103  *    (replacement) char must be provided in altslash.
 104  *
 105  * input flags:
 106  *    UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
 107  *    UTF_NO_NULL_TERM:  don't add NULL termination to UTF-8 output
 108  */
 109 int utf8_encodestr(ucsp, ucslen, utf8p, utf8len, buflen, altslash, flags)
 110         const u_int16_t * ucsp;
 111         size_t ucslen;
 112         u_int8_t * utf8p;
 113         size_t * utf8len;
 114         size_t buflen;
 115         u_int16_t altslash;
 116         int flags;
 117 {
 118         u_int8_t * bufstart;
 119         u_int8_t * bufend;
 120         u_int16_t ucs_ch;
 121         int charcnt;
 122         int swapbytes = (flags & UTF_REVERSE_ENDIAN);
 123         int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
 124         int result = 0;
 125
 126         bufstart = utf8p;
 127         bufend = bufstart + buflen;
 128         if (nullterm)
 129                 --bufend;
 130         charcnt = ucslen / 2;
 131
 132         while (charcnt-- > 0) {
 133                 ucs_ch = *ucsp++;
 134
 135                 if (swapbytes)
 136                         ucs_ch = NXSwapShort(ucs_ch);
 137                 if (altslash && ucs_ch == '/')
 138                         ucs_ch = altslash;
 139
 140                 if ((ucs_ch < 0x0080) && (ucs_ch != '\0')) {
 141                         if (utf8p >= bufend) {
 142                                 result = ENAMETOOLONG;
 143                                 break;
 144                         }
 145                         *utf8p++ = ucs_ch;
 146
 147                 } else if (ucs_ch < 0x800) {
 148                         if ((utf8p + 1) >= bufend) {
 149                                 result = ENAMETOOLONG;
 150                                 break;
 151                         }
 152                         /* NOTE: NULL maps to 0xC080 */
 153                         *utf8p++ = (ucs_ch >> 6) | 0xc0;
 154                         *utf8p++ = (ucs_ch & 0x3f) | 0x80;
 155
 156                 } else {
 157                         if ((utf8p + 2) >= bufend) {
 158                                 result = ENAMETOOLONG;
 159                                 break;
 160                         }
 161                         *utf8p++ = (ucs_ch >> 12) | 0xe0;
 162                         *utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;
 163                         *utf8p++ = ((ucs_ch) & 0x3f) | 0x80;
 164                 }
 165         }
 166
 167         *utf8len = utf8p - bufstart;
 168         if (nullterm)
 169                 *utf8p++ = '\0';
 170
 171         return (result);
 172 }
 173
 174
 175 /*
 176  * utf8_decodestr - Decodes a UTF-8 string back to UCS-2 (Unicode)
 177  *
 178  * NOTES:
 179  *    The input UTF-8 string does not need to be null terminated
 180  *    if utf8len is set.
 181  *
 182  *    If '/' chars are allowed on disk then an alternate
 183  *    (replacement) char must be provided in altslash.
 184  *
 185  * input flags:
 186  *    UTF_REV_ENDIAN:   UCS-2 byteorder is oposite current runtime
 187  *    UTF_DECOMPOSED:   UCS-2 output string must be fully decompsed
 188  */
 189 int
 190 utf8_decodestr(utf8p, utf8len, ucsp, ucslen, buflen, altslash, flags)
 191         const u_int8_t* utf8p;
 192         size_t utf8len;
 193         u_int16_t* ucsp;
 194         size_t *ucslen;
 195         size_t buflen;
 196         u_int16_t altslash;
 197         int flags;
 198 {
 199         u_int16_t* bufstart;
 200         u_int16_t* bufend;
 201         u_int16_t ucs_ch;
 202         u_int8_t byte;
 203         int result = 0;
 204         int decompose, swapbytes;
 205
 206         decompose = (flags & UTF_DECOMPOSED);
 207         swapbytes = (flags & UTF_REVERSE_ENDIAN);
 208
 209         bufstart = ucsp;
 210         bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
 211
 212         while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
 213                 if (ucsp >= bufend) {
 214                         result = ENAMETOOLONG;
 215                         goto stop;
 216                 }
 217
 218                 /* check for ascii */
 219                 if (byte < 0x80) {
 220                         ucs_ch = byte;
 221                 } else {
 222                         switch (byte & 0xf0) {
 223                         /*  2 byte sequence*/
 224                         case 0xc0:
 225                         case 0xd0:
 226                                 /* extract bits 6 - 10 from first byte */
 227                                 ucs_ch = (byte & 0x1F) << 6;
 228                                 if ((ucs_ch < 0x0080) && (*utf8p != 0x80)) {
 229                                         result = EINVAL;  /* seq not minimal */
 230                                         goto stop;
 231                                 }
 232                                 break;
 233                         /* 3 byte sequence*/
 234                         case 0xe0:
 235                                 /* extract bits 12 - 15 from first byte */
 236                                 ucs_ch = (byte & 0x0F) << 6;
 237
 238                                 /* extract bits 6 - 11 from second byte */
 239                                 if (((byte = *utf8p++) & 0xc0) != 0x80) {
 240                                         result = EINVAL;
 241                                         goto stop;
 242                                 }
 243                                 utf8len--;
 244
 245                                 ucs_ch += (byte & 0x3F);
 246                                 ucs_ch <<= 6;
 247
 248                                 if (ucs_ch < 0x0800) {
 249                                         result = EINVAL; /* seq not minimal */
 250                                         goto stop;
 251                                 }
 252                                 break;
 253                         default:
 254                                 result = EINVAL;
 255                                 goto stop;
 256                         }
 257
 258                         /* extract bits 0 - 5 from final byte */
 259                         if (((byte = *utf8p++) & 0xc0) != 0x80) {
 260                                 result = EINVAL;
 261                                 goto stop;
 262                         }
 263                         utf8len--;
 264                         ucs_ch += (byte & 0x3F);
 265
 266                         if (decompose) {
 267                                 u_int16_t comb_ch;
 268
 269                                 ucs_ch = ucs_decompose(ucs_ch, &comb_ch);
 270
 271                                 if (comb_ch) {
 272                                         if (swapbytes)
 273                                                 *ucsp++ = NXSwapShort(ucs_ch);
 274                                         else
 275                                                 *ucsp++ = ucs_ch;
 276
 277                                         if (ucsp >= bufend) {
 278                                                 result = ENAMETOOLONG;
 279                                                 goto stop;
 280                                         }
 281
 282                                         ucs_ch = comb_ch;
 283                                 }
 284                         }
 285                 }
 286
 287                 if (ucs_ch == altslash)
 288                         ucs_ch = '/';
 289                 if (swapbytes)
 290                         ucs_ch = NXSwapShort(ucs_ch);
 291
 292                 *ucsp++ = ucs_ch;
 293         }
 294 stop:
 295         *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
 296
 297         return (result);
 298 }
 299
 300
 301 /*
 302  * Lookup tables for Unicode chars 0x00C0 thru 0x00FF
 303  * primary_char yields first decomposed char. If this
 304  * char is an alpha char then get the combining char
 305  * from the combining_char table and add 0x0300 to it.
 306  */
 307
 308 static unsigned char primary_char[64] = {
 309         0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43,
 310
 311         0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,
 312
 313         0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7,
 314
 315         0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF,
 316
 317         0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63,
 318
 319         0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,
 320
 321         0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7,
 322
 323         0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79,
 324 };
 325
 326 static unsigned char combining_char[64] = {
 327         0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
 328
 329         0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
 330
 331         0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
 332
 333         0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,
 334
 335         0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
 336
 337         0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
 338
 339         0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
 340
 341         0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08
 342 };
 343
 344
 345 /* CJK codepoints 0x3000 ~ 0x30FF */
 346 static const unsigned long __CJKDecompBitmap[] = {
 347     0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C,     /* 0x3000 */
 348     0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2,     /* 0x3080 */
 349 };
 350 #define IS_DECOMPOSABLE(table,unicodeVal) \
 351         (table[(unicodeVal) / 32] & (1 << (31 - ((unicodeVal) % 32))))
 352
 353 /*
 354  * ucs_decompose - decompose a composed UCS-2 char
 355  *
 356  * Composed Unicode characters are forbidden on
 357  * HFS Plus volumes. ucs_decompose will convert a
 358  * composed character into its correct decomposed
 359  * sequence.
 360  *
 361  * Currently only MacRoman and MacJapanese chars
 362  * are handled.  Other composed characters are
 363  * passed unchanged.
 364  */
 365 static u_int16_t
 366 ucs_decompose(register u_int16_t ch, u_int16_t *cmb)
 367 {
 368         u_int16_t base;
 369
 370         *cmb = 0;
 371
 372         if ((ch <= 0x00FF) && (ch >= 0x00C0)) {
 373                 ch -= 0x00C0;
 374
 375                 base = (u_int16_t) primary_char[ch];
 376
 377                 if (base <= 'z') {
 378                         *cmb = (u_int16_t)0x0300 + (u_int16_t)combining_char[ch];
 379                 }
 380         } else if ((ch > 0x3000) && (ch < 0x3100) &&
 381                    IS_DECOMPOSABLE(__CJKDecompBitmap, ch - 0x3000)) {
 382
 383                 /* Handle HIRAGANA LETTERs */
 384                 switch(ch) {
 385                 case 0x3071: base = 0x306F; *cmb = 0x309A; break; /* PA */
 386                 case 0x3074: base = 0x3072; *cmb = 0x309A; break; /* PI */
 387                 case 0x3077: base = 0x3075; *cmb = 0x309A; break; /* PU */
 388                 case 0x307A: base = 0x3078; *cmb = 0x309A; break; /* PE */
 389
 390                 case 0x307D: base = 0x307B; *cmb = 0x309A; break; /* PO */
 391                 case 0x3094: base = 0x3046; *cmb = 0x3099; break; /* VU */
 392                 case 0x30D1: base = 0x30CF; *cmb = 0x309A; break; /* PA */
 393                 case 0x30D4: base = 0x30D2; *cmb = 0x309A; break; /* PI */
 394
 395                 case 0x30D7: base = 0x30D5; *cmb = 0x309A; break; /* PU */
 396                 case 0x30DA: base = 0x30D8; *cmb = 0x309A; break; /* PE */
 397                 case 0x30DD: base = 0x30DB; *cmb = 0x309A; break; /* PO */
 398                 case 0x30F4: base = 0x30A6; *cmb = 0x3099; break; /* VU */
 399
 400                 case 0x30F7: base = 0x30EF; *cmb = 0x3099; break; /* VA */
 401                 case 0x30F8: base = 0x30F0; *cmb = 0x3099; break; /* VI */
 402                 case 0x30F9: base = 0x30F1; *cmb = 0x3099; break; /* VE */
 403                 case 0x30FA: base = 0x30F2; *cmb = 0x3099; break; /* VO */
 404
 405                 default:
 406                         /* the rest (41 of them) have a simple conversion */
 407                         base = ch - 1;
 408                         *cmb = 0x3099;
 409                 }
 410         } else {
 411                 base = ch;
 412         }
 413
 414         return (base);
 415 }
 416