]> git.saurik.com Git - apple/hfs.git/blame - livefiles_hfs_plugin/lf_hfs_sbunicode.c
hfs-522.100.5.tar.gz
[apple/hfs.git] / livefiles_hfs_plugin / lf_hfs_sbunicode.c
CommitLineData
de8ee011
A
1/* Copyright © 2017-2018 Apple Inc. All rights reserved.
2 *
3 * lf_hfs_sbunicode.c
4 * livefiles_hfs
5 *
6 * Created by Oded Shoshani on 31/1/18.
7 */
8
9/*
10 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
11 *
12 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
13 *
14 * This file contains Original Code and/or Modifications of Original Code
15 * as defined in and that are subject to the Apple Public Source License
16 * Version 2.0 (the 'License'). You may not use this file except in
17 * compliance with the License. The rights granted to you under the License
18 * may not be used to create, or enable the creation or redistribution of,
19 * unlawful or unlicensed copies of an Apple operating system, or to
20 * circumvent, violate, or enable the circumvention or violation of, any
21 * terms of an Apple operating system software license agreement.
22 *
23 * Please obtain a copy of the License at
24 * http://www.opensource.apple.com/apsl/ and read it before using this file.
25 *
26 * The Original Code and all software distributed under the License are
27 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
28 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
29 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
31 * Please see the License for the specific language governing rights and
32 * limitations under the License.
33 *
34 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
35 */
36
37/*
38 Includes Unicode 3.2 decomposition code derived from Core Foundation
39 */
40
41#pragma clang diagnostic ignored "-Wsign-conversion"
42#pragma clang diagnostic ignored "-Wconversion"
43
44#include <sys/param.h>
45#include <sys/errno.h>
46#include <libkern/OSByteOrder.h>
47#include <stdio.h>
48#include <stdlib.h>
49#include "lf_hfs_sbunicode.h"
50
51
52/*
53 * UTF-8 (Unicode Transformation Format)
54 *
55 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
56 * character as a sequence of one to four bytes. Only the shortest form
57 * required to represent the significant Unicode bits is legal.
58 *
59 * UTF-8 Multibyte Codes
60 *
61 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
62 * -----------------------------------------------------------------------------
63 * 1 7 0x0000 0x007F 0xxxxxxx
64 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
65 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
66 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
67 * -----------------------------------------------------------------------------
68 */
69
70
/*
 * Number of UTF-8 bytes produced by one UTF-16 code unit.
 *
 * A surrogate half (0xD800-0xDFFF) is counted as 2 bytes, so the two
 * halves of a pair add up to the 4 bytes of the combined encoding.
 */
#define UNICODE_TO_UTF8_LEN(c)              \
    ((c) < 0x0080 ? 1 :                     \
     (c) < 0x0800 ? 2 :                     \
     ((c) & 0xf800) == 0xd800 ? 2 : 3)

/* Code unit stored in place of an embedded NUL (U+2400). */
#define UCS_ALT_NULL    0x2400


/* Surrogate pair constants */

#define SP_HALF_SHIFT   10          /* payload bits carried by each half */
#define SP_HALF_BASE    0x0010000u  /* first code point encoded as a pair */
#define SP_HALF_MASK    0x3FFu      /* mask for one half's payload */
#define SP_HIGH_FIRST   0xD800u     /* high (leading) surrogate range */
#define SP_HIGH_LAST    0xDBFFu
#define SP_LOW_FIRST    0xDC00u     /* low (trailing) surrogate range */
#define SP_LOW_LAST     0xDFFFu
86
87
88#include "lf_hfs_utfconvdata.h"
89
90
91/*
92 * Test for a combining character.
93 *
94 * Similar to __CFUniCharIsNonBaseCharacter except that
95 * unicode_combinable also includes Hangul Jamo characters.
96 */
97static int
98unicode_combinable(u_int16_t character)
99{
100 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
101 u_int8_t value;
102
103 if (character < 0x0300)
104 return (0);
105
106 value = bitmap[(character >> 8) & 0xFF];
107
108 if (value == 0xFF) {
109 return (1);
110 } else if (value) {
111 bitmap = bitmap + ((value - 1) * 32) + 256;
112 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
113 }
114 return (0);
115}
116
117/*
118 * Test for a precomposed character.
119 *
120 * Similar to __CFUniCharIsDecomposableCharacter.
121 */
122static int
123unicode_decomposeable(u_int16_t character) {
124 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
125 u_int8_t value;
126
127 if (character < 0x00C0)
128 return (0);
129
130 value = bitmap[(character >> 8) & 0xFF];
131
132 if (value == 0xFF) {
133 return (1);
134 } else if (value) {
135 bitmap = bitmap + ((value - 1) * 32) + 256;
136 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
137 }
138 return (0);
139}
140
141
142/*
143 * Get the combing class.
144 *
145 * Similar to CFUniCharGetCombiningPropertyForCharacter.
146 */
147static inline u_int8_t
148get_combining_class(u_int16_t character) {
149 const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
150
151 u_int8_t value = bitmap[(character >> 8)];
152
153 if (value) {
154 bitmap = bitmap + (value * 256);
155 return bitmap[character % 256];
156 }
157 return (0);
158}
159
/* Forward declarations for helpers defined later in this file. */

/* Decomposition / composition */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);

/* Canonical ordering of combining characters */
static void priortysort(u_int16_t *characters, int count);

/* SFM (Services for Macintosh) private-use mappings */
static u_int16_t ucs_to_sfm(u_int16_t ucs_ch, int lastchar);
static u_int16_t sfm_to_ucs(u_int16_t ucs_ch);
169
/*
 * Number of continuation bytes that follow a UTF-8 lead byte, indexed
 * by the top five bits of that byte (byte >> 3).  -1 marks a byte that
 * cannot start a sequence.
 *
 * NOTE(review): the -1 entries are stored in plain char and compared
 * with "< 0" by the decoder, which relies on char being signed.
 */
char utf_extrabytes[32] = {
    /* 0x00 - 0x3F */  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x40 - 0x7F */  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x80 - 0xBF */ -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xC0 - 0xDF */  1,  1,  1,  1,
    /* 0xE0 - 0xEF */  2,  2,
    /* 0xF0 - 0xF7 */  3,
    /* 0xF8 - 0xFF */ -1
};
174
/* Upper-case hexadecimal digits, indexed by nibble value (0-15). */
const char hexdigits[16] = {
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'A', 'B', 'C', 'D', 'E', 'F'
};
179
180/*
181 * utf8_encodelen - Calculate the UTF-8 encoding length
182 *
183 * This function takes a Unicode input string, ucsp, of ucslen bytes
184 * and calculates the size of the UTF-8 output in bytes (not including
185 * a NULL termination byte). The string must reside in kernel memory.
186 *
187 * If '/' chars are possible in the Unicode input then an alternate
188 * (replacement) char should be provided in altslash.
189 *
190 * FLAGS
191 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
192 *
193 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
194 *
195 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
196 *
197 * UTF_DECOMPOSED: generate fully decomposed output
198 *
199 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
200 *
201 * ERRORS
202 * None
203 */
204size_t
205utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
206{
207 u_int16_t ucs_ch;
208 u_int16_t * chp = NULL;
209 u_int16_t sequence[8];
210 int extra = 0;
211 size_t charcnt;
212 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
213 int decompose = (flags & UTF_DECOMPOSED);
214 size_t len;
215
216 charcnt = ucslen / 2;
217 len = 0;
218
219 while (charcnt-- > 0) {
220 if (extra > 0) {
221 --extra;
222 ucs_ch = *chp++;
223 } else {
224 ucs_ch = *ucsp++;
225 if (swapbytes) {
226 ucs_ch = OSSwapInt16(ucs_ch);
227 }
228 if (ucs_ch == '/') {
229 ucs_ch = altslash ? altslash : '_';
230 } else if (ucs_ch == '\0') {
231 ucs_ch = UCS_ALT_NULL;
232 } else if (decompose && unicode_decomposeable(ucs_ch)) {
233 extra = unicode_decompose(ucs_ch, sequence) - 1;
234 charcnt += extra;
235 ucs_ch = sequence[0];
236 chp = &sequence[1];
237 }
238 }
239 len += UNICODE_TO_UTF8_LEN(ucs_ch);
240 }
241
242 return (len);
243}
244
245
246/*
247 * utf8_encodestr - Encodes a Unicode string to UTF-8
248 *
249 * NOTES:
250 * The resulting UTF-8 string is NULL terminated.
251 *
252 * If '/' chars are allowed on disk then an alternate
253 * (replacement) char must be provided in altslash.
254 *
255 * input flags:
256 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
257 *
258 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
259 *
260 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
261 *
262 * UTF_DECOMPOSED: generate fully decomposed output
263 *
264 * UTF_ADD_NULL_TERM: add NULL termination to UTF-8 output
265 *
266 * result:
267 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
268 *
269 * EINVAL: Illegal char found; char was replaced by an '_'.
270 */
271extern int
272utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
273 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
274{
275 u_int8_t * bufstart;
276 u_int8_t * bufend;
277 u_int16_t ucs_ch;
278 u_int16_t * chp = NULL;
279 u_int16_t sequence[8];
280 int extra = 0;
281 size_t charcnt;
282 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
283 int nullterm = (flags & UTF_ADD_NULL_TERM);
284 int decompose = (flags & UTF_DECOMPOSED);
285 int sfmconv = (flags & UTF_SFM_CONVERSIONS);
286 int result = 0;
287
288 bufstart = utf8p;
289 bufend = bufstart + buflen;
290 if (nullterm)
291 --bufend;
292 charcnt = ucslen / 2;
293
294 while (charcnt-- > 0) {
295 if (extra > 0) {
296 --extra;
297 ucs_ch = *chp++;
298 } else {
299 ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
300
301 if (decompose && unicode_decomposeable(ucs_ch)) {
302 extra = unicode_decompose(ucs_ch, sequence) - 1;
303 charcnt += extra;
304 ucs_ch = sequence[0];
305 chp = &sequence[1];
306 }
307 }
308
309 /* Slash and NULL are not permitted */
310 if (ucs_ch == '/') {
311 if (altslash)
312 ucs_ch = altslash;
313 else {
314 ucs_ch = '_';
315 result = EINVAL;
316 }
317 } else if (ucs_ch == '\0') {
318 ucs_ch = UCS_ALT_NULL;
319 }
320
321 if (ucs_ch < 0x0080) {
322 if (utf8p >= bufend) {
323 result = ENAMETOOLONG;
324 break;
325 }
326 *utf8p++ = ucs_ch;
327
328 } else if (ucs_ch < 0x800) {
329 if ((utf8p + 1) >= bufend) {
330 result = ENAMETOOLONG;
331 break;
332 }
333 *utf8p++ = 0xc0 | (ucs_ch >> 6);
334 *utf8p++ = 0x80 | (0x3f & ucs_ch);
335
336 } else {
337 /* These chars never valid Unicode. */
338 if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) {
339 result = EINVAL;
340 break;
341 }
342
343 /* Combine valid surrogate pairs */
344 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
345 && charcnt > 0) {
346 u_int16_t ch2;
347 u_int32_t pair;
348
349 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
350 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
351 pair = (u_int32_t)((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
352 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
353 if ((utf8p + 3) >= bufend) {
354 result = ENAMETOOLONG;
355 break;
356 }
357 --charcnt;
358 ++ucsp;
359 *utf8p++ = 0xf0 | (pair >> 18);
360 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
361 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
362 *utf8p++ = 0x80 | (0x3f & pair);
363 continue;
364 }
365 } else if (sfmconv) {
366 ucs_ch = sfm_to_ucs(ucs_ch);
367 if (ucs_ch < 0x0080) {
368 if (utf8p >= bufend) {
369 result = ENAMETOOLONG;
370 break;
371 }
372 *utf8p++ = ucs_ch;
373 continue;
374 }
375 }
376 if ((utf8p + 2) >= bufend) {
377 result = ENAMETOOLONG;
378 break;
379 }
380 *utf8p++ = 0xe0 | (ucs_ch >> 12);
381 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
382 *utf8p++ = 0x80 | (0x3f & ucs_ch);
383 }
384 }
385
386 *utf8len = utf8p - bufstart;
387 if (nullterm)
388 *utf8p++ = '\0';
389
390 return (result);
391}
392
393
394/*
395 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
396 *
397 * NOTES:
398 * The input UTF-8 string does not need to be null terminated
399 * if utf8len is set.
400 *
401 * If '/' chars are allowed on disk then an alternate
402 * (replacement) char must be provided in altslash.
403 *
404 * input flags:
405 * UTF_REV_ENDIAN: Unicode byte order is opposite current runtime
406 *
407 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
408 *
409 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
410 *
411 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
412 *
413 * UTF_PRECOMPOSED: generate precomposed output (NFC)
414 *
415 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
416 *
417 * result:
418 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
419 *
420 * EINVAL: Illegal UTF-8 sequence found.
421 */
422int
423utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
424 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
425{
426 u_int16_t* bufstart;
427 u_int16_t* bufend;
428 unsigned int ucs_ch;
429 unsigned int byte;
430 int combcharcnt = 0;
431 int result = 0;
432 int decompose, precompose, swapbytes, escaping;
433 int sfmconv;
434 int extrabytes;
435
436 decompose = (flags & UTF_DECOMPOSED);
437 precompose = (flags & UTF_PRECOMPOSED);
438 swapbytes = (flags & UTF_REVERSE_ENDIAN);
439 escaping = (flags & UTF_ESCAPE_ILLEGAL);
440 sfmconv = (flags & UTF_SFM_CONVERSIONS);
441
442 bufstart = ucsp;
443 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
444
445 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
446 if (ucsp >= bufend)
447 goto toolong;
448
449 /* check for ascii */
450 if (byte < 0x80) {
451 ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
452 } else {
453 u_int32_t ch;
454
455 extrabytes = utf_extrabytes[byte >> 3];
456 if ((extrabytes < 0) || ((int)utf8len < extrabytes)) {
457 goto escape;
458 }
459 utf8len -= extrabytes;
460
461 switch (extrabytes) {
462 case 1:
463 ch = byte; ch <<= 6; /* 1st byte */
464 byte = *utf8p++; /* 2nd byte */
465 if ((byte >> 6) != 2)
466 goto escape2;
467 ch += byte;
468 ch -= 0x00003080UL;
469 if (ch < 0x0080)
470 goto escape2;
471 ucs_ch = ch;
472 break;
473 case 2:
474 ch = byte; ch <<= 6; /* 1st byte */
475 byte = *utf8p++; /* 2nd byte */
476 if ((byte >> 6) != 2)
477 goto escape2;
478 ch += byte; ch <<= 6;
479 byte = *utf8p++; /* 3rd byte */
480 if ((byte >> 6) != 2)
481 goto escape3;
482 ch += byte;
483 ch -= 0x000E2080UL;
484 if (ch < 0x0800)
485 goto escape3;
486 if (ch >= 0xD800) {
487 if (ch <= 0xDFFF)
488 goto escape3;
489 if (ch == 0xFFFE || ch == 0xFFFF)
490 goto escape3;
491 }
492 ucs_ch = ch;
493 break;
494 case 3:
495 ch = byte; ch <<= 6; /* 1st byte */
496 byte = *utf8p++; /* 2nd byte */
497 if ((byte >> 6) != 2)
498 goto escape2;
499 ch += byte; ch <<= 6;
500 byte = *utf8p++; /* 3rd byte */
501 if ((byte >> 6) != 2)
502 goto escape3;
503 ch += byte; ch <<= 6;
504 byte = *utf8p++; /* 4th byte */
505 if ((byte >> 6) != 2)
506 goto escape4;
507 ch += byte;
508 ch -= 0x03C82080UL + SP_HALF_BASE;
509 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
510 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
511 goto escape4;
512 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
513 if (ucsp >= bufend)
514 goto toolong;
515 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
516 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
517 --ucsp;
518 goto escape4;
519 }
520 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
521 continue;
522 default:
523 result = EINVAL;
524 goto exit;
525 }
526 if (decompose) {
527 if (unicode_decomposeable(ucs_ch)) {
528 u_int16_t sequence[8] = {0};
529 int count, i;
530
531 /* Before decomposing a new unicode character, sort
532 * previous combining characters, if any, and reset
533 * the counter.
534 */
535 if (combcharcnt > 1) {
536 priortysort(ucsp - combcharcnt, combcharcnt);
537 }
538 combcharcnt = 0;
539
540 count = unicode_decompose(ucs_ch, sequence);
541 for (i = 0; i < count; ++i) {
542 ucs_ch = sequence[i];
543 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
544 if (ucsp >= bufend)
545 goto toolong;
546 }
547 combcharcnt += count - 1;
548 continue;
549 }
550 } else if (precompose && (ucsp != bufstart)) {
551 u_int16_t composite, base;
552
553 if (unicode_combinable(ucs_ch)) {
554 base = swapbytes ? OSSwapInt16(*(ucsp - 1)) : *(ucsp - 1);
555 composite = unicode_combine(base, ucs_ch);
556 if (composite) {
557 --ucsp;
558 ucs_ch = composite;
559 }
560 }
561 }
562 if (ucs_ch == UCS_ALT_NULL)
563 ucs_ch = '\0';
564 }
565 if (ucs_ch == altslash)
566 ucs_ch = '/';
567
568 /*
569 * Make multiple combining character sequences canonical
570 */
571 if (unicode_combinable(ucs_ch)) {
572 ++combcharcnt; /* start tracking a run */
573 } else if (combcharcnt) {
574 if (combcharcnt > 1) {
575 priortysort(ucsp - combcharcnt, combcharcnt);
576 }
577 combcharcnt = 0; /* start over */
578 }
579
580 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
581 continue;
582
583 /*
584 * Escape illegal UTF-8 into something legal.
585 */
586 escape4:
587 utf8p -= 3;
588 goto escape;
589 escape3:
590 utf8p -= 2;
591 goto escape;
592 escape2:
593 utf8p -= 1;
594 escape:
595 if (!escaping) {
596 result = EINVAL;
597 goto exit;
598 }
599 if (extrabytes > 0)
600 utf8len += extrabytes;
601 byte = *(utf8p - 1);
602
603 if ((ucsp + 2) >= bufend)
604 goto toolong;
605
606 /* Make a previous combining sequence canonical. */
607 if (combcharcnt > 1) {
608 priortysort(ucsp - combcharcnt, combcharcnt);
609 }
610 combcharcnt = 0;
611
612 ucs_ch = '%';
613 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
614 ucs_ch = hexdigits[byte >> 4];
615 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
616 ucs_ch = hexdigits[byte & 0x0F];
617 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
618 }
619 /*
620 * Make a previous combining sequence canonical
621 */
622 if (combcharcnt > 1) {
623 priortysort(ucsp - combcharcnt, combcharcnt);
624 }
625exit:
626 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
627
628 return (result);
629
630toolong:
631 result = ENAMETOOLONG;
632 goto exit;
633}
634
/*
 * Unicode 3.2 decomposition code (derived from Core Foundation)
 */

/* First code point of each Hangul block used for (de)composition. */
#define HANGUL_SBASE 0xAC00     /* precomposed syllables */
#define HANGUL_LBASE 0x1100     /* leading consonants */
#define HANGUL_VBASE 0x1161     /* vowels */
#define HANGUL_TBASE 0x11A7     /* trailing consonants (TBASE itself = none) */

/* Number of entries in each block. */
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)   /* syllables per leading consonant */
#define HANGUL_SCOUNT (HANGUL_LCOUNT * HANGUL_NCOUNT)   /* 11172 syllables in total */
649
650
/* Key/value entry of a sorted lookup table with 32-bit fields. */
typedef struct {
    u_int32_t _key;
    u_int32_t _value;
} unicode_mappings32;

/* Bit layout of a 16-bit decomposition table value. */
#define RECURSIVE_DECOMPOSITION (1 << 15)
#define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)

/* Key/value entry of a sorted lookup table with 16-bit fields. */
typedef struct {
    u_int16_t _key;
    u_int16_t _value;
} unicode_mappings16;

/*
 * Binary-search a table of 32-bit mappings sorted by ascending _key.
 *
 * theTable:  first entry of the table
 * numElem:   number of entries; 0 is allowed and never matches
 * character: key to look up
 *
 * Returns the matching entry's _value, or 0 when there is no match.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
                 u_int16_t character)
{
    const unicode_mappings32 *p, *q, *divider;

    /* An empty table has no first/last entry to range-check against. */
    if (numElem == 0)
        return (0);

    /* Quick reject for keys outside the table's range. */
    if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
        return (0);

    p = theTable;
    q = p + (numElem-1);
    while (p <= q) {
        divider = p + ((q - p) >> 1);    /* divide by 2 */
        if (character < divider->_key) { q = divider - 1; }
        else if (character > divider->_key) { p = divider + 1; }
        else { return (divider->_value); }
    }
    return (0);
}

/*
 * Binary-search a table of 16-bit mappings sorted by ascending _key.
 *
 * theTable:  first entry of the table
 * numElem:   number of entries; 0 is allowed and never matches
 * character: key to look up
 *
 * Returns the matching entry's _value, or 0 when there is no match.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
                 u_int16_t character)
{
    const unicode_mappings16 *p, *q, *divider;

    /* An empty table has no first/last entry to range-check against. */
    if (numElem == 0)
        return (0);

    /* Quick reject for keys outside the table's range. */
    if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
        return (0);

    p = theTable;
    q = p + (numElem-1);
    while (p <= q) {
        divider = p + ((q - p) >> 1);    /* divide by 2 */
        if (character < divider->_key)
            q = divider - 1;
        else if (character > divider->_key)
            p = divider + 1;
        else
            return (divider->_value);
    }
    return (0);
}
706
707static u_int32_t
708unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
709{
710 u_int16_t value;
711 u_int32_t length;
712 u_int16_t firstChar;
713 u_int16_t theChar;
714 const u_int16_t *bmpMappings;
715 u_int32_t usedLength;
716
717 value = getmappedvalue16(
718 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
719 __UniCharDecompositionTableLength, character);
720 length = EXTRACT_COUNT(value);
721 firstChar = value & 0x0FFF;
722 theChar = firstChar;
723 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
724 usedLength = 0;
725
726 if (value & RECURSIVE_DECOMPOSITION) {
727 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
728
729 --length; /* Decrement for the first char */
730 if (!usedLength)
731 return 0;
732 ++bmpMappings;
733 convertedChars += usedLength;
734 }
735
736 usedLength += length;
737
738 while (length--)
739 *(convertedChars++) = *(bmpMappings++);
740
741 return (usedLength);
742}
743
744/*
745 * unicode_decompose - decompose a composed Unicode char
746 *
747 * Composed Unicode characters are forbidden on
748 * HFS Plus volumes. ucs_decompose will convert a
749 * composed character into its correct decomposed
750 * sequence.
751 *
752 * Similar to CFUniCharDecomposeCharacter
753 */
754static int
755unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
756{
757 if ((character >= HANGUL_SBASE) &&
758 (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
759 u_int32_t length;
760
761 character -= HANGUL_SBASE;
762 length = (character % HANGUL_TCOUNT ? 3 : 2);
763
764 *(convertedChars++) =
765 character / HANGUL_NCOUNT + HANGUL_LBASE;
766 *(convertedChars++) =
767 (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
768 if (length > 2)
769 *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
770 return (length);
771 } else {
772 return (unicode_recursive_decompose(character, convertedChars));
773 }
774}
775
776/*
777 * unicode_combine - generate a precomposed Unicode char
778 *
779 * Precomposed Unicode characters are required for some volume
780 * formats and network protocols. unicode_combine will combine
781 * a decomposed character sequence into a single precomposed
782 * (composite) character.
783 *
784 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
785 * also handles Hangul Jamo characters.
786 */
787static u_int16_t
788unicode_combine(u_int16_t base, u_int16_t combining)
789{
790 u_int32_t value;
791
792 /* Check HANGUL */
793 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
794 /* 2 char Hangul sequences */
795 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
796 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
797 return (HANGUL_SBASE +
798 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
799 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
800 }
801
802 /* 3 char Hangul sequences */
803 if ((combining > HANGUL_TBASE) &&
804 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
805 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
806 return (0);
807 else
808 return (base + (combining - HANGUL_TBASE));
809 }
810 }
811
812 value = getmappedvalue32(
813 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
814 __CFUniCharPrecompositionTableLength, combining);
815
816 if (value) {
817 value = getmappedvalue16(
818 (const unicode_mappings16 *)
819 ((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
820 (value >> 16), base);
821 }
822 return (value);
823}
824
825
/*
 * priortysort - order combining chars into canonical order
 *
 * Bubble-sorts `count` code units in place by ascending combining
 * class.  A character of class 0 is never moved in front of its
 * predecessor.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
priortysort(u_int16_t* characters, int count)
{
    int swapped;

    do {
        u_int32_t prev_class;
        u_int32_t cur_class = get_combining_class(characters[0]);
        int i;

        swapped = 0;
        for (i = 1; i < count; i++) {
            prev_class = cur_class;
            cur_class = get_combining_class(characters[i]);

            if (prev_class > cur_class && cur_class != 0) {
                const u_int16_t held = characters[i - 1];

                characters[i - 1] = characters[i];
                characters[i] = held;
                swapped = 1;

                /*
                 * After the swap, slot i holds the character whose
                 * class is prev_class.  Carrying that forward is not
                 * needed for correctness, but lets a high-class
                 * character bubble past several lower ones in a
                 * single pass.
                 */
                cur_class = prev_class;
            }
        }
    } while (swapped);
}
870
871
/*
 * Invalid NTFS filename characters are encoded using the
 * SFM (Services for Macintosh) private use Unicode characters.
 *
 * These should only be used for SMB, MSDOS or NTFS.
 *
 *    Illegal NTFS Char   SFM Unicode Char
 *  ----------------------------------------
 *    0x01-0x1f           0xf001-0xf01f
 *    '"'                 0xf020
 *    '*'                 0xf021
 *    '/'                 0xf022
 *    '<'                 0xf023
 *    '>'                 0xf024
 *    '?'                 0xf025
 *    '\'                 0xf026
 *    '|'                 0xf027
 *    ' '                 0xf028  (Only if last char of the name)
 *    '.'                 0xf029  (Only if last char of the name)
 *  ----------------------------------------
 *
 *  Reference: http://support.microsoft.com/kb/q117258/
 */

#define MAX_SFM2MAC           0x29      /* highest low-byte value with a mapping */
#define SFMCODE_PREFIX_MASK   0xf000    /* high bits shared by all SFM codes */

/*
 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
 * SFM had no conversion for the colon. There is a conversion for the
 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
 * is a slash and a slash is a colon. So we can just replace the slash with the
 * colon in our tables and everything will just work.
 */

/* SFM code (low byte, 0x00-0x29) -> ASCII character. */
static const u_int8_t
sfm2mac[42] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,      /* 00 - 07 */
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,      /* 08 - 0F */
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,      /* 10 - 17 */
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,      /* 18 - 1F */
    0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,      /* 20 - 27 */
    0x20, 0x2e                                           /* 28 - 29 */
};

/*
 * ASCII character (0x20-0x7f, stored at index ch - 0x20) -> low byte of
 * its SFM code.  An entry equal to the character itself means the
 * character has no SFM mapping.
 */
static const u_int8_t
mac2sfm[96] = {
    0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,      /* 20 - 27 */
    0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,      /* 28 - 2f */
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,      /* 30 - 37 */
    0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,      /* 38 - 3f */
    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,      /* 40 - 47 */
    0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,      /* 48 - 4f */
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,      /* 50 - 57 */
    0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,      /* 58 - 5f */
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,      /* 60 - 67 */
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,      /* 68 - 6f */
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,      /* 70 - 77 */
    0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f       /* 78 - 7f */
};


/*
 * Encode illegal NTFS filename characters into SFM Private Unicode characters
 *
 * ucs_ch:   the character to encode; expected to be non-zero ASCII.
 *           Values above 0x7f have no table entry and are returned
 *           unchanged.
 * lastchar: non-zero when ucs_ch is the final character of the name.
 *
 * Returns the SFM code for the character, or the character itself when
 * no mapping applies.
 */
static u_int16_t
ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
{
    /* The last character of filename cannot be a space or period. */
    if (lastchar) {
        if (ucs_ch == 0x20)
            return (0xf028);
        else if (ucs_ch == 0x2e)
            return (0xf029);
    }
    /* 0x01 - 0x1f is simple transformation. */
    if (ucs_ch <= 0x1f) {
        return (ucs_ch | 0xf000);
    } else if (ucs_ch <= 0x7f) /* 0x20 - 0x7f */ {
        u_int16_t lsb;

        lsb = mac2sfm[ucs_ch - 0x0020];
        if (lsb != ucs_ch)
            return(0xf000 | lsb);
    }
    /* No mapping (or outside the table's range): pass through. */
    return (ucs_ch);
}

/*
 * Decode any SFM Private Unicode characters
 *
 * Characters 0xf001-0xf029 are mapped back to ASCII through sfm2mac[];
 * everything else is returned unchanged.  0xf000 is deliberately left
 * alone: it is outside the documented 0xf001-0xf01f range and its
 * table entry is NUL, which must not be emitted into a file name.
 */
static u_int16_t
sfm_to_ucs(u_int16_t ucs_ch)
{
    if ((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) {
        const u_int16_t code = ucs_ch & 0x003f;

        if (code != 0 && code <= MAX_SFM2MAC) {
            ucs_ch = sfm2mac[code];
        }
    }
    return (ucs_ch);
}
973