]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_utfconv.c
029b8f04e0ca13c94db98f7fcbeee3976b08d42f
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
1 /*
2 * Copyright (c) 2006 Apple Computer, Inc. All Rights Reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30
31 /*
32 Includes Unicode 3.2 decomposition code derived from Core Foundation
33 */
34
35 #include <sys/param.h>
36 #include <sys/utfconv.h>
37 #include <sys/errno.h>
38 #include <architecture/byte_order.h>
39
40 /*
41 * UTF-8 (Unicode Transformation Format)
42 *
43 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
44 * character as a sequence of one to four bytes. Only the shortest form
45 * required to represent the significant Unicode bits is legal.
46 *
47 * UTF-8 Multibyte Codes
48 *
49 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
50 * -----------------------------------------------------------------------------
51 * 1 7 0x0000 0x007F 0xxxxxxx
52 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
53 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
54 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
55 * -----------------------------------------------------------------------------
56 */
57
58
/*
 * Number of UTF-8 bytes counted for a single UTF-16 code unit:
 *   below 0x0080            -> 1
 *   below 0x0800            -> 2
 *   surrogate half (0xD800-0xDFFF) -> 2 (so a pair totals 4)
 *   anything else           -> 3
 * The argument is evaluated more than once.
 */
#define UNICODE_TO_UTF8_LEN(c) \
	(((c) < 0x0080) ? 1 : \
	 ((c) < 0x0800) ? 2 : \
	 ((((c) & 0xf800) == 0xd800) ? 2 : 3))

/* Code unit that stands in for an embedded NUL character */
#define UCS_ALT_NULL	0x2400

/* Surrogate pair arithmetic */
#define SP_HALF_SHIFT	10		/* payload bits per surrogate half */
#define SP_HALF_BASE	0x10000UL	/* first code point needing a pair */
#define SP_HALF_MASK	0x3FFUL		/* mask for one half's payload */

/* Surrogate ranges */
#define SP_HIGH_FIRST	0xD800UL
#define SP_HIGH_LAST	0xDBFFUL
#define SP_LOW_FIRST	0xDC00UL
#define SP_LOW_LAST	0xDFFFUL
73
74
75 #include "vfs_utfconvdata.h"
76
77
78 /*
79 * Test for a combining character.
80 *
81 * Similar to __CFUniCharIsNonBaseCharacter except that
82 * unicode_combinable also includes Hangul Jamo characters.
83 */
84 static inline int
85 unicode_combinable(u_int16_t character)
86 {
87 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
88 u_int8_t value;
89
90 if (character < 0x0300)
91 return (0);
92
93 value = bitmap[(character >> 8) & 0xFF];
94
95 if (value == 0xFF) {
96 return (1);
97 } else if (value) {
98 bitmap = bitmap + ((value - 1) * 32) + 256;
99 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
100 }
101 return (0);
102 }
103
104 /*
105 * Test for a precomposed character.
106 *
107 * Similar to __CFUniCharIsDecomposableCharacter.
108 */
109 static inline int
110 unicode_decomposeable(u_int16_t character) {
111 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
112 u_int8_t value;
113
114 if (character < 0x00C0)
115 return (0);
116
117 value = bitmap[(character >> 8) & 0xFF];
118
119 if (value == 0xFF) {
120 return (1);
121 } else if (value) {
122 bitmap = bitmap + ((value - 1) * 32) + 256;
123 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
124 }
125 return (0);
126 }
127
128
129 /*
130 * Get the combing class.
131 *
132 * Similar to CFUniCharGetCombiningPropertyForCharacter.
133 */
134 static inline u_int8_t
135 get_combining_class(u_int16_t character) {
136 const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
137
138 u_int8_t value = bitmap[(character >> 8)];
139
140 if (value) {
141 bitmap = bitmap + (value * 256);
142 return bitmap[character % 256];
143 }
144 return (0);
145 }
146
147
/* Normalization helpers defined later in this file */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
static void priortysort(u_int16_t* characters, int count);

/*
 * Number of bytes that follow a given UTF-8 first byte, indexed by the
 * top five bits of that byte (byte >> 3).  A value of -1 marks a byte
 * that cannot start a sequence.
 */
char utf_extrabytes[32] = {
	/* 0x00 - 0x3F */
	 0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x40 - 0x7F */
	 0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x80 - 0xBF */
	-1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 - 0xDF, 0xE0 - 0xEF, 0xF0 - 0xF7, 0xF8 - 0xFF */
	 1,  1,  1,  1,  2,  2,  3, -1
};
158
159
160 /*
161 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
162 *
163 * NOTES:
164 * If '/' chars are allowed on disk then an alternate
165 * (replacement) char must be provided in altslash.
166 *
167 * input flags:
168 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
169 */
170 size_t
171 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
172 int flags)
173 {
174 u_int16_t ucs_ch;
175 int charcnt;
176 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
177 size_t len;
178
179 charcnt = ucslen / 2;
180 len = 0;
181
182 while (charcnt-- > 0) {
183 ucs_ch = *ucsp++;
184
185 if (swapbytes)
186 ucs_ch = NXSwapShort(ucs_ch);
187 if (ucs_ch == '/')
188 ucs_ch = altslash ? altslash : '_';
189 else if (ucs_ch == '\0')
190 ucs_ch = UCS_ALT_NULL;
191
192 len += UNICODE_TO_UTF8_LEN(ucs_ch);
193 }
194
195 return (len);
196 }
197
198
199 /*
200 * utf8_encodestr - Encodes a Unicode string to UTF-8
201 *
202 * NOTES:
203 * The resulting UTF-8 string is NULL terminated.
204 *
205 * If '/' chars are allowed on disk then an alternate
206 * (replacement) char must be provided in altslash.
207 *
208 * input flags:
209 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
210 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
211 *
212 * result:
213 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
214 * EINVAL: Illegal char found; char was replaced by an '_'.
215 */
216 int
217 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
218 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
219 {
220 u_int8_t * bufstart;
221 u_int8_t * bufend;
222 u_int16_t ucs_ch;
223 u_int16_t * chp = NULL;
224 u_int16_t sequence[8];
225 int extra = 0;
226 int charcnt;
227 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
228 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
229 int decompose = (flags & UTF_DECOMPOSED);
230 int result = 0;
231
232 bufstart = utf8p;
233 bufend = bufstart + buflen;
234 if (nullterm)
235 --bufend;
236 charcnt = ucslen / 2;
237
238 while (charcnt-- > 0) {
239 if (extra > 0) {
240 --extra;
241 ucs_ch = *chp++;
242 } else {
243 ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
244
245 if (decompose && unicode_decomposeable(ucs_ch)) {
246 extra = unicode_decompose(ucs_ch, sequence) - 1;
247 charcnt += extra;
248 ucs_ch = sequence[0];
249 chp = &sequence[1];
250 }
251 }
252
253 /* Slash and NULL are not permitted */
254 if (ucs_ch == '/') {
255 if (altslash)
256 ucs_ch = altslash;
257 else {
258 ucs_ch = '_';
259 result = EINVAL;
260 }
261 } else if (ucs_ch == '\0') {
262 ucs_ch = UCS_ALT_NULL;
263 }
264
265 if (ucs_ch < 0x0080) {
266 if (utf8p >= bufend) {
267 result = ENAMETOOLONG;
268 break;
269 }
270 *utf8p++ = ucs_ch;
271
272 } else if (ucs_ch < 0x800) {
273 if ((utf8p + 1) >= bufend) {
274 result = ENAMETOOLONG;
275 break;
276 }
277 *utf8p++ = 0xc0 | (ucs_ch >> 6);
278 *utf8p++ = 0x80 | (0x3f & ucs_ch);
279
280 } else {
281 /* Combine valid surrogate pairs */
282 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
283 && charcnt > 0) {
284 u_int16_t ch2;
285 u_int32_t pair;
286
287 ch2 = swapbytes ? NXSwapShort(*ucsp) : *ucsp;
288 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
289 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
290 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
291 if ((utf8p + 3) >= bufend) {
292 result = ENAMETOOLONG;
293 break;
294 }
295 --charcnt;
296 ++ucsp;
297 *utf8p++ = 0xf0 | (pair >> 18);
298 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
299 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
300 *utf8p++ = 0x80 | (0x3f & pair);
301 continue;
302 }
303 }
304 if ((utf8p + 2) >= bufend) {
305 result = ENAMETOOLONG;
306 break;
307 }
308 *utf8p++ = 0xe0 | (ucs_ch >> 12);
309 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
310 *utf8p++ = 0x80 | (0x3f & ucs_ch);
311 }
312 }
313
314 *utf8len = utf8p - bufstart;
315 if (nullterm)
316 *utf8p++ = '\0';
317
318 return (result);
319 }
320
321
322 /*
323 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
324 *
325 * NOTES:
326 * The input UTF-8 string does not need to be null terminated
327 * if utf8len is set.
328 *
329 * If '/' chars are allowed on disk then an alternate
330 * (replacement) char must be provided in altslash.
331 *
332 * input flags:
333 * UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime
334 * UTF_DECOMPOSED: Unicode output string must be fully decompsed
335 *
336 * result:
337 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
338 * EINVAL: Illegal UTF-8 sequence found.
339 */
340 int
341 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
342 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
343 {
344 u_int16_t* bufstart;
345 u_int16_t* bufend;
346 unsigned int ucs_ch;
347 unsigned int byte;
348 int combcharcnt = 0;
349 int result = 0;
350 int decompose, precompose, swapbytes;
351
352 decompose = (flags & UTF_DECOMPOSED);
353 precompose = (flags & UTF_PRECOMPOSED);
354 swapbytes = (flags & UTF_REVERSE_ENDIAN);
355
356 bufstart = ucsp;
357 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
358
359 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
360 if (ucsp >= bufend)
361 goto toolong;
362
363 /* check for ascii */
364 if (byte < 0x80) {
365 ucs_ch = byte; /* 1st byte */
366 } else {
367 u_int32_t ch;
368 int extrabytes = utf_extrabytes[byte >> 3];
369
370 if (utf8len < extrabytes)
371 goto invalid;
372 utf8len -= extrabytes;
373
374 switch (extrabytes) {
375 case 1:
376 ch = byte; ch <<= 6; /* 1st byte */
377 byte = *utf8p++; /* 2nd byte */
378 if ((byte >> 6) != 2)
379 goto invalid;
380 ch += byte;
381 ch -= 0x00003080UL;
382 if (ch < 0x0080)
383 goto invalid;
384 ucs_ch = ch;
385 break;
386 case 2:
387 ch = byte; ch <<= 6; /* 1st byte */
388 byte = *utf8p++; /* 2nd byte */
389 if ((byte >> 6) != 2)
390 goto invalid;
391 ch += byte; ch <<= 6;
392 byte = *utf8p++; /* 3rd byte */
393 if ((byte >> 6) != 2)
394 goto invalid;
395 ch += byte;
396 ch -= 0x000E2080UL;
397 if (ch < 0x0800)
398 goto invalid;
399 if (ch >= 0xD800) {
400 if (ch <= 0xDFFF)
401 goto invalid;
402 if (ch == 0xFFFE || ch == 0xFFFF)
403 goto invalid;
404 }
405 ucs_ch = ch;
406 break;
407 case 3:
408 ch = byte; ch <<= 6; /* 1st byte */
409 byte = *utf8p++; /* 2nd byte */
410 if ((byte >> 6) != 2)
411 goto invalid;
412 ch += byte; ch <<= 6;
413 byte = *utf8p++; /* 3rd byte */
414 if ((byte >> 6) != 2)
415 goto invalid;
416 ch += byte; ch <<= 6;
417 byte = *utf8p++; /* 4th byte */
418 if ((byte >> 6) != 2)
419 goto invalid;
420 ch += byte;
421 ch -= 0x03C82080UL + SP_HALF_BASE;
422 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
423 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
424 goto invalid;
425 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
426 if (ucsp >= bufend)
427 goto toolong;
428 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
429 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
430 goto invalid;
431 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
432 continue;
433 default:
434 goto invalid;
435 }
436 if (decompose) {
437 if (unicode_decomposeable(ucs_ch)) {
438 u_int16_t sequence[8];
439 int count, i;
440
441 count = unicode_decompose(ucs_ch, sequence);
442
443 for (i = 0; i < count; ++i) {
444 ucs_ch = sequence[i];
445 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
446 if (ucsp >= bufend)
447 goto toolong;
448 }
449 combcharcnt += count - 1;
450 continue;
451 }
452 } else if (precompose && (ucsp != bufstart)) {
453 u_int16_t composite, base;
454
455 if (unicode_combinable(ucs_ch)) {
456 base = swapbytes ? NXSwapShort(*(ucsp - 1)) : *(ucsp - 1);
457 composite = unicode_combine(base, ucs_ch);
458 if (composite) {
459 --ucsp;
460 ucs_ch = composite;
461 }
462 }
463 }
464 if (ucs_ch == UCS_ALT_NULL)
465 ucs_ch = '\0';
466 }
467 if (ucs_ch == altslash)
468 ucs_ch = '/';
469
470 /*
471 * Make multiple combining character sequences canonical
472 */
473 if (unicode_combinable(ucs_ch)) {
474 ++combcharcnt; /* start tracking a run */
475 } else if (combcharcnt) {
476 if (combcharcnt > 1) {
477 priortysort(ucsp - combcharcnt, combcharcnt);
478 }
479 combcharcnt = 0; /* start over */
480 }
481 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
482 }
483 /*
484 * Make a previous combining sequence canonical
485 */
486 if (combcharcnt > 1) {
487 priortysort(ucsp - combcharcnt, combcharcnt);
488 }
489
490 exit:
491 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
492
493 return (result);
494
495 invalid:
496 result = EINVAL;
497 goto exit;
498
499 toolong:
500 result = ENAMETOOLONG;
501 goto exit;
502 }
503
504
505 /*
506 * utf8_validatestr - Check for a valid UTF-8 string.
507 */
508 int
509 utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
510 {
511 unsigned int byte;
512 u_int32_t ch;
513 unsigned int ucs_ch;
514 size_t extrabytes;
515
516 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
517 if (byte < 0x80)
518 continue; /* plain ascii */
519
520 extrabytes = utf_extrabytes[byte >> 3];
521
522 if (utf8len < extrabytes)
523 goto invalid;
524 utf8len -= extrabytes;
525
526 switch (extrabytes) {
527 case 1:
528 ch = byte; ch <<= 6; /* 1st byte */
529 byte = *utf8p++; /* 2nd byte */
530 if ((byte >> 6) != 2)
531 goto invalid;
532 ch += byte;
533 ch -= 0x00003080UL;
534 if (ch < 0x0080)
535 goto invalid;
536 break;
537 case 2:
538 ch = byte; ch <<= 6; /* 1st byte */
539 byte = *utf8p++; /* 2nd byte */
540 if ((byte >> 6) != 2)
541 goto invalid;
542 ch += byte; ch <<= 6;
543 byte = *utf8p++; /* 3rd byte */
544 if ((byte >> 6) != 2)
545 goto invalid;
546 ch += byte;
547 ch -= 0x000E2080UL;
548 if (ch < 0x0800)
549 goto invalid;
550 if (ch >= 0xD800) {
551 if (ch <= 0xDFFF)
552 goto invalid;
553 if (ch == 0xFFFE || ch == 0xFFFF)
554 goto invalid;
555 }
556 break;
557 case 3:
558 ch = byte; ch <<= 6; /* 1st byte */
559 byte = *utf8p++; /* 2nd byte */
560 if ((byte >> 6) != 2)
561 goto invalid;
562 ch += byte; ch <<= 6;
563 byte = *utf8p++; /* 3rd byte */
564 if ((byte >> 6) != 2)
565 goto invalid;
566 ch += byte; ch <<= 6;
567 byte = *utf8p++; /* 4th byte */
568 if ((byte >> 6) != 2)
569 goto invalid;
570 ch += byte;
571 ch -= 0x03C82080UL + SP_HALF_BASE;
572 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
573 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
574 goto invalid;
575 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
576 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
577 goto invalid;
578 break;
579 default:
580 goto invalid;
581 }
582
583 }
584 return (0);
585 invalid:
586 return (EINVAL);
587 }
588
589
/*
 * Unicode 3.2 decomposition code (derived from Core Foundation)
 */

/* One entry of a lookup table with 32-bit keys, sorted by ascending _key */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * getmappedvalue32 - binary search a unicode_mappings32 table
 *
 * theTable must hold numElem entries sorted by ascending _key.
 * Returns the _value stored for character, or 0 when the table is
 * empty or has no entry for character.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
                 u_int16_t character)
{
	const unicode_mappings32 *p, *q, *divider;

	/* An empty table has no first or last entry to compare against. */
	if (numElem == 0)
		return (0);

	/* Quick reject for keys outside the table's range. */
	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key)
			q = divider - 1;
		else if (character > divider->_key)
			p = divider + 1;
		else
			return (divider->_value);
	}
	return (0);
}
618
/* Layout of a 16-bit decomposition table value */
#define RECURSIVE_DECOMPOSITION (1 << 15)	/* first result decomposes further */
#define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)	/* bits 12-14 */

/* One entry of a lookup table with 16-bit keys, sorted by ascending _key */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * getmappedvalue16 - binary search a unicode_mappings16 table
 *
 * theTable must hold numElem entries sorted by ascending _key.
 * Returns the _value stored for character, or 0 when the table is
 * empty or has no entry for character.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
                 u_int16_t character)
{
	const unicode_mappings16 *p, *q, *divider;

	/* An empty table has no first or last entry to compare against. */
	if (numElem == 0)
		return (0);

	/* Quick reject for keys outside the table's range. */
	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	p = theTable;
	q = p + (numElem-1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);	/* divide by 2 */
		if (character < divider->_key)
			q = divider - 1;
		else if (character > divider->_key)
			p = divider + 1;
		else
			return (divider->_value);
	}
	return (0);
}
649
650
651 static u_int32_t
652 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
653 {
654 u_int16_t value;
655 u_int32_t length;
656 u_int16_t firstChar;
657 u_int16_t theChar;
658 const u_int16_t *bmpMappings;
659 u_int32_t usedLength;
660
661 value = getmappedvalue16(
662 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
663 __UniCharDecompositionTableLength, character);
664 length = EXTRACT_COUNT(value);
665 firstChar = value & 0x0FFF;
666 theChar = firstChar;
667 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
668 usedLength = 0;
669
670 if (value & RECURSIVE_DECOMPOSITION) {
671 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
672
673 --length; /* Decrement for the first char */
674 if (!usedLength)
675 return 0;
676 ++bmpMappings;
677 convertedChars += usedLength;
678 }
679
680 usedLength += length;
681
682 while (length--)
683 *(convertedChars++) = *(bmpMappings++);
684
685 return (usedLength);
686 }
687
#define HANGUL_SBASE 0xAC00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11A7

#define HANGUL_SCOUNT 11172
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)

/*
 * unicode_decompose - decompose a composed Unicode char
 *
 * Composed Unicode characters are forbidden on
 * HFS Plus volumes.  unicode_decompose will convert a
 * composed character into its correct decomposed
 * sequence.
 *
 * Hangul syllables (HANGUL_SBASE up to, but not including,
 * HANGUL_SBASE + HANGUL_SCOUNT) are split arithmetically into two
 * or three jamo; every other character goes through the
 * decomposition tables.
 *
 * The sequence is stored in convertedChars and its length is
 * returned.
 *
 * Similar to CFUniCharDecomposeCharacter
 */
static int
unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
{
	/*
	 * The upper bound is exclusive: HANGUL_SBASE + HANGUL_SCOUNT is
	 * one past the last syllable, matching the range test in
	 * unicode_combine.
	 */
	if ((character >= HANGUL_SBASE) &&
	    (character < (HANGUL_SBASE + HANGUL_SCOUNT))) {
		u_int32_t length;

		character -= HANGUL_SBASE;
		/* A zero trailing index means there is no third jamo. */
		length = (character % HANGUL_TCOUNT ? 3 : 2);

		*(convertedChars++) =
		    character / HANGUL_NCOUNT + HANGUL_LBASE;
		*(convertedChars++) =
		    (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
		if (length > 2)
			*convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
		return (length);
	} else {
		return (unicode_recursive_decompose(character, convertedChars));
	}
}
730
731 /*
732 * unicode_combine - generate a precomposed Unicode char
733 *
734 * Precomposed Unicode characters are required for some volume
735 * formats and network protocols. unicode_combine will combine
736 * a decomposed character sequence into a single precomposed
737 * (composite) character.
738 *
739 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
740 * also handles Hangul Jamo characters.
741 */
742 static u_int16_t
743 unicode_combine(u_int16_t base, u_int16_t combining)
744 {
745 u_int32_t value;
746
747 /* Check HANGUL */
748 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
749 /* 2 char Hangul sequences */
750 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
751 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
752 return (HANGUL_SBASE +
753 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
754 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
755 }
756
757 /* 3 char Hangul sequences */
758 if ((combining > HANGUL_TBASE) &&
759 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
760 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
761 return (0);
762 else
763 return (base + (combining - HANGUL_TBASE));
764 }
765 }
766
767 value = getmappedvalue32(
768 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
769 __CFUniCharPrecompositionTableLength, combining);
770
771 if (value) {
772 value = getmappedvalue16(
773 (const unicode_mappings16 *)
774 ((u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
775 (value >> 16), base);
776 }
777 return (value);
778 }
779
780
/*
 * priortysort - order combining chars into canonical order
 *
 * Bubble sorts the count characters at characters by ascending
 * combining class, in place.  Adjacent characters are exchanged only
 * when the first has the higher class and the second has a non-zero
 * class, so a character of class 0 is never moved ahead of the
 * character before it and characters of equal class keep their
 * relative order.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
priortysort(u_int16_t* characters, int count)
{
	u_int32_t p1, p2;
	u_int16_t *ch1, *ch2;
	u_int16_t *end;
	int changes;

	/* Fewer than two characters are already in order. */
	if (count < 2)
		return;

	end = characters + count;
	do {
		changes = 0;
		ch1 = characters;
		ch2 = characters + 1;
		p2 = get_combining_class(*ch1);
		while (ch2 < end) {
			p1 = p2;
			p2 = get_combining_class(*ch2);
			if (p1 > p2 && p2 != 0) {
				u_int16_t tmp;

				tmp = *ch1;
				*ch1 = *ch2;
				*ch2 = tmp;
				changes = 1;

				/*
				 * Keep p2 in step with the character now at
				 * *ch2 so a high class character can keep
				 * moving within this pass.
				 */
				p2 = p1;
			}
			++ch1;
			++ch2;
		}
	} while (changes);
}
813 }
814 } while (changes);
815 }