]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_utfconv.c
xnu-792.21.3.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 Includes Unicode 3.2 decomposition code derived from Core Foundation
31 */
32
33 #include <sys/param.h>
34 #include <sys/utfconv.h>
35 #include <sys/errno.h>
36 #include <architecture/byte_order.h>
37
38 /*
39 * UTF-8 (Unicode Transformation Format)
40 *
41 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
42 * character as a sequence of one to four bytes. Only the shortest form
43 * required to represent the significant Unicode bits is legal.
44 *
45 * UTF-8 Multibyte Codes
46 *
47 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
48 * -----------------------------------------------------------------------------
49 * 1 7 0x0000 0x007F 0xxxxxxx
50 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
51 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
52 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
53 * -----------------------------------------------------------------------------
54 */
55
56
/*
 * Number of UTF-8 bytes contributed by a single 16-bit Unicode unit.
 *
 * Any surrogate half (0xD800-0xDFFF) is counted as 2 so that the two
 * halves of a surrogate pair add up to 4 bytes.
 *
 * Note: the argument is evaluated more than once.
 */
#define UNICODE_TO_UTF8_LEN(c)  \
	(((c) < 0x0080) ? 1 :  \
	 ((((c) < 0x0800) || (((c) & 0xf800) == 0xd800)) ? 2 : 3))

/* Substituted for an embedded NUL character when encoding */
#define UCS_ALT_NULL	0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT	10
#define SP_HALF_BASE	0x0010000UL
#define SP_HALF_MASK	0x3FFUL

#define SP_HIGH_FIRST	0xD800UL
#define SP_HIGH_LAST	0xDBFFUL
#define SP_LOW_FIRST	0xDC00UL
#define SP_LOW_LAST	0xDFFFUL
71
72
73 #include "vfs_utfconvdata.h"
74
75
76 /*
77 * Test for a combining character.
78 *
79 * Similar to __CFUniCharIsNonBaseCharacter except that
80 * unicode_combinable also includes Hangul Jamo characters.
81 */
82 static inline int
83 unicode_combinable(u_int16_t character)
84 {
85 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
86 u_int8_t value;
87
88 if (character < 0x0300)
89 return (0);
90
91 value = bitmap[(character >> 8) & 0xFF];
92
93 if (value == 0xFF) {
94 return (1);
95 } else if (value) {
96 bitmap = bitmap + ((value - 1) * 32) + 256;
97 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
98 }
99 return (0);
100 }
101
102 /*
103 * Test for a precomposed character.
104 *
105 * Similar to __CFUniCharIsDecomposableCharacter.
106 */
107 static inline int
108 unicode_decomposeable(u_int16_t character) {
109 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
110 u_int8_t value;
111
112 if (character < 0x00C0)
113 return (0);
114
115 value = bitmap[(character >> 8) & 0xFF];
116
117 if (value == 0xFF) {
118 return (1);
119 } else if (value) {
120 bitmap = bitmap + ((value - 1) * 32) + 256;
121 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
122 }
123 return (0);
124 }
125
126
127 /*
128 * Get the combing class.
129 *
130 * Similar to CFUniCharGetCombiningPropertyForCharacter.
131 */
132 static inline u_int8_t
133 get_combining_class(u_int16_t character) {
134 const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
135
136 u_int8_t value = bitmap[(character >> 8)];
137
138 if (value) {
139 bitmap = bitmap + (value * 256);
140 return bitmap[character % 256];
141 }
142 return (0);
143 }
144
145
/* Forward declarations of the normalization helpers defined further down */
static int       unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
static void      priortysort(u_int16_t* characters, int count);

/*
 * Number of bytes that follow a UTF-8 lead byte, indexed by the top
 * five bits of that byte (byte >> 3).  Entries holding -1 mark byte
 * values that cannot start a sequence (0x80-0xBF and 0xF8-0xFF).
 */
char utf_extrabytes[32] = {
	/* 0x00 - 0x3F */	 0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x40 - 0x7F */	 0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x80 - 0xBF */	-1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 - 0xFF */	 1,  1,  1,  1,  2,  2,  3, -1
};
156
157
158 /*
159 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
160 *
161 * NOTES:
162 * If '/' chars are allowed on disk then an alternate
163 * (replacement) char must be provided in altslash.
164 *
165 * input flags:
166 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
167 */
168 size_t
169 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
170 int flags)
171 {
172 u_int16_t ucs_ch;
173 int charcnt;
174 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
175 size_t len;
176
177 charcnt = ucslen / 2;
178 len = 0;
179
180 while (charcnt-- > 0) {
181 ucs_ch = *ucsp++;
182
183 if (swapbytes)
184 ucs_ch = NXSwapShort(ucs_ch);
185 if (ucs_ch == '/')
186 ucs_ch = altslash ? altslash : '_';
187 else if (ucs_ch == '\0')
188 ucs_ch = UCS_ALT_NULL;
189
190 len += UNICODE_TO_UTF8_LEN(ucs_ch);
191 }
192
193 return (len);
194 }
195
196
197 /*
198 * utf8_encodestr - Encodes a Unicode string to UTF-8
199 *
200 * NOTES:
201 * The resulting UTF-8 string is NULL terminated.
202 *
203 * If '/' chars are allowed on disk then an alternate
204 * (replacement) char must be provided in altslash.
205 *
206 * input flags:
207 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
208 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
209 *
210 * result:
211 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
212 * EINVAL: Illegal char found; char was replaced by an '_'.
213 */
214 int
215 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
216 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
217 {
218 u_int8_t * bufstart;
219 u_int8_t * bufend;
220 u_int16_t ucs_ch;
221 u_int16_t * chp = NULL;
222 u_int16_t sequence[8];
223 int extra = 0;
224 int charcnt;
225 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
226 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
227 int decompose = (flags & UTF_DECOMPOSED);
228 int result = 0;
229
230 bufstart = utf8p;
231 bufend = bufstart + buflen;
232 if (nullterm)
233 --bufend;
234 charcnt = ucslen / 2;
235
236 while (charcnt-- > 0) {
237 if (extra > 0) {
238 --extra;
239 ucs_ch = *chp++;
240 } else {
241 ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
242
243 if (decompose && unicode_decomposeable(ucs_ch)) {
244 extra = unicode_decompose(ucs_ch, sequence) - 1;
245 charcnt += extra;
246 ucs_ch = sequence[0];
247 chp = &sequence[1];
248 }
249 }
250
251 /* Slash and NULL are not permitted */
252 if (ucs_ch == '/') {
253 if (altslash)
254 ucs_ch = altslash;
255 else {
256 ucs_ch = '_';
257 result = EINVAL;
258 }
259 } else if (ucs_ch == '\0') {
260 ucs_ch = UCS_ALT_NULL;
261 }
262
263 if (ucs_ch < 0x0080) {
264 if (utf8p >= bufend) {
265 result = ENAMETOOLONG;
266 break;
267 }
268 *utf8p++ = ucs_ch;
269
270 } else if (ucs_ch < 0x800) {
271 if ((utf8p + 1) >= bufend) {
272 result = ENAMETOOLONG;
273 break;
274 }
275 *utf8p++ = 0xc0 | (ucs_ch >> 6);
276 *utf8p++ = 0x80 | (0x3f & ucs_ch);
277
278 } else {
279 /* Combine valid surrogate pairs */
280 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
281 && charcnt > 0) {
282 u_int16_t ch2;
283 u_int32_t pair;
284
285 ch2 = swapbytes ? NXSwapShort(*ucsp) : *ucsp;
286 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
287 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
288 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
289 if ((utf8p + 3) >= bufend) {
290 result = ENAMETOOLONG;
291 break;
292 }
293 --charcnt;
294 ++ucsp;
295 *utf8p++ = 0xf0 | (pair >> 18);
296 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
297 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
298 *utf8p++ = 0x80 | (0x3f & pair);
299 continue;
300 }
301 }
302 if ((utf8p + 2) >= bufend) {
303 result = ENAMETOOLONG;
304 break;
305 }
306 *utf8p++ = 0xe0 | (ucs_ch >> 12);
307 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
308 *utf8p++ = 0x80 | (0x3f & ucs_ch);
309 }
310 }
311
312 *utf8len = utf8p - bufstart;
313 if (nullterm)
314 *utf8p++ = '\0';
315
316 return (result);
317 }
318
319
320 /*
321 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
322 *
323 * NOTES:
324 * The input UTF-8 string does not need to be null terminated
325 * if utf8len is set.
326 *
327 * If '/' chars are allowed on disk then an alternate
328 * (replacement) char must be provided in altslash.
329 *
330 * input flags:
331 * UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime
332 * UTF_DECOMPOSED: Unicode output string must be fully decompsed
333 *
334 * result:
335 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
336 * EINVAL: Illegal UTF-8 sequence found.
337 */
338 int
339 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
340 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
341 {
342 u_int16_t* bufstart;
343 u_int16_t* bufend;
344 unsigned int ucs_ch;
345 unsigned int byte;
346 int combcharcnt = 0;
347 int result = 0;
348 int decompose, precompose, swapbytes;
349
350 decompose = (flags & UTF_DECOMPOSED);
351 precompose = (flags & UTF_PRECOMPOSED);
352 swapbytes = (flags & UTF_REVERSE_ENDIAN);
353
354 bufstart = ucsp;
355 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
356
357 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
358 if (ucsp >= bufend)
359 goto toolong;
360
361 /* check for ascii */
362 if (byte < 0x80) {
363 ucs_ch = byte; /* 1st byte */
364 } else {
365 u_int32_t ch;
366 int extrabytes = utf_extrabytes[byte >> 3];
367
368 if (utf8len < extrabytes)
369 goto invalid;
370 utf8len -= extrabytes;
371
372 switch (extrabytes) {
373 case 1:
374 ch = byte; ch <<= 6; /* 1st byte */
375 byte = *utf8p++; /* 2nd byte */
376 if ((byte >> 6) != 2)
377 goto invalid;
378 ch += byte;
379 ch -= 0x00003080UL;
380 if (ch < 0x0080)
381 goto invalid;
382 ucs_ch = ch;
383 break;
384 case 2:
385 ch = byte; ch <<= 6; /* 1st byte */
386 byte = *utf8p++; /* 2nd byte */
387 if ((byte >> 6) != 2)
388 goto invalid;
389 ch += byte; ch <<= 6;
390 byte = *utf8p++; /* 3rd byte */
391 if ((byte >> 6) != 2)
392 goto invalid;
393 ch += byte;
394 ch -= 0x000E2080UL;
395 if (ch < 0x0800)
396 goto invalid;
397 if (ch >= 0xD800) {
398 if (ch <= 0xDFFF)
399 goto invalid;
400 if (ch == 0xFFFE || ch == 0xFFFF)
401 goto invalid;
402 }
403 ucs_ch = ch;
404 break;
405 case 3:
406 ch = byte; ch <<= 6; /* 1st byte */
407 byte = *utf8p++; /* 2nd byte */
408 if ((byte >> 6) != 2)
409 goto invalid;
410 ch += byte; ch <<= 6;
411 byte = *utf8p++; /* 3rd byte */
412 if ((byte >> 6) != 2)
413 goto invalid;
414 ch += byte; ch <<= 6;
415 byte = *utf8p++; /* 4th byte */
416 if ((byte >> 6) != 2)
417 goto invalid;
418 ch += byte;
419 ch -= 0x03C82080UL + SP_HALF_BASE;
420 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
421 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
422 goto invalid;
423 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
424 if (ucsp >= bufend)
425 goto toolong;
426 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
427 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
428 goto invalid;
429 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
430 continue;
431 default:
432 goto invalid;
433 }
434 if (decompose) {
435 if (unicode_decomposeable(ucs_ch)) {
436 u_int16_t sequence[8];
437 int count, i;
438
439 count = unicode_decompose(ucs_ch, sequence);
440
441 for (i = 0; i < count; ++i) {
442 ucs_ch = sequence[i];
443 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
444 if (ucsp >= bufend)
445 goto toolong;
446 }
447 combcharcnt += count - 1;
448 continue;
449 }
450 } else if (precompose && (ucsp != bufstart)) {
451 u_int16_t composite, base;
452
453 if (unicode_combinable(ucs_ch)) {
454 base = swapbytes ? NXSwapShort(*(ucsp - 1)) : *(ucsp - 1);
455 composite = unicode_combine(base, ucs_ch);
456 if (composite) {
457 --ucsp;
458 ucs_ch = composite;
459 }
460 }
461 }
462 if (ucs_ch == UCS_ALT_NULL)
463 ucs_ch = '\0';
464 }
465 if (ucs_ch == altslash)
466 ucs_ch = '/';
467
468 /*
469 * Make multiple combining character sequences canonical
470 */
471 if (unicode_combinable(ucs_ch)) {
472 ++combcharcnt; /* start tracking a run */
473 } else if (combcharcnt) {
474 if (combcharcnt > 1) {
475 priortysort(ucsp - combcharcnt, combcharcnt);
476 }
477 combcharcnt = 0; /* start over */
478 }
479 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
480 }
481 /*
482 * Make a previous combining sequence canonical
483 */
484 if (combcharcnt > 1) {
485 priortysort(ucsp - combcharcnt, combcharcnt);
486 }
487
488 exit:
489 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
490
491 return (result);
492
493 invalid:
494 result = EINVAL;
495 goto exit;
496
497 toolong:
498 result = ENAMETOOLONG;
499 goto exit;
500 }
501
502
503 /*
504 * utf8_validatestr - Check for a valid UTF-8 string.
505 */
506 int
507 utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
508 {
509 unsigned int byte;
510 u_int32_t ch;
511 unsigned int ucs_ch;
512 size_t extrabytes;
513
514 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
515 if (byte < 0x80)
516 continue; /* plain ascii */
517
518 extrabytes = utf_extrabytes[byte >> 3];
519
520 if (utf8len < extrabytes)
521 goto invalid;
522 utf8len -= extrabytes;
523
524 switch (extrabytes) {
525 case 1:
526 ch = byte; ch <<= 6; /* 1st byte */
527 byte = *utf8p++; /* 2nd byte */
528 if ((byte >> 6) != 2)
529 goto invalid;
530 ch += byte;
531 ch -= 0x00003080UL;
532 if (ch < 0x0080)
533 goto invalid;
534 break;
535 case 2:
536 ch = byte; ch <<= 6; /* 1st byte */
537 byte = *utf8p++; /* 2nd byte */
538 if ((byte >> 6) != 2)
539 goto invalid;
540 ch += byte; ch <<= 6;
541 byte = *utf8p++; /* 3rd byte */
542 if ((byte >> 6) != 2)
543 goto invalid;
544 ch += byte;
545 ch -= 0x000E2080UL;
546 if (ch < 0x0800)
547 goto invalid;
548 if (ch >= 0xD800) {
549 if (ch <= 0xDFFF)
550 goto invalid;
551 if (ch == 0xFFFE || ch == 0xFFFF)
552 goto invalid;
553 }
554 break;
555 case 3:
556 ch = byte; ch <<= 6; /* 1st byte */
557 byte = *utf8p++; /* 2nd byte */
558 if ((byte >> 6) != 2)
559 goto invalid;
560 ch += byte; ch <<= 6;
561 byte = *utf8p++; /* 3rd byte */
562 if ((byte >> 6) != 2)
563 goto invalid;
564 ch += byte; ch <<= 6;
565 byte = *utf8p++; /* 4th byte */
566 if ((byte >> 6) != 2)
567 goto invalid;
568 ch += byte;
569 ch -= 0x03C82080UL + SP_HALF_BASE;
570 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
571 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
572 goto invalid;
573 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
574 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
575 goto invalid;
576 break;
577 default:
578 goto invalid;
579 }
580
581 }
582 return (0);
583 invalid:
584 return (EINVAL);
585 }
586
587
588 /*
589 * Unicode 3.2 decomposition code (derived from Core Foundation)
590 */
591
/* One entry of a lookup table that is sorted by ascending _key */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * getmappedvalue32 - binary search a sorted 32-bit mapping table
 *
 * theTable:  table entries, sorted by ascending _key
 * numElem:   number of entries in theTable
 * character: key to look up
 *
 * Returns the _value of the matching entry, or 0 when the table is
 * empty or holds no entry for the character.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
                 u_int16_t character)
{
	const unicode_mappings32 *first, *last, *mid;

	/* An empty table has no first or last entry to compare against */
	if (numElem == 0)
		return (0);

	/* Quick rejection of keys outside the range covered by the table */
	if ((character < theTable[0]._key) ||
	    (character > theTable[numElem - 1]._key))
		return (0);

	first = theTable;
	last = first + (numElem - 1);
	while (first <= last) {
		mid = first + ((last - first) >> 1);
		if (character < mid->_key)
			last = mid - 1;
		else if (character > mid->_key)
			first = mid + 1;
		else
			return (mid->_value);
	}
	return (0);
}
616
/* Layout of a decomposition table value: flag in bit 15, count in bits 12-14 */
#define RECURSIVE_DECOMPOSITION	(1 << 15)
#define EXTRACT_COUNT(value)	(((value) >> 12) & 7)
619
/* One entry of a lookup table that is sorted by ascending _key */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * getmappedvalue16 - binary search a sorted 16-bit mapping table
 *
 * theTable:  table entries, sorted by ascending _key
 * numElem:   number of entries in theTable
 * character: key to look up
 *
 * Returns the _value of the matching entry, or 0 when the table is
 * empty or holds no entry for the character.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
                 u_int16_t character)
{
	const unicode_mappings16 *first, *last, *mid;

	/* An empty table has no first or last entry to compare against */
	if (numElem == 0)
		return (0);

	/* Quick rejection of keys outside the range covered by the table */
	if ((character < theTable[0]._key) ||
	    (character > theTable[numElem - 1]._key))
		return (0);

	first = theTable;
	last = first + (numElem - 1);
	while (first <= last) {
		mid = first + ((last - first) >> 1);
		if (character < mid->_key)
			last = mid - 1;
		else if (character > mid->_key)
			first = mid + 1;
		else
			return (mid->_value);
	}
	return (0);
}
647
648
649 static u_int32_t
650 unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
651 {
652 u_int16_t value;
653 u_int32_t length;
654 u_int16_t firstChar;
655 u_int16_t theChar;
656 const u_int16_t *bmpMappings;
657 u_int32_t usedLength;
658
659 value = getmappedvalue16(
660 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
661 __UniCharDecompositionTableLength, character);
662 length = EXTRACT_COUNT(value);
663 firstChar = value & 0x0FFF;
664 theChar = firstChar;
665 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
666 usedLength = 0;
667
668 if (value & RECURSIVE_DECOMPOSITION) {
669 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
670
671 --length; /* Decrement for the first char */
672 if (!usedLength)
673 return 0;
674 ++bmpMappings;
675 convertedChars += usedLength;
676 }
677
678 usedLength += length;
679
680 while (length--)
681 *(convertedChars++) = *(bmpMappings++);
682
683 return (usedLength);
684 }
685
/* First code point of each Hangul range used for algorithmic (de)composition */
#define HANGUL_SBASE 0xAC00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11A7

/* Number of code points in each Hangul range */
#define HANGUL_SCOUNT 11172
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)

/*
 * unicode_decompose - decompose a composed Unicode char
 *
 * Composed Unicode characters are forbidden on
 * HFS Plus volumes. unicode_decompose will convert a
 * composed character into its correct decomposed
 * sequence.
 *
 * Hangul syllables (HANGUL_SBASE up to, but not including,
 * HANGUL_SBASE + HANGUL_SCOUNT) are decomposed arithmetically into
 * two or three Jamo; every other character goes through the
 * decomposition tables.
 *
 * character:      character to decompose
 * convertedChars: receives the decomposed sequence
 *
 * Returns the number of units stored in convertedChars.
 *
 * Similar to CFUniCharDecomposeCharacter
 */
static int
unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
{
	/* The upper bound is exclusive: SBASE + SCOUNT is not a syllable */
	if ((character >= HANGUL_SBASE) &&
	    (character < (HANGUL_SBASE + HANGUL_SCOUNT))) {
		u_int32_t length;

		character -= HANGUL_SBASE;
		/* A trailing consonant is present only for a non-zero T index */
		length = (character % HANGUL_TCOUNT ? 3 : 2);

		*(convertedChars++) =
			character / HANGUL_NCOUNT + HANGUL_LBASE;
		*(convertedChars++) =
			(character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
		if (length > 2)
			*convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
		return (length);
	} else {
		return (unicode_recursive_decompose(character, convertedChars));
	}
}
728
729 /*
730 * unicode_combine - generate a precomposed Unicode char
731 *
732 * Precomposed Unicode characters are required for some volume
733 * formats and network protocols. unicode_combine will combine
734 * a decomposed character sequence into a single precomposed
735 * (composite) character.
736 *
737 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
738 * also handles Hangul Jamo characters.
739 */
740 static u_int16_t
741 unicode_combine(u_int16_t base, u_int16_t combining)
742 {
743 u_int32_t value;
744
745 /* Check HANGUL */
746 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
747 /* 2 char Hangul sequences */
748 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
749 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
750 return (HANGUL_SBASE +
751 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
752 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
753 }
754
755 /* 3 char Hangul sequences */
756 if ((combining > HANGUL_TBASE) &&
757 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
758 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
759 return (0);
760 else
761 return (base + (combining - HANGUL_TBASE));
762 }
763 }
764
765 value = getmappedvalue32(
766 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
767 __CFUniCharPrecompositionTableLength, combining);
768
769 if (value) {
770 value = getmappedvalue16(
771 (const unicode_mappings16 *)
772 ((u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
773 (value >> 16), base);
774 }
775 return (value);
776 }
777
778
/*
 * priortysort - order combining chars into canonical order
 *
 * characters: run of characters to reorder in place
 * count:      number of characters in the run
 *
 * Adjacent characters are exchanged when the first has a higher
 * combining class than the second and the second's class is not zero.
 * A character of class 0 is never moved in front of its predecessor,
 * so nothing is reordered across it.  Passes repeat until one makes
 * no exchange.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
priortysort(u_int16_t* characters, int count)
{
	u_int32_t p1, p2;
	u_int16_t *ch1, *ch2;
	u_int16_t *end;
	int changes;

	/* Fewer than two characters: nothing to order (and nothing to read) */
	if (count < 2)
		return;

	end = characters + count;
	do {
		changes = 0;
		ch1 = characters;
		ch2 = characters + 1;
		p2 = get_combining_class(*ch1);
		while (ch2 < end) {
			p1 = p2;
			p2 = get_combining_class(*ch2);
			if (p1 > p2 && p2 != 0) {
				u_int16_t tmp;

				tmp = *ch1;
				*ch1 = *ch2;
				*ch2 = tmp;
				changes = 1;
				/*
				 * *ch2 now holds the character that had class
				 * p1; keep p2 in step for the next comparison.
				 */
				p2 = p1;
			}
			++ch1;
			++ch2;
		}
	} while (changes);
}