]> git.saurik.com Git - apple/xnu.git/blame - bsd/vfs/vfs_utfconv.c
xnu-792.22.5.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
CommitLineData
1c79356b 1/*
5d5c5d0d
A
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
8f6c56a5 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
8f6c56a5
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
8ad349bb 24 * limitations under the License.
8f6c56a5
A
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b 27 */
9bccf70c
A
28
29 /*
30 Includes Unicode 3.2 decomposition code derived from Core Foundation
31 */
32
1c79356b
A
33#include <sys/param.h>
34#include <sys/utfconv.h>
35#include <sys/errno.h>
4452a7af 36#include <libkern/OSByteOrder.h>
1c79356b 37
1c79356b 38/*
765c9de3 39 * UTF-8 (Unicode Transformation Format)
1c79356b 40 *
765c9de3
A
41 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
42 * character as a sequence of one to four bytes. Only the shortest form
43 * required to represent the significant Unicode bits is legal.
1c79356b
A
44 *
45 * UTF-8 Multibyte Codes
46 *
765c9de3
A
47 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
48 * -----------------------------------------------------------------------------
49 * 1 7 0x0000 0x007F 0xxxxxxx
50 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
51 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
52 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
53 * -----------------------------------------------------------------------------
1c79356b
A
54 */
55
56
765c9de3
A
/*
 * UTF-8 bytes charged to a single UTF-16 code unit.  A surrogate half
 * (0xD800-0xDFFF) is charged 2 bytes, so a surrogate pair totals 4.
 * The argument is evaluated more than once; pass a side-effect free value.
 */
#define UNICODE_TO_UTF8_LEN(c)  \
	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))

/* Code unit that stands in for an embedded NUL in encoded names */
#define UCS_ALT_NULL	0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT	10
#define SP_HALF_BASE	0x0010000UL
#define SP_HALF_MASK	0x3FFUL

#define SP_HIGH_FIRST	0xD800UL
#define SP_HIGH_LAST	0xDBFFUL
#define SP_LOW_FIRST	0xDC00UL
#define SP_LOW_LAST	0xDFFFUL
71
1c79356b 72
9bccf70c 73#include "vfs_utfconvdata.h"
765c9de3 74
1c79356b 75
9bccf70c
A
76/*
77 * Test for a combining character.
78 *
79 * Similar to __CFUniCharIsNonBaseCharacter except that
80 * unicode_combinable also includes Hangul Jamo characters.
81 */
82static inline int
83unicode_combinable(u_int16_t character)
84{
85 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
86 u_int8_t value;
87
88 if (character < 0x0300)
89 return (0);
90
91 value = bitmap[(character >> 8) & 0xFF];
92
93 if (value == 0xFF) {
94 return (1);
95 } else if (value) {
96 bitmap = bitmap + ((value - 1) * 32) + 256;
97 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
98 }
99 return (0);
100}
101
102/*
103 * Test for a precomposed character.
104 *
105 * Similar to __CFUniCharIsDecomposableCharacter.
106 */
107static inline int
108unicode_decomposeable(u_int16_t character) {
109 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
110 u_int8_t value;
111
112 if (character < 0x00C0)
113 return (0);
114
115 value = bitmap[(character >> 8) & 0xFF];
116
117 if (value == 0xFF) {
118 return (1);
119 } else if (value) {
120 bitmap = bitmap + ((value - 1) * 32) + 256;
121 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
122 }
123 return (0);
124}
125
13fec989
A
126
127/*
128 * Get the combing class.
129 *
130 * Similar to CFUniCharGetCombiningPropertyForCharacter.
131 */
132static inline u_int8_t
133get_combining_class(u_int16_t character) {
134 const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
135
136 u_int8_t value = bitmap[(character >> 8)];
137
138 if (value) {
139 bitmap = bitmap + (value * 256);
140 return bitmap[character % 256];
141 }
142 return (0);
143}
144
145
9bccf70c
A
/* Forward declarations; the definitions appear later in this file. */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);

static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);

static void priortysort(u_int16_t* characters, int count);
1c79356b 151
765c9de3
A
/*
 * Number of bytes that follow a given first byte of a UTF-8 sequence,
 * indexed by the top five bits of that byte (byte >> 3).  An entry of
 * -1 marks a byte value that the decoders reject as a first byte.
 */
char utf_extrabytes[32] = {
	 0,  0,  0,  0,  0,  0,  0,  0,		/* 0x00 - 0x3F */
	 0,  0,  0,  0,  0,  0,  0,  0,		/* 0x40 - 0x7F */
	-1, -1, -1, -1, -1, -1, -1, -1,		/* 0x80 - 0xBF */
	 1,  1,  1,  1,				/* 0xC0 - 0xDF */
	 2,  2,					/* 0xE0 - 0xEF */
	 3,					/* 0xF0 - 0xF7 */
	-1					/* 0xF8 - 0xFF */
};
156
157
1c79356b 158/*
765c9de3 159 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
1c79356b
A
160 *
161 * NOTES:
162 * If '/' chars are allowed on disk then an alternate
163 * (replacement) char must be provided in altslash.
164 *
165 * input flags:
765c9de3 166 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
1c79356b
A
167 */
168size_t
765c9de3
A
169utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
170 int flags)
1c79356b
A
171{
172 u_int16_t ucs_ch;
173 int charcnt;
174 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
175 size_t len;
176
177 charcnt = ucslen / 2;
178 len = 0;
179
180 while (charcnt-- > 0) {
181 ucs_ch = *ucsp++;
182
183 if (swapbytes)
4452a7af 184 ucs_ch = OSSwapInt16(ucs_ch);
0b4e3aa0
A
185 if (ucs_ch == '/')
186 ucs_ch = altslash ? altslash : '_';
187 else if (ucs_ch == '\0')
188 ucs_ch = UCS_ALT_NULL;
1c79356b 189
765c9de3 190 len += UNICODE_TO_UTF8_LEN(ucs_ch);
1c79356b
A
191 }
192
193 return (len);
194}
195
196
197/*
765c9de3 198 * utf8_encodestr - Encodes a Unicode string to UTF-8
1c79356b
A
199 *
200 * NOTES:
0b4e3aa0 201 * The resulting UTF-8 string is NULL terminated.
1c79356b
A
202 *
203 * If '/' chars are allowed on disk then an alternate
204 * (replacement) char must be provided in altslash.
205 *
206 * input flags:
765c9de3 207 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
1c79356b 208 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
0b4e3aa0
A
209 *
210 * result:
211 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
212 * EINVAL: Illegal char found; char was replaced by an '_'.
1c79356b 213 */
765c9de3
A
214int
215utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
216 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
1c79356b
A
217{
218 u_int8_t * bufstart;
219 u_int8_t * bufend;
220 u_int16_t ucs_ch;
9bccf70c
A
221 u_int16_t * chp = NULL;
222 u_int16_t sequence[8];
223 int extra = 0;
1c79356b
A
224 int charcnt;
225 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
0b4e3aa0
A
226 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
227 int decompose = (flags & UTF_DECOMPOSED);
1c79356b
A
228 int result = 0;
229
230 bufstart = utf8p;
231 bufend = bufstart + buflen;
232 if (nullterm)
233 --bufend;
234 charcnt = ucslen / 2;
235
236 while (charcnt-- > 0) {
9bccf70c
A
237 if (extra > 0) {
238 --extra;
239 ucs_ch = *chp++;
0b4e3aa0 240 } else {
4452a7af 241 ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
9bccf70c
A
242
243 if (decompose && unicode_decomposeable(ucs_ch)) {
244 extra = unicode_decompose(ucs_ch, sequence) - 1;
245 charcnt += extra;
246 ucs_ch = sequence[0];
247 chp = &sequence[1];
248 }
0b4e3aa0 249 }
1c79356b 250
0b4e3aa0
A
251 /* Slash and NULL are not permitted */
252 if (ucs_ch == '/') {
253 if (altslash)
254 ucs_ch = altslash;
255 else {
256 ucs_ch = '_';
257 result = EINVAL;
258 }
259 } else if (ucs_ch == '\0') {
260 ucs_ch = UCS_ALT_NULL;
261 }
1c79356b 262
0b4e3aa0 263 if (ucs_ch < 0x0080) {
1c79356b
A
264 if (utf8p >= bufend) {
265 result = ENAMETOOLONG;
266 break;
765c9de3 267 }
1c79356b
A
268 *utf8p++ = ucs_ch;
269
270 } else if (ucs_ch < 0x800) {
271 if ((utf8p + 1) >= bufend) {
272 result = ENAMETOOLONG;
273 break;
274 }
765c9de3
A
275 *utf8p++ = 0xc0 | (ucs_ch >> 6);
276 *utf8p++ = 0x80 | (0x3f & ucs_ch);
1c79356b
A
277
278 } else {
765c9de3
A
279 /* Combine valid surrogate pairs */
280 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
281 && charcnt > 0) {
282 u_int16_t ch2;
283 u_int32_t pair;
284
4452a7af 285 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
765c9de3
A
286 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
287 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
288 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
289 if ((utf8p + 3) >= bufend) {
290 result = ENAMETOOLONG;
291 break;
292 }
293 --charcnt;
294 ++ucsp;
295 *utf8p++ = 0xf0 | (pair >> 18);
296 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
297 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
298 *utf8p++ = 0x80 | (0x3f & pair);
299 continue;
300 }
301 }
1c79356b
A
302 if ((utf8p + 2) >= bufend) {
303 result = ENAMETOOLONG;
304 break;
305 }
765c9de3
A
306 *utf8p++ = 0xe0 | (ucs_ch >> 12);
307 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
308 *utf8p++ = 0x80 | (0x3f & ucs_ch);
1c79356b
A
309 }
310 }
311
312 *utf8len = utf8p - bufstart;
313 if (nullterm)
314 *utf8p++ = '\0';
315
316 return (result);
317}
318
319
320/*
765c9de3 321 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
1c79356b
A
322 *
323 * NOTES:
324 * The input UTF-8 string does not need to be null terminated
325 * if utf8len is set.
326 *
327 * If '/' chars are allowed on disk then an alternate
328 * (replacement) char must be provided in altslash.
329 *
330 * input flags:
765c9de3
A
331 * UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime
332 * UTF_DECOMPOSED: Unicode output string must be fully decompsed
0b4e3aa0
A
333 *
334 * result:
335 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
336 * EINVAL: Illegal UTF-8 sequence found.
1c79356b
A
337 */
338int
765c9de3
A
339utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
340 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
1c79356b
A
341{
342 u_int16_t* bufstart;
343 u_int16_t* bufend;
55e303ae
A
344 unsigned int ucs_ch;
345 unsigned int byte;
13fec989 346 int combcharcnt = 0;
1c79356b 347 int result = 0;
0b4e3aa0 348 int decompose, precompose, swapbytes;
1c79356b 349
0b4e3aa0
A
350 decompose = (flags & UTF_DECOMPOSED);
351 precompose = (flags & UTF_PRECOMPOSED);
352 swapbytes = (flags & UTF_REVERSE_ENDIAN);
1c79356b
A
353
354 bufstart = ucsp;
355 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
356
357 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
765c9de3
A
358 if (ucsp >= bufend)
359 goto toolong;
1c79356b
A
360
361 /* check for ascii */
362 if (byte < 0x80) {
55e303ae 363 ucs_ch = byte; /* 1st byte */
1c79356b 364 } else {
765c9de3
A
365 u_int32_t ch;
366 int extrabytes = utf_extrabytes[byte >> 3];
367
368 if (utf8len < extrabytes)
369 goto invalid;
370 utf8len -= extrabytes;
371
372 switch (extrabytes) {
55e303ae
A
373 case 1:
374 ch = byte; ch <<= 6; /* 1st byte */
375 byte = *utf8p++; /* 2nd byte */
376 if ((byte >> 6) != 2)
377 goto invalid;
378 ch += byte;
379 ch -= 0x00003080UL;
380 if (ch < 0x0080)
381 goto invalid;
382 ucs_ch = ch;
765c9de3 383 break;
55e303ae
A
384 case 2:
385 ch = byte; ch <<= 6; /* 1st byte */
386 byte = *utf8p++; /* 2nd byte */
387 if ((byte >> 6) != 2)
388 goto invalid;
389 ch += byte; ch <<= 6;
390 byte = *utf8p++; /* 3rd byte */
391 if ((byte >> 6) != 2)
392 goto invalid;
393 ch += byte;
394 ch -= 0x000E2080UL;
395 if (ch < 0x0800)
396 goto invalid;
397 if (ch >= 0xD800) {
398 if (ch <= 0xDFFF)
765c9de3 399 goto invalid;
55e303ae
A
400 if (ch == 0xFFFE || ch == 0xFFFF)
401 goto invalid;
402 }
403 ucs_ch = ch;
404 break;
405 case 3:
406 ch = byte; ch <<= 6; /* 1st byte */
407 byte = *utf8p++; /* 2nd byte */
408 if ((byte >> 6) != 2)
409 goto invalid;
410 ch += byte; ch <<= 6;
411 byte = *utf8p++; /* 3rd byte */
412 if ((byte >> 6) != 2)
413 goto invalid;
414 ch += byte; ch <<= 6;
415 byte = *utf8p++; /* 4th byte */
416 if ((byte >> 6) != 2)
417 goto invalid;
418 ch += byte;
419 ch -= 0x03C82080UL + SP_HALF_BASE;
420 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
421 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
422 goto invalid;
4452a7af 423 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : ucs_ch;
55e303ae
A
424 if (ucsp >= bufend)
425 goto toolong;
426 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
427 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
428 goto invalid;
4452a7af 429 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : ucs_ch;
765c9de3 430 continue;
1c79356b 431 default:
55e303ae 432 goto invalid;
1c79356b 433 }
1c79356b 434 if (decompose) {
9bccf70c
A
435 if (unicode_decomposeable(ucs_ch)) {
436 u_int16_t sequence[8];
437 int count, i;
1c79356b 438
9bccf70c 439 count = unicode_decompose(ucs_ch, sequence);
1c79356b 440
9bccf70c
A
441 for (i = 0; i < count; ++i) {
442 ucs_ch = sequence[i];
4452a7af 443 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : ucs_ch;
765c9de3
A
444 if (ucsp >= bufend)
445 goto toolong;
0b4e3aa0 446 }
13fec989 447 combcharcnt += count - 1;
9bccf70c 448 continue;
0b4e3aa0
A
449 }
450 } else if (precompose && (ucsp != bufstart)) {
451 u_int16_t composite, base;
452
9bccf70c 453 if (unicode_combinable(ucs_ch)) {
4452a7af 454 base = swapbytes ? OSSwapInt16(*(ucsp - 1)) : *(ucsp - 1);
9bccf70c
A
455 composite = unicode_combine(base, ucs_ch);
456 if (composite) {
457 --ucsp;
458 ucs_ch = composite;
459 }
1c79356b
A
460 }
461 }
0b4e3aa0
A
462 if (ucs_ch == UCS_ALT_NULL)
463 ucs_ch = '\0';
1c79356b 464 }
1c79356b
A
465 if (ucs_ch == altslash)
466 ucs_ch = '/';
1c79356b 467
13fec989
A
468 /*
469 * Make multiple combining character sequences canonical
470 */
471 if (unicode_combinable(ucs_ch)) {
472 ++combcharcnt; /* start tracking a run */
473 } else if (combcharcnt) {
474 if (combcharcnt > 1) {
475 priortysort(ucsp - combcharcnt, combcharcnt);
476 }
477 combcharcnt = 0; /* start over */
478 }
4452a7af 479 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : ucs_ch;
1c79356b 480 }
13fec989
A
481 /*
482 * Make a previous combining sequence canonical
483 */
484 if (combcharcnt > 1) {
485 priortysort(ucsp - combcharcnt, combcharcnt);
486 }
765c9de3
A
487
488exit:
1c79356b
A
489 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
490
491 return (result);
765c9de3
A
492
493invalid:
494 result = EINVAL;
495 goto exit;
496
497toolong:
498 result = ENAMETOOLONG;
499 goto exit;
1c79356b
A
500}
501
502
91447636
A
503/*
504 * utf8_validatestr - Check for a valid UTF-8 string.
505 */
506int
507utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
508{
509 unsigned int byte;
510 u_int32_t ch;
511 unsigned int ucs_ch;
512 size_t extrabytes;
513
514 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
515 if (byte < 0x80)
516 continue; /* plain ascii */
517
518 extrabytes = utf_extrabytes[byte >> 3];
519
520 if (utf8len < extrabytes)
521 goto invalid;
522 utf8len -= extrabytes;
523
524 switch (extrabytes) {
525 case 1:
526 ch = byte; ch <<= 6; /* 1st byte */
527 byte = *utf8p++; /* 2nd byte */
528 if ((byte >> 6) != 2)
529 goto invalid;
530 ch += byte;
531 ch -= 0x00003080UL;
532 if (ch < 0x0080)
533 goto invalid;
534 break;
535 case 2:
536 ch = byte; ch <<= 6; /* 1st byte */
537 byte = *utf8p++; /* 2nd byte */
538 if ((byte >> 6) != 2)
539 goto invalid;
540 ch += byte; ch <<= 6;
541 byte = *utf8p++; /* 3rd byte */
542 if ((byte >> 6) != 2)
543 goto invalid;
544 ch += byte;
545 ch -= 0x000E2080UL;
546 if (ch < 0x0800)
547 goto invalid;
548 if (ch >= 0xD800) {
549 if (ch <= 0xDFFF)
550 goto invalid;
551 if (ch == 0xFFFE || ch == 0xFFFF)
552 goto invalid;
553 }
554 break;
555 case 3:
556 ch = byte; ch <<= 6; /* 1st byte */
557 byte = *utf8p++; /* 2nd byte */
558 if ((byte >> 6) != 2)
559 goto invalid;
560 ch += byte; ch <<= 6;
561 byte = *utf8p++; /* 3rd byte */
562 if ((byte >> 6) != 2)
563 goto invalid;
564 ch += byte; ch <<= 6;
565 byte = *utf8p++; /* 4th byte */
566 if ((byte >> 6) != 2)
567 goto invalid;
568 ch += byte;
569 ch -= 0x03C82080UL + SP_HALF_BASE;
570 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
571 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
572 goto invalid;
573 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
574 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
575 goto invalid;
576 break;
577 default:
578 goto invalid;
579 }
580
581 }
582 return (0);
583invalid:
584 return (EINVAL);
585}
586
587
9bccf70c
A
588 /*
589 * Unicode 3.2 decomposition code (derived from Core Foundation)
590 */
1c79356b 591
9bccf70c
A
/* One entry of a table sorted by ascending _key */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * Binary search of a table sorted by ascending _key.
 *
 * Returns the _value stored for 'character', or 0 when the table is
 * empty or holds no entry with that key.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
		 u_int16_t character)
{
	u_int32_t lo, hi, mid;

	/* An empty table has no first or last entry to range-check against */
	if (numElem == 0)
		return (0);

	if ((character < theTable[0]._key) ||
	    (character > theTable[numElem - 1]._key))
		return (0);

	/* Search window is [lo, hi); indices avoid out-of-range pointers */
	lo = 0;
	hi = numElem;
	while (lo < hi) {
		mid = lo + ((hi - 1 - lo) >> 1);
		if (character < theTable[mid]._key)
			hi = mid;
		else if (character > theTable[mid]._key)
			lo = mid + 1;
		else
			return (theTable[mid]._value);
	}
	return (0);
}
1c79356b 616
9bccf70c
A
/* Layout of a 16-bit decomposition table value */
#define RECURSIVE_DECOMPOSITION	(1 << 15)	/* first mapped char decomposes further */
#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)	/* bits 12-14: mapping length */
1c79356b 619
9bccf70c
A
/* One entry of a table sorted by ascending _key */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * Binary search of a table sorted by ascending _key.
 *
 * Returns the _value stored for 'character', or 0 when the table is
 * empty or holds no entry with that key.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
		 u_int16_t character)
{
	u_int32_t lo, hi, mid;

	/* An empty table has no first or last entry to range-check against */
	if (numElem == 0)
		return (0);

	if ((character < theTable[0]._key) ||
	    (character > theTable[numElem - 1]._key))
		return (0);

	/* Search window is [lo, hi); indices avoid out-of-range pointers */
	lo = 0;
	hi = numElem;
	while (lo < hi) {
		mid = lo + ((hi - 1 - lo) >> 1);
		if (character < theTable[mid]._key)
			hi = mid;
		else if (character > theTable[mid]._key)
			lo = mid + 1;
		else
			return (theTable[mid]._value);
	}
	return (0);
}
647
648
649static u_int32_t
650unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
651{
652 u_int16_t value;
653 u_int32_t length;
654 u_int16_t firstChar;
655 u_int16_t theChar;
656 const u_int16_t *bmpMappings;
657 u_int32_t usedLength;
658
659 value = getmappedvalue16(
660 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
661 __UniCharDecompositionTableLength, character);
662 length = EXTRACT_COUNT(value);
663 firstChar = value & 0x0FFF;
664 theChar = firstChar;
665 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
666 usedLength = 0;
667
668 if (value & RECURSIVE_DECOMPOSITION) {
669 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
670
671 --length; /* Decrement for the first char */
672 if (!usedLength)
673 return 0;
674 ++bmpMappings;
675 convertedChars += usedLength;
676 }
0b4e3aa0 677
9bccf70c 678 usedLength += length;
0b4e3aa0 679
9bccf70c
A
680 while (length--)
681 *(convertedChars++) = *(bmpMappings++);
0b4e3aa0 682
9bccf70c
A
683 return (usedLength);
684}
685
/* First code units of the Hangul syllable, L, V and T ranges */
#define HANGUL_SBASE	0xAC00
#define HANGUL_LBASE	0x1100
#define HANGUL_VBASE	0x1161
#define HANGUL_TBASE	0x11A7

/* Sizes of those ranges */
#define HANGUL_SCOUNT	11172
#define HANGUL_LCOUNT	19
#define HANGUL_VCOUNT	21
#define HANGUL_TCOUNT	28
#define HANGUL_NCOUNT	(HANGUL_VCOUNT * HANGUL_TCOUNT)
1c79356b
A
696
697/*
9bccf70c 698 * unicode_decompose - decompose a composed Unicode char
1c79356b
A
699 *
700 * Composed Unicode characters are forbidden on
701 * HFS Plus volumes. ucs_decompose will convert a
702 * composed character into its correct decomposed
703 * sequence.
704 *
9bccf70c 705 * Similar to CFUniCharDecomposeCharacter
1c79356b 706 */
9bccf70c
A
707static int
708unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
1c79356b 709{
9bccf70c
A
710 if ((character >= HANGUL_SBASE) &&
711 (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
712 u_int32_t length;
713
714 character -= HANGUL_SBASE;
715 length = (character % HANGUL_TCOUNT ? 3 : 2);
716
717 *(convertedChars++) =
718 character / HANGUL_NCOUNT + HANGUL_LBASE;
719 *(convertedChars++) =
720 (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
721 if (length > 2)
722 *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
723 return (length);
1c79356b 724 } else {
9bccf70c 725 return (unicode_recursive_decompose(character, convertedChars));
1c79356b 726 }
1c79356b
A
727}
728
0b4e3aa0 729/*
9bccf70c 730 * unicode_combine - generate a precomposed Unicode char
0b4e3aa0
A
731 *
732 * Precomposed Unicode characters are required for some volume
9bccf70c
A
733 * formats and network protocols. unicode_combine will combine
734 * a decomposed character sequence into a single precomposed
0b4e3aa0
A
735 * (composite) character.
736 *
9bccf70c
A
737 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
738 * also handles Hangul Jamo characters.
0b4e3aa0
A
739 */
740static u_int16_t
9bccf70c 741unicode_combine(u_int16_t base, u_int16_t combining)
0b4e3aa0 742{
9bccf70c
A
743 u_int32_t value;
744
745 /* Check HANGUL */
746 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
747 /* 2 char Hangul sequences */
748 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
749 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
750 return (HANGUL_SBASE +
751 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
752 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
0b4e3aa0 753 }
9bccf70c
A
754
755 /* 3 char Hangul sequences */
756 if ((combining > HANGUL_TBASE) &&
757 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
758 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
759 return (0);
760 else
761 return (base + (combining - HANGUL_TBASE));
0b4e3aa0 762 }
0b4e3aa0
A
763 }
764
9bccf70c
A
765 value = getmappedvalue32(
766 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
767 __CFUniCharPrecompositionTableLength, combining);
0b4e3aa0 768
9bccf70c
A
769 if (value) {
770 value = getmappedvalue16(
771 (const unicode_mappings16 *)
772 ((u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
773 (value >> 16), base);
0b4e3aa0 774 }
9bccf70c 775 return (value);
0b4e3aa0
A
776}
777
13fec989
A
778
/*
 * priortysort - order combining chars into canonical order
 *
 * Bubble sort on the combining class of each unit, repeated until a
 * pass makes no exchange.  Two neighbours are exchanged only when the
 * second has a non-zero class lower than the first, so nothing is ever
 * moved past a class-0 unit.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
priortysort(u_int16_t* characters, int count)
{
	u_int32_t p1, p2;
	u_int16_t *ch1, *ch2;
	u_int16_t *end;
	int changes;

	/* Fewer than two units: nothing to order, nothing to read */
	if (count < 2)
		return;

	end = characters + count;
	do {
		changes = 0;
		ch1 = characters;
		ch2 = characters + 1;
		p2 = get_combining_class(*ch1);
		while (ch2 < end) {
			p1 = p2;
			p2 = get_combining_class(*ch2);
			if (p1 > p2 && p2 != 0) {
				u_int16_t tmp;

				tmp = *ch1;
				*ch1 = *ch2;
				*ch2 = tmp;
				changes = 1;

				/*
				 * Keep p2 describing the unit now stored at
				 * *ch2, so the next comparison uses its class.
				 */
				p2 = p1;
			}
			++ch1;
			++ch2;
		}
	} while (changes);
}