]> git.saurik.com Git - apple/xnu.git/blame - bsd/vfs/vfs_utfconv.c
xnu-792.25.20.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
CommitLineData
1c79356b 1/*
5d5c5d0d
A
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
6601e61a 4 * @APPLE_LICENSE_HEADER_START@
1c79356b 5 *
6601e61a
A
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
8f6c56a5 11 *
6601e61a
A
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
6601e61a
A
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
8f6c56a5 19 *
6601e61a 20 * @APPLE_LICENSE_HEADER_END@
1c79356b 21 */
9bccf70c
A
22
23 /*
24 Includes Unicode 3.2 decomposition code derived from Core Foundation
25 */
26
1c79356b
A
27#include <sys/param.h>
28#include <sys/utfconv.h>
29#include <sys/errno.h>
0c530ab8 30#include <libkern/OSByteOrder.h>
1c79356b 31
1c79356b 32/*
765c9de3 33 * UTF-8 (Unicode Transformation Format)
1c79356b 34 *
765c9de3
A
35 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
36 * character as a sequence of one to four bytes. Only the shortest form
37 * required to represent the significant Unicode bits is legal.
1c79356b
A
38 *
39 * UTF-8 Multibyte Codes
40 *
765c9de3
A
41 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
42 * -----------------------------------------------------------------------------
43 * 1 7 0x0000 0x007F 0xxxxxxx
44 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
45 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
46 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
47 * -----------------------------------------------------------------------------
1c79356b
A
48 */
49
50
765c9de3
A
/*
 * Number of UTF-8 bytes counted for one 16-bit Unicode value.
 *
 * Values in the surrogate range (0xD800-0xDFFF) are counted as 2 bytes
 * each, so both halves of a surrogate pair together add up to 4 bytes.
 *
 * NOTE: the argument is evaluated more than once.
 */
#define UNICODE_TO_UTF8_LEN(c)					\
	((c) < 0x0080 ? 1 :					\
	 (c) < 0x0800 ? 2 :					\
	 ((c) & 0xf800) == 0xd800 ? 2 : 3)
0b4e3aa0
A
53
/* Stand-in character used in place of an embedded NUL (U+2400) */
#define UCS_ALT_NULL	0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT	10		/* payload bits carried by each half */
#define SP_HALF_BASE	0x0010000UL	/* first code point that needs a pair */
#define SP_HALF_MASK	0x3FFUL		/* mask selecting one half's payload */

#define SP_HIGH_FIRST	0xD800UL	/* high (leading) surrogate range */
#define SP_HIGH_LAST	0xDBFFUL
#define SP_LOW_FIRST	0xDC00UL	/* low (trailing) surrogate range */
#define SP_LOW_LAST	0xDFFFUL
65
1c79356b 66
9bccf70c 67#include "vfs_utfconvdata.h"
765c9de3 68
1c79356b 69
9bccf70c
A
70/*
71 * Test for a combining character.
72 *
73 * Similar to __CFUniCharIsNonBaseCharacter except that
74 * unicode_combinable also includes Hangul Jamo characters.
75 */
76static inline int
77unicode_combinable(u_int16_t character)
78{
79 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
80 u_int8_t value;
81
82 if (character < 0x0300)
83 return (0);
84
85 value = bitmap[(character >> 8) & 0xFF];
86
87 if (value == 0xFF) {
88 return (1);
89 } else if (value) {
90 bitmap = bitmap + ((value - 1) * 32) + 256;
91 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
92 }
93 return (0);
94}
95
96/*
97 * Test for a precomposed character.
98 *
99 * Similar to __CFUniCharIsDecomposableCharacter.
100 */
101static inline int
102unicode_decomposeable(u_int16_t character) {
103 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
104 u_int8_t value;
105
106 if (character < 0x00C0)
107 return (0);
108
109 value = bitmap[(character >> 8) & 0xFF];
110
111 if (value == 0xFF) {
112 return (1);
113 } else if (value) {
114 bitmap = bitmap + ((value - 1) * 32) + 256;
115 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
116 }
117 return (0);
118}
119
13fec989
A
120
121/*
122 * Get the combing class.
123 *
124 * Similar to CFUniCharGetCombiningPropertyForCharacter.
125 */
126static inline u_int8_t
127get_combining_class(u_int16_t character) {
128 const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
129
130 u_int8_t value = bitmap[(character >> 8)];
131
132 if (value) {
133 bitmap = bitmap + (value * 256);
134 return bitmap[character % 256];
135 }
136 return (0);
137}
138
139
9bccf70c
A
140static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
141
142static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
0b4e3aa0 143
13fec989 144static void priortysort(u_int16_t* characters, int count);
1c79356b 145
765c9de3
A
/*
 * Number of bytes that follow a UTF-8 lead byte, indexed by the top
 * five bits of that byte (byte >> 3).  -1 marks a byte that cannot
 * start a sequence.
 */
char utf_extrabytes[32] = {
	/* 0x00 - 0x3F */   0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x40 - 0x7F */   0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x80 - 0xBF */  -1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 - 0xFF */   1,  1,  1,  1,  2,  2,  3, -1
};
150
151
1c79356b 152/*
765c9de3 153 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
1c79356b
A
154 *
155 * NOTES:
156 * If '/' chars are allowed on disk then an alternate
157 * (replacement) char must be provided in altslash.
158 *
159 * input flags:
765c9de3 160 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
1c79356b
A
161 */
162size_t
765c9de3
A
163utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
164 int flags)
1c79356b
A
165{
166 u_int16_t ucs_ch;
167 int charcnt;
168 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
169 size_t len;
170
171 charcnt = ucslen / 2;
172 len = 0;
173
174 while (charcnt-- > 0) {
175 ucs_ch = *ucsp++;
176
177 if (swapbytes)
0c530ab8 178 ucs_ch = OSSwapInt16(ucs_ch);
0b4e3aa0
A
179 if (ucs_ch == '/')
180 ucs_ch = altslash ? altslash : '_';
181 else if (ucs_ch == '\0')
182 ucs_ch = UCS_ALT_NULL;
1c79356b 183
765c9de3 184 len += UNICODE_TO_UTF8_LEN(ucs_ch);
1c79356b
A
185 }
186
187 return (len);
188}
189
190
191/*
765c9de3 192 * utf8_encodestr - Encodes a Unicode string to UTF-8
1c79356b
A
193 *
194 * NOTES:
0b4e3aa0 195 * The resulting UTF-8 string is NULL terminated.
1c79356b
A
196 *
197 * If '/' chars are allowed on disk then an alternate
198 * (replacement) char must be provided in altslash.
199 *
200 * input flags:
765c9de3 201 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
1c79356b 202 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
0b4e3aa0
A
203 *
204 * result:
205 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
206 * EINVAL: Illegal char found; char was replaced by an '_'.
1c79356b 207 */
765c9de3
A
208int
209utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
210 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
1c79356b
A
211{
212 u_int8_t * bufstart;
213 u_int8_t * bufend;
214 u_int16_t ucs_ch;
9bccf70c
A
215 u_int16_t * chp = NULL;
216 u_int16_t sequence[8];
217 int extra = 0;
1c79356b
A
218 int charcnt;
219 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
0b4e3aa0
A
220 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
221 int decompose = (flags & UTF_DECOMPOSED);
1c79356b
A
222 int result = 0;
223
224 bufstart = utf8p;
225 bufend = bufstart + buflen;
226 if (nullterm)
227 --bufend;
228 charcnt = ucslen / 2;
229
230 while (charcnt-- > 0) {
9bccf70c
A
231 if (extra > 0) {
232 --extra;
233 ucs_ch = *chp++;
0b4e3aa0 234 } else {
0c530ab8 235 ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
9bccf70c
A
236
237 if (decompose && unicode_decomposeable(ucs_ch)) {
238 extra = unicode_decompose(ucs_ch, sequence) - 1;
239 charcnt += extra;
240 ucs_ch = sequence[0];
241 chp = &sequence[1];
242 }
0b4e3aa0 243 }
1c79356b 244
0b4e3aa0
A
245 /* Slash and NULL are not permitted */
246 if (ucs_ch == '/') {
247 if (altslash)
248 ucs_ch = altslash;
249 else {
250 ucs_ch = '_';
251 result = EINVAL;
252 }
253 } else if (ucs_ch == '\0') {
254 ucs_ch = UCS_ALT_NULL;
255 }
1c79356b 256
0b4e3aa0 257 if (ucs_ch < 0x0080) {
1c79356b
A
258 if (utf8p >= bufend) {
259 result = ENAMETOOLONG;
260 break;
765c9de3 261 }
1c79356b
A
262 *utf8p++ = ucs_ch;
263
264 } else if (ucs_ch < 0x800) {
265 if ((utf8p + 1) >= bufend) {
266 result = ENAMETOOLONG;
267 break;
268 }
765c9de3
A
269 *utf8p++ = 0xc0 | (ucs_ch >> 6);
270 *utf8p++ = 0x80 | (0x3f & ucs_ch);
1c79356b
A
271
272 } else {
765c9de3
A
273 /* Combine valid surrogate pairs */
274 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
275 && charcnt > 0) {
276 u_int16_t ch2;
277 u_int32_t pair;
278
0c530ab8 279 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
765c9de3
A
280 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
281 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
282 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
283 if ((utf8p + 3) >= bufend) {
284 result = ENAMETOOLONG;
285 break;
286 }
287 --charcnt;
288 ++ucsp;
289 *utf8p++ = 0xf0 | (pair >> 18);
290 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
291 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
292 *utf8p++ = 0x80 | (0x3f & pair);
293 continue;
294 }
295 }
1c79356b
A
296 if ((utf8p + 2) >= bufend) {
297 result = ENAMETOOLONG;
298 break;
299 }
765c9de3
A
300 *utf8p++ = 0xe0 | (ucs_ch >> 12);
301 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
302 *utf8p++ = 0x80 | (0x3f & ucs_ch);
1c79356b
A
303 }
304 }
305
306 *utf8len = utf8p - bufstart;
307 if (nullterm)
308 *utf8p++ = '\0';
309
310 return (result);
311}
312
313
314/*
765c9de3 315 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
1c79356b
A
316 *
317 * NOTES:
318 * The input UTF-8 string does not need to be null terminated
319 * if utf8len is set.
320 *
321 * If '/' chars are allowed on disk then an alternate
322 * (replacement) char must be provided in altslash.
323 *
324 * input flags:
765c9de3
A
325 * UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime
326 * UTF_DECOMPOSED: Unicode output string must be fully decompsed
0b4e3aa0
A
327 *
328 * result:
329 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
330 * EINVAL: Illegal UTF-8 sequence found.
1c79356b
A
331 */
332int
765c9de3
A
333utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
334 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
1c79356b
A
335{
336 u_int16_t* bufstart;
337 u_int16_t* bufend;
55e303ae
A
338 unsigned int ucs_ch;
339 unsigned int byte;
13fec989 340 int combcharcnt = 0;
1c79356b 341 int result = 0;
0b4e3aa0 342 int decompose, precompose, swapbytes;
1c79356b 343
0b4e3aa0
A
344 decompose = (flags & UTF_DECOMPOSED);
345 precompose = (flags & UTF_PRECOMPOSED);
346 swapbytes = (flags & UTF_REVERSE_ENDIAN);
1c79356b
A
347
348 bufstart = ucsp;
349 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
350
351 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
765c9de3
A
352 if (ucsp >= bufend)
353 goto toolong;
1c79356b
A
354
355 /* check for ascii */
356 if (byte < 0x80) {
55e303ae 357 ucs_ch = byte; /* 1st byte */
1c79356b 358 } else {
765c9de3
A
359 u_int32_t ch;
360 int extrabytes = utf_extrabytes[byte >> 3];
361
362 if (utf8len < extrabytes)
363 goto invalid;
364 utf8len -= extrabytes;
365
366 switch (extrabytes) {
55e303ae
A
367 case 1:
368 ch = byte; ch <<= 6; /* 1st byte */
369 byte = *utf8p++; /* 2nd byte */
370 if ((byte >> 6) != 2)
371 goto invalid;
372 ch += byte;
373 ch -= 0x00003080UL;
374 if (ch < 0x0080)
375 goto invalid;
376 ucs_ch = ch;
765c9de3 377 break;
55e303ae
A
378 case 2:
379 ch = byte; ch <<= 6; /* 1st byte */
380 byte = *utf8p++; /* 2nd byte */
381 if ((byte >> 6) != 2)
382 goto invalid;
383 ch += byte; ch <<= 6;
384 byte = *utf8p++; /* 3rd byte */
385 if ((byte >> 6) != 2)
386 goto invalid;
387 ch += byte;
388 ch -= 0x000E2080UL;
389 if (ch < 0x0800)
390 goto invalid;
391 if (ch >= 0xD800) {
392 if (ch <= 0xDFFF)
765c9de3 393 goto invalid;
55e303ae
A
394 if (ch == 0xFFFE || ch == 0xFFFF)
395 goto invalid;
396 }
397 ucs_ch = ch;
398 break;
399 case 3:
400 ch = byte; ch <<= 6; /* 1st byte */
401 byte = *utf8p++; /* 2nd byte */
402 if ((byte >> 6) != 2)
403 goto invalid;
404 ch += byte; ch <<= 6;
405 byte = *utf8p++; /* 3rd byte */
406 if ((byte >> 6) != 2)
407 goto invalid;
408 ch += byte; ch <<= 6;
409 byte = *utf8p++; /* 4th byte */
410 if ((byte >> 6) != 2)
411 goto invalid;
412 ch += byte;
413 ch -= 0x03C82080UL + SP_HALF_BASE;
414 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
415 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
416 goto invalid;
0c530ab8 417 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : ucs_ch;
55e303ae
A
418 if (ucsp >= bufend)
419 goto toolong;
420 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
421 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
422 goto invalid;
0c530ab8 423 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : ucs_ch;
765c9de3 424 continue;
1c79356b 425 default:
55e303ae 426 goto invalid;
1c79356b 427 }
1c79356b 428 if (decompose) {
9bccf70c
A
429 if (unicode_decomposeable(ucs_ch)) {
430 u_int16_t sequence[8];
431 int count, i;
0c530ab8 432
6601e61a 433 /* Before decomposing a new unicode character, sort
0c530ab8 434 * previous combining characters, if any, and reset
6601e61a
A
435 * the counter
436 */
437 if (combcharcnt > 1){
438 priortysort(ucsp - combcharcnt, combcharcnt);
439 }
440 combcharcnt = 0;
9bccf70c 441 count = unicode_decompose(ucs_ch, sequence);
1c79356b 442
9bccf70c
A
443 for (i = 0; i < count; ++i) {
444 ucs_ch = sequence[i];
0c530ab8 445 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : ucs_ch;
765c9de3
A
446 if (ucsp >= bufend)
447 goto toolong;
0b4e3aa0 448 }
13fec989 449 combcharcnt += count - 1;
0c530ab8 450 continue;
0b4e3aa0
A
451 }
452 } else if (precompose && (ucsp != bufstart)) {
453 u_int16_t composite, base;
454
9bccf70c 455 if (unicode_combinable(ucs_ch)) {
0c530ab8 456 base = swapbytes ? OSSwapInt16(*(ucsp - 1)) : *(ucsp - 1);
9bccf70c
A
457 composite = unicode_combine(base, ucs_ch);
458 if (composite) {
459 --ucsp;
460 ucs_ch = composite;
461 }
1c79356b
A
462 }
463 }
0b4e3aa0
A
464 if (ucs_ch == UCS_ALT_NULL)
465 ucs_ch = '\0';
1c79356b 466 }
1c79356b
A
467 if (ucs_ch == altslash)
468 ucs_ch = '/';
1c79356b 469
13fec989
A
470 /*
471 * Make multiple combining character sequences canonical
472 */
473 if (unicode_combinable(ucs_ch)) {
474 ++combcharcnt; /* start tracking a run */
475 } else if (combcharcnt) {
476 if (combcharcnt > 1) {
477 priortysort(ucsp - combcharcnt, combcharcnt);
478 }
479 combcharcnt = 0; /* start over */
480 }
0c530ab8 481 *ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : ucs_ch;
1c79356b 482 }
13fec989
A
483 /*
484 * Make a previous combining sequence canonical
485 */
486 if (combcharcnt > 1) {
487 priortysort(ucsp - combcharcnt, combcharcnt);
488 }
765c9de3
A
489
490exit:
1c79356b
A
491 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
492
493 return (result);
765c9de3
A
494
495invalid:
496 result = EINVAL;
497 goto exit;
498
499toolong:
500 result = ENAMETOOLONG;
501 goto exit;
1c79356b
A
502}
503
504
91447636
A
505/*
506 * utf8_validatestr - Check for a valid UTF-8 string.
507 */
508int
509utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
510{
511 unsigned int byte;
512 u_int32_t ch;
513 unsigned int ucs_ch;
514 size_t extrabytes;
515
516 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
517 if (byte < 0x80)
518 continue; /* plain ascii */
519
520 extrabytes = utf_extrabytes[byte >> 3];
521
522 if (utf8len < extrabytes)
523 goto invalid;
524 utf8len -= extrabytes;
525
526 switch (extrabytes) {
527 case 1:
528 ch = byte; ch <<= 6; /* 1st byte */
529 byte = *utf8p++; /* 2nd byte */
530 if ((byte >> 6) != 2)
531 goto invalid;
532 ch += byte;
533 ch -= 0x00003080UL;
534 if (ch < 0x0080)
535 goto invalid;
536 break;
537 case 2:
538 ch = byte; ch <<= 6; /* 1st byte */
539 byte = *utf8p++; /* 2nd byte */
540 if ((byte >> 6) != 2)
541 goto invalid;
542 ch += byte; ch <<= 6;
543 byte = *utf8p++; /* 3rd byte */
544 if ((byte >> 6) != 2)
545 goto invalid;
546 ch += byte;
547 ch -= 0x000E2080UL;
548 if (ch < 0x0800)
549 goto invalid;
550 if (ch >= 0xD800) {
551 if (ch <= 0xDFFF)
552 goto invalid;
553 if (ch == 0xFFFE || ch == 0xFFFF)
554 goto invalid;
555 }
556 break;
557 case 3:
558 ch = byte; ch <<= 6; /* 1st byte */
559 byte = *utf8p++; /* 2nd byte */
560 if ((byte >> 6) != 2)
561 goto invalid;
562 ch += byte; ch <<= 6;
563 byte = *utf8p++; /* 3rd byte */
564 if ((byte >> 6) != 2)
565 goto invalid;
566 ch += byte; ch <<= 6;
567 byte = *utf8p++; /* 4th byte */
568 if ((byte >> 6) != 2)
569 goto invalid;
570 ch += byte;
571 ch -= 0x03C82080UL + SP_HALF_BASE;
572 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
573 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
574 goto invalid;
575 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
576 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
577 goto invalid;
578 break;
579 default:
580 goto invalid;
581 }
582
583 }
584 return (0);
585invalid:
586 return (EINVAL);
587}
588
589
9bccf70c
A
590 /*
591 * Unicode 3.2 decomposition code (derived from Core Foundation)
592 */
1c79356b 593
9bccf70c
A
/* One entry of a key-sorted 32-bit lookup table */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * getmappedvalue32 - binary search a table of 32-bit key/value pairs
 *
 * theTable must be sorted by ascending _key and hold numElem entries.
 * Returns the _value stored for character, or 0 when the table is
 * empty or has no entry for it.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings32 *lo, *hi, *mid;

	/* An empty table has no last element to range check against */
	if (numElem == 0)
		return (0);

	/* Quick reject for anything outside the table's key range */
	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	lo = theTable;
	hi = lo + (numElem-1);
	while (lo <= hi) {
		mid = lo + ((hi - lo) >> 1);	/* divide by 2 */
		if (character < mid->_key)
			hi = mid - 1;
		else if (character > mid->_key)
			lo = mid + 1;
		else
			return (mid->_value);
	}
	return (0);
}
1c79356b 618
9bccf70c
A
/*
 * Fields of a 16-bit decomposition table value:
 *   bit 15      flag tested before decomposing the first mapped char again
 *   bits 12-14  number of chars in the decomposition
 */
#define RECURSIVE_DECOMPOSITION	(1 << 15)
#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)
1c79356b 621
9bccf70c
A
/* One entry of a key-sorted 16-bit lookup table */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * getmappedvalue16 - binary search a table of 16-bit key/value pairs
 *
 * theTable must be sorted by ascending _key and hold numElem entries.
 * Returns the _value stored for character, or 0 when the table is
 * empty or has no entry for it.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
		u_int16_t character)
{
	const unicode_mappings16 *lo, *hi, *mid;

	/* An empty table has no last element to range check against */
	if (numElem == 0)
		return (0);

	/* Quick reject for anything outside the table's key range */
	if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
		return (0);

	lo = theTable;
	hi = lo + (numElem-1);
	while (lo <= hi) {
		mid = lo + ((hi - lo) >> 1);	/* divide by 2 */
		if (character < mid->_key)
			hi = mid - 1;
		else if (character > mid->_key)
			lo = mid + 1;
		else
			return (mid->_value);
	}
	return (0);
}
649
650
651static u_int32_t
652unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
653{
654 u_int16_t value;
655 u_int32_t length;
656 u_int16_t firstChar;
657 u_int16_t theChar;
658 const u_int16_t *bmpMappings;
659 u_int32_t usedLength;
660
661 value = getmappedvalue16(
662 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
663 __UniCharDecompositionTableLength, character);
664 length = EXTRACT_COUNT(value);
665 firstChar = value & 0x0FFF;
666 theChar = firstChar;
667 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
668 usedLength = 0;
669
670 if (value & RECURSIVE_DECOMPOSITION) {
671 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
672
673 --length; /* Decrement for the first char */
674 if (!usedLength)
675 return 0;
676 ++bmpMappings;
677 convertedChars += usedLength;
678 }
0b4e3aa0 679
9bccf70c 680 usedLength += length;
0b4e3aa0 681
9bccf70c
A
682 while (length--)
683 *(convertedChars++) = *(bmpMappings++);
0b4e3aa0 684
9bccf70c
A
685 return (usedLength);
686}
687
/* Bases used for Hangul syllable <-> Jamo arithmetic */
#define HANGUL_SBASE	0xAC00	/* precomposed syllables */
#define HANGUL_LBASE	0x1100	/* leading consonants */
#define HANGUL_VBASE	0x1161	/* vowels */
#define HANGUL_TBASE	0x11A7	/* trailing consonants (base itself = none) */

/* Number of values in each of the ranges above */
#define HANGUL_SCOUNT	11172
#define HANGUL_LCOUNT	19
#define HANGUL_VCOUNT	21
#define HANGUL_TCOUNT	28
#define HANGUL_NCOUNT	(HANGUL_VCOUNT * HANGUL_TCOUNT)
1c79356b
A
698
699/*
9bccf70c 700 * unicode_decompose - decompose a composed Unicode char
1c79356b
A
701 *
702 * Composed Unicode characters are forbidden on
703 * HFS Plus volumes. ucs_decompose will convert a
704 * composed character into its correct decomposed
705 * sequence.
706 *
9bccf70c 707 * Similar to CFUniCharDecomposeCharacter
1c79356b 708 */
9bccf70c
A
709static int
710unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
1c79356b 711{
9bccf70c
A
712 if ((character >= HANGUL_SBASE) &&
713 (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
714 u_int32_t length;
715
716 character -= HANGUL_SBASE;
717 length = (character % HANGUL_TCOUNT ? 3 : 2);
718
719 *(convertedChars++) =
720 character / HANGUL_NCOUNT + HANGUL_LBASE;
721 *(convertedChars++) =
722 (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
723 if (length > 2)
724 *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
725 return (length);
1c79356b 726 } else {
9bccf70c 727 return (unicode_recursive_decompose(character, convertedChars));
1c79356b 728 }
1c79356b
A
729}
730
0b4e3aa0 731/*
9bccf70c 732 * unicode_combine - generate a precomposed Unicode char
0b4e3aa0
A
733 *
734 * Precomposed Unicode characters are required for some volume
9bccf70c
A
735 * formats and network protocols. unicode_combine will combine
736 * a decomposed character sequence into a single precomposed
0b4e3aa0
A
737 * (composite) character.
738 *
9bccf70c
A
739 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
740 * also handles Hangul Jamo characters.
0b4e3aa0
A
741 */
742static u_int16_t
9bccf70c 743unicode_combine(u_int16_t base, u_int16_t combining)
0b4e3aa0 744{
9bccf70c
A
745 u_int32_t value;
746
747 /* Check HANGUL */
748 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
749 /* 2 char Hangul sequences */
750 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
751 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
752 return (HANGUL_SBASE +
753 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
754 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
0b4e3aa0 755 }
9bccf70c
A
756
757 /* 3 char Hangul sequences */
758 if ((combining > HANGUL_TBASE) &&
759 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
760 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
761 return (0);
762 else
763 return (base + (combining - HANGUL_TBASE));
0b4e3aa0 764 }
0b4e3aa0
A
765 }
766
9bccf70c
A
767 value = getmappedvalue32(
768 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
769 __CFUniCharPrecompositionTableLength, combining);
0b4e3aa0 770
9bccf70c
A
771 if (value) {
772 value = getmappedvalue16(
773 (const unicode_mappings16 *)
774 ((u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
775 (value >> 16), base);
0b4e3aa0 776 }
9bccf70c 777 return (value);
0b4e3aa0
A
778}
779
13fec989
A
780
/*
 * priortysort - order combining chars into canonical order
 *
 * Sorts count characters in place by ascending combining class using
 * repeated adjacent swaps.  Neighbours are exchanged only when the left
 * class is greater than the right class AND the right class is not 0,
 * so a class 0 character is never moved ahead of the one before it and
 * characters of equal class keep their relative order.
 *
 * The characters must be in host byte order.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
priortysort(u_int16_t* characters, int count)
{
	u_int32_t p1, p2;
	u_int16_t *ch1, *ch2;
	u_int16_t *end;
	int changes;

	/* Fewer than two chars: nothing to order (and nothing to read) */
	if (count < 2)
		return;

	end = characters + count;
	do {
		changes = 0;
		ch1 = characters;
		ch2 = characters + 1;
		p2 = get_combining_class(*ch1);
		while (ch2 < end) {
			p1 = p2;
			p2 = get_combining_class(*ch2);
			if (p1 > p2 && p2 != 0) {
				u_int16_t tmp;

				tmp = *ch1;
				*ch1 = *ch2;
				*ch2 = tmp;
				changes = 1;

				/*
				 * p2 must describe the char now stored at
				 * *ch2, which is the one that had class p1.
				 */
				p2 = p1;
			}
			++ch1;
			++ch2;
		}
	} while (changes);
}