]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_utfconv.c
b0173cfb469f0e1237d09c9aeddcd27967f1b3e2
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
1 /*
2 * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23 #include <sys/param.h>
24 #include <sys/utfconv.h>
25 #include <sys/errno.h>
26 #include <architecture/byte_order.h>
27
28
29 /*
30 * UTF-8 (UCS Transformation Format)
31 *
32 * The following subset of UTF-8 is used to encode UCS-2 filenames. It
33 * requires a maximum of three bytes per UCS-2 character. Only the
34 * shortest encoding required to represent the significant UCS-2 bits
35 * is legal.
36 *
37 * UTF-8 Multibyte Codes
38 *
39 * Bytes Bits UCS-2 Min UCS-2 Max UTF-8 Byte Sequence (binary)
40 * -------------------------------------------------------------------
41 * 1 7 0x0000 0x007F 0xxxxxxx
42 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
43 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
44 * -------------------------------------------------------------------
45 */
46
47
/* Number of UTF-8 bytes (1, 2 or 3) needed for one UCS-2 value; (c) may be evaluated twice. */
48 #define UCS_TO_UTF8_LEN(c) ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : 3))
49
/* Stand-in written by the encoders for an embedded NUL; utf8_decodestr maps it back to '\0'. */
50 #define UCS_ALT_NULL 0x2400
51
52
/* Returns the base char of a composed char and stores up to two combining chars (0 when unused). */
53 static u_int16_t ucs_decompose __P((u_int16_t, u_int16_t *));
54
/* Returns the precomposed char for base + comb, or 0 when the pair cannot be combined. */
55 static u_int16_t ucs_combine(u_int16_t base, u_int16_t comb);
56
57
58 /*
59 * utf8_encodelen - Calculates the UTF-8 encoding length for a UCS-2 filename
60 *
61 * NOTES:
62 * If '/' chars are allowed on disk then an alternate
63 * (replacement) char must be provided in altslash.
64 *
65 * input flags:
66 * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
67 */
68 size_t
69 utf8_encodelen(ucsp, ucslen, altslash, flags)
70 const u_int16_t * ucsp;
71 size_t ucslen;
72 u_int16_t altslash;
73 int flags;
74 {
75 u_int16_t ucs_ch;
76 int charcnt;
77 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
78 size_t len;
79
80 charcnt = ucslen / 2;
81 len = 0;
82
83 while (charcnt-- > 0) {
84 ucs_ch = *ucsp++;
85
86 if (swapbytes)
87 ucs_ch = NXSwapShort(ucs_ch);
88 if (ucs_ch == '/')
89 ucs_ch = altslash ? altslash : '_';
90 else if (ucs_ch == '\0')
91 ucs_ch = UCS_ALT_NULL;
92
93 len += UCS_TO_UTF8_LEN(ucs_ch);
94 }
95
96 return (len);
97 }
98
99
100 /*
101 * utf8_encodestr - Encodes a UCS-2 (Unicode) string to UTF-8
102 *
103 * NOTES:
104 * The resulting UTF-8 string is NULL terminated.
105 *
106 * If '/' chars are allowed on disk then an alternate
107 * (replacement) char must be provided in altslash.
108 *
109 * input flags:
110 * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
111 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
112 *
113 * result:
114 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
115 * EINVAL: Illegal char found; char was replaced by an '_'.
116 */
117 int utf8_encodestr(ucsp, ucslen, utf8p, utf8len, buflen, altslash, flags)
118 const u_int16_t * ucsp;
119 size_t ucslen;
120 u_int8_t * utf8p;
121 size_t * utf8len;
122 size_t buflen;
123 u_int16_t altslash;
124 int flags;
125 {
126 u_int8_t * bufstart;
127 u_int8_t * bufend;
128 u_int16_t ucs_ch;
129 u_int16_t extra[2] = {0};
130 int charcnt;
131 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
132 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
133 int decompose = (flags & UTF_DECOMPOSED);
134 int result = 0;
135
136 bufstart = utf8p;
137 bufend = bufstart + buflen;
138 if (nullterm)
139 --bufend;
140 charcnt = ucslen / 2;
141
142 while (charcnt-- > 0) {
143 if (!decompose)
144 ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
145 else if (extra[0]) {
146 ucs_ch = extra[0]; extra[0] = 0;
147 } else if (extra[1]) {
148 ucs_ch = extra[1]; extra[1] = 0;
149 } else {
150 ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
151 ucs_ch = ucs_decompose(ucs_ch, &extra[0]);
152 if (extra[0])
153 charcnt++;
154 if (extra[1])
155 charcnt++;
156 }
157
158 /* Slash and NULL are not permitted */
159 if (ucs_ch == '/') {
160 if (altslash)
161 ucs_ch = altslash;
162 else {
163 ucs_ch = '_';
164 result = EINVAL;
165 }
166 } else if (ucs_ch == '\0') {
167 ucs_ch = UCS_ALT_NULL;
168 }
169
170 if (ucs_ch < 0x0080) {
171 if (utf8p >= bufend) {
172 result = ENAMETOOLONG;
173 break;
174 }
175 *utf8p++ = ucs_ch;
176
177 } else if (ucs_ch < 0x800) {
178 if ((utf8p + 1) >= bufend) {
179 result = ENAMETOOLONG;
180 break;
181 }
182 *utf8p++ = (ucs_ch >> 6) | 0xc0;
183 *utf8p++ = (ucs_ch & 0x3f) | 0x80;
184
185 } else {
186 if ((utf8p + 2) >= bufend) {
187 result = ENAMETOOLONG;
188 break;
189 }
190 *utf8p++ = (ucs_ch >> 12) | 0xe0;
191 *utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;
192 *utf8p++ = ((ucs_ch) & 0x3f) | 0x80;
193 }
194 }
195
196 *utf8len = utf8p - bufstart;
197 if (nullterm)
198 *utf8p++ = '\0';
199
200 return (result);
201 }
202
203
204 /*
205 * utf8_decodestr - Decodes a UTF-8 string back to UCS-2 (Unicode)
206 *
207 * NOTES:
208 * The input UTF-8 string does not need to be null terminated
209 * if utf8len is set.
210 *
211 * If '/' chars are allowed on disk then an alternate
212 * (replacement) char must be provided in altslash.
213 *
214 * input flags:
215 * UTF_REV_ENDIAN: UCS-2 byteorder is oposite current runtime
216 * UTF_DECOMPOSED: UCS-2 output string must be fully decompsed
217 *
218 * result:
219 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
220 * EINVAL: Illegal UTF-8 sequence found.
221 */
222 int
223 utf8_decodestr(utf8p, utf8len, ucsp, ucslen, buflen, altslash, flags)
224 const u_int8_t* utf8p;
225 size_t utf8len;
226 u_int16_t* ucsp;
227 size_t *ucslen;
228 size_t buflen;
229 u_int16_t altslash;
230 int flags;
231 {
232 u_int16_t* bufstart;
233 u_int16_t* bufend;
234 u_int16_t ucs_ch;
235 u_int8_t byte;
236 int result = 0;
237 int decompose, precompose, swapbytes;
238
239 decompose = (flags & UTF_DECOMPOSED);
240 precompose = (flags & UTF_PRECOMPOSED);
241 swapbytes = (flags & UTF_REVERSE_ENDIAN);
242
243 bufstart = ucsp;
244 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
245
246 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
247 if (ucsp >= bufend) {
248 result = ENAMETOOLONG;
249 goto stop;
250 }
251
252 /* check for ascii */
253 if (byte < 0x80) {
254 ucs_ch = byte;
255 } else {
256 switch (byte & 0xf0) {
257 /* 2 byte sequence */
258 case 0xc0:
259 case 0xd0:
260 /* extract bits 6 - 10 from first byte */
261 ucs_ch = (byte & 0x1F) << 6;
262 if (ucs_ch < 0x0080) {
263 result = EINVAL; /* seq not minimal */
264 goto stop;
265 }
266 break;
267 /* 3 byte sequence */
268 case 0xe0:
269 /* extract bits 12 - 15 from first byte */
270 ucs_ch = (byte & 0x0F) << 6;
271
272 /* extract bits 6 - 11 from second byte */
273 if (((byte = *utf8p++) & 0xc0) != 0x80) {
274 result = EINVAL;
275 goto stop;
276 }
277 utf8len--;
278
279 ucs_ch += (byte & 0x3F);
280 ucs_ch <<= 6;
281
282 if (ucs_ch < 0x0800) {
283 result = EINVAL; /* sequence not minimal */
284 goto stop;
285 }
286 break;
287 default:
288 result = EINVAL;
289 goto stop;
290 }
291
292 /* extract bits 0 - 5 from final byte */
293 if (((byte = *utf8p++) & 0xc0) != 0x80) {
294 result = EINVAL;
295 goto stop;
296 }
297 utf8len--;
298 ucs_ch += (byte & 0x3F);
299
300 if (decompose) {
301 u_int16_t comb_ch[2];
302
303 ucs_ch = ucs_decompose(ucs_ch, &comb_ch[0]);
304
305 if (comb_ch[0]) {
306 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
307 if (ucsp >= bufend) {
308 result = ENAMETOOLONG;
309 goto stop;
310 }
311 ucs_ch = comb_ch[0];
312 if (comb_ch[1]) {
313 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
314 if (ucsp >= bufend) {
315 result = ENAMETOOLONG;
316 goto stop;
317 }
318 ucs_ch = comb_ch[1];
319 }
320 }
321 } else if (precompose && (ucsp != bufstart)) {
322 u_int16_t composite, base;
323
324 base = swapbytes ? NXSwapShort(*(ucsp - 1)) : *(ucsp - 1);
325 composite = ucs_combine(base, ucs_ch);
326 if (composite) {
327 --ucsp;
328 ucs_ch = composite;
329 }
330 }
331 if (ucs_ch == UCS_ALT_NULL)
332 ucs_ch = '\0';
333 }
334
335 if (ucs_ch == altslash)
336 ucs_ch = '/';
337 if (swapbytes)
338 ucs_ch = NXSwapShort(ucs_ch);
339
340 *ucsp++ = ucs_ch;
341 }
342 stop:
343 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
344
345 return (result);
346 }
347
348
349 /*
350 * Lookup tables for Unicode chars 0x00C0 thru 0x01DF, indexed
351 * by (ch - 0x00C0). primary_char yields the first decomposed
352 * char, or 0x00 when ucs_decompose leaves the char unchanged.
353 * The combining char is the combining_char entry plus 0x0300.
354 * An entry such as 0xDC that itself has a non-zero entry here is
 * decomposed a second time by ucs_decompose.
 * The trailing comment on each row gives the last code point of that row.
 */
355
356 static unsigned char primary_char[8*36] = {
357 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x00, 0x43, /* C7 */
358
359 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49, /* CF */
360
361 0x00, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0x00, /* D7 */
362
363 0x00, 0x55, 0x55, 0x55, 0x55, 0x59, 0x00, 0x00, /* DF */
364
365 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x00, 0x63, /* E7 */
366
367 0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69, /* EF */
368
369 0x00, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0x00, /* F7 */
370
371 0x00, 0x75, 0x75, 0x75, 0x75, 0x79, 0x00, 0x79, /* FF */
372
373 0x41, 0x61, 0x41, 0x61, 0x41, 0x61, 0x43, 0x63, /* 107 */
374
375 0x43, 0x63, 0x43, 0x63, 0x43, 0x63, 0x44, 0x64, /* 10F */
376
377 0x00, 0x00, 0x45, 0x65, 0x45, 0x65, 0x45, 0x65, /* 117 */
378
379 0x45, 0x65, 0x45, 0x65, 0x47, 0x67, 0x47, 0x67, /* 11F */
380
381 0x47, 0x67, 0x47, 0x67, 0x48, 0x68, 0x00, 0x00, /* 127 */
382
383 0x49, 0x69, 0x49, 0x69, 0x49, 0x69, 0x49, 0x69, /* 12F */
384
385 0x49, 0x00, 0x00, 0x00, 0x4A, 0x6A, 0x4B, 0x6B, /* 137 */
386
387 0x00, 0x4C, 0x6C, 0x4C, 0x6C, 0x4C, 0x6C, 0x00, /* 13F */
388
389 0x00, 0x00, 0x00, 0x4E, 0x6E, 0x4E, 0x6E, 0x4E, /* 147 */
390
391 0x6E, 0x00, 0x00, 0x00, 0x4F, 0x6F, 0x4F, 0x6F, /* 14F */
392
393 0x4F, 0x6F, 0x00, 0x00, 0x52, 0x72, 0x52, 0x72, /* 157 */
394
395 0x52, 0x72, 0x53, 0x73, 0x53, 0x73, 0x53, 0x73, /* 15F */
396
397 0x53, 0x73, 0x54, 0x74, 0x54, 0x74, 0x00, 0x00, /* 167 */
398
399 0x55, 0x75, 0x55, 0x75, 0x55, 0x75, 0x55, 0x75, /* 16F */
400
401 0x55, 0x75, 0x55, 0x75, 0x57, 0x77, 0x59, 0x79, /* 177 */
402
403 0x59, 0x5A, 0x7A, 0x5A, 0x7A, 0x5A, 0x7A, 0x00, /* 17F */
404
405 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 187 */
406
407 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 18F */
408
409 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 197 */
410
411 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 19F */
412
413 0x4F, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1A7 */
414
415 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, /* 1AF */
416
417 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1B7 */
418
419 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1BF */
420
421 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1C7 */
422
423 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x61, 0x49, /* 1CF */
424
425 0x69, 0x4F, 0x6F, 0x55, 0x75, 0xDC, 0xFC, 0xDC, /* 1D7 */
426
427 0xFC, 0xDC, 0xFC, 0xDC, 0xFC, 0x00, 0xC4, 0xE4 /* 1DF */
428
429 };
430
/*
 * Low byte of the combining character for Unicode chars 0x00C0
 * thru 0x01DF, indexed by (ch - 0x00C0); ucs_decompose adds 0x0300.
 * An entry is only read when the matching primary_char entry is
 * non-zero, so the values in the other slots (0xFF in the first
 * rows, 0x00 later) are filler.
 * The trailing comment on each row gives the last code point of that row.
 */
431 static unsigned char combining_char[8*36] = {
432 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27, /* C7 */
433
434 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08, /* CF */
435
436 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF, /* D7 */
437
438 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF, /* DF */
439
440 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27, /* E7 */
441
442 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08, /* EF */
443
444 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF, /* F7 */
445
446 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08, /* FF */
447
448 0x04, 0x04, 0x06, 0x06, 0x28, 0x28, 0x01, 0x01, /* 107 */
449
450 0x02, 0x02, 0x07, 0x07, 0x0C, 0x0C, 0x0C, 0x0C, /* 10F */
451
452 0x00, 0x00, 0x04, 0x04, 0x06, 0x06, 0x07, 0x07, /* 117 */
453
454 0x28, 0x28, 0x0C, 0x0C, 0x02, 0x02, 0x06, 0x06, /* 11F */
455
456 0x07, 0x07, 0x27, 0x27, 0x02, 0x02, 0x00, 0x00, /* 127 */
457
458 0x03, 0x03, 0x04, 0x04, 0x06, 0x06, 0x28, 0x28, /* 12F */
459
460 0x07, 0x00, 0x00, 0x00, 0x02, 0x02, 0x27, 0x27, /* 137 */
461
462 0x00, 0x01, 0x01, 0x27, 0x27, 0x0C, 0x0C, 0x00, /* 13F */
463
464 0x00, 0x00, 0x00, 0x01, 0x01, 0x27, 0x27, 0x0C, /* 147 */
465
466 0x0C, 0x00, 0x00, 0x00, 0x04, 0x04, 0x06, 0x06, /* 14F */
467
468 0x0B, 0x0B, 0x00, 0x00, 0x01, 0x01, 0x27, 0x27, /* 157 */
469
470 0x0C, 0x0C, 0x01, 0x01, 0x02, 0x02, 0x27, 0x27, /* 15F */
471
472 0x0C, 0x0C, 0x27, 0x27, 0x0C, 0x0C, 0x00, 0x00, /* 167 */
473
474 0x03, 0x03, 0x04, 0x04, 0x06, 0x06, 0x0A, 0x0A, /* 16F */
475
476 0x0B, 0x0B, 0x28, 0x28, 0x02, 0x02, 0x02, 0x02, /* 177 */
477
478 0x08, 0x01, 0x01, 0x07, 0x07, 0x0C, 0x0C, 0x00, /* 17F */
479
480 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 187 */
481
482 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 18F */
483
484 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 197 */
485
486 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 19F */
487
488 0x1B, 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1A7 */
489
490 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1B, /* 1AF */
491
492 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1B7 */
493
494 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1BF */
495
496 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1C7 */
497
498 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x0C, /* 1CF */
499
500 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x04, 0x04, 0x01, /* 1D7 */
501
502 0x01, 0x0C, 0x0C, 0x00, 0x00, 0x00, 0x04, 0x04 /* 1DF */
503 };
504
505
/*
 * Decomposition bitmaps: one bit per code point, most significant
 * bit first, 32 code points per array element.  A set bit means
 * ucs_decompose has a decomposition for that code point.
 */

/* CYRILLIC codepoints 0x0400 ~ 0x04FF */
static const unsigned long __CyrillicDecompBitmap[] = {
	0x40000040, 0x00000040, 0x00004000, 0x00000000,	/* 0x0400 */
	0x00000000, 0x00000000, 0x00000000, 0x00000000,	/* 0x0480 */
};

/* CJK codepoints 0x3000 ~ 0x30FF */
static const unsigned long __CJKDecompBitmap[] = {
	0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C,	/* 0x3000 */
	0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2,	/* 0x3080 */
};

/*
 * Non-zero when the bit for unicodeVal (an offset from the start of
 * the table's range) is set.  The mask is built from 1UL because
 * shifting a signed 1 into bit 31 is undefined behaviour, and that
 * shift is reached for every offset that is a multiple of 32.
 * unicodeVal is evaluated twice.
 */
#define IS_DECOMPOSABLE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
519
520 /*
521 * ucs_decompose - decompose a composed UCS-2 char
522 *
523 * Composed Unicode characters are forbidden on
524 * HFS Plus volumes. ucs_decompose will convert a
525 * composed character into its correct decomposed
526 * sequence.
527 *
528 * Currently only Tier-1 and Tier-2 languages
529 * are handled. Other composed characters are
530 * passed unchanged.
531 */
532 static u_int16_t
533 ucs_decompose(register u_int16_t ch, u_int16_t *cmb)
534 {
535 u_int16_t base;
536
537 cmb[0] = 0;
538 cmb[1] = 0;
539
540 if (ch < 0x00C0) {
541 base = ch;
542 } else if (ch <= 0x01DF) {
543
544 base = (u_int16_t) primary_char[ch - 0x00C0];
545
546 if (base == 0)
547 base = ch;
548 else {
549 if ((base < 0x00C0) || (primary_char[base - 0x00C0] == 0))
550 cmb[0] = (u_int16_t)0x0300 + (u_int16_t)combining_char[ch - 0x00C0];
551 else {
552 u_int16_t tch = base;
553
554 base = (u_int16_t)primary_char[tch - 0x00C0];
555 cmb[0] = (u_int16_t)0x0300 + (u_int16_t)combining_char[tch - 0x00C0];
556 cmb[1] = (u_int16_t)0x0300 + (u_int16_t)combining_char[ch - 0x00C0];
557 }
558 }
559 } else if ((ch >= 0x0400) && (ch <= 0x04FF) &&
560 IS_DECOMPOSABLE(__CyrillicDecompBitmap, ch - 0x0400)) {
561
562 /* Handle CYRILLIC LETTERs */
563 switch(ch) {
564 case 0x0401: base = 0x0415; cmb[0] = 0x0308; break; /* */
565 case 0x0419: base = 0x0418; cmb[0] = 0x0306; break; /* */
566 case 0x0439: base = 0x0438; cmb[0] = 0x0306; break; /* */
567 case 0x0451: base = 0x0435; cmb[0] = 0x0308; break; /* */
568
569 default:
570 /* Should not be hit from bit map table */
571 base = ch;
572 }
573 } else if (ch == 0x1E3F) {
574 base = 0x006D; cmb[0] = 0x0301; /* LATIN SMALL LETTER M WITH ACUTE */
575 } else if ((ch > 0x3000) && (ch < 0x3100) &&
576 IS_DECOMPOSABLE(__CJKDecompBitmap, ch - 0x3000)) {
577
578 /* Handle HIRAGANA LETTERs */
579 switch(ch) {
580 case 0x3071: base = 0x306F; cmb[0] = 0x309A; break; /* PA */
581 case 0x3074: base = 0x3072; cmb[0] = 0x309A; break; /* PI */
582 case 0x3077: base = 0x3075; cmb[0] = 0x309A; break; /* PU */
583 case 0x307A: base = 0x3078; cmb[0] = 0x309A; break; /* PE */
584
585 case 0x307D: base = 0x307B; cmb[0] = 0x309A; break; /* PO */
586 case 0x3094: base = 0x3046; cmb[0] = 0x3099; break; /* VU */
587 case 0x30D1: base = 0x30CF; cmb[0] = 0x309A; break; /* PA */
588 case 0x30D4: base = 0x30D2; cmb[0] = 0x309A; break; /* PI */
589
590 case 0x30D7: base = 0x30D5; cmb[0] = 0x309A; break; /* PU */
591 case 0x30DA: base = 0x30D8; cmb[0] = 0x309A; break; /* PE */
592 case 0x30DD: base = 0x30DB; cmb[0] = 0x309A; break; /* PO */
593 case 0x30F4: base = 0x30A6; cmb[0] = 0x3099; break; /* VU */
594
595 case 0x30F7: base = 0x30EF; cmb[0] = 0x3099; break; /* VA */
596 case 0x30F8: base = 0x30F0; cmb[0] = 0x3099; break; /* VI */
597 case 0x30F9: base = 0x30F1; cmb[0] = 0x3099; break; /* VE */
598 case 0x30FA: base = 0x30F2; cmb[0] = 0x3099; break; /* VO */
599
600 default:
601 /* the rest (41 of them) have a simple conversion */
602 base = ch - 1;
603 cmb[0] = 0x3099;
604 }
605 } else if ((ch >= 0xAC00) && (ch < 0xD7A4)) {
606 /* Hangul */
607 ch -= 0xAC00;
608 base = 0x1100 + (ch / (21*28));
609 cmb[0] = 0x1161 + (ch % (21*28)) / 28;
610
611 if (ch % 28)
612 cmb[1] = 0x11A7 + (ch % 28);
613 } else {
614 base = ch;
615 }
616
617 return (base);
618 }
619
620
/*
 * Indexed by (combining char - 0x0300) for 0x0300 thru 0x032F.
 * Each entry is the offset of that diacritic's 58 entry section in
 * composite_tbl (a multiple of 58), or -1 when ucs_combine has no
 * composites for the diacritic.
 */
621 static const short diacrit_tbl[8*6] = {
622 /* 300 - 307 */ 0, 58, 116, 174, 232, -1, 290, 348,
623 /* 308 - 30F */ 406, -1, 464, 522, 580, -1, -1, -1,
624 /* 310 - 317 */ -1, -1, -1, -1, -1, -1, -1, -1,
625 /* 318 - 31F */ -1, -1, -1, 638, -1, -1, -1, -1,
626 /* 320 - 327 */ -1, -1, -1, -1, -1, -1, -1, 696,
627 /* 328 - 32F */ 754, -1, -1, -1, -1, -1, -1, -1
628 };
629
/*
 * Precomposed characters for a base letter plus one diacritic.
 * There are 14 sections (one per diacritic with a non-negative
 * diacrit_tbl entry) of 58 entries each, one entry per base char
 * from 'A' thru 'z'.  ucs_combine reads
 * composite_tbl[diacrit_tbl[comb - 0x0300] + (base - 'A')];
 * a zero entry means the pair has no composite.
 */
630 static const u_int16_t composite_tbl[58*14] = {
631 /*
 * Column order within each section:
632 * A B C D E F G H I J K L M
633 * N O P Q R S T U V W X Y Z
634 * [ \ ] ^ _ `
635 * a b c d e f g h i j k l m
636 * n o p q r s t u v w x y z
637 */
638
639 /*
640 * 0x300 - grave accent
641 */
642 0x0C0, 0, 0, 0,0x0C8, 0, 0, 0,0x0CC, 0, 0, 0, 0,
643 0,0x0D2, 0, 0, 0, 0, 0,0x0D9, 0, 0, 0, 0, 0,
644 0, 0, 0, 0, 0, 0,
645 0x0E0, 0, 0, 0,0x0E8, 0, 0, 0,0x0EC, 0, 0, 0, 0,
646 0,0x0F2, 0, 0, 0, 0, 0,0x0F9, 0, 0, 0, 0, 0,
647 /*
648 * 0x301 - acute accent
649 */
650 0x0C1, 0,0x106, 0,0x0C9, 0, 0, 0,0x0CD, 0, 0,0x139, 0,
651 0x143,0x0D3, 0, 0,0x154,0x15A, 0,0x0DA, 0, 0, 0,0x0DD,0x179,
652 0, 0, 0, 0, 0, 0,
653 0x0E1, 0,0x107, 0,0x0E9, 0, 0, 0,0x0ED, 0, 0,0x13A,0x1E3F,
654 0x144,0x0F3, 0, 0,0x155,0x15B, 0,0x0FA, 0, 0, 0,0x0FD,0x17A,
655 /*
656 * 0x302 - circumflex accent
657 */
658 0x0C2, 0,0x108, 0,0x0CA, 0,0x11C,0x124,0x0CE,0x134, 0, 0, 0,
659 0,0x0D4, 0, 0, 0,0x15C, 0,0x0DB, 0,0x174, 0,0x176, 0,
660 0, 0, 0, 0, 0, 0,
661 0x0E2, 0,0x109, 0,0x0EA, 0,0x11D,0x125,0x0EE,0x135, 0, 0, 0,
662 0,0x0F4, 0, 0, 0,0x15D, 0,0x0FB, 0,0x175, 0,0x177, 0,
663 /*
664 * 0x303 - tilde
665 */
666 0x0C3, 0, 0, 0, 0, 0, 0, 0,0x128, 0, 0, 0, 0,
667 0x0D1,0x0D5, 0, 0, 0, 0, 0,0x168, 0, 0, 0, 0, 0,
668 0, 0, 0, 0, 0, 0,
669 0x0E3, 0, 0, 0, 0, 0, 0, 0,0x129, 0, 0, 0, 0,
670 0x0F1,0x0F5, 0, 0, 0, 0, 0,0x169, 0, 0, 0, 0, 0,
671 /*
672 * 0x304 - macron
673 */
674 0x100, 0, 0, 0,0x112, 0, 0, 0,0x12A, 0, 0, 0, 0,
675 0,0x14C, 0, 0, 0, 0, 0,0x16A, 0, 0, 0, 0, 0,
676 0, 0, 0, 0, 0, 0,
677 0x101, 0, 0, 0,0x113, 0, 0, 0,0x12B, 0, 0, 0, 0,
678 0,0x14D, 0, 0, 0, 0, 0,0x16B, 0, 0, 0, 0, 0,
679 /*
680 * 0x306 - breve
681 */
682 0x102, 0, 0, 0,0x114, 0,0x11E, 0,0x12C, 0, 0, 0, 0,
683 0,0x14E, 0, 0, 0, 0, 0,0x16C, 0, 0, 0, 0, 0,
684 0, 0, 0, 0, 0, 0,
685 0x103, 0, 0, 0,0x115, 0,0x11F, 0,0x12D, 0, 0, 0, 0,
686 0,0x14F, 0, 0, 0, 0, 0,0x16D, 0, 0, 0, 0, 0,
687 /*
688 * 0x307 - dot above
689 */
690 0, 0,0x10A, 0,0x116, 0,0x120, 0,0x130, 0, 0, 0, 0,
691 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0x17B,
692 0, 0, 0, 0, 0, 0,
693 0, 0,0x10B, 0,0x117, 0,0x121, 0, 0, 0, 0, 0, 0,
694 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0x17C,
695 /*
696 * 0x308 - diaeresis
697 */
698 0x0C4, 0, 0, 0,0x0CB, 0, 0, 0,0x0CF, 0, 0, 0, 0,
699 0,0x0D6, 0, 0, 0, 0, 0,0x0DC, 0, 0, 0,0x178, 0,
700 0, 0, 0, 0, 0, 0,
701 0x0E4, 0, 0, 0,0x0EB, 0, 0, 0,0x0EF, 0, 0, 0, 0,
702 0,0x0F6, 0, 0, 0, 0, 0,0x0FC, 0, 0, 0,0x0FF, 0,
703 /*
704 * 0x30A - ring above
705 */
706 0x0C5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
707 0, 0, 0, 0, 0, 0, 0,0x16E, 0, 0, 0, 0, 0,
708 0, 0, 0, 0, 0, 0,
709 0x0E5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
710 0, 0, 0, 0, 0, 0, 0,0x16F, 0, 0, 0, 0, 0,
711 /*
712 * 0x30B - double acute accent
713 */
714 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
715 0,0x150, 0, 0, 0, 0, 0,0x170, 0, 0, 0, 0, 0,
716 0, 0, 0, 0, 0, 0,
717 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
718 0,0x151, 0, 0, 0, 0, 0,0x171, 0, 0, 0, 0, 0,
719 /*
720 * 0x30C - caron
721 */
722 0x1CD, 0,0x10C,0x10E,0x11A, 0, 0, 0,0x1CF, 0, 0,0x13D, 0,
723 0x147,0x1D1, 0, 0,0x158,0x160,0x164,0x1D3, 0, 0, 0, 0,0x17D,
724 0, 0, 0, 0, 0, 0,
725 0x1CE, 0,0x10D,0x10F,0x11B, 0, 0, 0,0x1D0, 0, 0,0x13E, 0,
726 0x148,0x1D2, 0, 0,0x159,0x161,0x165,0x1D4, 0, 0, 0, 0,0x17E,
727 /*
728 * 0x31B - horn
729 */
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
731 0,0x1A0, 0, 0, 0, 0, 0,0x1AF, 0, 0, 0, 0, 0,
732 0, 0, 0, 0, 0, 0,
733 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
734 0,0x1A1, 0, 0, 0, 0, 0,0x1B0, 0, 0, 0, 0, 0,
735 /*
736 * 0x327 - cedilla
737 */
738 0, 0,0x0C7, 0, 0, 0,0x122, 0, 0, 0,0x136,0x13B, 0,
739 0x145, 0, 0, 0,0x156,0x15E,0x162, 0, 0, 0, 0, 0, 0,
740 0, 0, 0, 0, 0, 0,
741 0, 0,0x0E7, 0, 0, 0,0x123, 0, 0, 0,0x137,0x13C, 0,
742 0x146, 0, 0, 0,0x157,0x15F,0x163, 0, 0, 0, 0, 0, 0,
743 /*
744 * 0x328 - ogonek
745 */
746 0x104, 0, 0, 0,0x118, 0, 0, 0,0x12E, 0, 0, 0, 0,
747 0, 0, 0, 0, 0, 0, 0,0x172, 0, 0, 0, 0, 0,
748 0, 0, 0, 0, 0, 0,
749 0x105, 0, 0, 0,0x119, 0, 0, 0,0x12F, 0, 0, 0, 0,
750 0, 0, 0, 0, 0, 0, 0,0x173, 0, 0, 0, 0, 0,
751 };
752
753
/*
 * CJK codepoints 0x3000 ~ 0x30FF
 *
 * One bit per code point, most significant bit first, 32 code
 * points per array element.  A set bit marks a base character
 * that ucs_combine may join with 0x3099 or 0x309A.
 */
static const unsigned long __CJKCombBitmap[] = {
	0x00000000, 0x00000000, 0x02155555, 0x4A812490,	/* 0x3000 */
	0x00000004, 0x02155555, 0x4A812490, 0x0001E004,	/* 0x3080 */
};

/*
 * Non-zero when the bit for unicodeVal (an offset from 0x3000) is
 * set.  The mask is built from 1UL because shifting a signed 1 into
 * bit 31 is undefined behaviour, and that shift is reached for
 * every offset that is a multiple of 32.  unicodeVal is evaluated
 * twice.
 */
#define CAN_COMBINE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
761
762
763 /*
764 * ucs_combine - generate a precomposed UCS-2 char
765 *
766 * Precomposed Unicode characters are required for some volume
767 * formats and network protocols. ucs_combine will combine a
768 * decomposed character sequence into a single precomposed
769 * (composite) character.
770 *
771 * Currently only decomcomposed sequences from Apple's Tier 1
772 * and Tier 2 languages are handled.
773 *
774 * INPUT:
775 * base - base character
776 * comb - combining character
777 * OUTPUT:
778 * result - precomposed char or zero if not combinable
779 */
780 static u_int16_t
781 ucs_combine(u_int16_t base, u_int16_t comb)
782 {
783 /* Get out early if we can */
784 if (comb < 0x0300)
785 return (0);
786
787 /* Try ordinary diacritics (0x300 - 0x32F) */
788 if (comb <= 0x032F) {
789 int index;
790
791 if (base >= 'A' && base <= 'z') {
792 index = diacrit_tbl[comb - 0x0300];
793 if (index < 0 ) return (0);
794
795 return (composite_tbl[index + (base - 'A')]);
796 }
797
798 /* Handle Cyrillic and some 3 char latin sequences */
799 switch (comb) {
800 case 0x0300:
801 switch (base) {
802 case 0x00DC: return (0x01DB);
803 case 0x00FC: return (0x01DC);
804 } break;
805 case 0x0301:
806 switch (base) {
807 case 0x00DC: return (0x01D7);
808 case 0x00FC: return (0x01D8);
809 } break;
810 case 0x0304:
811 switch (base) {
812 case 0x00DC: return (0x01D5);
813 case 0x00FC: return (0x01D6);
814 case 0x00C4: return (0x01DE);
815 case 0x00E4: return (0x01DF);
816 } break;
817 case 0x0306:
818 switch (base) {
819 case 0x0418: return (0x0419);
820 case 0x0438: return (0x0439);
821 } break;
822 case 0x0308:
823 switch (base) {
824 case 0x0415: return (0x0401);
825 case 0x0435: return (0x0451);
826 } break;
827 case 0x030C:
828 switch (base) {
829 case 0x00DC: return (0x01D9);
830 case 0x00FC: return (0x01DA);
831 } break;
832 }
833 return (0);
834 }
835
836 /* Now try HANGUL */
837 if (comb < 0x1161)
838 return (0);
839
840 /* 2 char Hangul sequences */
841 if ((comb <= 0x1175) && (base >= 0x1100 && base <= 0x1112))
842 return (0xAC00 + ((base - 0x1100)*(21*28)) + ((comb - 0x1161)*28));
843
844 /* 3 char Hangul sequences */
845 if ((comb >= 0x11A8 && comb <= 0x11C2) &&
846 (base >= 0xAC00 && base <= 0xD788)) {
847 if ((base - 0xAC00) % 28)
848 return (0);
849 else
850 return (base + (comb - 0x11A7));
851 }
852
853 /* Now try HIRAGANA and KATAKANA */
854 if ((comb == 0x3099 || comb == 0x309A) &&
855 (base > 0x3000 && base < 0x3100) &&
856 CAN_COMBINE(__CJKCombBitmap, base - 0x3000)) {
857 if (comb == 0x309A) {
858 switch(base) {
859 case 0x306F: return (0x3071); /* PA */
860 case 0x3072: return (0x3074); /* PI */
861 case 0x3075: return (0x3077); /* PU */
862 case 0x3078: return (0x307A); /* PE */
863 case 0x307B: return (0x307D); /* PO */
864 case 0x30CF: return (0x30D1); /* PA */
865 case 0x30D2: return (0x30D4); /* PI */
866 case 0x30D5: return (0x30D7); /* PU */
867 case 0x30D8: return (0x30DA); /* PE */
868 case 0x30DB: return (0x30DD); /* PO */
869 default: return (0);
870 }
871 } else /* 0x3099 */ {
872 switch (base) {
873 case 0x3046: return (0x3094); /* VU */
874 case 0x30A6: return (0x30F4); /* VU */
875 case 0x30EF: return (0x30F7); /* VA */
876 case 0x30F0: return (0x30F8); /* VI */
877 case 0x30F1: return (0x30F9); /* VE */
878 case 0x30F2: return (0x30FA); /* VO */
879 default: return (base + 1); /* 41 code points here */
880 }
881 }
882 }
883
884 return (0);
885 }
886