]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_utfconv.c
4629660720852a32834914f3df9315625919de00
[apple/xnu.git] / bsd / vfs / vfs_utfconv.c
1 /*
2 * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 #include <sys/param.h>
23 #include <sys/utfconv.h>
24 #include <sys/errno.h>
25 #include <architecture/byte_order.h>
26
27 /*
28 * UTF-8 (Unicode Transformation Format)
29 *
30 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
31 * character as a sequence of one to four bytes. Only the shortest form
32 * required to represent the significant Unicode bits is legal.
33 *
34 * UTF-8 Multibyte Codes
35 *
36 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
37 * -----------------------------------------------------------------------------
38 * 1 7 0x0000 0x007F 0xxxxxxx
39 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
40 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
41 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
42 * -----------------------------------------------------------------------------
43 */
44
45
/*
 * UTF-8 byte count contributed by a single UTF-16 code unit.
 *
 * A surrogate half (0xD800 - 0xDFFF) is counted as 2 bytes, so a
 * high/low pair adds up to the 4 bytes of its combined encoding.
 * The argument is evaluated more than once; pass an expression
 * without side effects.
 */
#define UNICODE_TO_UTF8_LEN(c)				\
	((c) < 0x0080 ? 1 :				\
	 (c) < 0x0800 ? 2 :				\
	 (((c) & 0xf800) == 0xd800) ? 2 : 3)
48
/* Code point written in place of an embedded NUL character */
#define UCS_ALT_NULL	0x2400

/*
 * Surrogate Pair Constants
 *
 * A pair carries SP_HALF_SHIFT payload bits in each half and
 * represents the code points starting at SP_HALF_BASE.
 */
#define SP_HALF_SHIFT	10
#define SP_HALF_BASE	0x0010000UL
#define SP_HALF_MASK	0x3FFUL

/* High (leading) surrogate range */
#define SP_HIGH_FIRST	0xD800UL
#define SP_HIGH_LAST	0xDBFFUL

/* Low (trailing) surrogate range */
#define SP_LOW_FIRST	0xDC00UL
#define SP_LOW_LAST	0xDFFFUL
60
61
/*
 * Forward declarations for the file-local helpers defined after the
 * encode/decode routines that call them.
 */
static u_int16_t ucs_decompose(u_int16_t ch, u_int16_t *cmb);
static u_int16_t ucs_combine(u_int16_t base, u_int16_t comb);
65
66
/*
 * Number of bytes that follow a UTF-8 lead byte, indexed by the
 * top five bits of that byte (byte >> 3).  An entry of -1 marks a
 * byte that cannot begin a sequence.
 */
char utf_extrabytes[32] = {
	/* 0x00 - 0x7F: 0xxxxxxx */
	 0,  0,  0,  0,  0,  0,  0,  0,
	 0,  0,  0,  0,  0,  0,  0,  0,
	/* 0x80 - 0xBF: 10xxxxxx */
	-1, -1, -1, -1, -1, -1, -1, -1,
	/* 0xC0 - 0xDF: 110xxxxx */
	 1,  1,  1,  1,
	/* 0xE0 - 0xEF: 1110xxxx */
	 2,  2,
	/* 0xF0 - 0xF7: 11110xxx */
	 3,
	/* 0xF8 - 0xFF */
	-1
};
71
72
73 /*
74 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
75 *
76 * NOTES:
77 * If '/' chars are allowed on disk then an alternate
78 * (replacement) char must be provided in altslash.
79 *
80 * input flags:
81 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
82 */
83 size_t
84 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
85 int flags)
86 {
87 u_int16_t ucs_ch;
88 int charcnt;
89 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
90 size_t len;
91
92 charcnt = ucslen / 2;
93 len = 0;
94
95 while (charcnt-- > 0) {
96 ucs_ch = *ucsp++;
97
98 if (swapbytes)
99 ucs_ch = NXSwapShort(ucs_ch);
100 if (ucs_ch == '/')
101 ucs_ch = altslash ? altslash : '_';
102 else if (ucs_ch == '\0')
103 ucs_ch = UCS_ALT_NULL;
104
105 len += UNICODE_TO_UTF8_LEN(ucs_ch);
106 }
107
108 return (len);
109 }
110
111
112 /*
113 * utf8_encodestr - Encodes a Unicode string to UTF-8
114 *
115 * NOTES:
116 * The resulting UTF-8 string is NULL terminated.
117 *
118 * If '/' chars are allowed on disk then an alternate
119 * (replacement) char must be provided in altslash.
120 *
121 * input flags:
122 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
123 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
124 *
125 * result:
126 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
127 * EINVAL: Illegal char found; char was replaced by an '_'.
128 */
129 int
130 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
131 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
132 {
133 u_int8_t * bufstart;
134 u_int8_t * bufend;
135 u_int16_t ucs_ch;
136 u_int16_t extra[2] = {0};
137 int charcnt;
138 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
139 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
140 int decompose = (flags & UTF_DECOMPOSED);
141 int result = 0;
142
143 bufstart = utf8p;
144 bufend = bufstart + buflen;
145 if (nullterm)
146 --bufend;
147 charcnt = ucslen / 2;
148
149 while (charcnt-- > 0) {
150 if (!decompose)
151 ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
152 else if (extra[0]) {
153 ucs_ch = extra[0]; extra[0] = 0;
154 } else if (extra[1]) {
155 ucs_ch = extra[1]; extra[1] = 0;
156 } else {
157 ucs_ch = swapbytes ? NXSwapShort(*ucsp++) : *ucsp++;
158 ucs_ch = ucs_decompose(ucs_ch, &extra[0]);
159 if (extra[0])
160 charcnt++;
161 if (extra[1])
162 charcnt++;
163 }
164
165 /* Slash and NULL are not permitted */
166 if (ucs_ch == '/') {
167 if (altslash)
168 ucs_ch = altslash;
169 else {
170 ucs_ch = '_';
171 result = EINVAL;
172 }
173 } else if (ucs_ch == '\0') {
174 ucs_ch = UCS_ALT_NULL;
175 }
176
177 if (ucs_ch < 0x0080) {
178 if (utf8p >= bufend) {
179 result = ENAMETOOLONG;
180 break;
181 }
182 *utf8p++ = ucs_ch;
183
184 } else if (ucs_ch < 0x800) {
185 if ((utf8p + 1) >= bufend) {
186 result = ENAMETOOLONG;
187 break;
188 }
189 *utf8p++ = 0xc0 | (ucs_ch >> 6);
190 *utf8p++ = 0x80 | (0x3f & ucs_ch);
191
192 } else {
193 /* Combine valid surrogate pairs */
194 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
195 && charcnt > 0) {
196 u_int16_t ch2;
197 u_int32_t pair;
198
199 ch2 = swapbytes ? NXSwapShort(*ucsp) : *ucsp;
200 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
201 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
202 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
203 if ((utf8p + 3) >= bufend) {
204 result = ENAMETOOLONG;
205 break;
206 }
207 --charcnt;
208 ++ucsp;
209 *utf8p++ = 0xf0 | (pair >> 18);
210 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
211 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
212 *utf8p++ = 0x80 | (0x3f & pair);
213 continue;
214 }
215 }
216 if ((utf8p + 2) >= bufend) {
217 result = ENAMETOOLONG;
218 break;
219 }
220 *utf8p++ = 0xe0 | (ucs_ch >> 12);
221 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
222 *utf8p++ = 0x80 | (0x3f & ucs_ch);
223 }
224 }
225
226 *utf8len = utf8p - bufstart;
227 if (nullterm)
228 *utf8p++ = '\0';
229
230 return (result);
231 }
232
233
234 /*
235 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
236 *
237 * NOTES:
238 * The input UTF-8 string does not need to be null terminated
239 * if utf8len is set.
240 *
241 * If '/' chars are allowed on disk then an alternate
242 * (replacement) char must be provided in altslash.
243 *
244 * input flags:
245 * UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime
246 * UTF_DECOMPOSED: Unicode output string must be fully decompsed
247 *
248 * result:
249 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
250 * EINVAL: Illegal UTF-8 sequence found.
251 */
252 int
253 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
254 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
255 {
256 u_int16_t* bufstart;
257 u_int16_t* bufend;
258 u_int16_t ucs_ch;
259 u_int8_t byte;
260 int result = 0;
261 int decompose, precompose, swapbytes;
262
263 decompose = (flags & UTF_DECOMPOSED);
264 precompose = (flags & UTF_PRECOMPOSED);
265 swapbytes = (flags & UTF_REVERSE_ENDIAN);
266
267 bufstart = ucsp;
268 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
269
270 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
271 if (ucsp >= bufend)
272 goto toolong;
273
274 /* check for ascii */
275 if (byte < 0x80) {
276 ucs_ch = byte; /* 1st byte */
277 } else {
278 u_int32_t ch;
279 int extrabytes = utf_extrabytes[byte >> 3];
280
281 if (utf8len < extrabytes)
282 goto invalid;
283 utf8len -= extrabytes;
284
285 switch (extrabytes) {
286 case 1: ch = byte; /* 1st byte */
287 ch <<= 6;
288 ch += *utf8p++; /* 2nd byte */
289 ch -= 0x00003080UL;
290 if (ch < 0x0080)
291 goto invalid;
292 ucs_ch = ch;
293 break;
294
295 case 2: ch = byte; /* 1st byte */
296 ch <<= 6;
297 ch += *utf8p++; /* 2nd byte */
298 ch <<= 6;
299 ch += *utf8p++; /* 3rd byte */
300 ch -= 0x000E2080UL;
301 if (ch < 0x0800)
302 goto invalid;
303 ucs_ch = ch;
304 break;
305
306 case 3: ch = byte; /* 1st byte */
307 ch <<= 6;
308 ch += *utf8p++; /* 2nd byte */
309 ch <<= 6;
310 ch += *utf8p++; /* 3rd byte */
311 ch <<= 6;
312 ch += *utf8p++; /* 4th byte */
313 ch -= 0x03C82080UL + SP_HALF_BASE;
314 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
315 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
316 if (ucsp >= bufend)
317 goto toolong;
318 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
319 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
320 continue;
321
322 default:
323 goto invalid;
324 }
325 if (decompose) {
326 u_int16_t comb_ch[2];
327
328 ucs_ch = ucs_decompose(ucs_ch, &comb_ch[0]);
329
330 if (comb_ch[0]) {
331 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
332 if (ucsp >= bufend)
333 goto toolong;
334 ucs_ch = comb_ch[0];
335 if (comb_ch[1]) {
336 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
337 if (ucsp >= bufend)
338 goto toolong;
339 ucs_ch = comb_ch[1];
340 }
341 }
342 } else if (precompose && (ucsp != bufstart)) {
343 u_int16_t composite, base;
344
345 base = swapbytes ? NXSwapShort(*(ucsp - 1)) : *(ucsp - 1);
346 composite = ucs_combine(base, ucs_ch);
347 if (composite) {
348 --ucsp;
349 ucs_ch = composite;
350 }
351 }
352 if (ucs_ch == UCS_ALT_NULL)
353 ucs_ch = '\0';
354 }
355 if (ucs_ch == altslash)
356 ucs_ch = '/';
357
358 *ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
359 }
360
361 exit:
362 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
363
364 return (result);
365
366 invalid:
367 result = EINVAL;
368 goto exit;
369
370 toolong:
371 result = ENAMETOOLONG;
372 goto exit;
373 }
374
375
/*
 * Lookup tables for Unicode chars 0x00C0 thru 0x01DF
 *
 * Both tables are indexed by (char - 0x00C0).
 *
 * primary_char yields the first decomposed char, or 0x00 when
 * the char has no decomposition here.  The matching entry of
 * combining_char plus 0x0300 is the combining char.  When the
 * primary char is itself an entry with a decomposition (for
 * example 0xDC), it is decomposed once more through the same
 * tables, giving a base char and two combining chars.
 */

static const unsigned char primary_char[8*36] = {
	/* 0x00C0 */ 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x00, 0x43,
	/* 0x00C8 */ 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,
	/* 0x00D0 */ 0x00, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0x00,
	/* 0x00D8 */ 0x00, 0x55, 0x55, 0x55, 0x55, 0x59, 0x00, 0x00,
	/* 0x00E0 */ 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x00, 0x63,
	/* 0x00E8 */ 0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,
	/* 0x00F0 */ 0x00, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0x00,
	/* 0x00F8 */ 0x00, 0x75, 0x75, 0x75, 0x75, 0x79, 0x00, 0x79,
	/* 0x0100 */ 0x41, 0x61, 0x41, 0x61, 0x41, 0x61, 0x43, 0x63,
	/* 0x0108 */ 0x43, 0x63, 0x43, 0x63, 0x43, 0x63, 0x44, 0x64,
	/* 0x0110 */ 0x00, 0x00, 0x45, 0x65, 0x45, 0x65, 0x45, 0x65,
	/* 0x0118 */ 0x45, 0x65, 0x45, 0x65, 0x47, 0x67, 0x47, 0x67,
	/* 0x0120 */ 0x47, 0x67, 0x47, 0x67, 0x48, 0x68, 0x00, 0x00,
	/* 0x0128 */ 0x49, 0x69, 0x49, 0x69, 0x49, 0x69, 0x49, 0x69,
	/* 0x0130 */ 0x49, 0x00, 0x00, 0x00, 0x4A, 0x6A, 0x4B, 0x6B,
	/* 0x0138 */ 0x00, 0x4C, 0x6C, 0x4C, 0x6C, 0x4C, 0x6C, 0x00,
	/* 0x0140 */ 0x00, 0x00, 0x00, 0x4E, 0x6E, 0x4E, 0x6E, 0x4E,
	/* 0x0148 */ 0x6E, 0x00, 0x00, 0x00, 0x4F, 0x6F, 0x4F, 0x6F,
	/* 0x0150 */ 0x4F, 0x6F, 0x00, 0x00, 0x52, 0x72, 0x52, 0x72,
	/* 0x0158 */ 0x52, 0x72, 0x53, 0x73, 0x53, 0x73, 0x53, 0x73,
	/* 0x0160 */ 0x53, 0x73, 0x54, 0x74, 0x54, 0x74, 0x00, 0x00,
	/* 0x0168 */ 0x55, 0x75, 0x55, 0x75, 0x55, 0x75, 0x55, 0x75,
	/* 0x0170 */ 0x55, 0x75, 0x55, 0x75, 0x57, 0x77, 0x59, 0x79,
	/* 0x0178 */ 0x59, 0x5A, 0x7A, 0x5A, 0x7A, 0x5A, 0x7A, 0x00,
	/* 0x0180 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0188 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0190 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0198 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01A0 */ 0x4F, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01A8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55,
	/* 0x01B0 */ 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01B8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01C0 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01C8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x61, 0x49,
	/* 0x01D0 */ 0x69, 0x4F, 0x6F, 0x55, 0x75, 0xDC, 0xFC, 0xDC,
	/* 0x01D8 */ 0xFC, 0xDC, 0xFC, 0xDC, 0xFC, 0x00, 0xC4, 0xE4
};
457
/*
 * Combining char for Unicode chars 0x00C0 thru 0x01DF, indexed by
 * (char - 0x00C0).  Each entry is the offset of the combining char
 * from 0x0300 and is only meaningful where the primary_char entry
 * with the same index is non-zero.
 */
static const unsigned char combining_char[8*36] = {
	/* 0x00C0 */ 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
	/* 0x00C8 */ 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
	/* 0x00D0 */ 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
	/* 0x00D8 */ 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,
	/* 0x00E0 */ 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
	/* 0x00E8 */ 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
	/* 0x00F0 */ 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
	/* 0x00F8 */ 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08,
	/* 0x0100 */ 0x04, 0x04, 0x06, 0x06, 0x28, 0x28, 0x01, 0x01,
	/* 0x0108 */ 0x02, 0x02, 0x07, 0x07, 0x0C, 0x0C, 0x0C, 0x0C,
	/* 0x0110 */ 0x00, 0x00, 0x04, 0x04, 0x06, 0x06, 0x07, 0x07,
	/* 0x0118 */ 0x28, 0x28, 0x0C, 0x0C, 0x02, 0x02, 0x06, 0x06,
	/* 0x0120 */ 0x07, 0x07, 0x27, 0x27, 0x02, 0x02, 0x00, 0x00,
	/* 0x0128 */ 0x03, 0x03, 0x04, 0x04, 0x06, 0x06, 0x28, 0x28,
	/* 0x0130 */ 0x07, 0x00, 0x00, 0x00, 0x02, 0x02, 0x27, 0x27,
	/* 0x0138 */ 0x00, 0x01, 0x01, 0x27, 0x27, 0x0C, 0x0C, 0x00,
	/* 0x0140 */ 0x00, 0x00, 0x00, 0x01, 0x01, 0x27, 0x27, 0x0C,
	/* 0x0148 */ 0x0C, 0x00, 0x00, 0x00, 0x04, 0x04, 0x06, 0x06,
	/* 0x0150 */ 0x0B, 0x0B, 0x00, 0x00, 0x01, 0x01, 0x27, 0x27,
	/* 0x0158 */ 0x0C, 0x0C, 0x01, 0x01, 0x02, 0x02, 0x27, 0x27,
	/* 0x0160 */ 0x0C, 0x0C, 0x27, 0x27, 0x0C, 0x0C, 0x00, 0x00,
	/* 0x0168 */ 0x03, 0x03, 0x04, 0x04, 0x06, 0x06, 0x0A, 0x0A,
	/* 0x0170 */ 0x0B, 0x0B, 0x28, 0x28, 0x02, 0x02, 0x02, 0x02,
	/* 0x0178 */ 0x08, 0x01, 0x01, 0x07, 0x07, 0x0C, 0x0C, 0x00,
	/* 0x0180 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0188 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0190 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x0198 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01A0 */ 0x1B, 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01A8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1B,
	/* 0x01B0 */ 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01B8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01C0 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 0x01C8 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x0C,
	/* 0x01D0 */ 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x04, 0x04, 0x01,
	/* 0x01D8 */ 0x01, 0x0C, 0x0C, 0x00, 0x00, 0x00, 0x04, 0x04
};
531
532
/*
 * Decomposition bitmaps: one bit per code point, 32 code points per
 * word, with the most significant of the low 32 bits standing for
 * the first code point of the word.  A set bit means ucs_decompose
 * has a decomposition for that code point.
 */

/* CYRILLIC codepoints 0x0400 ~ 0x04FF */
static const unsigned long __CyrillicDecompBitmap[] = {
	0x510A0040, 0x00000040, 0x0000510A, 0x00000000,	/* 0x0400 */
	0x00000000, 0x00000000, 0x00000000, 0x00000000,	/* 0x0480 */
};

/* CJK codepoints 0x3000 ~ 0x30FF */
static const unsigned long __CJKDecompBitmap[] = {
	0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C,	/* 0x3000 */
	0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2,	/* 0x3080 */
};

/*
 * Non-zero when the bit for unicodeVal (an offset from the first
 * code point of the table) is set.  The mask is built from 1UL
 * because shifting a signed int into bit 31 is undefined.
 */
#define IS_DECOMPOSABLE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
546
547 /*
548 * ucs_decompose - decompose a composed Unicode char
549 *
550 * Composed Unicode characters are forbidden on
551 * HFS Plus volumes. ucs_decompose will convert a
552 * composed character into its correct decomposed
553 * sequence.
554 *
555 * Currently only Tier-1 and Tier-2 languages
556 * are handled. Other composed characters are
557 * passed unchanged.
558 */
559 static u_int16_t
560 ucs_decompose(register u_int16_t ch, u_int16_t *cmb)
561 {
562 u_int16_t base;
563
564 cmb[0] = 0;
565 cmb[1] = 0;
566
567 if (ch < 0x00C0) {
568 base = ch;
569 } else if (ch <= 0x01DF) {
570
571 base = (u_int16_t) primary_char[ch - 0x00C0];
572
573 if (base == 0)
574 base = ch;
575 else {
576 if ((base < 0x00C0) || (primary_char[base - 0x00C0] == 0))
577 cmb[0] = (u_int16_t)0x0300 + (u_int16_t)combining_char[ch - 0x00C0];
578 else {
579 u_int16_t tch = base;
580
581 base = (u_int16_t)primary_char[tch - 0x00C0];
582 cmb[0] = (u_int16_t)0x0300 + (u_int16_t)combining_char[tch - 0x00C0];
583 cmb[1] = (u_int16_t)0x0300 + (u_int16_t)combining_char[ch - 0x00C0];
584 }
585 }
586 } else if ((ch >= 0x0400) && (ch <= 0x04FF) &&
587 IS_DECOMPOSABLE(__CyrillicDecompBitmap, ch - 0x0400)) {
588
589 /* Handle CYRILLIC LETTERs */
590 switch(ch) {
591 case 0x0401: base = 0x0415; cmb[0] = 0x0308; break; /* */
592 case 0x0403: base = 0x0413; cmb[0] = 0x0301; break; /* */
593 case 0x0407: base = 0x0406; cmb[0] = 0x0308; break; /* */
594 case 0x040C: base = 0x041A; cmb[0] = 0x0301; break; /* */
595 case 0x040E: base = 0x0423; cmb[0] = 0x0306; break; /* */
596 case 0x0419: base = 0x0418; cmb[0] = 0x0306; break; /* */
597 case 0x0439: base = 0x0438; cmb[0] = 0x0306; break; /* */
598 case 0x0451: base = 0x0435; cmb[0] = 0x0308; break; /* */
599 case 0x0453: base = 0x0433; cmb[0] = 0x0301; break; /* */
600 case 0x0457: base = 0x0456; cmb[0] = 0x0308; break; /* */
601 case 0x045C: base = 0x043A; cmb[0] = 0x0301; break; /* */
602 case 0x045E: base = 0x0443; cmb[0] = 0x0306; break; /* */
603
604 default:
605 /* Should not be hit from bit map table */
606 base = ch;
607 }
608 } else if (ch == 0x1E3F) {
609 base = 0x006D; cmb[0] = 0x0301; /* LATIN SMALL LETTER M WITH ACUTE */
610 } else if ((ch > 0x3000) && (ch < 0x3100) &&
611 IS_DECOMPOSABLE(__CJKDecompBitmap, ch - 0x3000)) {
612
613 /* Handle HIRAGANA LETTERs */
614 switch(ch) {
615 case 0x3071: base = 0x306F; cmb[0] = 0x309A; break; /* PA */
616 case 0x3074: base = 0x3072; cmb[0] = 0x309A; break; /* PI */
617 case 0x3077: base = 0x3075; cmb[0] = 0x309A; break; /* PU */
618 case 0x307A: base = 0x3078; cmb[0] = 0x309A; break; /* PE */
619
620 case 0x307D: base = 0x307B; cmb[0] = 0x309A; break; /* PO */
621 case 0x3094: base = 0x3046; cmb[0] = 0x3099; break; /* VU */
622 case 0x30D1: base = 0x30CF; cmb[0] = 0x309A; break; /* PA */
623 case 0x30D4: base = 0x30D2; cmb[0] = 0x309A; break; /* PI */
624
625 case 0x30D7: base = 0x30D5; cmb[0] = 0x309A; break; /* PU */
626 case 0x30DA: base = 0x30D8; cmb[0] = 0x309A; break; /* PE */
627 case 0x30DD: base = 0x30DB; cmb[0] = 0x309A; break; /* PO */
628 case 0x30F4: base = 0x30A6; cmb[0] = 0x3099; break; /* VU */
629
630 case 0x30F7: base = 0x30EF; cmb[0] = 0x3099; break; /* VA */
631 case 0x30F8: base = 0x30F0; cmb[0] = 0x3099; break; /* VI */
632 case 0x30F9: base = 0x30F1; cmb[0] = 0x3099; break; /* VE */
633 case 0x30FA: base = 0x30F2; cmb[0] = 0x3099; break; /* VO */
634
635 default:
636 /* the rest (41 of them) have a simple conversion */
637 base = ch - 1;
638 cmb[0] = 0x3099;
639 }
640 } else if ((ch >= 0xAC00) && (ch < 0xD7A4)) {
641 /* Hangul */
642 ch -= 0xAC00;
643 base = 0x1100 + (ch / (21*28));
644 cmb[0] = 0x1161 + (ch % (21*28)) / 28;
645
646 if (ch % 28)
647 cmb[1] = 0x11A7 + (ch % 28);
648 } else {
649 base = ch;
650 }
651
652 return (base);
653 }
654
655
/*
 * Offset into composite_tbl of the row for each combining char
 * 0x0300 thru 0x032F (index is comb - 0x0300).  A value of -1
 * means there is no row for that combining char.
 */
static const short diacrit_tbl[8*6] = {
	  0,  58, 116, 174, 232,  -1, 290, 348,	/* 300 - 307 */
	406,  -1, 464, 522, 580,  -1,  -1,  -1,	/* 308 - 30F */
	 -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,	/* 310 - 317 */
	 -1,  -1,  -1, 638,  -1,  -1,  -1,  -1,	/* 318 - 31F */
	 -1,  -1,  -1,  -1,  -1,  -1,  -1, 696,	/* 320 - 327 */
	754,  -1,  -1,  -1,  -1,  -1,  -1,  -1	/* 328 - 32F */
};
664
/*
 * Precomposed forms of (ASCII base char, combining char).
 *
 * There is one 58 entry row per combining char; diacrit_tbl gives
 * the offset of each row.  Within a row the index is (base - 'A'),
 * laid out as:
 *
 *	A B C D E F G H I J K L M
 *	N O P Q R S T U V W X Y Z
 *	[ \ ] ^ _ `
 *	a b c d e f g h i j k l m
 *	n o p q r s t u v w x y z
 *
 * An entry of 0 means there is no precomposed form.
 */
static const u_int16_t composite_tbl[58*14] = {
	/* row 0: 0x300 - grave accent */
	0x0C0, 0, 0, 0,0x0C8, 0, 0, 0,0x0CC, 0, 0, 0, 0,
	0,0x0D2, 0, 0, 0, 0, 0,0x0D9, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0,
	0x0E0, 0, 0, 0,0x0E8, 0, 0, 0,0x0EC, 0, 0, 0, 0,
	0,0x0F2, 0, 0, 0, 0, 0,0x0F9, 0, 0, 0, 0, 0,

	/* row 58: 0x301 - acute accent */
	0x0C1, 0,0x106, 0,0x0C9, 0, 0, 0,0x0CD, 0, 0,0x139, 0,
	0x143,0x0D3, 0, 0,0x154,0x15A, 0,0x0DA, 0, 0, 0,0x0DD,0x179,
	0, 0, 0, 0, 0, 0,
	0x0E1, 0,0x107, 0,0x0E9, 0, 0, 0,0x0ED, 0, 0,0x13A,0x1E3F,
	0x144,0x0F3, 0, 0,0x155,0x15B, 0,0x0FA, 0, 0, 0,0x0FD,0x17A,

	/* row 116: 0x302 - circumflex accent */
	0x0C2, 0,0x108, 0,0x0CA, 0,0x11C,0x124,0x0CE,0x134, 0, 0, 0,
	0,0x0D4, 0, 0, 0,0x15C, 0,0x0DB, 0,0x174, 0,0x176, 0,
	0, 0, 0, 0, 0, 0,
	0x0E2, 0,0x109, 0,0x0EA, 0,0x11D,0x125,0x0EE,0x135, 0, 0, 0,
	0,0x0F4, 0, 0, 0,0x15D, 0,0x0FB, 0,0x175, 0,0x177, 0,

	/* row 174: 0x303 - tilde */
	0x0C3, 0, 0, 0, 0, 0, 0, 0,0x128, 0, 0, 0, 0,
	0x0D1,0x0D5, 0, 0, 0, 0, 0,0x168, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0,
	0x0E3, 0, 0, 0, 0, 0, 0, 0,0x129, 0, 0, 0, 0,
	0x0F1,0x0F5, 0, 0, 0, 0, 0,0x169, 0, 0, 0, 0, 0,

	/* row 232: 0x304 - macron */
	0x100, 0, 0, 0,0x112, 0, 0, 0,0x12A, 0, 0, 0, 0,
	0,0x14C, 0, 0, 0, 0, 0,0x16A, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0,
	0x101, 0, 0, 0,0x113, 0, 0, 0,0x12B, 0, 0, 0, 0,
	0,0x14D, 0, 0, 0, 0, 0,0x16B, 0, 0, 0, 0, 0,

	/* row 290: 0x306 - breve */
	0x102, 0, 0, 0,0x114, 0,0x11E, 0,0x12C, 0, 0, 0, 0,
	0,0x14E, 0, 0, 0, 0, 0,0x16C, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0,
	0x103, 0, 0, 0,0x115, 0,0x11F, 0,0x12D, 0, 0, 0, 0,
	0,0x14F, 0, 0, 0, 0, 0,0x16D, 0, 0, 0, 0, 0,

	/* row 348: 0x307 - dot above */
	0, 0,0x10A, 0,0x116, 0,0x120, 0,0x130, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0x17B,
	0, 0, 0, 0, 0, 0,
	0, 0,0x10B, 0,0x117, 0,0x121, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0x17C,

	/* row 406: 0x308 - diaeresis */
	0x0C4, 0, 0, 0,0x0CB, 0, 0, 0,0x0CF, 0, 0, 0, 0,
	0,0x0D6, 0, 0, 0, 0, 0,0x0DC, 0, 0, 0,0x178, 0,
	0, 0, 0, 0, 0, 0,
	0x0E4, 0, 0, 0,0x0EB, 0, 0, 0,0x0EF, 0, 0, 0, 0,
	0,0x0F6, 0, 0, 0, 0, 0,0x0FC, 0, 0, 0,0x0FF, 0,

	/* row 464: 0x30A - ring above */
	0x0C5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0,0x16E, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0,
	0x0E5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0,0x16F, 0, 0, 0, 0, 0,

	/* row 522: 0x30B - double acute accent */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0,0x150, 0, 0, 0, 0, 0,0x170, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0,0x151, 0, 0, 0, 0, 0,0x171, 0, 0, 0, 0, 0,

	/* row 580: 0x30C - caron */
	0x1CD, 0,0x10C,0x10E,0x11A, 0, 0, 0,0x1CF, 0, 0,0x13D, 0,
	0x147,0x1D1, 0, 0,0x158,0x160,0x164,0x1D3, 0, 0, 0, 0,0x17D,
	0, 0, 0, 0, 0, 0,
	0x1CE, 0,0x10D,0x10F,0x11B, 0, 0, 0,0x1D0, 0, 0,0x13E, 0,
	0x148,0x1D2, 0, 0,0x159,0x161,0x165,0x1D4, 0, 0, 0, 0,0x17E,

	/* row 638: 0x31B - horn */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0,0x1A0, 0, 0, 0, 0, 0,0x1AF, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0,0x1A1, 0, 0, 0, 0, 0,0x1B0, 0, 0, 0, 0, 0,

	/* row 696: 0x327 - cedilla */
	0, 0,0x0C7, 0, 0, 0,0x122, 0, 0, 0,0x136,0x13B, 0,
	0x145, 0, 0, 0,0x156,0x15E,0x162, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0,
	0, 0,0x0E7, 0, 0, 0,0x123, 0, 0, 0,0x137,0x13C, 0,
	0x146, 0, 0, 0,0x157,0x15F,0x163, 0, 0, 0, 0, 0, 0,

	/* row 754: 0x328 - ogonek */
	0x104, 0, 0, 0,0x118, 0, 0, 0,0x12E, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0,0x172, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0,
	0x105, 0, 0, 0,0x119, 0, 0, 0,0x12F, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0,0x173, 0, 0, 0, 0, 0,
};
787
788
/*
 * CJK codepoints 0x3000 ~ 0x30FF
 *
 * One bit per code point, 32 code points per word, with the most
 * significant of the low 32 bits standing for the first code point
 * of the word.  ucs_combine only tries to attach 0x3099 or 0x309A
 * to a base char whose bit is set.
 */
static const unsigned long __CJKCombBitmap[] = {
	0x00000000, 0x00000000, 0x02155555, 0x4A812490,	/* 0x3000 */
	0x00000004, 0x02155555, 0x4A812490, 0x0001E004,	/* 0x3080 */
};

/*
 * Non-zero when the bit for unicodeVal (an offset from the first
 * code point of the table) is set.  The mask is built from 1UL
 * because shifting a signed int into bit 31 is undefined.
 */
#define CAN_COMBINE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
796
797
798 /*
799 * ucs_combine - generate a precomposed Unicode char
800 *
801 * Precomposed Unicode characters are required for some volume
802 * formats and network protocols. ucs_combine will combine a
803 * decomposed character sequence into a single precomposed
804 * (composite) character.
805 *
806 * Currently only decomcomposed sequences from Apple's Tier 1
807 * and Tier 2 languages are handled.
808 *
809 * INPUT:
810 * base - base character
811 * comb - combining character
812 * OUTPUT:
813 * result - precomposed char or zero if not combinable
814 */
815 static u_int16_t
816 ucs_combine(u_int16_t base, u_int16_t comb)
817 {
818 /* Get out early if we can */
819 if (comb < 0x0300)
820 return (0);
821
822 /* Try ordinary diacritics (0x300 - 0x32F) */
823 if (comb <= 0x032F) {
824 int index;
825
826 if (base >= 'A' && base <= 'z') {
827 index = diacrit_tbl[comb - 0x0300];
828 if (index < 0 ) return (0);
829
830 return (composite_tbl[index + (base - 'A')]);
831 }
832
833 /* Handle Cyrillic and some 3 char latin sequences */
834 switch (comb) {
835 case 0x0300:
836 switch (base) {
837 case 0x00DC: return (0x01DB);
838 case 0x00FC: return (0x01DC);
839 } break;
840 case 0x0301:
841 switch (base) {
842 case 0x00DC: return (0x01D7);
843 case 0x00FC: return (0x01D8);
844 case 0x0413: return (0x0403);
845 case 0x041A: return (0x040C);
846 case 0x0433: return (0x0453);
847 case 0x043A: return (0x045C);
848 } break;
849 case 0x0304:
850 switch (base) {
851 case 0x00DC: return (0x01D5);
852 case 0x00FC: return (0x01D6);
853 case 0x00C4: return (0x01DE);
854 case 0x00E4: return (0x01DF);
855 } break;
856 case 0x0306:
857 switch (base) {
858 case 0x0418: return (0x0419);
859 case 0x0423: return (0x040E);
860 case 0x0438: return (0x0439);
861 case 0x0443: return (0x045E);
862 } break;
863 case 0x0308:
864 switch (base) {
865 case 0x0406: return (0x0407);
866 case 0x0415: return (0x0401);
867 case 0x0435: return (0x0451);
868 case 0x0456: return (0x0457);
869 } break;
870 case 0x030C:
871 switch (base) {
872 case 0x00DC: return (0x01D9);
873 case 0x00FC: return (0x01DA);
874 } break;
875 }
876 return (0);
877 }
878
879 /* Now try HANGUL */
880 if (comb < 0x1161)
881 return (0);
882
883 /* 2 char Hangul sequences */
884 if ((comb <= 0x1175) && (base >= 0x1100 && base <= 0x1112))
885 return (0xAC00 + ((base - 0x1100)*(21*28)) + ((comb - 0x1161)*28));
886
887 /* 3 char Hangul sequences */
888 if ((comb >= 0x11A8 && comb <= 0x11C2) &&
889 (base >= 0xAC00 && base <= 0xD788)) {
890 if ((base - 0xAC00) % 28)
891 return (0);
892 else
893 return (base + (comb - 0x11A7));
894 }
895
896 /* Now try HIRAGANA and KATAKANA */
897 if ((comb == 0x3099 || comb == 0x309A) &&
898 (base > 0x3000 && base < 0x3100) &&
899 CAN_COMBINE(__CJKCombBitmap, base - 0x3000)) {
900 if (comb == 0x309A) {
901 switch(base) {
902 case 0x306F: return (0x3071); /* PA */
903 case 0x3072: return (0x3074); /* PI */
904 case 0x3075: return (0x3077); /* PU */
905 case 0x3078: return (0x307A); /* PE */
906 case 0x307B: return (0x307D); /* PO */
907 case 0x30CF: return (0x30D1); /* PA */
908 case 0x30D2: return (0x30D4); /* PI */
909 case 0x30D5: return (0x30D7); /* PU */
910 case 0x30D8: return (0x30DA); /* PE */
911 case 0x30DB: return (0x30DD); /* PO */
912 default: return (0);
913 }
914 } else /* 0x3099 */ {
915 switch (base) {
916 case 0x3046: return (0x3094); /* VU */
917 case 0x30A6: return (0x30F4); /* VU */
918 case 0x30EF: return (0x30F7); /* VA */
919 case 0x30F0: return (0x30F8); /* VI */
920 case 0x30F1: return (0x30F9); /* VE */
921 case 0x30F2: return (0x30FA); /* VO */
922 default: return (base + 1); /* 41 code points here */
923 }
924 }
925 }
926
927 return (0);
928 }
929