]>
git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_utfconv.c
b0173cfb469f0e1237d09c9aeddcd27967f1b3e2
2 * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
23 #include <sys/param.h>
24 #include <sys/utfconv.h>
25 #include <sys/errno.h>
26 #include <architecture/byte_order.h>
30 * UTF-8 (UCS Transformation Format)
32 * The following subset of UTF-8 is used to encode UCS-2 filenames. It
33 * requires a maximum of three bytes per UCS-2 character. Only the
34 * shortest encoding required to represent the significant UCS-2 bits
37 * UTF-8 Multibyte Codes
39 * Bytes Bits UCS-2 Min UCS-2 Max UTF-8 Byte Sequence (binary)
40 * -------------------------------------------------------------------
41 * 1 7 0x0000 0x007F 0xxxxxxx
42 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
43 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
44 * -------------------------------------------------------------------
48 #define UCS_TO_UTF8_LEN(c) ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : 3))
50 #define UCS_ALT_NULL 0x2400
53 static u_int16_t ucs_decompose
__P((u_int16_t
, u_int16_t
*));
55 static u_int16_t
ucs_combine(u_int16_t base
, u_int16_t comb
);
59 * utf8_encodelen - Calculates the UTF-8 encoding length for a UCS-2 filename
62 * If '/' chars are allowed on disk then an alternate
63 * (replacement) char must be provided in altslash.
66 * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
69 utf8_encodelen(ucsp
, ucslen
, altslash
, flags
)
70 const u_int16_t
* ucsp
;
77 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
83 while (charcnt
-- > 0) {
87 ucs_ch
= NXSwapShort(ucs_ch
);
89 ucs_ch
= altslash
? altslash
: '_';
90 else if (ucs_ch
== '\0')
91 ucs_ch
= UCS_ALT_NULL
;
93 len
+= UCS_TO_UTF8_LEN(ucs_ch
);
101 * utf8_encodestr - Encodes a UCS-2 (Unicode) string to UTF-8
104 * The resulting UTF-8 string is NULL terminated.
106 * If '/' chars are allowed on disk then an alternate
107 * (replacement) char must be provided in altslash.
110 * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
111 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
114 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
115 * EINVAL: Illegal char found; char was replaced by an '_'.
117 int utf8_encodestr(ucsp
, ucslen
, utf8p
, utf8len
, buflen
, altslash
, flags
)
118 const u_int16_t
* ucsp
;
129 u_int16_t extra
[2] = {0};
131 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
132 int nullterm
= ((flags
& UTF_NO_NULL_TERM
) == 0);
133 int decompose
= (flags
& UTF_DECOMPOSED
);
137 bufend
= bufstart
+ buflen
;
140 charcnt
= ucslen
/ 2;
142 while (charcnt
-- > 0) {
144 ucs_ch
= swapbytes
? NXSwapShort(*ucsp
++) : *ucsp
++;
146 ucs_ch
= extra
[0]; extra
[0] = 0;
147 } else if (extra
[1]) {
148 ucs_ch
= extra
[1]; extra
[1] = 0;
150 ucs_ch
= swapbytes
? NXSwapShort(*ucsp
++) : *ucsp
++;
151 ucs_ch
= ucs_decompose(ucs_ch
, &extra
[0]);
158 /* Slash and NULL are not permitted */
166 } else if (ucs_ch
== '\0') {
167 ucs_ch
= UCS_ALT_NULL
;
170 if (ucs_ch
< 0x0080) {
171 if (utf8p
>= bufend
) {
172 result
= ENAMETOOLONG
;
177 } else if (ucs_ch
< 0x800) {
178 if ((utf8p
+ 1) >= bufend
) {
179 result
= ENAMETOOLONG
;
182 *utf8p
++ = (ucs_ch
>> 6) | 0xc0;
183 *utf8p
++ = (ucs_ch
& 0x3f) | 0x80;
186 if ((utf8p
+ 2) >= bufend
) {
187 result
= ENAMETOOLONG
;
190 *utf8p
++ = (ucs_ch
>> 12) | 0xe0;
191 *utf8p
++ = ((ucs_ch
>> 6) & 0x3f) | 0x80;
192 *utf8p
++ = ((ucs_ch
) & 0x3f) | 0x80;
196 *utf8len
= utf8p
- bufstart
;
205 * utf8_decodestr - Decodes a UTF-8 string back to UCS-2 (Unicode)
208 * The input UTF-8 string does not need to be null terminated
211 * If '/' chars are allowed on disk then an alternate
212 * (replacement) char must be provided in altslash.
215 * UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
216 * UTF_DECOMPOSED: UCS-2 output string must be fully decomposed
219 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
220 * EINVAL: Illegal UTF-8 sequence found.
223 utf8_decodestr(utf8p
, utf8len
, ucsp
, ucslen
, buflen
, altslash
, flags
)
224 const u_int8_t
* utf8p
;
237 int decompose
, precompose
, swapbytes
;
239 decompose
= (flags
& UTF_DECOMPOSED
);
240 precompose
= (flags
& UTF_PRECOMPOSED
);
241 swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
244 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
246 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
247 if (ucsp
>= bufend
) {
248 result
= ENAMETOOLONG
;
252 /* check for ascii */
256 switch (byte
& 0xf0) {
257 /* 2 byte sequence */
260 /* extract bits 6 - 10 from first byte */
261 ucs_ch
= (byte
& 0x1F) << 6;
262 if (ucs_ch
< 0x0080) {
263 result
= EINVAL
; /* seq not minimal */
267 /* 3 byte sequence */
269 /* extract bits 12 - 15 from first byte */
270 ucs_ch
= (byte
& 0x0F) << 6;
272 /* extract bits 6 - 11 from second byte */
273 if (((byte
= *utf8p
++) & 0xc0) != 0x80) {
279 ucs_ch
+= (byte
& 0x3F);
282 if (ucs_ch
< 0x0800) {
283 result
= EINVAL
; /* sequence not minimal */
292 /* extract bits 0 - 5 from final byte */
293 if (((byte
= *utf8p
++) & 0xc0) != 0x80) {
298 ucs_ch
+= (byte
& 0x3F);
301 u_int16_t comb_ch
[2];
303 ucs_ch
= ucs_decompose(ucs_ch
, &comb_ch
[0]);
306 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
307 if (ucsp
>= bufend
) {
308 result
= ENAMETOOLONG
;
313 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
314 if (ucsp
>= bufend
) {
315 result
= ENAMETOOLONG
;
321 } else if (precompose
&& (ucsp
!= bufstart
)) {
322 u_int16_t composite
, base
;
324 base
= swapbytes
? NXSwapShort(*(ucsp
- 1)) : *(ucsp
- 1);
325 composite
= ucs_combine(base
, ucs_ch
);
331 if (ucs_ch
== UCS_ALT_NULL
)
335 if (ucs_ch
== altslash
)
338 ucs_ch
= NXSwapShort(ucs_ch
);
343 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
350 * Lookup tables for Unicode chars 0x00C0 thru 0x01DF
351 * primary_char yields first decomposed char. If this
352 * char is an alpha char then get the combining char
353 * from the combining_char table and add 0x0300 to it.
356 static unsigned char primary_char
[8*36] = {
357 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x00, 0x43,
359 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49, /* CF */
361 0x00, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0x00,
363 0x00, 0x55, 0x55, 0x55, 0x55, 0x59, 0x00, 0x00, /* DF */
365 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x00, 0x63,
367 0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69, /* EF */
369 0x00, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0x00,
371 0x00, 0x75, 0x75, 0x75, 0x75, 0x79, 0x00, 0x79, /* FF */
373 0x41, 0x61, 0x41, 0x61, 0x41, 0x61, 0x43, 0x63,
375 0x43, 0x63, 0x43, 0x63, 0x43, 0x63, 0x44, 0x64, /* 10F */
377 0x00, 0x00, 0x45, 0x65, 0x45, 0x65, 0x45, 0x65,
379 0x45, 0x65, 0x45, 0x65, 0x47, 0x67, 0x47, 0x67, /* 11F */
381 0x47, 0x67, 0x47, 0x67, 0x48, 0x68, 0x00, 0x00,
383 0x49, 0x69, 0x49, 0x69, 0x49, 0x69, 0x49, 0x69,
385 0x49, 0x00, 0x00, 0x00, 0x4A, 0x6A, 0x4B, 0x6B,
387 0x00, 0x4C, 0x6C, 0x4C, 0x6C, 0x4C, 0x6C, 0x00, /* 13F */
389 0x00, 0x00, 0x00, 0x4E, 0x6E, 0x4E, 0x6E, 0x4E,
391 0x6E, 0x00, 0x00, 0x00, 0x4F, 0x6F, 0x4F, 0x6F,
393 0x4F, 0x6F, 0x00, 0x00, 0x52, 0x72, 0x52, 0x72,
395 0x52, 0x72, 0x53, 0x73, 0x53, 0x73, 0x53, 0x73, /* 15F */
397 0x53, 0x73, 0x54, 0x74, 0x54, 0x74, 0x00, 0x00,
399 0x55, 0x75, 0x55, 0x75, 0x55, 0x75, 0x55, 0x75,
401 0x55, 0x75, 0x55, 0x75, 0x57, 0x77, 0x59, 0x79,
403 0x59, 0x5A, 0x7A, 0x5A, 0x7A, 0x5A, 0x7A, 0x00, /* 17F */
405 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
407 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
409 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
411 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 19F */
413 0x4F, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
415 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55,
417 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
419 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1BF */
421 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
423 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x61, 0x49,
425 0x69, 0x4F, 0x6F, 0x55, 0x75, 0xDC, 0xFC, 0xDC,
427 0xFC, 0xDC, 0xFC, 0xDC, 0xFC, 0x00, 0xC4, 0xE4 /* 1DF */
431 static unsigned char combining_char
[8*36] = {
432 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
434 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08, /* CF */
436 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
438 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF, /* DF */
440 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
442 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08, /* EF */
444 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
446 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08, /* FF */
448 0x04, 0x04, 0x06, 0x06, 0x28, 0x28, 0x01, 0x01,
450 0x02, 0x02, 0x07, 0x07, 0x0C, 0x0C, 0x0C, 0x0C,
452 0x00, 0x00, 0x04, 0x04, 0x06, 0x06, 0x07, 0x07,
454 0x28, 0x28, 0x0C, 0x0C, 0x02, 0x02, 0x06, 0x06,
456 0x07, 0x07, 0x27, 0x27, 0x02, 0x02, 0x00, 0x00,
458 0x03, 0x03, 0x04, 0x04, 0x06, 0x06, 0x28, 0x28,
460 0x07, 0x00, 0x00, 0x00, 0x02, 0x02, 0x27, 0x27,
462 0x00, 0x01, 0x01, 0x27, 0x27, 0x0C, 0x0C, 0x00, /* 13F */
464 0x00, 0x00, 0x00, 0x01, 0x01, 0x27, 0x27, 0x0C,
466 0x0C, 0x00, 0x00, 0x00, 0x04, 0x04, 0x06, 0x06,
468 0x0B, 0x0B, 0x00, 0x00, 0x01, 0x01, 0x27, 0x27,
470 0x0C, 0x0C, 0x01, 0x01, 0x02, 0x02, 0x27, 0x27,
472 0x0C, 0x0C, 0x27, 0x27, 0x0C, 0x0C, 0x00, 0x00,
474 0x03, 0x03, 0x04, 0x04, 0x06, 0x06, 0x0A, 0x0A, /* 16F */
476 0x0B, 0x0B, 0x28, 0x28, 0x02, 0x02, 0x02, 0x02,
478 0x08, 0x01, 0x01, 0x07, 0x07, 0x0C, 0x0C, 0x00,
480 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 17F */
482 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
484 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
486 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 19F */
488 0x1B, 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
490 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1B,
492 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
494 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
496 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
498 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x0C, /* 1CF */
500 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x04, 0x04, 0x01,
502 0x01, 0x0C, 0x0C, 0x00, 0x00, 0x00, 0x04, 0x04 /* 1DF */
506 /* CYRILLIC codepoints 0x0400 ~ 0x04FF */
507 static const unsigned long __CyrillicDecompBitmap
[] = {
508 0x40000040, 0x00000040, 0x00004000, 0x00000000, /* 0x0400 */
509 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 0x0480 */
512 /* CJK codepoints 0x3000 ~ 0x30FF */
513 static const unsigned long __CJKDecompBitmap
[] = {
514 0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C, /* 0x3000 */
515 0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2, /* 0x3080 */
517 #define IS_DECOMPOSABLE(table,unicodeVal) \
518 (table[(unicodeVal) / 32] & (1 << (31 - ((unicodeVal) % 32))))
521 * ucs_decompose - decompose a composed UCS-2 char
523 * Composed Unicode characters are forbidden on
524 * HFS Plus volumes. ucs_decompose will convert a
525 * composed character into its correct decomposed
528 * Currently only Tier-1 and Tier-2 languages
529 * are handled. Other composed characters are
533 ucs_decompose(register u_int16_t ch
, u_int16_t
*cmb
)
542 } else if (ch
<= 0x01DF) {
544 base
= (u_int16_t
) primary_char
[ch
- 0x00C0];
549 if ((base
< 0x00C0) || (primary_char
[base
- 0x00C0] == 0))
550 cmb
[0] = (u_int16_t
)0x0300 + (u_int16_t
)combining_char
[ch
- 0x00C0];
552 u_int16_t tch
= base
;
554 base
= (u_int16_t
)primary_char
[tch
- 0x00C0];
555 cmb
[0] = (u_int16_t
)0x0300 + (u_int16_t
)combining_char
[tch
- 0x00C0];
556 cmb
[1] = (u_int16_t
)0x0300 + (u_int16_t
)combining_char
[ch
- 0x00C0];
559 } else if ((ch
>= 0x0400) && (ch
<= 0x04FF) &&
560 IS_DECOMPOSABLE(__CyrillicDecompBitmap
, ch
- 0x0400)) {
562 /* Handle CYRILLIC LETTERs */
564 case 0x0401: base
= 0x0415; cmb
[0] = 0x0308; break; /* */
565 case 0x0419: base
= 0x0418; cmb
[0] = 0x0306; break; /* */
566 case 0x0439: base
= 0x0438; cmb
[0] = 0x0306; break; /* */
567 case 0x0451: base
= 0x0435; cmb
[0] = 0x0308; break; /* */
570 /* Should not be hit from bit map table */
573 } else if (ch
== 0x1E3F) {
574 base
= 0x006D; cmb
[0] = 0x0301; /* LATIN SMALL LETTER M WITH ACUTE */
575 } else if ((ch
> 0x3000) && (ch
< 0x3100) &&
576 IS_DECOMPOSABLE(__CJKDecompBitmap
, ch
- 0x3000)) {
578 /* Handle HIRAGANA LETTERs */
580 case 0x3071: base
= 0x306F; cmb
[0] = 0x309A; break; /* PA */
581 case 0x3074: base
= 0x3072; cmb
[0] = 0x309A; break; /* PI */
582 case 0x3077: base
= 0x3075; cmb
[0] = 0x309A; break; /* PU */
583 case 0x307A: base
= 0x3078; cmb
[0] = 0x309A; break; /* PE */
585 case 0x307D: base
= 0x307B; cmb
[0] = 0x309A; break; /* PO */
586 case 0x3094: base
= 0x3046; cmb
[0] = 0x3099; break; /* VU */
587 case 0x30D1: base
= 0x30CF; cmb
[0] = 0x309A; break; /* PA */
588 case 0x30D4: base
= 0x30D2; cmb
[0] = 0x309A; break; /* PI */
590 case 0x30D7: base
= 0x30D5; cmb
[0] = 0x309A; break; /* PU */
591 case 0x30DA: base
= 0x30D8; cmb
[0] = 0x309A; break; /* PE */
592 case 0x30DD: base
= 0x30DB; cmb
[0] = 0x309A; break; /* PO */
593 case 0x30F4: base
= 0x30A6; cmb
[0] = 0x3099; break; /* VU */
595 case 0x30F7: base
= 0x30EF; cmb
[0] = 0x3099; break; /* VA */
596 case 0x30F8: base
= 0x30F0; cmb
[0] = 0x3099; break; /* VI */
597 case 0x30F9: base
= 0x30F1; cmb
[0] = 0x3099; break; /* VE */
598 case 0x30FA: base
= 0x30F2; cmb
[0] = 0x3099; break; /* VO */
601 /* the rest (41 of them) have a simple conversion */
605 } else if ((ch
>= 0xAC00) && (ch
< 0xD7A4)) {
608 base
= 0x1100 + (ch
/ (21*28));
609 cmb
[0] = 0x1161 + (ch
% (21*28)) / 28;
612 cmb
[1] = 0x11A7 + (ch
% 28);
621 static const short diacrit_tbl
[8*6] = {
622 /* 300 - 307 */ 0, 58, 116, 174, 232, -1, 290, 348,
623 /* 308 - 30F */ 406, -1, 464, 522, 580, -1, -1, -1,
624 /* 310 - 317 */ -1, -1, -1, -1, -1, -1, -1, -1,
625 /* 318 - 31F */ -1, -1, -1, 638, -1, -1, -1, -1,
626 /* 320 - 327 */ -1, -1, -1, -1, -1, -1, -1, 696,
627 /* 328 - 32F */ 754, -1, -1, -1, -1, -1, -1, -1
630 static const u_int16_t composite_tbl
[58*14] = {
632 * A B C D E F G H I J K L M
633 * N O P Q R S T U V W X Y Z
635 * a b c d e f g h i j k l m
636 * n o p q r s t u v w x y z
640 * 0x300 - grave accent
642 0x0C0, 0, 0, 0,0x0C8, 0, 0, 0,0x0CC, 0, 0, 0, 0,
643 0,0x0D2, 0, 0, 0, 0, 0,0x0D9, 0, 0, 0, 0, 0,
645 0x0E0, 0, 0, 0,0x0E8, 0, 0, 0,0x0EC, 0, 0, 0, 0,
646 0,0x0F2, 0, 0, 0, 0, 0,0x0F9, 0, 0, 0, 0, 0,
648 * 0x301 - acute accent
650 0x0C1, 0,0x106, 0,0x0C9, 0, 0, 0,0x0CD, 0, 0,0x139, 0,
651 0x143,0x0D3, 0, 0,0x154,0x15A, 0,0x0DA, 0, 0, 0,0x0DD,0x179,
653 0x0E1, 0,0x107, 0,0x0E9, 0, 0, 0,0x0ED, 0, 0,0x13A,0x1E3F,
654 0x144,0x0F3, 0, 0,0x155,0x15B, 0,0x0FA, 0, 0, 0,0x0FD,0x17A,
656 * 0x302 - circumflex accent
658 0x0C2, 0,0x108, 0,0x0CA, 0,0x11C,0x124,0x0CE,0x134, 0, 0, 0,
659 0,0x0D4, 0, 0, 0,0x15C, 0,0x0DB, 0,0x174, 0,0x176, 0,
661 0x0E2, 0,0x109, 0,0x0EA, 0,0x11D,0x125,0x0EE,0x135, 0, 0, 0,
662 0,0x0F4, 0, 0, 0,0x15D, 0,0x0FB, 0,0x175, 0,0x177, 0,
666 0x0C3, 0, 0, 0, 0, 0, 0, 0,0x128, 0, 0, 0, 0,
667 0x0D1,0x0D5, 0, 0, 0, 0, 0,0x168, 0, 0, 0, 0, 0,
669 0x0E3, 0, 0, 0, 0, 0, 0, 0,0x129, 0, 0, 0, 0,
670 0x0F1,0x0F5, 0, 0, 0, 0, 0,0x169, 0, 0, 0, 0, 0,
674 0x100, 0, 0, 0,0x112, 0, 0, 0,0x12A, 0, 0, 0, 0,
675 0,0x14C, 0, 0, 0, 0, 0,0x16A, 0, 0, 0, 0, 0,
677 0x101, 0, 0, 0,0x113, 0, 0, 0,0x12B, 0, 0, 0, 0,
678 0,0x14D, 0, 0, 0, 0, 0,0x16B, 0, 0, 0, 0, 0,
682 0x102, 0, 0, 0,0x114, 0,0x11E, 0,0x12C, 0, 0, 0, 0,
683 0,0x14E, 0, 0, 0, 0, 0,0x16C, 0, 0, 0, 0, 0,
685 0x103, 0, 0, 0,0x115, 0,0x11F, 0,0x12D, 0, 0, 0, 0,
686 0,0x14F, 0, 0, 0, 0, 0,0x16D, 0, 0, 0, 0, 0,
690 0, 0,0x10A, 0,0x116, 0,0x120, 0,0x130, 0, 0, 0, 0,
691 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0x17B,
693 0, 0,0x10B, 0,0x117, 0,0x121, 0, 0, 0, 0, 0, 0,
694 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0x17C,
698 0x0C4, 0, 0, 0,0x0CB, 0, 0, 0,0x0CF, 0, 0, 0, 0,
699 0,0x0D6, 0, 0, 0, 0, 0,0x0DC, 0, 0, 0,0x178, 0,
701 0x0E4, 0, 0, 0,0x0EB, 0, 0, 0,0x0EF, 0, 0, 0, 0,
702 0,0x0F6, 0, 0, 0, 0, 0,0x0FC, 0, 0, 0,0x0FF, 0,
706 0x0C5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
707 0, 0, 0, 0, 0, 0, 0,0x16E, 0, 0, 0, 0, 0,
709 0x0E5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
710 0, 0, 0, 0, 0, 0, 0,0x16F, 0, 0, 0, 0, 0,
712 * 0x30B - double acute accent
714 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
715 0,0x150, 0, 0, 0, 0, 0,0x170, 0, 0, 0, 0, 0,
717 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
718 0,0x151, 0, 0, 0, 0, 0,0x171, 0, 0, 0, 0, 0,
722 0x1CD, 0,0x10C,0x10E,0x11A, 0, 0, 0,0x1CF, 0, 0,0x13D, 0,
723 0x147,0x1D1, 0, 0,0x158,0x160,0x164,0x1D3, 0, 0, 0, 0,0x17D,
725 0x1CE, 0,0x10D,0x10F,0x11B, 0, 0, 0,0x1D0, 0, 0,0x13E, 0,
726 0x148,0x1D2, 0, 0,0x159,0x161,0x165,0x1D4, 0, 0, 0, 0,0x17E,
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
731 0,0x1A0, 0, 0, 0, 0, 0,0x1AF, 0, 0, 0, 0, 0,
733 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
734 0,0x1A1, 0, 0, 0, 0, 0,0x1B0, 0, 0, 0, 0, 0,
738 0, 0,0x0C7, 0, 0, 0,0x122, 0, 0, 0,0x136,0x13B, 0,
739 0x145, 0, 0, 0,0x156,0x15E,0x162, 0, 0, 0, 0, 0, 0,
741 0, 0,0x0E7, 0, 0, 0,0x123, 0, 0, 0,0x137,0x13C, 0,
742 0x146, 0, 0, 0,0x157,0x15F,0x163, 0, 0, 0, 0, 0, 0,
746 0x104, 0, 0, 0,0x118, 0, 0, 0,0x12E, 0, 0, 0, 0,
747 0, 0, 0, 0, 0, 0, 0,0x172, 0, 0, 0, 0, 0,
749 0x105, 0, 0, 0,0x119, 0, 0, 0,0x12F, 0, 0, 0, 0,
750 0, 0, 0, 0, 0, 0, 0,0x173, 0, 0, 0, 0, 0,
754 /* CJK codepoints 0x3000 ~ 0x30FF */
755 static const unsigned long __CJKCombBitmap
[] = {
756 0x00000000, 0x00000000, 0x02155555, 0x4A812490, /* 0x3000 */
757 0x00000004, 0x02155555, 0x4A812490, 0x0001E004, /* 0x3080 */
759 #define CAN_COMBINE(table,unicodeVal) \
760 (table[(unicodeVal) / 32] & (1 << (31 - ((unicodeVal) % 32))))
764 * ucs_combine - generate a precomposed UCS-2 char
766 * Precomposed Unicode characters are required for some volume
767 * formats and network protocols. ucs_combine will combine a
768 * decomposed character sequence into a single precomposed
769 * (composite) character.
771 * Currently only decomposed sequences from Apple's Tier 1
772 * and Tier 2 languages are handled.
775 * base - base character
776 * comb - combining character
778 * result - precomposed char or zero if not combinable
781 ucs_combine(u_int16_t base
, u_int16_t comb
)
783 /* Get out early if we can */
787 /* Try ordinary diacritics (0x300 - 0x32F) */
788 if (comb
<= 0x032F) {
791 if (base
>= 'A' && base
<= 'z') {
792 index
= diacrit_tbl
[comb
- 0x0300];
793 if (index
< 0 ) return (0);
795 return (composite_tbl
[index
+ (base
- 'A')]);
798 /* Handle Cyrillic and some 3 char latin sequences */
802 case 0x00DC: return (0x01DB);
803 case 0x00FC: return (0x01DC);
807 case 0x00DC: return (0x01D7);
808 case 0x00FC: return (0x01D8);
812 case 0x00DC: return (0x01D5);
813 case 0x00FC: return (0x01D6);
814 case 0x00C4: return (0x01DE);
815 case 0x00E4: return (0x01DF);
819 case 0x0418: return (0x0419);
820 case 0x0438: return (0x0439);
824 case 0x0415: return (0x0401);
825 case 0x0435: return (0x0451);
829 case 0x00DC: return (0x01D9);
830 case 0x00FC: return (0x01DA);
840 /* 2 char Hangul sequences */
841 if ((comb
<= 0x1175) && (base
>= 0x1100 && base
<= 0x1112))
842 return (0xAC00 + ((base
- 0x1100)*(21*28)) + ((comb
- 0x1161)*28));
844 /* 3 char Hangul sequences */
845 if ((comb
>= 0x11A8 && comb
<= 0x11C2) &&
846 (base
>= 0xAC00 && base
<= 0xD788)) {
847 if ((base
- 0xAC00) % 28)
850 return (base
+ (comb
- 0x11A7));
853 /* Now try HIRAGANA and KATAKANA */
854 if ((comb
== 0x3099 || comb
== 0x309A) &&
855 (base
> 0x3000 && base
< 0x3100) &&
856 CAN_COMBINE(__CJKCombBitmap
, base
- 0x3000)) {
857 if (comb
== 0x309A) {
859 case 0x306F: return (0x3071); /* PA */
860 case 0x3072: return (0x3074); /* PI */
861 case 0x3075: return (0x3077); /* PU */
862 case 0x3078: return (0x307A); /* PE */
863 case 0x307B: return (0x307D); /* PO */
864 case 0x30CF: return (0x30D1); /* PA */
865 case 0x30D2: return (0x30D4); /* PI */
866 case 0x30D5: return (0x30D7); /* PU */
867 case 0x30D8: return (0x30DA); /* PE */
868 case 0x30DB: return (0x30DD); /* PO */
871 } else /* 0x3099 */ {
873 case 0x3046: return (0x3094); /* VU */
874 case 0x30A6: return (0x30F4); /* VU */
875 case 0x30EF: return (0x30F7); /* VA */
876 case 0x30F0: return (0x30F8); /* VI */
877 case 0x30F1: return (0x30F9); /* VE */
878 case 0x30F2: return (0x30FA); /* VO */
879 default: return (base
+ 1); /* 41 code points here */