]>
git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_utfconv.c
4629660720852a32834914f3df9315625919de00
2 * Copyright (c) 2000-2001 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
22 #include <sys/param.h>
23 #include <sys/utfconv.h>
24 #include <sys/errno.h>
25 #include <architecture/byte_order.h>
28 * UTF-8 (Unicode Transformation Format)
30 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
31 * character as a sequence of one to four bytes. Only the shortest form
32 * required to represent the significant Unicode bits is legal.
34 * UTF-8 Multibyte Codes
36 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
37 * -----------------------------------------------------------------------------
38 * 1 7 0x0000 0x007F 0xxxxxxx
39 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
40 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
41 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
42 * -----------------------------------------------------------------------------
/*
 * UNICODE_TO_UTF8_LEN - number of UTF-8 bytes charged to one 16-bit
 * Unicode unit: 1 below 0x0080, 2 below 0x0800, otherwise 3.  The one
 * exception is a unit in the surrogate range (0xD800-0xDFFF), which is
 * charged 2 bytes, so the two halves of a surrogate pair add up to the
 * 4 bytes of the combined sequence.
 *
 * The argument is evaluated more than once; pass a plain variable.
 */
46 #define UNICODE_TO_UTF8_LEN(c) \
47 ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
/*
 * Unicode value substituted for an embedded '\0' when encoding to UTF-8;
 * the decoder also tests for this value.
 */
49 #define UCS_ALT_NULL 0x2400
51 /* Surrogate Pair Constants */
/* Number of payload bits carried by each half of a surrogate pair. */
52 #define SP_HALF_SHIFT 10
/* Offset added back when a pair is combined into a single code point. */
53 #define SP_HALF_BASE 0x0010000UL
/* Mask selecting the low SP_HALF_SHIFT payload bits. */
54 #define SP_HALF_MASK 0x3FFUL
/* Inclusive range of the first (high) half of a surrogate pair. */
56 #define SP_HIGH_FIRST 0xD800UL
57 #define SP_HIGH_LAST 0xDBFFUL
/* Inclusive range of the second (low) half of a surrogate pair. */
58 #define SP_LOW_FIRST 0xDC00UL
59 #define SP_LOW_LAST 0xDFFFUL
62 static u_int16_t
ucs_decompose(u_int16_t
, u_int16_t
*);
64 static u_int16_t
ucs_combine(u_int16_t base
, u_int16_t comb
);
67 char utf_extrabytes
[32] = {
68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69 -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1
74 * utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
77 * If '/' chars are allowed on disk then an alternate
78 * (replacement) char must be provided in altslash.
81 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
84 utf8_encodelen(const u_int16_t
* ucsp
, size_t ucslen
, u_int16_t altslash
,
89 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
95 while (charcnt
-- > 0) {
99 ucs_ch
= NXSwapShort(ucs_ch
);
101 ucs_ch
= altslash
? altslash
: '_';
102 else if (ucs_ch
== '\0')
103 ucs_ch
= UCS_ALT_NULL
;
105 len
+= UNICODE_TO_UTF8_LEN(ucs_ch
);
113 * utf8_encodestr - Encodes a Unicode string to UTF-8
116 * The resulting UTF-8 string is NULL terminated.
118 * If '/' chars are allowed on disk then an alternate
119 * (replacement) char must be provided in altslash.
122 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
123 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
126 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
127 * EINVAL: Illegal char found; char was replaced by an '_'.
130 utf8_encodestr(const u_int16_t
* ucsp
, size_t ucslen
, u_int8_t
* utf8p
,
131 size_t * utf8len
, size_t buflen
, u_int16_t altslash
, int flags
)
136 u_int16_t extra
[2] = {0};
138 int swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
139 int nullterm
= ((flags
& UTF_NO_NULL_TERM
) == 0);
140 int decompose
= (flags
& UTF_DECOMPOSED
);
144 bufend
= bufstart
+ buflen
;
147 charcnt
= ucslen
/ 2;
149 while (charcnt
-- > 0) {
151 ucs_ch
= swapbytes
? NXSwapShort(*ucsp
++) : *ucsp
++;
153 ucs_ch
= extra
[0]; extra
[0] = 0;
154 } else if (extra
[1]) {
155 ucs_ch
= extra
[1]; extra
[1] = 0;
157 ucs_ch
= swapbytes
? NXSwapShort(*ucsp
++) : *ucsp
++;
158 ucs_ch
= ucs_decompose(ucs_ch
, &extra
[0]);
165 /* Slash and NULL are not permitted */
173 } else if (ucs_ch
== '\0') {
174 ucs_ch
= UCS_ALT_NULL
;
177 if (ucs_ch
< 0x0080) {
178 if (utf8p
>= bufend
) {
179 result
= ENAMETOOLONG
;
184 } else if (ucs_ch
< 0x800) {
185 if ((utf8p
+ 1) >= bufend
) {
186 result
= ENAMETOOLONG
;
189 *utf8p
++ = 0xc0 | (ucs_ch
>> 6);
190 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
193 /* Combine valid surrogate pairs */
194 if (ucs_ch
>= SP_HIGH_FIRST
&& ucs_ch
<= SP_HIGH_LAST
199 ch2
= swapbytes
? NXSwapShort(*ucsp
) : *ucsp
;
200 if (ch2
>= SP_LOW_FIRST
&& ch2
<= SP_LOW_LAST
) {
201 pair
= ((ucs_ch
- SP_HIGH_FIRST
) << SP_HALF_SHIFT
)
202 + (ch2
- SP_LOW_FIRST
) + SP_HALF_BASE
;
203 if ((utf8p
+ 3) >= bufend
) {
204 result
= ENAMETOOLONG
;
209 *utf8p
++ = 0xf0 | (pair
>> 18);
210 *utf8p
++ = 0x80 | (0x3f & (pair
>> 12));
211 *utf8p
++ = 0x80 | (0x3f & (pair
>> 6));
212 *utf8p
++ = 0x80 | (0x3f & pair
);
216 if ((utf8p
+ 2) >= bufend
) {
217 result
= ENAMETOOLONG
;
220 *utf8p
++ = 0xe0 | (ucs_ch
>> 12);
221 *utf8p
++ = 0x80 | (0x3f & (ucs_ch
>> 6));
222 *utf8p
++ = 0x80 | (0x3f & ucs_ch
);
226 *utf8len
= utf8p
- bufstart
;
235 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
238 * The input UTF-8 string does not need to be null terminated
241 * If '/' chars are allowed on disk then an alternate
242 * (replacement) char must be provided in altslash.
245 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
246 * UTF_DECOMPOSED: Unicode output string must be fully decomposed
249 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
250 * EINVAL: Illegal UTF-8 sequence found.
253 utf8_decodestr(const u_int8_t
* utf8p
, size_t utf8len
, u_int16_t
* ucsp
,
254 size_t *ucslen
, size_t buflen
, u_int16_t altslash
, int flags
)
261 int decompose
, precompose
, swapbytes
;
263 decompose
= (flags
& UTF_DECOMPOSED
);
264 precompose
= (flags
& UTF_PRECOMPOSED
);
265 swapbytes
= (flags
& UTF_REVERSE_ENDIAN
);
268 bufend
= (u_int16_t
*)((u_int8_t
*)ucsp
+ buflen
);
270 while (utf8len
-- > 0 && (byte
= *utf8p
++) != '\0') {
274 /* check for ascii */
276 ucs_ch
= byte
; /* 1st byte */
279 int extrabytes
= utf_extrabytes
[byte
>> 3];
281 if (utf8len
< extrabytes
)
283 utf8len
-= extrabytes
;
285 switch (extrabytes
) {
286 case 1: ch
= byte
; /* 1st byte */
288 ch
+= *utf8p
++; /* 2nd byte */
295 case 2: ch
= byte
; /* 1st byte */
297 ch
+= *utf8p
++; /* 2nd byte */
299 ch
+= *utf8p
++; /* 3rd byte */
306 case 3: ch
= byte
; /* 1st byte */
308 ch
+= *utf8p
++; /* 2nd byte */
310 ch
+= *utf8p
++; /* 3rd byte */
312 ch
+= *utf8p
++; /* 4th byte */
313 ch
-= 0x03C82080UL
+ SP_HALF_BASE
;
314 ucs_ch
= (ch
>> SP_HALF_SHIFT
) + SP_HIGH_FIRST
;
315 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
318 ucs_ch
= (ch
& SP_HALF_MASK
) + SP_LOW_FIRST
;
319 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
326 u_int16_t comb_ch
[2];
328 ucs_ch
= ucs_decompose(ucs_ch
, &comb_ch
[0]);
331 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
336 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
342 } else if (precompose
&& (ucsp
!= bufstart
)) {
343 u_int16_t composite
, base
;
345 base
= swapbytes
? NXSwapShort(*(ucsp
- 1)) : *(ucsp
- 1);
346 composite
= ucs_combine(base
, ucs_ch
);
352 if (ucs_ch
== UCS_ALT_NULL
)
355 if (ucs_ch
== altslash
)
358 *ucsp
++ = swapbytes
? NXSwapShort(ucs_ch
) : ucs_ch
;
362 *ucslen
= (u_int8_t
*)ucsp
- (u_int8_t
*)bufstart
;
371 result
= ENAMETOOLONG
;
377 * Lookup tables for Unicode chars 0x00C0 thru 0x00FF
378 * primary_char yields first decomposed char. If this
379 * char is an alpha char then get the combining char
380 * from the combining_char table and add 0x0300 to it.
383 static unsigned char primary_char
[8*36] = {
384 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x00, 0x43,
386 0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49, /* CF */
388 0x00, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0x00,
390 0x00, 0x55, 0x55, 0x55, 0x55, 0x59, 0x00, 0x00, /* DF */
392 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x00, 0x63,
394 0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69, /* EF */
396 0x00, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0x00,
398 0x00, 0x75, 0x75, 0x75, 0x75, 0x79, 0x00, 0x79, /* FF */
400 0x41, 0x61, 0x41, 0x61, 0x41, 0x61, 0x43, 0x63,
402 0x43, 0x63, 0x43, 0x63, 0x43, 0x63, 0x44, 0x64, /* 10F */
404 0x00, 0x00, 0x45, 0x65, 0x45, 0x65, 0x45, 0x65,
406 0x45, 0x65, 0x45, 0x65, 0x47, 0x67, 0x47, 0x67, /* 11F */
408 0x47, 0x67, 0x47, 0x67, 0x48, 0x68, 0x00, 0x00,
410 0x49, 0x69, 0x49, 0x69, 0x49, 0x69, 0x49, 0x69,
412 0x49, 0x00, 0x00, 0x00, 0x4A, 0x6A, 0x4B, 0x6B,
414 0x00, 0x4C, 0x6C, 0x4C, 0x6C, 0x4C, 0x6C, 0x00, /* 13F */
416 0x00, 0x00, 0x00, 0x4E, 0x6E, 0x4E, 0x6E, 0x4E,
418 0x6E, 0x00, 0x00, 0x00, 0x4F, 0x6F, 0x4F, 0x6F,
420 0x4F, 0x6F, 0x00, 0x00, 0x52, 0x72, 0x52, 0x72,
422 0x52, 0x72, 0x53, 0x73, 0x53, 0x73, 0x53, 0x73, /* 15F */
424 0x53, 0x73, 0x54, 0x74, 0x54, 0x74, 0x00, 0x00,
426 0x55, 0x75, 0x55, 0x75, 0x55, 0x75, 0x55, 0x75,
428 0x55, 0x75, 0x55, 0x75, 0x57, 0x77, 0x59, 0x79,
430 0x59, 0x5A, 0x7A, 0x5A, 0x7A, 0x5A, 0x7A, 0x00, /* 17F */
432 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
434 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
436 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
438 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 19F */
440 0x4F, 0x6F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
442 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55,
444 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
446 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1BF */
448 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
450 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x61, 0x49,
452 0x69, 0x4F, 0x6F, 0x55, 0x75, 0xDC, 0xFC, 0xDC,
454 0xFC, 0xDC, 0xFC, 0xDC, 0xFC, 0x00, 0xC4, 0xE4 /* 1DF */
458 static unsigned char combining_char
[8*36] = {
459 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
461 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08, /* CF */
463 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
465 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF, /* DF */
467 0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
469 0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08, /* EF */
471 0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
473 0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08, /* FF */
475 0x04, 0x04, 0x06, 0x06, 0x28, 0x28, 0x01, 0x01,
477 0x02, 0x02, 0x07, 0x07, 0x0C, 0x0C, 0x0C, 0x0C,
479 0x00, 0x00, 0x04, 0x04, 0x06, 0x06, 0x07, 0x07,
481 0x28, 0x28, 0x0C, 0x0C, 0x02, 0x02, 0x06, 0x06,
483 0x07, 0x07, 0x27, 0x27, 0x02, 0x02, 0x00, 0x00,
485 0x03, 0x03, 0x04, 0x04, 0x06, 0x06, 0x28, 0x28,
487 0x07, 0x00, 0x00, 0x00, 0x02, 0x02, 0x27, 0x27,
489 0x00, 0x01, 0x01, 0x27, 0x27, 0x0C, 0x0C, 0x00, /* 13F */
491 0x00, 0x00, 0x00, 0x01, 0x01, 0x27, 0x27, 0x0C,
493 0x0C, 0x00, 0x00, 0x00, 0x04, 0x04, 0x06, 0x06,
495 0x0B, 0x0B, 0x00, 0x00, 0x01, 0x01, 0x27, 0x27,
497 0x0C, 0x0C, 0x01, 0x01, 0x02, 0x02, 0x27, 0x27,
499 0x0C, 0x0C, 0x27, 0x27, 0x0C, 0x0C, 0x00, 0x00,
501 0x03, 0x03, 0x04, 0x04, 0x06, 0x06, 0x0A, 0x0A, /* 16F */
503 0x0B, 0x0B, 0x28, 0x28, 0x02, 0x02, 0x02, 0x02,
505 0x08, 0x01, 0x01, 0x07, 0x07, 0x0C, 0x0C, 0x00,
507 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 17F */
509 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
511 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
513 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 19F */
515 0x1B, 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
517 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1B,
519 0x1B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
521 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
523 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
525 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x0C, /* 1CF */
527 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x04, 0x04, 0x01,
529 0x01, 0x0C, 0x0C, 0x00, 0x00, 0x00, 0x04, 0x04 /* 1DF */
533 /* CYRILLIC codepoints 0x0400 ~ 0x04FF */
534 static const unsigned long __CyrillicDecompBitmap
[] = {
535 0x510A0040, 0x00000040, 0x0000510A, 0x00000000, /* 0x0400 */
536 0x00000000, 0x00000000, 0x00000000, 0x00000000, /* 0x0480 */
539 /* CJK codepoints 0x3000 ~ 0x30FF */
540 static const unsigned long __CJKDecompBitmap
[] = {
541 0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C, /* 0x3000 */
542 0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2, /* 0x3080 */
/*
 * IS_DECOMPOSABLE - test one flag in a decomposition bitmap.
 *
 * table      - array of unsigned long words, each holding 32 flags
 * unicodeVal - zero-based offset of the code point within the range
 *              covered by the bitmap (evaluated twice; no side effects)
 *
 * Flags are stored most-significant-bit first: offset 0 maps to bit 31
 * of word 0, offset 31 to bit 0 of word 0, offset 32 to bit 31 of word 1.
 * Yields non-zero when the flag is set, zero otherwise.
 *
 * The mask is built from 1UL rather than 1: shifting a signed int left
 * by 31 is undefined behaviour, and that shift happens for every offset
 * that is a multiple of 32.
 */
#define IS_DECOMPOSABLE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
548 * ucs_decompose - decompose a composed Unicode char
550 * Composed Unicode characters are forbidden on
551 * HFS Plus volumes. ucs_decompose will convert a
552 * composed character into its correct decomposed
555 * Currently only Tier-1 and Tier-2 languages
556 * are handled. Other composed characters are
560 ucs_decompose(register u_int16_t ch
, u_int16_t
*cmb
)
569 } else if (ch
<= 0x01DF) {
571 base
= (u_int16_t
) primary_char
[ch
- 0x00C0];
576 if ((base
< 0x00C0) || (primary_char
[base
- 0x00C0] == 0))
577 cmb
[0] = (u_int16_t
)0x0300 + (u_int16_t
)combining_char
[ch
- 0x00C0];
579 u_int16_t tch
= base
;
581 base
= (u_int16_t
)primary_char
[tch
- 0x00C0];
582 cmb
[0] = (u_int16_t
)0x0300 + (u_int16_t
)combining_char
[tch
- 0x00C0];
583 cmb
[1] = (u_int16_t
)0x0300 + (u_int16_t
)combining_char
[ch
- 0x00C0];
586 } else if ((ch
>= 0x0400) && (ch
<= 0x04FF) &&
587 IS_DECOMPOSABLE(__CyrillicDecompBitmap
, ch
- 0x0400)) {
589 /* Handle CYRILLIC LETTERs */
591 case 0x0401: base
= 0x0415; cmb
[0] = 0x0308; break; /* */
592 case 0x0403: base
= 0x0413; cmb
[0] = 0x0301; break; /* */
593 case 0x0407: base
= 0x0406; cmb
[0] = 0x0308; break; /* */
594 case 0x040C: base
= 0x041A; cmb
[0] = 0x0301; break; /* */
595 case 0x040E: base
= 0x0423; cmb
[0] = 0x0306; break; /* */
596 case 0x0419: base
= 0x0418; cmb
[0] = 0x0306; break; /* */
597 case 0x0439: base
= 0x0438; cmb
[0] = 0x0306; break; /* */
598 case 0x0451: base
= 0x0435; cmb
[0] = 0x0308; break; /* */
599 case 0x0453: base
= 0x0433; cmb
[0] = 0x0301; break; /* */
600 case 0x0457: base
= 0x0456; cmb
[0] = 0x0308; break; /* */
601 case 0x045C: base
= 0x043A; cmb
[0] = 0x0301; break; /* */
602 case 0x045E: base
= 0x0443; cmb
[0] = 0x0306; break; /* */
605 /* Should not be hit from bit map table */
608 } else if (ch
== 0x1E3F) {
609 base
= 0x006D; cmb
[0] = 0x0301; /* LATIN SMALL LETTER M WITH ACUTE */
610 } else if ((ch
> 0x3000) && (ch
< 0x3100) &&
611 IS_DECOMPOSABLE(__CJKDecompBitmap
, ch
- 0x3000)) {
613 /* Handle HIRAGANA LETTERs */
615 case 0x3071: base
= 0x306F; cmb
[0] = 0x309A; break; /* PA */
616 case 0x3074: base
= 0x3072; cmb
[0] = 0x309A; break; /* PI */
617 case 0x3077: base
= 0x3075; cmb
[0] = 0x309A; break; /* PU */
618 case 0x307A: base
= 0x3078; cmb
[0] = 0x309A; break; /* PE */
620 case 0x307D: base
= 0x307B; cmb
[0] = 0x309A; break; /* PO */
621 case 0x3094: base
= 0x3046; cmb
[0] = 0x3099; break; /* VU */
622 case 0x30D1: base
= 0x30CF; cmb
[0] = 0x309A; break; /* PA */
623 case 0x30D4: base
= 0x30D2; cmb
[0] = 0x309A; break; /* PI */
625 case 0x30D7: base
= 0x30D5; cmb
[0] = 0x309A; break; /* PU */
626 case 0x30DA: base
= 0x30D8; cmb
[0] = 0x309A; break; /* PE */
627 case 0x30DD: base
= 0x30DB; cmb
[0] = 0x309A; break; /* PO */
628 case 0x30F4: base
= 0x30A6; cmb
[0] = 0x3099; break; /* VU */
630 case 0x30F7: base
= 0x30EF; cmb
[0] = 0x3099; break; /* VA */
631 case 0x30F8: base
= 0x30F0; cmb
[0] = 0x3099; break; /* VI */
632 case 0x30F9: base
= 0x30F1; cmb
[0] = 0x3099; break; /* VE */
633 case 0x30FA: base
= 0x30F2; cmb
[0] = 0x3099; break; /* VO */
636 /* the rest (41 of them) have a simple conversion */
640 } else if ((ch
>= 0xAC00) && (ch
< 0xD7A4)) {
643 base
= 0x1100 + (ch
/ (21*28));
644 cmb
[0] = 0x1161 + (ch
% (21*28)) / 28;
647 cmb
[1] = 0x11A7 + (ch
% 28);
656 static const short diacrit_tbl
[8*6] = {
657 /* 300 - 307 */ 0, 58, 116, 174, 232, -1, 290, 348,
658 /* 308 - 30F */ 406, -1, 464, 522, 580, -1, -1, -1,
659 /* 310 - 317 */ -1, -1, -1, -1, -1, -1, -1, -1,
660 /* 318 - 31F */ -1, -1, -1, 638, -1, -1, -1, -1,
661 /* 320 - 327 */ -1, -1, -1, -1, -1, -1, -1, 696,
662 /* 328 - 32F */ 754, -1, -1, -1, -1, -1, -1, -1
665 static const u_int16_t composite_tbl
[58*14] = {
667 * A B C D E F G H I J K L M
668 * N O P Q R S T U V W X Y Z
670 * a b c d e f g h i j k l m
671 * n o p q r s t u v w x y z
675 * 0x300 - grave accent
677 0x0C0, 0, 0, 0,0x0C8, 0, 0, 0,0x0CC, 0, 0, 0, 0,
678 0,0x0D2, 0, 0, 0, 0, 0,0x0D9, 0, 0, 0, 0, 0,
680 0x0E0, 0, 0, 0,0x0E8, 0, 0, 0,0x0EC, 0, 0, 0, 0,
681 0,0x0F2, 0, 0, 0, 0, 0,0x0F9, 0, 0, 0, 0, 0,
683 * 0x301 - acute accent
685 0x0C1, 0,0x106, 0,0x0C9, 0, 0, 0,0x0CD, 0, 0,0x139, 0,
686 0x143,0x0D3, 0, 0,0x154,0x15A, 0,0x0DA, 0, 0, 0,0x0DD,0x179,
688 0x0E1, 0,0x107, 0,0x0E9, 0, 0, 0,0x0ED, 0, 0,0x13A,0x1E3F,
689 0x144,0x0F3, 0, 0,0x155,0x15B, 0,0x0FA, 0, 0, 0,0x0FD,0x17A,
691 * 0x302 - circumflex accent
693 0x0C2, 0,0x108, 0,0x0CA, 0,0x11C,0x124,0x0CE,0x134, 0, 0, 0,
694 0,0x0D4, 0, 0, 0,0x15C, 0,0x0DB, 0,0x174, 0,0x176, 0,
696 0x0E2, 0,0x109, 0,0x0EA, 0,0x11D,0x125,0x0EE,0x135, 0, 0, 0,
697 0,0x0F4, 0, 0, 0,0x15D, 0,0x0FB, 0,0x175, 0,0x177, 0,
701 0x0C3, 0, 0, 0, 0, 0, 0, 0,0x128, 0, 0, 0, 0,
702 0x0D1,0x0D5, 0, 0, 0, 0, 0,0x168, 0, 0, 0, 0, 0,
704 0x0E3, 0, 0, 0, 0, 0, 0, 0,0x129, 0, 0, 0, 0,
705 0x0F1,0x0F5, 0, 0, 0, 0, 0,0x169, 0, 0, 0, 0, 0,
709 0x100, 0, 0, 0,0x112, 0, 0, 0,0x12A, 0, 0, 0, 0,
710 0,0x14C, 0, 0, 0, 0, 0,0x16A, 0, 0, 0, 0, 0,
712 0x101, 0, 0, 0,0x113, 0, 0, 0,0x12B, 0, 0, 0, 0,
713 0,0x14D, 0, 0, 0, 0, 0,0x16B, 0, 0, 0, 0, 0,
717 0x102, 0, 0, 0,0x114, 0,0x11E, 0,0x12C, 0, 0, 0, 0,
718 0,0x14E, 0, 0, 0, 0, 0,0x16C, 0, 0, 0, 0, 0,
720 0x103, 0, 0, 0,0x115, 0,0x11F, 0,0x12D, 0, 0, 0, 0,
721 0,0x14F, 0, 0, 0, 0, 0,0x16D, 0, 0, 0, 0, 0,
725 0, 0,0x10A, 0,0x116, 0,0x120, 0,0x130, 0, 0, 0, 0,
726 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0x17B,
728 0, 0,0x10B, 0,0x117, 0,0x121, 0, 0, 0, 0, 0, 0,
729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0x17C,
733 0x0C4, 0, 0, 0,0x0CB, 0, 0, 0,0x0CF, 0, 0, 0, 0,
734 0,0x0D6, 0, 0, 0, 0, 0,0x0DC, 0, 0, 0,0x178, 0,
736 0x0E4, 0, 0, 0,0x0EB, 0, 0, 0,0x0EF, 0, 0, 0, 0,
737 0,0x0F6, 0, 0, 0, 0, 0,0x0FC, 0, 0, 0,0x0FF, 0,
741 0x0C5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
742 0, 0, 0, 0, 0, 0, 0,0x16E, 0, 0, 0, 0, 0,
744 0x0E5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
745 0, 0, 0, 0, 0, 0, 0,0x16F, 0, 0, 0, 0, 0,
747 * 0x30B - double acute accent
749 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
750 0,0x150, 0, 0, 0, 0, 0,0x170, 0, 0, 0, 0, 0,
752 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
753 0,0x151, 0, 0, 0, 0, 0,0x171, 0, 0, 0, 0, 0,
757 0x1CD, 0,0x10C,0x10E,0x11A, 0, 0, 0,0x1CF, 0, 0,0x13D, 0,
758 0x147,0x1D1, 0, 0,0x158,0x160,0x164,0x1D3, 0, 0, 0, 0,0x17D,
760 0x1CE, 0,0x10D,0x10F,0x11B, 0, 0, 0,0x1D0, 0, 0,0x13E, 0,
761 0x148,0x1D2, 0, 0,0x159,0x161,0x165,0x1D4, 0, 0, 0, 0,0x17E,
765 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
766 0,0x1A0, 0, 0, 0, 0, 0,0x1AF, 0, 0, 0, 0, 0,
768 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
769 0,0x1A1, 0, 0, 0, 0, 0,0x1B0, 0, 0, 0, 0, 0,
773 0, 0,0x0C7, 0, 0, 0,0x122, 0, 0, 0,0x136,0x13B, 0,
774 0x145, 0, 0, 0,0x156,0x15E,0x162, 0, 0, 0, 0, 0, 0,
776 0, 0,0x0E7, 0, 0, 0,0x123, 0, 0, 0,0x137,0x13C, 0,
777 0x146, 0, 0, 0,0x157,0x15F,0x163, 0, 0, 0, 0, 0, 0,
781 0x104, 0, 0, 0,0x118, 0, 0, 0,0x12E, 0, 0, 0, 0,
782 0, 0, 0, 0, 0, 0, 0,0x172, 0, 0, 0, 0, 0,
784 0x105, 0, 0, 0,0x119, 0, 0, 0,0x12F, 0, 0, 0, 0,
785 0, 0, 0, 0, 0, 0, 0,0x173, 0, 0, 0, 0, 0,
789 /* CJK codepoints 0x3000 ~ 0x30FF */
790 static const unsigned long __CJKCombBitmap
[] = {
791 0x00000000, 0x00000000, 0x02155555, 0x4A812490, /* 0x3000 */
792 0x00000004, 0x02155555, 0x4A812490, 0x0001E004, /* 0x3080 */
/*
 * CAN_COMBINE - test one flag in a "base character can take a combining
 * mark" bitmap.
 *
 * table      - array of unsigned long words, each holding 32 flags
 * unicodeVal - zero-based offset of the base character within the range
 *              covered by the bitmap (evaluated twice; no side effects)
 *
 * Flags are stored most-significant-bit first: offset 0 maps to bit 31
 * of word 0, offset 31 to bit 0 of word 0, offset 32 to bit 31 of word 1.
 * Yields non-zero when the flag is set, zero otherwise.
 *
 * The mask is built from 1UL rather than 1: shifting a signed int left
 * by 31 is undefined behaviour, and that shift happens for every offset
 * that is a multiple of 32.
 */
#define CAN_COMBINE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))
799 * ucs_combine - generate a precomposed Unicode char
801 * Precomposed Unicode characters are required for some volume
802 * formats and network protocols. ucs_combine will combine a
803 * decomposed character sequence into a single precomposed
804 * (composite) character.
806 * Currently only decomposed sequences from Apple's Tier 1
807 * and Tier 2 languages are handled.
810 * base - base character
811 * comb - combining character
813 * result - precomposed char or zero if not combinable
816 ucs_combine(u_int16_t base
, u_int16_t comb
)
818 /* Get out early if we can */
822 /* Try ordinary diacritics (0x300 - 0x32F) */
823 if (comb
<= 0x032F) {
826 if (base
>= 'A' && base
<= 'z') {
827 index
= diacrit_tbl
[comb
- 0x0300];
828 if (index
< 0 ) return (0);
830 return (composite_tbl
[index
+ (base
- 'A')]);
833 /* Handle Cyrillic and some 3 char latin sequences */
837 case 0x00DC: return (0x01DB);
838 case 0x00FC: return (0x01DC);
842 case 0x00DC: return (0x01D7);
843 case 0x00FC: return (0x01D8);
844 case 0x0413: return (0x0403);
845 case 0x041A: return (0x040C);
846 case 0x0433: return (0x0453);
847 case 0x043A: return (0x045C);
851 case 0x00DC: return (0x01D5);
852 case 0x00FC: return (0x01D6);
853 case 0x00C4: return (0x01DE);
854 case 0x00E4: return (0x01DF);
858 case 0x0418: return (0x0419);
859 case 0x0423: return (0x040E);
860 case 0x0438: return (0x0439);
861 case 0x0443: return (0x045E);
865 case 0x0406: return (0x0407);
866 case 0x0415: return (0x0401);
867 case 0x0435: return (0x0451);
868 case 0x0456: return (0x0457);
872 case 0x00DC: return (0x01D9);
873 case 0x00FC: return (0x01DA);
883 /* 2 char Hangul sequences */
884 if ((comb
<= 0x1175) && (base
>= 0x1100 && base
<= 0x1112))
885 return (0xAC00 + ((base
- 0x1100)*(21*28)) + ((comb
- 0x1161)*28));
887 /* 3 char Hangul sequences */
888 if ((comb
>= 0x11A8 && comb
<= 0x11C2) &&
889 (base
>= 0xAC00 && base
<= 0xD788)) {
890 if ((base
- 0xAC00) % 28)
893 return (base
+ (comb
- 0x11A7));
896 /* Now try HIRAGANA and KATAKANA */
897 if ((comb
== 0x3099 || comb
== 0x309A) &&
898 (base
> 0x3000 && base
< 0x3100) &&
899 CAN_COMBINE(__CJKCombBitmap
, base
- 0x3000)) {
900 if (comb
== 0x309A) {
902 case 0x306F: return (0x3071); /* PA */
903 case 0x3072: return (0x3074); /* PI */
904 case 0x3075: return (0x3077); /* PU */
905 case 0x3078: return (0x307A); /* PE */
906 case 0x307B: return (0x307D); /* PO */
907 case 0x30CF: return (0x30D1); /* PA */
908 case 0x30D2: return (0x30D4); /* PI */
909 case 0x30D5: return (0x30D7); /* PU */
910 case 0x30D8: return (0x30DA); /* PE */
911 case 0x30DB: return (0x30DD); /* PO */
914 } else /* 0x3099 */ {
916 case 0x3046: return (0x3094); /* VU */
917 case 0x30A6: return (0x30F4); /* VU */
918 case 0x30EF: return (0x30F7); /* VA */
919 case 0x30F0: return (0x30F8); /* VI */
920 case 0x30F1: return (0x30F9); /* VE */
921 case 0x30F2: return (0x30FA); /* VO */
922 default: return (base
+ 1); /* 41 code points here */