[apple/xnu.git] / bsd / vfs / vfs_utfconv.c

/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 * 
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */

#include <sys/param.h>
#include <sys/utfconv.h>
#include <sys/errno.h>
#include <architecture/byte_order.h>


/*
 * UTF-8 (UCS Transformation Format)
 *
 * The following subset of UTF-8 is used to encode UCS-2 filenames. It
 * requires a maximum of three bytes per UCS-2 character.  Only the
 * shortest encoding required to represent the significant UCS-2 bits
 * is legal, with one exception: NUL (0x0000) is written as the two-byte
 * sequence 0xC0 0x80.
 * 
 * UTF-8 Multibyte Codes
 *
 * Bytes   Bits   UCS-2 Min   UCS-2 Max   UTF-8 Byte Sequence (binary)
 * -------------------------------------------------------------------
 *   1       7     0x0000      0x007F      0xxxxxxx
 *   2      11     0x0080      0x07FF      110xxxxx 10xxxxxx
 *   3      16     0x0800      0xFFFF      1110xxxx 10xxxxxx 10xxxxxx
 * -------------------------------------------------------------------
 */


/*
 * Number of UTF-8 bytes (1, 2 or 3) needed to encode the UCS-2 value c.
 * The argument is evaluated up to twice, so it must not have side effects.
 */
#define UCS_TO_UTF_LEN(c)	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : 3))


/* Forward declaration; the definition follows the lookup tables below. */
static u_int16_t ucs_decompose __P((u_int16_t, u_int16_t *));


/*
 * utf8_encodelen - Calculates the UTF-8 encoding length for a UCS-2 filename
 *
 * Returns the number of bytes utf8_encodestr() produces for the same
 * input, not counting any trailing NUL terminator.
 *
 * parameters:
 *    ucsp:      UCS-2 input string
 *    ucslen:    length of the input in bytes (not characters)
 *    altslash:  replacement for '/' characters, or 0 for none
 *    flags:     see below
 *
 * NOTES:
 *    If '/' chars are allowed on disk then an alternate
 *    (replacement) char must be provided in altslash.
 *
 *    An embedded NUL character is encoded as the two bytes
 *    0xC0 0x80, so it contributes 2 to the result.
 *
 * input flags:
 *    UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
 */
size_t
utf8_encodelen(const u_int16_t *ucsp, size_t ucslen, u_int16_t altslash,
	       int flags)
{
	u_int16_t ucs_ch;
	size_t charcnt = ucslen / 2;	/* ucslen is a byte count */
	size_t len = 0;
	int swapbytes = (flags & UTF_REVERSE_ENDIAN);

	for (; charcnt > 0; charcnt--) {
		ucs_ch = *ucsp++;

		if (swapbytes)
			ucs_ch = NXSwapShort(ucs_ch);
		if (altslash && ucs_ch == '/')
			ucs_ch = altslash;

		if (ucs_ch == '\0')
			len += 2;	/* NUL is written as 0xC0 0x80 */
		else
			len += UCS_TO_UTF_LEN(ucs_ch);
	}

	return (len);
}


/*
 * utf8_encodestr - Encodes a UCS-2 (Unicode) string to UTF-8
 *
 * parameters:
 *    ucsp:      UCS-2 input string
 *    ucslen:    length of the input in bytes (not characters)
 *    utf8p:     output buffer
 *    utf8len:   set to the number of UTF-8 bytes produced, not
 *               counting the NUL terminator (set on error too)
 *    buflen:    size of the output buffer in bytes
 *    altslash:  replacement for '/' characters, or 0 for none
 *    flags:     see below
 *
 * returns:
 *    0 on success, or ENAMETOOLONG if the output buffer is too small.
 *    On ENAMETOOLONG the characters that did fit are left in the
 *    buffer (and terminated, if termination was requested and the
 *    buffer has room for the terminator).
 *
 * NOTES:
 *    The resulting UTF-8 string is NUL terminated unless
 *    UTF_NO_NULL_TERM is given; one byte of buflen is reserved
 *    for the terminator.
 *
 *    If '/' chars are allowed on disk then an alternate
 *    (replacement) char must be provided in altslash.
 *
 *    An embedded NUL character is encoded as 0xC0 0x80.
 *
 * input flags:
 *    UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
 *    UTF_NO_NULL_TERM:  don't add NULL termination to UTF-8 output
 */
int
utf8_encodestr(const u_int16_t *ucsp, size_t ucslen, u_int8_t *utf8p,
	       size_t *utf8len, size_t buflen, u_int16_t altslash, int flags)
{
	u_int8_t *bufstart = utf8p;
	u_int8_t *bufend;
	u_int16_t ucs_ch;
	size_t charcnt = ucslen / 2;	/* ucslen is a byte count */
	size_t need;
	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
	int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
	int result = 0;

	if (nullterm) {
		/* There must be room for at least the terminator itself. */
		if (buflen == 0) {
			*utf8len = 0;
			return (ENAMETOOLONG);
		}
		--buflen;
	}
	bufend = bufstart + buflen;

	for (; charcnt > 0; charcnt--) {
		ucs_ch = *ucsp++;

		if (swapbytes)
			ucs_ch = NXSwapShort(ucs_ch);
		if (altslash && ucs_ch == '/')
			ucs_ch = altslash;

		/* NOTE: NULL maps to 0xC080, so it takes the 2 byte form */
		if ((ucs_ch < 0x0080) && (ucs_ch != '\0'))
			need = 1;
		else if (ucs_ch < 0x0800)
			need = 2;
		else
			need = 3;

		/* Compare remaining space, never form a pointer past bufend. */
		if ((size_t)(bufend - utf8p) < need) {
			result = ENAMETOOLONG;
			break;
		}

		switch (need) {
		case 1:
			*utf8p++ = (u_int8_t)ucs_ch;
			break;
		case 2:
			*utf8p++ = (u_int8_t)((ucs_ch >> 6) | 0xc0);
			*utf8p++ = (u_int8_t)((ucs_ch & 0x3f) | 0x80);
			break;
		default:
			*utf8p++ = (u_int8_t)((ucs_ch >> 12) | 0xe0);
			*utf8p++ = (u_int8_t)(((ucs_ch >> 6) & 0x3f) | 0x80);
			*utf8p++ = (u_int8_t)((ucs_ch & 0x3f) | 0x80);
			break;
		}
	}

	*utf8len = utf8p - bufstart;
	if (nullterm)
		*utf8p = '\0';	/* the byte reserved above */

	return (result);
}


/*
 * utf8_decodestr - Decodes a UTF-8 string back to UCS-2 (Unicode)
 *
 * parameters:
 *    utf8p:     UTF-8 input string
 *    utf8len:   maximum number of input bytes to consume
 *    ucsp:      UCS-2 output buffer
 *    ucslen:    set to the number of bytes (not characters) written
 *               to ucsp; set on error too
 *    buflen:    size of the output buffer in bytes
 *    altslash:  character that is converted back to '/', or 0 for none
 *    flags:     see below
 *
 * returns:
 *    0 on success,
 *    ENAMETOOLONG if the output buffer is too small,
 *    EINVAL if the input is not valid (bad lead or continuation byte,
 *    a sequence cut off by the end of the input, or a sequence that
 *    is not in shortest form).
 *
 * NOTES:
 *    The input UTF-8 string does not need to be null terminated
 *    if utf8len is set.  Decoding stops at the first NUL byte or
 *    after utf8len bytes, whichever comes first.
 *
 *    The two-byte sequence 0xC0 0x80 decodes to a UCS-2 NUL; it is
 *    the only non-shortest form that is accepted.
 *
 *    If '/' chars are allowed on disk then an alternate
 *    (replacement) char must be provided in altslash.
 *
 * input flags:
 *    UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
 *    UTF_DECOMPOSED:     UCS-2 output string must be fully decomposed
 */
int
utf8_decodestr(const u_int8_t *utf8p, size_t utf8len, u_int16_t *ucsp,
	       size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
{
	u_int16_t *bufstart;
	u_int16_t *bufend;
	u_int16_t ucs_ch;
	u_int8_t byte;
	int result = 0;
	int decompose, swapbytes;

	decompose = (flags & UTF_DECOMPOSED);
	swapbytes = (flags & UTF_REVERSE_ENDIAN);

	bufstart = ucsp;
	/* Only whole UCS-2 units fit; an odd trailing byte is unusable. */
	bufend = ucsp + (buflen / sizeof(u_int16_t));

	while (utf8len > 0) {
		byte = *utf8p++;
		utf8len--;
		if (byte == '\0')
			break;

		if (ucsp >= bufend) {
			result = ENAMETOOLONG;
			goto stop;
		}

		/* check for ascii */
		if (byte < 0x80) {
			ucs_ch = byte;
		} else {
			switch (byte & 0xf0) {
			/* 2 byte sequence */
			case 0xc0:
			case 0xd0:
				/* extract bits 6 - 10 from first byte */
				ucs_ch = (byte & 0x1F) << 6;
				/*
				 * Lead bytes 0xC0 and 0xC1 can only start a
				 * non-shortest form.  The sole exception is
				 * 0xC0 0x80, the encoding used for NUL.
				 */
				if ((ucs_ch < 0x0080) &&
				    (byte != 0xc0 || utf8len == 0 ||
				     *utf8p != 0x80)) {
					result = EINVAL;  /* seq not minimal */
					goto stop;
				}
				break;
			/* 3 byte sequence */
			case 0xe0:
				/* extract bits 12 - 15 from first byte */
				ucs_ch = (byte & 0x0F) << 6;

				/* extract bits 6 - 11 from second byte */
				if (utf8len == 0 ||
				    ((byte = *utf8p++) & 0xc0) != 0x80) {
					result = EINVAL;
					goto stop;
				}
				utf8len--;

				ucs_ch += (byte & 0x3F);
				ucs_ch <<= 6;

				if (ucs_ch < 0x0800) {
					result = EINVAL; /* seq not minimal */
					goto stop;
				}
				break;
			default:
				/* stray continuation byte or 4+ byte lead */
				result = EINVAL;
				goto stop;
			}

			/*
			 * extract bits 0 - 5 from final byte; the input must
			 * not end in the middle of a sequence
			 */
			if (utf8len == 0 ||
			    ((byte = *utf8p++) & 0xc0) != 0x80) {
				result = EINVAL;
				goto stop;
			}
			utf8len--;
			ucs_ch += (byte & 0x3F);

			if (decompose) {
				u_int16_t comb_ch;

				ucs_ch = ucs_decompose(ucs_ch, &comb_ch);

				if (comb_ch) {
					/* emit the base char, then fall through
					 * to emit the combining char below */
					if (swapbytes)
						*ucsp++ = NXSwapShort(ucs_ch);
					else
						*ucsp++ = ucs_ch;

					if (ucsp >= bufend) {
						result = ENAMETOOLONG;
						goto stop;
					}

					ucs_ch = comb_ch;
				}
			}
		}

		/* altslash == 0 means "no replacement"; never map NUL to '/' */
		if (altslash && ucs_ch == altslash)
			ucs_ch = '/';
		if (swapbytes)
			ucs_ch = NXSwapShort(ucs_ch);

		*ucsp++ = ucs_ch;
	}
stop:
	*ucslen = (u_int8_t *)ucsp - (u_int8_t *)bufstart;

	return (result);
}


/*
 * Lookup tables for Unicode chars 0x00C0 thru 0x00FF, indexed by
 * (char - 0x00C0).
 *
 * primary_char yields the first decomposed char.  If that char is
 * <= 'z' (i.e. a plain ASCII letter) the second char of the
 * decomposition is 0x0300 plus the matching combining_char entry.
 * Otherwise primary_char holds the char itself (no decomposition)
 * and the combining_char entry is the unused placeholder 0xFF.
 *
 * The tables are only ever read, so they are const.
 */

static const unsigned char primary_char[64] = {
	/* 0xC0 - 0xC7 */
	0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43,
	/* 0xC8 - 0xCF */
	0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,
	/* 0xD0 - 0xD7 */
	0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7,
	/* 0xD8 - 0xDF */
	0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF,
	/* 0xE0 - 0xE7 */
	0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63,
	/* 0xE8 - 0xEF */
	0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,
	/* 0xF0 - 0xF7 */
	0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7,
	/* 0xF8 - 0xFF */
	0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79,
};

static const unsigned char combining_char[64] = {
	/* 0xC0 - 0xC7 */
	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
	/* 0xC8 - 0xCF */
	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
	/* 0xD0 - 0xD7 */
	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
	/* 0xD8 - 0xDF */
	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,
	/* 0xE0 - 0xE7 */
	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
	/* 0xE8 - 0xEF */
	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
	/* 0xF0 - 0xF7 */
	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
	/* 0xF8 - 0xFF */
	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08
};


/*
 * CJK codepoints 0x3000 ~ 0x30FF
 *
 * One bit per code point, 32 code points per word, most significant
 * bit first: code point (0x3000 + n) is decomposable when bit
 * (31 - n % 32) of word (n / 32) is set.
 */
static const unsigned long __CJKDecompBitmap[] = {
    0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C,	/* 0x3000 */
    0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2,	/* 0x3080 */
};
/*
 * Nonzero when the bit for unicodeVal (an offset into the bitmap, not a
 * full code point) is set.  The mask is built from an unsigned long so
 * that selecting bit 31 does not shift into the sign bit of an int.
 */
#define IS_DECOMPOSABLE(table,unicodeVal) \
	((table)[(unicodeVal) / 32] & (1UL << (31 - ((unicodeVal) % 32))))

/*
 * ucs_decompose - split a precomposed UCS-2 char into base + combining mark
 *
 * HFS Plus volumes do not allow composed Unicode characters, so a
 * composed character has to be replaced by its decomposed sequence.
 *
 * The base character is returned.  *cmb receives the combining
 * character, or 0 when ch is passed through unchanged.
 *
 * Only the MacRoman (0x00C0 - 0x00FF) and MacJapanese (kana in
 * 0x3001 - 0x30FF) repertoires are covered; every other character
 * is returned as is.
 */
static u_int16_t
ucs_decompose(register u_int16_t ch, u_int16_t *cmb)
{
	u_int16_t slot;
	u_int16_t lead;

	*cmb = 0;

	/* Latin-1 letters: table driven */
	if ((ch >= 0x00C0) && (ch <= 0x00FF)) {
		slot = ch - 0x00C0;
		lead = (u_int16_t) primary_char[slot];

		/* only entries that map to a plain letter carry a mark */
		if (lead <= 'z')
			*cmb = (u_int16_t)0x0300 + (u_int16_t)combining_char[slot];

		return (lead);
	}

	/* everything that is not a decomposable kana passes through */
	if ((ch <= 0x3000) || (ch >= 0x3100) ||
	    !IS_DECOMPOSABLE(__CJKDecompBitmap, ch - 0x3000))
		return (ch);

	switch (ch) {
	/* semi-voiced PA PI PU PE PO: base is two below, mark is 0x309A */
	case 0x3071: case 0x3074: case 0x3077: case 0x307A: case 0x307D:
	case 0x30D1: case 0x30D4: case 0x30D7: case 0x30DA: case 0x30DD:
		*cmb = 0x309A;
		return (ch - 2);

	/* VU: hiragana 0x3094 -> 0x3046, katakana 0x30F4 -> 0x30A6 */
	case 0x3094:
	case 0x30F4:
		*cmb = 0x3099;
		return (ch - 0x004E);

	/* katakana VA VI VE VO: 0x30F7..0x30FA -> 0x30EF..0x30F2 */
	case 0x30F7: case 0x30F8: case 0x30F9: case 0x30FA:
		*cmb = 0x3099;
		return (ch - 8);

	default:
		break;
	}

	/* the rest (41 of them) are the preceding kana plus a voiced mark */
	*cmb = 0x3099;
	return (ch - 1);
}
Commit	Line	Data
1c79356b A	1	/*
	2	* Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
	11	*
	12	* This Original Code and all software distributed under the License are
	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22
	23	#include <sys/param.h>
	24	#include <sys/utfconv.h>
	25	#include <sys/errno.h>
	26	#include <architecture/byte_order.h>
	27
	28
	29	/*
	30	* UTF-8 (UCS Transformation Format)
	31	*
	32	* The following subset of UTF-8 is used to encode UCS-2 filenames. It
	33	* requires a maximum of three 3 bytes per UCS-2 character. Only the
	34	* shortest encoding required to represent the significant UCS-2 bits
	35	* is legal.
	36	*
	37	* UTF-8 Multibyte Codes
	38	*
	39	* Bytes Bits UCS-2 Min UCS-2 Max UTF-8 Byte Sequence (binary)
	40	* -------------------------------------------------------------------
	41	* 1 7 0x0000 0x007F 0xxxxxxx
	42	* 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
	43	* 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
	44	* -------------------------------------------------------------------
	45	*/
	46
	47
	48	#define UCS_TO_UTF_LEN(c) ((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : 3))
	49
	50
	51	static u_int16_t ucs_decompose __P((u_int16_t, u_int16_t *));
	52
	53
	54	/*
	55	* utf8_encodelen - Calculates the UTF-8 encoding length for a UCS-2 filename
	56	*
	57	* NOTES:
	58	* If '/' chars are allowed on disk then an alternate
	59	* (replacement) char must be provided in altslash.
	60	*
	61	* input flags:
	62	* UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
	63	*/
	64	size_t
65	utf8_encodelen(ucsp, ucslen, altslash, flags)
66	const u_int16_t * ucsp;
67	size_t ucslen;
68	u_int16_t altslash;
69	int flags;
70	{
71	u_int16_t ucs_ch;
72	int charcnt;
73	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
74	size_t len;
75
76	charcnt = ucslen / 2;
77	len = 0;
78
79	while (charcnt-- > 0) {
80	ucs_ch = *ucsp++;
81
82	if (swapbytes)
83	ucs_ch = NXSwapShort(ucs_ch);
84	if (altslash && ucs_ch == '/')
85	ucs_ch = altslash;
86	if (ucs_ch == '\0')
87	ucs_ch = 0xc080;
88
89	len += UCS_TO_UTF_LEN(ucs_ch);
90	}
91
92	return (len);
93	}
94
95
96	/*
97	* utf8_encodestr - Encodes a UCS-2 (Unicode) string to UTF-8
98	*
99	* NOTES:
100	* The resulting UTF-8 string is not null terminated.
101	*
102	* If '/' chars are allowed on disk then an alternate
103	* (replacement) char must be provided in altslash.
104	*
105	* input flags:
106	* UTF_REVERSE_ENDIAN: UCS-2 byteorder is opposite current runtime
107	* UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
108	*/
109	int utf8_encodestr(ucsp, ucslen, utf8p, utf8len, buflen, altslash, flags)
110	const u_int16_t * ucsp;
111	size_t ucslen;
112	u_int8_t * utf8p;
113	size_t * utf8len;
114	size_t buflen;
115	u_int16_t altslash;
116	int flags;
117	{
118	u_int8_t * bufstart;
119	u_int8_t * bufend;
120	u_int16_t ucs_ch;
121	int charcnt;
122	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
123	int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
124	int result = 0;
125
126	bufstart = utf8p;
127	bufend = bufstart + buflen;
128	if (nullterm)
129	--bufend;
130	charcnt = ucslen / 2;
131
132	while (charcnt-- > 0) {
133	ucs_ch = *ucsp++;
134
135	if (swapbytes)
136	ucs_ch = NXSwapShort(ucs_ch);
137	if (altslash && ucs_ch == '/')
138	ucs_ch = altslash;
139
140	if ((ucs_ch < 0x0080) && (ucs_ch != '\0')) {
141	if (utf8p >= bufend) {
142	result = ENAMETOOLONG;
143	break;
144	}
145	*utf8p++ = ucs_ch;
146
147	} else if (ucs_ch < 0x800) {
148	if ((utf8p + 1) >= bufend) {
149	result = ENAMETOOLONG;
150	break;
151	}
152	/* NOTE: NULL maps to 0xC080 */
153	*utf8p++ = (ucs_ch >> 6) \| 0xc0;
154	*utf8p++ = (ucs_ch & 0x3f) \| 0x80;
155
156	} else {
157	if ((utf8p + 2) >= bufend) {
158	result = ENAMETOOLONG;
159	break;
160	}
161	*utf8p++ = (ucs_ch >> 12) \| 0xe0;
162	*utf8p++ = ((ucs_ch >> 6) & 0x3f) \| 0x80;
163	*utf8p++ = ((ucs_ch) & 0x3f) \| 0x80;
164	}
165	}
166
167	*utf8len = utf8p - bufstart;
168	if (nullterm)
169	*utf8p++ = '\0';
170
171	return (result);
172	}
173
174
175	/*
176	* utf8_decodestr - Decodes a UTF-8 string back to UCS-2 (Unicode)
177	*
178	* NOTES:
179	* The input UTF-8 string does not need to be null terminated
180	* if utf8len is set.
181	*
182	* If '/' chars are allowed on disk then an alternate
183	* (replacement) char must be provided in altslash.
184	*
185	* input flags:
186	* UTF_REV_ENDIAN: UCS-2 byteorder is oposite current runtime
187	* UTF_DECOMPOSED: UCS-2 output string must be fully decompsed
188	*/
189	int
190	utf8_decodestr(utf8p, utf8len, ucsp, ucslen, buflen, altslash, flags)
191	const u_int8_t* utf8p;
192	size_t utf8len;
193	u_int16_t* ucsp;
194	size_t *ucslen;
195	size_t buflen;
196	u_int16_t altslash;
197	int flags;
198	{
199	u_int16_t* bufstart;
200	u_int16_t* bufend;
201	u_int16_t ucs_ch;
202	u_int8_t byte;
203	int result = 0;
204	int decompose, swapbytes;
205
206	decompose = (flags & UTF_DECOMPOSED);
207	swapbytes = (flags & UTF_REVERSE_ENDIAN);
208
209	bufstart = ucsp;
210	bufend = (u_int16_t )((u_int8_t )ucsp + buflen);
211
212	while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
213	if (ucsp >= bufend) {
214	result = ENAMETOOLONG;
215	goto stop;
216	}
217
218	/* check for ascii */
219	if (byte < 0x80) {
220	ucs_ch = byte;
221	} else {
222	switch (byte & 0xf0) {
223	/* 2 byte sequence*/
224	case 0xc0:
225	case 0xd0:
226	/* extract bits 6 - 10 from first byte */
227	ucs_ch = (byte & 0x1F) << 6;
228	if ((ucs_ch < 0x0080) && (*utf8p != 0x80)) {
229	result = EINVAL; /* seq not minimal */
230	goto stop;
231	}
232	break;
233	/* 3 byte sequence*/
234	case 0xe0:
235	/* extract bits 12 - 15 from first byte */
236	ucs_ch = (byte & 0x0F) << 6;
237
238	/* extract bits 6 - 11 from second byte */
239	if (((byte = *utf8p++) & 0xc0) != 0x80) {
240	result = EINVAL;
241	goto stop;
242	}
243	utf8len--;
244
245	ucs_ch += (byte & 0x3F);
246	ucs_ch <<= 6;
247
248	if (ucs_ch < 0x0800) {
249	result = EINVAL; /* seq not minimal */
250	goto stop;
251	}
252	break;
253	default:
254	result = EINVAL;
255	goto stop;
256	}
257
258	/* extract bits 0 - 5 from final byte */
259	if (((byte = *utf8p++) & 0xc0) != 0x80) {
260	result = EINVAL;
261	goto stop;
262	}
263	utf8len--;
264	ucs_ch += (byte & 0x3F);
265
266	if (decompose) {
267	u_int16_t comb_ch;
268
269	ucs_ch = ucs_decompose(ucs_ch, &comb_ch);
270
271	if (comb_ch) {
272	if (swapbytes)
273	*ucsp++ = NXSwapShort(ucs_ch);
274	else
275	*ucsp++ = ucs_ch;
276
277	if (ucsp >= bufend) {
278	result = ENAMETOOLONG;
279	goto stop;
280	}
281
282	ucs_ch = comb_ch;
283	}
284	}
285	}
286
287	if (ucs_ch == altslash)
288	ucs_ch = '/';
289	if (swapbytes)
290	ucs_ch = NXSwapShort(ucs_ch);
291
292	*ucsp++ = ucs_ch;
293	}
294	stop:
295	ucslen = (u_int8_t)ucsp - (u_int8_t*)bufstart;
296
297	return (result);
298	}
299
300
301	/*
302	* Lookup tables for Unicode chars 0x00C0 thru 0x00FF
303	* primary_char yields first decomposed char. If this
304	* char is an alpha char then get the combining char
305	* from the combining_char table and add 0x0300 to it.
306	*/
307
308	static unsigned char primary_char[64] = {
309	0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xC6, 0x43,
310
311	0x45, 0x45, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49,
312
313	0xD0, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x4F, 0xD7,
314
315	0xD8, 0x55, 0x55, 0x55, 0x55, 0x59, 0xDE, 0xDF,
316
317	0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0xE6, 0x63,
318
319	0x65, 0x65, 0x65, 0x65, 0x69, 0x69, 0x69, 0x69,
320
321	0xF0, 0x6E, 0x6F, 0x6F, 0x6F, 0x6F, 0x6F, 0xF7,
322
323	0xF8, 0x75, 0x75, 0x75, 0x75, 0x79, 0xFE, 0x79,
324	};
325
326	static unsigned char combining_char[64] = {
327	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
328
329	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
330
331	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
332
333	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0xFF,
334
335	0x00, 0x01, 0x02, 0x03, 0x08, 0x0A, 0xFF, 0x27,
336
337	0x00, 0x01, 0x02, 0x08, 0x00, 0x01, 0x02, 0x08,
338
339	0xFF, 0x03, 0x00, 0x01, 0x02, 0x03, 0x08, 0xFF,
340
341	0xFF, 0x00, 0x01, 0x02, 0x08, 0x01, 0xFF, 0x08
342	};
343
344
345	/* CJK codepoints 0x3000 ~ 0x30FF */
346	static const unsigned long __CJKDecompBitmap[] = {
347	0x00000000, 0x00000000, 0x000AAAAA, 0xA540DB6C, /* 0x3000 */
348	0x00000802, 0x000AAAAA, 0xA540DB6C, 0x000009E2, /* 0x3080 */
349	};
350	#define IS_DECOMPOSABLE(table,unicodeVal) \
351	(table[(unicodeVal) / 32] & (1 << (31 - ((unicodeVal) % 32))))
352
353	/*
354	* ucs_decompose - decompose a composed UCS-2 char
355	*
356	* Composed Unicode characters are forbidden on
357	* HFS Plus volumes. ucs_decompose will convert a
358	* composed character into its correct decomposed
359	* sequence.
360	*
361	* Currently only MacRoman and MacJapanese chars
362	* are handled. Other composed characters are
363	* passed unchanged.
364	*/
365	static u_int16_t
366	ucs_decompose(register u_int16_t ch, u_int16_t *cmb)
367	{
368	u_int16_t base;
369
370	*cmb = 0;
371
372	if ((ch <= 0x00FF) && (ch >= 0x00C0)) {
373	ch -= 0x00C0;
374
375	base = (u_int16_t) primary_char[ch];
376
377	if (base <= 'z') {
378	*cmb = (u_int16_t)0x0300 + (u_int16_t)combining_char[ch];
379	}
380	} else if ((ch > 0x3000) && (ch < 0x3100) &&
381	IS_DECOMPOSABLE(__CJKDecompBitmap, ch - 0x3000)) {
382
383	/* Handle HIRAGANA LETTERs */
384	switch(ch) {
385	case 0x3071: base = 0x306F; cmb = 0x309A; break; / PA */
386	case 0x3074: base = 0x3072; cmb = 0x309A; break; / PI */
387	case 0x3077: base = 0x3075; cmb = 0x309A; break; / PU */
388	case 0x307A: base = 0x3078; cmb = 0x309A; break; / PE */
389
390	case 0x307D: base = 0x307B; cmb = 0x309A; break; / PO */
391	case 0x3094: base = 0x3046; cmb = 0x3099; break; / VU */
392	case 0x30D1: base = 0x30CF; cmb = 0x309A; break; / PA */
393	case 0x30D4: base = 0x30D2; cmb = 0x309A; break; / PI */
394
395	case 0x30D7: base = 0x30D5; cmb = 0x309A; break; / PU */
396	case 0x30DA: base = 0x30D8; cmb = 0x309A; break; / PE */
397	case 0x30DD: base = 0x30DB; cmb = 0x309A; break; / PO */
398	case 0x30F4: base = 0x30A6; cmb = 0x3099; break; / VU */
399
400	case 0x30F7: base = 0x30EF; cmb = 0x3099; break; / VA */
401	case 0x30F8: base = 0x30F0; cmb = 0x3099; break; / VI */
402	case 0x30F9: base = 0x30F1; cmb = 0x3099; break; / VE */
403	case 0x30FA: base = 0x30F2; cmb = 0x3099; break; / VO */
404
405	default:
406	/* the rest (41 of them) have a simple conversion */
407	base = ch - 1;
408	*cmb = 0x3099;
409	}
410	} else {
411	base = ch;
412	}
413
414	return (base);
415	}
416