git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	/*
	30	* Includes Unicode 3.2 decomposition code derived from Core Foundation
	31	*/
	32
	33	#include <sys/param.h>
	34	#include <sys/utfconv.h>
	35	#include <sys/errno.h>
	36	#include <sys/malloc.h>
	37	#include <libkern/OSByteOrder.h>
	38
	39	#if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST)
	40	#include <kern/assert.h>
	41	#else
	42	#include <assert.h>
	43	#endif
	44
	45	/*
	46	* UTF-8 (Unicode Transformation Format)
	47	*
	48	* UTF-8 is the Unicode Transformation Format that serializes a Unicode
	49	* character as a sequence of one to four bytes. Only the shortest form
	50	* required to represent the significant Unicode bits is legal.
	51	*
	52	* UTF-8 Multibyte Codes
	53	*
	54	*  Bytes  Bits  Unicode Min  Unicode Max  UTF-8 Byte Sequence (binary)
	55	* -----------------------------------------------------------------------------
	56	*    1      7   0x0000       0x007F       0xxxxxxx
	57	*    2     11   0x0080       0x07FF       110xxxxx 10xxxxxx
	58	*    3     16   0x0800       0xFFFF       1110xxxx 10xxxxxx 10xxxxxx
	59	*    4     21   0x10000      0x10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	60	* -----------------------------------------------------------------------------
	61	*/
	62
	63
	/*
	 * Number of UTF-8 bytes needed for one 16-bit code unit.
	 *
	 * A surrogate half (0xD800 - 0xDFFF) is counted as 2 bytes, so that a
	 * complete surrogate pair adds up to the 4 bytes of its UTF-8 form.
	 * Note that the argument is evaluated more than once.
	 */
	#define UNICODE_TO_UTF8_LEN(c) \
		(((c) < 0x0080) ? 1 : \
		 (((c) < 0x0800) ? 2 : \
		  ((((c) & 0xf800) == 0xd800) ? 2 : 3)))

	/* Code unit written in place of an embedded NUL. */
	#define UCS_ALT_NULL    0x2400

	/* Surrogate pair arithmetic */
	#define SP_HALF_SHIFT   10
	#define SP_HALF_BASE    0x0010000u
	#define SP_HALF_MASK    0x3FFu

	/* Surrogate pair ranges */
	#define SP_HIGH_FIRST   0xD800u
	#define SP_HIGH_LAST    0xDBFFu
	#define SP_LOW_FIRST    0xDC00u
	#define SP_LOW_LAST     0xDFFFu
	78
	79
	80	#include "vfs_utfconvdata.h"
	81
	82
	83	/*
	84	* Test for a combining character.
	85	*
	86	* Similar to __CFUniCharIsNonBaseCharacter except that
	87	* unicode_combinable also includes Hangul Jamo characters.
	88	*/
	89	int
	90	unicode_combinable(u_int16_t character)
	91	{
	92	const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
	93	u_int8_t value;
	94
	95	if (character < 0x0300) {
	96	return 0;
	97	}
	98
	99	value = bitmap[(character >> 8) & 0xFF];
	100
	101	if (value == 0xFF) {
	102	return 1;
	103	} else if (value) {
	104	bitmap = bitmap + ((value - 1) * 32) + 256;
	105	return bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0;
	106	}
	107	return 0;
	108	}
	109
	110	/*
	111	* Test for a precomposed character.
	112	*
	113	* Similar to __CFUniCharIsDecomposableCharacter.
	114	*/
	115	int
	116	unicode_decomposeable(u_int16_t character)
	117	{
	118	const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
	119	u_int8_t value;
	120
	121	if (character < 0x00C0) {
	122	return 0;
	123	}
	124
	125	value = bitmap[(character >> 8) & 0xFF];
	126
	127	if (value == 0xFF) {
	128	return 1;
	129	} else if (value) {
	130	bitmap = bitmap + ((value - 1) * 32) + 256;
	131	return bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0;
	132	}
	133	return 0;
	134	}
	135
	136
	137	/*
	138	* Get the combing class.
	139	*
	140	* Similar to CFUniCharGetCombiningPropertyForCharacter.
	141	*/
	142	static inline u_int8_t
	143	get_combining_class(u_int16_t character)
	144	{
	145	const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
	146
	147	u_int8_t value = bitmap[(character >> 8)];
	148
	149	if (value) {
	150	bitmap = bitmap + (value * 256);
	151	return bitmap[character % 256];
	152	}
	153	return 0;
	154	}
	155
	156
	/* Helpers defined further down in this file. */
	static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
	static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
	static void prioritysort(u_int16_t* characters, int count);
	static u_int16_t ucs_to_sfm(u_int16_t ucs_ch, int lastchar);
	static u_int16_t sfm_to_ucs(u_int16_t ucs_ch);


	/*
	 * Number of bytes that follow a UTF-8 lead byte, indexed by the top five
	 * bits of that byte (byte >> 3). A value of -1 marks a byte that cannot
	 * start a sequence.
	 */
	char utf_extrabytes[32] = {
		/* 0x00 - 0x7F: single byte */
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0,
		/* 0x80 - 0xBF: not a lead byte */
		-1, -1, -1, -1, -1, -1, -1, -1,
		/* 0xC0 - 0xDF: one more byte */
		1, 1, 1, 1,
		/* 0xE0 - 0xEF: two more bytes */
		2, 2,
		/* 0xF0 - 0xF7: three more bytes */
		3,
		/* 0xF8 - 0xFF: not a lead byte */
		-1
	};

	/* Upper-case hexadecimal digits, indexed by nibble value. */
	const char hexdigits[16] = {
		'0', '1', '2', '3',
		'4', '5', '6', '7',
		'8', '9', 'A', 'B',
		'C', 'D', 'E', 'F'
	};
	177
	178	/*
	179	* utf8_encodelen - Calculate the UTF-8 encoding length
	180	*
	181	* This function takes a Unicode input string, ucsp, of ucslen bytes
	182	* and calculates the size of the UTF-8 output in bytes (not including
	183	* a NULL termination byte). The string must reside in kernel memory.
	184	*
	185	* If '/' chars are possible in the Unicode input then an alternate
	186	* (replacement) char should be provided in altslash.
	187	*
	188	* FLAGS
	189	* UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
	190	*
	191	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	192	*
	193	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	194	*
	195	* UTF_DECOMPOSED: generate fully decomposed output
	196	*
	197	* UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
	198	*
	199	* ERRORS
	200	* None
	201	*/
	202	size_t
	203	utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
	204	{
	205	u_int16_t ucs_ch;
	206	u_int16_t * chp = NULL;
	207	u_int16_t sequence[8];
	208	int extra = 0;
	209	size_t charcnt;
	210	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
	211	int decompose = (flags & UTF_DECOMPOSED);
	212	size_t len;
	213
	214	charcnt = ucslen / 2;
	215	len = 0;
	216
	217	while (charcnt-- > 0) {
	218	if (extra > 0) {
	219	--extra;
	220	ucs_ch = *chp++;
	221	} else {
	222	ucs_ch = *ucsp++;
	223	if (swapbytes) {
	224	ucs_ch = OSSwapInt16(ucs_ch);
	225	}
	226	if (ucs_ch == '/') {
	227	ucs_ch = altslash ? altslash : '_';
	228	} else if (ucs_ch == '\0') {
	229	ucs_ch = UCS_ALT_NULL;
	230	} else if (decompose && unicode_decomposeable(ucs_ch)) {
	231	extra = unicode_decompose(ucs_ch, sequence) - 1;
	232	charcnt += extra;
	233	ucs_ch = sequence[0];
	234	chp = &sequence[1];
	235	}
	236	}
	237	len += UNICODE_TO_UTF8_LEN(ucs_ch);
	238	}
	239
	240	return len;
	241	}
	242
	243
	244	/*
	245	* utf8_encodestr - Encodes a Unicode string to UTF-8
	246	*
	247	* NOTES:
	248	* The resulting UTF-8 string is NULL terminated.
	249	*
	250	* If '/' chars are allowed on disk then an alternate
	251	* (replacement) char must be provided in altslash.
	252	*
	253	* input flags:
	254	* UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
	255	*
	256	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	257	*
	258	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	259	*
	260	* UTF_DECOMPOSED: generate fully decomposed output
	261	*
	262	* UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
	263	*
	264	* result:
	265	* ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
	266	*
	267	* EINVAL: Illegal char found; char was replaced by an '_'.
	268	*/
	269	int
	270	utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
	271	size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
	272	{
	273	u_int8_t * bufstart;
	274	u_int8_t * bufend;
	275	u_int16_t ucs_ch;
	276	u_int16_t * chp = NULL;
	277	u_int16_t sequence[8];
	278	int extra = 0;
	279	size_t charcnt;
	280	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
	281	int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
	282	int decompose = (flags & UTF_DECOMPOSED);
	283	int sfmconv = (flags & UTF_SFM_CONVERSIONS);
	284	int result = 0;
	285
	286	bufstart = utf8p;
	287	bufend = bufstart + buflen;
	288	if (nullterm) {
	289	--bufend;
	290	}
	291	charcnt = ucslen / 2;
	292
	293	while (charcnt-- > 0) {
	294	if (extra > 0) {
	295	--extra;
	296	ucs_ch = *chp++;
	297	} else {
	298	ucs_ch = swapbytes ? OSSwapInt16(ucsp++) : ucsp++;
	299
	300	if (decompose && unicode_decomposeable(ucs_ch)) {
	301	extra = unicode_decompose(ucs_ch, sequence) - 1;
	302	charcnt += extra;
	303	ucs_ch = sequence[0];
	304	chp = &sequence[1];
	305	}
	306	}
	307
	308	/* Slash and NULL are not permitted */
	309	if (ucs_ch == '/') {
	310	if (altslash) {
	311	ucs_ch = altslash;
	312	} else {
	313	ucs_ch = '_';
	314	result = EINVAL;
	315	}
	316	} else if (ucs_ch == '\0') {
	317	ucs_ch = UCS_ALT_NULL;
	318	}
	319
	320	if (ucs_ch < 0x0080) {
	321	if (utf8p >= bufend) {
	322	result = ENAMETOOLONG;
	323	break;
	324	}
	325	*utf8p++ = (u_int8_t)ucs_ch;
	326	} else if (ucs_ch < 0x800) {
	327	if ((utf8p + 1) >= bufend) {
	328	result = ENAMETOOLONG;
	329	break;
	330	}
	331	*utf8p++ = 0xc0 \| (u_int8_t)(ucs_ch >> 6);
	332	*utf8p++ = 0x80 \| (0x3f & ucs_ch);
	333	} else {
	334	/* These chars never valid Unicode. */
	335	if (ucs_ch == 0xFFFE \|\| ucs_ch == 0xFFFF) {
	336	result = EINVAL;
	337	break;
	338	}
	339
	340	/* Combine valid surrogate pairs */
	341	if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
	342	&& charcnt > 0) {
	343	u_int16_t ch2;
	344	u_int32_t pair;
	345
	346	ch2 = swapbytes ? OSSwapInt16(ucsp) : ucsp;
	347	if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
	348	pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
	349	+ (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
	350	if ((utf8p + 3) >= bufend) {
	351	result = ENAMETOOLONG;
	352	break;
	353	}
	354	--charcnt;
	355	++ucsp;
	356	*utf8p++ = 0xf0 \| (u_int8_t)(pair >> 18);
	357	*utf8p++ = 0x80 \| (0x3f & (pair >> 12));
	358	*utf8p++ = 0x80 \| (0x3f & (pair >> 6));
	359	*utf8p++ = 0x80 \| (0x3f & pair);
	360	continue;
	361	}
	362	} else if (sfmconv) {
	363	ucs_ch = sfm_to_ucs(ucs_ch);
	364	if (ucs_ch < 0x0080) {
	365	if (utf8p >= bufend) {
	366	result = ENAMETOOLONG;
	367	break;
	368	}
	369	*utf8p++ = (u_int8_t)ucs_ch;
	370	continue;
	371	}
	372	}
	373	if ((utf8p + 2) >= bufend) {
	374	result = ENAMETOOLONG;
	375	break;
	376	}
	377	*utf8p++ = 0xe0 \| (ucs_ch >> 12);
	378	*utf8p++ = 0x80 \| (0x3f & (ucs_ch >> 6));
	379	*utf8p++ = 0x80 \| (0x3f & ucs_ch);
	380	}
	381	}
	382
	383	*utf8len = utf8p - bufstart;
	384	if (nullterm) {
	385	*utf8p++ = '\0';
	386	}
	387
	388	return result;
	389	}
	390
	/*
	 * push - append one code unit to the output, tracking combining runs
	 *
	 * ucs_ch       code unit to append
	 * combcharcnt  in/out: length of the run of combining characters that
	 *              ends at the current output position
	 * ucsp         in/out: output cursor; advanced by one code unit
	 *
	 * A combining character extends the current run. Any other character
	 * ends the run; a run longer than one character is first put into
	 * canonical order with prioritysort().
	 */
	static void
	push(uint16_t ucs_ch, int *combcharcnt, uint16_t **ucsp)
	{
		/*
		 * Make multiple combining character sequences canonical
		 */
		if (unicode_combinable(ucs_ch)) {
			++*combcharcnt;         /* start tracking a run */
		} else if (*combcharcnt) {
			if (*combcharcnt > 1) {
				prioritysort(*ucsp - *combcharcnt, *combcharcnt);
			}
			*combcharcnt = 0;       /* start over */
		}

		*(*ucsp)++ = ucs_ch;
	}
	409
	410	/*
	411	* utf8_decodestr - Decodes a UTF-8 string back to Unicode
	412	*
	413	* NOTES:
	414	* The input UTF-8 string does not need to be null terminated
	415	* if utf8len is set.
	416	*
	417	* If '/' chars are allowed on disk then an alternate
	418	* (replacement) char must be provided in altslash.
	419	*
	420	* input flags:
	421	* UTF_REV_ENDIAN: Unicode byte order is opposite current runtime
	422	*
	423	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	424	*
	425	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	426	*
	427	* UTF_DECOMPOSED: generate fully decomposed output (NFD)
	428	*
	429	* UTF_PRECOMPOSED: generate precomposed output (NFC)
	430	*
	431	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
	432	*
	433	* result:
	434	* ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
	435	*
	436	* EINVAL: Illegal UTF-8 sequence found.
	437	*/
	438	int
	439	utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
	440	size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
	441	{
	442	u_int16_t* bufstart;
	443	u_int16_t* bufend;
	444	unsigned int ucs_ch;
	445	unsigned int byte;
	446	int combcharcnt = 0;
	447	int result = 0;
	448	int decompose, precompose, escaping;
	449	int sfmconv;
	450	int extrabytes;
	451
	452	decompose = (flags & UTF_DECOMPOSED);
	453	precompose = (flags & UTF_PRECOMPOSED);
	454	escaping = (flags & UTF_ESCAPE_ILLEGAL);
	455	sfmconv = (flags & UTF_SFM_CONVERSIONS);
	456
	457	bufstart = ucsp;
	458	bufend = (u_int16_t )((u_int8_t )ucsp + buflen);
	459
	460	while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
	461	if (ucsp >= bufend) {
	462	goto toolong;
	463	}
	464
	465	/* check for ascii */
	466	if (byte < 0x80) {
	467	ucs_ch = sfmconv ? ucs_to_sfm((u_int16_t)byte, utf8len == 0) : byte;
	468	} else {
	469	u_int32_t ch;
	470
	471	extrabytes = utf_extrabytes[byte >> 3];
	472	if ((extrabytes < 0) \|\| ((int)utf8len < extrabytes)) {
	473	goto escape;
	474	}
	475	utf8len -= extrabytes;
	476
	477	switch (extrabytes) {
	478	case 1:
	479	ch = byte; ch <<= 6; /* 1st byte */
	480	byte = utf8p++; / 2nd byte */
	481	if ((byte >> 6) != 2) {
	482	goto escape2;
	483	}
	484	ch += byte;
	485	ch -= 0x00003080UL;
	486	if (ch < 0x0080) {
	487	goto escape2;
	488	}
	489	ucs_ch = ch;
	490	break;
	491	case 2:
	492	ch = byte; ch <<= 6; /* 1st byte */
	493	byte = utf8p++; / 2nd byte */
	494	if ((byte >> 6) != 2) {
	495	goto escape2;
	496	}
	497	ch += byte; ch <<= 6;
	498	byte = utf8p++; / 3rd byte */
	499	if ((byte >> 6) != 2) {
	500	goto escape3;
	501	}
	502	ch += byte;
	503	ch -= 0x000E2080UL;
	504	if (ch < 0x0800) {
	505	goto escape3;
	506	}
	507	if (ch >= 0xD800) {
	508	if (ch <= 0xDFFF) {
	509	goto escape3;
	510	}
	511	if (ch == 0xFFFE \|\| ch == 0xFFFF) {
	512	goto escape3;
	513	}
	514	}
	515	ucs_ch = ch;
	516	break;
	517	case 3:
	518	ch = byte; ch <<= 6; /* 1st byte */
	519	byte = utf8p++; / 2nd byte */
	520	if ((byte >> 6) != 2) {
	521	goto escape2;
	522	}
	523	ch += byte; ch <<= 6;
	524	byte = utf8p++; / 3rd byte */
	525	if ((byte >> 6) != 2) {
	526	goto escape3;
	527	}
	528	ch += byte; ch <<= 6;
	529	byte = utf8p++; / 4th byte */
	530	if ((byte >> 6) != 2) {
	531	goto escape4;
	532	}
	533	ch += byte;
	534	ch -= 0x03C82080UL + SP_HALF_BASE;
	535	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
	536	if (ucs_ch < SP_HIGH_FIRST \|\| ucs_ch > SP_HIGH_LAST) {
	537	goto escape4;
	538	}
	539	push((uint16_t)ucs_ch, &combcharcnt, &ucsp);
	540	if (ucsp >= bufend) {
	541	goto toolong;
	542	}
	543	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
	544	if (ucs_ch < SP_LOW_FIRST \|\| ucs_ch > SP_LOW_LAST) {
	545	--ucsp;
	546	goto escape4;
	547	}
	548	*ucsp++ = (u_int16_t)ucs_ch;
	549	continue;
	550	default:
	551	result = EINVAL;
	552	goto exit;
	553	}
	554	if (decompose) {
	555	if (unicode_decomposeable((u_int16_t)ucs_ch)) {
	556	u_int16_t sequence[8];
	557	int count, i;
	558
	559	count = unicode_decompose((u_int16_t)ucs_ch, sequence);
	560
	561	for (i = 0; i < count; ++i) {
	562	if (ucsp >= bufend) {
	563	goto toolong;
	564	}
	565
	566	push(sequence[i], &combcharcnt, &ucsp);
	567	}
	568
	569	continue;
	570	}
	571	} else if (precompose && (ucsp != bufstart)) {
	572	u_int16_t composite, base;
	573
	574	if (unicode_combinable((u_int16_t)ucs_ch)) {
	575	base = ucsp[-1];
	576	composite = unicode_combine(base, (u_int16_t)ucs_ch);
	577	if (composite) {
	578	--ucsp;
	579	ucs_ch = composite;
	580	}
	581	}
	582	}
	583	if (ucs_ch == UCS_ALT_NULL) {
	584	ucs_ch = '\0';
	585	}
	586	}
	587	if (ucs_ch == altslash) {
	588	ucs_ch = '/';
	589	}
	590
	591	push((u_int16_t)ucs_ch, &combcharcnt, &ucsp);
	592	continue;
	593
	594	/*
	595	* Escape illegal UTF-8 into something legal.
	596	*/
	597	escape4:
	598	utf8p -= 3;
	599	goto escape;
	600	escape3:
	601	utf8p -= 2;
	602	goto escape;
	603	escape2:
	604	utf8p -= 1;
	605	escape:
	606	if (!escaping) {
	607	result = EINVAL;
	608	goto exit;
	609	}
	610	if (extrabytes > 0) {
	611	utf8len += extrabytes;
	612	}
	613	byte = *(utf8p - 1);
	614
	615	if ((ucsp + 2) >= bufend) {
	616	goto toolong;
	617	}
	618
	619	/* Make a previous combining sequence canonical. */
	620	if (combcharcnt > 1) {
	621	prioritysort(ucsp - combcharcnt, combcharcnt);
	622	}
	623	combcharcnt = 0;
	624
	625	ucs_ch = '%';
	626	*ucsp++ = (u_int16_t)ucs_ch;
	627	ucs_ch = hexdigits[byte >> 4];
	628	*ucsp++ = (u_int16_t)ucs_ch;
	629	ucs_ch = hexdigits[byte & 0x0F];
	630	*ucsp++ = (u_int16_t)ucs_ch;
	631	}
	632	/*
	633	* Make a previous combining sequence canonical
	634	*/
	635	if (combcharcnt > 1) {
	636	prioritysort(ucsp - combcharcnt, combcharcnt);
	637	}
	638
	639	if (flags & UTF_REVERSE_ENDIAN) {
	640	uint16_t *p = bufstart;
	641	while (p < ucsp) {
	642	p = OSSwapInt16(p);
	643	++p;
	644	}
	645	}
	646
	647	exit:
	648	ucslen = (u_int8_t)ucsp - (u_int8_t*)bufstart;
	649
	650	return result;
	651
	652	toolong:
	653	result = ENAMETOOLONG;
	654	goto exit;
	655	}
	656
	657
	658	/*
	659	* utf8_validatestr - Check for a valid UTF-8 string.
	660	*/
	661	int
	662	utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
	663	{
	664	unsigned int byte;
	665	u_int32_t ch;
	666	unsigned int ucs_ch;
	667	size_t extrabytes;
	668
	669	while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
	670	if (byte < 0x80) {
	671	continue; /* plain ascii */
	672	}
	673	extrabytes = utf_extrabytes[byte >> 3];
	674
	675	if (utf8len < extrabytes) {
	676	goto invalid;
	677	}
	678	utf8len -= extrabytes;
	679
	680	switch (extrabytes) {
	681	case 1:
	682	ch = byte; ch <<= 6; /* 1st byte */
	683	byte = utf8p++; / 2nd byte */
	684	if ((byte >> 6) != 2) {
	685	goto invalid;
	686	}
	687	ch += byte;
	688	ch -= 0x00003080UL;
	689	if (ch < 0x0080) {
	690	goto invalid;
	691	}
	692	break;
	693	case 2:
	694	ch = byte; ch <<= 6; /* 1st byte */
	695	byte = utf8p++; / 2nd byte */
	696	if ((byte >> 6) != 2) {
	697	goto invalid;
	698	}
	699	ch += byte; ch <<= 6;
	700	byte = utf8p++; / 3rd byte */
	701	if ((byte >> 6) != 2) {
	702	goto invalid;
	703	}
	704	ch += byte;
	705	ch -= 0x000E2080UL;
	706	if (ch < 0x0800) {
	707	goto invalid;
	708	}
	709	if (ch >= 0xD800) {
	710	if (ch <= 0xDFFF) {
	711	goto invalid;
	712	}
	713	if (ch == 0xFFFE \|\| ch == 0xFFFF) {
	714	goto invalid;
	715	}
	716	}
	717	break;
	718	case 3:
	719	ch = byte; ch <<= 6; /* 1st byte */
	720	byte = utf8p++; / 2nd byte */
	721	if ((byte >> 6) != 2) {
	722	goto invalid;
	723	}
	724	ch += byte; ch <<= 6;
	725	byte = utf8p++; / 3rd byte */
	726	if ((byte >> 6) != 2) {
	727	goto invalid;
	728	}
	729	ch += byte; ch <<= 6;
	730	byte = utf8p++; / 4th byte */
	731	if ((byte >> 6) != 2) {
	732	goto invalid;
	733	}
	734	ch += byte;
	735	ch -= 0x03C82080UL + SP_HALF_BASE;
	736	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
	737	if (ucs_ch < SP_HIGH_FIRST \|\| ucs_ch > SP_HIGH_LAST) {
	738	goto invalid;
	739	}
	740	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
	741	if (ucs_ch < SP_LOW_FIRST \|\| ucs_ch > SP_LOW_LAST) {
	742	goto invalid;
	743	}
	744	break;
	745	default:
	746	goto invalid;
	747	}
	748	}
	749	return 0;
	750	invalid:
	751	return EINVAL;
	752	}
	753
	754	/*
	755	* utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
	756	*
	757	* This function takes an UTF-8 input string, instr, of inlen bytes
	758	* and produces normalized UTF-8 output into a buffer of buflen bytes
	759	* pointed to by outstr. The size of the output in bytes (not including
	760	* a NULL termination byte) is returned in outlen. In-place conversions
	761	* are not supported (i.e. instr != outstr).]
	762	*
	763	* FLAGS
	764	* UTF_DECOMPOSED: output string will be fully decomposed (NFD)
	765	*
	766	* UTF_PRECOMPOSED: output string will be precomposed (NFC)
	767	*
	768	* UTF_NO_NULL_TERM: do not add null termination to output string
	769	*
	770	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
	771	*
	772	* ERRORS
	773	* ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
	774	*
	775	* EINVAL: illegal UTF-8 sequence encountered or invalid flags
	776	*/
	777	int
	778	utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
	779	size_t *outlen, size_t buflen, int flags)
	780	{
	781	u_int16_t unicodebuf[32];
	782	u_int16_t* unistr = NULL;
	783	size_t unicode_bytes;
	784	size_t uft8_bytes;
	785	size_t inbuflen;
	786	u_int8_t outbufstart, outbufend;
	787	const u_int8_t *inbufstart;
	788	unsigned int byte;
	789	int decompose, precompose;
	790	int result = 0;
	791
	792	if (flags & ~(UTF_DECOMPOSED \| UTF_PRECOMPOSED \| UTF_NO_NULL_TERM \| UTF_ESCAPE_ILLEGAL)) {
	793	return EINVAL;
	794	}
	795	decompose = (flags & UTF_DECOMPOSED);
	796	precompose = (flags & UTF_PRECOMPOSED);
	797	if ((decompose && precompose) \|\| (!decompose && !precompose)) {
	798	return EINVAL;
	799	}
	800	outbufstart = outstr;
	801	outbufend = outbufstart + buflen;
	802	inbufstart = instr;
	803	inbuflen = inlen;
	804
	805	while (inlen-- > 0 && (byte = *instr++) != '\0') {
	806	if (outstr >= outbufend) {
	807	result = ENAMETOOLONG;
	808	goto exit;
	809	}
	810	if (byte >= 0x80) {
	811	goto nonASCII;
	812	}
	813	/* ASCII is already normalized. */
	814	*outstr++ = (u_int8_t)byte;
	815	}
	816	exit:
	817	*outlen = outstr - outbufstart;
	818	if (((flags & UTF_NO_NULL_TERM) == 0)) {
	819	if (outstr < outbufend) {
	820	*outstr++ = '\0';
	821	} else {
	822	result = ENAMETOOLONG;
	823	}
	824	}
	825	return result;
	826
	827
	828	/*
	829	* Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
	830	* functions to perform the normalization. Since this will
	831	* presumably be used to normalize filenames in the back-end
	832	* (on disk or over-the-wire), it should be fast enough.
	833	*/
	834	nonASCII:
	835
	836	/* Make sure the input size is reasonable. */
	837	if (inbuflen > MAXPATHLEN) {
	838	result = ENAMETOOLONG;
	839	goto exit;
	840	}
	841	/*
	842	* Compute worst case Unicode buffer size.
	843	*
	844	* For pre-composed output, every UTF-8 input byte will be at
	845	* most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
	846	* (smallest composite char sequence) may yield 6 Unicode bytes
	847	* (1 base char + 2 combining chars).
	848	*/
	849	unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
	850
	851	if (unicode_bytes <= sizeof(unicodebuf)) {
	852	unistr = &unicodebuf[0];
	853	} else {
	854	unistr = kheap_alloc(KHEAP_DATA_BUFFERS, unicode_bytes, Z_WAITOK);
	855	}
	856
	857	/* Normalize the string. */
	858	result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
	859	unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
	860	if (result == 0) {
	861	/* Put results back into UTF-8. */
	862	result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
	863	&uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
	864	outstr = outbufstart + uft8_bytes;
	865	}
	866	if (unistr && unistr != &unicodebuf[0]) {
	867	kheap_free(KHEAP_DATA_BUFFERS, unistr, unicode_bytes);
	868	}
	869	goto exit;
	870	}
	871
	872
	873	/*
	874	* Unicode 3.2 decomposition code (derived from Core Foundation)
	875	*/
	876
	/* One entry of a key-sorted lookup table with 32-bit keys and values. */
	typedef struct {
		u_int32_t _key;
		u_int32_t _value;
	} unicode_mappings32;

	/*
	 * getmappedvalue32 - binary search a table of 32-bit mappings
	 *
	 * theTable   entries sorted by ascending _key
	 * numElem    number of entries in theTable
	 * character  key to look for
	 *
	 * Returns the _value stored for character, or 0 when the table is
	 * empty or holds no entry for it.
	 */
	static inline u_int32_t
	getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
	    u_int16_t character)
	{
		u_int32_t lo, hi, mid;

		/* An empty table has no first or last entry to compare with. */
		if (numElem == 0) {
			return 0;
		}

		/* Quick reject for keys outside the range covered by the table. */
		if ((character < theTable[0]._key) || (character > theTable[numElem - 1]._key)) {
			return 0;
		}

		/* Search the half-open index range [lo, hi). */
		lo = 0;
		hi = numElem;
		while (lo < hi) {
			mid = lo + ((hi - lo) >> 1);    /* divide by 2 */
			if (character < theTable[mid]._key) {
				hi = mid;
			} else if (character > theTable[mid]._key) {
				lo = mid + 1;
			} else {
				return theTable[mid]._value;
			}
		}
		return 0;
	}
	906
	/* Fields packed into a decomposition table value. */
	#define RECURSIVE_DECOMPOSITION (1 << 15)
	#define EXTRACT_COUNT(value)    (((value) >> 12) & 0x0007)

	/* One entry of a key-sorted lookup table with 16-bit keys and values. */
	typedef struct {
		u_int16_t _key;
		u_int16_t _value;
	} unicode_mappings16;

	/*
	 * getmappedvalue16 - binary search a table of 16-bit mappings
	 *
	 * theTable   entries sorted by ascending _key
	 * numElem    number of entries in theTable
	 * character  key to look for
	 *
	 * Returns the _value stored for character, or 0 when the table is
	 * empty or holds no entry for it.
	 */
	static inline u_int16_t
	getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
	    u_int16_t character)
	{
		u_int32_t lo, hi, mid;

		/* An empty table has no first or last entry to compare with. */
		if (numElem == 0) {
			return 0;
		}

		/* Quick reject for keys outside the range covered by the table. */
		if ((character < theTable[0]._key) || (character > theTable[numElem - 1]._key)) {
			return 0;
		}

		/* Search the half-open index range [lo, hi). */
		lo = 0;
		hi = numElem;
		while (lo < hi) {
			mid = lo + ((hi - lo) >> 1);    /* divide by 2 */
			if (character < theTable[mid]._key) {
				hi = mid;
			} else if (character > theTable[mid]._key) {
				lo = mid + 1;
			} else {
				return theTable[mid]._value;
			}
		}
		return 0;
	}
	939
	940
	941	static u_int32_t
	942	unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
	943	{
	944	u_int16_t value;
	945	u_int32_t length;
	946	u_int16_t firstChar;
	947	u_int16_t theChar;
	948	const u_int16_t *bmpMappings;
	949	u_int32_t usedLength;
	950
	951	value = getmappedvalue16(
	952	(const unicode_mappings16 *)__CFUniCharDecompositionTable,
	953	__UniCharDecompositionTableLength, character);
	954	length = EXTRACT_COUNT(value);
	955	firstChar = value & 0x0FFF;
	956	theChar = firstChar;
	957	bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
	958	usedLength = 0;
	959
	960	if (value & RECURSIVE_DECOMPOSITION) {
	961	usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
	962
	963	--length; /* Decrement for the first char */
	964	if (!usedLength) {
	965	return 0;
	966	}
	967	++bmpMappings;
	968	convertedChars += usedLength;
	969	}
	970
	971	usedLength += length;
	972
	973	while (length--) {
	974	(convertedChars++) = (bmpMappings++);
	975	}
	976
	977	return usedLength;
	978	}
	979
	/* Hangul syllable composition/decomposition constants */
	#define HANGUL_SBASE 0xAC00
	#define HANGUL_LBASE 0x1100
	#define HANGUL_VBASE 0x1161
	#define HANGUL_TBASE 0x11A7

	#define HANGUL_SCOUNT 11172
	#define HANGUL_LCOUNT 19
	#define HANGUL_VCOUNT 21
	#define HANGUL_TCOUNT 28
	#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)

	/*
	 * unicode_decompose - decompose a composed Unicode char
	 *
	 * Composed Unicode characters are forbidden on
	 * HFS Plus volumes. unicode_decompose will convert a
	 * composed character into its correct decomposed
	 * sequence.
	 *
	 * Hangul syllables are decomposed arithmetically into two or three
	 * Jamo; every other character goes through the decomposition tables.
	 *
	 * The sequence is written to convertedChars and its length in code
	 * units is returned.
	 *
	 * Similar to CFUniCharDecomposeCharacter
	 */
	static int
	unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
	{
		/*
		 * The syllable block is HANGUL_SCOUNT characters long, so its
		 * upper bound is exclusive (as it is in unicode_combine).
		 */
		if ((character >= HANGUL_SBASE) &&
		    (character < (HANGUL_SBASE + HANGUL_SCOUNT))) {
			u_int32_t length;

			character -= HANGUL_SBASE;
			/* A non-zero trailing consonant index adds a third Jamo. */
			length = (character % HANGUL_TCOUNT ? 3 : 2);

			*(convertedChars++) =
			    character / HANGUL_NCOUNT + HANGUL_LBASE;
			*(convertedChars++) =
			    (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
			if (length > 2) {
				*convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
			}
			return length;
		} else {
			return unicode_recursive_decompose(character, convertedChars);
		}
	}
	1023
	1024	/*
	1025	* unicode_combine - generate a precomposed Unicode char
	1026	*
	1027	* Precomposed Unicode characters are required for some volume
	1028	* formats and network protocols. unicode_combine will combine
	1029	* a decomposed character sequence into a single precomposed
	1030	* (composite) character.
	1031	*
	1032	* Similar toCFUniCharPrecomposeCharacter but unicode_combine
	1033	* also handles Hangul Jamo characters.
	1034	*/
	1035	static u_int16_t
	1036	unicode_combine(u_int16_t base, u_int16_t combining)
	1037	{
	1038	u_int32_t value;
	1039
	1040	/* Check HANGUL */
	1041	if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
	1042	/* 2 char Hangul sequences */
	1043	if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
	1044	(base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
	1045	return HANGUL_SBASE +
	1046	((base - HANGUL_LBASE) * (HANGUL_VCOUNT * HANGUL_TCOUNT)) +
	1047	((combining - HANGUL_VBASE) * HANGUL_TCOUNT);
	1048	}
	1049
	1050	/* 3 char Hangul sequences */
	1051	if ((combining > HANGUL_TBASE) &&
	1052	(base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
	1053	if ((base - HANGUL_SBASE) % HANGUL_TCOUNT) {
	1054	return 0;
	1055	} else {
	1056	return base + (combining - HANGUL_TBASE);
	1057	}
	1058	}
	1059	}
	1060
	1061	value = getmappedvalue32(
	1062	(const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
	1063	__CFUniCharPrecompositionTableLength, combining);
	1064
	1065	if (value) {
	1066	value = getmappedvalue16(
	1067	(const unicode_mappings16 *)
	1068	((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
	1069	(value >> 16), base);
	1070	}
	1071	return (u_int16_t)value;
	1072	}
	1073
	1074
	/*
	 * prioritysort - order combining chars into canonical order
	 *
	 * Bubble sorts the count code units at characters by ascending
	 * combining class. A character of class 0 is never moved ahead of the
	 * character before it. Fewer than two characters need no sorting.
	 *
	 * Similar to CFUniCharPrioritySort
	 */
	static void
	prioritysort(u_int16_t* characters, int count)
	{
		u_int32_t p1, p2;
		u_int16_t *ch1, *ch2;
		u_int16_t *end;
		int changes = 0;

		/* Nothing to order, and nothing that may safely be read. */
		if (count < 2) {
			return;
		}

		end = characters + count;
		do {
			changes = 0;
			ch1 = characters;
			ch2 = characters + 1;
			p2 = get_combining_class(*ch1);
			while (ch2 < end) {
				p1 = p2;
				p2 = get_combining_class(*ch2);
				if (p1 > p2 && p2 != 0) {
					u_int16_t tmp;

					tmp = *ch1;
					*ch1 = *ch2;
					*ch2 = tmp;
					changes = 1;

					/*
					 * Make sure that p2 contains the combining class for the
					 * character now stored at *ch2. This isn't required for
					 * correctness, but it will be more efficient if a character
					 * with a large combining class has to "bubble past" several
					 * characters with lower combining classes.
					 */
					p2 = p1;
				}
				++ch1;
				++ch2;
			}
		} while (changes);
	}
	1119
	1120
	1121	/*
	1122	* Invalid NTFS filename characters are encoded using the
	1123	* SFM (Services for Macintosh) private use Unicode characters.
	1124	*
	1125	* These should only be used for SMB, MSDOS or NTFS.
	1126	*
	1127	* Illegal NTFS Char SFM Unicode Char
	1128	* ----------------------------------------
	1129	* 0x01-0x1f 0xf001-0xf01f
	1130	* '"' 0xf020
	1131	* '*' 0xf021
	1132	* '/' 0xf022
	1133	* '<' 0xf023
	1134	* '>' 0xf024
	1135	* '?' 0xf025
	1136	* '\' 0xf026
	1137	* '\|' 0xf027
	1138	* ' ' 0xf028 (Only if last char of the name)
	1139	* '.' 0xf029 (Only if last char of the name)
	1140	* ----------------------------------------
	1141	*
	1142	* Reference: http://support.microsoft.com/kb/q117258/
	1143	*/
	1144
	/* Highest low-byte value that has an entry in sfm2mac[]. */
	#define MAX_SFM2MAC           0x29
	/* High bits shared by all SFM private-use code units (0xf0xx). */
	#define SFMCODE_PREFIX_MASK   0xf000

	/*
	 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
	 * SFM had no conversion for the colon. There is a conversion for the
	 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
	 * is a slash and a slash is a colon. So we can just replace the slash with the
	 * colon in our tables and everything will just work.
	 */

	/*
	 * Low byte of an SFM private-use code unit -> ASCII character.
	 * Read-only, hence const.
	 */
	static const u_int8_t
	    sfm2mac[] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 00 - 07 */
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 08 - 0F */
		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 10 - 17 */
		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 18 - 1F */
		0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c, /* 20 - 27 */
		0x20, 0x2e                                      /* 28 - 29 */
	};
	#define SFM2MAC_LEN     ((sizeof(sfm2mac))/sizeof(sfm2mac[0]))

	/*
	 * ASCII character (0x20 - 0x7f, stored at index ch - 0x20) -> low byte
	 * of its SFM private-use code unit. An entry equal to the character
	 * itself means the character is not converted. Read-only, hence const.
	 */
	static const u_int8_t
	    mac2sfm[] = {
		0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27, /* 20 - 27 */
		0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22, /* 28 - 2f */
		0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 30 - 37 */
		0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25, /* 38 - 3f */
		0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 40 - 47 */
		0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 48 - 4f */
		0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 50 - 57 */
		0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f, /* 58 - 5f */
		0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 60 - 67 */
		0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 68 - 6f */
		0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 70 - 77 */
		0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f  /* 78 - 7f */
	};
	#define MAC2SFM_LEN     ((sizeof(mac2sfm))/sizeof(mac2sfm[0]))
	1182
	1183
	1184	/*
	1185	* Encode illegal NTFS filename characters into SFM Private Unicode characters
	1186	*
	1187	* Assumes non-zero ASCII input.
	1188	*/
	1189	static u_int16_t
	1190	ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
	1191	{
	1192	/* The last character of filename cannot be a space or period. */
	1193	if (lastchar) {
	1194	if (ucs_ch == 0x20) {
	1195	return 0xf028;
	1196	} else if (ucs_ch == 0x2e) {
	1197	return 0xf029;
	1198	}
	1199	}
	1200	/* 0x01 - 0x1f is simple transformation. */
	1201	if (ucs_ch <= 0x1f) {
	1202	return ucs_ch \| 0xf000;
	1203	} else { /* 0x20 - 0x7f */
	1204	u_int16_t lsb;
	1205
	1206	assert((ucs_ch - 0x0020) < MAC2SFM_LEN);
	1207	lsb = mac2sfm[ucs_ch - 0x0020];
	1208	if (lsb != ucs_ch) {
	1209	return 0xf000 \| lsb;
	1210	}
	1211	}
	1212	return ucs_ch;
	1213	}
	1214
	1215	/*
	1216	* Decode any SFM Private Unicode characters
	1217	*/
	1218	static u_int16_t
	1219	sfm_to_ucs(u_int16_t ucs_ch)
	1220	{
	1221	if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
	1222	((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
	1223	assert((ucs_ch & 0x003f) < SFM2MAC_LEN);
	1224	ucs_ch = sfm2mac[ucs_ch & 0x003f];
	1225	}
	1226	return ucs_ch;
	1227	}