git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	/*
	30	Includes Unicode 3.2 decomposition code derived from Core Foundation
	31	*/
	32
	33	#include <sys/param.h>
	34	#include <sys/utfconv.h>
	35	#include <sys/errno.h>
	36	#include <sys/malloc.h>
	37	#include <libkern/OSByteOrder.h>
	38
	39	#if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST)
	40	#include <kern/assert.h>
	41	#else
	42	#include <assert.h>
	43	#endif
	44
	45	/*
	46	* UTF-8 (Unicode Transformation Format)
	47	*
	48	* UTF-8 is the Unicode Transformation Format that serializes a Unicode
	49	* character as a sequence of one to four bytes. Only the shortest form
	50	* required to represent the significant Unicode bits is legal.
	51	*
	52	* UTF-8 Multibyte Codes
	53	*
	54	* Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
	55	* -----------------------------------------------------------------------------
	56	* 1 7 0x0000 0x007F 0xxxxxxx
	57	* 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
	58	* 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
	59	* 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	60	* -----------------------------------------------------------------------------
	61	*/
	62
	63
	/*
	 * Number of UTF-8 bytes produced for a single UTF-16 code unit.
	 * A surrogate half (0xD800-0xDFFF) counts as 2 so that a complete
	 * pair adds up to the 4 bytes of its UTF-8 form.
	 */
	#define UNICODE_TO_UTF8_LEN(c) \
		((c) < 0x0080 ? 1 : \
		 (((c) < 0x0800 || ((c) & 0xf800) == 0xd800) ? 2 : 3))

	/* Code point substituted for an embedded NUL when encoding. */
	#define UCS_ALT_NULL	0x2400

	/* Surrogate pair arithmetic */
	#define SP_HALF_SHIFT	10
	#define SP_HALF_BASE	0x0010000u
	#define SP_HALF_MASK	0x3FFu

	/* Surrogate code unit ranges */
	#define SP_HIGH_FIRST	0xD800u
	#define SP_HIGH_LAST	0xDBFFu
	#define SP_LOW_FIRST	0xDC00u
	#define SP_LOW_LAST	0xDFFFu
	78
	79
	80	#include "vfs_utfconvdata.h"
	81
	82
	83	/*
	84	* Test for a combining character.
	85	*
	86	* Similar to __CFUniCharIsNonBaseCharacter except that
	87	* unicode_combinable also includes Hangul Jamo characters.
	88	*/
	89	int
	90	unicode_combinable(u_int16_t character)
	91	{
	92	const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
	93	u_int8_t value;
	94
	95	if (character < 0x0300)
	96	return (0);
	97
	98	value = bitmap[(character >> 8) & 0xFF];
	99
	100	if (value == 0xFF) {
	101	return (1);
	102	} else if (value) {
	103	bitmap = bitmap + ((value - 1) * 32) + 256;
	104	return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
	105	}
	106	return (0);
	107	}
	108
	109	/*
	110	* Test for a precomposed character.
	111	*
	112	* Similar to __CFUniCharIsDecomposableCharacter.
	113	*/
	114	int
	115	unicode_decomposeable(u_int16_t character) {
	116	const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
	117	u_int8_t value;
	118
	119	if (character < 0x00C0)
	120	return (0);
	121
	122	value = bitmap[(character >> 8) & 0xFF];
	123
	124	if (value == 0xFF) {
	125	return (1);
	126	} else if (value) {
	127	bitmap = bitmap + ((value - 1) * 32) + 256;
	128	return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
	129	}
	130	return (0);
	131	}
	132
	133
	134	/*
	135	* Get the combing class.
	136	*
	137	* Similar to CFUniCharGetCombiningPropertyForCharacter.
	138	*/
	139	static inline u_int8_t
	140	get_combining_class(u_int16_t character) {
	141	const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
	142
	143	u_int8_t value = bitmap[(character >> 8)];
	144
	145	if (value) {
	146	bitmap = bitmap + (value * 256);
	147	return bitmap[character % 256];
	148	}
	149	return (0);
	150	}
	151
	152
	153	static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
	154
	155	static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
	156
	157	static void prioritysort(u_int16_t* characters, int count);
	158
	159	static u_int16_t ucs_to_sfm(u_int16_t ucs_ch, int lastchar);
	160
	161	static u_int16_t sfm_to_ucs(u_int16_t ucs_ch);
	162
	163
	/*
	 * Number of continuation bytes that follow a UTF-8 lead byte, indexed
	 * by the top five bits of that byte (byte >> 3).  -1 marks bytes that
	 * cannot start a sequence (0x80-0xBF and 0xF8-0xFF).
	 *
	 * The element type is explicitly signed: with plain 'char' the -1
	 * entries read back as 255 on targets where char is unsigned, which
	 * defeats the "extrabytes < 0" checks made by the decoders.
	 */
	signed char utf_extrabytes[32] = {
		 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
		-1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
	};
	168
	/* Upper-case hexadecimal digits, indexed by nibble value. */
	const char hexdigits[16] = {
		'0', '1', '2', '3',
		'4', '5', '6', '7',
		'8', '9', 'A', 'B',
		'C', 'D', 'E', 'F'
	};
	173
	174	/*
	175	* utf8_encodelen - Calculate the UTF-8 encoding length
	176	*
	177	* This function takes a Unicode input string, ucsp, of ucslen bytes
	178	* and calculates the size of the UTF-8 output in bytes (not including
	179	* a NULL termination byte). The string must reside in kernel memory.
	180	*
	181	* If '/' chars are possible in the Unicode input then an alternate
	182	* (replacement) char should be provided in altslash.
	183	*
	184	* FLAGS
	185	* UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
	186	*
	187	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	188	*
	189	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	190	*
	191	* UTF_DECOMPOSED: generate fully decomposed output
	192	*
	193	* UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
	194	*
	195	* ERRORS
	196	* None
	197	*/
	198	size_t
	199	utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
	200	{
	201	u_int16_t ucs_ch;
	202	u_int16_t * chp = NULL;
	203	u_int16_t sequence[8];
	204	int extra = 0;
	205	size_t charcnt;
	206	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
	207	int decompose = (flags & UTF_DECOMPOSED);
	208	size_t len;
	209
	210	charcnt = ucslen / 2;
	211	len = 0;
	212
	213	while (charcnt-- > 0) {
	214	if (extra > 0) {
	215	--extra;
	216	ucs_ch = *chp++;
	217	} else {
	218	ucs_ch = *ucsp++;
	219	if (swapbytes) {
	220	ucs_ch = OSSwapInt16(ucs_ch);
	221	}
	222	if (ucs_ch == '/') {
	223	ucs_ch = altslash ? altslash : '_';
	224	} else if (ucs_ch == '\0') {
	225	ucs_ch = UCS_ALT_NULL;
	226	} else if (decompose && unicode_decomposeable(ucs_ch)) {
	227	extra = unicode_decompose(ucs_ch, sequence) - 1;
	228	charcnt += extra;
	229	ucs_ch = sequence[0];
	230	chp = &sequence[1];
	231	}
	232	}
	233	len += UNICODE_TO_UTF8_LEN(ucs_ch);
	234	}
	235
	236	return (len);
	237	}
	238
	239
	240	/*
	241	* utf8_encodestr - Encodes a Unicode string to UTF-8
	242	*
	243	* NOTES:
	244	* The resulting UTF-8 string is NULL terminated.
	245	*
	246	* If '/' chars are allowed on disk then an alternate
	247	* (replacement) char must be provided in altslash.
	248	*
	249	* input flags:
	250	* UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
	251	*
	252	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	253	*
	254	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	255	*
	256	* UTF_DECOMPOSED: generate fully decomposed output
	257	*
	258	* UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
	259	*
	260	* result:
	261	* ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
	262	*
	263	* EINVAL: Illegal char found; char was replaced by an '_'.
	264	*/
	265	int
	266	utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
	267	size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
	268	{
	269	u_int8_t * bufstart;
	270	u_int8_t * bufend;
	271	u_int16_t ucs_ch;
	272	u_int16_t * chp = NULL;
	273	u_int16_t sequence[8];
	274	int extra = 0;
	275	size_t charcnt;
	276	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
	277	int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
	278	int decompose = (flags & UTF_DECOMPOSED);
	279	int sfmconv = (flags & UTF_SFM_CONVERSIONS);
	280	int result = 0;
	281
	282	bufstart = utf8p;
	283	bufend = bufstart + buflen;
	284	if (nullterm)
	285	--bufend;
	286	charcnt = ucslen / 2;
	287
	288	while (charcnt-- > 0) {
	289	if (extra > 0) {
	290	--extra;
	291	ucs_ch = *chp++;
	292	} else {
	293	ucs_ch = swapbytes ? OSSwapInt16(ucsp++) : ucsp++;
	294
	295	if (decompose && unicode_decomposeable(ucs_ch)) {
	296	extra = unicode_decompose(ucs_ch, sequence) - 1;
	297	charcnt += extra;
	298	ucs_ch = sequence[0];
	299	chp = &sequence[1];
	300	}
	301	}
	302
	303	/* Slash and NULL are not permitted */
	304	if (ucs_ch == '/') {
	305	if (altslash)
	306	ucs_ch = altslash;
	307	else {
	308	ucs_ch = '_';
	309	result = EINVAL;
	310	}
	311	} else if (ucs_ch == '\0') {
	312	ucs_ch = UCS_ALT_NULL;
	313	}
	314
	315	if (ucs_ch < 0x0080) {
	316	if (utf8p >= bufend) {
	317	result = ENAMETOOLONG;
	318	break;
	319	}
	320	*utf8p++ = ucs_ch;
	321
	322	} else if (ucs_ch < 0x800) {
	323	if ((utf8p + 1) >= bufend) {
	324	result = ENAMETOOLONG;
	325	break;
	326	}
	327	*utf8p++ = 0xc0 \| (ucs_ch >> 6);
	328	*utf8p++ = 0x80 \| (0x3f & ucs_ch);
	329
	330	} else {
	331	/* These chars never valid Unicode. */
	332	if (ucs_ch == 0xFFFE \|\| ucs_ch == 0xFFFF) {
	333	result = EINVAL;
	334	break;
	335	}
	336
	337	/* Combine valid surrogate pairs */
	338	if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
	339	&& charcnt > 0) {
	340	u_int16_t ch2;
	341	u_int32_t pair;
	342
	343	ch2 = swapbytes ? OSSwapInt16(ucsp) : ucsp;
	344	if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
	345	pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
	346	+ (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
	347	if ((utf8p + 3) >= bufend) {
	348	result = ENAMETOOLONG;
	349	break;
	350	}
	351	--charcnt;
	352	++ucsp;
	353	*utf8p++ = 0xf0 \| (pair >> 18);
	354	*utf8p++ = 0x80 \| (0x3f & (pair >> 12));
	355	*utf8p++ = 0x80 \| (0x3f & (pair >> 6));
	356	*utf8p++ = 0x80 \| (0x3f & pair);
	357	continue;
	358	}
	359	} else if (sfmconv) {
	360	ucs_ch = sfm_to_ucs(ucs_ch);
	361	if (ucs_ch < 0x0080) {
	362	if (utf8p >= bufend) {
	363	result = ENAMETOOLONG;
	364	break;
	365	}
	366	*utf8p++ = ucs_ch;
	367	continue;
	368	}
	369	}
	370	if ((utf8p + 2) >= bufend) {
	371	result = ENAMETOOLONG;
	372	break;
	373	}
	374	*utf8p++ = 0xe0 \| (ucs_ch >> 12);
	375	*utf8p++ = 0x80 \| (0x3f & (ucs_ch >> 6));
	376	*utf8p++ = 0x80 \| (0x3f & ucs_ch);
	377	}
	378	}
	379
	380	*utf8len = utf8p - bufstart;
	381	if (nullterm)
	382	*utf8p++ = '\0';
	383
	384	return (result);
	385	}
	386
	/*
	 * push - append one character to the output, keeping runs of
	 * combining characters in canonical order.
	 *
	 *    ucs_ch:      character to store
	 *    combcharcnt: in/out length of the run of combining characters
	 *                 that ends just before *ucsp
	 *    ucsp:        in/out output cursor; advanced by one on return
	 *
	 * A combining character extends the current run.  Any other character
	 * ends the run, which is sorted first if it holds more than one char.
	 * The caller must make sure there is room for one character at *ucsp.
	 */
	static void push(uint16_t ucs_ch, int *combcharcnt, uint16_t **ucsp)
	{
		/*
		 * Make multiple combining character sequences canonical
		 */
		if (unicode_combinable(ucs_ch)) {
			++*combcharcnt;		/* start tracking a run */
		} else if (*combcharcnt) {
			if (*combcharcnt > 1) {
				prioritysort(*ucsp - *combcharcnt, *combcharcnt);
			}
			*combcharcnt = 0;	/* start over */
		}

		*(*ucsp)++ = ucs_ch;
	}
	404
	405	/*
	406	* utf8_decodestr - Decodes a UTF-8 string back to Unicode
	407	*
	408	* NOTES:
	409	* The input UTF-8 string does not need to be null terminated
	410	* if utf8len is set.
	411	*
	412	* If '/' chars are allowed on disk then an alternate
	413	* (replacement) char must be provided in altslash.
	414	*
	415	* input flags:
	416	* UTF_REV_ENDIAN: Unicode byte order is opposite current runtime
	417	*
	418	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	419	*
	420	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	421	*
	422	* UTF_DECOMPOSED: generate fully decomposed output (NFD)
	423	*
	424	* UTF_PRECOMPOSED: generate precomposed output (NFC)
	425	*
	426	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
	427	*
	428	* result:
	429	* ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
	430	*
	431	* EINVAL: Illegal UTF-8 sequence found.
	432	*/
	433	int
	434	utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
	435	size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
	436	{
	437	u_int16_t* bufstart;
	438	u_int16_t* bufend;
	439	unsigned int ucs_ch;
	440	unsigned int byte;
	441	int combcharcnt = 0;
	442	int result = 0;
	443	int decompose, precompose, escaping;
	444	int sfmconv;
	445	int extrabytes;
	446
	447	decompose = (flags & UTF_DECOMPOSED);
	448	precompose = (flags & UTF_PRECOMPOSED);
	449	escaping = (flags & UTF_ESCAPE_ILLEGAL);
	450	sfmconv = (flags & UTF_SFM_CONVERSIONS);
	451
	452	bufstart = ucsp;
	453	bufend = (u_int16_t )((u_int8_t )ucsp + buflen);
	454
	455	while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
	456	if (ucsp >= bufend)
	457	goto toolong;
	458
	459	/* check for ascii */
	460	if (byte < 0x80) {
	461	ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
	462	} else {
	463	u_int32_t ch;
	464
	465	extrabytes = utf_extrabytes[byte >> 3];
	466	if ((extrabytes < 0) \|\| ((int)utf8len < extrabytes)) {
	467	goto escape;
	468	}
	469	utf8len -= extrabytes;
	470
	471	switch (extrabytes) {
	472	case 1:
	473	ch = byte; ch <<= 6; /* 1st byte */
	474	byte = utf8p++; / 2nd byte */
	475	if ((byte >> 6) != 2)
	476	goto escape2;
	477	ch += byte;
	478	ch -= 0x00003080UL;
	479	if (ch < 0x0080)
	480	goto escape2;
	481	ucs_ch = ch;
	482	break;
	483	case 2:
	484	ch = byte; ch <<= 6; /* 1st byte */
	485	byte = utf8p++; / 2nd byte */
	486	if ((byte >> 6) != 2)
	487	goto escape2;
	488	ch += byte; ch <<= 6;
	489	byte = utf8p++; / 3rd byte */
	490	if ((byte >> 6) != 2)
	491	goto escape3;
	492	ch += byte;
	493	ch -= 0x000E2080UL;
	494	if (ch < 0x0800)
	495	goto escape3;
	496	if (ch >= 0xD800) {
	497	if (ch <= 0xDFFF)
	498	goto escape3;
	499	if (ch == 0xFFFE \|\| ch == 0xFFFF)
	500	goto escape3;
	501	}
	502	ucs_ch = ch;
	503	break;
	504	case 3:
	505	ch = byte; ch <<= 6; /* 1st byte */
	506	byte = utf8p++; / 2nd byte */
	507	if ((byte >> 6) != 2)
	508	goto escape2;
	509	ch += byte; ch <<= 6;
	510	byte = utf8p++; / 3rd byte */
	511	if ((byte >> 6) != 2)
	512	goto escape3;
	513	ch += byte; ch <<= 6;
	514	byte = utf8p++; / 4th byte */
	515	if ((byte >> 6) != 2)
	516	goto escape4;
	517	ch += byte;
	518	ch -= 0x03C82080UL + SP_HALF_BASE;
	519	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
	520	if (ucs_ch < SP_HIGH_FIRST \|\| ucs_ch > SP_HIGH_LAST)
	521	goto escape4;
	522	push(ucs_ch, &combcharcnt, &ucsp);
	523	if (ucsp >= bufend)
	524	goto toolong;
	525	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
	526	if (ucs_ch < SP_LOW_FIRST \|\| ucs_ch > SP_LOW_LAST) {
	527	--ucsp;
	528	goto escape4;
	529	}
	530	*ucsp++ = ucs_ch;
	531	continue;
	532	default:
	533	result = EINVAL;
	534	goto exit;
	535	}
	536	if (decompose) {
	537	if (unicode_decomposeable(ucs_ch)) {
	538	u_int16_t sequence[8];
	539	int count, i;
	540
	541	count = unicode_decompose(ucs_ch, sequence);
	542
	543	for (i = 0; i < count; ++i) {
	544	if (ucsp >= bufend)
	545	goto toolong;
	546
	547	push(sequence[i], &combcharcnt, &ucsp);
	548	}
	549
	550	continue;
	551	}
	552	} else if (precompose && (ucsp != bufstart)) {
	553	u_int16_t composite, base;
	554
	555	if (unicode_combinable(ucs_ch)) {
	556	base = ucsp[-1];
	557	composite = unicode_combine(base, ucs_ch);
	558	if (composite) {
	559	--ucsp;
	560	ucs_ch = composite;
	561	}
	562	}
	563	}
	564	if (ucs_ch == UCS_ALT_NULL)
	565	ucs_ch = '\0';
	566	}
	567	if (ucs_ch == altslash)
	568	ucs_ch = '/';
	569
	570	push(ucs_ch, &combcharcnt, &ucsp);
	571	continue;
	572
	573	/*
	574	* Escape illegal UTF-8 into something legal.
	575	*/
	576	escape4:
	577	utf8p -= 3;
	578	goto escape;
	579	escape3:
	580	utf8p -= 2;
	581	goto escape;
	582	escape2:
	583	utf8p -= 1;
	584	escape:
	585	if (!escaping) {
	586	result = EINVAL;
	587	goto exit;
	588	}
	589	if (extrabytes > 0)
	590	utf8len += extrabytes;
	591	byte = *(utf8p - 1);
	592
	593	if ((ucsp + 2) >= bufend)
	594	goto toolong;
	595
	596	/* Make a previous combining sequence canonical. */
	597	if (combcharcnt > 1) {
	598	prioritysort(ucsp - combcharcnt, combcharcnt);
	599	}
	600	combcharcnt = 0;
	601
	602	ucs_ch = '%';
	603	*ucsp++ = ucs_ch;
	604	ucs_ch = hexdigits[byte >> 4];
	605	*ucsp++ = ucs_ch;
	606	ucs_ch = hexdigits[byte & 0x0F];
	607	*ucsp++ = ucs_ch;
	608	}
	609	/*
	610	* Make a previous combining sequence canonical
	611	*/
	612	if (combcharcnt > 1) {
	613	prioritysort(ucsp - combcharcnt, combcharcnt);
	614	}
	615
	616	if (flags & UTF_REVERSE_ENDIAN) {
	617	uint16_t *p = bufstart;
	618	while (p < ucsp) {
	619	p = OSSwapInt16(p);
	620	++p;
	621	}
	622	}
	623
	624	exit:
	625	ucslen = (u_int8_t)ucsp - (u_int8_t*)bufstart;
	626
	627	return (result);
	628
	629	toolong:
	630	result = ENAMETOOLONG;
	631	goto exit;
	632	}
	633
	634
	635	/*
	636	* utf8_validatestr - Check for a valid UTF-8 string.
	637	*/
	638	int
	639	utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
	640	{
	641	unsigned int byte;
	642	u_int32_t ch;
	643	unsigned int ucs_ch;
	644	size_t extrabytes;
	645
	646	while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
	647	if (byte < 0x80)
	648	continue; /* plain ascii */
	649
	650	extrabytes = utf_extrabytes[byte >> 3];
	651
	652	if (utf8len < extrabytes)
	653	goto invalid;
	654	utf8len -= extrabytes;
	655
	656	switch (extrabytes) {
	657	case 1:
	658	ch = byte; ch <<= 6; /* 1st byte */
	659	byte = utf8p++; / 2nd byte */
	660	if ((byte >> 6) != 2)
	661	goto invalid;
	662	ch += byte;
	663	ch -= 0x00003080UL;
	664	if (ch < 0x0080)
	665	goto invalid;
	666	break;
	667	case 2:
	668	ch = byte; ch <<= 6; /* 1st byte */
	669	byte = utf8p++; / 2nd byte */
	670	if ((byte >> 6) != 2)
	671	goto invalid;
	672	ch += byte; ch <<= 6;
	673	byte = utf8p++; / 3rd byte */
	674	if ((byte >> 6) != 2)
	675	goto invalid;
	676	ch += byte;
	677	ch -= 0x000E2080UL;
	678	if (ch < 0x0800)
	679	goto invalid;
	680	if (ch >= 0xD800) {
	681	if (ch <= 0xDFFF)
	682	goto invalid;
	683	if (ch == 0xFFFE \|\| ch == 0xFFFF)
	684	goto invalid;
	685	}
	686	break;
	687	case 3:
	688	ch = byte; ch <<= 6; /* 1st byte */
	689	byte = utf8p++; / 2nd byte */
	690	if ((byte >> 6) != 2)
	691	goto invalid;
	692	ch += byte; ch <<= 6;
	693	byte = utf8p++; / 3rd byte */
	694	if ((byte >> 6) != 2)
	695	goto invalid;
	696	ch += byte; ch <<= 6;
	697	byte = utf8p++; / 4th byte */
	698	if ((byte >> 6) != 2)
	699	goto invalid;
	700	ch += byte;
	701	ch -= 0x03C82080UL + SP_HALF_BASE;
	702	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
	703	if (ucs_ch < SP_HIGH_FIRST \|\| ucs_ch > SP_HIGH_LAST)
	704	goto invalid;
	705	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
	706	if (ucs_ch < SP_LOW_FIRST \|\| ucs_ch > SP_LOW_LAST)
	707	goto invalid;
	708	break;
	709	default:
	710	goto invalid;
	711	}
	712
	713	}
	714	return (0);
	715	invalid:
	716	return (EINVAL);
	717	}
	718
	719	/*
	720	* utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
	721	*
	722	* This function takes an UTF-8 input string, instr, of inlen bytes
	723	* and produces normalized UTF-8 output into a buffer of buflen bytes
	724	* pointed to by outstr. The size of the output in bytes (not including
	725	* a NULL termination byte) is returned in outlen. In-place conversions
	726	* are not supported (i.e. instr != outstr).]
	727
	728	* FLAGS
	729	* UTF_DECOMPOSED: output string will be fully decomposed (NFD)
	730	*
	731	* UTF_PRECOMPOSED: output string will be precomposed (NFC)
	732	*
	733	* UTF_NO_NULL_TERM: do not add null termination to output string
	734	*
	735	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
	736	*
	737	* ERRORS
	738	* ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
	739	*
	740	* EINVAL: illegal UTF-8 sequence encountered or invalid flags
	741	*/
	742	int
	743	utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
	744	size_t *outlen, size_t buflen, int flags)
	745	{
	746	u_int16_t unicodebuf[32];
	747	u_int16_t* unistr = NULL;
	748	size_t unicode_bytes;
	749	size_t uft8_bytes;
	750	size_t inbuflen;
	751	u_int8_t outbufstart, outbufend;
	752	const u_int8_t *inbufstart;
	753	unsigned int byte;
	754	int decompose, precompose;
	755	int result = 0;
	756
	757	if (flags & ~(UTF_DECOMPOSED \| UTF_PRECOMPOSED \| UTF_NO_NULL_TERM \| UTF_ESCAPE_ILLEGAL)) {
	758	return (EINVAL);
	759	}
	760	decompose = (flags & UTF_DECOMPOSED);
	761	precompose = (flags & UTF_PRECOMPOSED);
	762	if ((decompose && precompose) \|\| (!decompose && !precompose)) {
	763	return (EINVAL);
	764	}
	765	outbufstart = outstr;
	766	outbufend = outbufstart + buflen;
	767	inbufstart = instr;
	768	inbuflen = inlen;
	769
	770	while (inlen-- > 0 && (byte = *instr++) != '\0') {
	771	if (outstr >= outbufend) {
	772	result = ENAMETOOLONG;
	773	goto exit;
	774	}
	775	if (byte >= 0x80) {
	776	goto nonASCII;
	777	}
	778	/* ASCII is already normalized. */
	779	*outstr++ = byte;
	780	}
	781	exit:
	782	*outlen = outstr - outbufstart;
	783	if (((flags & UTF_NO_NULL_TERM) == 0)) {
	784	if (outstr < outbufend)
	785	*outstr++ = '\0';
	786	else
	787	result = ENAMETOOLONG;
	788	}
	789	return (result);
	790
	791
	792	/*
	793	* Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
	794	* functions to perform the normalization. Since this will
	795	* presumably be used to normalize filenames in the back-end
	796	* (on disk or over-the-wire), it should be fast enough.
	797	*/
	798	nonASCII:
	799
	800	/* Make sure the input size is reasonable. */
	801	if (inbuflen > MAXPATHLEN) {
	802	result = ENAMETOOLONG;
	803	goto exit;
	804	}
	805	/*
	806	* Compute worst case Unicode buffer size.
	807	*
	808	* For pre-composed output, every UTF-8 input byte will be at
	809	* most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
	810	* (smallest composite char sequence) may yield 6 Unicode bytes
	811	* (1 base char + 2 combining chars).
	812	*/
	813	unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
	814
	815	if (unicode_bytes <= sizeof(unicodebuf))
	816	unistr = &unicodebuf[0];
	817	else
	818	MALLOC(unistr, uint16_t *, unicode_bytes, M_TEMP, M_WAITOK);
	819
	820	/* Normalize the string. */
	821	result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
	822	unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
	823	if (result == 0) {
	824	/* Put results back into UTF-8. */
	825	result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
	826	&uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
	827	outstr = outbufstart + uft8_bytes;
	828	}
	829	if (unistr && unistr != &unicodebuf[0]) {
	830	FREE(unistr, M_TEMP);
	831	}
	832	goto exit;
	833	}
	834
	835
	/*
	 * Unicode 3.2 decomposition code (derived from Core Foundation)
	 */

	/* One entry of a lookup table sorted by ascending _key. */
	typedef struct {
		u_int32_t _key;
		u_int32_t _value;
	} unicode_mappings32;

	/*
	 * getmappedvalue32 - binary search a sorted key/value table.
	 *
	 *    theTable:  entries sorted by ascending _key
	 *    numElem:   number of entries in theTable
	 *    character: key to look up
	 *
	 * Returns the _value stored for 'character', or 0 when the table is
	 * empty or has no entry for it.
	 */
	static inline u_int32_t
	getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
			u_int16_t character)
	{
		u_int32_t lo, hi, mid;

		if (numElem == 0)
			return (0);

		/* Quick reject for keys outside the range covered by the table. */
		if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
			return (0);

		/* Search the half-open window [lo, hi). */
		lo = 0;
		hi = numElem;
		while (lo < hi) {
			mid = lo + ((hi - lo) >> 1);
			if (character < theTable[mid]._key)
				hi = mid;
			else if (character > theTable[mid]._key)
				lo = mid + 1;
			else
				return (theTable[mid]._value);
		}
		return (0);
	}
	864
	/*
	 * Layout of a 16-bit decomposition table value as used below:
	 * bit 15 flags a recursive decomposition, bits 12-14 hold the
	 * number of chars in the mapping.
	 */
	#define RECURSIVE_DECOMPOSITION	(1 << 15)
	#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)

	/* One entry of a lookup table sorted by ascending _key. */
	typedef struct {
		u_int16_t _key;
		u_int16_t _value;
	} unicode_mappings16;

	/*
	 * getmappedvalue16 - binary search a sorted key/value table.
	 *
	 *    theTable:  entries sorted by ascending _key
	 *    numElem:   number of entries in theTable
	 *    character: key to look up
	 *
	 * Returns the _value stored for 'character', or 0 when the table is
	 * empty or has no entry for it.
	 */
	static inline u_int16_t
	getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
			u_int16_t character)
	{
		u_int32_t lo, hi, mid;

		if (numElem == 0)
			return (0);

		/* Quick reject for keys outside the range covered by the table. */
		if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
			return (0);

		/* Search the half-open window [lo, hi). */
		lo = 0;
		hi = numElem;
		while (lo < hi) {
			mid = lo + ((hi - lo) >> 1);
			if (character < theTable[mid]._key)
				hi = mid;
			else if (character > theTable[mid]._key)
				lo = mid + 1;
			else
				return (theTable[mid]._value);
		}
		return (0);
	}
	895
	896
	897	static u_int32_t
	898	unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
	899	{
	900	u_int16_t value;
	901	u_int32_t length;
	902	u_int16_t firstChar;
	903	u_int16_t theChar;
	904	const u_int16_t *bmpMappings;
	905	u_int32_t usedLength;
	906
	907	value = getmappedvalue16(
	908	(const unicode_mappings16 *)__CFUniCharDecompositionTable,
	909	__UniCharDecompositionTableLength, character);
	910	length = EXTRACT_COUNT(value);
	911	firstChar = value & 0x0FFF;
	912	theChar = firstChar;
	913	bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
	914	usedLength = 0;
	915
	916	if (value & RECURSIVE_DECOMPOSITION) {
	917	usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
	918
	919	--length; /* Decrement for the first char */
	920	if (!usedLength)
	921	return 0;
	922	++bmpMappings;
	923	convertedChars += usedLength;
	924	}
	925
	926	usedLength += length;
	927
	928	while (length--)
	929	(convertedChars++) = (bmpMappings++);
	930
	931	return (usedLength);
	932	}
	933
	/* First code point of each Hangul range used by the arithmetic below. */
	#define HANGUL_SBASE	0xAC00
	#define HANGUL_LBASE	0x1100
	#define HANGUL_VBASE	0x1161
	#define HANGUL_TBASE	0x11A7

	/* Number of code points in each Hangul range. */
	#define HANGUL_SCOUNT	11172
	#define HANGUL_LCOUNT	19
	#define HANGUL_VCOUNT	21
	#define HANGUL_TCOUNT	28
	#define HANGUL_NCOUNT	(HANGUL_VCOUNT * HANGUL_TCOUNT)
	944
	945	/*
	946	* unicode_decompose - decompose a composed Unicode char
	947	*
	948	* Composed Unicode characters are forbidden on
	949	* HFS Plus volumes. ucs_decompose will convert a
	950	* composed character into its correct decomposed
	951	* sequence.
	952	*
	953	* Similar to CFUniCharDecomposeCharacter
	954	*/
	955	static int
	956	unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
	957	{
	958	if ((character >= HANGUL_SBASE) &&
	959	(character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
	960	u_int32_t length;
	961
	962	character -= HANGUL_SBASE;
	963	length = (character % HANGUL_TCOUNT ? 3 : 2);
	964
	965	*(convertedChars++) =
	966	character / HANGUL_NCOUNT + HANGUL_LBASE;
	967	*(convertedChars++) =
	968	(character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
	969	if (length > 2)
	970	*convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
	971	return (length);
	972	} else {
	973	return (unicode_recursive_decompose(character, convertedChars));
	974	}
	975	}
	976
	977	/*
	978	* unicode_combine - generate a precomposed Unicode char
	979	*
	980	* Precomposed Unicode characters are required for some volume
	981	* formats and network protocols. unicode_combine will combine
	982	* a decomposed character sequence into a single precomposed
	983	* (composite) character.
	984	*
	985	* Similar toCFUniCharPrecomposeCharacter but unicode_combine
	986	* also handles Hangul Jamo characters.
	987	*/
	988	static u_int16_t
	989	unicode_combine(u_int16_t base, u_int16_t combining)
	990	{
	991	u_int32_t value;
	992
	993	/* Check HANGUL */
	994	if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
	995	/* 2 char Hangul sequences */
	996	if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
	997	(base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
	998	return (HANGUL_SBASE +
	999	((base - HANGUL_LBASE)(HANGUL_VCOUNTHANGUL_TCOUNT)) +
	1000	((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
	1001	}
	1002
	1003	/* 3 char Hangul sequences */
	1004	if ((combining > HANGUL_TBASE) &&
	1005	(base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
	1006	if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
	1007	return (0);
	1008	else
	1009	return (base + (combining - HANGUL_TBASE));
	1010	}
	1011	}
	1012
	1013	value = getmappedvalue32(
	1014	(const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
	1015	__CFUniCharPrecompositionTableLength, combining);
	1016
	1017	if (value) {
	1018	value = getmappedvalue16(
	1019	(const unicode_mappings16 *)
	1020	((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
	1021	(value >> 16), base);
	1022	}
	1023	return (value);
	1024	}
	1025
	1026
	/*
	 * prioritysort - order combining chars into canonical order
	 *
	 * Bubble sorts 'count' chars at 'characters' by ascending combining
	 * class.  A char is never moved ahead of its predecessor when its own
	 * class is 0.
	 *
	 * Similar to CFUniCharPrioritySort
	 */
	static void
	prioritysort(u_int16_t* characters, int count)
	{
		u_int32_t p1, p2;
		u_int16_t *ch1, *ch2;
		u_int16_t *end;
		int changes = 0;

		/* Fewer than two chars cannot be out of order. */
		if (count < 2)
			return;

		end = characters + count;
		do {
			changes = 0;
			ch1 = characters;
			ch2 = characters + 1;
			p2 = get_combining_class(*ch1);
			while (ch2 < end) {
				p1 = p2;
				p2 = get_combining_class(*ch2);
				if (p1 > p2 && p2 != 0) {
					u_int32_t tmp;

					tmp = *ch1;
					*ch1 = *ch2;
					*ch2 = tmp;
					changes = 1;

					/*
					 * Make sure that p2 contains the combining class for the
					 * character now stored at *ch2.  This isn't required for
					 * correctness, but it will be more efficient if a character
					 * with a large combining class has to "bubble past" several
					 * characters with lower combining classes.
					 */
					p2 = p1;
				}
				++ch1;
				++ch2;
			}
		} while (changes);
	}
	1071
	1072
	/*
	 * Invalid NTFS filename characters are encoded using the
	 * SFM (Services for Macintosh) private use Unicode characters.
	 *
	 * These should only be used for SMB, MSDOS or NTFS.
	 *
	 *    Illegal NTFS Char   SFM Unicode Char
	 *  ----------------------------------------
	 *    0x01-0x1f           0xf001-0xf01f
	 *    '"'                 0xf020
	 *    '*'                 0xf021
	 *    '/'                 0xf022
	 *    '<'                 0xf023
	 *    '>'                 0xf024
	 *    '?'                 0xf025
	 *    '\'                 0xf026
	 *    '|'                 0xf027
	 *    ' '                 0xf028  (Only if last char of the name)
	 *    '.'                 0xf029  (Only if last char of the name)
	 *  ----------------------------------------
	 *
	 *  Reference: http://support.microsoft.com/kb/q117258/
	 */

	#define MAX_SFM2MAC		0x29
	#define SFMCODE_PREFIX_MASK	0xf000

	/*
	 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
	 * SFM had no conversion for the colon. There is a conversion for the
	 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
	 * is a slash and a slash is a colon. So we can just replace the slash with the
	 * colon in our tables and everything will just work.
	 */

	/* ASCII char for each SFM code, indexed by the low six bits of the code. */
	static u_int8_t
	sfm2mac[] = {
		0x00, 0x01, 0x02, 0x03,   0x04, 0x05, 0x06, 0x07,	/* 00-07 */
		0x08, 0x09, 0x0a, 0x0b,   0x0c, 0x0d, 0x0e, 0x0f,	/* 08-0F */
		0x10, 0x11, 0x12, 0x13,   0x14, 0x15, 0x16, 0x17,	/* 10-17 */
		0x18, 0x19, 0x1a, 0x1b,   0x1c, 0x1d, 0x1e, 0x1f,	/* 18-1F */
		0x22, 0x2a, 0x3a, 0x3c,   0x3e, 0x3f, 0x5c, 0x7c,	/* 20-27 */
		0x20, 0x2e						/* 28-29 */
	};
	#define SFM2MAC_LEN	((sizeof(sfm2mac))/sizeof(sfm2mac[0]))
	1117
	/*
	 * Low byte of the SFM code for each ASCII char 0x20-0x7f, indexed by
	 * (char - 0x20).  An entry equal to its own char means "no conversion".
	 */
	static u_int8_t
	mac2sfm[] = {
		0x20, 0x21, 0x20, 0x23,   0x24, 0x25, 0x26, 0x27,	/* 20-27 */
		0x28, 0x29, 0x21, 0x2b,   0x2c, 0x2d, 0x2e, 0x22,	/* 28-2f */
		0x30, 0x31, 0x32, 0x33,   0x34, 0x35, 0x36, 0x37,	/* 30-37 */
		0x38, 0x39, 0x22, 0x3b,   0x23, 0x3d, 0x24, 0x25,	/* 38-3f */
		0x40, 0x41, 0x42, 0x43,   0x44, 0x45, 0x46, 0x47,	/* 40-47 */
		0x48, 0x49, 0x4a, 0x4b,   0x4c, 0x4d, 0x4e, 0x4f,	/* 48-4f */
		0x50, 0x51, 0x52, 0x53,   0x54, 0x55, 0x56, 0x57,	/* 50-57 */
		0x58, 0x59, 0x5a, 0x5b,   0x26, 0x5d, 0x5e, 0x5f,	/* 58-5f */
		0x60, 0x61, 0x62, 0x63,   0x64, 0x65, 0x66, 0x67,	/* 60-67 */
		0x68, 0x69, 0x6a, 0x6b,   0x6c, 0x6d, 0x6e, 0x6f,	/* 68-6f */
		0x70, 0x71, 0x72, 0x73,   0x74, 0x75, 0x76, 0x77,	/* 70-77 */
		0x78, 0x79, 0x7a, 0x7b,   0x27, 0x7d, 0x7e, 0x7f	/* 78-7f */
	};
	#define MAC2SFM_LEN	((sizeof(mac2sfm))/sizeof(mac2sfm[0]))
	1134
	1135
	1136	/*
	1137	* Encode illegal NTFS filename characters into SFM Private Unicode characters
	1138	*
	1139	* Assumes non-zero ASCII input.
	1140	*/
	1141	static u_int16_t
	1142	ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
	1143	{
	1144	/* The last character of filename cannot be a space or period. */
	1145	if (lastchar) {
	1146	if (ucs_ch == 0x20)
	1147	return (0xf028);
	1148	else if (ucs_ch == 0x2e)
	1149	return (0xf029);
	1150	}
	1151	/* 0x01 - 0x1f is simple transformation. */
	1152	if (ucs_ch <= 0x1f) {
	1153	return (ucs_ch \| 0xf000);
	1154	} else /* 0x20 - 0x7f */ {
	1155	u_int16_t lsb;
	1156
	1157	assert((ucs_ch - 0x0020) < MAC2SFM_LEN);
	1158	lsb = mac2sfm[ucs_ch - 0x0020];
	1159	if (lsb != ucs_ch)
	1160	return(0xf000 \| lsb);
	1161	}
	1162	return (ucs_ch);
	1163	}
	1164
	1165	/*
	1166	* Decode any SFM Private Unicode characters
	1167	*/
	1168	static u_int16_t
	1169	sfm_to_ucs(u_int16_t ucs_ch)
	1170	{
	1171	if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
	1172	((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
	1173	assert((ucs_ch & 0x003f) < SFM2MAC_LEN);
	1174	ucs_ch = sfm2mac[ucs_ch & 0x003f];
	1175	}
	1176	return (ucs_ch);
	1177	}
	1178
	1179