git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	/*
	30	Includes Unicode 3.2 decomposition code derived from Core Foundation
	31	*/
	32
	33	#include <sys/param.h>
	34	#include <sys/utfconv.h>
	35	#include <sys/errno.h>
	36	#include <sys/malloc.h>
	37	#include <libkern/OSByteOrder.h>
	38
	39	/*
	40	* UTF-8 (Unicode Transformation Format)
	41	*
	42	* UTF-8 is the Unicode Transformation Format that serializes a Unicode
	43	* character as a sequence of one to four bytes. Only the shortest form
	44	* required to represent the significant Unicode bits is legal.
	45	*
	46	* UTF-8 Multibyte Codes
	47	*
	48	* Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
	49	* -----------------------------------------------------------------------------
	50	* 1 7 0x0000 0x007F 0xxxxxxx
	51	* 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
	52	* 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
	53	* 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	54	* -----------------------------------------------------------------------------
	55	*/
	56
	57
	/*
	 * UTF-8 bytes contributed by one UTF-16 code unit.  A surrogate half
	 * counts as 2 so that a complete pair adds up to 4.
	 */
	#define UNICODE_TO_UTF8_LEN(c) \
		((c) < 0x0080 ? 1 : \
		 (((c) < 0x0800 || ((c) & 0xf800) == 0xd800) ? 2 : 3))

	/* Code unit the encoders in this file substitute for U+0000 */
	#define UCS_ALT_NULL	0x2400

	/* Surrogate pair arithmetic */
	#define SP_HALF_SHIFT	10
	#define SP_HALF_BASE	0x0010000UL
	#define SP_HALF_MASK	0x3FFUL

	/* High (leading) and low (trailing) surrogate ranges */
	#define SP_HIGH_FIRST	0xD800UL
	#define SP_HIGH_LAST	0xDBFFUL
	#define SP_LOW_FIRST	0xDC00UL
	#define SP_LOW_LAST	0xDFFFUL
	72
	73
	74	#include "vfs_utfconvdata.h"
	75
	76
	77	/*
	78	* Test for a combining character.
	79	*
	80	* Similar to __CFUniCharIsNonBaseCharacter except that
	81	* unicode_combinable also includes Hangul Jamo characters.
	82	*/
	83	int
	84	unicode_combinable(u_int16_t character)
	85	{
	86	const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
	87	u_int8_t value;
	88
	89	if (character < 0x0300)
	90	return (0);
	91
	92	value = bitmap[(character >> 8) & 0xFF];
	93
	94	if (value == 0xFF) {
	95	return (1);
	96	} else if (value) {
	97	bitmap = bitmap + ((value - 1) * 32) + 256;
	98	return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
	99	}
	100	return (0);
	101	}
	102
	103	/*
	104	* Test for a precomposed character.
	105	*
	106	* Similar to __CFUniCharIsDecomposableCharacter.
	107	*/
	108	int
	109	unicode_decomposeable(u_int16_t character) {
	110	const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
	111	u_int8_t value;
	112
	113	if (character < 0x00C0)
	114	return (0);
	115
	116	value = bitmap[(character >> 8) & 0xFF];
	117
	118	if (value == 0xFF) {
	119	return (1);
	120	} else if (value) {
	121	bitmap = bitmap + ((value - 1) * 32) + 256;
	122	return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
	123	}
	124	return (0);
	125	}
	126
	127
	128	/*
	129	* Get the combing class.
	130	*
	131	* Similar to CFUniCharGetCombiningPropertyForCharacter.
	132	*/
	133	static inline u_int8_t
	134	get_combining_class(u_int16_t character) {
	135	const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
	136
	137	u_int8_t value = bitmap[(character >> 8)];
	138
	139	if (value) {
	140	bitmap = bitmap + (value * 256);
	141	return bitmap[character % 256];
	142	}
	143	return (0);
	144	}
	145
	146
	/* Forward declarations of helpers defined further down in this file. */
	static int       unicode_decompose(u_int16_t character, u_int16_t *convertedChars);
	static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
	static void      priortysort(u_int16_t *characters, int count);
	static u_int16_t ucs_to_sfm(u_int16_t ucs_ch, int lastchar);
	static u_int16_t sfm_to_ucs(u_int16_t ucs_ch);

	/*
	 * Number of continuation bytes expected after a UTF-8 lead byte,
	 * indexed by the lead byte's top five bits (byte >> 3).  A -1 entry
	 * marks a byte that cannot start a sequence.
	 * NOTE(review): the -1 entries rely on plain `char' being signed.
	 */
	char utf_extrabytes[32] = {
		 0,  0,  0,  0,  0,  0,  0,  0,	/* 0x00 - 0x3f */
		 0,  0,  0,  0,  0,  0,  0,  0,	/* 0x40 - 0x7f */
		-1, -1, -1, -1, -1, -1, -1, -1,	/* 0x80 - 0xbf */
		 1,  1,  1,  1,			/* 0xc0 - 0xdf */
		 2,  2,				/* 0xe0 - 0xef */
		 3,				/* 0xf0 - 0xf7 */
		-1				/* 0xf8 - 0xff */
	};

	/* Upper-case hexadecimal digits, indexed by nibble value. */
	const char hexdigits[16] = {
		'0', '1', '2', '3', '4', '5', '6', '7',
		'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
	};
	167
	168	/*
	169	* utf8_encodelen - Calculate the UTF-8 encoding length
	170	*
	171	* This function takes a Unicode input string, ucsp, of ucslen bytes
	172	* and calculates the size of the UTF-8 output in bytes (not including
	173	* a NULL termination byte). The string must reside in kernel memory.
	174	*
	175	* If '/' chars are possible in the Unicode input then an alternate
	176	* (replacement) char should be provided in altslash.
	177	*
	178	* FLAGS
	179	* UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
	180	*
	181	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	182	*
	183	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	184	*
	185	* UTF_DECOMPOSED: generate fully decomposed output
	186	*
	187	* UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
	188	*
	189	* ERRORS
	190	* None
	191	*/
	192	size_t
	193	utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
	194	{
	195	u_int16_t ucs_ch;
	196	u_int16_t * chp = NULL;
	197	u_int16_t sequence[8];
	198	int extra = 0;
	199	int charcnt;
	200	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
	201	int decompose = (flags & UTF_DECOMPOSED);
	202	size_t len;
	203
	204	charcnt = ucslen / 2;
	205	len = 0;
	206
	207	while (charcnt-- > 0) {
	208	if (extra > 0) {
	209	--extra;
	210	ucs_ch = *chp++;
	211	} else {
	212	ucs_ch = *ucsp++;
	213	if (swapbytes) {
	214	ucs_ch = OSSwapInt16(ucs_ch);
	215	}
	216	if (ucs_ch == '/') {
	217	ucs_ch = altslash ? altslash : '_';
	218	} else if (ucs_ch == '\0') {
	219	ucs_ch = UCS_ALT_NULL;
	220	} else if (decompose && unicode_decomposeable(ucs_ch)) {
	221	extra = unicode_decompose(ucs_ch, sequence) - 1;
	222	charcnt += extra;
	223	ucs_ch = sequence[0];
	224	chp = &sequence[1];
	225	}
	226	}
	227	len += UNICODE_TO_UTF8_LEN(ucs_ch);
	228	}
	229
	230	return (len);
	231	}
	232
	233
	234	/*
	235	* utf8_encodestr - Encodes a Unicode string to UTF-8
	236	*
	237	* NOTES:
	238	* The resulting UTF-8 string is NULL terminated.
	239	*
	240	* If '/' chars are allowed on disk then an alternate
	241	* (replacement) char must be provided in altslash.
	242	*
	243	* input flags:
	244	* UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
	245	*
	246	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	247	*
	248	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	249	*
	250	* UTF_DECOMPOSED: generate fully decomposed output
	251	*
	252	* UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
	253	*
	254	* result:
	255	* ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
	256	*
	257	* EINVAL: Illegal char found; char was replaced by an '_'.
	258	*/
	259	int
	260	utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
	261	size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
	262	{
	263	u_int8_t * bufstart;
	264	u_int8_t * bufend;
	265	u_int16_t ucs_ch;
	266	u_int16_t * chp = NULL;
	267	u_int16_t sequence[8];
	268	int extra = 0;
	269	int charcnt;
	270	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
	271	int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
	272	int decompose = (flags & UTF_DECOMPOSED);
	273	int sfmconv = (flags & UTF_SFM_CONVERSIONS);
	274	int result = 0;
	275
	276	bufstart = utf8p;
	277	bufend = bufstart + buflen;
	278	if (nullterm)
	279	--bufend;
	280	charcnt = ucslen / 2;
	281
	282	while (charcnt-- > 0) {
	283	if (extra > 0) {
	284	--extra;
	285	ucs_ch = *chp++;
	286	} else {
	287	ucs_ch = swapbytes ? OSSwapInt16(ucsp++) : ucsp++;
	288
	289	if (decompose && unicode_decomposeable(ucs_ch)) {
	290	extra = unicode_decompose(ucs_ch, sequence) - 1;
	291	charcnt += extra;
	292	ucs_ch = sequence[0];
	293	chp = &sequence[1];
	294	}
	295	}
	296
	297	/* Slash and NULL are not permitted */
	298	if (ucs_ch == '/') {
	299	if (altslash)
	300	ucs_ch = altslash;
	301	else {
	302	ucs_ch = '_';
	303	result = EINVAL;
	304	}
	305	} else if (ucs_ch == '\0') {
	306	ucs_ch = UCS_ALT_NULL;
	307	}
	308
	309	if (ucs_ch < 0x0080) {
	310	if (utf8p >= bufend) {
	311	result = ENAMETOOLONG;
	312	break;
	313	}
	314	*utf8p++ = ucs_ch;
	315
	316	} else if (ucs_ch < 0x800) {
	317	if ((utf8p + 1) >= bufend) {
	318	result = ENAMETOOLONG;
	319	break;
	320	}
	321	*utf8p++ = 0xc0 \| (ucs_ch >> 6);
	322	*utf8p++ = 0x80 \| (0x3f & ucs_ch);
	323
	324	} else {
	325	/* These chars never valid Unicode. */
	326	if (ucs_ch == 0xFFFE \|\| ucs_ch == 0xFFFF) {
	327	result = EINVAL;
	328	break;
	329	}
	330
	331	/* Combine valid surrogate pairs */
	332	if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
	333	&& charcnt > 0) {
	334	u_int16_t ch2;
	335	u_int32_t pair;
	336
	337	ch2 = swapbytes ? OSSwapInt16(ucsp) : ucsp;
	338	if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
	339	pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
	340	+ (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
	341	if ((utf8p + 3) >= bufend) {
	342	result = ENAMETOOLONG;
	343	break;
	344	}
	345	--charcnt;
	346	++ucsp;
	347	*utf8p++ = 0xf0 \| (pair >> 18);
	348	*utf8p++ = 0x80 \| (0x3f & (pair >> 12));
	349	*utf8p++ = 0x80 \| (0x3f & (pair >> 6));
	350	*utf8p++ = 0x80 \| (0x3f & pair);
	351	continue;
	352	}
	353	} else if (sfmconv) {
	354	ucs_ch = sfm_to_ucs(ucs_ch);
	355	if (ucs_ch < 0x0080) {
	356	if (utf8p >= bufend) {
	357	result = ENAMETOOLONG;
	358	break;
	359	}
	360	*utf8p++ = ucs_ch;
	361	continue;
	362	}
	363	}
	364	if ((utf8p + 2) >= bufend) {
	365	result = ENAMETOOLONG;
	366	break;
	367	}
	368	*utf8p++ = 0xe0 \| (ucs_ch >> 12);
	369	*utf8p++ = 0x80 \| (0x3f & (ucs_ch >> 6));
	370	*utf8p++ = 0x80 \| (0x3f & ucs_ch);
	371	}
	372	}
	373
	374	*utf8len = utf8p - bufstart;
	375	if (nullterm)
	376	*utf8p++ = '\0';
	377
	378	return (result);
	379	}
	380
	381
	382	/*
	383	* utf8_decodestr - Decodes a UTF-8 string back to Unicode
	384	*
	385	* NOTES:
	386	* The input UTF-8 string does not need to be null terminated
	387	* if utf8len is set.
	388	*
	389	* If '/' chars are allowed on disk then an alternate
	390	* (replacement) char must be provided in altslash.
	391	*
	392	* input flags:
	393	* UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
	394	*
	395	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
	396	*
	397	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
	398	*
	399	* UTF_DECOMPOSED: generate fully decomposed output (NFD)
	400	*
	401	* UTF_PRECOMPOSED: generate precomposed output (NFC)
	402	*
	403	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
	404	*
	405	* result:
	406	* ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
	407	*
	408	* EINVAL: Illegal UTF-8 sequence found.
	409	*/
	410	int
	411	utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
	412	size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
	413	{
	414	u_int16_t* bufstart;
	415	u_int16_t* bufend;
	416	unsigned int ucs_ch;
	417	unsigned int byte;
	418	int combcharcnt = 0;
	419	int result = 0;
	420	int decompose, precompose, swapbytes, escaping;
	421	int sfmconv;
	422	int extrabytes;
	423
	424	decompose = (flags & UTF_DECOMPOSED);
	425	precompose = (flags & UTF_PRECOMPOSED);
	426	swapbytes = (flags & UTF_REVERSE_ENDIAN);
	427	escaping = (flags & UTF_ESCAPE_ILLEGAL);
	428	sfmconv = (flags & UTF_SFM_CONVERSIONS);
	429
	430	bufstart = ucsp;
	431	bufend = (u_int16_t )((u_int8_t )ucsp + buflen);
	432
	433	while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
	434	if (ucsp >= bufend)
	435	goto toolong;
	436
	437	/* check for ascii */
	438	if (byte < 0x80) {
	439	ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
	440	} else {
	441	u_int32_t ch;
	442
	443	extrabytes = utf_extrabytes[byte >> 3];
	444	if ((extrabytes < 0) \|\| ((int)utf8len < extrabytes)) {
	445	goto escape;
	446	}
	447	utf8len -= extrabytes;
	448
	449	switch (extrabytes) {
	450	case 1:
	451	ch = byte; ch <<= 6; /* 1st byte */
	452	byte = utf8p++; / 2nd byte */
	453	if ((byte >> 6) != 2)
	454	goto escape2;
	455	ch += byte;
	456	ch -= 0x00003080UL;
	457	if (ch < 0x0080)
	458	goto escape2;
	459	ucs_ch = ch;
	460	break;
	461	case 2:
	462	ch = byte; ch <<= 6; /* 1st byte */
	463	byte = utf8p++; / 2nd byte */
	464	if ((byte >> 6) != 2)
	465	goto escape2;
	466	ch += byte; ch <<= 6;
	467	byte = utf8p++; / 3rd byte */
	468	if ((byte >> 6) != 2)
	469	goto escape3;
	470	ch += byte;
	471	ch -= 0x000E2080UL;
	472	if (ch < 0x0800)
	473	goto escape3;
	474	if (ch >= 0xD800) {
	475	if (ch <= 0xDFFF)
	476	goto escape3;
	477	if (ch == 0xFFFE \|\| ch == 0xFFFF)
	478	goto escape3;
	479	}
	480	ucs_ch = ch;
	481	break;
	482	case 3:
	483	ch = byte; ch <<= 6; /* 1st byte */
	484	byte = utf8p++; / 2nd byte */
	485	if ((byte >> 6) != 2)
	486	goto escape2;
	487	ch += byte; ch <<= 6;
	488	byte = utf8p++; / 3rd byte */
	489	if ((byte >> 6) != 2)
	490	goto escape3;
	491	ch += byte; ch <<= 6;
	492	byte = utf8p++; / 4th byte */
	493	if ((byte >> 6) != 2)
	494	goto escape4;
	495	ch += byte;
	496	ch -= 0x03C82080UL + SP_HALF_BASE;
	497	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
	498	if (ucs_ch < SP_HIGH_FIRST \|\| ucs_ch > SP_HIGH_LAST)
	499	goto escape4;
	500	*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
	501	if (ucsp >= bufend)
	502	goto toolong;
	503	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
	504	if (ucs_ch < SP_LOW_FIRST \|\| ucs_ch > SP_LOW_LAST) {
	505	--ucsp;
	506	goto escape4;
	507	}
	508	*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
	509	continue;
	510	default:
	511	result = EINVAL;
	512	goto exit;
	513	}
	514	if (decompose) {
	515	if (unicode_decomposeable(ucs_ch)) {
	516	u_int16_t sequence[8];
	517	int count, i;
	518
	519	/* Before decomposing a new unicode character, sort
	520	* previous combining characters, if any, and reset
	521	* the counter.
	522	*/
	523	if (combcharcnt > 1) {
	524	priortysort(ucsp - combcharcnt, combcharcnt);
	525	}
	526	combcharcnt = 0;
	527
	528	count = unicode_decompose(ucs_ch, sequence);
	529	for (i = 0; i < count; ++i) {
	530	ucs_ch = sequence[i];
	531	*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
	532	if (ucsp >= bufend)
	533	goto toolong;
	534	}
	535	combcharcnt += count - 1;
	536	continue;
	537	}
	538	} else if (precompose && (ucsp != bufstart)) {
	539	u_int16_t composite, base;
	540
	541	if (unicode_combinable(ucs_ch)) {
	542	base = swapbytes ? OSSwapInt16((ucsp - 1)) : (ucsp - 1);
	543	composite = unicode_combine(base, ucs_ch);
	544	if (composite) {
	545	--ucsp;
	546	ucs_ch = composite;
	547	}
	548	}
	549	}
	550	if (ucs_ch == UCS_ALT_NULL)
	551	ucs_ch = '\0';
	552	}
	553	if (ucs_ch == altslash)
	554	ucs_ch = '/';
	555
	556	/*
	557	* Make multiple combining character sequences canonical
	558	*/
	559	if (unicode_combinable(ucs_ch)) {
	560	++combcharcnt; /* start tracking a run */
	561	} else if (combcharcnt) {
	562	if (combcharcnt > 1) {
	563	priortysort(ucsp - combcharcnt, combcharcnt);
	564	}
	565	combcharcnt = 0; /* start over */
	566	}
	567
	568	*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
	569	continue;
	570
	571	/*
	572	* Escape illegal UTF-8 into something legal.
	573	*/
	574	escape4:
	575	utf8p -= 3;
	576	goto escape;
	577	escape3:
	578	utf8p -= 2;
	579	goto escape;
	580	escape2:
	581	utf8p -= 1;
	582	escape:
	583	if (!escaping) {
	584	result = EINVAL;
	585	goto exit;
	586	}
	587	if (extrabytes > 0)
	588	utf8len += extrabytes;
	589	byte = *(utf8p - 1);
	590
	591	if ((ucsp + 2) >= bufend)
	592	goto toolong;
	593
	594	/* Make a previous combining sequence canonical. */
	595	if (combcharcnt > 1) {
	596	priortysort(ucsp - combcharcnt, combcharcnt);
	597	}
	598	combcharcnt = 0;
	599
	600	ucs_ch = '%';
	601	*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
	602	ucs_ch = hexdigits[byte >> 4];
	603	*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
	604	ucs_ch = hexdigits[byte & 0x0F];
	605	*ucsp++ = swapbytes ? OSSwapInt16(ucs_ch) : (u_int16_t)ucs_ch;
	606	}
	607	/*
	608	* Make a previous combining sequence canonical
	609	*/
	610	if (combcharcnt > 1) {
	611	priortysort(ucsp - combcharcnt, combcharcnt);
	612	}
	613	exit:
	614	ucslen = (u_int8_t)ucsp - (u_int8_t*)bufstart;
	615
	616	return (result);
	617
	618	toolong:
	619	result = ENAMETOOLONG;
	620	goto exit;
	621	}
	622
	623
	624	/*
	625	* utf8_validatestr - Check for a valid UTF-8 string.
	626	*/
	627	int
	628	utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
	629	{
	630	unsigned int byte;
	631	u_int32_t ch;
	632	unsigned int ucs_ch;
	633	size_t extrabytes;
	634
	635	while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
	636	if (byte < 0x80)
	637	continue; /* plain ascii */
	638
	639	extrabytes = utf_extrabytes[byte >> 3];
	640
	641	if (utf8len < extrabytes)
	642	goto invalid;
	643	utf8len -= extrabytes;
	644
	645	switch (extrabytes) {
	646	case 1:
	647	ch = byte; ch <<= 6; /* 1st byte */
	648	byte = utf8p++; / 2nd byte */
	649	if ((byte >> 6) != 2)
	650	goto invalid;
	651	ch += byte;
	652	ch -= 0x00003080UL;
	653	if (ch < 0x0080)
	654	goto invalid;
	655	break;
	656	case 2:
	657	ch = byte; ch <<= 6; /* 1st byte */
	658	byte = utf8p++; / 2nd byte */
	659	if ((byte >> 6) != 2)
	660	goto invalid;
	661	ch += byte; ch <<= 6;
	662	byte = utf8p++; / 3rd byte */
	663	if ((byte >> 6) != 2)
	664	goto invalid;
	665	ch += byte;
	666	ch -= 0x000E2080UL;
	667	if (ch < 0x0800)
	668	goto invalid;
	669	if (ch >= 0xD800) {
	670	if (ch <= 0xDFFF)
	671	goto invalid;
	672	if (ch == 0xFFFE \|\| ch == 0xFFFF)
	673	goto invalid;
	674	}
	675	break;
	676	case 3:
	677	ch = byte; ch <<= 6; /* 1st byte */
	678	byte = utf8p++; / 2nd byte */
	679	if ((byte >> 6) != 2)
	680	goto invalid;
	681	ch += byte; ch <<= 6;
	682	byte = utf8p++; / 3rd byte */
	683	if ((byte >> 6) != 2)
	684	goto invalid;
	685	ch += byte; ch <<= 6;
	686	byte = utf8p++; / 4th byte */
	687	if ((byte >> 6) != 2)
	688	goto invalid;
	689	ch += byte;
	690	ch -= 0x03C82080UL + SP_HALF_BASE;
	691	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
	692	if (ucs_ch < SP_HIGH_FIRST \|\| ucs_ch > SP_HIGH_LAST)
	693	goto invalid;
	694	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
	695	if (ucs_ch < SP_LOW_FIRST \|\| ucs_ch > SP_LOW_LAST)
	696	goto invalid;
	697	break;
	698	default:
	699	goto invalid;
	700	}
	701
	702	}
	703	return (0);
	704	invalid:
	705	return (EINVAL);
	706	}
	707
	708	/*
	709	* utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
	710	*
	711	* This function takes an UTF-8 input string, instr, of inlen bytes
	712	* and produces normalized UTF-8 output into a buffer of buflen bytes
	713	* pointed to by outstr. The size of the output in bytes (not including
	714	* a NULL termination byte) is returned in outlen. In-place conversions
	715	* are not supported (i.e. instr != outstr).
	716	*
	717	* FLAGS
	718	* UTF_DECOMPOSED: output string will be fully decomposed (NFD)
	719	*
	720	* UTF_PRECOMPOSED: output string will be precomposed (NFC)
	721	*
	722	* UTF_NO_NULL_TERM: do not add null termination to output string
	723	*
	724	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
	725	*
	726	* ERRORS
	727	* ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
	728	*
	729	* EINVAL: illegal UTF-8 sequence encountered or invalid flags
	730	*/
	731	int
	732	utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
	733	size_t *outlen, size_t buflen, int flags)
	734	{
	735	u_int16_t unicodebuf[32];
	736	u_int16_t* unistr = NULL;
	737	size_t unicode_bytes;
	738	size_t uft8_bytes;
	739	size_t inbuflen;
	740	u_int8_t outbufstart, outbufend;
	741	const u_int8_t *inbufstart;
	742	unsigned int byte;
	743	int decompose, precompose;
	744	int result = 0;
	745
	746	if (flags & ~(UTF_DECOMPOSED \| UTF_PRECOMPOSED \| UTF_NO_NULL_TERM \| UTF_ESCAPE_ILLEGAL)) {
	747	return (EINVAL);
	748	}
	749	decompose = (flags & UTF_DECOMPOSED);
	750	precompose = (flags & UTF_PRECOMPOSED);
	751	if ((decompose && precompose) \|\| (!decompose && !precompose)) {
	752	return (EINVAL);
	753	}
	754	outbufstart = outstr;
	755	outbufend = outbufstart + buflen;
	756	inbufstart = instr;
	757	inbuflen = inlen;
	758
	759	while (inlen-- > 0 && (byte = *instr++) != '\0') {
	760	if (outstr >= outbufend) {
	761	result = ENAMETOOLONG;
	762	goto exit;
	763	}
	764	if (byte >= 0x80) {
	765	goto nonASCII;
	766	}
	767	/* ASCII is already normalized. */
	768	*outstr++ = byte;
	769	}
	770	exit:
	771	*outlen = outstr - outbufstart;
	772	if (((flags & UTF_NO_NULL_TERM) == 0)) {
	773	if (outstr < outbufend)
	774	*outstr++ = '\0';
	775	else
	776	result = ENAMETOOLONG;
	777	}
	778	return (result);
	779
	780
	781	/*
	782	* Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
	783	* functions to perform the normalization. Since this will
	784	* presumably be used to normalize filenames in the back-end
	785	* (on disk or over-the-wire), it should be fast enough.
	786	*/
	787	nonASCII:
	788
	789	/* Make sure the input size is reasonable. */
	790	if (inbuflen > MAXPATHLEN) {
	791	result = ENAMETOOLONG;
	792	goto exit;
	793	}
	794	/*
	795	* Compute worst case Unicode buffer size.
	796	*
	797	* For pre-composed output, every UTF-8 input byte will be at
	798	* most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
	799	* (smallest composite char sequence) may yield 6 Unicode bytes
	800	* (1 base char + 2 combining chars).
	801	*/
	802	unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
	803
	804	if (unicode_bytes <= sizeof(unicodebuf))
	805	unistr = &unicodebuf[0];
	806	else
	807	MALLOC(unistr, u_int16_t *, unicode_bytes, M_TEMP, M_WAITOK);
	808
	809	/* Normalize the string. */
	810	result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
	811	unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
	812	if (result == 0) {
	813	/* Put results back into UTF-8. */
	814	result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
	815	&uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
	816	outstr = outbufstart + uft8_bytes;
	817	}
	818	if (unistr && unistr != &unicodebuf[0]) {
	819	FREE(unistr, M_TEMP);
	820	}
	821	goto exit;
	822	}
	823
	824
	825	/*
	826	* Unicode 3.2 decomposition code (derived from Core Foundation)
	827	*/
	828
	/* One entry of a table sorted by ascending _key. */
	typedef struct {
		u_int32_t _key;
		u_int32_t _value;
	} unicode_mappings32;

	/*
	 * getmappedvalue32 - binary search a sorted key/value table.
	 *
	 * theTable:  numElem entries in ascending _key order
	 * character: key to look up
	 *
	 * Returns the matching _value, or 0 when the key is absent or the
	 * table is empty.
	 */
	static inline u_int32_t
	getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
			 u_int16_t character)
	{
		const unicode_mappings32 *p, *q, *divider;

		/* An empty table has no first/last entry to range-check against. */
		if (numElem == 0)
			return (0);

		/*
		 * Reject keys outside the table's range.  This also keeps q from
		 * ever being moved below theTable in the loop.
		 */
		if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
			return (0);

		p = theTable;
		q = p + (numElem-1);
		while (p <= q) {
			divider = p + ((q - p) >> 1);	/* divide by 2 */
			if (character < divider->_key) { q = divider - 1; }
			else if (character > divider->_key) { p = divider + 1; }
			else { return (divider->_value); }
		}
		return (0);
	}
	853
	/*
	 * Layout of a decomposition table value: bit 15 flags a recursive
	 * decomposition and bits 12-14 hold the number of replacement units.
	 */
	#define RECURSIVE_DECOMPOSITION	(1 << 15)
	#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)

	/* One entry of a table sorted by ascending _key. */
	typedef struct {
		u_int16_t _key;
		u_int16_t _value;
	} unicode_mappings16;

	/*
	 * getmappedvalue16 - binary search a sorted key/value table.
	 *
	 * theTable:  numElem entries in ascending _key order
	 * character: key to look up
	 *
	 * Returns the matching _value, or 0 when the key is absent or the
	 * table is empty.
	 */
	static inline u_int16_t
	getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
			 u_int16_t character)
	{
		const unicode_mappings16 *p, *q, *divider;

		/* An empty table has no first/last entry to range-check against. */
		if (numElem == 0)
			return (0);

		/*
		 * Reject keys outside the table's range.  This also keeps q from
		 * ever being moved below theTable in the loop.
		 */
		if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
			return (0);

		p = theTable;
		q = p + (numElem-1);
		while (p <= q) {
			divider = p + ((q - p) >> 1);	/* divide by 2 */
			if (character < divider->_key)
				q = divider - 1;
			else if (character > divider->_key)
				p = divider + 1;
			else
				return (divider->_value);
		}
		return (0);
	}
	884
	885
	886	static u_int32_t
	887	unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
	888	{
	889	u_int16_t value;
	890	u_int32_t length;
	891	u_int16_t firstChar;
	892	u_int16_t theChar;
	893	const u_int16_t *bmpMappings;
	894	u_int32_t usedLength;
	895
	896	value = getmappedvalue16(
	897	(const unicode_mappings16 *)__CFUniCharDecompositionTable,
	898	__UniCharDecompositionTableLength, character);
	899	length = EXTRACT_COUNT(value);
	900	firstChar = value & 0x0FFF;
	901	theChar = firstChar;
	902	bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
	903	usedLength = 0;
	904
	905	if (value & RECURSIVE_DECOMPOSITION) {
	906	usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
	907
	908	--length; /* Decrement for the first char */
	909	if (!usedLength)
	910	return 0;
	911	++bmpMappings;
	912	convertedChars += usedLength;
	913	}
	914
	915	usedLength += length;
	916
	917	while (length--)
	918	(convertedChars++) = (bmpMappings++);
	919
	920	return (usedLength);
	921	}
	922
	/* Hangul syllable and conjoining jamo bases */
	#define HANGUL_SBASE 0xAC00
	#define HANGUL_LBASE 0x1100
	#define HANGUL_VBASE 0x1161
	#define HANGUL_TBASE 0x11A7

	/* Counts of syllables and of leading/vowel/trailing jamo */
	#define HANGUL_SCOUNT 11172
	#define HANGUL_LCOUNT 19
	#define HANGUL_VCOUNT 21
	#define HANGUL_TCOUNT 28
	#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)

	/*
	 * unicode_decompose - decompose a composed Unicode char
	 *
	 * Composed Unicode characters are forbidden on
	 * HFS Plus volumes. unicode_decompose will convert a
	 * composed character into its correct decomposed
	 * sequence.
	 *
	 * Hangul syllables are decomposed arithmetically into two or three
	 * jamo; everything else goes through unicode_recursive_decompose.
	 * Returns the number of units stored in convertedChars.
	 *
	 * Similar to CFUniCharDecomposeCharacter
	 */
	static int
	unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
	{
		/*
		 * The syllable block is [SBASE, SBASE + SCOUNT); the upper bound
		 * is exclusive, so SBASE + SCOUNT itself is not a syllable.
		 */
		if ((character >= HANGUL_SBASE) &&
		    (character < (HANGUL_SBASE + HANGUL_SCOUNT))) {
			u_int32_t length;

			character -= HANGUL_SBASE;
			/* A trailing consonant is present when the T index is non-zero. */
			length = (character % HANGUL_TCOUNT ? 3 : 2);

			*(convertedChars++) =
			    character / HANGUL_NCOUNT + HANGUL_LBASE;
			*(convertedChars++) =
			    (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
			if (length > 2)
				*convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
			return (length);
		} else {
			return (unicode_recursive_decompose(character, convertedChars));
		}
	}
	965
	966	/*
	967	* unicode_combine - generate a precomposed Unicode char
	968	*
	969	* Precomposed Unicode characters are required for some volume
	970	* formats and network protocols. unicode_combine will combine
	971	* a decomposed character sequence into a single precomposed
	972	* (composite) character.
	973	*
	974	* Similar toCFUniCharPrecomposeCharacter but unicode_combine
	975	* also handles Hangul Jamo characters.
	976	*/
	977	static u_int16_t
	978	unicode_combine(u_int16_t base, u_int16_t combining)
	979	{
	980	u_int32_t value;
	981
	982	/* Check HANGUL */
	983	if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
	984	/* 2 char Hangul sequences */
	985	if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
	986	(base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
	987	return (HANGUL_SBASE +
	988	((base - HANGUL_LBASE)(HANGUL_VCOUNTHANGUL_TCOUNT)) +
	989	((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
	990	}
	991
	992	/* 3 char Hangul sequences */
	993	if ((combining > HANGUL_TBASE) &&
	994	(base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
	995	if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
	996	return (0);
	997	else
	998	return (base + (combining - HANGUL_TBASE));
	999	}
	1000	}
	1001
	1002	value = getmappedvalue32(
	1003	(const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
	1004	__CFUniCharPrecompositionTableLength, combining);
	1005
	1006	if (value) {
	1007	value = getmappedvalue16(
	1008	(const unicode_mappings16 *)
	1009	((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
	1010	(value >> 16), base);
	1011	}
	1012	return (value);
	1013	}
	1014
	1015
	/*
	 * priortysort - order combining chars into canonical order
	 *
	 * Bubble-sorts `count' units starting at `characters' by combining
	 * class, in place.  A unit whose combining class is 0 is never moved
	 * ahead of the unit before it.
	 *
	 * Similar to CFUniCharPrioritySort
	 */
	static void
	priortysort(u_int16_t* characters, int count)
	{
		u_int32_t p1, p2;
		u_int16_t *ch1, *ch2;
		u_int16_t *end;
		int changes = 0;

		/* Fewer than two units: nothing to order, and nothing to read. */
		if (count < 2)
			return;

		end = characters + count;
		do {
			changes = 0;
			ch1 = characters;
			ch2 = characters + 1;
			p2 = get_combining_class(*ch1);
			while (ch2 < end) {
				p1 = p2;
				p2 = get_combining_class(*ch2);
				if (p1 > p2 && p2 != 0) {
					u_int32_t tmp;

					/* Swap the adjacent pair. */
					tmp = *ch1;
					*ch1 = *ch2;
					*ch2 = tmp;
					changes = 1;

					/*
					 * Make sure that p2 contains the combining class for the
					 * character now stored at *ch2.  This isn't required for
					 * correctness, but it will be more efficient if a character
					 * with a large combining class has to "bubble past" several
					 * characters with lower combining classes.
					 */
					p2 = p1;
				}
				++ch1;
				++ch2;
			}
		} while (changes);
	}
	1060
	1061
	/*
	 * Invalid NTFS filename characters are encoded using the
	 * SFM (Services for Macintosh) private use Unicode characters.
	 *
	 * These should only be used for SMB, MSDOS or NTFS.
	 *
	 *    Illegal NTFS Char   SFM Unicode Char
	 *  ----------------------------------------
	 *    0x01-0x1f           0xf001-0xf01f
	 *    '"'                 0xf020
	 *    '*'                 0xf021
	 *    '/'                 0xf022
	 *    '<'                 0xf023
	 *    '>'                 0xf024
	 *    '?'                 0xf025
	 *    '\'                 0xf026
	 *    '|'                 0xf027
	 *    ' '                 0xf028  (Only if last char of the name)
	 *    '.'                 0xf029  (Only if last char of the name)
	 *  ----------------------------------------
	 *
	 *  Reference: http://support.microsoft.com/kb/q117258/
	 */

	#define MAX_SFM2MAC           0x29
	#define SFMCODE_PREFIX_MASK   0xf000

	/*
	 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
	 * SFM had no conversion for the colon. There is a conversion for the
	 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
	 * is a slash and a slash is a colon. So we can just replace the slash with the
	 * colon in our tables and everything will just work.
	 */

	/* Indexed by the low six bits of an SFM code (0x00 - MAX_SFM2MAC). */
	static const u_int8_t
	sfm2mac[42] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,  /* 00 - 07 */
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,  /* 08 - 0F */
		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,  /* 10 - 17 */
		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,  /* 18 - 1F */
		0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,  /* 20 - 27 */
		0x20, 0x2e                                       /* 28 - 29 */
	};

	/*
	 * Indexed by (ASCII char - 0x20).  An entry equal to its own character
	 * means "no substitute"; any other entry is the low byte of the SFM code.
	 */
	static const u_int8_t
	mac2sfm[96] = {
		0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,  /* 20 - 27 */
		0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,  /* 28 - 2f */
		0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,  /* 30 - 37 */
		0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,  /* 38 - 3f */
		0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,  /* 40 - 47 */
		0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,  /* 48 - 4f */
		0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,  /* 50 - 57 */
		0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,  /* 58 - 5f */
		0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,  /* 60 - 67 */
		0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,  /* 68 - 6f */
		0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,  /* 70 - 77 */
		0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f   /* 78 - 7f */
	};


	/*
	 * Encode illegal NTFS filename characters into SFM Private Unicode characters
	 *
	 * Assumes non-zero ASCII input.  Input above 0x7f has no SFM substitute
	 * and is returned unchanged.
	 *
	 * lastchar: non-zero when ucs_ch is the final character of the name.
	 */
	static u_int16_t
	ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
	{
		u_int16_t lsb;

		/* The last character of filename cannot be a space or period. */
		if (lastchar) {
			if (ucs_ch == 0x20)
				return (0xf028);
			else if (ucs_ch == 0x2e)
				return (0xf029);
		}
		/* Keep the mac2sfm index below in range. */
		if (ucs_ch > 0x7f)
			return (ucs_ch);

		/* 0x01 - 0x1f is simple transformation. */
		if (ucs_ch <= 0x1f)
			return (ucs_ch | 0xf000);

		/* 0x20 - 0x7f */
		lsb = mac2sfm[ucs_ch - 0x0020];
		if (lsb != ucs_ch)
			return (0xf000 | lsb);
		return (ucs_ch);
	}

	/*
	 * Decode any SFM Private Unicode characters
	 *
	 * Codes 0xf000 - 0xf029 are mapped through sfm2mac; anything else is
	 * returned unchanged.
	 */
	static u_int16_t
	sfm_to_ucs(u_int16_t ucs_ch)
	{
		if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
		    ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
			ucs_ch = sfm2mac[ucs_ch & 0x003f];
		}
		return (ucs_ch);
	}
	1163
	1164