git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
	7	*
	8	* This file contains Original Code and/or Modifications of Original Code
	9	* as defined in and that are subject to the Apple Public Source License
	10	* Version 2.0 (the 'License'). You may not use this file except in
	11	* compliance with the License. Please obtain a copy of the License at
	12	* http://www.opensource.apple.com/apsl/ and read it before using this
	13	* file.
	14	*
	15	* The Original Code and all software distributed under the License are
	16	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	17	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	18	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	19	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	20	* Please see the License for the specific language governing rights and
	21	* limitations under the License.
	22	*
	23	* @APPLE_LICENSE_HEADER_END@
	24	*/
	25
	26	/*
	27	Includes Unicode 3.2 decomposition code derived from Core Foundation
	28	*/
	29
	30	#include <sys/param.h>
	31	#include <sys/utfconv.h>
	32	#include <sys/errno.h>
	33	#include <architecture/byte_order.h>
	34
	/*
	 * UTF-8 (Unicode Transformation Format)
	 *
	 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
	 * character as a sequence of one to four bytes. Only the shortest form
	 * required to represent the significant Unicode bits is legal.
	 *
	 * UTF-8 Multibyte Codes
	 *
	 * Bytes   Bits   Unicode Min   Unicode Max   UTF-8 Byte Sequence (binary)
	 * -----------------------------------------------------------------------------
	 *   1       7       0x0000        0x007F    0xxxxxxx
	 *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
	 *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
	 *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	 * -----------------------------------------------------------------------------
	 */
	52
	53
	/*
	 * Number of UTF-8 bytes needed for one UTF-16 code unit.
	 *
	 * A surrogate code unit (0xD800-0xDFFF) is counted as 2 bytes so that a
	 * high/low surrogate pair adds up to the 4 bytes of its UTF-8 encoding.
	 * The argument is evaluated more than once; do not pass an expression
	 * that has side effects.
	 */
	#define UNICODE_TO_UTF8_LEN(c)  \
		((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))

	/* Code unit stored in place of an embedded NUL (U+2400) */
	#define UCS_ALT_NULL	0x2400

	/* Surrogate Pair Constants */
	#define SP_HALF_SHIFT	10		/* payload bits carried by each surrogate */
	#define SP_HALF_BASE	0x0010000UL	/* first code point encoded with a pair */
	#define SP_HALF_MASK	0x3FFUL		/* mask for the low 10 payload bits */

	#define SP_HIGH_FIRST	0xD800UL	/* high (leading) surrogate range */
	#define SP_HIGH_LAST	0xDBFFUL
	#define SP_LOW_FIRST	0xDC00UL	/* low (trailing) surrogate range */
	#define SP_LOW_LAST	0xDFFFUL
	68
	69
	70	#include "vfs_utfconvdata.h"
	71
	72
	73	/*
	74	* Test for a combining character.
	75	*
	76	* Similar to __CFUniCharIsNonBaseCharacter except that
	77	* unicode_combinable also includes Hangul Jamo characters.
	78	*/
	79	static inline int
	80	unicode_combinable(u_int16_t character)
	81	{
	82	const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
	83	u_int8_t value;
	84
	85	if (character < 0x0300)
	86	return (0);
	87
	88	value = bitmap[(character >> 8) & 0xFF];
	89
	90	if (value == 0xFF) {
	91	return (1);
	92	} else if (value) {
	93	bitmap = bitmap + ((value - 1) * 32) + 256;
	94	return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
	95	}
	96	return (0);
	97	}
	98
	99	/*
	100	* Test for a precomposed character.
	101	*
	102	* Similar to __CFUniCharIsDecomposableCharacter.
	103	*/
	104	static inline int
	105	unicode_decomposeable(u_int16_t character) {
	106	const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
	107	u_int8_t value;
	108
	109	if (character < 0x00C0)
	110	return (0);
	111
	112	value = bitmap[(character >> 8) & 0xFF];
	113
	114	if (value == 0xFF) {
	115	return (1);
	116	} else if (value) {
	117	bitmap = bitmap + ((value - 1) * 32) + 256;
	118	return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
	119	}
	120	return (0);
	121	}
	122
	/*
	 * Forward declarations for the composition helpers defined later in
	 * this file.
	 */

	/* Writes the decomposition to convertedChars; returns the unit count */
	static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);

	/* Returns the precomposed character, or 0 if base/combining do not combine */
	static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);
	126
	127
	/*
	 * Number of bytes that follow a UTF-8 lead byte, indexed by the top
	 * five bits of that byte (byte >> 3).
	 *
	 *   0x00-0x7F  ->  0   single-byte (ASCII)
	 *   0x80-0xBF  -> -1   continuation byte, not a valid lead byte
	 *   0xC0-0xDF  ->  1
	 *   0xE0-0xEF  ->  2
	 *   0xF0-0xF7  ->  3
	 *   0xF8-0xFF  -> -1   not a valid lead byte
	 *
	 * NOTE: the element type is plain char; where char is unsigned the -1
	 * entries read back as 255, so compare through (signed char) or treat
	 * any value outside 0..3 as invalid.
	 */
	char utf_extrabytes[32] = {
		 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
		-1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
	};
	132
	133
	134	/*
	135	* utf8_encodelen - Calculates the UTF-8 encoding length for a Unicode filename
	136	*
	137	* NOTES:
	138	* If '/' chars are allowed on disk then an alternate
	139	* (replacement) char must be provided in altslash.
	140	*
	141	* input flags:
	142	* UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
	143	*/
	144	size_t
	145	utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
	146	int flags)
	147	{
	148	u_int16_t ucs_ch;
	149	int charcnt;
	150	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
	151	size_t len;
	152
	153	charcnt = ucslen / 2;
	154	len = 0;
	155
	156	while (charcnt-- > 0) {
	157	ucs_ch = *ucsp++;
	158
	159	if (swapbytes)
	160	ucs_ch = NXSwapShort(ucs_ch);
	161	if (ucs_ch == '/')
	162	ucs_ch = altslash ? altslash : '_';
	163	else if (ucs_ch == '\0')
	164	ucs_ch = UCS_ALT_NULL;
	165
	166	len += UNICODE_TO_UTF8_LEN(ucs_ch);
	167	}
	168
	169	return (len);
	170	}
	171
	172
	173	/*
	174	* utf8_encodestr - Encodes a Unicode string to UTF-8
	175	*
	176	* NOTES:
	177	* The resulting UTF-8 string is NULL terminated.
	178	*
	179	* If '/' chars are allowed on disk then an alternate
	180	* (replacement) char must be provided in altslash.
	181	*
	182	* input flags:
	183	* UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
	184	* UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
	185	*
	186	* result:
	187	* ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
	188	* EINVAL: Illegal char found; char was replaced by an '_'.
	189	*/
	190	int
	191	utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
	192	size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
	193	{
	194	u_int8_t * bufstart;
	195	u_int8_t * bufend;
	196	u_int16_t ucs_ch;
	197	u_int16_t * chp = NULL;
	198	u_int16_t sequence[8];
	199	int extra = 0;
	200	int charcnt;
	201	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
	202	int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
	203	int decompose = (flags & UTF_DECOMPOSED);
	204	int result = 0;
	205
	206	bufstart = utf8p;
	207	bufend = bufstart + buflen;
	208	if (nullterm)
	209	--bufend;
	210	charcnt = ucslen / 2;
	211
	212	while (charcnt-- > 0) {
	213	if (extra > 0) {
	214	--extra;
	215	ucs_ch = *chp++;
	216	} else {
	217	ucs_ch = swapbytes ? NXSwapShort(ucsp++) : ucsp++;
	218
	219	if (decompose && unicode_decomposeable(ucs_ch)) {
	220	extra = unicode_decompose(ucs_ch, sequence) - 1;
	221	charcnt += extra;
	222	ucs_ch = sequence[0];
	223	chp = &sequence[1];
	224	}
	225	}
	226
	227	/* Slash and NULL are not permitted */
	228	if (ucs_ch == '/') {
	229	if (altslash)
	230	ucs_ch = altslash;
	231	else {
	232	ucs_ch = '_';
	233	result = EINVAL;
	234	}
	235	} else if (ucs_ch == '\0') {
	236	ucs_ch = UCS_ALT_NULL;
	237	}
	238
	239	if (ucs_ch < 0x0080) {
	240	if (utf8p >= bufend) {
	241	result = ENAMETOOLONG;
	242	break;
	243	}
	244	*utf8p++ = ucs_ch;
	245
	246	} else if (ucs_ch < 0x800) {
	247	if ((utf8p + 1) >= bufend) {
	248	result = ENAMETOOLONG;
	249	break;
	250	}
	251	*utf8p++ = 0xc0 \| (ucs_ch >> 6);
	252	*utf8p++ = 0x80 \| (0x3f & ucs_ch);
	253
	254	} else {
	255	/* Combine valid surrogate pairs */
	256	if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
	257	&& charcnt > 0) {
	258	u_int16_t ch2;
	259	u_int32_t pair;
	260
	261	ch2 = swapbytes ? NXSwapShort(ucsp) : ucsp;
	262	if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
	263	pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
	264	+ (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
	265	if ((utf8p + 3) >= bufend) {
	266	result = ENAMETOOLONG;
	267	break;
	268	}
	269	--charcnt;
	270	++ucsp;
	271	*utf8p++ = 0xf0 \| (pair >> 18);
	272	*utf8p++ = 0x80 \| (0x3f & (pair >> 12));
	273	*utf8p++ = 0x80 \| (0x3f & (pair >> 6));
	274	*utf8p++ = 0x80 \| (0x3f & pair);
	275	continue;
	276	}
	277	}
	278	if ((utf8p + 2) >= bufend) {
	279	result = ENAMETOOLONG;
	280	break;
	281	}
	282	*utf8p++ = 0xe0 \| (ucs_ch >> 12);
	283	*utf8p++ = 0x80 \| (0x3f & (ucs_ch >> 6));
	284	*utf8p++ = 0x80 \| (0x3f & ucs_ch);
	285	}
	286	}
	287
	288	*utf8len = utf8p - bufstart;
	289	if (nullterm)
	290	*utf8p++ = '\0';
	291
	292	return (result);
	293	}
	294
	295
	296	/*
	297	* utf8_decodestr - Decodes a UTF-8 string back to Unicode
	298	*
	299	* NOTES:
	300	* The input UTF-8 string does not need to be null terminated
	301	* if utf8len is set.
	302	*
	303	* If '/' chars are allowed on disk then an alternate
	304	* (replacement) char must be provided in altslash.
	305	*
	306	* input flags:
	307	* UTF_REV_ENDIAN: Unicode byteorder is oposite current runtime
	308	* UTF_DECOMPOSED: Unicode output string must be fully decompsed
	309	*
	310	* result:
	311	* ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
	312	* EINVAL: Illegal UTF-8 sequence found.
	313	*/
	314	int
	315	utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
	316	size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
	317	{
	318	u_int16_t* bufstart;
	319	u_int16_t* bufend;
	320	u_int16_t ucs_ch;
	321	u_int8_t byte;
	322	int result = 0;
	323	int decompose, precompose, swapbytes;
	324
	325	decompose = (flags & UTF_DECOMPOSED);
	326	precompose = (flags & UTF_PRECOMPOSED);
	327	swapbytes = (flags & UTF_REVERSE_ENDIAN);
	328
	329	bufstart = ucsp;
	330	bufend = (u_int16_t )((u_int8_t )ucsp + buflen);
	331
	332	while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
	333	if (ucsp >= bufend)
	334	goto toolong;
	335
	336	/* check for ascii */
	337	if (byte < 0x80) {
	338	ucs_ch = byte; /* 1st byte */
	339	} else {
	340	u_int32_t ch;
	341	int extrabytes = utf_extrabytes[byte >> 3];
	342
	343	if (utf8len < extrabytes)
	344	goto invalid;
	345	utf8len -= extrabytes;
	346
	347	switch (extrabytes) {
	348	case 1: ch = byte; /* 1st byte */
	349	ch <<= 6;
	350	ch += utf8p++; / 2nd byte */
	351	ch -= 0x00003080UL;
	352	if (ch < 0x0080)
	353	goto invalid;
	354	ucs_ch = ch;
	355	break;
	356
	357	case 2: ch = byte; /* 1st byte */
	358	ch <<= 6;
	359	ch += utf8p++; / 2nd byte */
	360	ch <<= 6;
	361	ch += utf8p++; / 3rd byte */
	362	ch -= 0x000E2080UL;
	363	if (ch < 0x0800)
	364	goto invalid;
	365	ucs_ch = ch;
	366	break;
	367
	368	case 3: ch = byte; /* 1st byte */
	369	ch <<= 6;
	370	ch += utf8p++; / 2nd byte */
	371	ch <<= 6;
	372	ch += utf8p++; / 3rd byte */
	373	ch <<= 6;
	374	ch += utf8p++; / 4th byte */
	375	ch -= 0x03C82080UL + SP_HALF_BASE;
	376	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
	377	*ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
	378	if (ucsp >= bufend)
	379	goto toolong;
	380	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
	381	*ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
	382	continue;
	383
	384	default:
	385	goto invalid;
	386	}
	387	if (decompose) {
	388	if (unicode_decomposeable(ucs_ch)) {
	389	u_int16_t sequence[8];
	390	int count, i;
	391
	392	count = unicode_decompose(ucs_ch, sequence);
	393
	394	for (i = 0; i < count; ++i) {
	395	ucs_ch = sequence[i];
	396	*ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
	397	if (ucsp >= bufend)
	398	goto toolong;
	399	}
	400	continue;
	401	}
	402	} else if (precompose && (ucsp != bufstart)) {
	403	u_int16_t composite, base;
	404
	405	if (unicode_combinable(ucs_ch)) {
	406	base = swapbytes ? NXSwapShort((ucsp - 1)) : (ucsp - 1);
	407	composite = unicode_combine(base, ucs_ch);
	408	if (composite) {
	409	--ucsp;
	410	ucs_ch = composite;
	411	}
	412	}
	413	}
	414	if (ucs_ch == UCS_ALT_NULL)
	415	ucs_ch = '\0';
	416	}
	417	if (ucs_ch == altslash)
	418	ucs_ch = '/';
	419
	420	*ucsp++ = swapbytes ? NXSwapShort(ucs_ch) : ucs_ch;
	421	}
	422
	423	exit:
	424	ucslen = (u_int8_t)ucsp - (u_int8_t*)bufstart;
	425
	426	return (result);
	427
	428	invalid:
	429	result = EINVAL;
	430	goto exit;
	431
	432	toolong:
	433	result = ENAMETOOLONG;
	434	goto exit;
	435	}
	436
	437
	/*
	 * Unicode 3.2 decomposition code (derived from Core Foundation)
	 */

	/* One entry of a sorted key -> value table with 32-bit fields */
	typedef struct {
		u_int32_t _key;
		u_int32_t _value;
	} unicode_mappings32;

	/*
	 * getmappedvalue32 - binary search of a table sorted by ascending _key
	 *
	 * theTable  - first entry of the table
	 * numElem   - number of entries in the table
	 * character - key to look up
	 *
	 * Returns the matching entry's _value, or 0 when the table is empty or
	 * the key is not present.
	 */
	static inline u_int32_t
	getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
	                 u_int16_t character)
	{
		u_int32_t lo, hi, mid;

		/* an empty table has no first or last entry to range-check */
		if (numElem == 0)
			return (0);

		if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
			return (0);

		/* search the half-open index range [lo, hi) */
		lo = 0;
		hi = numElem;
		while (lo < hi) {
			mid = lo + ((hi - lo) >> 1);	/* divide by 2 */
			if (character < theTable[mid]._key)
				hi = mid;
			else if (character > theTable[mid]._key)
				lo = mid + 1;
			else
				return (theTable[mid]._value);
		}
		return (0);
	}
	466
	/*
	 * Fields of a 16-bit decomposition table value:
	 *   bit 15      - the first mapped character must itself be decomposed
	 *   bits 12-14  - number of characters in the mapping
	 */
	#define RECURSIVE_DECOMPOSITION	(1 << 15)
	#define EXTRACT_COUNT(value)	(((value) >> 12) & 0x0007)
	469
	/* One entry of a sorted key -> value table with 16-bit fields */
	typedef struct {
		u_int16_t _key;
		u_int16_t _value;
	} unicode_mappings16;

	/*
	 * getmappedvalue16 - binary search of a table sorted by ascending _key
	 *
	 * theTable  - first entry of the table
	 * numElem   - number of entries in the table
	 * character - key to look up
	 *
	 * Returns the matching entry's _value, or 0 when the table is empty or
	 * the key is not present.
	 */
	static inline u_int16_t
	getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
	                 u_int16_t character)
	{
		u_int32_t lo, hi, mid;

		/* an empty table has no first or last entry to range-check */
		if (numElem == 0)
			return (0);

		if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key))
			return (0);

		/* search the half-open index range [lo, hi) */
		lo = 0;
		hi = numElem;
		while (lo < hi) {
			mid = lo + ((hi - lo) >> 1);	/* divide by 2 */
			if (character < theTable[mid]._key)
				hi = mid;
			else if (character > theTable[mid]._key)
				lo = mid + 1;
			else
				return (theTable[mid]._value);
		}
		return (0);
	}
	497
	498
	499	static u_int32_t
	500	unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
	501	{
	502	u_int16_t value;
	503	u_int32_t length;
	504	u_int16_t firstChar;
	505	u_int16_t theChar;
	506	const u_int16_t *bmpMappings;
	507	u_int32_t usedLength;
	508
	509	value = getmappedvalue16(
	510	(const unicode_mappings16 *)__CFUniCharDecompositionTable,
	511	__UniCharDecompositionTableLength, character);
	512	length = EXTRACT_COUNT(value);
	513	firstChar = value & 0x0FFF;
	514	theChar = firstChar;
	515	bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
	516	usedLength = 0;
	517
	518	if (value & RECURSIVE_DECOMPOSITION) {
	519	usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
	520
	521	--length; /* Decrement for the first char */
	522	if (!usedLength)
	523	return 0;
	524	++bmpMappings;
	525	convertedChars += usedLength;
	526	}
	527
	528	usedLength += length;
	529
	530	while (length--)
	531	(convertedChars++) = (bmpMappings++);
	532
	533	return (usedLength);
	534	}
	535
	/*
	 * Hangul syllable arithmetic constants.  The *BASE values are the code
	 * points the L, V, T and S indices are measured from; the *COUNT values
	 * bound those indices.
	 */
	#define HANGUL_SBASE	0xAC00	/* base for precomposed syllables */
	#define HANGUL_LBASE	0x1100	/* base for leading consonants */
	#define HANGUL_VBASE	0x1161	/* base for vowels */
	#define HANGUL_TBASE	0x11A7	/* base for trailing consonants; index 0 = none */

	#define HANGUL_SCOUNT	11172
	#define HANGUL_LCOUNT	19
	#define HANGUL_VCOUNT	21
	#define HANGUL_TCOUNT	28
	#define HANGUL_NCOUNT	(HANGUL_VCOUNT * HANGUL_TCOUNT)
	546
	547	/*
	548	* unicode_decompose - decompose a composed Unicode char
	549	*
	550	* Composed Unicode characters are forbidden on
	551	* HFS Plus volumes. ucs_decompose will convert a
	552	* composed character into its correct decomposed
	553	* sequence.
	554	*
	555	* Similar to CFUniCharDecomposeCharacter
	556	*/
	557	static int
	558	unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
	559	{
	560	if ((character >= HANGUL_SBASE) &&
	561	(character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
	562	u_int32_t length;
	563
	564	character -= HANGUL_SBASE;
	565	length = (character % HANGUL_TCOUNT ? 3 : 2);
	566
	567	*(convertedChars++) =
	568	character / HANGUL_NCOUNT + HANGUL_LBASE;
	569	*(convertedChars++) =
	570	(character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
	571	if (length > 2)
	572	*convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
	573	return (length);
	574	} else {
	575	return (unicode_recursive_decompose(character, convertedChars));
	576	}
	577	}
	578
	579	/*
	580	* unicode_combine - generate a precomposed Unicode char
	581	*
	582	* Precomposed Unicode characters are required for some volume
	583	* formats and network protocols. unicode_combine will combine
	584	* a decomposed character sequence into a single precomposed
	585	* (composite) character.
	586	*
	587	* Similar toCFUniCharPrecomposeCharacter but unicode_combine
	588	* also handles Hangul Jamo characters.
	589	*/
	590	static u_int16_t
	591	unicode_combine(u_int16_t base, u_int16_t combining)
	592	{
	593	u_int32_t value;
	594
	595	/* Check HANGUL */
	596	if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
	597	/* 2 char Hangul sequences */
	598	if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
	599	(base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
	600	return (HANGUL_SBASE +
	601	((base - HANGUL_LBASE)(HANGUL_VCOUNTHANGUL_TCOUNT)) +
	602	((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
	603	}
	604
	605	/* 3 char Hangul sequences */
	606	if ((combining > HANGUL_TBASE) &&
	607	(base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
	608	if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
	609	return (0);
	610	else
	611	return (base + (combining - HANGUL_TBASE));
	612	}
	613	}
	614
	615	value = getmappedvalue32(
	616	(const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
	617	__CFUniCharPrecompositionTableLength, combining);
	618
	619	if (value) {
	620	value = getmappedvalue16(
	621	(const unicode_mappings16 *)
	622	((u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
	623	(value >> 16), base);
	624	}
	625	return (value);
	626	}
	627