[apple/bootx.git] / bootx.tproj / fs.subproj / HFSCompare.c

/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 * 
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 *  HFSCompare.c - Functions for working with and comparing HFS nams.
 *
 *  Copyright (c) 1999-2000 Apple Computer, Inc.
 *
 *  DRI: Josh de Cesare
 */

#include <sl.h>
#include "CaseTables.h"


//_______________________________________________________________________
//
//	Routine:	FastRelString
//
//	Output:		returns -1 if str1 < str2
//				returns  1 if str1 > str2
//				return	 0 if equal
//
//_______________________________________________________________________

int32_t	FastRelString(char *str1, char *str2 )
{
	int32_t  bestGuess;
	u_int8_t length, length2;

	
	length = *(str1++);
	length2 = *(str2++);

	if (length == length2)
		bestGuess = 0;
	else if (length < length2)
		bestGuess = -1;
	else
	{
		bestGuess = 1;
		length = length2;
	}

	while (length--)
	{
		u_int32_t aChar, bChar;

		aChar = *(str1++);
		bChar = *(str2++);
		
		if (aChar != bChar)	/* If they don't match exacly, do case conversion */
		{	
			u_int16_t aSortWord, bSortWord;

			aSortWord = gCompareTable[aChar];
			bSortWord = gCompareTable[bChar];

			if (aSortWord > bSortWord)
				return 1;

			if (aSortWord < bSortWord)
				return -1;
		}
		
		/*
		 * If characters match exactly, then go on to next character
		 * immediately without doing any extra work.
		 */
	}
	
	/* if you got to here, then return bestGuess */
	return bestGuess;
}	


//
//	FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
//
//	    IF				RESULT
//	--------------------------
//	str1 < str2		=>	-1
//	str1 = str2		=>	 0
//	str1 > str2		=>	+1
//
//	The lower case table starts with 256 entries (one for each of the upper bytes
//	of the original Unicode char).  If that entry is zero, then all characters with
//	that upper byte are already case folded.  If the entry is non-zero, then it is
//	the _index_ (not byte offset) of the start of the sub-table for the characters
//	with that upper byte.  All ignorable characters are folded to the value zero.
//
//	In pseudocode:
//
//		Let c = source Unicode character
//		Let table[] = lower case table
//
//		lower = table[highbyte(c)]
//		if (lower == 0)
//			lower = c
//		else
//			lower = table[lower+lowbyte(c)]
//
//		if (lower == 0)
//			ignore this character
//
//	To handle ignorable characters, we now need a loop to find the next valid character.
//	Also, we can't pre-compute the number of characters to compare; the string length might
//	be larger than the number of non-ignorable characters.  Further, we must be able to handle
//	ignorable characters at any point in the string, including as the first or last characters.
//	We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
//	Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
//	the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
//	an invalid Unicode character).
//
//	Pseudocode:
//
//		while (1) {
//			c1 = GetNextValidChar(str1)			//	returns zero if at end of string
//			c2 = GetNextValidChar(str2)
//
//			if (c1 != c2) break					//	found a difference
//
//			if (c1 == 0)						//	reached end of string on both strings at once?
//				return 0;						//	yes, so strings are equal
//		}
//
//		// When we get here, c1 != c2.  So, we just need to determine which one is less.
//		if (c1 < c2)
//			return -1;
//		else
//			return 1;
//

int32_t FastUnicodeCompare (u_int16_t *str1, register u_int32_t length1,
			    u_int16_t *str2, register u_int32_t length2)
{
	register u_int16_t c1,c2;
	register u_int16_t temp;

	while (1) {
		/* Set default values for c1, c2 in case there are no more valid chars */
		c1 = 0;
		c2 = 0;
		
		/* Find next non-ignorable char from str1, or zero if no more */
		while (length1 && c1 == 0) {
			c1 = *(str1++);
			--length1;
			if ((temp = gLowerCaseTable[c1>>8]) != 0)		// is there a subtable for this upper byte?
				c1 = gLowerCaseTable[temp + (c1 & 0x00FF)];	// yes, so fold the char
		}
		
		
		/* Find next non-ignorable char from str2, or zero if no more */
		while (length2 && c2 == 0) {
			c2 = *(str2++);
			--length2;
			if ((temp = gLowerCaseTable[c2>>8]) != 0)		// is there a subtable for this upper byte?
				c2 = gLowerCaseTable[temp + (c2 & 0x00FF)];	// yes, so fold the char
		}
		
		if (c1 != c2)	/* found a difference, so stop looping */
			break;
		
		if (c1 == 0)		/* did we reach the end of both strings at the same time? */
			return 0;	/* yes, so strings are equal */
	}
	
	if (c1 < c2)
		return -1;
	else
		return 1;
}


/*
 * UTF-8 (UCS Transformation Format)
 *
 * The following subset of UTF-8 is used to encode UCS-2 filenames. It
 * requires a maximum of three 3 bytes per UCS-2 character.  Only the
 * shortest encoding required to represent the significant UCS-2 bits
 * is legal.
 * 
 * UTF-8 Multibyte Codes
 *
 * Bytes   Bits   UCS-2 Min   UCS-2 Max   UTF-8 Byte Sequence (binary)
 * -------------------------------------------------------------------
 *   1       7     0x0000      0x007F      0xxxxxxx
 *   2      11     0x0080      0x07FF      110xxxxx 10xxxxxx
 *   3      16     0x0800      0xFFFF      1110xxxx 10xxxxxx 10xxxxxx
 * -------------------------------------------------------------------
 */


/*
 * utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a
 * null terminated UTF-8 string at utf8p.
 *
 * ucslen is the number of UCS-2 input characters (not bytes)
 * bufsize is the size of the output buffer in bytes
 */
void
utf_encodestr(const u_int16_t *ucsp, int ucslen, u_int8_t *utf8p, u_int32_t bufsize)
{
	u_int8_t *bufend;
	u_int16_t ucs_ch;
	
	bufend = utf8p + bufsize;

	while (ucslen-- > 0) {
		ucs_ch = *ucsp++;

		if (ucs_ch < 0x0080) {
			if (utf8p >= bufend)
				break;
			if (ucs_ch == '\0')
				continue;	/* skip over embedded NULLs */
			*utf8p++ = ucs_ch;

		} else if (ucs_ch < 0x800) {
			if ((utf8p + 1) >= bufend)
				break;
			*utf8p++ = (ucs_ch >> 6)   | 0xc0;
			*utf8p++ = (ucs_ch & 0x3f) | 0x80;

		} else {
			if ((utf8p + 2) >= bufend)
				break;
			*utf8p++ = (ucs_ch >> 12)         | 0xe0;
			*utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;
			*utf8p++ = ((ucs_ch) & 0x3f)      | 0x80;
		}
		
	}
	
	*utf8p = '\0';
}


/*
 * utf_decodestr - Decodes the null terminated UTF-8 string at
 * utf8p into a UCS-2 (Unicode) string at ucsp.
 *
 * ucslen is the number of UCS-2 output characters (not bytes)
 * bufsize is the size of the output buffer in bytes
 */
void
utf_decodestr(const u_int8_t *utf8p, u_int16_t *ucsp, u_int16_t *ucslen, u_int32_t bufsize)
{
	u_int16_t *bufstart;
	u_int16_t *bufend;
	u_int16_t ucs_ch;
	u_int8_t  byte;

	bufstart = ucsp;
	bufend = (u_int16_t *)((u_int8_t *)ucsp + bufsize);

	while ((byte = *utf8p++) != '\0') {
		if (ucsp >= bufend)
			break;

		/* check for ascii */
		if (byte < 0x80) {
			ucs_ch = byte;
			
			*ucsp++ = ucs_ch;
			continue;
		}

		switch (byte & 0xf0) {
		/*  2 byte sequence*/
		case 0xc0:
		case 0xd0:
			/* extract bits 6 - 10 from first byte */
			ucs_ch = (byte & 0x1F) << 6;  
			break;
		/* 3 byte sequence*/
		case 0xe0:
			/* extract bits 12 - 15 from first byte */
			ucs_ch = (byte & 0x0F) << 6;

			/* extract bits 6 - 11 from second byte */
			if (((byte = *utf8p++) & 0xc0) != 0x80)
				goto stop;

			ucs_ch += (byte & 0x3F);
			ucs_ch <<= 6;
			break;
		default:
			goto stop;
		}

		/* extract bits 0 - 5 from final byte */
		if (((byte = *utf8p++) & 0xc0) != 0x80)
			goto stop;
		ucs_ch += (byte & 0x3F);  

		*ucsp++ = ucs_ch;
	}
stop:
	*ucslen = ucsp - bufstart;
}
Commit	Line	Data
04fee52e A	1	/*
	2	* Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
	11	*
	12	* This Original Code and all software distributed under the License are
	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22	/*
	23	* HFSCompare.c - Functions for working with and comparing HFS nams.
	24	*
	25	* Copyright (c) 1999-2000 Apple Computer, Inc.
	26	*
	27	* DRI: Josh de Cesare
	28	*/
	29
	30	#include <sl.h>
	31	#include "CaseTables.h"
	32
	33
	34	//_______________________________________________________________________
	35	//
	36	// Routine: FastRelString
	37	//
	38	// Output: returns -1 if str1 < str2
	39	// returns 1 if str1 > str2
	40	// return 0 if equal
	41	//
	42	//_______________________________________________________________________
	43
	44	int32_t FastRelString(char str1, char str2 )
	45	{
	46	int32_t bestGuess;
	47	u_int8_t length, length2;
	48
	49
	50	length = *(str1++);
	51	length2 = *(str2++);
	52
	53	if (length == length2)
	54	bestGuess = 0;
	55	else if (length < length2)
	56	bestGuess = -1;
	57	else
	58	{
	59	bestGuess = 1;
	60	length = length2;
	61	}
	62
	63	while (length--)
	64	{
65	u_int32_t aChar, bChar;
66
67	aChar = *(str1++);
68	bChar = *(str2++);
69
70	if (aChar != bChar) /* If they don't match exacly, do case conversion */
71	{
72	u_int16_t aSortWord, bSortWord;
73
74	aSortWord = gCompareTable[aChar];
75	bSortWord = gCompareTable[bChar];
76
77	if (aSortWord > bSortWord)
78	return 1;
79
80	if (aSortWord < bSortWord)
81	return -1;
82	}
83
84	/*
85	* If characters match exactly, then go on to next character
86	* immediately without doing any extra work.
87	*/
88	}
89
90	/* if you got to here, then return bestGuess */
91	return bestGuess;
92	}
93
94
95
96	//
97	// FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
98	//
99	// IF RESULT
100	// --------------------------
101	// str1 < str2 => -1
102	// str1 = str2 => 0
103	// str1 > str2 => +1
104	//
105	// The lower case table starts with 256 entries (one for each of the upper bytes
106	// of the original Unicode char). If that entry is zero, then all characters with
107	// that upper byte are already case folded. If the entry is non-zero, then it is
108	// the _index_ (not byte offset) of the start of the sub-table for the characters
109	// with that upper byte. All ignorable characters are folded to the value zero.
110	//
111	// In pseudocode:
112	//
113	// Let c = source Unicode character
114	// Let table[] = lower case table
115	//
116	// lower = table[highbyte(c)]
117	// if (lower == 0)
118	// lower = c
119	// else
120	// lower = table[lower+lowbyte(c)]
121	//
122	// if (lower == 0)
123	// ignore this character
124	//
125	// To handle ignorable characters, we now need a loop to find the next valid character.
126	// Also, we can't pre-compute the number of characters to compare; the string length might
127	// be larger than the number of non-ignorable characters. Further, we must be able to handle
128	// ignorable characters at any point in the string, including as the first or last characters.
129	// We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
130	// Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
131	// the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
132	// an invalid Unicode character).
133	//
134	// Pseudocode:
135	//
136	// while (1) {
137	// c1 = GetNextValidChar(str1) // returns zero if at end of string
138	// c2 = GetNextValidChar(str2)
139	//
140	// if (c1 != c2) break // found a difference
141	//
142	// if (c1 == 0) // reached end of string on both strings at once?
143	// return 0; // yes, so strings are equal
144	// }
145	//
146	// // When we get here, c1 != c2. So, we just need to determine which one is less.
147	// if (c1 < c2)
148	// return -1;
149	// else
150	// return 1;
151	//
152
153	int32_t FastUnicodeCompare (u_int16_t *str1, register u_int32_t length1,
154	u_int16_t *str2, register u_int32_t length2)
155	{
156	register u_int16_t c1,c2;
157	register u_int16_t temp;
158
159	while (1) {
160	/* Set default values for c1, c2 in case there are no more valid chars */
161	c1 = 0;
162	c2 = 0;
163
164	/* Find next non-ignorable char from str1, or zero if no more */
165	while (length1 && c1 == 0) {
166	c1 = *(str1++);
167	--length1;
168	if ((temp = gLowerCaseTable[c1>>8]) != 0) // is there a subtable for this upper byte?
169	c1 = gLowerCaseTable[temp + (c1 & 0x00FF)]; // yes, so fold the char
170	}
171
172
173	/* Find next non-ignorable char from str2, or zero if no more */
174	while (length2 && c2 == 0) {
175	c2 = *(str2++);
176	--length2;
177	if ((temp = gLowerCaseTable[c2>>8]) != 0) // is there a subtable for this upper byte?
178	c2 = gLowerCaseTable[temp + (c2 & 0x00FF)]; // yes, so fold the char
179	}
180
181	if (c1 != c2) /* found a difference, so stop looping */
182	break;
183
184	if (c1 == 0) /* did we reach the end of both strings at the same time? */
185	return 0; /* yes, so strings are equal */
186	}
187
188	if (c1 < c2)
189	return -1;
190	else
191	return 1;
192	}
193
194
195	/*
196	* UTF-8 (UCS Transformation Format)
197	*
198	* The following subset of UTF-8 is used to encode UCS-2 filenames. It
199	* requires a maximum of three 3 bytes per UCS-2 character. Only the
200	* shortest encoding required to represent the significant UCS-2 bits
201	* is legal.
202	*
203	* UTF-8 Multibyte Codes
204	*
205	* Bytes Bits UCS-2 Min UCS-2 Max UTF-8 Byte Sequence (binary)
206	* -------------------------------------------------------------------
207	* 1 7 0x0000 0x007F 0xxxxxxx
208	* 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
209	* 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
210	* -------------------------------------------------------------------
211	*/
212
213
214	/*
215	* utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a
216	* null terminated UTF-8 string at utf8p.
217	*
218	* ucslen is the number of UCS-2 input characters (not bytes)
219	* bufsize is the size of the output buffer in bytes
220	*/
221	void
222	utf_encodestr(const u_int16_t ucsp, int ucslen, u_int8_t utf8p, u_int32_t bufsize)
223	{
224	u_int8_t *bufend;
225	u_int16_t ucs_ch;
226
227	bufend = utf8p + bufsize;
228
229	while (ucslen-- > 0) {
230	ucs_ch = *ucsp++;
231
232	if (ucs_ch < 0x0080) {
233	if (utf8p >= bufend)
234	break;
235	if (ucs_ch == '\0')
236	continue; /* skip over embedded NULLs */
237	*utf8p++ = ucs_ch;
238
239	} else if (ucs_ch < 0x800) {
240	if ((utf8p + 1) >= bufend)
241	break;
242	*utf8p++ = (ucs_ch >> 6) \| 0xc0;
243	*utf8p++ = (ucs_ch & 0x3f) \| 0x80;
244
245	} else {
246	if ((utf8p + 2) >= bufend)
247	break;
248	*utf8p++ = (ucs_ch >> 12) \| 0xe0;
249	*utf8p++ = ((ucs_ch >> 6) & 0x3f) \| 0x80;
250	*utf8p++ = ((ucs_ch) & 0x3f) \| 0x80;
251	}
252
253	}
254
255	*utf8p = '\0';
256	}
257
258
259	/*
260	* utf_decodestr - Decodes the null terminated UTF-8 string at
261	* utf8p into a UCS-2 (Unicode) string at ucsp.
262	*
263	* ucslen is the number of UCS-2 output characters (not bytes)
264	* bufsize is the size of the output buffer in bytes
265	*/
266	void
267	utf_decodestr(const u_int8_t utf8p, u_int16_t ucsp, u_int16_t *ucslen, u_int32_t bufsize)
268	{
269	u_int16_t *bufstart;
270	u_int16_t *bufend;
271	u_int16_t ucs_ch;
272	u_int8_t byte;
273
274	bufstart = ucsp;
275	bufend = (u_int16_t )((u_int8_t )ucsp + bufsize);
276
277	while ((byte = *utf8p++) != '\0') {
278	if (ucsp >= bufend)
279	break;
280
281	/* check for ascii */
282	if (byte < 0x80) {
283	ucs_ch = byte;
284
285	*ucsp++ = ucs_ch;
286	continue;
287	}
288
289	switch (byte & 0xf0) {
290	/* 2 byte sequence*/
291	case 0xc0:
292	case 0xd0:
293	/* extract bits 6 - 10 from first byte */
294	ucs_ch = (byte & 0x1F) << 6;
295	break;
296	/* 3 byte sequence*/
297	case 0xe0:
298	/* extract bits 12 - 15 from first byte */
299	ucs_ch = (byte & 0x0F) << 6;
300
301	/* extract bits 6 - 11 from second byte */
302	if (((byte = *utf8p++) & 0xc0) != 0x80)
303	goto stop;
304
305	ucs_ch += (byte & 0x3F);
306	ucs_ch <<= 6;
307	break;
308	default:
309	goto stop;
310	}
311
312	/* extract bits 0 - 5 from final byte */
313	if (((byte = *utf8p++) & 0xc0) != 0x80)
314	goto stop;
315	ucs_ch += (byte & 0x3F);
316
317	*ucsp++ = ucs_ch;
318	}
319	stop:
320	*ucslen = ucsp - bufstart;
321	}