[apple/hfs.git] / livefiles_hfs_plugin / lf_hfs_unicode_wrappers.c

//
//  lf_hfs_unicode_wrappers.c
//  livefiles_hfs
//
//  Created by Yakov Ben Zaken on 22/03/2018.
//

#include "lf_hfs_unicode_wrappers.h"
#include "lf_hfs_ucs_string_cmp_data.h"
#include "lf_hfs_sbunicode.h"


/*
 * Limits on the length of a filename extension, counted in characters and
 * excluding the leading dot.
 */
enum {
    kMinFileExtensionChars = 1,    /* shortest extension, does not include dot (not referenced in the code shown in this file) */
    kMaxFileExtensionChars = 5    /* longest extension, does not include dot */
};


/*
 * True when c is an ASCII letter (0x61-0x7A 'a'-'z', 0x41-0x5A 'A'-'Z') or an
 * ASCII digit (0x30-0x39 '0'-'9').  The argument is evaluated several times,
 * so it must not have side effects.
 */
#define EXTENSIONCHAR(c)    (((c) >= 0x61 && (c) <= 0x7A) || \
                             ((c) >= 0x41 && (c) <= 0x5A) || \
                             ((c) >= 0x30 && (c) <= 0x39))


/*
 * True when c is '0'-'9' or an UPPERCASE 'A'-'F'; lowercase hex letters are
 * not accepted.  The argument is evaluated several times, so it must not have
 * side effects.
 */
#define IsHexDigit(c)        (((c) >= (u_int8_t) '0' && (c) <= (u_int8_t) '9') || \
                              ((c) >= (u_int8_t) 'A' && (c) <= (u_int8_t) 'F'))


/* Forward declaration: copies ".ext" (if any) from a UTF-16 name into extStr. */
static void
GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr );


/* Forward declaration: parses `length` uppercase hex digits into an integer. */
static u_int32_t
HexStringToInteger( u_int32_t length, const u_int8_t *hexStr );


/*
 * Copy the filename extension of a UTF-16 name, dot included, into extStr as
 * a NUL-terminated C string.
 *
 *   length      number of UniChar units in unicodeStr
 *   unicodeStr  the name to inspect
 *   extStr      output buffer; receives "" when there is no extension,
 *               otherwise "." followed by 1..kMaxFileExtensionChars ASCII
 *               letters/digits.  The caller must supply room for at least
 *               kMaxFileExtensionChars + 2 bytes.
 *
 * An extension is recognised only when the name ends with a dot followed by
 * characters that all satisfy EXTENSIONCHAR, and at least one character
 * precedes the dot.
 */
static void
GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr)
{
    u_int64_t    pos;           /* index of the unit examined most recently */
    u_int64_t    k;
    u_int16_t    count;         /* extension characters seen so far (dot excluded) */
    u_int16_t    limit;         /* most extension characters we will accept */
    Boolean      haveExtension = false;

    /* Default result: no extension. */
    extStr[0] = '\0';

    /* "x.y" is the shortest name that can carry an extension. */
    if ( length < 3 )
        return;

    /* Always leave at least one prefix character plus the dot. */
    if ( length >= (kMaxFileExtensionChars + 2) )
        limit = kMaxFileExtensionChars;
    else
        limit = length - 2;

    /* Walk backwards from the last unit until a dot or a disqualifying unit. */
    pos = length;
    for ( count = 0; count <= limit; ++count ) {
        UniChar unit = unicodeStr[--pos];

        if ( unit == (UniChar) '.' ) {
            /* A trailing dot on its own is not an extension. */
            haveExtension = (count > 0);
            break;
        }

        if ( !EXTENSIONCHAR(unit) )
            break;
    }

    if ( !haveExtension )
        return;

    /* pos now indexes the dot: narrow the dot and the extension to bytes. */
    for ( k = 0; k <= count; ++k )
        ((u_int8_t *) extStr)[k] = (u_int8_t) unicodeStr[pos + k];

    extStr[count + 1] = '\0';    /* terminate dot + extension */
}

//
//    FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
//
//        IF                RESULT
//    --------------------------
//    str1 < str2        =>    -1
//    str1 = str2        =>     0
//    str1 > str2        =>    +1
//
//    The lower case table starts with 256 entries (one for each of the upper bytes
//    of the original Unicode char).  If that entry is zero, then all characters with
//    that upper byte are already case folded.  If the entry is non-zero, then it is
//    the _index_ (not byte offset) of the start of the sub-table for the characters
//    with that upper byte.  All ignorable characters are folded to the value zero.
//
//    In pseudocode:
//
//        Let c = source Unicode character
//        Let table[] = lower case table
//
//        lower = table[highbyte(c)]
//        if (lower == 0)
//            lower = c
//        else
//            lower = table[lower+lowbyte(c)]
//
//        if (lower == 0)
//            ignore this character
//
//    To handle ignorable characters, we now need a loop to find the next valid character.
//    Also, we can't pre-compute the number of characters to compare; the string length might
//    be larger than the number of non-ignorable characters.  Further, we must be able to handle
//    ignorable characters at any point in the string, including as the first or last characters.
//    We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
//    Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
//    the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
//    an invalid Unicode character).
//
//    Pseudocode:
//
//        while (1) {
//            c1 = GetNextValidChar(str1)            //    returns zero if at end of string
//            c2 = GetNextValidChar(str2)
//
//            if (c1 != c2) break                    //    found a difference
//
//            if (c1 == 0)                        //    reached end of string on both strings at once?
//                return 0;                        //    yes, so strings are equal
//        }
//
//        // When we get here, c1 != c2.  So, we just need to determine which one is less.
//        if (c1 < c2)
//            return -1;
//        else
//            return 1;
//

int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1,
                            register ConstUniCharArrayPtr str2, register ItemCount length2)
{
    register u_int16_t     c1,c2;
    register u_int16_t     temp;
    register u_int16_t*    lowerCaseTable;

    lowerCaseTable = (u_int16_t*) gLowerCaseTable;

    while (1) {
        /* Set default values for c1, c2 in case there are no more valid chars */
        c1 = 0;
        c2 = 0;

        /* Find next non-ignorable char from str1, or zero if no more */
        while (length1 && c1 == 0) {
            c1 = *(str1++);
            --length1;
            /* check for basic latin first */
            if (c1 < 0x0100) {
                c1 = gLatinCaseFold[c1];
                break;
            }
            /* case fold if neccessary */
            if ((temp = lowerCaseTable[c1>>8]) != 0)
                c1 = lowerCaseTable[temp + (c1 & 0x00FF)];
        }


        /* Find next non-ignorable char from str2, or zero if no more */
        while (length2 && c2 == 0) {
            c2 = *(str2++);
            --length2;
            /* check for basic latin first */
            if (c2 < 0x0100) {
                c2 = gLatinCaseFold[c2];
                break;
            }
            /* case fold if neccessary */
            if ((temp = lowerCaseTable[c2>>8]) != 0)
                c2 = lowerCaseTable[temp + (c2 & 0x00FF)];
        }

        if (c1 != c2)        //    found a difference, so stop looping
            break;

        if (c1 == 0)        //    did we reach the end of both strings at the same time?
            return 0;        //    yes, so strings are equal
    }

    if (c1 < c2)
        return -1;
    else
        return 1;
}


/*
 * UnicodeBinaryCompare
 * Compare two UTF-16 strings and perform case-sensitive (binary) matching against them.
 *
 * Results are emitted like FastUnicodeCompare:
 *
 *
 *        IF                RESULT
 *    --------------------------
 *    str1 < str2        =>    -1
 *    str1 = str2        =>     0
 *    str1 > str2        =>    +1
 *
 * The case matching source code is greatly simplified due to the lack of case-folding
 * in this comparison routine. We compare, in order: the lengths, then do character-by-
 * character comparisons.
 *
 */
int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount len1,
                              register ConstUniCharArrayPtr str2, register ItemCount len2) {
    uint16_t c1 =0;
    uint16_t c2 =0;
    ItemCount string_length;
    int32_t result = 0;

    /* First generate the string length (for comparison purposes) */
    if (len1 < len2) {
        string_length = len1;
        --result;
    }
    else if (len1 > len2) {
        string_length = len2;
        ++result;
    }
    else {
        string_length = len1;
    }

    /* now compare the two string pointers */
    while (string_length--) {
        c1 = *(str1++);
        c2 = *(str2++);

        if (c1 > c2) {
            result = 1;
            break;
        }

        if (c1 < c2) {
            result = -1;
            break;
        }
        /* If equal, iterate to the next two respective chars */
    }

    return result;
}

/*
 * Extract the file ID from a mangled name.
 *
 * A name is treated as mangled when, after removing an optional trailing
 * ".ext" (as reported by CountFilenameExtensionChars), it ends with '#'
 * followed by at least two uppercase hexadecimal digits.
 *
 *   filename      bytes of the name (may be NULL)
 *   length        number of bytes in filename
 *   prefixLength  out: set to the offset of the '#' marker on success,
 *                 0 otherwise
 *
 * Returns the value produced by HexStringToInteger for the digits after '#',
 * or 0 when the name does not look mangled.
 */
HFSCatalogNodeID
GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength)
{
    u_int32_t    extChars;
    u_int32_t    i;        /* same width as length so large values are not truncated */
    u_int8_t     c;

    *prefixLength = 0;

    if ( filename == NULL )
        return 0;

    if ( length < 28 )
        return 0;    /* too small to have been mangled */

    /*
     * length is at least 28 here, so the name is always big enough to hold
     * a file ID (#10) and an extension (.x); no further size test is needed.
     */
    extChars = CountFilenameExtensionChars(filename, length);

    /* skip over dot plus extension characters */
    if ( extChars > 0 )
        length -= (extChars + 1);

    /* scan backwards for file id digits, from the last byte down to index 0 */
    for ( i = length; i > 0; ) {
        c = filename[--i];

        /* look for file ID marker */
        if ( c == '#' ) {
            if ( (length - i) < 3 )
                break;    /* too small to be a file ID */

            *prefixLength = i;
            return HexStringToInteger(length - i - 1, &filename[i+1]);
        }

        if ( !IsHexDigit(c) )
            break;    /* file ID string must have hex digits */
    }

    return 0;
}

/*
 * Count the characters of a trailing filename extension, dot excluded.
 *
 *   filename  bytes of the name
 *   length    number of bytes in filename
 *
 * Returns 1..kMaxFileExtensionChars when the name ends with '.' followed by
 * that many EXTENSIONCHAR bytes and at least one byte precedes the dot.
 * Returns 0 in every other case (name too short, trailing dot, a
 * non-alphanumeric byte before any dot, or an extension that is too long).
 */
u_int32_t
CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length )
{
    u_int16_t  limit;     /* most extension characters we will accept */
    u_int32_t  count;     /* extension characters seen so far */
    u_int32_t  pos;       /* index of the byte examined most recently */

    /* "x.y" is the smallest name that can carry an extension. */
    if ( length < 3 )
        return 0;

    /* Always leave room for at least one prefix byte plus the dot. */
    if ( length >= (kMaxFileExtensionChars + 2) )
        limit = kMaxFileExtensionChars;
    else
        limit = length - 2;

    /* Walk backwards from the last byte. */
    pos = length;
    for ( count = 0; count <= limit; ++count ) {
        const unsigned char ch = filename[--pos];

        /*
         * Leading dot of the extension: the count so far is the answer.
         * It is zero when the name simply ends with a dot, which also
         * means "no extension".
         */
        if ( ch == (u_int8_t) '.' )
            return count;

        if ( !EXTENSIONCHAR(ch) )
            break;
    }

    return 0;
}

/*
 * Convert `length` hexadecimal digits starting at hexStr into an integer.
 *
 * Only '0'-'9' and uppercase 'A'-'F' are accepted; any other byte makes the
 * function return 0.  The string does not need to be NUL-terminated.  Digits
 * beyond the eighth push earlier ones out of the 32-bit result.
 */
static u_int32_t
HexStringToInteger(u_int32_t length, const u_int8_t *hexStr)
{
    u_int32_t    result = 0;

    while ( length-- != 0 ) {
        const u_int8_t    digit = *hexStr++;
        u_int32_t         nibble;

        if ( digit >= '0' && digit <= '9' )
            nibble = (u_int32_t) digit - (u_int32_t) '0';
        else if ( digit >= 'A' && digit <= 'F' )
            nibble = 10 + ((u_int32_t) digit - (u_int32_t) 'A');
        else
            return 0;    /* bad character */

        result = (result << 4) + nibble;
    }

    return result;
}

/*
 * Build a mangled UTF-8 name of the form  <prefix>#<CNID in hex><.ext>.
 *
 *   srcLen        length of srcStr in BYTES
 *   srcStr        UTF-16 source name
 *   maxDstLen     capacity of dstStr in bytes
 *   actualDstLen  out: encoded prefix length plus the suffix lengths
 *   dstStr        out: NUL-terminated mangled name
 *   cnid          catalog node ID appended as "#%X"
 *
 * The extension (if any) is stripped from the source before encoding and
 * re-appended after the file ID.  The result of utf8_encodestr is
 * deliberately ignored; this function always returns noErr.
 *
 * NOTE(review): maxDstLen is assumed to be at least as large as the suffix
 * ("#" + hex ID + extension); a smaller value would wrap the unsigned
 * subtraction below -- confirm against callers.
 */
OSErr
ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen,
                            ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid)
{
    char      idSuffix[15];     /* "#" followed by the CNID in uppercase hex */
    char      extSuffix[15];    /* ".ext", or "" when there is no extension */
    size_t    encodedLen;
    size_t    extLen;
    size_t    suffixLen;
    ByteCount prefixBudget;

    snprintf(idSuffix, sizeof(idSuffix), "#%X", cnid);
    GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extSuffix);

    /* Neither suffix changes from here on, so measure each one once. */
    extLen = strlen(extSuffix);
    suffixLen = extLen + strlen(idSuffix);

    /* Drop the extension units from the source and reserve room for the suffixes. */
    srcLen -= extLen * sizeof(UniChar);
    prefixBudget = maxDstLen - suffixLen;

    (void) utf8_encodestr(srcStr, srcLen, dstStr, &encodedLen, prefixBudget, ':', UTF_ADD_NULL_TERM);

    /* Append "#ID" first, then the original extension. */
    strlcat((char *)dstStr, idSuffix, maxDstLen);
    strlcat((char *)dstStr, extSuffix, maxDstLen);
    *actualDstLen = encodedLen + suffixLen;

    return noErr;
}
Commit	Line	Data
de8ee011 A	1	//
	2	// lf_hfs_unicode_wrappers.c
	3	// livefiles_hfs
	4	//
	5	// Created by Yakov Ben Zaken on 22/03/2018.
	6	//
	7
	8	#include "lf_hfs_unicode_wrappers.h"
	9	#include "lf_hfs_ucs_string_cmp_data.h"
	10	#include "lf_hfs_sbunicode.h"
	11
	12
	13
	14	enum {
	15	kMinFileExtensionChars = 1, /* does not include dot */
	16	kMaxFileExtensionChars = 5 /* does not include dot */
	17	};
	18
	19
	20	#define EXTENSIONCHAR(c) (((c) >= 0x61 && (c) <= 0x7A) \|\| \
	21	((c) >= 0x41 && (c) <= 0x5A) \|\| \
	22	((c) >= 0x30 && (c) <= 0x39))
	23
	24
	25	#define IsHexDigit(c) (((c) >= (u_int8_t) '0' && (c) <= (u_int8_t) '9') \|\| \
	26	((c) >= (u_int8_t) 'A' && (c) <= (u_int8_t) 'F'))
	27
	28
	29	static void
	30	GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr );
	31
	32
	33	static u_int32_t
	34	HexStringToInteger( u_int32_t length, const u_int8_t *hexStr );
	35
	36
	37	/*
	38	* Get filename extension (if any) as a C string
	39	*/
	40	static void
	41	GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr)
	42	{
	43	u_int64_t i;
	44	UniChar c;
	45	u_int16_t extChars; /* number of extension chars (excluding dot) */
	46	u_int16_t maxExtChars;
	47	Boolean foundExtension;
	48
	49	extStr[0] = '\0'; /* assume there's no extension */
	50
	51	if ( length < 3 )
	52	return; /* "x.y" is smallest possible extension */
	53
	54	if ( length < (kMaxFileExtensionChars + 2) )
	55	maxExtChars = length - 2; /* save room for prefix + dot */
	56	else
	57	maxExtChars = kMaxFileExtensionChars;
	58
	59	i = length;
	60	extChars = 0;
	61	foundExtension = false;
	62
	63	while ( extChars <= maxExtChars ) {
	64	c = unicodeStr[--i];
65
66	/* look for leading dot */
67	if ( c == (UniChar) '.' ) {
68	if ( extChars > 0 ) /* cannot end with a dot */
69	foundExtension = true;
70	break;
71	}
72
73	if ( EXTENSIONCHAR(c) )
74	++extChars;
75	else
76	break;
77	}
78
79	/* if we found one then copy it */
80	if ( foundExtension ) {
81	u_int8_t extStrPtr = (u_int8_t )extStr;
82	const UniChar *unicodeStrPtr = &unicodeStr[i];
83
84	for ( i = 0; i <= extChars; ++i )
85	(extStrPtr++) = (u_int8_t) (unicodeStrPtr++);
86	extStr[extChars + 1] = '\0'; /* terminate extension + dot */
87	}
88	}
89
90	//
91	// FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
92	//
93	// IF RESULT
94	// --------------------------
95	// str1 < str2 => -1
96	// str1 = str2 => 0
97	// str1 > str2 => +1
98	//
99	// The lower case table starts with 256 entries (one for each of the upper bytes
100	// of the original Unicode char). If that entry is zero, then all characters with
101	// that upper byte are already case folded. If the entry is non-zero, then it is
102	// the _index_ (not byte offset) of the start of the sub-table for the characters
103	// with that upper byte. All ignorable characters are folded to the value zero.
104	//
105	// In pseudocode:
106	//
107	// Let c = source Unicode character
108	// Let table[] = lower case table
109	//
110	// lower = table[highbyte(c)]
111	// if (lower == 0)
112	// lower = c
113	// else
114	// lower = table[lower+lowbyte(c)]
115	//
116	// if (lower == 0)
117	// ignore this character
118	//
119	// To handle ignorable characters, we now need a loop to find the next valid character.
120	// Also, we can't pre-compute the number of characters to compare; the string length might
121	// be larger than the number of non-ignorable characters. Further, we must be able to handle
122	// ignorable characters at any point in the string, including as the first or last characters.
123	// We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
124	// Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
125	// the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
126	// an invalid Unicode character).
127	//
128	// Pseudocode:
129	//
130	// while (1) {
131	// c1 = GetNextValidChar(str1) // returns zero if at end of string
132	// c2 = GetNextValidChar(str2)
133	//
134	// if (c1 != c2) break // found a difference
135	//
136	// if (c1 == 0) // reached end of string on both strings at once?
137	// return 0; // yes, so strings are equal
138	// }
139	//
140	// // When we get here, c1 != c2. So, we just need to determine which one is less.
141	// if (c1 < c2)
142	// return -1;
143	// else
144	// return 1;
145	//
146
147	int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1,
148	register ConstUniCharArrayPtr str2, register ItemCount length2)
149	{
150	register u_int16_t c1,c2;
151	register u_int16_t temp;
152	register u_int16_t* lowerCaseTable;
153
154	lowerCaseTable = (u_int16_t*) gLowerCaseTable;
155
156	while (1) {
157	/* Set default values for c1, c2 in case there are no more valid chars */
158	c1 = 0;
159	c2 = 0;
160
161	/* Find next non-ignorable char from str1, or zero if no more */
162	while (length1 && c1 == 0) {
163	c1 = *(str1++);
164	--length1;
165	/* check for basic latin first */
166	if (c1 < 0x0100) {
167	c1 = gLatinCaseFold[c1];
168	break;
169	}
170	/* case fold if neccessary */
171	if ((temp = lowerCaseTable[c1>>8]) != 0)
172	c1 = lowerCaseTable[temp + (c1 & 0x00FF)];
173	}
174
175
176	/* Find next non-ignorable char from str2, or zero if no more */
177	while (length2 && c2 == 0) {
178	c2 = *(str2++);
179	--length2;
180	/* check for basic latin first */
181	if (c2 < 0x0100) {
182	c2 = gLatinCaseFold[c2];
183	break;
184	}
185	/* case fold if neccessary */
186	if ((temp = lowerCaseTable[c2>>8]) != 0)
187	c2 = lowerCaseTable[temp + (c2 & 0x00FF)];
188	}
189
190	if (c1 != c2) // found a difference, so stop looping
191	break;
192
193	if (c1 == 0) // did we reach the end of both strings at the same time?
194	return 0; // yes, so strings are equal
195	}
196
197	if (c1 < c2)
198	return -1;
199	else
200	return 1;
201	}
202
203
204	/*
205	* UnicodeBinaryCompare
206	* Compare two UTF-16 strings and perform case-sensitive (binary) matching against them.
207	*
208	* Results are emitted like FastUnicodeCompare:
209	*
210	*
211	* IF RESULT
212	* --------------------------
213	* str1 < str2 => -1
214	* str1 = str2 => 0
215	* str1 > str2 => +1
216	*
217	* The case matching source code is greatly simplified due to the lack of case-folding
218	* in this comparison routine. We compare, in order: the lengths, then do character-by-
219	* character comparisons.
220	*
221	*/
222	int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount len1,
223	register ConstUniCharArrayPtr str2, register ItemCount len2) {
224	uint16_t c1 =0;
225	uint16_t c2 =0;
226	ItemCount string_length;
227	int32_t result = 0;
228
229	/* First generate the string length (for comparison purposes) */
230	if (len1 < len2) {
231	string_length = len1;
232	--result;
233	}
234	else if (len1 > len2) {
235	string_length = len2;
236	++result;
237	}
238	else {
239	string_length = len1;
240	}
241
242	/* now compare the two string pointers */
243	while (string_length--) {
244	c1 = *(str1++);
245	c2 = *(str2++);
246
247	if (c1 > c2) {
248	result = 1;
249	break;
250	}
251
252	if (c1 < c2) {
253	result = -1;
254	break;
255	}
256	/* If equal, iterate to the next two respective chars */
257	}
258
259	return result;
260	}
261
262	/*
263	* extract the file id from a mangled name
264	*/
265	HFSCatalogNodeID
266	GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength)
267	{
268	short extChars;
269	short i;
270	u_int8_t c;
271
272	*prefixLength = 0;
273
274	if ( filename == NULL )
275	return 0;
276
277	if ( length < 28 )
278	return 0; /* too small to have been mangled */
279
280	/* big enough for a file ID (#10) and an extension (.x) ? */
281	if ( length > 5 )
282	extChars = CountFilenameExtensionChars(filename, length);
283	else
284	extChars = 0;
285
286	/* skip over dot plus extension characters */
287	if ( extChars > 0 )
288	length -= (extChars + 1);
289
290	/* scan for file id digits */
291	for ( i = length - 1; i >= 0; --i) {
292	c = filename[i];
293
294	/* look for file ID marker */
295	if ( c == '#' ) {
296	if ( (length - i) < 3 )
297	break; /* too small to be a file ID */
298
299	*prefixLength = i;
300	return HexStringToInteger(length - i - 1, &filename[i+1]);
301	}
302
303	if ( !IsHexDigit(c) )
304	break; /* file ID string must have hex digits */
305	}
306
307	return 0;
308	}
309
310	/*
311	* Count filename extension characters (if any)
312	*/
313	u_int32_t
314	CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length )
315	{
316	UniChar c;
317	u_int16_t maxExtChars;
318
319	if ( length < 3 )
320	return 0; /* "x.y" is smallest possible extension */
321
322	if ( length < (kMaxFileExtensionChars + 2) )
323	maxExtChars = length - 2; /* save room for prefix + dot */
324	else
325	maxExtChars = kMaxFileExtensionChars;
326
327	u_int32_t extChars = 0; /* number of extension chars (excluding dot) - assume there's no extension */
328	u_int32_t i = length - 1; /* index to last ascii character */
329
330	while ( extChars <= maxExtChars ) {
331	c = filename[i--];
332
333	/* look for leading dot */
334	if ( c == (u_int8_t) '.' ) {
335	if ( extChars > 0 ) /* cannot end with a dot */
336	return (extChars);
337
338	break;
339	}
340
341	if ( EXTENSIONCHAR(c) )
342	++extChars;
343	else
344	break;
345	}
346
347	return 0;
348	}
349
350	static u_int32_t
351	HexStringToInteger(u_int32_t length, const u_int8_t *hexStr)
352	{
353	u_int32_t value;
354	u_int32_t i;
355	u_int8_t c;
356	const u_int8_t *p;
357
358	value = 0;
359	p = hexStr;
360
361	for ( i = 0; i < length; ++i ) {
362	c = *p++;
363
364	if (c >= '0' && c <= '9') {
365	value = value << 4;
366	value += (u_int32_t) c - (u_int32_t) '0';
367	} else if (c >= 'A' && c <= 'F') {
368	value = value << 4;
369	value += 10 + ((unsigned int) c - (unsigned int) 'A');
370	} else {
371	return 0; /* bad character */
372	}
373	}
374
375	return value;
376	}
377
378	OSErr
379	ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen,
380	ByteCount actualDstLen, unsigned char dstStr, HFSCatalogNodeID cnid)
381	{
382	ByteCount subMaxLen;
383	size_t utf8len;
384	char fileIDStr[15];
385	char extStr[15];
386
387	snprintf(fileIDStr, sizeof(fileIDStr), "#%X", cnid);
388	GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr);
389
390	/* remove extension chars from source */
391	srcLen -= strlen(extStr) * sizeof(UniChar);
392	subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr));
393
394	(void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', UTF_ADD_NULL_TERM);
395
396	strlcat((char *)dstStr, fileIDStr, maxDstLen);
397	strlcat((char *)dstStr, extStr, maxDstLen);
398	*actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr));
399
400	return noErr;
401	}