X-Git-Url: https://git.saurik.com/apple/hfs.git/blobdiff_plain/de8ee0119e51802e7b74e261b628b6de53b34e6c..ccb1535577c019312b69b95a60bb75c8a3ee22a3:/core/UnicodeWrappers.c diff --git a/core/UnicodeWrappers.c b/core/UnicodeWrappers.c new file mode 100644 index 0000000..8e5b6e6 --- /dev/null +++ b/core/UnicodeWrappers.c @@ -0,0 +1,508 @@ +/* + * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + File: UnicodeWrappers.c + + Contains: Wrapper routines for Unicode conversion and comparison. + +*/ + +#include +#include + +#include "hfs_macos_defs.h" +#include "UCStringCompareData.h" + +#include "FileMgrInternal.h" +#include "HFSUnicodeWrappers.h" + +enum { + kMinFileExtensionChars = 1, /* does not include dot */ + kMaxFileExtensionChars = 5 /* does not include dot */ +}; + + +#define EXTENSIONCHAR(c) (((c) >= 0x61 && (c) <= 0x7A) || \ + ((c) >= 0x41 && (c) <= 0x5A) || \ + ((c) >= 0x30 && (c) <= 0x39)) + + +#define IsHexDigit(c) (((c) >= (u_int8_t) '0' && (c) <= (u_int8_t) '9') || \ + ((c) >= (u_int8_t) 'A' && (c) <= (u_int8_t) 'F')) + + +static void GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr ); + + +static u_int32_t HexStringToInteger( u_int32_t length, const u_int8_t *hexStr ); + + +/* + * Get filename extension (if any) as a C string + */ +static void +GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr) +{ + u_int32_t i; + UniChar c; + u_int16_t extChars; /* number of extension chars (excluding dot) */ + u_int16_t maxExtChars; + Boolean foundExtension; + + extStr[0] = '\0'; /* assume there's no extension */ + + if ( length < 3 ) + return; /* "x.y" is smallest possible extension */ + + if ( length < (kMaxFileExtensionChars + 2) ) + maxExtChars = length - 2; /* save room for prefix + dot */ + else + maxExtChars = kMaxFileExtensionChars; + + i = length; + extChars = 0; + foundExtension = false; + + while ( extChars <= maxExtChars ) { + c = unicodeStr[--i]; + + /* look for leading dot */ + if ( c == (UniChar) '.' ) { + if ( extChars > 0 ) /* cannot end with a dot */ + foundExtension = true; + break; + } + + if ( EXTENSIONCHAR(c) ) + ++extChars; + else + break; + } + + /* if we found one then copy it */ + if ( foundExtension ) { + u_int8_t *extStrPtr = (u_int8_t *)extStr; + const UniChar *unicodeStrPtr = &unicodeStr[i]; + + for ( i = 0; i <= extChars; ++i ) + *(extStrPtr++) = (u_int8_t) *(unicodeStrPtr++); + extStr[extChars + 1] = '\0'; /* terminate extension + dot */ + } +} + + + +/* + * Count filename extension characters (if any) + */ +u_int32_t +CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length ) +{ + u_int32_t i; + UniChar c; + u_int32_t extChars; /* number of extension chars (excluding dot) */ + u_int16_t maxExtChars; + Boolean foundExtension; + + if ( length < 3 ) + return 0; /* "x.y" is smallest possible extension */ + + if ( length < (kMaxFileExtensionChars + 2) ) + maxExtChars = length - 2; /* save room for prefix + dot */ + else + maxExtChars = kMaxFileExtensionChars; + + extChars = 0; /* assume there's no extension */ + i = length - 1; /* index to last ascii character */ + foundExtension = false; + + while ( extChars <= maxExtChars ) { + c = filename[i--]; + + /* look for leading dot */ + if ( c == (u_int8_t) '.' ) { + if ( extChars > 0 ) /* cannot end with a dot */ + return (extChars); + + break; + } + + if ( EXTENSIONCHAR(c) ) + ++extChars; + else + break; + } + + return 0; +} + + +/* + * extract the file id from a mangled name + */ +HFSCatalogNodeID +GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength) +{ + short extChars; + short i; + u_int8_t c; + + *prefixLength = 0; + + if ( filename == NULL ) + return 0; + + if ( length < 28 ) + return 0; /* too small to have been mangled */ + + /* big enough for a file ID (#10) and an extension (.x) ? */ + if ( length > 5 ) + extChars = CountFilenameExtensionChars(filename, length); + else + extChars = 0; + + /* skip over dot plus extension characters */ + if ( extChars > 0 ) + length -= (extChars + 1); + + /* scan for file id digits */ + for ( i = length - 1; i >= 0; --i) { + c = filename[i]; + + /* look for file ID marker */ + if ( c == '#' ) { + if ( (length - i) < 3 ) + break; /* too small to be a file ID */ + + *prefixLength = i; + return HexStringToInteger(length - i - 1, &filename[i+1]); + } + + if ( !IsHexDigit(c) ) + break; /* file ID string must have hex digits */ + } + + return 0; +} + + + +static u_int32_t +HexStringToInteger(u_int32_t length, const u_int8_t *hexStr) +{ + u_int32_t value; + u_int32_t i; + u_int8_t c; + const u_int8_t *p; + + value = 0; + p = hexStr; + + for ( i = 0; i < length; ++i ) { + c = *p++; + + if (c >= '0' && c <= '9') { + value = value << 4; + value += (u_int32_t) c - (u_int32_t) '0'; + } else if (c >= 'A' && c <= 'F') { + value = value << 4; + value += 10 + ((unsigned int) c - (unsigned int) 'A'); + } else { + return 0; /* bad character */ + } + } + + return value; +} + + +/* + * Routine: FastRelString + * + * Output: returns -1 if str1 < str2 + * returns 1 if str1 > str2 + * return 0 if equal + * + */ +int32_t FastRelString( ConstStr255Param str1, ConstStr255Param str2 ) +{ + u_int16_t* compareTable; + int32_t bestGuess; + u_int8_t length, length2; + u_int8_t delta; + + delta = 0; + length = *(str1++); + length2 = *(str2++); + + if (length == length2) + bestGuess = 0; + else if (length < length2) + { + bestGuess = -1; + delta = length2 - length; + } + else + { + bestGuess = 1; + length = length2; + } + + compareTable = (u_int16_t*) gCompareTable; + + while (length--) + { + u_int8_t aChar, bChar; + + aChar = *(str1++); + bChar = *(str2++); + + if (aChar != bChar) // If they don't match exacly, do case conversion + { + u_int16_t aSortWord, bSortWord; + + aSortWord = compareTable[aChar]; + bSortWord = compareTable[bChar]; + + if (aSortWord > bSortWord) + return 1; + + if (aSortWord < bSortWord) + return -1; + } + + // If characters match exactly, then go on to next character immediately without + // doing any extra work. + } + + // if you got to here, then return bestGuess + return bestGuess; +} + + + +// +// FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering +// +// IF RESULT +// -------------------------- +// str1 < str2 => -1 +// str1 = str2 => 0 +// str1 > str2 => +1 +// +// The lower case table starts with 256 entries (one for each of the upper bytes +// of the original Unicode char). If that entry is zero, then all characters with +// that upper byte are already case folded. If the entry is non-zero, then it is +// the _index_ (not byte offset) of the start of the sub-table for the characters +// with that upper byte. All ignorable characters are folded to the value zero. +// +// In pseudocode: +// +// Let c = source Unicode character +// Let table[] = lower case table +// +// lower = table[highbyte(c)] +// if (lower == 0) +// lower = c +// else +// lower = table[lower+lowbyte(c)] +// +// if (lower == 0) +// ignore this character +// +// To handle ignorable characters, we now need a loop to find the next valid character. +// Also, we can't pre-compute the number of characters to compare; the string length might +// be larger than the number of non-ignorable characters. Further, we must be able to handle +// ignorable characters at any point in the string, including as the first or last characters. +// We use a zero value as a sentinel to detect both end-of-string and ignorable characters. +// Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename, +// the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is +// an invalid Unicode character). +// +// Pseudocode: +// +// while (1) { +// c1 = GetNextValidChar(str1) // returns zero if at end of string +// c2 = GetNextValidChar(str2) +// +// if (c1 != c2) break // found a difference +// +// if (c1 == 0) // reached end of string on both strings at once? +// return 0; // yes, so strings are equal +// } +// +// // When we get here, c1 != c2. So, we just need to determine which one is less. +// if (c1 < c2) +// return -1; +// else +// return 1; +// + +int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1, + register ConstUniCharArrayPtr str2, register ItemCount length2) +{ + register u_int16_t c1,c2; + register u_int16_t temp; + register u_int16_t* lowerCaseTable; + + lowerCaseTable = (u_int16_t*) gLowerCaseTable; + + while (1) { + /* Set default values for c1, c2 in case there are no more valid chars */ + c1 = 0; + c2 = 0; + + /* Find next non-ignorable char from str1, or zero if no more */ + while (length1 && c1 == 0) { + c1 = *(str1++); + --length1; + /* check for basic latin first */ + if (c1 < 0x0100) { + c1 = gLatinCaseFold[c1]; + break; + } + /* case fold if neccessary */ + if ((temp = lowerCaseTable[c1>>8]) != 0) + c1 = lowerCaseTable[temp + (c1 & 0x00FF)]; + } + + + /* Find next non-ignorable char from str2, or zero if no more */ + while (length2 && c2 == 0) { + c2 = *(str2++); + --length2; + /* check for basic latin first */ + if (c2 < 0x0100) { + c2 = gLatinCaseFold[c2]; + break; + } + /* case fold if neccessary */ + if ((temp = lowerCaseTable[c2>>8]) != 0) + c2 = lowerCaseTable[temp + (c2 & 0x00FF)]; + } + + if (c1 != c2) // found a difference, so stop looping + break; + + if (c1 == 0) // did we reach the end of both strings at the same time? + return 0; // yes, so strings are equal + } + + if (c1 < c2) + return -1; + else + return 1; +} + +/* + * UnicodeBinaryCompare + * Compare two UTF-16 strings and perform case-sensitive (binary) matching against them. + * + * Results are emitted like FastUnicodeCompare: + * + * + * IF RESULT + * -------------------------- + * str1 < str2 => -1 + * str1 = str2 => 0 + * str1 > str2 => +1 + * + * The case matching source code is greatly simplified due to the lack of case-folding + * in this comparison routine. We compare, in order: the lengths, then do character-by- + * character comparisons. + * + */ +int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount len1, + register ConstUniCharArrayPtr str2, register ItemCount len2) { + uint16_t c1; + uint16_t c2; + int string_length; + int32_t result = 0; + + /* Set default values for the two character pointers */ + c1 = 0; + c2 = 0; + + /* First generate the string length (for comparison purposes) */ + if (len1 < len2) { + string_length = len1; + --result; + } + else if (len1 > len2) { + string_length = len2; + ++result; + } + else { + string_length = len1; + } + + /* now compare the two string pointers */ + while (string_length--) { + c1 = *(str1++); + c2 = *(str2++); + + if (c1 > c2) { + result = 1; + break; + } + + if (c1 < c2) { + result = -1; + break; + } + /* If equal, iterate to the next two respective chars */ + } + + return result; +} + + +OSErr +ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen, + ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid) +{ + ByteCount subMaxLen; + size_t utf8len; + char fileIDStr[15]; + char extStr[15]; + + snprintf(fileIDStr, sizeof(fileIDStr), "#%X", cnid); + GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr); + + /* remove extension chars from source */ + srcLen -= strlen(extStr) * sizeof(UniChar); + subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr)); + + (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', 0); + + strlcat((char *)dstStr, fileIDStr, maxDstLen); + strlcat((char *)dstStr, extStr, maxDstLen); + *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr)); + + return noErr; +}