bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29         File:           UnicodeWrappers.c
  30
  31         Contains:       Wrapper routines for Unicode conversion and comparison.
  32
  33 */
  34 #include <sys/param.h>
  35 #include <sys/utfconv.h>
  36
  37 #include "../../hfs_macos_defs.h"
  38 #include "UCStringCompareData.h"
  39
  40 #include "../headers/FileMgrInternal.h"
  41 #include "../headers/HFSUnicodeWrappers.h"
  42
  43 enum {
  44         kMinFileExtensionChars = 1,     /* does not include dot */
  45         kMaxFileExtensionChars = 5      /* does not include dot */
  46 };
  47
  48
  49 #define EXTENSIONCHAR(c)        (((c) >= 0x61 && (c) <= 0x7A) || \
  50                                  ((c) >= 0x41 && (c) <= 0x5A) || \
  51                                  ((c) >= 0x30 && (c) <= 0x39))
  52
  53
  54 #define IsHexDigit(c)           (((c) >= (UInt8) '0' && (c) <= (UInt8) '9') || \
  55                                  ((c) >= (UInt8) 'A' && (c) <= (UInt8) 'F'))
  56
  57
  58 static void     GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr );
  59
  60
  61 static UInt32   HexStringToInteger( UInt32 length, const UInt8 *hexStr );
  62
  63
  64 /*
  65  * Get filename extension (if any) as a C string
  66  */
  67 static void
  68 GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr)
  69 {
  70         UInt32  i;
  71         UniChar c;
  72         UInt16  extChars;       /* number of extension chars (excluding dot) */
  73         UInt16  maxExtChars;
  74         Boolean foundExtension;
  75
  76         extStr[0] = '\0';       /* assume there's no extension */
  77
  78         if ( length < 3 )
  79                 return;         /* "x.y" is smallest possible extension */
  80
  81         if ( length < (kMaxFileExtensionChars + 2) )
  82                 maxExtChars = length - 2;       /* save room for prefix + dot */
  83         else
  84                 maxExtChars = kMaxFileExtensionChars;
  85
  86         i = length;
  87         extChars = 0;
  88         foundExtension = false;
  89
  90         while ( extChars <= maxExtChars ) {
  91                 c = unicodeStr[--i];
  92
  93                 /* look for leading dot */
  94                 if ( c == (UniChar) '.' ) {
  95                         if ( extChars > 0 )     /* cannot end with a dot */
  96                                 foundExtension = true;
  97                         break;
  98                 }
  99
 100                 if ( EXTENSIONCHAR(c) )
 101                         ++extChars;
 102                 else
 103                         break;
 104         }
 105
 106         /* if we found one then copy it */
 107         if ( foundExtension ) {
 108                 UInt8 *extStrPtr = extStr;
 109                 const UniChar *unicodeStrPtr = &unicodeStr[i];
 110
 111                 for ( i = 0; i <= extChars; ++i )
 112                         *(extStrPtr++) = (UInt8) *(unicodeStrPtr++);
 113                 extStr[extChars + 1] = '\0';    /* terminate extension + dot */
 114         }
 115 }
 116
 117
 118
 119 /*
 120  * Count filename extension characters (if any)
 121  */
 122 static UInt32
 123 CountFilenameExtensionChars( const unsigned char * filename, UInt32 length )
 124 {
 125         UInt32  i;
 126         UniChar c;
 127         UInt32  extChars;       /* number of extension chars (excluding dot) */
 128         UInt16  maxExtChars;
 129         Boolean foundExtension;
 130
 131         if ( length < 3 )
 132                 return 0;       /* "x.y" is smallest possible extension */
 133
 134         if ( length < (kMaxFileExtensionChars + 2) )
 135                 maxExtChars = length - 2;       /* save room for prefix + dot */
 136         else
 137                 maxExtChars = kMaxFileExtensionChars;
 138
 139         extChars = 0;           /* assume there's no extension */
 140         i = length - 1;         /* index to last ascii character */
 141         foundExtension = false;
 142
 143         while ( extChars <= maxExtChars ) {
 144                 c = filename[i--];
 145
 146                 /* look for leading dot */
 147                 if ( c == (UInt8) '.' ) {
 148                         if ( extChars > 0 )     /* cannot end with a dot */
 149                                 return (extChars);
 150
 151                         break;
 152                 }
 153
 154                 if ( EXTENSIONCHAR(c) )
 155                         ++extChars;
 156                 else
 157                         break;
 158         }
 159
 160         return 0;
 161 }
 162
 163
 164 /*
 165  * extract the file id from a mangled name
 166  */
 167 HFSCatalogNodeID
 168 GetEmbeddedFileID(const unsigned char * filename, UInt32 length, UInt32 *prefixLength)
 169 {
 170         short   extChars;
 171         short   i;
 172         UInt8   c;
 173
 174         *prefixLength = 0;
 175
 176         if ( filename == NULL )
 177                 return 0;
 178
 179         if ( length < 28 )
 180                 return 0;       /* too small to have been mangled */
 181
 182         /* big enough for a file ID (#10) and an extension (.x) ? */
 183         if ( length > 5 )
 184                 extChars = CountFilenameExtensionChars(filename, length);
 185         else
 186                 extChars = 0;
 187
 188         /* skip over dot plus extension characters */
 189         if ( extChars > 0 )
 190                 length -= (extChars + 1);
 191
 192         /* scan for file id digits */
 193         for ( i = length - 1; i >= 0; --i) {
 194                 c = filename[i];
 195
 196                 /* look for file ID marker */
 197                 if ( c == '#' ) {
 198                         if ( (length - i) < 3 )
 199                                 break;  /* too small to be a file ID */
 200
 201                         *prefixLength = i;
 202                         return HexStringToInteger(length - i - 1, &filename[i+1]);
 203                 }
 204
 205                 if ( !IsHexDigit(c) )
 206                         break;  /* file ID string must have hex digits */
 207         }
 208
 209         return 0;
 210 }
 211
 212
 213
 214 static UInt32
 215 HexStringToInteger(UInt32 length, const UInt8 *hexStr)
 216 {
 217         UInt32          value;
 218         UInt32          i;
 219         UInt8           c;
 220         const UInt8     *p;
 221
 222         value = 0;
 223         p = hexStr;
 224
 225         for ( i = 0; i < length; ++i ) {
 226                 c = *p++;
 227
 228                 if (c >= '0' && c <= '9') {
 229                         value = value << 4;
 230                         value += (UInt32) c - (UInt32) '0';
 231                 } else if (c >= 'A' && c <= 'F') {
 232                         value = value << 4;
 233                         value += 10 + ((unsigned int) c - (unsigned int) 'A');
 234                 } else {
 235                         return 0;       /* bad character */
 236                 }
 237         }
 238
 239         return value;
 240 }
 241
 242
 243 /*
 244  * Routine:     FastRelString
 245  *
 246  * Output:      returns -1 if str1 < str2
 247  *              returns  1 if str1 > str2
 248  *              return   0 if equal
 249  *
 250  */
 251 SInt32  FastRelString( ConstStr255Param str1, ConstStr255Param str2 )
 252 {
 253         UInt16*                 compareTable;
 254         SInt32                  bestGuess;
 255         UInt8                   length, length2;
 256         UInt8                   delta;
 257
 258         delta = 0;
 259         length = *(str1++);
 260         length2 = *(str2++);
 261
 262         if (length == length2)
 263                 bestGuess = 0;
 264         else if (length < length2)
 265         {
 266                 bestGuess = -1;
 267                 delta = length2 - length;
 268         }
 269         else
 270         {
 271                 bestGuess = 1;
 272                 length = length2;
 273         }
 274
 275         compareTable = (UInt16*) gCompareTable;
 276
 277         while (length--)
 278         {
 279                 UInt8   aChar, bChar;
 280
 281                 aChar = *(str1++);
 282                 bChar = *(str2++);
 283
 284                 if (aChar != bChar)             //      If they don't match exacly, do case conversion
 285                 {
 286                         UInt16  aSortWord, bSortWord;
 287
 288                         aSortWord = compareTable[aChar];
 289                         bSortWord = compareTable[bChar];
 290
 291                         if (aSortWord > bSortWord)
 292                                 return 1;
 293
 294                         if (aSortWord < bSortWord)
 295                                 return -1;
 296                 }
 297
 298                 //      If characters match exactly, then go on to next character immediately without
 299                 //      doing any extra work.
 300         }
 301
 302         //      if you got to here, then return bestGuess
 303         return bestGuess;
 304 }
 305
 306
 307
 308 //
 309 //      FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
 310 //
 311 //          IF                          RESULT
 312 //      --------------------------
 313 //      str1 < str2             =>      -1
 314 //      str1 = str2             =>       0
 315 //      str1 > str2             =>      +1
 316 //
 317 //      The lower case table starts with 256 entries (one for each of the upper bytes
 318 //      of the original Unicode char).  If that entry is zero, then all characters with
 319 //      that upper byte are already case folded.  If the entry is non-zero, then it is
 320 //      the _index_ (not byte offset) of the start of the sub-table for the characters
 321 //      with that upper byte.  All ignorable characters are folded to the value zero.
 322 //
 323 //      In pseudocode:
 324 //
 325 //              Let c = source Unicode character
 326 //              Let table[] = lower case table
 327 //
 328 //              lower = table[highbyte(c)]
 329 //              if (lower == 0)
 330 //                      lower = c
 331 //              else
 332 //                      lower = table[lower+lowbyte(c)]
 333 //
 334 //              if (lower == 0)
 335 //                      ignore this character
 336 //
 337 //      To handle ignorable characters, we now need a loop to find the next valid character.
 338 //      Also, we can't pre-compute the number of characters to compare; the string length might
 339 //      be larger than the number of non-ignorable characters.  Further, we must be able to handle
 340 //      ignorable characters at any point in the string, including as the first or last characters.
 341 //      We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 342 //      Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 343 //      the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 344 //      an invalid Unicode character).
 345 //
 346 //      Pseudocode:
 347 //
 348 //              while (1) {
 349 //                      c1 = GetNextValidChar(str1)                     //      returns zero if at end of string
 350 //                      c2 = GetNextValidChar(str2)
 351 //
 352 //                      if (c1 != c2) break                                     //      found a difference
 353 //
 354 //                      if (c1 == 0)                                            //      reached end of string on both strings at once?
 355 //                              return 0;                                               //      yes, so strings are equal
 356 //              }
 357 //
 358 //              // When we get here, c1 != c2.  So, we just need to determine which one is less.
 359 //              if (c1 < c2)
 360 //                      return -1;
 361 //              else
 362 //                      return 1;
 363 //
 364
 365 SInt32 FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1,
 366                                                         register ConstUniCharArrayPtr str2, register ItemCount length2)
 367 {
 368         register UInt16         c1,c2;
 369         register UInt16         temp;
 370         register UInt16*        lowerCaseTable;
 371
 372         lowerCaseTable = (UInt16*) gLowerCaseTable;
 373
 374         while (1) {
 375                 /* Set default values for c1, c2 in case there are no more valid chars */
 376                 c1 = 0;
 377                 c2 = 0;
 378
 379                 /* Find next non-ignorable char from str1, or zero if no more */
 380                 while (length1 && c1 == 0) {
 381                         c1 = *(str1++);
 382                         --length1;
 383                         /* check for basic latin first */
 384                         if (c1 < 0x0100) {
 385                                 c1 = gLatinCaseFold[c1];
 386                                 break;
 387                         }
 388                         /* case fold if neccessary */
 389                         if ((temp = lowerCaseTable[c1>>8]) != 0)
 390                                 c1 = lowerCaseTable[temp + (c1 & 0x00FF)];
 391                 }
 392
 393
 394                 /* Find next non-ignorable char from str2, or zero if no more */
 395                 while (length2 && c2 == 0) {
 396                         c2 = *(str2++);
 397                         --length2;
 398                         /* check for basic latin first */
 399                         if (c2 < 0x0100) {
 400                                 c2 = gLatinCaseFold[c2];
 401                                 break;
 402                         }
 403                         /* case fold if neccessary */
 404                         if ((temp = lowerCaseTable[c2>>8]) != 0)
 405                                 c2 = lowerCaseTable[temp + (c2 & 0x00FF)];
 406                 }
 407
 408                 if (c1 != c2)           //      found a difference, so stop looping
 409                         break;
 410
 411                 if (c1 == 0)            //      did we reach the end of both strings at the same time?
 412                         return 0;               //      yes, so strings are equal
 413         }
 414
 415         if (c1 < c2)
 416                 return -1;
 417         else
 418                 return 1;
 419 }
 420
 421
 422 OSErr
 423 ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen,
 424                                          ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid)
 425 {
 426         ByteCount subMaxLen;
 427         size_t utf8len;
 428         char fileIDStr[15];
 429         char extStr[15];
 430
 431         sprintf(fileIDStr, "#%X", cnid);
 432         GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr);
 433
 434         /* remove extension chars from source */
 435         srcLen -= strlen(extStr) * sizeof(UniChar);
 436         subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr));
 437
 438         (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', 0);
 439
 440         strcat(dstStr, fileIDStr);
 441         strcat(dstStr, extStr);
 442         *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr));
 443
 444         return noErr;
 445 }
 446