bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. Please obtain a copy of the License at
  10  * http://www.opensource.apple.com/apsl/ and read it before using this
  11  * file.
  12  *
  13  * The Original Code and all software distributed under the License are
  14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18  * Please see the License for the specific language governing rights and
  19  * limitations under the License.
  20  *
  21  * @APPLE_LICENSE_HEADER_END@
  22  */
  23 /*
  24         File:           UnicodeWrappers.c
  25
  26         Contains:       Wrapper routines for Unicode conversion and comparison.
  27
  28 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/utfconv.h>

#include "../../hfs_macos_defs.h"
#include "UCStringCompareData.h"

#include "../headers/FileMgrInternal.h"
#include "../headers/HFSUnicodeWrappers.h"
  37
  38 enum {
  39         kMinFileExtensionChars = 1,     /* does not include dot */
  40         kMaxFileExtensionChars = 5      /* does not include dot */
  41 };
  42
  43
  44 #define EXTENSIONCHAR(c)        (((c) >= 0x61 && (c) <= 0x7A) || \
  45                                  ((c) >= 0x41 && (c) <= 0x5A) || \
  46                                  ((c) >= 0x30 && (c) <= 0x39))
  47
  48
  49 #define IsHexDigit(c)           (((c) >= (UInt8) '0' && (c) <= (UInt8) '9') || \
  50                                  ((c) >= (UInt8) 'A' && (c) <= (UInt8) 'F'))
  51
  52
  53 static void     GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr );
  54
  55
  56 static UInt32   HexStringToInteger( UInt32 length, const UInt8 *hexStr );
  57
  58
  59 /*
  60  * Get filename extension (if any) as a C string
  61  */
  62 static void
  63 GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr)
  64 {
  65         UInt32  i;
  66         UniChar c;
  67         UInt16  extChars;       /* number of extension chars (excluding dot) */
  68         UInt16  maxExtChars;
  69         Boolean foundExtension;
  70
  71         extStr[0] = '\0';       /* assume there's no extension */
  72
  73         if ( length < 3 )
  74                 return;         /* "x.y" is smallest possible extension */
  75
  76         if ( length < (kMaxFileExtensionChars + 2) )
  77                 maxExtChars = length - 2;       /* save room for prefix + dot */
  78         else
  79                 maxExtChars = kMaxFileExtensionChars;
  80
  81         i = length;
  82         extChars = 0;
  83         foundExtension = false;
  84
  85         while ( extChars <= maxExtChars ) {
  86                 c = unicodeStr[--i];
  87
  88                 /* look for leading dot */
  89                 if ( c == (UniChar) '.' ) {
  90                         if ( extChars > 0 )     /* cannot end with a dot */
  91                                 foundExtension = true;
  92                         break;
  93                 }
  94
  95                 if ( EXTENSIONCHAR(c) )
  96                         ++extChars;
  97                 else
  98                         break;
  99         }
 100
 101         /* if we found one then copy it */
 102         if ( foundExtension ) {
 103                 UInt8 *extStrPtr = extStr;
 104                 const UniChar *unicodeStrPtr = &unicodeStr[i];
 105
 106                 for ( i = 0; i <= extChars; ++i )
 107                         *(extStrPtr++) = (UInt8) *(unicodeStrPtr++);
 108                 extStr[extChars + 1] = '\0';    /* terminate extension + dot */
 109         }
 110 }
 111
 112
 113
 114 /*
 115  * Count filename extension characters (if any)
 116  */
 117 static UInt32
 118 CountFilenameExtensionChars( const unsigned char * filename, UInt32 length )
 119 {
 120         UInt32  i;
 121         UniChar c;
 122         UInt32  extChars;       /* number of extension chars (excluding dot) */
 123         UInt16  maxExtChars;
 124         Boolean foundExtension;
 125
 126         if ( length < 3 )
 127                 return 0;       /* "x.y" is smallest possible extension */
 128
 129         if ( length < (kMaxFileExtensionChars + 2) )
 130                 maxExtChars = length - 2;       /* save room for prefix + dot */
 131         else
 132                 maxExtChars = kMaxFileExtensionChars;
 133
 134         extChars = 0;           /* assume there's no extension */
 135         i = length - 1;         /* index to last ascii character */
 136         foundExtension = false;
 137
 138         while ( extChars <= maxExtChars ) {
 139                 c = filename[i--];
 140
 141                 /* look for leading dot */
 142                 if ( c == (UInt8) '.' ) {
 143                         if ( extChars > 0 )     /* cannot end with a dot */
 144                                 return (extChars);
 145
 146                         break;
 147                 }
 148
 149                 if ( EXTENSIONCHAR(c) )
 150                         ++extChars;
 151                 else
 152                         break;
 153         }
 154
 155         return 0;
 156 }
 157
 158
 159 /*
 160  * extract the file id from a mangled name
 161  */
 162 HFSCatalogNodeID
 163 GetEmbeddedFileID(const unsigned char * filename, UInt32 length, UInt32 *prefixLength)
 164 {
 165         short   extChars;
 166         short   i;
 167         UInt8   c;
 168
 169         *prefixLength = 0;
 170
 171         if ( filename == NULL )
 172                 return 0;
 173
 174         if ( length < 28 )
 175                 return 0;       /* too small to have been mangled */
 176
 177         /* big enough for a file ID (#10) and an extension (.x) ? */
 178         if ( length > 5 )
 179                 extChars = CountFilenameExtensionChars(filename, length);
 180         else
 181                 extChars = 0;
 182
 183         /* skip over dot plus extension characters */
 184         if ( extChars > 0 )
 185                 length -= (extChars + 1);
 186
 187         /* scan for file id digits */
 188         for ( i = length - 1; i >= 0; --i) {
 189                 c = filename[i];
 190
 191                 /* look for file ID marker */
 192                 if ( c == '#' ) {
 193                         if ( (length - i) < 3 )
 194                                 break;  /* too small to be a file ID */
 195
 196                         *prefixLength = i;
 197                         return HexStringToInteger(length - i - 1, &filename[i+1]);
 198                 }
 199
 200                 if ( !IsHexDigit(c) )
 201                         break;  /* file ID string must have hex digits */
 202         }
 203
 204         return 0;
 205 }
 206
 207
 208
 209 static UInt32
 210 HexStringToInteger(UInt32 length, const UInt8 *hexStr)
 211 {
 212         UInt32          value;
 213         UInt32          i;
 214         UInt8           c;
 215         const UInt8     *p;
 216
 217         value = 0;
 218         p = hexStr;
 219
 220         for ( i = 0; i < length; ++i ) {
 221                 c = *p++;
 222
 223                 if (c >= '0' && c <= '9') {
 224                         value = value << 4;
 225                         value += (UInt32) c - (UInt32) '0';
 226                 } else if (c >= 'A' && c <= 'F') {
 227                         value = value << 4;
 228                         value += 10 + ((unsigned int) c - (unsigned int) 'A');
 229                 } else {
 230                         return 0;       /* bad character */
 231                 }
 232         }
 233
 234         return value;
 235 }
 236
 237
 238 /*
 239  * Routine:     FastRelString
 240  *
 241  * Output:      returns -1 if str1 < str2
 242  *              returns  1 if str1 > str2
 243  *              return   0 if equal
 244  *
 245  */
 246 SInt32  FastRelString( ConstStr255Param str1, ConstStr255Param str2 )
 247 {
 248         UInt16*                 compareTable;
 249         SInt32                  bestGuess;
 250         UInt8                   length, length2;
 251         UInt8                   delta;
 252
 253         delta = 0;
 254         length = *(str1++);
 255         length2 = *(str2++);
 256
 257         if (length == length2)
 258                 bestGuess = 0;
 259         else if (length < length2)
 260         {
 261                 bestGuess = -1;
 262                 delta = length2 - length;
 263         }
 264         else
 265         {
 266                 bestGuess = 1;
 267                 length = length2;
 268         }
 269
 270         compareTable = (UInt16*) gCompareTable;
 271
 272         while (length--)
 273         {
 274                 UInt8   aChar, bChar;
 275
 276                 aChar = *(str1++);
 277                 bChar = *(str2++);
 278
 279                 if (aChar != bChar)             //      If they don't match exacly, do case conversion
 280                 {
 281                         UInt16  aSortWord, bSortWord;
 282
 283                         aSortWord = compareTable[aChar];
 284                         bSortWord = compareTable[bChar];
 285
 286                         if (aSortWord > bSortWord)
 287                                 return 1;
 288
 289                         if (aSortWord < bSortWord)
 290                                 return -1;
 291                 }
 292
 293                 //      If characters match exactly, then go on to next character immediately without
 294                 //      doing any extra work.
 295         }
 296
 297         //      if you got to here, then return bestGuess
 298         return bestGuess;
 299 }
 300
 301
 302
 303 //
 304 //      FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
 305 //
 306 //          IF                          RESULT
 307 //      --------------------------
 308 //      str1 < str2             =>      -1
 309 //      str1 = str2             =>       0
 310 //      str1 > str2             =>      +1
 311 //
 312 //      The lower case table starts with 256 entries (one for each of the upper bytes
 313 //      of the original Unicode char).  If that entry is zero, then all characters with
 314 //      that upper byte are already case folded.  If the entry is non-zero, then it is
 315 //      the _index_ (not byte offset) of the start of the sub-table for the characters
 316 //      with that upper byte.  All ignorable characters are folded to the value zero.
 317 //
 318 //      In pseudocode:
 319 //
 320 //              Let c = source Unicode character
 321 //              Let table[] = lower case table
 322 //
 323 //              lower = table[highbyte(c)]
 324 //              if (lower == 0)
 325 //                      lower = c
 326 //              else
 327 //                      lower = table[lower+lowbyte(c)]
 328 //
 329 //              if (lower == 0)
 330 //                      ignore this character
 331 //
 332 //      To handle ignorable characters, we now need a loop to find the next valid character.
 333 //      Also, we can't pre-compute the number of characters to compare; the string length might
 334 //      be larger than the number of non-ignorable characters.  Further, we must be able to handle
 335 //      ignorable characters at any point in the string, including as the first or last characters.
 336 //      We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 337 //      Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 338 //      the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 339 //      an invalid Unicode character).
 340 //
 341 //      Pseudocode:
 342 //
 343 //              while (1) {
 344 //                      c1 = GetNextValidChar(str1)                     //      returns zero if at end of string
 345 //                      c2 = GetNextValidChar(str2)
 346 //
 347 //                      if (c1 != c2) break                                     //      found a difference
 348 //
 349 //                      if (c1 == 0)                                            //      reached end of string on both strings at once?
 350 //                              return 0;                                               //      yes, so strings are equal
 351 //              }
 352 //
 353 //              // When we get here, c1 != c2.  So, we just need to determine which one is less.
 354 //              if (c1 < c2)
 355 //                      return -1;
 356 //              else
 357 //                      return 1;
 358 //
 359
 360 SInt32 FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1,
 361                                                         register ConstUniCharArrayPtr str2, register ItemCount length2)
 362 {
 363         register UInt16         c1,c2;
 364         register UInt16         temp;
 365         register UInt16*        lowerCaseTable;
 366
 367         lowerCaseTable = (UInt16*) gLowerCaseTable;
 368
 369         while (1) {
 370                 /* Set default values for c1, c2 in case there are no more valid chars */
 371                 c1 = 0;
 372                 c2 = 0;
 373
 374                 /* Find next non-ignorable char from str1, or zero if no more */
 375                 while (length1 && c1 == 0) {
 376                         c1 = *(str1++);
 377                         --length1;
 378                         /* check for basic latin first */
 379                         if (c1 < 0x0100) {
 380                                 c1 = gLatinCaseFold[c1];
 381                                 break;
 382                         }
 383                         /* case fold if neccessary */
 384                         if ((temp = lowerCaseTable[c1>>8]) != 0)
 385                                 c1 = lowerCaseTable[temp + (c1 & 0x00FF)];
 386                 }
 387
 388
 389                 /* Find next non-ignorable char from str2, or zero if no more */
 390                 while (length2 && c2 == 0) {
 391                         c2 = *(str2++);
 392                         --length2;
 393                         /* check for basic latin first */
 394                         if (c2 < 0x0100) {
 395                                 c2 = gLatinCaseFold[c2];
 396                                 break;
 397                         }
 398                         /* case fold if neccessary */
 399                         if ((temp = lowerCaseTable[c2>>8]) != 0)
 400                                 c2 = lowerCaseTable[temp + (c2 & 0x00FF)];
 401                 }
 402
 403                 if (c1 != c2)           //      found a difference, so stop looping
 404                         break;
 405
 406                 if (c1 == 0)            //      did we reach the end of both strings at the same time?
 407                         return 0;               //      yes, so strings are equal
 408         }
 409
 410         if (c1 < c2)
 411                 return -1;
 412         else
 413                 return 1;
 414 }
 415
 416
 417 OSErr
 418 ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen,
 419                                          ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid)
 420 {
 421         ByteCount subMaxLen;
 422         size_t utf8len;
 423         char fileIDStr[15];
 424         char extStr[15];
 425
 426         sprintf(fileIDStr, "#%X", cnid);
 427         GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr);
 428
 429         /* remove extension chars from source */
 430         srcLen -= strlen(extStr) * sizeof(UniChar);
 431         subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr));
 432
 433         (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', 0);
 434
 435         strcat(dstStr, fileIDStr);
 436         strcat(dstStr, extStr);
 437         *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr));
 438
 439         return noErr;
 440 }
 441