bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /*
  23         File:           UnicodeWrappers.c
  24
  25         Contains:       Wrapper routines for Unicode conversion and comparison.
  26
  27 */
  28 #include <sys/param.h>
  29 #include <sys/utfconv.h>
  30
  31 #include "../../hfs_macos_defs.h"
  32 #include "UCStringCompareData.h"
  33
  34 #include "../headers/FileMgrInternal.h"
  35 #include "../headers/HFSUnicodeWrappers.h"
  36
  37 enum {
  38         kMinFileExtensionChars = 1,     /* does not include dot */
  39         kMaxFileExtensionChars = 5      /* does not include dot */
  40 };
  41
  42
  43 #define EXTENSIONCHAR(c)        (((c) >= 0x61 && (c) <= 0x7A) || \
  44                                  ((c) >= 0x41 && (c) <= 0x5A) || \
  45                                  ((c) >= 0x30 && (c) <= 0x39))
  46
  47
  48 #define IsHexDigit(c)           (((c) >= (UInt8) '0' && (c) <= (UInt8) '9') || \
  49                                  ((c) >= (UInt8) 'A' && (c) <= (UInt8) 'F'))
  50
  51
  52 static void     GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr );
  53
  54
  55 static UInt32   HexStringToInteger( UInt32 length, const UInt8 *hexStr );
  56
  57
  58 /*
  59  * Get filename extension (if any) as a C string
  60  */
  61 static void
  62 GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr)
  63 {
  64         UInt32  i;
  65         UniChar c;
  66         UInt16  extChars;       /* number of extension chars (excluding dot) */
  67         UInt16  maxExtChars;
  68         Boolean foundExtension;
  69
  70         extStr[0] = '\0';       /* assume there's no extension */
  71
  72         if ( length < 3 )
  73                 return;         /* "x.y" is smallest possible extension */
  74
  75         if ( length < (kMaxFileExtensionChars + 2) )
  76                 maxExtChars = length - 2;       /* save room for prefix + dot */
  77         else
  78                 maxExtChars = kMaxFileExtensionChars;
  79
  80         i = length;
  81         extChars = 0;
  82         foundExtension = false;
  83
  84         while ( extChars <= maxExtChars ) {
  85                 c = unicodeStr[--i];
  86
  87                 /* look for leading dot */
  88                 if ( c == (UniChar) '.' ) {
  89                         if ( extChars > 0 )     /* cannot end with a dot */
  90                                 foundExtension = true;
  91                         break;
  92                 }
  93
  94                 if ( EXTENSIONCHAR(c) )
  95                         ++extChars;
  96                 else
  97                         break;
  98         }
  99
 100         /* if we found one then copy it */
 101         if ( foundExtension ) {
 102                 UInt8 *extStrPtr = extStr;
 103                 const UniChar *unicodeStrPtr = &unicodeStr[i];
 104
 105                 for ( i = 0; i <= extChars; ++i )
 106                         *(extStrPtr++) = (UInt8) *(unicodeStrPtr++);
 107                 extStr[extChars + 1] = '\0';    /* terminate extension + dot */
 108         }
 109 }
 110
 111
 112
 113 /*
 114  * Count filename extension characters (if any)
 115  */
 116 static UInt32
 117 CountFilenameExtensionChars( const unsigned char * filename, UInt32 length )
 118 {
 119         UInt32  i;
 120         UniChar c;
 121         UInt32  extChars;       /* number of extension chars (excluding dot) */
 122         UInt16  maxExtChars;
 123         Boolean foundExtension;
 124
 125         if ( length < 3 )
 126                 return 0;       /* "x.y" is smallest possible extension */
 127
 128         if ( length < (kMaxFileExtensionChars + 2) )
 129                 maxExtChars = length - 2;       /* save room for prefix + dot */
 130         else
 131                 maxExtChars = kMaxFileExtensionChars;
 132
 133         extChars = 0;           /* assume there's no extension */
 134         i = length - 1;         /* index to last ascii character */
 135         foundExtension = false;
 136
 137         while ( extChars <= maxExtChars ) {
 138                 c = filename[i--];
 139
 140                 /* look for leading dot */
 141                 if ( c == (UInt8) '.' ) {
 142                         if ( extChars > 0 )     /* cannot end with a dot */
 143                                 return (extChars);
 144
 145                         break;
 146                 }
 147
 148                 if ( EXTENSIONCHAR(c) )
 149                         ++extChars;
 150                 else
 151                         break;
 152         }
 153
 154         return 0;
 155 }
 156
 157
 158 /*
 159  * extract the file id from a mangled name
 160  */
 161 HFSCatalogNodeID
 162 GetEmbeddedFileID(const unsigned char * filename, UInt32 length, UInt32 *prefixLength)
 163 {
 164         short   extChars;
 165         short   i;
 166         UInt8   c;
 167
 168         *prefixLength = 0;
 169
 170         if ( filename == NULL )
 171                 return 0;
 172
 173         if ( length < 28 )
 174                 return 0;       /* too small to have been mangled */
 175
 176         /* big enough for a file ID (#10) and an extension (.x) ? */
 177         if ( length > 5 )
 178                 extChars = CountFilenameExtensionChars(filename, length);
 179         else
 180                 extChars = 0;
 181
 182         /* skip over dot plus extension characters */
 183         if ( extChars > 0 )
 184                 length -= (extChars + 1);
 185
 186         /* scan for file id digits */
 187         for ( i = length - 1; i >= 0; --i) {
 188                 c = filename[i];
 189
 190                 /* look for file ID marker */
 191                 if ( c == '#' ) {
 192                         if ( (length - i) < 3 )
 193                                 break;  /* too small to be a file ID */
 194
 195                         *prefixLength = i;
 196                         return HexStringToInteger(length - i - 1, &filename[i+1]);
 197                 }
 198
 199                 if ( !IsHexDigit(c) )
 200                         break;  /* file ID string must have hex digits */
 201         }
 202
 203         return 0;
 204 }
 205
 206
 207
 208 static UInt32
 209 HexStringToInteger(UInt32 length, const UInt8 *hexStr)
 210 {
 211         UInt32          value;
 212         UInt32          i;
 213         UInt8           c;
 214         const UInt8     *p;
 215
 216         value = 0;
 217         p = hexStr;
 218
 219         for ( i = 0; i < length; ++i ) {
 220                 c = *p++;
 221
 222                 if (c >= '0' && c <= '9') {
 223                         value = value << 4;
 224                         value += (UInt32) c - (UInt32) '0';
 225                 } else if (c >= 'A' && c <= 'F') {
 226                         value = value << 4;
 227                         value += 10 + ((unsigned int) c - (unsigned int) 'A');
 228                 } else {
 229                         return 0;       /* bad character */
 230                 }
 231         }
 232
 233         return value;
 234 }
 235
 236
 237 /*
 238  * Routine:     FastRelString
 239  *
 240  * Output:      returns -1 if str1 < str2
 241  *              returns  1 if str1 > str2
 242  *              return   0 if equal
 243  *
 244  */
 245 SInt32  FastRelString( ConstStr255Param str1, ConstStr255Param str2 )
 246 {
 247         UInt16*                 compareTable;
 248         SInt32                  bestGuess;
 249         UInt8                   length, length2;
 250         UInt8                   delta;
 251
 252         delta = 0;
 253         length = *(str1++);
 254         length2 = *(str2++);
 255
 256         if (length == length2)
 257                 bestGuess = 0;
 258         else if (length < length2)
 259         {
 260                 bestGuess = -1;
 261                 delta = length2 - length;
 262         }
 263         else
 264         {
 265                 bestGuess = 1;
 266                 length = length2;
 267         }
 268
 269         compareTable = (UInt16*) gCompareTable;
 270
 271         while (length--)
 272         {
 273                 UInt8   aChar, bChar;
 274
 275                 aChar = *(str1++);
 276                 bChar = *(str2++);
 277
 278                 if (aChar != bChar)             //      If they don't match exacly, do case conversion
 279                 {
 280                         UInt16  aSortWord, bSortWord;
 281
 282                         aSortWord = compareTable[aChar];
 283                         bSortWord = compareTable[bChar];
 284
 285                         if (aSortWord > bSortWord)
 286                                 return 1;
 287
 288                         if (aSortWord < bSortWord)
 289                                 return -1;
 290                 }
 291
 292                 //      If characters match exactly, then go on to next character immediately without
 293                 //      doing any extra work.
 294         }
 295
 296         //      if you got to here, then return bestGuess
 297         return bestGuess;
 298 }
 299
 300
 301
 302 //
 303 //      FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
 304 //
 305 //          IF                          RESULT
 306 //      --------------------------
 307 //      str1 < str2             =>      -1
 308 //      str1 = str2             =>       0
 309 //      str1 > str2             =>      +1
 310 //
 311 //      The lower case table starts with 256 entries (one for each of the upper bytes
 312 //      of the original Unicode char).  If that entry is zero, then all characters with
 313 //      that upper byte are already case folded.  If the entry is non-zero, then it is
 314 //      the _index_ (not byte offset) of the start of the sub-table for the characters
 315 //      with that upper byte.  All ignorable characters are folded to the value zero.
 316 //
 317 //      In pseudocode:
 318 //
 319 //              Let c = source Unicode character
 320 //              Let table[] = lower case table
 321 //
 322 //              lower = table[highbyte(c)]
 323 //              if (lower == 0)
 324 //                      lower = c
 325 //              else
 326 //                      lower = table[lower+lowbyte(c)]
 327 //
 328 //              if (lower == 0)
 329 //                      ignore this character
 330 //
 331 //      To handle ignorable characters, we now need a loop to find the next valid character.
 332 //      Also, we can't pre-compute the number of characters to compare; the string length might
 333 //      be larger than the number of non-ignorable characters.  Further, we must be able to handle
 334 //      ignorable characters at any point in the string, including as the first or last characters.
 335 //      We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 336 //      Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 337 //      the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 338 //      an invalid Unicode character).
 339 //
 340 //      Pseudocode:
 341 //
 342 //              while (1) {
 343 //                      c1 = GetNextValidChar(str1)                     //      returns zero if at end of string
 344 //                      c2 = GetNextValidChar(str2)
 345 //
 346 //                      if (c1 != c2) break                                     //      found a difference
 347 //
 348 //                      if (c1 == 0)                                            //      reached end of string on both strings at once?
 349 //                              return 0;                                               //      yes, so strings are equal
 350 //              }
 351 //
 352 //              // When we get here, c1 != c2.  So, we just need to determine which one is less.
 353 //              if (c1 < c2)
 354 //                      return -1;
 355 //              else
 356 //                      return 1;
 357 //
 358
 359 SInt32 FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1,
 360                                                         register ConstUniCharArrayPtr str2, register ItemCount length2)
 361 {
 362         register UInt16         c1,c2;
 363         register UInt16         temp;
 364         register UInt16*        lowerCaseTable;
 365
 366         lowerCaseTable = (UInt16*) gLowerCaseTable;
 367
 368         while (1) {
 369                 /* Set default values for c1, c2 in case there are no more valid chars */
 370                 c1 = 0;
 371                 c2 = 0;
 372
 373                 /* Find next non-ignorable char from str1, or zero if no more */
 374                 while (length1 && c1 == 0) {
 375                         c1 = *(str1++);
 376                         --length1;
 377                         /* check for basic latin first */
 378                         if (c1 < 0x0100) {
 379                                 c1 = gLatinCaseFold[c1];
 380                                 break;
 381                         }
 382                         /* case fold if neccessary */
 383                         if ((temp = lowerCaseTable[c1>>8]) != 0)
 384                                 c1 = lowerCaseTable[temp + (c1 & 0x00FF)];
 385                 }
 386
 387
 388                 /* Find next non-ignorable char from str2, or zero if no more */
 389                 while (length2 && c2 == 0) {
 390                         c2 = *(str2++);
 391                         --length2;
 392                         /* check for basic latin first */
 393                         if (c2 < 0x0100) {
 394                                 c2 = gLatinCaseFold[c2];
 395                                 break;
 396                         }
 397                         /* case fold if neccessary */
 398                         if ((temp = lowerCaseTable[c2>>8]) != 0)
 399                                 c2 = lowerCaseTable[temp + (c2 & 0x00FF)];
 400                 }
 401
 402                 if (c1 != c2)           //      found a difference, so stop looping
 403                         break;
 404
 405                 if (c1 == 0)            //      did we reach the end of both strings at the same time?
 406                         return 0;               //      yes, so strings are equal
 407         }
 408
 409         if (c1 < c2)
 410                 return -1;
 411         else
 412                 return 1;
 413 }
 414
 415
 416 OSErr
 417 ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen,
 418                                          ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid)
 419 {
 420         ByteCount subMaxLen;
 421         size_t utf8len;
 422         char fileIDStr[15];
 423         char extStr[15];
 424
 425         sprintf(fileIDStr, "#%X", cnid);
 426         GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr);
 427
 428         /* remove extension chars from source */
 429         srcLen -= strlen(extStr) * sizeof(UniChar);
 430         subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr));
 431
 432         (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', 0);
 433
 434         strcat(dstStr, fileIDStr);
 435         strcat(dstStr, extStr);
 436         *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr));
 437
 438         return noErr;
 439 }
 440