bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c

   1 /*
   2  * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29         File:           UnicodeWrappers.c
  30
  31         Contains:       Wrapper routines for Unicode conversion and comparison.
  32
  33 */
  34
  35 #if HFS
  36 #include <sys/param.h>
  37 #include <sys/utfconv.h>
  38
  39 #include "../../hfs_macos_defs.h"
  40 #include "UCStringCompareData.h"
  41
  42 #include "../headers/FileMgrInternal.h"
  43 #include "../headers/HFSUnicodeWrappers.h"
  44
  45 enum {
  46         kMinFileExtensionChars = 1,     /* does not include dot */
  47         kMaxFileExtensionChars = 5      /* does not include dot */
  48 };
  49
  50
  51 #define EXTENSIONCHAR(c)        (((c) >= 0x61 && (c) <= 0x7A) || \
  52                                  ((c) >= 0x41 && (c) <= 0x5A) || \
  53                                  ((c) >= 0x30 && (c) <= 0x39))
  54
  55
  56 #define IsHexDigit(c)           (((c) >= (u_int8_t) '0' && (c) <= (u_int8_t) '9') || \
  57                                  ((c) >= (u_int8_t) 'A' && (c) <= (u_int8_t) 'F'))
  58
  59
  60 static void     GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr );
  61
  62
  63 static u_int32_t        HexStringToInteger( u_int32_t length, const u_int8_t *hexStr );
  64
  65
  66 /*
  67  * Get filename extension (if any) as a C string
  68  */
  69 static void
  70 GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr)
  71 {
  72         u_int32_t       i;
  73         UniChar c;
  74         u_int16_t       extChars;       /* number of extension chars (excluding dot) */
  75         u_int16_t       maxExtChars;
  76         Boolean foundExtension;
  77
  78         extStr[0] = '\0';       /* assume there's no extension */
  79
  80         if ( length < 3 )
  81                 return;         /* "x.y" is smallest possible extension */
  82
  83         if ( length < (kMaxFileExtensionChars + 2) )
  84                 maxExtChars = length - 2;       /* save room for prefix + dot */
  85         else
  86                 maxExtChars = kMaxFileExtensionChars;
  87
  88         i = length;
  89         extChars = 0;
  90         foundExtension = false;
  91
  92         while ( extChars <= maxExtChars ) {
  93                 c = unicodeStr[--i];
  94
  95                 /* look for leading dot */
  96                 if ( c == (UniChar) '.' ) {
  97                         if ( extChars > 0 )     /* cannot end with a dot */
  98                                 foundExtension = true;
  99                         break;
 100                 }
 101
 102                 if ( EXTENSIONCHAR(c) )
 103                         ++extChars;
 104                 else
 105                         break;
 106         }
 107
 108         /* if we found one then copy it */
 109         if ( foundExtension ) {
 110                 u_int8_t *extStrPtr = (u_int8_t *)extStr;
 111                 const UniChar *unicodeStrPtr = &unicodeStr[i];
 112
 113                 for ( i = 0; i <= extChars; ++i )
 114                         *(extStrPtr++) = (u_int8_t) *(unicodeStrPtr++);
 115                 extStr[extChars + 1] = '\0';    /* terminate extension + dot */
 116         }
 117 }
 118
 119
 120
 121 /*
 122  * Count filename extension characters (if any)
 123  */
 124 static u_int32_t
 125 CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length )
 126 {
 127         u_int32_t       i;
 128         UniChar c;
 129         u_int32_t       extChars;       /* number of extension chars (excluding dot) */
 130         u_int16_t       maxExtChars;
 131         Boolean foundExtension;
 132
 133         if ( length < 3 )
 134                 return 0;       /* "x.y" is smallest possible extension */
 135
 136         if ( length < (kMaxFileExtensionChars + 2) )
 137                 maxExtChars = length - 2;       /* save room for prefix + dot */
 138         else
 139                 maxExtChars = kMaxFileExtensionChars;
 140
 141         extChars = 0;           /* assume there's no extension */
 142         i = length - 1;         /* index to last ascii character */
 143         foundExtension = false;
 144
 145         while ( extChars <= maxExtChars ) {
 146                 c = filename[i--];
 147
 148                 /* look for leading dot */
 149                 if ( c == (u_int8_t) '.' )      {
 150                         if ( extChars > 0 )     /* cannot end with a dot */
 151                                 return (extChars);
 152
 153                         break;
 154                 }
 155
 156                 if ( EXTENSIONCHAR(c) )
 157                         ++extChars;
 158                 else
 159                         break;
 160         }
 161
 162         return 0;
 163 }
 164
 165
 166 /*
 167  * extract the file id from a mangled name
 168  */
 169 HFSCatalogNodeID
 170 GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength)
 171 {
 172         short   extChars;
 173         short   i;
 174         u_int8_t        c;
 175
 176         *prefixLength = 0;
 177
 178         if ( filename == NULL )
 179                 return 0;
 180
 181         if ( length < 28 )
 182                 return 0;       /* too small to have been mangled */
 183
 184         /* big enough for a file ID (#10) and an extension (.x) ? */
 185         if ( length > 5 )
 186                 extChars = CountFilenameExtensionChars(filename, length);
 187         else
 188                 extChars = 0;
 189
 190         /* skip over dot plus extension characters */
 191         if ( extChars > 0 )
 192                 length -= (extChars + 1);
 193
 194         /* scan for file id digits */
 195         for ( i = length - 1; i >= 0; --i) {
 196                 c = filename[i];
 197
 198                 /* look for file ID marker */
 199                 if ( c == '#' ) {
 200                         if ( (length - i) < 3 )
 201                                 break;  /* too small to be a file ID */
 202
 203                         *prefixLength = i;
 204                         return HexStringToInteger(length - i - 1, &filename[i+1]);
 205                 }
 206
 207                 if ( !IsHexDigit(c) )
 208                         break;  /* file ID string must have hex digits */
 209         }
 210
 211         return 0;
 212 }
 213
 214
 215
 216 static u_int32_t
 217 HexStringToInteger(u_int32_t length, const u_int8_t *hexStr)
 218 {
 219         u_int32_t               value;
 220         u_int32_t               i;
 221         u_int8_t                c;
 222         const u_int8_t  *p;
 223
 224         value = 0;
 225         p = hexStr;
 226
 227         for ( i = 0; i < length; ++i ) {
 228                 c = *p++;
 229
 230                 if (c >= '0' && c <= '9') {
 231                         value = value << 4;
 232                         value += (u_int32_t) c - (u_int32_t) '0';
 233                 } else if (c >= 'A' && c <= 'F') {
 234                         value = value << 4;
 235                         value += 10 + ((unsigned int) c - (unsigned int) 'A');
 236                 } else {
 237                         return 0;       /* bad character */
 238                 }
 239         }
 240
 241         return value;
 242 }
 243
 244
 245 /*
 246  * Routine:     FastRelString
 247  *
 248  * Output:      returns -1 if str1 < str2
 249  *              returns  1 if str1 > str2
 250  *              return   0 if equal
 251  *
 252  */
 253 int32_t FastRelString( ConstStr255Param str1, ConstStr255Param str2 )
 254 {
 255         u_int16_t*              compareTable;
 256         int32_t                 bestGuess;
 257         u_int8_t                length, length2;
 258         u_int8_t                delta;
 259
 260         delta = 0;
 261         length = *(str1++);
 262         length2 = *(str2++);
 263
 264         if (length == length2)
 265                 bestGuess = 0;
 266         else if (length < length2)
 267         {
 268                 bestGuess = -1;
 269                 delta = length2 - length;
 270         }
 271         else
 272         {
 273                 bestGuess = 1;
 274                 length = length2;
 275         }
 276
 277         compareTable = (u_int16_t*) gCompareTable;
 278
 279         while (length--)
 280         {
 281                 u_int8_t        aChar, bChar;
 282
 283                 aChar = *(str1++);
 284                 bChar = *(str2++);
 285
 286                 if (aChar != bChar)             //      If they don't match exacly, do case conversion
 287                 {
 288                         u_int16_t       aSortWord, bSortWord;
 289
 290                         aSortWord = compareTable[aChar];
 291                         bSortWord = compareTable[bChar];
 292
 293                         if (aSortWord > bSortWord)
 294                                 return 1;
 295
 296                         if (aSortWord < bSortWord)
 297                                 return -1;
 298                 }
 299
 300                 //      If characters match exactly, then go on to next character immediately without
 301                 //      doing any extra work.
 302         }
 303
 304         //      if you got to here, then return bestGuess
 305         return bestGuess;
 306 }
 307
 308
 309
 310 //
 311 //      FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
 312 //
 313 //          IF                          RESULT
 314 //      --------------------------
 315 //      str1 < str2             =>      -1
 316 //      str1 = str2             =>       0
 317 //      str1 > str2             =>      +1
 318 //
 319 //      The lower case table starts with 256 entries (one for each of the upper bytes
 320 //      of the original Unicode char).  If that entry is zero, then all characters with
 321 //      that upper byte are already case folded.  If the entry is non-zero, then it is
 322 //      the _index_ (not byte offset) of the start of the sub-table for the characters
 323 //      with that upper byte.  All ignorable characters are folded to the value zero.
 324 //
 325 //      In pseudocode:
 326 //
 327 //              Let c = source Unicode character
 328 //              Let table[] = lower case table
 329 //
 330 //              lower = table[highbyte(c)]
 331 //              if (lower == 0)
 332 //                      lower = c
 333 //              else
 334 //                      lower = table[lower+lowbyte(c)]
 335 //
 336 //              if (lower == 0)
 337 //                      ignore this character
 338 //
 339 //      To handle ignorable characters, we now need a loop to find the next valid character.
 340 //      Also, we can't pre-compute the number of characters to compare; the string length might
 341 //      be larger than the number of non-ignorable characters.  Further, we must be able to handle
 342 //      ignorable characters at any point in the string, including as the first or last characters.
 343 //      We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 344 //      Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 345 //      the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 346 //      an invalid Unicode character).
 347 //
 348 //      Pseudocode:
 349 //
 350 //              while (1) {
 351 //                      c1 = GetNextValidChar(str1)                     //      returns zero if at end of string
 352 //                      c2 = GetNextValidChar(str2)
 353 //
 354 //                      if (c1 != c2) break                                     //      found a difference
 355 //
 356 //                      if (c1 == 0)                                            //      reached end of string on both strings at once?
 357 //                              return 0;                                               //      yes, so strings are equal
 358 //              }
 359 //
 360 //              // When we get here, c1 != c2.  So, we just need to determine which one is less.
 361 //              if (c1 < c2)
 362 //                      return -1;
 363 //              else
 364 //                      return 1;
 365 //
 366
 367 int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1,
 368                                                         register ConstUniCharArrayPtr str2, register ItemCount length2)
 369 {
 370         register u_int16_t              c1,c2;
 371         register u_int16_t              temp;
 372         register u_int16_t*     lowerCaseTable;
 373
 374         lowerCaseTable = (u_int16_t*) gLowerCaseTable;
 375
 376         while (1) {
 377                 /* Set default values for c1, c2 in case there are no more valid chars */
 378                 c1 = 0;
 379                 c2 = 0;
 380
 381                 /* Find next non-ignorable char from str1, or zero if no more */
 382                 while (length1 && c1 == 0) {
 383                         c1 = *(str1++);
 384                         --length1;
 385                         /* check for basic latin first */
 386                         if (c1 < 0x0100) {
 387                                 c1 = gLatinCaseFold[c1];
 388                                 break;
 389                         }
 390                         /* case fold if neccessary */
 391                         if ((temp = lowerCaseTable[c1>>8]) != 0)
 392                                 c1 = lowerCaseTable[temp + (c1 & 0x00FF)];
 393                 }
 394
 395
 396                 /* Find next non-ignorable char from str2, or zero if no more */
 397                 while (length2 && c2 == 0) {
 398                         c2 = *(str2++);
 399                         --length2;
 400                         /* check for basic latin first */
 401                         if (c2 < 0x0100) {
 402                                 c2 = gLatinCaseFold[c2];
 403                                 break;
 404                         }
 405                         /* case fold if neccessary */
 406                         if ((temp = lowerCaseTable[c2>>8]) != 0)
 407                                 c2 = lowerCaseTable[temp + (c2 & 0x00FF)];
 408                 }
 409
 410                 if (c1 != c2)           //      found a difference, so stop looping
 411                         break;
 412
 413                 if (c1 == 0)            //      did we reach the end of both strings at the same time?
 414                         return 0;               //      yes, so strings are equal
 415         }
 416
 417         if (c1 < c2)
 418                 return -1;
 419         else
 420                 return 1;
 421 }
 422
 423
 424 OSErr
 425 ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen,
 426                                          ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid)
 427 {
 428         ByteCount subMaxLen;
 429         size_t utf8len;
 430         char fileIDStr[15];
 431         char extStr[15];
 432
 433         snprintf(fileIDStr, sizeof(fileIDStr), "#%X", cnid);
 434         GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr);
 435
 436         /* remove extension chars from source */
 437         srcLen -= strlen(extStr) * sizeof(UniChar);
 438         subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr));
 439
 440         (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', 0);
 441
 442         strlcat((char *)dstStr, fileIDStr, maxDstLen);
 443         strlcat((char *)dstStr, extStr, maxDstLen);
 444         *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr));
 445
 446         return noErr;
 447 }
 448
 449 #else /* not HFS - temp workaround until 4277828 is fixed */
 450 /* stubs for exported routines that aren't present when we build kernel without HFS */
 451
 452 #include <sys/types.h>
 453
 454 int32_t FastUnicodeCompare( void * str1, u_int32_t length1, void * str2, u_int32_t length2 );
 455
 456
 457 int32_t FastUnicodeCompare( __unused void * str1,
 458                                                         __unused u_int32_t length1,
 459                                                         __unused void * str2,
 460                                                         __unused u_int32_t length2 )
 461 {
 462         return( 0 );
 463 }
 464
 465
 466 #endif /* HFS */
 467