core/UnicodeWrappers.c

   1 /*
   2  * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29         File:           UnicodeWrappers.c
  30
  31         Contains:       Wrapper routines for Unicode conversion and comparison.
  32
  33 */
  34
  35 #include <sys/param.h>
  36 #include <sys/utfconv.h>
  37
  38 #include "hfs_macos_defs.h"
  39 #include "UCStringCompareData.h"
  40
  41 #include "FileMgrInternal.h"
  42 #include "HFSUnicodeWrappers.h"
  43
  44 enum {
  45         kMinFileExtensionChars = 1,     /* does not include dot */
  46         kMaxFileExtensionChars = 5      /* does not include dot */
  47 };
  48
  49
  50 #define EXTENSIONCHAR(c)        (((c) >= 0x61 && (c) <= 0x7A) || \
  51                                  ((c) >= 0x41 && (c) <= 0x5A) || \
  52                                  ((c) >= 0x30 && (c) <= 0x39))
  53
  54
  55 #define IsHexDigit(c)           (((c) >= (u_int8_t) '0' && (c) <= (u_int8_t) '9') || \
  56                                  ((c) >= (u_int8_t) 'A' && (c) <= (u_int8_t) 'F'))
  57
  58
  59 static void     GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr );
  60
  61
  62 static u_int32_t        HexStringToInteger( u_int32_t length, const u_int8_t *hexStr );
  63
  64
  65 /*
  66  * Get filename extension (if any) as a C string
  67  */
  68 static void
  69 GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr)
  70 {
  71         u_int32_t       i;
  72         UniChar c;
  73         u_int16_t       extChars;       /* number of extension chars (excluding dot) */
  74         u_int16_t       maxExtChars;
  75         Boolean foundExtension;
  76
  77         extStr[0] = '\0';       /* assume there's no extension */
  78
  79         if ( length < 3 )
  80                 return;         /* "x.y" is smallest possible extension */
  81
  82         if ( length < (kMaxFileExtensionChars + 2) )
  83                 maxExtChars = length - 2;       /* save room for prefix + dot */
  84         else
  85                 maxExtChars = kMaxFileExtensionChars;
  86
  87         i = length;
  88         extChars = 0;
  89         foundExtension = false;
  90
  91         while ( extChars <= maxExtChars ) {
  92                 c = unicodeStr[--i];
  93
  94                 /* look for leading dot */
  95                 if ( c == (UniChar) '.' ) {
  96                         if ( extChars > 0 )     /* cannot end with a dot */
  97                                 foundExtension = true;
  98                         break;
  99                 }
 100
 101                 if ( EXTENSIONCHAR(c) )
 102                         ++extChars;
 103                 else
 104                         break;
 105         }
 106
 107         /* if we found one then copy it */
 108         if ( foundExtension ) {
 109                 u_int8_t *extStrPtr = (u_int8_t *)extStr;
 110                 const UniChar *unicodeStrPtr = &unicodeStr[i];
 111
 112                 for ( i = 0; i <= extChars; ++i )
 113                         *(extStrPtr++) = (u_int8_t) *(unicodeStrPtr++);
 114                 extStr[extChars + 1] = '\0';    /* terminate extension + dot */
 115         }
 116 }
 117
 118
 119
 120 /*
 121  * Count filename extension characters (if any)
 122  */
 123 u_int32_t
 124 CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length )
 125 {
 126         u_int32_t       i;
 127         UniChar c;
 128         u_int32_t       extChars;       /* number of extension chars (excluding dot) */
 129         u_int16_t       maxExtChars;
 130         Boolean foundExtension;
 131
 132         if ( length < 3 )
 133                 return 0;       /* "x.y" is smallest possible extension */
 134
 135         if ( length < (kMaxFileExtensionChars + 2) )
 136                 maxExtChars = length - 2;       /* save room for prefix + dot */
 137         else
 138                 maxExtChars = kMaxFileExtensionChars;
 139
 140         extChars = 0;           /* assume there's no extension */
 141         i = length - 1;         /* index to last ascii character */
 142         foundExtension = false;
 143
 144         while ( extChars <= maxExtChars ) {
 145                 c = filename[i--];
 146
 147                 /* look for leading dot */
 148                 if ( c == (u_int8_t) '.' )      {
 149                         if ( extChars > 0 )     /* cannot end with a dot */
 150                                 return (extChars);
 151
 152                         break;
 153                 }
 154
 155                 if ( EXTENSIONCHAR(c) )
 156                         ++extChars;
 157                 else
 158                         break;
 159         }
 160
 161         return 0;
 162 }
 163
 164
 165 /*
 166  * extract the file id from a mangled name
 167  */
 168 HFSCatalogNodeID
 169 GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength)
 170 {
 171         short   extChars;
 172         short   i;
 173         u_int8_t        c;
 174
 175         *prefixLength = 0;
 176
 177         if ( filename == NULL )
 178                 return 0;
 179
 180         if ( length < 28 )
 181                 return 0;       /* too small to have been mangled */
 182
 183         /* big enough for a file ID (#10) and an extension (.x) ? */
 184         if ( length > 5 )
 185                 extChars = CountFilenameExtensionChars(filename, length);
 186         else
 187                 extChars = 0;
 188
 189         /* skip over dot plus extension characters */
 190         if ( extChars > 0 )
 191                 length -= (extChars + 1);
 192
 193         /* scan for file id digits */
 194         for ( i = length - 1; i >= 0; --i) {
 195                 c = filename[i];
 196
 197                 /* look for file ID marker */
 198                 if ( c == '#' ) {
 199                         if ( (length - i) < 3 )
 200                                 break;  /* too small to be a file ID */
 201
 202                         *prefixLength = i;
 203                         return HexStringToInteger(length - i - 1, &filename[i+1]);
 204                 }
 205
 206                 if ( !IsHexDigit(c) )
 207                         break;  /* file ID string must have hex digits */
 208         }
 209
 210         return 0;
 211 }
 212
 213
 214
 215 static u_int32_t
 216 HexStringToInteger(u_int32_t length, const u_int8_t *hexStr)
 217 {
 218         u_int32_t               value;
 219         u_int32_t               i;
 220         u_int8_t                c;
 221         const u_int8_t  *p;
 222
 223         value = 0;
 224         p = hexStr;
 225
 226         for ( i = 0; i < length; ++i ) {
 227                 c = *p++;
 228
 229                 if (c >= '0' && c <= '9') {
 230                         value = value << 4;
 231                         value += (u_int32_t) c - (u_int32_t) '0';
 232                 } else if (c >= 'A' && c <= 'F') {
 233                         value = value << 4;
 234                         value += 10 + ((unsigned int) c - (unsigned int) 'A');
 235                 } else {
 236                         return 0;       /* bad character */
 237                 }
 238         }
 239
 240         return value;
 241 }
 242
 243
 244 /*
 245  * Routine:     FastRelString
 246  *
 247  * Output:      returns -1 if str1 < str2
 248  *              returns  1 if str1 > str2
 249  *              return   0 if equal
 250  *
 251  */
 252 int32_t FastRelString( ConstStr255Param str1, ConstStr255Param str2 )
 253 {
 254         u_int16_t*              compareTable;
 255         int32_t                 bestGuess;
 256         u_int8_t                length, length2;
 257         u_int8_t                delta;
 258
 259         delta = 0;
 260         length = *(str1++);
 261         length2 = *(str2++);
 262
 263         if (length == length2)
 264                 bestGuess = 0;
 265         else if (length < length2)
 266         {
 267                 bestGuess = -1;
 268                 delta = length2 - length;
 269         }
 270         else
 271         {
 272                 bestGuess = 1;
 273                 length = length2;
 274         }
 275
 276         compareTable = (u_int16_t*) gCompareTable;
 277
 278         while (length--)
 279         {
 280                 u_int8_t        aChar, bChar;
 281
 282                 aChar = *(str1++);
 283                 bChar = *(str2++);
 284
 285                 if (aChar != bChar)             //      If they don't match exacly, do case conversion
 286                 {
 287                         u_int16_t       aSortWord, bSortWord;
 288
 289                         aSortWord = compareTable[aChar];
 290                         bSortWord = compareTable[bChar];
 291
 292                         if (aSortWord > bSortWord)
 293                                 return 1;
 294
 295                         if (aSortWord < bSortWord)
 296                                 return -1;
 297                 }
 298
 299                 //      If characters match exactly, then go on to next character immediately without
 300                 //      doing any extra work.
 301         }
 302
 303         //      if you got to here, then return bestGuess
 304         return bestGuess;
 305 }
 306
 307
 308
 309 //
 310 //      FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
 311 //
 312 //          IF                          RESULT
 313 //      --------------------------
 314 //      str1 < str2             =>      -1
 315 //      str1 = str2             =>       0
 316 //      str1 > str2             =>      +1
 317 //
 318 //      The lower case table starts with 256 entries (one for each of the upper bytes
 319 //      of the original Unicode char).  If that entry is zero, then all characters with
 320 //      that upper byte are already case folded.  If the entry is non-zero, then it is
 321 //      the _index_ (not byte offset) of the start of the sub-table for the characters
 322 //      with that upper byte.  All ignorable characters are folded to the value zero.
 323 //
 324 //      In pseudocode:
 325 //
 326 //              Let c = source Unicode character
 327 //              Let table[] = lower case table
 328 //
 329 //              lower = table[highbyte(c)]
 330 //              if (lower == 0)
 331 //                      lower = c
 332 //              else
 333 //                      lower = table[lower+lowbyte(c)]
 334 //
 335 //              if (lower == 0)
 336 //                      ignore this character
 337 //
 338 //      To handle ignorable characters, we now need a loop to find the next valid character.
 339 //      Also, we can't pre-compute the number of characters to compare; the string length might
 340 //      be larger than the number of non-ignorable characters.  Further, we must be able to handle
 341 //      ignorable characters at any point in the string, including as the first or last characters.
 342 //      We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 343 //      Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 344 //      the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 345 //      an invalid Unicode character).
 346 //
 347 //      Pseudocode:
 348 //
 349 //              while (1) {
 350 //                      c1 = GetNextValidChar(str1)                     //      returns zero if at end of string
 351 //                      c2 = GetNextValidChar(str2)
 352 //
 353 //                      if (c1 != c2) break                                     //      found a difference
 354 //
 355 //                      if (c1 == 0)                                            //      reached end of string on both strings at once?
 356 //                              return 0;                                               //      yes, so strings are equal
 357 //              }
 358 //
 359 //              // When we get here, c1 != c2.  So, we just need to determine which one is less.
 360 //              if (c1 < c2)
 361 //                      return -1;
 362 //              else
 363 //                      return 1;
 364 //
 365
 366 int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1,
 367                                                         register ConstUniCharArrayPtr str2, register ItemCount length2)
 368 {
 369         register u_int16_t              c1,c2;
 370         register u_int16_t              temp;
 371         register u_int16_t*     lowerCaseTable;
 372
 373         lowerCaseTable = (u_int16_t*) gLowerCaseTable;
 374
 375         while (1) {
 376                 /* Set default values for c1, c2 in case there are no more valid chars */
 377                 c1 = 0;
 378                 c2 = 0;
 379
 380                 /* Find next non-ignorable char from str1, or zero if no more */
 381                 while (length1 && c1 == 0) {
 382                         c1 = *(str1++);
 383                         --length1;
 384                         /* check for basic latin first */
 385                         if (c1 < 0x0100) {
 386                                 c1 = gLatinCaseFold[c1];
 387                                 break;
 388                         }
 389                         /* case fold if neccessary */
 390                         if ((temp = lowerCaseTable[c1>>8]) != 0)
 391                                 c1 = lowerCaseTable[temp + (c1 & 0x00FF)];
 392                 }
 393
 394
 395                 /* Find next non-ignorable char from str2, or zero if no more */
 396                 while (length2 && c2 == 0) {
 397                         c2 = *(str2++);
 398                         --length2;
 399                         /* check for basic latin first */
 400                         if (c2 < 0x0100) {
 401                                 c2 = gLatinCaseFold[c2];
 402                                 break;
 403                         }
 404                         /* case fold if neccessary */
 405                         if ((temp = lowerCaseTable[c2>>8]) != 0)
 406                                 c2 = lowerCaseTable[temp + (c2 & 0x00FF)];
 407                 }
 408
 409                 if (c1 != c2)           //      found a difference, so stop looping
 410                         break;
 411
 412                 if (c1 == 0)            //      did we reach the end of both strings at the same time?
 413                         return 0;               //      yes, so strings are equal
 414         }
 415
 416         if (c1 < c2)
 417                 return -1;
 418         else
 419                 return 1;
 420 }
 421
 422 /*
 423  * UnicodeBinaryCompare
 424  * Compare two UTF-16 strings and perform case-sensitive (binary) matching against them.
 425  *
 426  * Results are emitted like FastUnicodeCompare:
 427  *
 428  *
 429  *          IF                          RESULT
 430  *      --------------------------
 431  *      str1 < str2             =>      -1
 432  *      str1 = str2             =>       0
 433  *      str1 > str2             =>      +1
 434  *
 435  * The case matching source code is greatly simplified due to the lack of case-folding
 436  * in this comparison routine. We compare, in order: the lengths, then do character-by-
 437  * character comparisons.
 438  *
 439  */
 440 int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount len1,
 441                                                         register ConstUniCharArrayPtr str2, register ItemCount len2) {
 442         uint16_t c1;
 443         uint16_t c2;
 444         int string_length;
 445         int32_t result = 0;
 446
 447         /* Set default values for the two character pointers */
 448         c1 = 0;
 449         c2 = 0;
 450
 451         /* First generate the string length (for comparison purposes) */
 452         if (len1 < len2) {
 453                 string_length = len1;
 454                 --result;
 455         }
 456         else if (len1 > len2) {
 457                 string_length = len2;
 458                 ++result;
 459         }
 460         else {
 461                 string_length = len1;
 462         }
 463
 464         /* now compare the two string pointers */
 465         while (string_length--) {
 466                 c1 = *(str1++);
 467                 c2 = *(str2++);
 468
 469                 if (c1 > c2) {
 470                         result = 1;
 471                         break;
 472                 }
 473
 474                 if (c1 < c2) {
 475                         result = -1;
 476                         break;
 477                 }
 478                 /* If equal, iterate to the next two respective chars */
 479         }
 480
 481         return result;
 482 }
 483
 484
 485 OSErr
 486 ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen,
 487                                          ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid)
 488 {
 489         ByteCount subMaxLen;
 490         size_t utf8len;
 491         char fileIDStr[15];
 492         char extStr[15];
 493
 494         snprintf(fileIDStr, sizeof(fileIDStr), "#%X", cnid);
 495         GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr);
 496
 497         /* remove extension chars from source */
 498         srcLen -= strlen(extStr) * sizeof(UniChar);
 499         subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr));
 500
 501         (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', 0);
 502
 503         strlcat((char *)dstStr, fileIDStr, maxDstLen);
 504         strlcat((char *)dstStr, extStr, maxDstLen);
 505         *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr));
 506
 507         return noErr;
 508 }