livefiles_hfs_plugin/lf_hfs_unicode_wrappers.c

   1 //
   2 //  lf_hfs_unicode_wrappers.c
   3 //  livefiles_hfs
   4 //
   5 //  Created by Yakov Ben Zaken on 22/03/2018.
   6 //
   7
   8 #include "lf_hfs_unicode_wrappers.h"
   9 #include "lf_hfs_ucs_string_cmp_data.h"
  10 #include "lf_hfs_sbunicode.h"
  11
  12
  13
  14 enum {
  15     kMinFileExtensionChars = 1,    /* does not include dot */
  16     kMaxFileExtensionChars = 5    /* does not include dot */
  17 };
  18
  19
  20 #define EXTENSIONCHAR(c)    (((c) >= 0x61 && (c) <= 0x7A) || \
  21                              ((c) >= 0x41 && (c) <= 0x5A) || \
  22                              ((c) >= 0x30 && (c) <= 0x39))
  23
  24
  25 #define IsHexDigit(c)        (((c) >= (u_int8_t) '0' && (c) <= (u_int8_t) '9') || \
  26                               ((c) >= (u_int8_t) 'A' && (c) <= (u_int8_t) 'F'))
  27
  28
  29 static void
  30 GetFilenameExtension( ItemCount length, ConstUniCharArrayPtr unicodeStr, char* extStr );
  31
  32
  33 static u_int32_t
  34 HexStringToInteger( u_int32_t length, const u_int8_t *hexStr );
  35
  36
  37 /*
  38  * Get filename extension (if any) as a C string
  39  */
  40 static void
  41 GetFilenameExtension(ItemCount length, ConstUniCharArrayPtr unicodeStr, char * extStr)
  42 {
  43     u_int64_t    i;
  44     UniChar    c;
  45     u_int16_t    extChars;    /* number of extension chars (excluding dot) */
  46     u_int16_t    maxExtChars;
  47     Boolean    foundExtension;
  48
  49     extStr[0] = '\0';    /* assume there's no extension */
  50
  51     if ( length < 3 )
  52         return;        /* "x.y" is smallest possible extension */
  53
  54     if ( length < (kMaxFileExtensionChars + 2) )
  55         maxExtChars = length - 2;    /* save room for prefix + dot */
  56     else
  57         maxExtChars = kMaxFileExtensionChars;
  58
  59     i = length;
  60     extChars = 0;
  61     foundExtension = false;
  62
  63     while ( extChars <= maxExtChars ) {
  64         c = unicodeStr[--i];
  65
  66         /* look for leading dot */
  67         if ( c == (UniChar) '.' ) {
  68             if ( extChars > 0 )    /* cannot end with a dot */
  69                 foundExtension = true;
  70             break;
  71         }
  72
  73         if ( EXTENSIONCHAR(c) )
  74             ++extChars;
  75         else
  76             break;
  77     }
  78
  79     /* if we found one then copy it */
  80     if ( foundExtension ) {
  81         u_int8_t *extStrPtr = (u_int8_t *)extStr;
  82         const UniChar *unicodeStrPtr = &unicodeStr[i];
  83
  84         for ( i = 0; i <= extChars; ++i )
  85             *(extStrPtr++) = (u_int8_t) *(unicodeStrPtr++);
  86         extStr[extChars + 1] = '\0';    /* terminate extension + dot */
  87     }
  88 }
  89
  90 //
  91 //    FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
  92 //
  93 //        IF                RESULT
  94 //    --------------------------
  95 //    str1 < str2        =>    -1
  96 //    str1 = str2        =>     0
  97 //    str1 > str2        =>    +1
  98 //
  99 //    The lower case table starts with 256 entries (one for each of the upper bytes
 100 //    of the original Unicode char).  If that entry is zero, then all characters with
 101 //    that upper byte are already case folded.  If the entry is non-zero, then it is
 102 //    the _index_ (not byte offset) of the start of the sub-table for the characters
 103 //    with that upper byte.  All ignorable characters are folded to the value zero.
 104 //
 105 //    In pseudocode:
 106 //
 107 //        Let c = source Unicode character
 108 //        Let table[] = lower case table
 109 //
 110 //        lower = table[highbyte(c)]
 111 //        if (lower == 0)
 112 //            lower = c
 113 //        else
 114 //            lower = table[lower+lowbyte(c)]
 115 //
 116 //        if (lower == 0)
 117 //            ignore this character
 118 //
 119 //    To handle ignorable characters, we now need a loop to find the next valid character.
 120 //    Also, we can't pre-compute the number of characters to compare; the string length might
 121 //    be larger than the number of non-ignorable characters.  Further, we must be able to handle
 122 //    ignorable characters at any point in the string, including as the first or last characters.
 123 //    We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 124 //    Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 125 //    the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 126 //    an invalid Unicode character).
 127 //
 128 //    Pseudocode:
 129 //
 130 //        while (1) {
 131 //            c1 = GetNextValidChar(str1)            //    returns zero if at end of string
 132 //            c2 = GetNextValidChar(str2)
 133 //
 134 //            if (c1 != c2) break                    //    found a difference
 135 //
 136 //            if (c1 == 0)                        //    reached end of string on both strings at once?
 137 //                return 0;                        //    yes, so strings are equal
 138 //        }
 139 //
 140 //        // When we get here, c1 != c2.  So, we just need to determine which one is less.
 141 //        if (c1 < c2)
 142 //            return -1;
 143 //        else
 144 //            return 1;
 145 //
 146
 147 int32_t FastUnicodeCompare ( register ConstUniCharArrayPtr str1, register ItemCount length1,
 148                             register ConstUniCharArrayPtr str2, register ItemCount length2)
 149 {
 150     register u_int16_t     c1,c2;
 151     register u_int16_t     temp;
 152     register u_int16_t*    lowerCaseTable;
 153
 154     lowerCaseTable = (u_int16_t*) gLowerCaseTable;
 155
 156     while (1) {
 157         /* Set default values for c1, c2 in case there are no more valid chars */
 158         c1 = 0;
 159         c2 = 0;
 160
 161         /* Find next non-ignorable char from str1, or zero if no more */
 162         while (length1 && c1 == 0) {
 163             c1 = *(str1++);
 164             --length1;
 165             /* check for basic latin first */
 166             if (c1 < 0x0100) {
 167                 c1 = gLatinCaseFold[c1];
 168                 break;
 169             }
 170             /* case fold if neccessary */
 171             if ((temp = lowerCaseTable[c1>>8]) != 0)
 172                 c1 = lowerCaseTable[temp + (c1 & 0x00FF)];
 173         }
 174
 175
 176         /* Find next non-ignorable char from str2, or zero if no more */
 177         while (length2 && c2 == 0) {
 178             c2 = *(str2++);
 179             --length2;
 180             /* check for basic latin first */
 181             if (c2 < 0x0100) {
 182                 c2 = gLatinCaseFold[c2];
 183                 break;
 184             }
 185             /* case fold if neccessary */
 186             if ((temp = lowerCaseTable[c2>>8]) != 0)
 187                 c2 = lowerCaseTable[temp + (c2 & 0x00FF)];
 188         }
 189
 190         if (c1 != c2)        //    found a difference, so stop looping
 191             break;
 192
 193         if (c1 == 0)        //    did we reach the end of both strings at the same time?
 194             return 0;        //    yes, so strings are equal
 195     }
 196
 197     if (c1 < c2)
 198         return -1;
 199     else
 200         return 1;
 201 }
 202
 203
 204 /*
 205  * UnicodeBinaryCompare
 206  * Compare two UTF-16 strings and perform case-sensitive (binary) matching against them.
 207  *
 208  * Results are emitted like FastUnicodeCompare:
 209  *
 210  *
 211  *        IF                RESULT
 212  *    --------------------------
 213  *    str1 < str2        =>    -1
 214  *    str1 = str2        =>     0
 215  *    str1 > str2        =>    +1
 216  *
 217  * The case matching source code is greatly simplified due to the lack of case-folding
 218  * in this comparison routine. We compare, in order: the lengths, then do character-by-
 219  * character comparisons.
 220  *
 221  */
 222 int32_t UnicodeBinaryCompare (register ConstUniCharArrayPtr str1, register ItemCount len1,
 223                               register ConstUniCharArrayPtr str2, register ItemCount len2) {
 224     uint16_t c1 =0;
 225     uint16_t c2 =0;
 226     ItemCount string_length;
 227     int32_t result = 0;
 228
 229     /* First generate the string length (for comparison purposes) */
 230     if (len1 < len2) {
 231         string_length = len1;
 232         --result;
 233     }
 234     else if (len1 > len2) {
 235         string_length = len2;
 236         ++result;
 237     }
 238     else {
 239         string_length = len1;
 240     }
 241
 242     /* now compare the two string pointers */
 243     while (string_length--) {
 244         c1 = *(str1++);
 245         c2 = *(str2++);
 246
 247         if (c1 > c2) {
 248             result = 1;
 249             break;
 250         }
 251
 252         if (c1 < c2) {
 253             result = -1;
 254             break;
 255         }
 256         /* If equal, iterate to the next two respective chars */
 257     }
 258
 259     return result;
 260 }
 261
 262 /*
 263  * extract the file id from a mangled name
 264  */
 265 HFSCatalogNodeID
 266 GetEmbeddedFileID(const unsigned char * filename, u_int32_t length, u_int32_t *prefixLength)
 267 {
 268     short    extChars;
 269     short    i;
 270     u_int8_t    c;
 271
 272     *prefixLength = 0;
 273
 274     if ( filename == NULL )
 275         return 0;
 276
 277     if ( length < 28 )
 278         return 0;    /* too small to have been mangled */
 279
 280     /* big enough for a file ID (#10) and an extension (.x) ? */
 281     if ( length > 5 )
 282         extChars = CountFilenameExtensionChars(filename, length);
 283     else
 284         extChars = 0;
 285
 286     /* skip over dot plus extension characters */
 287     if ( extChars > 0 )
 288         length -= (extChars + 1);
 289
 290     /* scan for file id digits */
 291     for ( i = length - 1; i >= 0; --i) {
 292         c = filename[i];
 293
 294         /* look for file ID marker */
 295         if ( c == '#' ) {
 296             if ( (length - i) < 3 )
 297                 break;    /* too small to be a file ID */
 298
 299             *prefixLength = i;
 300             return HexStringToInteger(length - i - 1, &filename[i+1]);
 301         }
 302
 303         if ( !IsHexDigit(c) )
 304             break;    /* file ID string must have hex digits */
 305     }
 306
 307     return 0;
 308 }
 309
 310 /*
 311  * Count filename extension characters (if any)
 312  */
 313 u_int32_t
 314 CountFilenameExtensionChars( const unsigned char * filename, u_int32_t length )
 315 {
 316     UniChar    c;
 317     u_int16_t  maxExtChars;
 318
 319     if ( length < 3 )
 320         return 0;    /* "x.y" is smallest possible extension    */
 321
 322     if ( length < (kMaxFileExtensionChars + 2) )
 323         maxExtChars = length - 2;    /* save room for prefix + dot */
 324     else
 325         maxExtChars = kMaxFileExtensionChars;
 326
 327     u_int32_t extChars = 0;        /* number of extension chars (excluding dot) - assume there's no extension */
 328     u_int32_t i = length - 1;      /* index to last ascii character */
 329
 330     while ( extChars <= maxExtChars ) {
 331         c = filename[i--];
 332
 333         /* look for leading dot */
 334         if ( c == (u_int8_t) '.' )    {
 335             if ( extChars > 0 )    /* cannot end with a dot */
 336                 return (extChars);
 337
 338             break;
 339         }
 340
 341         if ( EXTENSIONCHAR(c) )
 342             ++extChars;
 343         else
 344             break;
 345     }
 346
 347     return 0;
 348 }
 349
 350 static u_int32_t
 351 HexStringToInteger(u_int32_t length, const u_int8_t *hexStr)
 352 {
 353     u_int32_t        value;
 354     u_int32_t        i;
 355     u_int8_t        c;
 356     const u_int8_t    *p;
 357
 358     value = 0;
 359     p = hexStr;
 360
 361     for ( i = 0; i < length; ++i ) {
 362         c = *p++;
 363
 364         if (c >= '0' && c <= '9') {
 365             value = value << 4;
 366             value += (u_int32_t) c - (u_int32_t) '0';
 367         } else if (c >= 'A' && c <= 'F') {
 368             value = value << 4;
 369             value += 10 + ((unsigned int) c - (unsigned int) 'A');
 370         } else {
 371             return 0;    /* bad character */
 372         }
 373     }
 374
 375     return value;
 376 }
 377
 378 OSErr
 379 ConvertUnicodeToUTF8Mangled(ByteCount srcLen, ConstUniCharArrayPtr srcStr, ByteCount maxDstLen,
 380                             ByteCount *actualDstLen, unsigned char* dstStr, HFSCatalogNodeID cnid)
 381 {
 382     ByteCount subMaxLen;
 383     size_t utf8len;
 384     char fileIDStr[15];
 385     char extStr[15];
 386
 387     snprintf(fileIDStr, sizeof(fileIDStr), "#%X", cnid);
 388     GetFilenameExtension(srcLen/sizeof(UniChar), srcStr, extStr);
 389
 390     /* remove extension chars from source */
 391     srcLen -= strlen(extStr) * sizeof(UniChar);
 392     subMaxLen = maxDstLen - (strlen(extStr) + strlen(fileIDStr));
 393
 394     (void) utf8_encodestr(srcStr, srcLen, dstStr, &utf8len, subMaxLen, ':', UTF_ADD_NULL_TERM);
 395
 396     strlcat((char *)dstStr, fileIDStr, maxDstLen);
 397     strlcat((char *)dstStr, extStr, maxDstLen);
 398     *actualDstLen = utf8len + (strlen(extStr) + strlen(fileIDStr));
 399
 400     return noErr;
 401 }