i386/libsaio/hfs_compare.c

   1 /*
   2  * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 2.0 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /*
  23  *  HFSCompare.c - Functions for working with and comparing HFS nams.
  24  *
  25  *  Copyright (c) 1999-2000 Apple Computer, Inc.
  26  *
  27  *  DRI: Josh de Cesare
  28  */
  29
  30 #include <sl.h>
  31 #include "hfs_CaseTables.h"
  32
  33 #if ! UNCOMPRESSED
  34
  35 static unsigned short *
  36 UncompressStructure(struct compressed_block *bp, int count, int size)
  37 {
  38     unsigned short *out = malloc(size);
  39     unsigned short *op = out;
  40     unsigned short data;
  41     int i, j;
  42
  43     for (i=0; i<count; i++, bp++) {
  44         data = bp->data;
  45         for (j=0; j<bp->count; j++) {
  46             *op++ = data;
  47             if (bp->type == kTypeAscending) data++;
  48             else if (bp->type == kTypeAscending256) data += 256;
  49         }
  50     }
  51     return out;
  52 }
  53
  54 static void
  55 InitCompareTables(void)
  56 {
  57     if (gCompareTable == 0) {
  58         gCompareTable = UncompressStructure(gCompareTableCompressed,
  59                                             kCompareTableNBlocks, kCompareTableDataSize);
  60         gLowerCaseTable = UncompressStructure(gLowerCaseTableCompressed,
  61                                             kLowerCaseTableNBlocks, kLowerCaseTableDataSize);
  62     }
  63 }
  64
  65 #endif /* ! UNCOMPRESSED */
  66
  67 //_______________________________________________________________________
  68 //
  69 //      Routine:        FastRelString
  70 //
  71 //      Output:         returns -1 if str1 < str2
  72 //                              returns  1 if str1 > str2
  73 //                              return   0 if equal
  74 //
  75 //_______________________________________________________________________
  76
  77 int32_t FastRelString(u_int8_t * str1, u_int8_t * str2)
  78 {
  79         int32_t  bestGuess;
  80         u_int8_t length, length2;
  81
  82 #if ! UNCOMPRESED
  83         InitCompareTables();
  84 #endif
  85
  86         length = *(str1++);
  87         length2 = *(str2++);
  88
  89         if (length == length2)
  90                 bestGuess = 0;
  91         else if (length < length2)
  92                 bestGuess = -1;
  93         else
  94         {
  95                 bestGuess = 1;
  96                 length = length2;
  97         }
  98
  99         while (length--)
 100         {
 101                 u_int32_t aChar, bChar;
 102
 103                 aChar = *(str1++);
 104                 bChar = *(str2++);
 105
 106                 if (aChar != bChar)     /* If they don't match exacly, do case conversion */
 107                 {
 108                         u_int16_t aSortWord, bSortWord;
 109
 110                         aSortWord = gCompareTable[aChar];
 111                         bSortWord = gCompareTable[bChar];
 112
 113                         if (aSortWord > bSortWord)
 114                                 return 1;
 115
 116                         if (aSortWord < bSortWord)
 117                                 return -1;
 118                 }
 119
 120                 /*
 121                  * If characters match exactly, then go on to next character
 122                  * immediately without doing any extra work.
 123                  */
 124         }
 125
 126         /* if you got to here, then return bestGuess */
 127         return bestGuess;
 128 }
 129
 130
 131 //
 132 //      FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
 133 //
 134 //          IF                          RESULT
 135 //      --------------------------
 136 //      str1 < str2             =>      -1
 137 //      str1 = str2             =>       0
 138 //      str1 > str2             =>      +1
 139 //
 140 //      The lower case table starts with 256 entries (one for each of the upper bytes
 141 //      of the original Unicode char).  If that entry is zero, then all characters with
 142 //      that upper byte are already case folded.  If the entry is non-zero, then it is
 143 //      the _index_ (not byte offset) of the start of the sub-table for the characters
 144 //      with that upper byte.  All ignorable characters are folded to the value zero.
 145 //
 146 //      In pseudocode:
 147 //
 148 //              Let c = source Unicode character
 149 //              Let table[] = lower case table
 150 //
 151 //              lower = table[highbyte(c)]
 152 //              if (lower == 0)
 153 //                      lower = c
 154 //              else
 155 //                      lower = table[lower+lowbyte(c)]
 156 //
 157 //              if (lower == 0)
 158 //                      ignore this character
 159 //
 160 //      To handle ignorable characters, we now need a loop to find the next valid character.
 161 //      Also, we can't pre-compute the number of characters to compare; the string length might
 162 //      be larger than the number of non-ignorable characters.  Further, we must be able to handle
 163 //      ignorable characters at any point in the string, including as the first or last characters.
 164 //      We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 165 //      Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 166 //      the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 167 //      an invalid Unicode character).
 168 //
 169 //      Pseudocode:
 170 //
 171 //              while (1) {
 172 //                      c1 = GetNextValidChar(str1)                     //      returns zero if at end of string
 173 //                      c2 = GetNextValidChar(str2)
 174 //
 175 //                      if (c1 != c2) break                                     //      found a difference
 176 //
 177 //                      if (c1 == 0)                                            //      reached end of string on both strings at once?
 178 //                              return 0;                                               //      yes, so strings are equal
 179 //              }
 180 //
 181 //              // When we get here, c1 != c2.  So, we just need to determine which one is less.
 182 //              if (c1 < c2)
 183 //                      return -1;
 184 //              else
 185 //                      return 1;
 186 //
 187
 188 int32_t FastUnicodeCompare( u_int16_t * str1, register u_int32_t length1,
 189                             u_int16_t * str2, register u_int32_t length2 )
 190 {
 191         register u_int16_t c1,c2;
 192         register u_int16_t temp;
 193
 194 #if ! UNCOMPRESSED
 195         InitCompareTables();
 196 #endif
 197
 198         while (1) {
 199                 /* Set default values for c1, c2 in case there are no more valid chars */
 200                 c1 = 0;
 201                 c2 = 0;
 202
 203                 /* Find next non-ignorable char from str1, or zero if no more */
 204                 while (length1 && c1 == 0) {
 205                         c1 = SWAP_BE16(*(str1++));
 206                         --length1;
 207                         if ((temp = gLowerCaseTable[c1>>8]) != 0)               // is there a subtable for this upper byte?
 208                                 c1 = gLowerCaseTable[temp + (c1 & 0x00FF)];     // yes, so fold the char
 209                 }
 210
 211                 /* Find next non-ignorable char from str2, or zero if no more */
 212                 while (length2 && c2 == 0) {
 213                         c2 = SWAP_BE16(*(str2++));
 214                         --length2;
 215                         if ((temp = gLowerCaseTable[c2>>8]) != 0)               // is there a subtable for this upper byte?
 216                                 c2 = gLowerCaseTable[temp + (c2 & 0x00FF)];     // yes, so fold the char
 217                 }
 218
 219                 if (c1 != c2)   /* found a difference, so stop looping */
 220                         break;
 221
 222                 if (c1 == 0)            /* did we reach the end of both strings at the same time? */
 223                         return 0;       /* yes, so strings are equal */
 224         }
 225
 226         if (c1 < c2)
 227                 return -1;
 228         else
 229                 return 1;
 230 }
 231
 232
 233 //
 234 //  BinaryUnicodeCompare - Compare two Unicode strings; produce a relative ordering
 235 //  Compared using a 16-bit binary comparison (no case folding)
 236 //
 237 int32_t BinaryUnicodeCompare (u_int16_t * str1, u_int32_t length1,
 238                               u_int16_t * str2, u_int32_t length2)
 239 {
 240         register u_int16_t c1, c2;
 241         int32_t bestGuess;
 242         u_int32_t length;
 243
 244         bestGuess = 0;
 245
 246         if (length1 < length2) {
 247                 length = length1;
 248                 --bestGuess;
 249         } else if (length1 > length2) {
 250                 length = length2;
 251                 ++bestGuess;
 252         } else {
 253                 length = length1;
 254         }
 255
 256         while (length--) {
 257                 c1 = *(str1++);
 258                 c2 = *(str2++);
 259
 260                 if (c1 > c2)
 261                         return (1);
 262                 if (c1 < c2)
 263                         return (-1);
 264         }
 265
 266         return (bestGuess);
 267 }
 268
 269
 270 /*
 271  * UTF-8 (UCS Transformation Format)
 272  *
 273  * The following subset of UTF-8 is used to encode UCS-2 filenames. It
 274  * requires a maximum of three 3 bytes per UCS-2 character.  Only the
 275  * shortest encoding required to represent the significant UCS-2 bits
 276  * is legal.
 277  *
 278  * UTF-8 Multibyte Codes
 279  *
 280  * Bytes   Bits   UCS-2 Min   UCS-2 Max   UTF-8 Byte Sequence (binary)
 281  * -------------------------------------------------------------------
 282  *   1       7     0x0000      0x007F      0xxxxxxx
 283  *   2      11     0x0080      0x07FF      110xxxxx 10xxxxxx
 284  *   3      16     0x0800      0xFFFF      1110xxxx 10xxxxxx 10xxxxxx
 285  * -------------------------------------------------------------------
 286  */
 287
 288
 289 /*
 290  * utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a
 291  * null terminated UTF-8 string at utf8p.
 292  *
 293  * ucslen is the number of UCS-2 input characters (not bytes)
 294  * bufsize is the size of the output buffer in bytes
 295  */
 296 void
 297 utf_encodestr( const u_int16_t * ucsp, int ucslen,
 298                u_int8_t * utf8p, u_int32_t bufsize, int byte_order )
 299 {
 300         u_int8_t *bufend;
 301         u_int16_t ucs_ch;
 302
 303         bufend = utf8p + bufsize;
 304
 305         while (ucslen-- > 0) {
 306                 if (byte_order == OSBigEndian)
 307                     ucs_ch = SWAP_BE16(*ucsp++);
 308                 else
 309                     ucs_ch = SWAP_LE16(*ucsp++);
 310
 311                 if (ucs_ch < 0x0080) {
 312                         if (utf8p >= bufend)
 313                                 break;
 314                         if (ucs_ch == '\0')
 315                                 continue;       /* skip over embedded NULLs */
 316                         *utf8p++ = ucs_ch;
 317
 318                 } else if (ucs_ch < 0x800) {
 319                         if ((utf8p + 1) >= bufend)
 320                                 break;
 321                         *utf8p++ = (ucs_ch >> 6)   | 0xc0;
 322                         *utf8p++ = (ucs_ch & 0x3f) | 0x80;
 323
 324                 } else {
 325                         if ((utf8p + 2) >= bufend)
 326                                 break;
 327                         *utf8p++ = (ucs_ch >> 12)         | 0xe0;
 328                         *utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;
 329                         *utf8p++ = ((ucs_ch) & 0x3f)      | 0x80;
 330                 }
 331         }
 332
 333         *utf8p = '\0';
 334 }
 335
 336
 337 /*
 338  * utf_decodestr - Decodes the null terminated UTF-8 string at
 339  * utf8p into a UCS-2 (Unicode) string at ucsp.
 340  *
 341  * ucslen is the number of UCS-2 output characters (not bytes)
 342  * bufsize is the size of the output buffer in bytes
 343  */
 344 void utf_decodestr(const u_int8_t * utf8p, u_int16_t * ucsp, u_int16_t * ucslen, u_int32_t bufsize, int byte_order)
 345 {
 346         u_int16_t *bufstart;
 347         u_int16_t *bufend;
 348         u_int16_t ucs_ch;
 349         u_int8_t  byte;
 350
 351         bufstart = ucsp;
 352         bufend = (u_int16_t *)((u_int8_t *)ucsp + bufsize);
 353
 354         while ((byte = *utf8p++) != '\0') {
 355                 if (ucsp >= bufend)
 356                         break;
 357
 358                 /* check for ascii */
 359                 if (byte < 0x80) {
 360                         ucs_ch = byte;
 361
 362                         if (byte_order == OSBigEndian)
 363                             *ucsp++ = SWAP_BE16(ucs_ch);
 364                         else
 365                             *ucsp++ = SWAP_LE16(ucs_ch);
 366
 367                         continue;
 368                 }
 369
 370                 switch (byte & 0xf0) {
 371                 /*  2 byte sequence*/
 372                 case 0xc0:
 373                 case 0xd0:
 374                         /* extract bits 6 - 10 from first byte */
 375                         ucs_ch = (byte & 0x1F) << 6;
 376                         break;
 377                 /* 3 byte sequence*/
 378                 case 0xe0:
 379                         /* extract bits 12 - 15 from first byte */
 380                         ucs_ch = (byte & 0x0F) << 6;
 381
 382                         /* extract bits 6 - 11 from second byte */
 383                         if (((byte = *utf8p++) & 0xc0) != 0x80)
 384                                 goto stop;
 385
 386                         ucs_ch += (byte & 0x3F);
 387                         ucs_ch <<= 6;
 388                         break;
 389                 default:
 390                         goto stop;
 391                 }
 392
 393                 /* extract bits 0 - 5 from final byte */
 394                 if (((byte = *utf8p++) & 0xc0) != 0x80)
 395                         goto stop;
 396                 ucs_ch += (byte & 0x3F);
 397
 398                 if (byte_order == OSBigEndian)
 399                     *ucsp++ = SWAP_BE16(ucs_ch);
 400                 else
 401                     *ucsp++ = SWAP_LE16(ucs_ch);
 402         }
 403 stop:
 404         if (byte_order == OSBigEndian)
 405             *ucslen = SWAP_BE16(ucsp - bufstart);
 406         else
 407             *ucslen = SWAP_LE16(ucsp - bufstart);
 408 }