bootx.tproj/fs.subproj/HFSCompare.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
   7  *
   8  * This file contains Original Code and/or Modifications of Original Code
   9  * as defined in and that are subject to the Apple Public Source License
  10  * Version 2.0 (the 'License'). You may not use this file except in
  11  * compliance with the License. Please obtain a copy of the License at
  12  * http://www.opensource.apple.com/apsl/ and read it before using this
  13  * file.
  14  *
  15  * The Original Code and all software distributed under the License are
  16  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  17  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  18  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  20  * Please see the License for the specific language governing rights and
  21  * limitations under the License.
  22  *
  23  * @APPLE_LICENSE_HEADER_END@
  24  */
  25 /*
  26  *  HFSCompare.c - Functions for working with and comparing HFS nams.
  27  *
  28  *  Copyright (c) 1999-2000 Apple Computer, Inc.
  29  *
  30  *  DRI: Josh de Cesare
  31  */
  32
  33 #include <sl.h>
  34 #include "CaseTables.h"
  35
  36
  37 //_______________________________________________________________________
  38 //
  39 //      Routine:        FastRelString
  40 //
  41 //      Output:         returns -1 if str1 < str2
  42 //                              returns  1 if str1 > str2
  43 //                              return   0 if equal
  44 //
  45 //_______________________________________________________________________
  46
  47 int32_t FastRelString(char *str1, char *str2 )
  48 {
  49         int32_t  bestGuess;
  50         u_int8_t length, length2;
  51
  52
  53         length = *(str1++);
  54         length2 = *(str2++);
  55
  56         if (length == length2)
  57                 bestGuess = 0;
  58         else if (length < length2)
  59                 bestGuess = -1;
  60         else
  61         {
  62                 bestGuess = 1;
  63                 length = length2;
  64         }
  65
  66         while (length--)
  67         {
  68                 u_int32_t aChar, bChar;
  69
  70                 aChar = *(str1++);
  71                 bChar = *(str2++);
  72
  73                 if (aChar != bChar)     /* If they don't match exacly, do case conversion */
  74                 {
  75                         u_int16_t aSortWord, bSortWord;
  76
  77                         aSortWord = gCompareTable[aChar];
  78                         bSortWord = gCompareTable[bChar];
  79
  80                         if (aSortWord > bSortWord)
  81                                 return 1;
  82
  83                         if (aSortWord < bSortWord)
  84                                 return -1;
  85                 }
  86
  87                 /*
  88                  * If characters match exactly, then go on to next character
  89                  * immediately without doing any extra work.
  90                  */
  91         }
  92
  93         /* if you got to here, then return bestGuess */
  94         return bestGuess;
  95 }
  96
  97
  98
  99 //
 100 //      FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
 101 //
 102 //          IF                          RESULT
 103 //      --------------------------
 104 //      str1 < str2             =>      -1
 105 //      str1 = str2             =>       0
 106 //      str1 > str2             =>      +1
 107 //
 108 //      The lower case table starts with 256 entries (one for each of the upper bytes
 109 //      of the original Unicode char).  If that entry is zero, then all characters with
 110 //      that upper byte are already case folded.  If the entry is non-zero, then it is
 111 //      the _index_ (not byte offset) of the start of the sub-table for the characters
 112 //      with that upper byte.  All ignorable characters are folded to the value zero.
 113 //
 114 //      In pseudocode:
 115 //
 116 //              Let c = source Unicode character
 117 //              Let table[] = lower case table
 118 //
 119 //              lower = table[highbyte(c)]
 120 //              if (lower == 0)
 121 //                      lower = c
 122 //              else
 123 //                      lower = table[lower+lowbyte(c)]
 124 //
 125 //              if (lower == 0)
 126 //                      ignore this character
 127 //
 128 //      To handle ignorable characters, we now need a loop to find the next valid character.
 129 //      Also, we can't pre-compute the number of characters to compare; the string length might
 130 //      be larger than the number of non-ignorable characters.  Further, we must be able to handle
 131 //      ignorable characters at any point in the string, including as the first or last characters.
 132 //      We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 133 //      Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 134 //      the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 135 //      an invalid Unicode character).
 136 //
 137 //      Pseudocode:
 138 //
 139 //              while (1) {
 140 //                      c1 = GetNextValidChar(str1)                     //      returns zero if at end of string
 141 //                      c2 = GetNextValidChar(str2)
 142 //
 143 //                      if (c1 != c2) break                                     //      found a difference
 144 //
 145 //                      if (c1 == 0)                                            //      reached end of string on both strings at once?
 146 //                              return 0;                                               //      yes, so strings are equal
 147 //              }
 148 //
 149 //              // When we get here, c1 != c2.  So, we just need to determine which one is less.
 150 //              if (c1 < c2)
 151 //                      return -1;
 152 //              else
 153 //                      return 1;
 154 //
 155
 156 int32_t FastUnicodeCompare (u_int16_t *str1, register u_int32_t length1,
 157                             u_int16_t *str2, register u_int32_t length2)
 158 {
 159         register u_int16_t c1,c2;
 160         register u_int16_t temp;
 161
 162         while (1) {
 163                 /* Set default values for c1, c2 in case there are no more valid chars */
 164                 c1 = 0;
 165                 c2 = 0;
 166
 167                 /* Find next non-ignorable char from str1, or zero if no more */
 168                 while (length1 && c1 == 0) {
 169                         c1 = *(str1++);
 170                         --length1;
 171                         if ((temp = gLowerCaseTable[c1>>8]) != 0)               // is there a subtable for this upper byte?
 172                                 c1 = gLowerCaseTable[temp + (c1 & 0x00FF)];     // yes, so fold the char
 173                 }
 174
 175
 176                 /* Find next non-ignorable char from str2, or zero if no more */
 177                 while (length2 && c2 == 0) {
 178                         c2 = *(str2++);
 179                         --length2;
 180                         if ((temp = gLowerCaseTable[c2>>8]) != 0)               // is there a subtable for this upper byte?
 181                                 c2 = gLowerCaseTable[temp + (c2 & 0x00FF)];     // yes, so fold the char
 182                 }
 183
 184                 if (c1 != c2)   /* found a difference, so stop looping */
 185                         break;
 186
 187                 if (c1 == 0)            /* did we reach the end of both strings at the same time? */
 188                         return 0;       /* yes, so strings are equal */
 189         }
 190
 191         if (c1 < c2)
 192                 return -1;
 193         else
 194                 return 1;
 195 }
 196
 197 //
 198 //  BinaryUnicodeCompare - Compare two Unicode strings; produce a relative ordering
 199 //  Compared using a 16-bit binary comparison (no case folding)
 200 //
 201 int32_t BinaryUnicodeCompare (u_int16_t * str1, u_int32_t length1,
 202                               u_int16_t * str2, u_int32_t length2)
 203 {
 204         register u_int16_t c1, c2;
 205         int32_t bestGuess;
 206         u_int32_t length;
 207
 208         bestGuess = 0;
 209
 210         if (length1 < length2) {
 211                 length = length1;
 212                 --bestGuess;
 213         } else if (length1 > length2) {
 214                 length = length2;
 215                 ++bestGuess;
 216         } else {
 217                 length = length1;
 218         }
 219
 220         while (length--) {
 221                 c1 = *(str1++);
 222                 c2 = *(str2++);
 223
 224                 if (c1 > c2)
 225                         return (1);
 226                 if (c1 < c2)
 227                         return (-1);
 228         }
 229
 230         return (bestGuess);
 231 }
 232
 233 /*
 234  * UTF-8 (UCS Transformation Format)
 235  *
 236  * The following subset of UTF-8 is used to encode UCS-2 filenames. It
 237  * requires a maximum of three 3 bytes per UCS-2 character.  Only the
 238  * shortest encoding required to represent the significant UCS-2 bits
 239  * is legal.
 240  *
 241  * UTF-8 Multibyte Codes
 242  *
 243  * Bytes   Bits   UCS-2 Min   UCS-2 Max   UTF-8 Byte Sequence (binary)
 244  * -------------------------------------------------------------------
 245  *   1       7     0x0000      0x007F      0xxxxxxx
 246  *   2      11     0x0080      0x07FF      110xxxxx 10xxxxxx
 247  *   3      16     0x0800      0xFFFF      1110xxxx 10xxxxxx 10xxxxxx
 248  * -------------------------------------------------------------------
 249  */
 250
 251
 252 /*
 253  * utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a
 254  * null terminated UTF-8 string at utf8p.
 255  *
 256  * ucslen is the number of UCS-2 input characters (not bytes)
 257  * bufsize is the size of the output buffer in bytes
 258  */
 259 void
 260 utf_encodestr(const u_int16_t *ucsp, int ucslen, u_int8_t *utf8p, u_int32_t bufsize)
 261 {
 262         u_int8_t *bufend;
 263         u_int16_t ucs_ch;
 264
 265         bufend = utf8p + bufsize;
 266
 267         while (ucslen-- > 0) {
 268                 ucs_ch = *ucsp++;
 269
 270                 if (ucs_ch < 0x0080) {
 271                         if (utf8p >= bufend)
 272                                 break;
 273                         if (ucs_ch == '\0')
 274                                 continue;       /* skip over embedded NULLs */
 275                         *utf8p++ = ucs_ch;
 276
 277                 } else if (ucs_ch < 0x800) {
 278                         if ((utf8p + 1) >= bufend)
 279                                 break;
 280                         *utf8p++ = (ucs_ch >> 6)   | 0xc0;
 281                         *utf8p++ = (ucs_ch & 0x3f) | 0x80;
 282
 283                 } else {
 284                         if ((utf8p + 2) >= bufend)
 285                                 break;
 286                         *utf8p++ = (ucs_ch >> 12)         | 0xe0;
 287                         *utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;
 288                         *utf8p++ = ((ucs_ch) & 0x3f)      | 0x80;
 289                 }
 290
 291         }
 292
 293         *utf8p = '\0';
 294 }
 295
 296
 297 /*
 298  * utf_decodestr - Decodes the null terminated UTF-8 string at
 299  * utf8p into a UCS-2 (Unicode) string at ucsp.
 300  *
 301  * ucslen is the number of UCS-2 output characters (not bytes)
 302  * bufsize is the size of the output buffer in bytes
 303  */
 304 void
 305 utf_decodestr(const u_int8_t *utf8p, u_int16_t *ucsp, u_int16_t *ucslen, u_int32_t bufsize)
 306 {
 307         u_int16_t *bufstart;
 308         u_int16_t *bufend;
 309         u_int16_t ucs_ch;
 310         u_int8_t  byte;
 311
 312         bufstart = ucsp;
 313         bufend = (u_int16_t *)((u_int8_t *)ucsp + bufsize);
 314
 315         while ((byte = *utf8p++) != '\0') {
 316                 if (ucsp >= bufend)
 317                         break;
 318
 319                 /* check for ascii */
 320                 if (byte < 0x80) {
 321                         ucs_ch = byte;
 322
 323                         *ucsp++ = ucs_ch;
 324                         continue;
 325                 }
 326
 327                 switch (byte & 0xf0) {
 328                 /*  2 byte sequence*/
 329                 case 0xc0:
 330                 case 0xd0:
 331                         /* extract bits 6 - 10 from first byte */
 332                         ucs_ch = (byte & 0x1F) << 6;
 333                         break;
 334                 /* 3 byte sequence*/
 335                 case 0xe0:
 336                         /* extract bits 12 - 15 from first byte */
 337                         ucs_ch = (byte & 0x0F) << 6;
 338
 339                         /* extract bits 6 - 11 from second byte */
 340                         if (((byte = *utf8p++) & 0xc0) != 0x80)
 341                                 goto stop;
 342
 343                         ucs_ch += (byte & 0x3F);
 344                         ucs_ch <<= 6;
 345                         break;
 346                 default:
 347                         goto stop;
 348                 }
 349
 350                 /* extract bits 0 - 5 from final byte */
 351                 if (((byte = *utf8p++) & 0xc0) != 0x80)
 352                         goto stop;
 353                 ucs_ch += (byte & 0x3F);
 354
 355                 *ucsp++ = ucs_ch;
 356         }
 357 stop:
 358         *ucslen = ucsp - bufstart;
 359 }