bootx.tproj/fs.subproj/HFSCompare.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
   7  *
   8  * This file contains Original Code and/or Modifications of Original Code
   9  * as defined in and that are subject to the Apple Public Source License
  10  * Version 2.0 (the 'License'). You may not use this file except in
  11  * compliance with the License. Please obtain a copy of the License at
  12  * http://www.opensource.apple.com/apsl/ and read it before using this
  13  * file.
  14  *
  15  * The Original Code and all software distributed under the License are
  16  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  17  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  18  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  20  * Please see the License for the specific language governing rights and
  21  * limitations under the License.
  22  *
  23  * @APPLE_LICENSE_HEADER_END@
  24  */
  25 /*
  26  *  HFSCompare.c - Functions for working with and comparing HFS names.
  27  *
  28  *  Copyright (c) 1999-2000 Apple Computer, Inc.
  29  *
  30  *  DRI: Josh de Cesare
  31  */
  32
  33 #include <sl.h>
  34 #include "CaseTables.h"
  35
  36
  37 //_______________________________________________________________________
  38 //
  39 //      Routine:        FastRelString
  40 //
  41 //      Output:         returns -1 if str1 < str2
  42 //                              returns  1 if str1 > str2
  43 //                              return   0 if equal
  44 //
  45 //_______________________________________________________________________
  46
  47 int32_t FastRelString(char *str1, char *str2 )
  48 {
  49         int32_t  bestGuess;
  50         u_int8_t length, length2;
  51
  52
  53         length = *(str1++);
  54         length2 = *(str2++);
  55
  56         if (length == length2)
  57                 bestGuess = 0;
  58         else if (length < length2)
  59                 bestGuess = -1;
  60         else
  61         {
  62                 bestGuess = 1;
  63                 length = length2;
  64         }
  65
  66         while (length--)
  67         {
  68                 u_int32_t aChar, bChar;
  69
  70                 aChar = *(str1++);
  71                 bChar = *(str2++);
  72
  73                 if (aChar != bChar)     /* If they don't match exacly, do case conversion */
  74                 {
  75                         u_int16_t aSortWord, bSortWord;
  76
  77                         aSortWord = gCompareTable[aChar];
  78                         bSortWord = gCompareTable[bChar];
  79
  80                         if (aSortWord > bSortWord)
  81                                 return 1;
  82
  83                         if (aSortWord < bSortWord)
  84                                 return -1;
  85                 }
  86
  87                 /*
  88                  * If characters match exactly, then go on to next character
  89                  * immediately without doing any extra work.
  90                  */
  91         }
  92
  93         /* if you got to here, then return bestGuess */
  94         return bestGuess;
  95 }
  96
  97
  98
  99 //
 100 //      FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
 101 //
 102 //          IF                          RESULT
 103 //      --------------------------
 104 //      str1 < str2             =>      -1
 105 //      str1 = str2             =>       0
 106 //      str1 > str2             =>      +1
 107 //
 108 //      The lower case table starts with 256 entries (one for each of the upper bytes
 109 //      of the original Unicode char).  If that entry is zero, then all characters with
 110 //      that upper byte are already case folded.  If the entry is non-zero, then it is
 111 //      the _index_ (not byte offset) of the start of the sub-table for the characters
 112 //      with that upper byte.  All ignorable characters are folded to the value zero.
 113 //
 114 //      In pseudocode:
 115 //
 116 //              Let c = source Unicode character
 117 //              Let table[] = lower case table
 118 //
 119 //              lower = table[highbyte(c)]
 120 //              if (lower == 0)
 121 //                      lower = c
 122 //              else
 123 //                      lower = table[lower+lowbyte(c)]
 124 //
 125 //              if (lower == 0)
 126 //                      ignore this character
 127 //
 128 //      To handle ignorable characters, we now need a loop to find the next valid character.
 129 //      Also, we can't pre-compute the number of characters to compare; the string length might
 130 //      be larger than the number of non-ignorable characters.  Further, we must be able to handle
 131 //      ignorable characters at any point in the string, including as the first or last characters.
 132 //      We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 133 //      Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 134 //      the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 135 //      an invalid Unicode character).
 136 //
 137 //      Pseudocode:
 138 //
 139 //              while (1) {
 140 //                      c1 = GetNextValidChar(str1)                     //      returns zero if at end of string
 141 //                      c2 = GetNextValidChar(str2)
 142 //
 143 //                      if (c1 != c2) break                                     //      found a difference
 144 //
 145 //                      if (c1 == 0)                                            //      reached end of string on both strings at once?
 146 //                              return 0;                                               //      yes, so strings are equal
 147 //              }
 148 //
 149 //              // When we get here, c1 != c2.  So, we just need to determine which one is less.
 150 //              if (c1 < c2)
 151 //                      return -1;
 152 //              else
 153 //                      return 1;
 154 //
 155
 156 int32_t FastUnicodeCompare (u_int16_t *str1, register u_int32_t length1,
 157                             u_int16_t *str2, register u_int32_t length2)
 158 {
 159         register u_int16_t c1,c2;
 160         register u_int16_t temp;
 161
 162         while (1) {
 163                 /* Set default values for c1, c2 in case there are no more valid chars */
 164                 c1 = 0;
 165                 c2 = 0;
 166
 167                 /* Find next non-ignorable char from str1, or zero if no more */
 168                 while (length1 && c1 == 0) {
 169                         c1 = *(str1++);
 170                         --length1;
 171                         if ((temp = gLowerCaseTable[c1>>8]) != 0)               // is there a subtable for this upper byte?
 172                                 c1 = gLowerCaseTable[temp + (c1 & 0x00FF)];     // yes, so fold the char
 173                 }
 174
 175
 176                 /* Find next non-ignorable char from str2, or zero if no more */
 177                 while (length2 && c2 == 0) {
 178                         c2 = *(str2++);
 179                         --length2;
 180                         if ((temp = gLowerCaseTable[c2>>8]) != 0)               // is there a subtable for this upper byte?
 181                                 c2 = gLowerCaseTable[temp + (c2 & 0x00FF)];     // yes, so fold the char
 182                 }
 183
 184                 if (c1 != c2)   /* found a difference, so stop looping */
 185                         break;
 186
 187                 if (c1 == 0)            /* did we reach the end of both strings at the same time? */
 188                         return 0;       /* yes, so strings are equal */
 189         }
 190
 191         if (c1 < c2)
 192                 return -1;
 193         else
 194                 return 1;
 195 }
 196
 197
 198 /*
 199  * UTF-8 (UCS Transformation Format)
 200  *
 201  * The following subset of UTF-8 is used to encode UCS-2 filenames. It
 202  * requires a maximum of three bytes per UCS-2 character.  Only the
 203  * shortest encoding required to represent the significant UCS-2 bits
 204  * is legal.
 205  *
 206  * UTF-8 Multibyte Codes
 207  *
 208  * Bytes   Bits   UCS-2 Min   UCS-2 Max   UTF-8 Byte Sequence (binary)
 209  * -------------------------------------------------------------------
 210  *   1       7     0x0000      0x007F      0xxxxxxx
 211  *   2      11     0x0080      0x07FF      110xxxxx 10xxxxxx
 212  *   3      16     0x0800      0xFFFF      1110xxxx 10xxxxxx 10xxxxxx
 213  * -------------------------------------------------------------------
 214  */
 215
 216
 /*
  * utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a
  * null terminated UTF-8 string at utf8p.
  *
  * ucslen is the number of UCS-2 input characters (not bytes)
  * bufsize is the size of the output buffer in bytes, including the
  * byte used by the terminating null
  *
  * Embedded NUL characters in the input are skipped.  Encoding stops at
  * the first character that does not fit completely, so the output never
  * ends in a partial sequence.  If bufsize is zero nothing is written.
  */
 void
 utf_encodestr(const u_int16_t *ucsp, int ucslen, u_int8_t *utf8p, u_int32_t bufsize)
 {
         u_int8_t *bufend;
         u_int16_t ucs_ch;

         /* There is no room even for the terminator, so write nothing. */
         if (bufsize == 0)
                 return;

         /* Hold back the last byte so the terminator always fits. */
         bufend = utf8p + (bufsize - 1);

         while (ucslen-- > 0) {
                 ucs_ch = *ucsp++;

                 if (ucs_ch == '\0')
                         continue;       /* skip over embedded NULLs */

                 if (ucs_ch < 0x0080) {
                         /* 1 byte: 0xxxxxxx */
                         if ((bufend - utf8p) < 1)
                                 break;
                         *utf8p++ = (u_int8_t)ucs_ch;

                 } else if (ucs_ch < 0x0800) {
                         /* 2 bytes: 110xxxxx 10xxxxxx */
                         if ((bufend - utf8p) < 2)
                                 break;
                         *utf8p++ = (u_int8_t)((ucs_ch >> 6)   | 0xc0);
                         *utf8p++ = (u_int8_t)((ucs_ch & 0x3f) | 0x80);

                 } else {
                         /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
                         if ((bufend - utf8p) < 3)
                                 break;
                         *utf8p++ = (u_int8_t)((ucs_ch >> 12)         | 0xe0);
                         *utf8p++ = (u_int8_t)(((ucs_ch >> 6) & 0x3f) | 0x80);
                         *utf8p++ = (u_int8_t)((ucs_ch & 0x3f)        | 0x80);
                 }
         }

         /* utf8p is at most bufend here, which is still inside the buffer. */
         *utf8p = '\0';
 }
 260
 261
 /*
  * utf_decodestr - Decodes the null terminated UTF-8 string at
  * utf8p into a UCS-2 (Unicode) string at ucsp.
  *
  * ucslen receives the number of UCS-2 characters (not bytes) written
  * bufsize is the size of the output buffer in bytes; only whole
  * UCS-2 characters are stored, so an odd trailing byte is never used
  *
  * Decoding stops at the terminating null, when the output buffer is
  * full, or at the first byte that does not fit the 1, 2 or 3 byte
  * forms handled here.  The output is not null terminated.
  */
 void
 utf_decodestr(const u_int8_t *utf8p, u_int16_t *ucsp, u_int16_t *ucslen, u_int32_t bufsize)
 {
         u_int16_t *bufstart;
         u_int16_t *bufend;
         u_int16_t ucs_ch;
         u_int8_t  byte;

         bufstart = ucsp;

         /*
          * Count capacity in whole characters.  Byte arithmetic followed by
          * a cast would, for an odd bufsize, give a misaligned end pointer
          * and let one store run past the end of the buffer.
          */
         bufend = ucsp + (bufsize / sizeof(u_int16_t));

         while ((byte = *utf8p++) != '\0') {
                 if (ucsp >= bufend)
                         break;

                 /* check for ascii */
                 if (byte < 0x80) {
                         *ucsp++ = byte;
                         continue;
                 }

                 switch (byte & 0xf0) {
                 /* 2 byte sequence */
                 case 0xc0:
                 case 0xd0:
                         /* extract bits 6 - 10 from first byte */
                         ucs_ch = (byte & 0x1F) << 6;
                         break;
                 /* 3 byte sequence */
                 case 0xe0:
                         /* extract bits 12 - 15 from first byte */
                         ucs_ch = (byte & 0x0F) << 6;

                         /* extract bits 6 - 11 from second byte */
                         if (((byte = *utf8p++) & 0xc0) != 0x80)
                                 goto stop;

                         ucs_ch += (byte & 0x3F);
                         ucs_ch <<= 6;
                         break;
                 /* stray continuation byte, or a lead byte for a longer form */
                 default:
                         goto stop;
                 }

                 /*
                  * extract bits 0 - 5 from final byte; a terminating null
                  * fails this test too, so a truncated sequence cannot make
                  * the loop read past the end of the input
                  */
                 if (((byte = *utf8p++) & 0xc0) != 0x80)
                         goto stop;
                 ucs_ch += (byte & 0x3F);

                 *ucsp++ = ucs_ch;
         }
 stop:
         *ucslen = (u_int16_t)(ucsp - bufstart);
 }