bootx.tproj/fs.subproj/HFSCompare.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /*
  23  *  HFSCompare.c - Functions for working with and comparing HFS nams.
  24  *
  25  *  Copyright (c) 1999-2000 Apple Computer, Inc.
  26  *
  27  *  DRI: Josh de Cesare
  28  */
  29
  30 #include <sl.h>
  31 #include "CaseTables.h"
  32
  33
  34 //_______________________________________________________________________
  35 //
  36 //      Routine:        FastRelString
  37 //
  38 //      Output:         returns -1 if str1 < str2
  39 //                              returns  1 if str1 > str2
  40 //                              return   0 if equal
  41 //
  42 //_______________________________________________________________________
  43
  44 int32_t FastRelString(char *str1, char *str2 )
  45 {
  46         int32_t  bestGuess;
  47         u_int8_t length, length2;
  48
  49
  50         length = *(str1++);
  51         length2 = *(str2++);
  52
  53         if (length == length2)
  54                 bestGuess = 0;
  55         else if (length < length2)
  56                 bestGuess = -1;
  57         else
  58         {
  59                 bestGuess = 1;
  60                 length = length2;
  61         }
  62
  63         while (length--)
  64         {
  65                 u_int32_t aChar, bChar;
  66
  67                 aChar = *(str1++);
  68                 bChar = *(str2++);
  69
  70                 if (aChar != bChar)     /* If they don't match exacly, do case conversion */
  71                 {
  72                         u_int16_t aSortWord, bSortWord;
  73
  74                         aSortWord = gCompareTable[aChar];
  75                         bSortWord = gCompareTable[bChar];
  76
  77                         if (aSortWord > bSortWord)
  78                                 return 1;
  79
  80                         if (aSortWord < bSortWord)
  81                                 return -1;
  82                 }
  83
  84                 /*
  85                  * If characters match exactly, then go on to next character
  86                  * immediately without doing any extra work.
  87                  */
  88         }
  89
  90         /* if you got to here, then return bestGuess */
  91         return bestGuess;
  92 }
  93
  94
  95
  96 //
  97 //      FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
  98 //
  99 //          IF                          RESULT
 100 //      --------------------------
 101 //      str1 < str2             =>      -1
 102 //      str1 = str2             =>       0
 103 //      str1 > str2             =>      +1
 104 //
 105 //      The lower case table starts with 256 entries (one for each of the upper bytes
 106 //      of the original Unicode char).  If that entry is zero, then all characters with
 107 //      that upper byte are already case folded.  If the entry is non-zero, then it is
 108 //      the _index_ (not byte offset) of the start of the sub-table for the characters
 109 //      with that upper byte.  All ignorable characters are folded to the value zero.
 110 //
 111 //      In pseudocode:
 112 //
 113 //              Let c = source Unicode character
 114 //              Let table[] = lower case table
 115 //
 116 //              lower = table[highbyte(c)]
 117 //              if (lower == 0)
 118 //                      lower = c
 119 //              else
 120 //                      lower = table[lower+lowbyte(c)]
 121 //
 122 //              if (lower == 0)
 123 //                      ignore this character
 124 //
 125 //      To handle ignorable characters, we now need a loop to find the next valid character.
 126 //      Also, we can't pre-compute the number of characters to compare; the string length might
 127 //      be larger than the number of non-ignorable characters.  Further, we must be able to handle
 128 //      ignorable characters at any point in the string, including as the first or last characters.
 129 //      We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 130 //      Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 131 //      the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 132 //      an invalid Unicode character).
 133 //
 134 //      Pseudocode:
 135 //
 136 //              while (1) {
 137 //                      c1 = GetNextValidChar(str1)                     //      returns zero if at end of string
 138 //                      c2 = GetNextValidChar(str2)
 139 //
 140 //                      if (c1 != c2) break                                     //      found a difference
 141 //
 142 //                      if (c1 == 0)                                            //      reached end of string on both strings at once?
 143 //                              return 0;                                               //      yes, so strings are equal
 144 //              }
 145 //
 146 //              // When we get here, c1 != c2.  So, we just need to determine which one is less.
 147 //              if (c1 < c2)
 148 //                      return -1;
 149 //              else
 150 //                      return 1;
 151 //
 152
 153 int32_t FastUnicodeCompare (u_int16_t *str1, register u_int32_t length1,
 154                             u_int16_t *str2, register u_int32_t length2)
 155 {
 156         register u_int16_t c1,c2;
 157         register u_int16_t temp;
 158
 159         while (1) {
 160                 /* Set default values for c1, c2 in case there are no more valid chars */
 161                 c1 = 0;
 162                 c2 = 0;
 163
 164                 /* Find next non-ignorable char from str1, or zero if no more */
 165                 while (length1 && c1 == 0) {
 166                         c1 = *(str1++);
 167                         --length1;
 168                         if ((temp = gLowerCaseTable[c1>>8]) != 0)               // is there a subtable for this upper byte?
 169                                 c1 = gLowerCaseTable[temp + (c1 & 0x00FF)];     // yes, so fold the char
 170                 }
 171
 172
 173                 /* Find next non-ignorable char from str2, or zero if no more */
 174                 while (length2 && c2 == 0) {
 175                         c2 = *(str2++);
 176                         --length2;
 177                         if ((temp = gLowerCaseTable[c2>>8]) != 0)               // is there a subtable for this upper byte?
 178                                 c2 = gLowerCaseTable[temp + (c2 & 0x00FF)];     // yes, so fold the char
 179                 }
 180
 181                 if (c1 != c2)   /* found a difference, so stop looping */
 182                         break;
 183
 184                 if (c1 == 0)            /* did we reach the end of both strings at the same time? */
 185                         return 0;       /* yes, so strings are equal */
 186         }
 187
 188         if (c1 < c2)
 189                 return -1;
 190         else
 191                 return 1;
 192 }
 193
 194
 195 /*
 196  * UTF-8 (UCS Transformation Format)
 197  *
 198  * The following subset of UTF-8 is used to encode UCS-2 filenames. It
 199  * requires a maximum of three 3 bytes per UCS-2 character.  Only the
 200  * shortest encoding required to represent the significant UCS-2 bits
 201  * is legal.
 202  *
 203  * UTF-8 Multibyte Codes
 204  *
 205  * Bytes   Bits   UCS-2 Min   UCS-2 Max   UTF-8 Byte Sequence (binary)
 206  * -------------------------------------------------------------------
 207  *   1       7     0x0000      0x007F      0xxxxxxx
 208  *   2      11     0x0080      0x07FF      110xxxxx 10xxxxxx
 209  *   3      16     0x0800      0xFFFF      1110xxxx 10xxxxxx 10xxxxxx
 210  * -------------------------------------------------------------------
 211  */
 212
 213
 214 /*
 215  * utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a
 216  * null terminated UTF-8 string at utf8p.
 217  *
 218  * ucslen is the number of UCS-2 input characters (not bytes)
 219  * bufsize is the size of the output buffer in bytes
 220  */
 221 void
 222 utf_encodestr(const u_int16_t *ucsp, int ucslen, u_int8_t *utf8p, u_int32_t bufsize)
 223 {
 224         u_int8_t *bufend;
 225         u_int16_t ucs_ch;
 226
 227         bufend = utf8p + bufsize;
 228
 229         while (ucslen-- > 0) {
 230                 ucs_ch = *ucsp++;
 231
 232                 if (ucs_ch < 0x0080) {
 233                         if (utf8p >= bufend)
 234                                 break;
 235                         if (ucs_ch == '\0')
 236                                 continue;       /* skip over embedded NULLs */
 237                         *utf8p++ = ucs_ch;
 238
 239                 } else if (ucs_ch < 0x800) {
 240                         if ((utf8p + 1) >= bufend)
 241                                 break;
 242                         *utf8p++ = (ucs_ch >> 6)   | 0xc0;
 243                         *utf8p++ = (ucs_ch & 0x3f) | 0x80;
 244
 245                 } else {
 246                         if ((utf8p + 2) >= bufend)
 247                                 break;
 248                         *utf8p++ = (ucs_ch >> 12)         | 0xe0;
 249                         *utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;
 250                         *utf8p++ = ((ucs_ch) & 0x3f)      | 0x80;
 251                 }
 252
 253         }
 254
 255         *utf8p = '\0';
 256 }
 257
 258
 259 /*
 260  * utf_decodestr - Decodes the null terminated UTF-8 string at
 261  * utf8p into a UCS-2 (Unicode) string at ucsp.
 262  *
 263  * ucslen is the number of UCS-2 output characters (not bytes)
 264  * bufsize is the size of the output buffer in bytes
 265  */
 266 void
 267 utf_decodestr(const u_int8_t *utf8p, u_int16_t *ucsp, u_int16_t *ucslen, u_int32_t bufsize)
 268 {
 269         u_int16_t *bufstart;
 270         u_int16_t *bufend;
 271         u_int16_t ucs_ch;
 272         u_int8_t  byte;
 273
 274         bufstart = ucsp;
 275         bufend = (u_int16_t *)((u_int8_t *)ucsp + bufsize);
 276
 277         while ((byte = *utf8p++) != '\0') {
 278                 if (ucsp >= bufend)
 279                         break;
 280
 281                 /* check for ascii */
 282                 if (byte < 0x80) {
 283                         ucs_ch = byte;
 284
 285                         *ucsp++ = ucs_ch;
 286                         continue;
 287                 }
 288
 289                 switch (byte & 0xf0) {
 290                 /*  2 byte sequence*/
 291                 case 0xc0:
 292                 case 0xd0:
 293                         /* extract bits 6 - 10 from first byte */
 294                         ucs_ch = (byte & 0x1F) << 6;
 295                         break;
 296                 /* 3 byte sequence*/
 297                 case 0xe0:
 298                         /* extract bits 12 - 15 from first byte */
 299                         ucs_ch = (byte & 0x0F) << 6;
 300
 301                         /* extract bits 6 - 11 from second byte */
 302                         if (((byte = *utf8p++) & 0xc0) != 0x80)
 303                                 goto stop;
 304
 305                         ucs_ch += (byte & 0x3F);
 306                         ucs_ch <<= 6;
 307                         break;
 308                 default:
 309                         goto stop;
 310                 }
 311
 312                 /* extract bits 0 - 5 from final byte */
 313                 if (((byte = *utf8p++) & 0xc0) != 0x80)
 314                         goto stop;
 315                 ucs_ch += (byte & 0x3F);
 316
 317                 *ucsp++ = ucs_ch;
 318         }
 319 stop:
 320         *ucslen = ucsp - bufstart;
 321 }