i386/libsaio/hfs_compare.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /*
  23  *  HFSCompare.c - Functions for working with and comparing HFS nams.
  24  *
  25  *  Copyright (c) 1999-2000 Apple Computer, Inc.
  26  *
  27  *  DRI: Josh de Cesare
  28  */
  29
  30 #include <sl.h>
  31 #include "hfs_CaseTables.h"
  32
  33 //_______________________________________________________________________
  34 //
  35 //      Routine:        FastRelString
  36 //
  37 //      Output:         returns -1 if str1 < str2
  38 //                              returns  1 if str1 > str2
  39 //                              return   0 if equal
  40 //
  41 //_______________________________________________________________________
  42
  43 int32_t FastRelString(char * str1, char * str2)
  44 {
  45         int32_t  bestGuess;
  46         u_int8_t length, length2;
  47
  48         length = *(str1++);
  49         length2 = *(str2++);
  50
  51         if (length == length2)
  52                 bestGuess = 0;
  53         else if (length < length2)
  54                 bestGuess = -1;
  55         else
  56         {
  57                 bestGuess = 1;
  58                 length = length2;
  59         }
  60
  61         while (length--)
  62         {
  63                 u_int32_t aChar, bChar;
  64
  65                 aChar = *(str1++);
  66                 bChar = *(str2++);
  67
  68                 if (aChar != bChar)     /* If they don't match exacly, do case conversion */
  69                 {
  70                         u_int16_t aSortWord, bSortWord;
  71
  72                         aSortWord = gCompareTable[aChar];
  73                         bSortWord = gCompareTable[bChar];
  74
  75                         if (aSortWord > bSortWord)
  76                                 return 1;
  77
  78                         if (aSortWord < bSortWord)
  79                                 return -1;
  80                 }
  81
  82                 /*
  83                  * If characters match exactly, then go on to next character
  84                  * immediately without doing any extra work.
  85                  */
  86         }
  87
  88         /* if you got to here, then return bestGuess */
  89         return bestGuess;
  90 }
  91
  92
  93 //
  94 //      FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering
  95 //
  96 //          IF                          RESULT
  97 //      --------------------------
  98 //      str1 < str2             =>      -1
  99 //      str1 = str2             =>       0
 100 //      str1 > str2             =>      +1
 101 //
 102 //      The lower case table starts with 256 entries (one for each of the upper bytes
 103 //      of the original Unicode char).  If that entry is zero, then all characters with
 104 //      that upper byte are already case folded.  If the entry is non-zero, then it is
 105 //      the _index_ (not byte offset) of the start of the sub-table for the characters
 106 //      with that upper byte.  All ignorable characters are folded to the value zero.
 107 //
 108 //      In pseudocode:
 109 //
 110 //              Let c = source Unicode character
 111 //              Let table[] = lower case table
 112 //
 113 //              lower = table[highbyte(c)]
 114 //              if (lower == 0)
 115 //                      lower = c
 116 //              else
 117 //                      lower = table[lower+lowbyte(c)]
 118 //
 119 //              if (lower == 0)
 120 //                      ignore this character
 121 //
 122 //      To handle ignorable characters, we now need a loop to find the next valid character.
 123 //      Also, we can't pre-compute the number of characters to compare; the string length might
 124 //      be larger than the number of non-ignorable characters.  Further, we must be able to handle
 125 //      ignorable characters at any point in the string, including as the first or last characters.
 126 //      We use a zero value as a sentinel to detect both end-of-string and ignorable characters.
 127 //      Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,
 128 //      the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is
 129 //      an invalid Unicode character).
 130 //
 131 //      Pseudocode:
 132 //
 133 //              while (1) {
 134 //                      c1 = GetNextValidChar(str1)                     //      returns zero if at end of string
 135 //                      c2 = GetNextValidChar(str2)
 136 //
 137 //                      if (c1 != c2) break                                     //      found a difference
 138 //
 139 //                      if (c1 == 0)                                            //      reached end of string on both strings at once?
 140 //                              return 0;                                               //      yes, so strings are equal
 141 //              }
 142 //
 143 //              // When we get here, c1 != c2.  So, we just need to determine which one is less.
 144 //              if (c1 < c2)
 145 //                      return -1;
 146 //              else
 147 //                      return 1;
 148 //
 149
 150 int32_t FastUnicodeCompare( u_int16_t * str1, register u_int32_t length1,
 151                             u_int16_t * str2, register u_int32_t length2 )
 152 {
 153         register u_int16_t c1,c2;
 154         register u_int16_t temp;
 155
 156         while (1) {
 157                 /* Set default values for c1, c2 in case there are no more valid chars */
 158                 c1 = 0;
 159                 c2 = 0;
 160
 161                 /* Find next non-ignorable char from str1, or zero if no more */
 162                 while (length1 && c1 == 0) {
 163                         c1 = SWAP_BE16(*(str1++));
 164                         --length1;
 165                         if ((temp = gLowerCaseTable[c1>>8]) != 0)               // is there a subtable for this upper byte?
 166                                 c1 = gLowerCaseTable[temp + (c1 & 0x00FF)];     // yes, so fold the char
 167                 }
 168
 169                 /* Find next non-ignorable char from str2, or zero if no more */
 170                 while (length2 && c2 == 0) {
 171                         c2 = SWAP_BE16(*(str2++));
 172                         --length2;
 173                         if ((temp = gLowerCaseTable[c2>>8]) != 0)               // is there a subtable for this upper byte?
 174                                 c2 = gLowerCaseTable[temp + (c2 & 0x00FF)];     // yes, so fold the char
 175                 }
 176
 177                 if (c1 != c2)   /* found a difference, so stop looping */
 178                         break;
 179
 180                 if (c1 == 0)            /* did we reach the end of both strings at the same time? */
 181                         return 0;       /* yes, so strings are equal */
 182         }
 183
 184         if (c1 < c2)
 185                 return -1;
 186         else
 187                 return 1;
 188 }
 189
 190
 191 /*
 192  * UTF-8 (UCS Transformation Format)
 193  *
 194  * The following subset of UTF-8 is used to encode UCS-2 filenames. It
 195  * requires a maximum of three 3 bytes per UCS-2 character.  Only the
 196  * shortest encoding required to represent the significant UCS-2 bits
 197  * is legal.
 198  *
 199  * UTF-8 Multibyte Codes
 200  *
 201  * Bytes   Bits   UCS-2 Min   UCS-2 Max   UTF-8 Byte Sequence (binary)
 202  * -------------------------------------------------------------------
 203  *   1       7     0x0000      0x007F      0xxxxxxx
 204  *   2      11     0x0080      0x07FF      110xxxxx 10xxxxxx
 205  *   3      16     0x0800      0xFFFF      1110xxxx 10xxxxxx 10xxxxxx
 206  * -------------------------------------------------------------------
 207  */
 208
 209
 210 /*
 211  * utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a
 212  * null terminated UTF-8 string at utf8p.
 213  *
 214  * ucslen is the number of UCS-2 input characters (not bytes)
 215  * bufsize is the size of the output buffer in bytes
 216  */
 217 void
 218 utf_encodestr( const u_int16_t * ucsp, int ucslen,
 219                u_int8_t * utf8p, u_int32_t bufsize )
 220 {
 221         u_int8_t *bufend;
 222         u_int16_t ucs_ch;
 223
 224         bufend = utf8p + bufsize;
 225
 226         while (ucslen-- > 0) {
 227                 ucs_ch = SWAP_BE16(*ucsp++);
 228
 229                 if (ucs_ch < 0x0080) {
 230                         if (utf8p >= bufend)
 231                                 break;
 232                         if (ucs_ch == '\0')
 233                                 continue;       /* skip over embedded NULLs */
 234                         *utf8p++ = ucs_ch;
 235
 236                 } else if (ucs_ch < 0x800) {
 237                         if ((utf8p + 1) >= bufend)
 238                                 break;
 239                         *utf8p++ = (ucs_ch >> 6)   | 0xc0;
 240                         *utf8p++ = (ucs_ch & 0x3f) | 0x80;
 241
 242                 } else {
 243                         if ((utf8p + 2) >= bufend)
 244                                 break;
 245                         *utf8p++ = (ucs_ch >> 12)         | 0xe0;
 246                         *utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;
 247                         *utf8p++ = ((ucs_ch) & 0x3f)      | 0x80;
 248                 }
 249         }
 250
 251         *utf8p = '\0';
 252 }
 253
 254
 255 /*
 256  * utf_decodestr - Decodes the null terminated UTF-8 string at
 257  * utf8p into a UCS-2 (Unicode) string at ucsp.
 258  *
 259  * ucslen is the number of UCS-2 output characters (not bytes)
 260  * bufsize is the size of the output buffer in bytes
 261  */
 262 void utf_decodestr(const u_int8_t * utf8p, u_int16_t * ucsp, u_int16_t * ucslen, u_int32_t bufsize)
 263 {
 264         u_int16_t *bufstart;
 265         u_int16_t *bufend;
 266         u_int16_t ucs_ch;
 267         u_int8_t  byte;
 268
 269         bufstart = ucsp;
 270         bufend = (u_int16_t *)((u_int8_t *)ucsp + bufsize);
 271
 272         while ((byte = *utf8p++) != '\0') {
 273                 if (ucsp >= bufend)
 274                         break;
 275
 276                 /* check for ascii */
 277                 if (byte < 0x80) {
 278                         ucs_ch = byte;
 279
 280                         *ucsp++ = SWAP_BE16(ucs_ch);
 281                         continue;
 282                 }
 283
 284                 switch (byte & 0xf0) {
 285                 /*  2 byte sequence*/
 286                 case 0xc0:
 287                 case 0xd0:
 288                         /* extract bits 6 - 10 from first byte */
 289                         ucs_ch = (byte & 0x1F) << 6;
 290                         break;
 291                 /* 3 byte sequence*/
 292                 case 0xe0:
 293                         /* extract bits 12 - 15 from first byte */
 294                         ucs_ch = (byte & 0x0F) << 6;
 295
 296                         /* extract bits 6 - 11 from second byte */
 297                         if (((byte = *utf8p++) & 0xc0) != 0x80)
 298                                 goto stop;
 299
 300                         ucs_ch += (byte & 0x3F);
 301                         ucs_ch <<= 6;
 302                         break;
 303                 default:
 304                         goto stop;
 305                 }
 306
 307                 /* extract bits 0 - 5 from final byte */
 308                 if (((byte = *utf8p++) & 0xc0) != 0x80)
 309                         goto stop;
 310                 ucs_ch += (byte & 0x3F);
 311
 312                 *ucsp++ = SWAP_BE16(ucs_ch);
 313         }
 314 stop:
 315         *ucslen = SWAP_BE16(ucsp - bufstart);
 316 }