[apple/xnu.git] / bsd / hfs / hfs_readwrite.c
1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* @(#)hfs_readwrite.c 1.0
29 *
30 * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved
31 *
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/resourcevar.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/filedesc.h>
42 #include <sys/stat.h>
43 #include <sys/buf.h>
44 #include <sys/proc.h>
45 #include <sys/kauth.h>
46 #include <sys/vnode.h>
47 #include <sys/vnode_internal.h>
48 #include <sys/uio.h>
49 #include <sys/vfs_context.h>
50 #include <sys/fsevents.h>
51 #include <kern/kalloc.h>
52 #include <sys/disk.h>
53 #include <sys/sysctl.h>
54
55 #include <miscfs/specfs/specdev.h>
56
57 #include <sys/ubc.h>
58 #include <sys/ubc_internal.h>
59
60 #include <vm/vm_pageout.h>
61 #include <vm/vm_kern.h>
62
63 #include <sys/kdebug.h>
64
65 #include "hfs.h"
66 #include "hfs_attrlist.h"
67 #include "hfs_endian.h"
68 #include "hfs_fsctl.h"
69 #include "hfs_quota.h"
70 #include "hfscommon/headers/FileMgrInternal.h"
71 #include "hfscommon/headers/BTreesInternal.h"
72 #include "hfs_cnode.h"
73 #include "hfs_dbg.h"
74
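/*
 * can_cluster: the cluster I/O path is used only when the block size is a
 * multiple of 4K and no larger than half of MAXPHYSIO (see the macro below).
 */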
75 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
76
77 enum {
78 MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */
79 };
80
81 /* from bsd/vfs/vfs_cluster.c */
82 extern int is_file_clean(vnode_t vp, off_t filesize);
83
84 static int hfs_clonelink(struct vnode *, int, kauth_cred_t, struct proc *);
85 static int hfs_clonefile(struct vnode *, int, int, int);
86 static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
87
88 int flush_cache_on_write = 0;
89 SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
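/*
 * The knob above lives under the _kern sysctl namespace, so it can be
 * toggled at run time, e.g. (illustrative): sysctl -w kern.flush_cache_on_write=1
 */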
90
91
92 /*
93 * Read data from a file.
94 */
95 int
96 hfs_vnop_read(struct vnop_read_args *ap)
97 {
98 uio_t uio = ap->a_uio;
99 struct vnode *vp = ap->a_vp;
100 struct cnode *cp;
101 struct filefork *fp;
102 struct hfsmount *hfsmp;
103 off_t filesize;
104 off_t filebytes;
105 off_t start_resid = uio_resid(uio);
106 off_t offset = uio_offset(uio);
107 int retval = 0;
108
109
110 /* Preflight checks */
111 if (!vnode_isreg(vp)) {
112 /* can only read regular files */
113 if (vnode_isdir(vp))
114 return (EISDIR);
115 else
116 return (EPERM);
117 }
118 if (start_resid == 0)
119 return (0); /* Nothing left to do */
120 if (offset < 0)
121 return (EINVAL); /* can't read from a negative offset */
122
123 cp = VTOC(vp);
124 fp = VTOF(vp);
125 hfsmp = VTOHFS(vp);
126
127 /* Protect against a size change. */
128 hfs_lock_truncate(cp, 0);
129
130 filesize = fp->ff_size;
131 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
132 if (offset > filesize) {
133 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
134 (offset > (off_t)MAXHFSFILESIZE)) {
135 retval = EFBIG;
136 }
137 goto exit;
138 }
139
140 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_START,
141 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
142
143 retval = cluster_read(vp, uio, filesize, ap->a_ioflag);
144
145 cp->c_touch_acctime = TRUE;
146
147 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 12)) | DBG_FUNC_END,
148 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
149
150 /*
151 * Keep track of blocks read
152 */
153 if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
154 int took_cnode_lock = 0;
155 off_t bytesread;
156
157 bytesread = start_resid - uio_resid(uio);
158
159 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
160 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
161 hfs_lock(cp, HFS_FORCE_LOCK);
162 took_cnode_lock = 1;
163 }
164 /*
165 * If this file hasn't been seen since the start of
166 * the current sampling period then start over.
167 */
168 if (cp->c_atime < hfsmp->hfc_timebase) {
169 struct timeval tv;
170
171 fp->ff_bytesread = bytesread;
172 microtime(&tv);
173 cp->c_atime = tv.tv_sec;
174 } else {
175 fp->ff_bytesread += bytesread;
176 }
177 if (took_cnode_lock)
178 hfs_unlock(cp);
179 }
180 exit:
181 hfs_unlock_truncate(cp, 0);
182 return (retval);
183 }
184
185 /*
186 * Write data to a file.
187 */
188 int
189 hfs_vnop_write(struct vnop_write_args *ap)
190 {
191 uio_t uio = ap->a_uio;
192 struct vnode *vp = ap->a_vp;
193 struct cnode *cp;
194 struct filefork *fp;
195 struct hfsmount *hfsmp;
196 kauth_cred_t cred = NULL;
197 off_t origFileSize;
198 off_t writelimit;
199 off_t bytesToAdd = 0;
200 off_t actualBytesAdded;
201 off_t filebytes;
202 off_t offset;
203 size_t resid;
204 int eflags;
205 int ioflag = ap->a_ioflag;
206 int retval = 0;
207 int lockflags;
208 int cnode_locked = 0;
209 int partialwrite = 0;
210 int exclusive_lock = 0;
211
212 // LP64todo - fix this! uio_resid may be a 64-bit value
213 resid = uio_resid(uio);
214 offset = uio_offset(uio);
215
216 if (ioflag & IO_APPEND) {
217 exclusive_lock = 1;
218 }
219
220 if (offset < 0)
221 return (EINVAL);
222 if (resid == 0)
223 return (E_NONE);
224 if (!vnode_isreg(vp))
225 return (EPERM); /* Can only write regular files */
226
227 cp = VTOC(vp);
228 fp = VTOF(vp);
229 hfsmp = VTOHFS(vp);
230
231 eflags = kEFDeferMask; /* defer file block allocations */
232 #ifdef HFS_SPARSE_DEV
233 /*
234 * When the underlying device is sparse and space
235 * is low (< 8MB), stop doing delayed allocations
236 * and begin doing synchronous I/O.
237 */
238 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
239 (hfs_freeblks(hfsmp, 0) < 2048)) {
240 eflags &= ~kEFDeferMask;
241 ioflag |= IO_SYNC;
242 }
243 #endif /* HFS_SPARSE_DEV */
244
245 again:
246 /* Protect against a size change. */
247 hfs_lock_truncate(cp, exclusive_lock);
248
249 if (ioflag & IO_APPEND) {
250 uio_setoffset(uio, fp->ff_size);
251 offset = fp->ff_size;
252 }
253 if ((cp->c_flags & APPEND) && offset != fp->ff_size) {
254 retval = EPERM;
255 goto exit;
256 }
257
258 origFileSize = fp->ff_size;
259 writelimit = offset + resid;
260 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
261
262 /* If the truncate lock is shared, and if we either have virtual
263 * blocks or will need to extend the file, upgrade the truncate
264 * lock to exclusive. If the upgrade fails, we lose the shared lock
265 * and have to acquire the exclusive lock again.
266 */
267 if ((exclusive_lock == 0) &&
268 ((fp->ff_unallocblocks != 0) || (writelimit > filebytes))) {
269 exclusive_lock = 1;
270 /* Lock upgrade failed and we lost our shared lock, try again */
271 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
272 goto again;
273 }
274 }
275
276 if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) {
277 goto exit;
278 }
279 cnode_locked = 1;
280
281 if (!exclusive_lock) {
282 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START,
283 (int)offset, uio_resid(uio), (int)fp->ff_size,
284 (int)filebytes, 0);
285 }
286
287 /* Check if we do not need to extend the file */
288 if (writelimit <= filebytes) {
289 goto sizeok;
290 }
291
292 cred = vfs_context_ucred(ap->a_context);
293 bytesToAdd = writelimit - filebytes;
294
295 #if QUOTA
296 retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
297 cred, 0);
298 if (retval)
299 goto exit;
300 #endif /* QUOTA */
301
302 if (hfs_start_transaction(hfsmp) != 0) {
303 retval = EINVAL;
304 goto exit;
305 }
306
307 while (writelimit > filebytes) {
308 bytesToAdd = writelimit - filebytes;
309 if (cred && suser(cred, NULL) != 0)
310 eflags |= kEFReserveMask;
311
312 /* Protect extents b-tree and allocation bitmap */
313 lockflags = SFL_BITMAP;
314 if (overflow_extents(fp))
315 lockflags |= SFL_EXTENTS;
316 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
317
318 /* Files that are changing size are not hot file candidates. */
319 if (hfsmp->hfc_stage == HFC_RECORDING) {
320 fp->ff_bytesread = 0;
321 }
322 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
323 0, eflags, &actualBytesAdded));
324
325 hfs_systemfile_unlock(hfsmp, lockflags);
326
327 if ((actualBytesAdded == 0) && (retval == E_NONE))
328 retval = ENOSPC;
329 if (retval != E_NONE)
330 break;
331 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
332 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_NONE,
333 (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
334 }
335 (void) hfs_update(vp, TRUE);
336 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
337 (void) hfs_end_transaction(hfsmp);
338
339 /*
340 * If we didn't grow the file enough, try a partial write.
341 * POSIX expects this behavior.
342 */
343 if ((retval == ENOSPC) && (filebytes > offset)) {
344 retval = 0;
345 partialwrite = 1;
346 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
347 resid -= bytesToAdd;
348 writelimit = filebytes;
349 }
350 sizeok:
351 if (retval == E_NONE) {
352 off_t filesize;
353 off_t zero_off;
354 off_t tail_off;
355 off_t inval_start;
356 off_t inval_end;
357 off_t io_start;
358 int lflag;
359 struct rl_entry *invalid_range;
360
361 if (writelimit > fp->ff_size)
362 filesize = writelimit;
363 else
364 filesize = fp->ff_size;
365
366 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
367
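/*
 * Decide whether the head of the transfer (bytes between the old EOF or
 * the enclosing page boundary and the start of the write) and, further
 * below, the tail of the last page need to be zero-filled as part of this
 * transfer, so stale or invalid data is never exposed through the cache.
 */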
368 if (offset <= fp->ff_size) {
369 zero_off = offset & ~PAGE_MASK_64;
370
371 /* Check whether the area between zero_off and the start of the
372 transfer is invalid and should be zero-filled as part of the
373 transfer:
374 */
375 if (offset > zero_off) {
376 if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
377 lflag |= IO_HEADZEROFILL;
378 }
379 } else {
380 off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
381
382 /* The bytes between fp->ff_size and uio->uio_offset must never be
383 read without being zeroed. The current last block is filled with zeroes
384 if it holds valid data, but in all cases merely do a little bookkeeping
385 to track the area from the end of the current last page to the start of
386 the area actually written. For the same reason only the bytes up to the
387 start of the page where this write will start are invalidated; any remainder
388 before uio->uio_offset is explicitly zeroed as part of the cluster_write.
389 
390 Note that inval_start, the start of the page after the current EOF,
391 may be past the start of the write, in which case the zeroing
392 will be handled by the cluster_write of the actual data.
393 */
394 inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
395 inval_end = offset & ~PAGE_MASK_64;
396 zero_off = fp->ff_size;
397
398 if ((fp->ff_size & PAGE_MASK_64) &&
399 (rl_scan(&fp->ff_invalidranges,
400 eof_page_base,
401 fp->ff_size - 1,
402 &invalid_range) != RL_NOOVERLAP)) {
403 /* The page containing the EOF is not valid, so the
404 entire page must be made inaccessible now. If the write
405 starts on a page beyond the page containing the eof
406 (inval_end > eof_page_base), add the
407 whole page to the range to be invalidated. Otherwise
408 (i.e. if the write starts on the same page), zero-fill
409 the entire page explicitly now:
410 */
411 if (inval_end > eof_page_base) {
412 inval_start = eof_page_base;
413 } else {
414 zero_off = eof_page_base;
415 };
416 };
417
418 if (inval_start < inval_end) {
419 struct timeval tv;
420 /* There's some range of data that's going to be marked invalid */
421
422 if (zero_off < inval_start) {
423 /* The pages between inval_start and inval_end are going to be invalidated,
424 and the actual write will start on a page past inval_end. Now's the last
425 chance to zero-fill the page containing the EOF:
426 */
427 hfs_unlock(cp);
428 cnode_locked = 0;
429 retval = cluster_write(vp, (uio_t) 0,
430 fp->ff_size, inval_start,
431 zero_off, (off_t)0,
432 lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
433 hfs_lock(cp, HFS_FORCE_LOCK);
434 cnode_locked = 1;
435 if (retval) goto ioerr_exit;
436 offset = uio_offset(uio);
437 };
438
439 /* Mark the remaining area of the newly allocated space as invalid: */
440 rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
441 microuptime(&tv);
442 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
443 zero_off = fp->ff_size = inval_end;
444 };
445
446 if (offset > zero_off) lflag |= IO_HEADZEROFILL;
447 };
448
449 /* Check to see whether the area between the end of the write and the end of
450 the page it falls in is invalid and should be zero-filled as part of the transfer:
451 */
452 tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
453 if (tail_off > filesize) tail_off = filesize;
454 if (tail_off > writelimit) {
455 if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
456 lflag |= IO_TAILZEROFILL;
457 };
458 };
459
460 /*
461 * if the write starts beyond the current EOF (possibly advanced in the
462 * zeroing of the last block, above), then we'll zero fill from the current EOF
463 * to where the write begins:
464 *
465 * NOTE: If (and ONLY if) the portion of the file about to be written is
466 * before the current EOF it might be marked as invalid now and must be
467 * made readable (removed from the invalid ranges) before cluster_write
468 * tries to write it:
469 */
470 io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
471 if (io_start < fp->ff_size) {
472 off_t io_end;
473
474 io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
475 rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
476 };
477
478 hfs_unlock(cp);
479 cnode_locked = 0;
480 retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
481 tail_off, lflag | IO_NOZERODIRTY);
482 if (retval) {
483 goto ioerr_exit;
484 }
485 offset = uio_offset(uio);
486 if (offset > fp->ff_size) {
487 fp->ff_size = offset;
488
489 ubc_setsize(vp, fp->ff_size); /* XXX check errors */
490 /* Files that are changing size are not hot file candidates. */
491 if (hfsmp->hfc_stage == HFC_RECORDING)
492 fp->ff_bytesread = 0;
493 }
494 if (resid > uio_resid(uio)) {
495 cp->c_touch_chgtime = TRUE;
496 cp->c_touch_modtime = TRUE;
497 }
498 }
499 if (partialwrite) {
500 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
501 resid += bytesToAdd;
502 }
503
504 // XXXdbg - see radar 4871353 for more info
505 {
506 if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
507 VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
508 }
509 }
510 HFS_KNOTE(vp, NOTE_WRITE);
511
512 ioerr_exit:
513 /*
514 * If we successfully wrote any data and we are not the superuser,
515 * we clear the setuid and setgid bits as a precaution against
516 * tampering.
517 */
518 if (cp->c_mode & (S_ISUID | S_ISGID)) {
519 cred = vfs_context_ucred(ap->a_context);
520 if (resid > uio_resid(uio) && cred && suser(cred, NULL)) {
521 if (!cnode_locked) {
522 hfs_lock(cp, HFS_FORCE_LOCK);
523 cnode_locked = 1;
524 }
525 cp->c_mode &= ~(S_ISUID | S_ISGID);
526 }
527 }
528 if (retval) {
529 if (ioflag & IO_UNIT) {
530 if (!cnode_locked) {
531 hfs_lock(cp, HFS_FORCE_LOCK);
532 cnode_locked = 1;
533 }
534 (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
535 0, ap->a_context);
536 // LP64todo - fix this! resid needs to be user_ssize_t
537 uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
538 uio_setresid(uio, resid);
539 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
540 }
541 } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio))) {
542 if (!cnode_locked) {
543 hfs_lock(cp, HFS_FORCE_LOCK);
544 cnode_locked = 1;
545 }
546 retval = hfs_update(vp, TRUE);
547 }
548 /* Updating vcbWrCnt doesn't need to be atomic. */
549 hfsmp->vcbWrCnt++;
550
551 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_END,
552 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
553 exit:
554 if (cnode_locked)
555 hfs_unlock(cp);
556 hfs_unlock_truncate(cp, exclusive_lock);
557 return (retval);
558 }
559
560 /* support for the "bulk-access" fcntl */
561
562 #define CACHE_LEVELS 16
563 #define NUM_CACHE_ENTRIES (64*16)
564 #define PARENT_IDS_FLAG 0x100
565
566 struct access_cache {
567 int numcached;
568 int cachehits; /* these two for statistics gathering */
569 int lookups;
570 unsigned int *acache;
571 unsigned char *haveaccess;
572 };
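/*
 * The acache array above is kept sorted by cnid so lookup_bucket() can
 * binary-search it; haveaccess[] holds the cached result for the cnid at
 * the same index (0 means access was granted, otherwise an errno value).
 */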
573
574 struct access_t {
575 uid_t uid; /* IN: effective user id */
576 short flags; /* IN: access requested (i.e. R_OK) */
577 short num_groups; /* IN: number of groups user belongs to */
578 int num_files; /* IN: number of files to process */
579 int *file_ids; /* IN: array of file ids */
580 gid_t *groups; /* IN: array of groups */
581 short *access; /* OUT: access info for each file (0 for 'has access') */
582 };
583
584 struct user_access_t {
585 uid_t uid; /* IN: effective user id */
586 short flags; /* IN: access requested (i.e. R_OK) */
587 short num_groups; /* IN: number of groups user belongs to */
588 int num_files; /* IN: number of files to process */
589 user_addr_t file_ids; /* IN: array of file ids */
590 user_addr_t groups; /* IN: array of groups */
591 user_addr_t access; /* OUT: access info for each file (0 for 'has access') */
592 };
593
594
595 // these are the "extended" versions of the above structures
596 // note that it is crucial that they be a different size than
597 // the regular versions
598 struct ext_access_t {
599 uint32_t flags; /* IN: access requested (i.e. R_OK) */
600 uint32_t num_files; /* IN: number of files to process */
601 uint32_t map_size; /* IN: size of the bit map */
602 uint32_t *file_ids; /* IN: Array of file ids */
603 char *bitmap; /* OUT: hash-bitmap of interesting directory ids */
604 short *access; /* OUT: access info for each file (0 for 'has access') */
605 uint32_t num_parents; /* future use */
606 cnid_t *parents; /* future use */
607 };
608
609 struct ext_user_access_t {
610 uint32_t flags; /* IN: access requested (i.e. R_OK) */
611 uint32_t num_files; /* IN: number of files to process */
612 uint32_t map_size; /* IN: size of the bit map */
613 user_addr_t file_ids; /* IN: array of file ids */
614 user_addr_t bitmap; /* IN: array of groups */
615 user_addr_t access; /* OUT: access info for each file (0 for 'has access') */
616 uint32_t num_parents;/* future use */
617 user_addr_t parents;/* future use */
618 };
619
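/*
 * Rough usage sketch (not part of this file; details hedged): a user-space
 * process fills in one of the structures above -- file_ids/num_files plus,
 * for the extended form, an optional bitmap and parents array -- and issues
 * the corresponding fsctl(2) against the volume. The request arrives in
 * hfs_vnop_ioctl() below as HFS_BULKACCESS_FSCTL or HFS_EXT_BULKACCESS_FSCTL,
 * and per-file results come back through the access array (0 == has access).
 */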
620
621 /*
622 * Perform a binary search for the given parent_id. The return value is
623 * the index if there is a match. If no_match_indexp is non-NULL it
624 * will be assigned the index at which to insert the item (even if the
625 * item was not found).
626 */
627 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
628 {
629 int index=-1;
630 unsigned int lo=0;
631
632 do {
633 unsigned int mid = ((hi - lo)/2) + lo;
634 unsigned int this_id = array[mid];
635
636 if (parent_id == this_id) {
637 hi = mid;
638 break;
639 }
640
641 if (parent_id < this_id) {
642 hi = mid;
643 continue;
644 }
645
646 if (parent_id > this_id) {
647 lo = mid + 1;
648 continue;
649 }
650 } while(lo < hi);
651
652 /* check if lo and hi converged on the match */
653 if (parent_id == array[hi]) {
654 index = hi;
655 }
656
657 if (no_match_indexp) {
658 *no_match_indexp = hi;
659 }
660
661 return index;
662 }
663
664
665 static int
666 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
667 {
668 unsigned int hi;
669 int matches = 0;
670 int index, no_match_index;
671
672 if (cache->numcached == 0) {
673 *indexp = 0;
674 return 0; // table is empty, so insert at index=0 and report no match
675 }
676
677 if (cache->numcached > NUM_CACHE_ENTRIES) {
678 /*printf("EGAD! numcached is %d... cut our losses and trim to %d\n",
679 cache->numcached, NUM_CACHE_ENTRIES);*/
680 cache->numcached = NUM_CACHE_ENTRIES;
681 }
682
683 hi = cache->numcached - 1;
684
685 index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
686
687 /* if no existing entry found, find index for new one */
688 if (index == -1) {
689 index = no_match_index;
690 matches = 0;
691 } else {
692 matches = 1;
693 }
694
695 *indexp = index;
696 return matches;
697 }
698
699 /*
700 * Add a node to the access_cache at the given index (or do a lookup first
701 * to find the index if -1 is passed in). We currently do a replace rather
702 * than an insert if the cache is full.
703 */
704 static void
705 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
706 {
707 int lookup_index = -1;
708
709 /* need to do a lookup first if -1 passed for index */
710 if (index == -1) {
711 if (lookup_bucket(cache, &lookup_index, nodeID)) {
712 if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
713 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
714 cache->haveaccess[lookup_index] = access;
715 }
716
717 /* mission accomplished */
718 return;
719 } else {
720 index = lookup_index;
721 }
722
723 }
724
725 /* if the cache is full, do a replace rather than an insert */
726 if (cache->numcached >= NUM_CACHE_ENTRIES) {
727 //printf("cache is full (%d). replace at index %d\n", cache->numcached, index);
728 cache->numcached = NUM_CACHE_ENTRIES-1;
729
730 if (index > cache->numcached) {
731 // printf("index %d pinned to %d\n", index, cache->numcached);
732 index = cache->numcached;
733 }
734 }
735
736 if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
737 index++;
738 }
739
740 if (index >= 0 && index < cache->numcached) {
741 /* only do bcopy if we're inserting */
742 bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
743 bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
744 }
745
746 cache->acache[index] = nodeID;
747 cache->haveaccess[index] = access;
748 cache->numcached++;
749 }
750
751
752 struct cinfo {
753 uid_t uid;
754 gid_t gid;
755 mode_t mode;
756 cnid_t parentcnid;
757 u_int16_t recflags;
758 };
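/*
 * snoop_callback() below copies just these fields out of an in-core cnode,
 * letting do_attr_lookup() skip the catalog when the item is already in memory.
 */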
759
760 static int
761 snoop_callback(const struct cat_desc *descp, const struct cat_attr *attrp, void * arg)
762 {
763 struct cinfo *cip = (struct cinfo *)arg;
764
765 cip->uid = attrp->ca_uid;
766 cip->gid = attrp->ca_gid;
767 cip->mode = attrp->ca_mode;
768 cip->parentcnid = descp->cd_parentcnid;
769 cip->recflags = attrp->ca_recflags;
770
771 return (0);
772 }
773
774 /*
775 * Look up the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
776 * isn't in-core, then go to the catalog.
777 */
778 static int
779 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, dev_t dev, cnid_t cnid,
780 struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
781 {
782 int error = 0;
783
784 /* if this id matches the one the fsctl was called with, skip the lookup */
785 if (cnid == skip_cp->c_cnid) {
786 cnattrp->ca_uid = skip_cp->c_uid;
787 cnattrp->ca_gid = skip_cp->c_gid;
788 cnattrp->ca_mode = skip_cp->c_mode;
789 keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
790 } else {
791 struct cinfo c_info;
792
793 /* otherwise, check the cnode hash in case the file/dir is in-core */
794 if (hfs_chash_snoop(dev, cnid, snoop_callback, &c_info) == 0) {
795 cnattrp->ca_uid = c_info.uid;
796 cnattrp->ca_gid = c_info.gid;
797 cnattrp->ca_mode = c_info.mode;
798 cnattrp->ca_recflags = c_info.recflags;
799 keyp->hfsPlus.parentID = c_info.parentcnid;
800 } else {
801 int lockflags;
802
803 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
804
805 /* lookup this cnid in the catalog */
806 error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
807
808 hfs_systemfile_unlock(hfsmp, lockflags);
809
810 cache->lookups++;
811 }
812 }
813
814 return (error);
815 }
816
817
818 /*
819 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
820 * up to CACHE_LEVELS as we progress towards the root.
821 */
822 static int
823 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
824 struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred, dev_t dev,
825 struct vfs_context *my_context,
826 char *bitmap,
827 uint32_t map_size,
828 cnid_t* parents,
829 uint32_t num_parents)
830 {
831 int myErr = 0;
832 int myResult;
833 HFSCatalogNodeID thisNodeID;
834 unsigned int myPerms;
835 struct cat_attr cnattr;
836 int cache_index = -1, scope_index = -1, scope_idx_start = -1;
837 CatalogKey catkey;
838
839 int i = 0, ids_to_cache = 0;
840 int parent_ids[CACHE_LEVELS];
841
842 thisNodeID = nodeID;
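/* Walk from nodeID up toward the root, checking (and caching) access at each level. */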
843 while (thisNodeID >= kRootDirID) {
844 myResult = 0; /* default to "no access" */
845
846 /* check the cache before resorting to hitting the catalog */
847
848 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
849 * to look any further after hitting cached dir */
850
851 if (lookup_bucket(cache, &cache_index, thisNodeID)) {
852 cache->cachehits++;
853 myErr = cache->haveaccess[cache_index];
854 if (scope_index != -1) {
855 if (myErr == ESRCH) {
856 myErr = 0;
857 }
858 } else {
859 scope_index = 0; // so we'll just use the cache result
860 scope_idx_start = ids_to_cache;
861 }
862 myResult = (myErr == 0) ? 1 : 0;
863 goto ExitThisRoutine;
864 }
865
866
867 if (parents) {
868 int tmp;
869 tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
870 if (scope_index == -1)
871 scope_index = tmp;
872 if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
873 scope_idx_start = ids_to_cache;
874 }
875 }
876
877 /* remember which parents we want to cache */
878 if (ids_to_cache < CACHE_LEVELS) {
879 parent_ids[ids_to_cache] = thisNodeID;
880 ids_to_cache++;
881 }
882 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
883 if (bitmap && map_size) {
884 bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
885 }
886
887
888 /* do the lookup (checks the cnode hash, then the catalog) */
889 myErr = do_attr_lookup(hfsmp, cache, dev, thisNodeID, skip_cp, &catkey, &cnattr);
890 if (myErr) {
891 goto ExitThisRoutine; /* no access */
892 }
893
894 /* Root always gets access. */
895 if (suser(myp_ucred, NULL) == 0) {
896 thisNodeID = catkey.hfsPlus.parentID;
897 myResult = 1;
898 continue;
899 }
900
901 // if the thing has acl's, do the full permission check
902 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
903 struct vnode *vp;
904
905 /* get the vnode for this cnid */
906 myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0);
907 if ( myErr ) {
908 myResult = 0;
909 goto ExitThisRoutine;
910 }
911
912 thisNodeID = VTOC(vp)->c_parentcnid;
913
914 hfs_unlock(VTOC(vp));
915
916 if (vnode_vtype(vp) == VDIR) {
917 myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
918 } else {
919 myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
920 }
921
922 vnode_put(vp);
923 if (myErr) {
924 myResult = 0;
925 goto ExitThisRoutine;
926 }
927 } else {
928 unsigned int flags;
929
930 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
931 cnattr.ca_mode, hfsmp->hfs_mp,
932 myp_ucred, theProcPtr);
933
934 if (cnattr.ca_mode & S_IFDIR) {
935 flags = R_OK | X_OK;
936 } else {
937 flags = R_OK;
938 }
939 if ( (myPerms & flags) != flags) {
940 myResult = 0;
941 myErr = EACCES;
942 goto ExitThisRoutine; /* no access */
943 }
944
945 /* up the hierarchy we go */
946 thisNodeID = catkey.hfsPlus.parentID;
947 }
948 }
949
950 /* if here, we have access to this node */
951 myResult = 1;
952
953 ExitThisRoutine:
954 if (parents && myErr == 0 && scope_index == -1) {
955 myErr = ESRCH;
956 }
957
958 if (myErr) {
959 myResult = 0;
960 }
961 *err = myErr;
962
963 /* cache the parent directory(ies) */
964 for (i = 0; i < ids_to_cache; i++) {
965 if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
966 add_node(cache, -1, parent_ids[i], ESRCH);
967 } else {
968 add_node(cache, -1, parent_ids[i], myErr);
969 }
970 }
971
972 return (myResult);
973 }
974
975 static int
976 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
977 struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
978 {
979 boolean_t is64bit;
980
981 /*
982 * NOTE: on entry, the vnode is locked. In case this vnode
983 * happens to be in our list of file_ids, we note it and
984 * avoid calling hfs_chashget_nowait() on that id, as that
985 * will cause a "locking against myself" panic.
986 */
987 Boolean check_leaf = true;
988
989 struct ext_user_access_t *user_access_structp;
990 struct ext_user_access_t tmp_user_access;
991 struct access_cache cache;
992
993 int error = 0;
994 unsigned int i;
995
996 dev_t dev = VTOC(vp)->c_dev;
997
998 short flags;
999 unsigned int num_files = 0;
1000 int map_size = 0;
1001 int num_parents = 0;
1002 int *file_ids=NULL;
1003 short *access=NULL;
1004 char *bitmap=NULL;
1005 cnid_t *parents=NULL;
1006 int leaf_index;
1007
1008 cnid_t cnid;
1009 cnid_t prevParent_cnid = 0;
1010 unsigned int myPerms;
1011 short myaccess = 0;
1012 struct cat_attr cnattr;
1013 CatalogKey catkey;
1014 struct cnode *skip_cp = VTOC(vp);
1015 kauth_cred_t cred = vfs_context_ucred(context);
1016 proc_t p = vfs_context_proc(context);
1017
1018 is64bit = proc_is64bit(p);
1019
1020 /* initialize the local cache and buffers */
1021 cache.numcached = 0;
1022 cache.cachehits = 0;
1023 cache.lookups = 0;
1024 cache.acache = NULL;
1025 cache.haveaccess = NULL;
1026
1027 /* struct copyin done during dispatch... need to copy file_id array separately */
1028 if (ap->a_data == NULL) {
1029 error = EINVAL;
1030 goto err_exit_bulk_access;
1031 }
1032
1033 if (is64bit) {
1034 if (arg_size != sizeof(struct ext_user_access_t)) {
1035 error = EINVAL;
1036 goto err_exit_bulk_access;
1037 }
1038
1039 user_access_structp = (struct ext_user_access_t *)ap->a_data;
1040
1041 } else if (arg_size == sizeof(struct access_t)) {
1042 struct access_t *accessp = (struct access_t *)ap->a_data;
1043
1044 // convert an old style bulk-access struct to the new style
1045 tmp_user_access.flags = accessp->flags;
1046 tmp_user_access.num_files = accessp->num_files;
1047 tmp_user_access.map_size = 0;
1048 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1049 tmp_user_access.bitmap = (user_addr_t)NULL;
1050 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1051 tmp_user_access.num_parents = 0;
1052 user_access_structp = &tmp_user_access;
1053
1054 } else if (arg_size == sizeof(struct ext_access_t)) {
1055 struct ext_access_t *accessp = (struct ext_access_t *)ap->a_data;
1056
1057 // up-cast from a 32-bit version of the struct
1058 tmp_user_access.flags = accessp->flags;
1059 tmp_user_access.num_files = accessp->num_files;
1060 tmp_user_access.map_size = accessp->map_size;
1061 tmp_user_access.num_parents = accessp->num_parents;
1062
1063 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1064 tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
1065 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1066 tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);
1067
1068 user_access_structp = &tmp_user_access;
1069 } else {
1070 error = EINVAL;
1071 goto err_exit_bulk_access;
1072 }
1073
1074 map_size = user_access_structp->map_size;
1075
1076 num_files = user_access_structp->num_files;
1077
1078 num_parents= user_access_structp->num_parents;
1079
1080 if (num_files < 1) {
1081 goto err_exit_bulk_access;
1082 }
1083 if (num_files > 1024) {
1084 error = EINVAL;
1085 goto err_exit_bulk_access;
1086 }
1087
1088 if (num_parents > 1024) {
1089 error = EINVAL;
1090 goto err_exit_bulk_access;
1091 }
1092
1093 file_ids = (int *) kalloc(sizeof(int) * num_files);
1094 access = (short *) kalloc(sizeof(short) * num_files);
1095 if (map_size) {
1096 bitmap = (char *) kalloc(sizeof(char) * map_size);
1097 }
1098
1099 if (num_parents) {
1100 parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
1101 }
1102
1103 cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
1104 cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1105
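/*
 * kalloc() can fail; if any of the buffers above came back NULL, free
 * whatever did get allocated and bail out with ENOMEM.
 */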
1106 if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
1107 if (file_ids) {
1108 kfree(file_ids, sizeof(int) * num_files);
1109 }
1110 if (bitmap) {
1111 kfree(bitmap, sizeof(char) * map_size);
1112 }
1113 if (access) {
1114 kfree(access, sizeof(short) * num_files);
1115 }
1116 if (cache.acache) {
1117 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1118 }
1119 if (cache.haveaccess) {
1120 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1121 }
1122 if (parents) {
1123 kfree(parents, sizeof(cnid_t) * num_parents);
1124 }
1125 return ENOMEM;
1126 }
1127
1128 // make sure the bitmap is zeroed out...
1129 if (bitmap) {
1130 bzero(bitmap, (sizeof(char) * map_size));
1131 }
1132
1133 if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1134 num_files * sizeof(int)))) {
1135 goto err_exit_bulk_access;
1136 }
1137
1138 if (num_parents) {
1139 if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1140 num_parents * sizeof(cnid_t)))) {
1141 goto err_exit_bulk_access;
1142 }
1143 }
1144
1145 flags = user_access_structp->flags;
1146 if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1147 flags = R_OK;
1148 }
1149
1150 /* check if we've been passed leaf node ids or parent ids */
1151 if (flags & PARENT_IDS_FLAG) {
1152 check_leaf = false;
1153 }
1154
1155 /* Check access to each file_id passed in */
1156 for (i = 0; i < num_files; i++) {
1157 leaf_index=-1;
1158 cnid = (cnid_t) file_ids[i];
1159
1160 /* root always has access */
1161 if ((!parents) && (!suser(cred, NULL))) {
1162 access[i] = 0;
1163 continue;
1164 }
1165
1166 if (check_leaf) {
1167 /* do the lookup (checks the cnode hash, then the catalog) */
1168 error = do_attr_lookup(hfsmp, &cache, dev, cnid, skip_cp, &catkey, &cnattr);
1169 if (error) {
1170 access[i] = (short) error;
1171 continue;
1172 }
1173
1174 if (parents) {
1175 // Check if the leaf matches one of the parent scopes
1176 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1177 }
1178
1179 // if the thing has acl's, do the full permission check
1180 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1181 struct vnode *cvp;
1182 int myErr = 0;
1183 /* get the vnode for this cnid */
1184 myErr = hfs_vget(hfsmp, cnid, &cvp, 0);
1185 if ( myErr ) {
1186 access[i] = myErr;
1187 continue;
1188 }
1189
1190 hfs_unlock(VTOC(cvp));
1191
1192 if (vnode_vtype(cvp) == VDIR) {
1193 myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1194 } else {
1195 myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1196 }
1197
1198 vnode_put(cvp);
1199 if (myErr) {
1200 access[i] = myErr;
1201 continue;
1202 }
1203 } else {
1204 /* before calling CheckAccess(), check the target file for read access */
1205 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1206 cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1207
1208 /* fail fast if no access */
1209 if ((myPerms & flags) == 0) {
1210 access[i] = EACCES;
1211 continue;
1212 }
1213 }
1214 } else {
1215 /* we were passed an array of parent ids */
1216 catkey.hfsPlus.parentID = cnid;
1217 }
1218
1219 /* if the last guy had the same parent and had access, we're done */
1220 if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0) {
1221 cache.cachehits++;
1222 access[i] = 0;
1223 continue;
1224 }
1225
1226 myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1227 skip_cp, p, cred, dev, context,bitmap, map_size, parents, num_parents);
1228
1229 if (myaccess || (error == ESRCH && leaf_index != -1)) {
1230 access[i] = 0; // have access.. no errors to report
1231 } else {
1232 access[i] = (error != 0 ? (short) error : EACCES);
1233 }
1234
1235 prevParent_cnid = catkey.hfsPlus.parentID;
1236 }
1237
1238 /* copyout the access array */
1239 if ((error = copyout((caddr_t)access, user_access_structp->access,
1240 num_files * sizeof (short)))) {
1241 goto err_exit_bulk_access;
1242 }
1243 if (map_size && bitmap) {
1244 if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1245 map_size * sizeof (char)))) {
1246 goto err_exit_bulk_access;
1247 }
1248 }
1249
1250
1251 err_exit_bulk_access:
1252
1253 //printf("on exit (err %d), numfiles/numcached/cachehits/lookups is %d/%d/%d/%d\n", error, num_files, cache.numcached, cache.cachehits, cache.lookups);
1254
1255 if (file_ids)
1256 kfree(file_ids, sizeof(int) * num_files);
1257 if (parents)
1258 kfree(parents, sizeof(cnid_t) * num_parents);
1259 if (bitmap)
1260 kfree(bitmap, sizeof(char) * map_size);
1261 if (access)
1262 kfree(access, sizeof(short) * num_files);
1263 if (cache.acache)
1264 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1265 if (cache.haveaccess)
1266 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1267
1268 return (error);
1269 }
1270
1271
1272 /* end "bulk-access" support */
1273
1274
1275 /*
1276 * Callback for use with freeze ioctl.
1277 */
1278 static int
1279 hfs_freezewrite_callback(struct vnode *vp, __unused void *cargs)
1280 {
1281 vnode_waitforwrites(vp, 0, 0, 0, "hfs freeze");
1282
1283 return 0;
1284 }
1285
1286 /*
1287 * Control filesystem operating characteristics.
1288 */
1289 int
1290 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1291 vnode_t a_vp;
1292 int a_command;
1293 caddr_t a_data;
1294 int a_fflag;
1295 vfs_context_t a_context;
1296 } */ *ap)
1297 {
1298 struct vnode * vp = ap->a_vp;
1299 struct hfsmount *hfsmp = VTOHFS(vp);
1300 vfs_context_t context = ap->a_context;
1301 kauth_cred_t cred = vfs_context_ucred(context);
1302 proc_t p = vfs_context_proc(context);
1303 struct vfsstatfs *vfsp;
1304 boolean_t is64bit;
1305
1306 is64bit = proc_is64bit(p);
1307
1308 switch (ap->a_command) {
1309
1310 case HFS_GETPATH:
1311 {
1312 struct vnode *file_vp;
1313 cnid_t cnid;
1314 int outlen;
1315 char *bufptr;
1316 int error;
1317
1318 /* Caller must be owner of file system. */
1319 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1320 if (suser(cred, NULL) &&
1321 kauth_cred_getuid(cred) != vfsp->f_owner) {
1322 return (EACCES);
1323 }
1324 /* Target vnode must be file system's root. */
1325 if (!vnode_isvroot(vp)) {
1326 return (EINVAL);
1327 }
1328 bufptr = (char *)ap->a_data;
1329 cnid = strtoul(bufptr, NULL, 10);
1330
1331 if ((error = hfs_vget(hfsmp, cnid, &file_vp, 1))) {
1332 return (error);
1333 }
1334 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, 0, context);
1335 vnode_put(file_vp);
1336
1337 return (error);
1338 }
1339
1340 case HFS_PREV_LINK:
1341 case HFS_NEXT_LINK:
1342 {
1343 cnid_t linkfileid;
1344 cnid_t nextlinkid;
1345 cnid_t prevlinkid;
1346 int error;
1347
1348 /* Caller must be owner of file system. */
1349 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1350 if (suser(cred, NULL) &&
1351 kauth_cred_getuid(cred) != vfsp->f_owner) {
1352 return (EACCES);
1353 }
1354 /* Target vnode must be file system's root. */
1355 if (!vnode_isvroot(vp)) {
1356 return (EINVAL);
1357 }
1358 linkfileid = *(cnid_t *)ap->a_data;
1359 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1360 return (EINVAL);
1361 }
1362 if ((error = hfs_lookuplink(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1363 return (error);
1364 }
1365 if (ap->a_command == HFS_NEXT_LINK) {
1366 *(cnid_t *)ap->a_data = nextlinkid;
1367 } else {
1368 *(cnid_t *)ap->a_data = prevlinkid;
1369 }
1370 return (0);
1371 }
1372
1373 case HFS_RESIZE_PROGRESS: {
1374
1375 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1376 if (suser(cred, NULL) &&
1377 kauth_cred_getuid(cred) != vfsp->f_owner) {
1378 return (EACCES); /* must be owner of file system */
1379 }
1380 if (!vnode_isvroot(vp)) {
1381 return (EINVAL);
1382 }
1383 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1384 }
1385
1386 case HFS_RESIZE_VOLUME: {
1387 u_int64_t newsize;
1388 u_int64_t cursize;
1389
1390 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1391 if (suser(cred, NULL) &&
1392 kauth_cred_getuid(cred) != vfsp->f_owner) {
1393 return (EACCES); /* must be owner of file system */
1394 }
1395 if (!vnode_isvroot(vp)) {
1396 return (EINVAL);
1397 }
1398 newsize = *(u_int64_t *)ap->a_data;
1399 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1400
1401 if (newsize > cursize) {
1402 return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1403 } else if (newsize < cursize) {
1404 return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1405 } else {
1406 return (0);
1407 }
1408 }
1409 case HFS_CHANGE_NEXT_ALLOCATION: {
1410 int error = 0; /* Assume success */
1411 u_int32_t location;
1412
1413 if (vnode_vfsisrdonly(vp)) {
1414 return (EROFS);
1415 }
1416 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1417 if (suser(cred, NULL) &&
1418 kauth_cred_getuid(cred) != vfsp->f_owner) {
1419 return (EACCES); /* must be owner of file system */
1420 }
1421 if (!vnode_isvroot(vp)) {
1422 return (EINVAL);
1423 }
1424 HFS_MOUNT_LOCK(hfsmp, TRUE);
1425 location = *(u_int32_t *)ap->a_data;
1426 if ((location >= hfsmp->allocLimit) &&
1427 (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1428 error = EINVAL;
1429 goto fail_change_next_allocation;
1430 }
1431 /* Return previous value. */
1432 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1433 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1434 /* On magic value for location, set nextAllocation to next block
1435 * after metadata zone and set flag in mount structure to indicate
1436 * that nextAllocation should not be updated again.
1437 */
1438 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1439 hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1440 } else {
1441 hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1442 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1443 }
1444 MarkVCBDirty(hfsmp);
1445 fail_change_next_allocation:
1446 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
1447 return (error);
1448 }
1449
1450 #ifdef HFS_SPARSE_DEV
1451 case HFS_SETBACKINGSTOREINFO: {
1452 struct vnode * bsfs_rootvp;
1453 struct vnode * di_vp;
1454 struct hfs_backingstoreinfo *bsdata;
1455 int error = 0;
1456
1457 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1458 return (EALREADY);
1459 }
1460 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1461 if (suser(cred, NULL) &&
1462 kauth_cred_getuid(cred) != vfsp->f_owner) {
1463 return (EACCES); /* must be owner of file system */
1464 }
1465 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1466 if (bsdata == NULL) {
1467 return (EINVAL);
1468 }
1469 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1470 return (error);
1471 }
1472 if ((error = vnode_getwithref(di_vp))) {
1473 file_drop(bsdata->backingfd);
1474 return(error);
1475 }
1476
1477 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1478 (void)vnode_put(di_vp);
1479 file_drop(bsdata->backingfd);
1480 return (EINVAL);
1481 }
1482
1483 /*
1484 * Obtain the backing fs root vnode and keep a reference
1485 * on it. This reference will be dropped in hfs_unmount.
1486 */
1487 error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
1488 if (error) {
1489 (void)vnode_put(di_vp);
1490 file_drop(bsdata->backingfd);
1491 return (error);
1492 }
1493 vnode_ref(bsfs_rootvp);
1494 vnode_put(bsfs_rootvp);
1495
1496 hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
1497 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1498 hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize;
1499 hfsmp->hfs_sparsebandblks *= 4;
1500
1501 vfs_markdependency(hfsmp->hfs_mp);
1502
1503 (void)vnode_put(di_vp);
1504 file_drop(bsdata->backingfd);
1505 return (0);
1506 }
1507 case HFS_CLRBACKINGSTOREINFO: {
1508 struct vnode * tmpvp;
1509
1510 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1511 if (suser(cred, NULL) &&
1512 kauth_cred_getuid(cred) != vfsp->f_owner) {
1513 return (EACCES); /* must be owner of file system */
1514 }
1515 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
1516 hfsmp->hfs_backingfs_rootvp) {
1517
1518 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
1519 tmpvp = hfsmp->hfs_backingfs_rootvp;
1520 hfsmp->hfs_backingfs_rootvp = NULLVP;
1521 hfsmp->hfs_sparsebandblks = 0;
1522 vnode_rele(tmpvp);
1523 }
1524 return (0);
1525 }
1526 #endif /* HFS_SPARSE_DEV */
1527
1528 case F_FREEZE_FS: {
1529 struct mount *mp;
1530
1531 if (!is_suser())
1532 return (EACCES);
1533
1534 mp = vnode_mount(vp);
1535 hfsmp = VFSTOHFS(mp);
1536
1537 if (!(hfsmp->jnl))
1538 return (ENOTSUP);
1539
1540 lck_rw_lock_exclusive(&hfsmp->hfs_insync);
1541
1542 // flush things before we get started to try and prevent
1543 // dirty data from being paged out while we're frozen.
1544 // note: can't do this after taking the lock as it will
1545 // deadlock against ourselves.
1546 vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
1547 hfs_global_exclusive_lock_acquire(hfsmp);
1548 journal_flush(hfsmp->jnl);
1549
1550 // don't need to iterate on all vnodes, we just need to
1551 // wait for writes to the system files and the device vnode
1552 if (HFSTOVCB(hfsmp)->extentsRefNum)
1553 vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
1554 if (HFSTOVCB(hfsmp)->catalogRefNum)
1555 vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
1556 if (HFSTOVCB(hfsmp)->allocationsRefNum)
1557 vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
1558 if (hfsmp->hfs_attribute_vp)
1559 vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
1560 vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");
1561
1562 hfsmp->hfs_freezing_proc = current_proc();
1563
1564 return (0);
1565 }
1566
1567 case F_THAW_FS: {
1568 if (!is_suser())
1569 return (EACCES);
1570
1571 // if we're not the one who froze the fs then we
1572 // can't thaw it.
1573 if (hfsmp->hfs_freezing_proc != current_proc()) {
1574 return EPERM;
1575 }
1576
1577 // NOTE: if you add code here, also go check the
1578 // code that "thaws" the fs in hfs_vnop_close()
1579 //
1580 hfsmp->hfs_freezing_proc = NULL;
1581 hfs_global_exclusive_lock_release(hfsmp);
1582 lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
1583
1584 return (0);
1585 }
1586
1587 case HFS_BULKACCESS_FSCTL: {
1588 int size;
1589
1590 if (hfsmp->hfs_flags & HFS_STANDARD) {
1591 return EINVAL;
1592 }
1593
1594 if (is64bit) {
1595 size = sizeof(struct user_access_t);
1596 } else {
1597 size = sizeof(struct access_t);
1598 }
1599
1600 return do_bulk_access_check(hfsmp, vp, ap, size, context);
1601 }
1602
1603 case HFS_EXT_BULKACCESS_FSCTL: {
1604 int size;
1605
1606 if (hfsmp->hfs_flags & HFS_STANDARD) {
1607 return EINVAL;
1608 }
1609
1610 if (is64bit) {
1611 size = sizeof(struct ext_user_access_t);
1612 } else {
1613 size = sizeof(struct ext_access_t);
1614 }
1615
1616 return do_bulk_access_check(hfsmp, vp, ap, size, context);
1617 }
1618
1619 case HFS_SETACLSTATE: {
1620 int state;
1621
1622 if (ap->a_data == NULL) {
1623 return (EINVAL);
1624 }
1625
1626 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1627 state = *(int *)ap->a_data;
1628
1629 // super-user can enable or disable acl's on a volume.
1630 // the volume owner can only enable acl's
1631 if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) {
1632 return (EPERM);
1633 }
1634 if (state == 0 || state == 1)
1635 return hfs_set_volxattr(hfsmp, HFS_SETACLSTATE, state);
1636 else
1637 return (EINVAL);
1638 }
1639
1640 case HFS_SET_XATTREXTENTS_STATE: {
1641 int state;
1642
1643 if (ap->a_data == NULL) {
1644 return (EINVAL);
1645 }
1646
1647 state = *(int *)ap->a_data;
1648
1649 /* Super-user can enable or disable extent-based extended
1650 * attribute support on a volume
1651 */
1652 if (!is_suser()) {
1653 return (EPERM);
1654 }
1655 if (state == 0 || state == 1)
1656 return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
1657 else
1658 return (EINVAL);
1659 }
1660
1661 case F_FULLFSYNC: {
1662 int error;
1663
1664 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
1665 if (error == 0) {
1666 error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
1667 hfs_unlock(VTOC(vp));
1668 }
1669
1670 return error;
1671 }
1672
1673 case F_CHKCLEAN: {
1674 register struct cnode *cp;
1675 int error;
1676
1677 if (!vnode_isreg(vp))
1678 return EINVAL;
1679
1680 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK);
1681 if (error == 0) {
1682 cp = VTOC(vp);
1683 /*
1684 * used by regression tests to determine if
1685 * all the dirty pages (via write) have been cleaned
1686 * after a call to 'fsync'.
1687 */
1688 error = is_file_clean(vp, VTOF(vp)->ff_size);
1689 hfs_unlock(cp);
1690 }
1691 return (error);
1692 }
1693
1694 case F_RDADVISE: {
1695 register struct radvisory *ra;
1696 struct filefork *fp;
1697 int error;
1698
1699 if (!vnode_isreg(vp))
1700 return EINVAL;
1701
1702 ra = (struct radvisory *)(ap->a_data);
1703 fp = VTOF(vp);
1704
1705 /* Protect against a size change. */
1706 hfs_lock_truncate(VTOC(vp), TRUE);
1707
1708 if (ra->ra_offset >= fp->ff_size) {
1709 error = EFBIG;
1710 } else {
1711 error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
1712 }
1713
1714 hfs_unlock_truncate(VTOC(vp), TRUE);
1715 return (error);
1716 }
1717
1718 case F_READBOOTSTRAP:
1719 case F_WRITEBOOTSTRAP:
1720 {
1721 struct vnode *devvp = NULL;
1722 user_fbootstraptransfer_t *user_bootstrapp;
1723 int devBlockSize;
1724 int error;
1725 uio_t auio;
1726 daddr64_t blockNumber;
1727 u_long blockOffset;
1728 u_long xfersize;
1729 struct buf *bp;
1730 user_fbootstraptransfer_t user_bootstrap;
1731
1732 if (!vnode_isvroot(vp))
1733 return (EINVAL);
1734 /* LP64 - when caller is a 64 bit process then we are passed a pointer
1735 * to a user_fbootstraptransfer_t else we get a pointer to a
1736 * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t
1737 */
1738 if (is64bit) {
1739 user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data;
1740 }
1741 else {
1742 fbootstraptransfer_t *bootstrapp = (fbootstraptransfer_t *)ap->a_data;
1743 user_bootstrapp = &user_bootstrap;
1744 user_bootstrap.fbt_offset = bootstrapp->fbt_offset;
1745 user_bootstrap.fbt_length = bootstrapp->fbt_length;
1746 user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer);
1747 }
1748 if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024)
1749 return EINVAL;
1750
1751 devvp = VTOHFS(vp)->hfs_devvp;
1752 auio = uio_create(1, user_bootstrapp->fbt_offset,
1753 is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32,
1754 (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ);
1755 uio_addiov(auio, user_bootstrapp->fbt_buffer, user_bootstrapp->fbt_length);
1756
1757 devBlockSize = vfs_devblocksize(vnode_mount(vp));
1758
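/*
 * Copy the bootstrap area one device block at a time through the buffer
 * cache: each block is read with buf_bread(), the relevant bytes are moved
 * to or from user space with uiomove(), and for writes the block is pushed
 * back out via VNOP_BWRITE().
 */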
1759 while (uio_resid(auio) > 0) {
1760 blockNumber = uio_offset(auio) / devBlockSize;
1761 error = (int)buf_bread(devvp, blockNumber, devBlockSize, cred, &bp);
1762 if (error) {
1763 if (bp) buf_brelse(bp);
1764 uio_free(auio);
1765 return error;
1766 };
1767
1768 blockOffset = uio_offset(auio) % devBlockSize;
1769 xfersize = devBlockSize - blockOffset;
1770 error = uiomove((caddr_t)buf_dataptr(bp) + blockOffset, (int)xfersize, auio);
1771 if (error) {
1772 buf_brelse(bp);
1773 uio_free(auio);
1774 return error;
1775 };
1776 if (uio_rw(auio) == UIO_WRITE) {
1777 error = VNOP_BWRITE(bp);
1778 if (error) {
1779 uio_free(auio);
1780 return error;
1781 }
1782 } else {
1783 buf_brelse(bp);
1784 };
1785 };
1786 uio_free(auio);
1787 };
1788 return 0;
1789
1790 case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */
1791 {
1792 if (is64bit) {
1793 *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
1794 }
1795 else {
1796 *(time_t *)(ap->a_data) = to_bsd_time(VTOVCB(vp)->localCreateDate);
1797 }
1798 return 0;
1799 }
1800
1801 case HFS_GET_MOUNT_TIME:
1802 return copyout(&hfsmp->hfs_mount_time, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_mount_time));
1803 break;
1804
1805 case HFS_GET_LAST_MTIME:
1806 return copyout(&hfsmp->hfs_last_mounted_mtime, CAST_USER_ADDR_T(ap->a_data), sizeof(hfsmp->hfs_last_mounted_mtime));
1807 break;
1808
1809 case HFS_SET_BOOT_INFO:
1810 if (!vnode_isvroot(vp))
1811 return(EINVAL);
1812 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
1813 return(EACCES); /* must be superuser or owner of filesystem */
1814 HFS_MOUNT_LOCK(hfsmp, TRUE);
1815 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
1816 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
1817 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
1818 break;
1819
1820 case HFS_GET_BOOT_INFO:
1821 if (!vnode_isvroot(vp))
1822 return(EINVAL);
1823 HFS_MOUNT_LOCK(hfsmp, TRUE);
1824 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
1825 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
1826 break;
1827
1828 case HFS_MARK_BOOT_CORRUPT:
1829 /* Mark the boot volume corrupt by setting
1830 * kHFSVolumeInconsistentBit in the volume header. This will
1831 * force fsck_hfs on next mount.
1832 */
1833 if (!is_suser()) {
1834 return EACCES;
1835 }
1836
1837 /* Allowed only on the root vnode of the boot volume */
1838 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
1839 !vnode_isvroot(vp)) {
1840 return EINVAL;
1841 }
1842
1843 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
1844 hfs_mark_volume_inconsistent(hfsmp);
1845 break;
1846
1847 default:
1848 return (ENOTTY);
1849 }
1850
1851 /* Should never get here */
1852 return 0;
1853 }
1854
1855 /*
1856 * select
1857 */
1858 int
1859 hfs_vnop_select(__unused struct vnop_select_args *ap)
1860 /*
1861 struct vnop_select_args {
1862 vnode_t a_vp;
1863 int a_which;
1864 int a_fflags;
1865 void *a_wql;
1866 vfs_context_t a_context;
1867 };
1868 */
1869 {
1870 /*
1871 * We should really check to see if I/O is possible.
1872 */
1873 return (1);
1874 }
1875
1876 /*
1877 * Converts a logical block number to a physical block, and optionally returns
1878 * the number of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
1879 * The physical block number is based on the device block size, currently 512.
1880 * The block run is returned in logical blocks, and is the REMAINING number of blocks.
1881 */
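/*
 * Illustrative example (assumed values, not from this file): with a logical
 * block size of 4096 and 512-byte device blocks, logical block 8 is byte
 * offset 32768; MapFileBlockC() translates that offset into a device block
 * number for *bnp and reports the contiguous bytes available, from which
 * *runp (remaining logical blocks in the run, excluding the current one)
 * is derived.
 */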
1882 int
1883 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
1884 {
1885 struct filefork *fp = VTOF(vp);
1886 struct hfsmount *hfsmp = VTOHFS(vp);
1887 int retval = E_NONE;
1888 u_int32_t logBlockSize;
1889 size_t bytesContAvail = 0;
1890 off_t blockposition;
1891 int lockExtBtree;
1892 int lockflags = 0;
1893
1894 /*
1895 * Check for underlying vnode requests and ensure that logical
1896 * to physical mapping is requested.
1897 */
1898 if (vpp != NULL)
1899 *vpp = hfsmp->hfs_devvp;
1900 if (bnp == NULL)
1901 return (0);
1902
1903 logBlockSize = GetLogicalBlockSize(vp);
1904 blockposition = (off_t)bn * logBlockSize;
1905
1906 lockExtBtree = overflow_extents(fp);
1907
1908 if (lockExtBtree)
1909 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
1910
1911 retval = MacToVFSError(
1912 MapFileBlockC (HFSTOVCB(hfsmp),
1913 (FCB*)fp,
1914 MAXPHYSIO,
1915 blockposition,
1916 bnp,
1917 &bytesContAvail));
1918
1919 if (lockExtBtree)
1920 hfs_systemfile_unlock(hfsmp, lockflags);
1921
1922 if (retval == E_NONE) {
1923 /* Figure out how many read ahead blocks there are */
1924 if (runp != NULL) {
1925 if (can_cluster(logBlockSize)) {
1926 /* Make sure this result never goes negative: */
1927 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
1928 } else {
1929 *runp = 0;
1930 }
1931 }
1932 }
1933 return (retval);
1934 }
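/*
 * Illustrative sketch (not part of the original source): the run-count math
 * in hfs_bmap() above, isolated into a hypothetical helper.  With a 4 KB
 * logical block size and 32 KB of contiguous bytes reported by
 * MapFileBlockC, the remaining run is 32768/4096 - 1 == 7 logical blocks.
 */
#if 0	/* example only, not compiled */
static unsigned int
example_remaining_run(size_t bytesContAvail, u_int32_t logBlockSize)
{
	/* Mirrors the clamp above so the result never goes negative. */
	if (bytesContAvail < logBlockSize)
		return (0);
	return ((bytesContAvail / logBlockSize) - 1);
}
#endif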
1935
1936 /*
1937 * Convert logical block number to file offset.
1938 */
1939 int
1940 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
1941 /*
1942 struct vnop_blktooff_args {
1943 vnode_t a_vp;
1944 daddr64_t a_lblkno;
1945 off_t *a_offset;
1946 };
1947 */
1948 {
1949 if (ap->a_vp == NULL)
1950 return (EINVAL);
1951 *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
1952
1953 return(0);
1954 }
1955
1956 /*
1957 * Convert file offset to logical block number.
1958 */
1959 int
1960 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
1961 /*
1962 struct vnop_offtoblk_args {
1963 vnode_t a_vp;
1964 off_t a_offset;
1965 daddr64_t *a_lblkno;
1966 };
1967 */
1968 {
1969 if (ap->a_vp == NULL)
1970 return (EINVAL);
1971 *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
1972
1973 return(0);
1974 }
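/*
 * Illustrative sketch (not part of the original source): the relationship
 * between the two conversions above.  With a 4 KB logical block size,
 * block 3 maps to offset 12288, and any offset in [12288, 16383] maps back
 * to block 3 because the division truncates.
 */
#if 0	/* example only, not compiled */
static daddr64_t
example_blk_offset_roundtrip(daddr64_t lblkno, u_int32_t logBlockSize)
{
	off_t offset = (off_t)lblkno * (off_t)logBlockSize;	/* hfs_vnop_blktooff */
	return ((daddr64_t)(offset / (off_t)logBlockSize));	/* hfs_vnop_offtoblk: == lblkno */
}
#endif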
1975
1976 /*
1977 * Map file offset to physical block number.
1978 *
1979 * If this function is called for write operation, and if the file
1980 * had virtual blocks allocated (delayed allocation), real blocks
1981 * are allocated by calling ExtendFileC().
1982 *
1983 * If this function is called for read operation, and if the file
1984 * had virtual blocks allocated (delayed allocation), no change
1985 * to the size of file is done, and if required, rangelist is
1986 * searched for mapping.
1987 *
1988 * System file cnodes are expected to be locked (shared or exclusive).
1989 */
1990 int
1991 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
1992 /*
1993 struct vnop_blockmap_args {
1994 vnode_t a_vp;
1995 off_t a_foffset;
1996 size_t a_size;
1997 daddr64_t *a_bpn;
1998 size_t *a_run;
1999 void *a_poff;
2000 int a_flags;
2001 vfs_context_t a_context;
2002 };
2003 */
2004 {
2005 struct vnode *vp = ap->a_vp;
2006 struct cnode *cp;
2007 struct filefork *fp;
2008 struct hfsmount *hfsmp;
2009 size_t bytesContAvail = 0;
2010 int retval = E_NONE;
2011 int syslocks = 0;
2012 int lockflags = 0;
2013 struct rl_entry *invalid_range;
2014 enum rl_overlaptype overlaptype;
2015 int started_tr = 0;
2016 int tooklock = 0;
2017
2018 /* Do not allow blockmap operation on a directory */
2019 if (vnode_isdir(vp)) {
2020 return (ENOTSUP);
2021 }
2022
2023 /*
2024 * Check for underlying vnode requests and ensure that logical
2025 * to physical mapping is requested.
2026 */
2027 if (ap->a_bpn == NULL)
2028 return (0);
2029
2030 if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
2031 if (VTOC(vp)->c_lockowner != current_thread()) {
2032 hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
2033 tooklock = 1;
2034 }
2035 }
2036 hfsmp = VTOHFS(vp);
2037 cp = VTOC(vp);
2038 fp = VTOF(vp);
2039
2040 retry:
2041 /* Check virtual blocks only when performing a write operation */
2042 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2043 if (hfs_start_transaction(hfsmp) != 0) {
2044 retval = EINVAL;
2045 goto exit;
2046 } else {
2047 started_tr = 1;
2048 }
2049 syslocks = SFL_EXTENTS | SFL_BITMAP;
2050
2051 } else if (overflow_extents(fp)) {
2052 syslocks = SFL_EXTENTS;
2053 }
2054
2055 if (syslocks)
2056 lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
2057
2058 /*
2059 * Check for any delayed allocations.
2060 */
2061 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2062 int64_t actbytes;
2063 u_int32_t loanedBlocks;
2064
2065 //
2066 // Make sure we have a transaction. It's possible
2067 // that we came in and fp->ff_unallocblocks was zero
2068 // but during the time we blocked acquiring the extents
2069 // btree, ff_unallocblocks became non-zero and so we
2070 // will need to start a transaction.
2071 //
2072 if (started_tr == 0) {
2073 if (syslocks) {
2074 hfs_systemfile_unlock(hfsmp, lockflags);
2075 syslocks = 0;
2076 }
2077 goto retry;
2078 }
2079
2080 /*
2081 * Note: ExtendFileC will release any blocks on loan and
2082 * acquire real blocks. So we ask to extend by zero bytes
2083 * since ExtendFileC will account for the virtual blocks.
2084 */
2085
2086 loanedBlocks = fp->ff_unallocblocks;
2087 retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
2088 kEFAllMask | kEFNoClumpMask, &actbytes);
2089
2090 if (retval) {
2091 fp->ff_unallocblocks = loanedBlocks;
2092 cp->c_blocks += loanedBlocks;
2093 fp->ff_blocks += loanedBlocks;
2094
2095 HFS_MOUNT_LOCK(hfsmp, TRUE);
2096 hfsmp->loanedBlocks += loanedBlocks;
2097 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
2098
2099 hfs_systemfile_unlock(hfsmp, lockflags);
2100 cp->c_flag |= C_MODIFIED;
2101 if (started_tr) {
2102 (void) hfs_update(vp, TRUE);
2103 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2104
2105 hfs_end_transaction(hfsmp);
2106 started_tr = 0;
2107 }
2108 goto exit;
2109 }
2110 }
2111
2112 retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
2113 ap->a_bpn, &bytesContAvail);
2114 if (syslocks) {
2115 hfs_systemfile_unlock(hfsmp, lockflags);
2116 syslocks = 0;
2117 }
2118
2119 if (started_tr) {
2120 (void) hfs_update(vp, TRUE);
2121 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2122 hfs_end_transaction(hfsmp);
2123 started_tr = 0;
2124 }
2125 if (retval) {
2126 /* On write, always return error because virtual blocks, if any,
2127 * should have been allocated in ExtendFileC(). We do not
2128 * allocate virtual blocks on read, therefore return an error
2129 * only if no virtual blocks are allocated. Otherwise we search
2130 * the rangelist for zero-fill ranges.
2131 */
2132 if ((MacToVFSError(retval) != ERANGE) ||
2133 (ap->a_flags & VNODE_WRITE) ||
2134 ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
2135 goto exit;
2136 }
2137
2138 /* Validate if the start offset is within logical file size */
2139 if (ap->a_foffset > fp->ff_size) {
2140 goto exit;
2141 }
2142
2143 /* Searching the file extents has failed for the read operation, therefore
2144 * search the rangelist for any uncommitted holes in the file.
2145 */
2146 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
2147 ap->a_foffset + (off_t)(ap->a_size - 1),
2148 &invalid_range);
2149 switch(overlaptype) {
2150 case RL_OVERLAPISCONTAINED:
2151 /* start_offset <= rl_start, end_offset >= rl_end */
2152 if (ap->a_foffset != invalid_range->rl_start) {
2153 break;
2154 }
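/* FALLTHROUGH: when a_foffset == rl_start this case is handled like a matching overlap */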
2155 case RL_MATCHINGOVERLAP:
2156 /* start_offset = rl_start, end_offset = rl_end */
2157 case RL_OVERLAPCONTAINSRANGE:
2158 /* start_offset >= rl_start, end_offset <= rl_end */
2159 case RL_OVERLAPSTARTSBEFORE:
2160 /* start_offset > rl_start, end_offset >= rl_start */
2161 if ((off_t)fp->ff_size > (invalid_range->rl_end + 1)) {
2162 bytesContAvail = (invalid_range->rl_end + 1) - ap->a_foffset;
2163 } else {
2164 bytesContAvail = fp->ff_size - ap->a_foffset;
2165 }
2166 if (bytesContAvail > ap->a_size) {
2167 bytesContAvail = ap->a_size;
2168 }
2169 *ap->a_bpn = (daddr64_t)-1;
2170 retval = 0;
2171 break;
2172 case RL_OVERLAPENDSAFTER:
2173 /* start_offset < rl_start, end_offset < rl_end */
2174 case RL_NOOVERLAP:
2175 break;
2176 }
2177 goto exit;
2178 }
2179
2180 /* MapFileBlockC() found a valid extent in the filefork. Search the
2181 * mapping information further for invalid file ranges.
2182 */
2183 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
2184 ap->a_foffset + (off_t)bytesContAvail - 1,
2185 &invalid_range);
2186 if (overlaptype != RL_NOOVERLAP) {
2187 switch(overlaptype) {
2188 case RL_MATCHINGOVERLAP:
2189 case RL_OVERLAPCONTAINSRANGE:
2190 case RL_OVERLAPSTARTSBEFORE:
2191 /* There's no valid block for this byte offset */
2192 *ap->a_bpn = (daddr64_t)-1;
2193 /* There's no point limiting the amount to be returned
2194 * if the invalid range that was hit extends all the way
2195 * to the EOF (i.e. there are no valid bytes between the
2196 * end of this range and the file's EOF):
2197 */
2198 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
2199 (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) {
2200 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
2201 }
2202 break;
2203
2204 case RL_OVERLAPISCONTAINED:
2205 case RL_OVERLAPENDSAFTER:
2206 /* The range of interest hits an invalid block before the end: */
2207 if (invalid_range->rl_start == ap->a_foffset) {
2208 /* There's actually no valid information to be had starting here: */
2209 *ap->a_bpn = (daddr64_t)-1;
2210 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
2211 (invalid_range->rl_end + 1 - ap->a_foffset < bytesContAvail)) {
2212 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
2213 }
2214 } else {
2215 bytesContAvail = invalid_range->rl_start - ap->a_foffset;
2216 }
2217 break;
2218
2219 case RL_NOOVERLAP:
2220 break;
2221 } /* end switch */
2222 if (bytesContAvail > ap->a_size)
2223 bytesContAvail = ap->a_size;
2224 }
2225
2226 exit:
2227 if (retval == 0) {
2228 if (ap->a_run)
2229 *ap->a_run = bytesContAvail;
2230
2231 if (ap->a_poff)
2232 *(int *)ap->a_poff = 0;
2233 }
2234
2235 if (tooklock)
2236 hfs_unlock(cp);
2237
2238 return (MacToVFSError(retval));
2239 }
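/*
 * Illustrative sketch (not part of the original source): how a hypothetical
 * caller might interpret the values filled in above.  A physical block
 * number of -1 means the byte range has no valid on-disk backing yet (it
 * overlaps an invalid/zero-fill range), and a_run bounds how many
 * contiguous bytes the answer covers.
 */
#if 0	/* example only, not compiled */
static void
example_use_blockmap_result(daddr64_t bpn, size_t run, char *buf)
{
	if (bpn == (daddr64_t)-1) {
		/* Hole or uncommitted range: a reader should zero-fill it. */
		bzero(buf, run);
	} else {
		/* Otherwise `run` bytes are readable starting at device block `bpn`. */
	}
}
#endif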
2240
2241
2242 /*
2243 * prepare and issue the I/O
2244 * buf_strategy knows how to deal
2245 * with requests that require
2246 * fragmented I/Os
2247 */
2248 int
2249 hfs_vnop_strategy(struct vnop_strategy_args *ap)
2250 {
2251 buf_t bp = ap->a_bp;
2252 vnode_t vp = buf_vnode(bp);
2253
2254 return (buf_strategy(VTOHFS(vp)->hfs_devvp, ap));
2255 }
2256
2257
2258 static int
2259 do_hfs_truncate(struct vnode *vp, off_t length, int flags, vfs_context_t context)
2260 {
2261 register struct cnode *cp = VTOC(vp);
2262 struct filefork *fp = VTOF(vp);
2263 struct proc *p = vfs_context_proc(context);
2264 kauth_cred_t cred = vfs_context_ucred(context);
2265 int retval;
2266 off_t bytesToAdd;
2267 off_t actualBytesAdded;
2268 off_t filebytes;
2269 u_long fileblocks;
2270 int blksize;
2271 struct hfsmount *hfsmp;
2272 int lockflags;
2273
2274 blksize = VTOVCB(vp)->blockSize;
2275 fileblocks = fp->ff_blocks;
2276 filebytes = (off_t)fileblocks * (off_t)blksize;
2277
2278 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
2279 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
2280
2281 if (length < 0)
2282 return (EINVAL);
2283
2284 /* This should only happen with a corrupt filesystem */
2285 if ((off_t)fp->ff_size < 0)
2286 return (EINVAL);
2287
2288 if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
2289 return (EFBIG);
2290
2291 hfsmp = VTOHFS(vp);
2292
2293 retval = E_NONE;
2294
2295 /* Files that are changing size are not hot file candidates. */
2296 if (hfsmp->hfc_stage == HFC_RECORDING) {
2297 fp->ff_bytesread = 0;
2298 }
2299
2300 /*
2301 * We cannot just check if fp->ff_size == length (as an optimization)
2302 * since there may be extra physical blocks that also need truncation.
2303 */
2304 #if QUOTA
2305 if ((retval = hfs_getinoquota(cp)))
2306 return(retval);
2307 #endif /* QUOTA */
2308
2309 /*
2310 * Lengthen the size of the file. We must ensure that the
2311 * last byte of the file is allocated. Since the smallest
2312 * value of ff_size is 0, length will be at least 1.
2313 */
2314 if (length > (off_t)fp->ff_size) {
2315 #if QUOTA
2316 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
2317 cred, 0);
2318 if (retval)
2319 goto Err_Exit;
2320 #endif /* QUOTA */
2321 /*
2322 * If we don't have enough physical space then
2323 * we need to extend the physical size.
2324 */
2325 if (length > filebytes) {
2326 int eflags;
2327 u_long blockHint = 0;
2328
2329 /* All or nothing and don't round up to clumpsize. */
2330 eflags = kEFAllMask | kEFNoClumpMask;
2331
2332 if (cred && suser(cred, NULL) != 0)
2333 eflags |= kEFReserveMask; /* keep a reserve */
2334
2335 /*
2336 * Allocate Journal and Quota files in metadata zone.
2337 */
2338 if (filebytes == 0 &&
2339 hfsmp->hfs_flags & HFS_METADATA_ZONE &&
2340 hfs_virtualmetafile(cp)) {
2341 eflags |= kEFMetadataMask;
2342 blockHint = hfsmp->hfs_metazone_start;
2343 }
2344 if (hfs_start_transaction(hfsmp) != 0) {
2345 retval = EINVAL;
2346 goto Err_Exit;
2347 }
2348
2349 /* Protect extents b-tree and allocation bitmap */
2350 lockflags = SFL_BITMAP;
2351 if (overflow_extents(fp))
2352 lockflags |= SFL_EXTENTS;
2353 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2354
2355 while ((length > filebytes) && (retval == E_NONE)) {
2356 bytesToAdd = length - filebytes;
2357 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
2358 (FCB*)fp,
2359 bytesToAdd,
2360 blockHint,
2361 eflags,
2362 &actualBytesAdded));
2363
2364 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
2365 if (actualBytesAdded == 0 && retval == E_NONE) {
2366 if (length > filebytes)
2367 length = filebytes;
2368 break;
2369 }
2370 } /* endwhile */
2371
2372 hfs_systemfile_unlock(hfsmp, lockflags);
2373
2374 if (hfsmp->jnl) {
2375 (void) hfs_update(vp, TRUE);
2376 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2377 }
2378
2379 hfs_end_transaction(hfsmp);
2380
2381 if (retval)
2382 goto Err_Exit;
2383
2384 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
2385 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
2386 }
2387
2388 if (!(flags & IO_NOZEROFILL)) {
2389 if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
2390 struct rl_entry *invalid_range;
2391 off_t zero_limit;
2392
2393 zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
2394 if (length < zero_limit) zero_limit = length;
2395
2396 if (length > (off_t)fp->ff_size) {
2397 struct timeval tv;
2398
2399 /* Extending the file: time to fill out the current last page w. zeroes? */
2400 if ((fp->ff_size & PAGE_MASK_64) &&
2401 (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
2402 fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
2403
2404 /* There's some valid data at the start of the (current) last page
2405 of the file, so zero out the remainder of that page to ensure the
2406 entire page contains valid data. Since there is no invalid range
2407 possible past the (current) eof, there's no need to remove anything
2408 from the invalid range list before calling cluster_write(): */
2409 hfs_unlock(cp);
2410 retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
2411 fp->ff_size, (off_t)0,
2412 (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
2413 hfs_lock(cp, HFS_FORCE_LOCK);
2414 if (retval) goto Err_Exit;
2415
2416 /* Merely invalidate the remaining area, if necessary: */
2417 if (length > zero_limit) {
2418 microuptime(&tv);
2419 rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
2420 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
2421 }
2422 } else {
2423 /* The page containing the (current) eof is invalid: just add the
2424 remainder of the page to the invalid list, along with the area
2425 being newly allocated:
2426 */
2427 microuptime(&tv);
2428 rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
2429 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
2430 }
2431 }
2432 } else {
2433 panic("hfs_truncate: invoked on non-UBC object?!");
2434 }
2435 }
2436 cp->c_touch_modtime = TRUE;
2437 fp->ff_size = length;
2438
2439 } else { /* Shorten the size of the file */
2440
2441 if ((off_t)fp->ff_size > length) {
2442 /* Any space previously marked as invalid is now irrelevant: */
2443 rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
2444 }
2445
2446 /*
2447 * Account for any unmapped blocks. Note that the new
2448 * file length can still end up with unmapped blocks.
2449 */
2450 if (fp->ff_unallocblocks > 0) {
2451 u_int32_t finalblks;
2452 u_int32_t loanedBlocks;
2453
2454 HFS_MOUNT_LOCK(hfsmp, TRUE);
2455
2456 loanedBlocks = fp->ff_unallocblocks;
2457 cp->c_blocks -= loanedBlocks;
2458 fp->ff_blocks -= loanedBlocks;
2459 fp->ff_unallocblocks = 0;
2460
2461 hfsmp->loanedBlocks -= loanedBlocks;
2462
2463 finalblks = (length + blksize - 1) / blksize;
2464 if (finalblks > fp->ff_blocks) {
2465 /* calculate required unmapped blocks */
2466 loanedBlocks = finalblks - fp->ff_blocks;
2467 hfsmp->loanedBlocks += loanedBlocks;
2468
2469 fp->ff_unallocblocks = loanedBlocks;
2470 cp->c_blocks += loanedBlocks;
2471 fp->ff_blocks += loanedBlocks;
2472 }
2473 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
2474 }
2475
2476 /*
2477 * For a TBE process the deallocation of the file blocks is
2478 * delayed until the file is closed. And hfs_close calls
2479 * truncate with the IO_NDELAY flag set. So when IO_NDELAY
2480 * isn't set, we make sure this isn't a TBE process.
2481 */
2482 if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
2483 #if QUOTA
2484 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
2485 #endif /* QUOTA */
2486 if (hfs_start_transaction(hfsmp) != 0) {
2487 retval = EINVAL;
2488 goto Err_Exit;
2489 }
2490
2491 if (fp->ff_unallocblocks == 0) {
2492 /* Protect extents b-tree and allocation bitmap */
2493 lockflags = SFL_BITMAP;
2494 if (overflow_extents(fp))
2495 lockflags |= SFL_EXTENTS;
2496 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2497
2498 retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
2499 (FCB*)fp, length, false));
2500
2501 hfs_systemfile_unlock(hfsmp, lockflags);
2502 }
2503 if (hfsmp->jnl) {
2504 if (retval == 0) {
2505 fp->ff_size = length;
2506 }
2507 (void) hfs_update(vp, TRUE);
2508 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2509 }
2510
2511 hfs_end_transaction(hfsmp);
2512
2513 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
2514 if (retval)
2515 goto Err_Exit;
2516 #if QUOTA
2517 /* These are bytesreleased */
2518 (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
2519 #endif /* QUOTA */
2520 }
2521 /* Only set update flag if the logical length changes */
2522 if ((off_t)fp->ff_size != length)
2523 cp->c_touch_modtime = TRUE;
2524 fp->ff_size = length;
2525 }
2526 cp->c_touch_chgtime = TRUE; /* status changed */
2527 cp->c_touch_modtime = TRUE; /* file data was modified */
2528 retval = hfs_update(vp, MNT_WAIT);
2529 if (retval) {
2530 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
2531 -1, -1, -1, retval, 0);
2532 }
2533
2534 Err_Exit:
2535
2536 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
2537 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
2538
2539 return (retval);
2540 }
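/*
 * Illustrative worked example (not part of the original source): the loaned
 * block accounting in the shortening path of do_hfs_truncate() above.  With
 * a 4 KB allocation block size, truncating to length 10000 gives
 * finalblks = (10000 + 4095) / 4096 == 3; if only 2 real blocks back the
 * fork once the loan is dropped, 1 block is put back on loan so the
 * shortened file remains fully accounted for.
 */
#if 0	/* example only, not compiled */
static u_int32_t
example_final_block_count(off_t length, int blksize)
{
	/* Same rounding as above: e.g. (10000 + 4095) / 4096 == 3 allocation blocks */
	return ((length + blksize - 1) / blksize);
}
#endif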
2541
2542
2543
2544 /*
2545 * Truncate a cnode to at most length size, freeing (or adding) the
2546 * disk blocks.
2547 */
2548 __private_extern__
2549 int
2550 hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
2551 vfs_context_t context)
2552 {
2553 struct filefork *fp = VTOF(vp);
2554 off_t filebytes;
2555 u_long fileblocks;
2556 int blksize, error = 0;
2557 struct cnode *cp = VTOC(vp);
2558
2559 /* Cannot truncate an HFS directory! */
2560 if (vnode_isdir(vp)) {
2561 return (EISDIR);
2562 }
2563 /* A swap file cannot change size. */
2564 if (vnode_isswap(vp) && (length != 0)) {
2565 return (EPERM);
2566 }
2567
2568 blksize = VTOVCB(vp)->blockSize;
2569 fileblocks = fp->ff_blocks;
2570 filebytes = (off_t)fileblocks * (off_t)blksize;
2571
2572 //
2573 // Have to do this here so that we don't wind up with
2574 // i/o pending for blocks that are about to be released
2575 // if we truncate the file.
2576 //
2577 // If skipsetsize is set, then the caller is responsible
2578 // for the ubc_setsize.
2579 //
2580 if (!skipsetsize)
2581 ubc_setsize(vp, length);
2582
2583 // have to loop truncating or growing files that are
2584 // really big because otherwise transactions can get
2585 // enormous and consume too many kernel resources.
2586
2587 if (length < filebytes) {
2588 while (filebytes > length) {
2589 if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
2590 filebytes -= HFS_BIGFILE_SIZE;
2591 } else {
2592 filebytes = length;
2593 }
2594 cp->c_flag |= C_FORCEUPDATE;
2595 error = do_hfs_truncate(vp, filebytes, flags, context);
2596 if (error)
2597 break;
2598 }
2599 } else if (length > filebytes) {
2600 while (filebytes < length) {
2601 if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
2602 filebytes += HFS_BIGFILE_SIZE;
2603 } else {
2604 filebytes = length;
2605 }
2606 cp->c_flag |= C_FORCEUPDATE;
2607 error = do_hfs_truncate(vp, filebytes, flags, context);
2608 if (error)
2609 break;
2610 }
2611 } else /* Same logical size */ {
2612
2613 error = do_hfs_truncate(vp, length, flags, context);
2614 }
2615 /* Files that are changing size are not hot file candidates. */
2616 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
2617 fp->ff_bytesread = 0;
2618 }
2619
2620 return (error);
2621 }
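/*
 * Illustrative sketch (not part of the original source): the chunking
 * pattern used by hfs_truncate() above, isolated.  Shrinking from
 * `filebytes` to `length` proceeds in steps of at most HFS_BIGFILE_SIZE
 * (when the file has overflow extents) so that each do_hfs_truncate() call
 * stays within a bounded journal transaction.
 */
#if 0	/* example only, not compiled */
static void
example_truncate_in_chunks(off_t filebytes, off_t length, off_t chunk)
{
	while (filebytes > length) {
		if ((filebytes - length) > chunk)
			filebytes -= chunk;
		else
			filebytes = length;
		/* do_hfs_truncate(vp, filebytes, flags, context) would run here */
	}
}
#endif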
2622
2623
2624
2625 /*
2626 * Preallocate file storage space.
2627 */
2628 int
2629 hfs_vnop_allocate(struct vnop_allocate_args /* {
2630 vnode_t a_vp;
2631 off_t a_length;
2632 u_int32_t a_flags;
2633 off_t *a_bytesallocated;
2634 off_t a_offset;
2635 vfs_context_t a_context;
2636 } */ *ap)
2637 {
2638 struct vnode *vp = ap->a_vp;
2639 struct cnode *cp;
2640 struct filefork *fp;
2641 ExtendedVCB *vcb;
2642 off_t length = ap->a_length;
2643 off_t startingPEOF;
2644 off_t moreBytesRequested;
2645 off_t actualBytesAdded;
2646 off_t filebytes;
2647 u_long fileblocks;
2648 int retval, retval2;
2649 u_int32_t blockHint;
2650 u_int32_t extendFlags; /* For call to ExtendFileC */
2651 struct hfsmount *hfsmp;
2652 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
2653 int lockflags;
2654
2655 *(ap->a_bytesallocated) = 0;
2656
2657 if (!vnode_isreg(vp))
2658 return (EISDIR);
2659 if (length < (off_t)0)
2660 return (EINVAL);
2661
2662 cp = VTOC(vp);
2663
2664 hfs_lock_truncate(cp, TRUE);
2665
2666 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
2667 goto Err_Exit;
2668 }
2669
2670 fp = VTOF(vp);
2671 hfsmp = VTOHFS(vp);
2672 vcb = VTOVCB(vp);
2673
2674 fileblocks = fp->ff_blocks;
2675 filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
2676
2677 if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
2678 retval = EINVAL;
2679 goto Err_Exit;
2680 }
2681
2682 /* Fill in the flags word for the call to Extend the file */
2683
2684 extendFlags = kEFNoClumpMask;
2685 if (ap->a_flags & ALLOCATECONTIG)
2686 extendFlags |= kEFContigMask;
2687 if (ap->a_flags & ALLOCATEALL)
2688 extendFlags |= kEFAllMask;
2689 if (cred && suser(cred, NULL) != 0)
2690 extendFlags |= kEFReserveMask;
2691
2692 retval = E_NONE;
2693 blockHint = 0;
2694 startingPEOF = filebytes;
2695
2696 if (ap->a_flags & ALLOCATEFROMPEOF)
2697 length += filebytes;
2698 else if (ap->a_flags & ALLOCATEFROMVOL)
2699 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
2700
2701 /* If no changes are necessary, then we're done */
2702 if (filebytes == length)
2703 goto Std_Exit;
2704
2705 /*
2706 * Lengthen the size of the file. We must ensure that the
2707 * last byte of the file is allocated. Since the smallest
2708 * value of filebytes is 0, length will be at least 1.
2709 */
2710 if (length > filebytes) {
2711 off_t total_bytes_added = 0, orig_request_size;
2712
2713 orig_request_size = moreBytesRequested = length - filebytes;
2714
2715 #if QUOTA
2716 retval = hfs_chkdq(cp,
2717 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
2718 cred, 0);
2719 if (retval)
2720 goto Err_Exit;
2721
2722 #endif /* QUOTA */
2723 /*
2724 * Metadata zone checks.
2725 */
2726 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
2727 /*
2728 * Allocate Journal and Quota files in metadata zone.
2729 */
2730 if (hfs_virtualmetafile(cp)) {
2731 extendFlags |= kEFMetadataMask;
2732 blockHint = hfsmp->hfs_metazone_start;
2733 } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
2734 (blockHint <= hfsmp->hfs_metazone_end)) {
2735 /*
2736 * Move blockHint outside metadata zone.
2737 */
2738 blockHint = hfsmp->hfs_metazone_end + 1;
2739 }
2740 }
2741
2742
2743 while ((length > filebytes) && (retval == E_NONE)) {
2744 off_t bytesRequested;
2745
2746 if (hfs_start_transaction(hfsmp) != 0) {
2747 retval = EINVAL;
2748 goto Err_Exit;
2749 }
2750
2751 /* Protect extents b-tree and allocation bitmap */
2752 lockflags = SFL_BITMAP;
2753 if (overflow_extents(fp))
2754 lockflags |= SFL_EXTENTS;
2755 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
2756
2757 if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
2758 bytesRequested = HFS_BIGFILE_SIZE;
2759 } else {
2760 bytesRequested = moreBytesRequested;
2761 }
2762
2763 retval = MacToVFSError(ExtendFileC(vcb,
2764 (FCB*)fp,
2765 bytesRequested,
2766 blockHint,
2767 extendFlags,
2768 &actualBytesAdded));
2769
2770 if (retval == E_NONE) {
2771 *(ap->a_bytesallocated) += actualBytesAdded;
2772 total_bytes_added += actualBytesAdded;
2773 moreBytesRequested -= actualBytesAdded;
2774 if (blockHint != 0) {
2775 blockHint += actualBytesAdded / vcb->blockSize;
2776 }
2777 }
2778 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
2779
2780 hfs_systemfile_unlock(hfsmp, lockflags);
2781
2782 if (hfsmp->jnl) {
2783 (void) hfs_update(vp, TRUE);
2784 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
2785 }
2786
2787 hfs_end_transaction(hfsmp);
2788 }
2789
2790
2791 /*
2792 * if we get an error and no changes were made then exit
2793 * otherwise we must do the hfs_update to reflect the changes
2794 */
2795 if (retval && (startingPEOF == filebytes))
2796 goto Err_Exit;
2797
2798 /*
2799 * Adjust actualBytesAdded to be allocation block aligned, not
2800 * clump size aligned.
2801 * NOTE: What we report here does not affect the on-disk allocation
2802 * until the file is closed, when we truncate the file back to
2803 * allocation block size.
2804 */
2805 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
2806 *(ap->a_bytesallocated) =
2807 roundup(orig_request_size, (off_t)vcb->blockSize);
2808
2809 } else { /* Shorten the size of the file */
2810
2811 if (fp->ff_size > length) {
2812 /*
2813 * Any buffers that are past the truncation point need to be
2814 * invalidated (to maintain buffer cache consistency).
2815 */
2816 }
2817
2818 retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
2819 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
2820
2821 /*
2822 * if we get an error and no changes were made then exit
2823 * otherwise we must do the hfs_update to reflect the changes
2824 */
2825 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
2826 #if QUOTA
2827 /* These are bytesreleased */
2828 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
2829 #endif /* QUOTA */
2830
2831 if (fp->ff_size > filebytes) {
2832 fp->ff_size = filebytes;
2833
2834 hfs_unlock(cp);
2835 ubc_setsize(vp, fp->ff_size);
2836 hfs_lock(cp, HFS_FORCE_LOCK);
2837 }
2838 }
2839
2840 Std_Exit:
2841 cp->c_touch_chgtime = TRUE;
2842 cp->c_touch_modtime = TRUE;
2843 retval2 = hfs_update(vp, MNT_WAIT);
2844
2845 if (retval == 0)
2846 retval = retval2;
2847 Err_Exit:
2848 hfs_unlock_truncate(cp, TRUE);
2849 hfs_unlock(cp);
2850 return (retval);
2851 }
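/*
 * Illustrative worked example (not part of the original source): the
 * rounding performed above when the allocator over-delivers.  Requesting
 * 10000 bytes on a volume with a 4 KB allocation block size may add 16384
 * bytes in one ExtendFileC() call, but the amount reported back to the
 * caller is roundup(10000, 4096) == 12288, since the excess is trimmed
 * back to allocation block size when the file is closed.
 */
#if 0	/* example only, not compiled */
static off_t
example_reported_allocation(off_t orig_request_size, off_t blockSize)
{
	/* Same rounding as above: report allocation-block alignment, not clump alignment. */
	return (roundup(orig_request_size, blockSize));
}
#endif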
2852
2853
2854 /*
2855 * Pagein for HFS filesystem
2856 */
2857 int
2858 hfs_vnop_pagein(struct vnop_pagein_args *ap)
2859 /*
2860 struct vnop_pagein_args {
2861 vnode_t a_vp,
2862 upl_t a_pl,
2863 vm_offset_t a_pl_offset,
2864 off_t a_f_offset,
2865 size_t a_size,
2866 int a_flags
2867 vfs_context_t a_context;
2868 };
2869 */
2870 {
2871 vnode_t vp = ap->a_vp;
2872 int error;
2873
2874 error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
2875 ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags);
2876 /*
2877 * Keep track of blocks read.
2878 */
2879 if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
2880 struct cnode *cp;
2881 struct filefork *fp;
2882 int bytesread;
2883 int took_cnode_lock = 0;
2884
2885 cp = VTOC(vp);
2886 fp = VTOF(vp);
2887
2888 if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
2889 bytesread = fp->ff_size;
2890 else
2891 bytesread = ap->a_size;
2892
2893 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
2894 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
2895 hfs_lock(cp, HFS_FORCE_LOCK);
2896 took_cnode_lock = 1;
2897 }
2898 /*
2899 * If this file hasn't been seen since the start of
2900 * the current sampling period then start over.
2901 */
2902 if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
2903 struct timeval tv;
2904
2905 fp->ff_bytesread = bytesread;
2906 microtime(&tv);
2907 cp->c_atime = tv.tv_sec;
2908 } else {
2909 fp->ff_bytesread += bytesread;
2910 }
2911 cp->c_touch_acctime = TRUE;
2912 if (took_cnode_lock)
2913 hfs_unlock(cp);
2914 }
2915 return (error);
2916 }
2917
2918 /*
2919 * Pageout for HFS filesystem.
2920 */
2921 int
2922 hfs_vnop_pageout(struct vnop_pageout_args *ap)
2923 /*
2924 struct vnop_pageout_args {
2925 vnode_t a_vp,
2926 upl_t a_pl,
2927 vm_offset_t a_pl_offset,
2928 off_t a_f_offset,
2929 size_t a_size,
2930 int a_flags
2931 vfs_context_t a_context;
2932 };
2933 */
2934 {
2935 vnode_t vp = ap->a_vp;
2936 struct cnode *cp;
2937 struct filefork *fp;
2938 int retval;
2939 off_t filesize;
2940
2941 cp = VTOC(vp);
2942 fp = VTOF(vp);
2943
2944 if (vnode_isswap(vp)) {
2945 filesize = fp->ff_size;
2946 } else {
2947 off_t end_of_range;
2948 int tooklock = 0;
2949
2950 if (cp->c_lockowner != current_thread()) {
2951 if ( (retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
2952 if (!(ap->a_flags & UPL_NOCOMMIT)) {
2953 ubc_upl_abort_range(ap->a_pl,
2954 ap->a_pl_offset,
2955 ap->a_size,
2956 UPL_ABORT_FREE_ON_EMPTY);
2957 }
2958 return (retval);
2959 }
2960 tooklock = 1;
2961 }
2962
2963 filesize = fp->ff_size;
2964 end_of_range = ap->a_f_offset + ap->a_size - 1;
2965
2966 if (end_of_range >= filesize) {
2967 end_of_range = (off_t)(filesize - 1);
2968 }
2969 if (ap->a_f_offset < filesize) {
2970 rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
2971 cp->c_flag |= C_MODIFIED; /* leof is dirty */
2972 }
2973
2974 if (tooklock) {
2975 hfs_unlock(cp);
2976 }
2977 }
2978
2979 retval = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
2980 ap->a_size, filesize, ap->a_flags);
2981
2982 /*
2983 * If data was written, and setuid or setgid bits are set and
2984 * this process is not the superuser then clear the setuid and
2985 * setgid bits as a precaution against tampering.
2986 */
2987 if ((retval == 0) &&
2988 (cp->c_mode & (S_ISUID | S_ISGID)) &&
2989 (vfs_context_suser(ap->a_context) != 0)) {
2990 hfs_lock(cp, HFS_FORCE_LOCK);
2991 cp->c_mode &= ~(S_ISUID | S_ISGID);
2992 cp->c_touch_chgtime = TRUE;
2993 hfs_unlock(cp);
2994 }
2995 return (retval);
2996 }
2997
2998 /*
2999 * Intercept B-Tree node writes to unswap them if necessary.
3000 */
3001 int
3002 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
3003 {
3004 int retval = 0;
3005 register struct buf *bp = ap->a_bp;
3006 register struct vnode *vp = buf_vnode(bp);
3007 BlockDescriptor block;
3008
3009 /* Trap B-Tree writes */
3010 if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
3011 (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
3012 (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
3013 (vp == VTOHFS(vp)->hfc_filevp)) {
3014
3015 /*
3016 * Swap and validate the node if it is in native byte order.
3017 * This is always true on big endian, so we always validate
3018 * before writing here. On little endian, the node typically has
3019 * been swapped and validated when it was written to the journal,
3020 * so we won't do anything here.
3021 */
3022 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
3023 /* Prepare the block pointer */
3024 block.blockHeader = bp;
3025 block.buffer = (char *)buf_dataptr(bp);
3026 block.blockNum = buf_lblkno(bp);
3027 /* not found in cache ==> came from disk */
3028 block.blockReadFromDisk = (buf_fromcache(bp) == 0);
3029 block.blockSize = buf_count(bp);
3030
3031 /* Endian un-swap B-Tree node */
3032 retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig);
3033 if (retval)
3034 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
3035 }
3036 }
3037
3038 /* This buffer shouldn't be locked anymore but if it is, clear it */
3039 if ((buf_flags(bp) & B_LOCKED)) {
3040 // XXXdbg
3041 if (VTOHFS(vp)->jnl) {
3042 panic("hfs: CLEARING the lock bit on bp %p\n", bp);
3043 }
3044 buf_clearflags(bp, B_LOCKED);
3045 }
3046 retval = vn_bwrite (ap);
3047
3048 return (retval);
3049 }
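/*
 * Illustrative sketch (not part of the original source): why the 0x000e
 * test above detects a node still in host byte order.  The last u_int16_t
 * of a B-tree node is the offset-table entry for record 0, which is always
 * 14 (the size of the node descriptor); once the node has been swapped to
 * big-endian on a little-endian host, those two bytes read back as 0x0e00
 * instead.
 */
#if 0	/* example only, not compiled */
static int
example_node_in_host_order(const void *node, size_t nodesize)
{
	u_int16_t rec0_offset = ((const u_int16_t *)((const char *)node + nodesize - 2))[0];
	return (rec0_offset == 0x000e);
}
#endif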
3050
3051 /*
3052 * Relocate a file to a new location on disk
3053 * cnode must be locked on entry
3054 *
3055 * Relocation occurs by cloning the file's data from its
3056 * current set of blocks to a new set of blocks. During
3057 * the relocation all of the blocks (old and new) are
3058 * owned by the file.
3059 *
3060 * -----------------
3061 * |///////////////|
3062 * -----------------
3063 * 0 N (file offset)
3064 *
3065 * ----------------- -----------------
3066 * |///////////////| | | STEP 1 (acquire new blocks)
3067 * ----------------- -----------------
3068 * 0 N N+1 2N
3069 *
3070 * ----------------- -----------------
3071 * |///////////////| |///////////////| STEP 2 (clone data)
3072 * ----------------- -----------------
3073 * 0 N N+1 2N
3074 *
3075 * -----------------
3076 * |///////////////| STEP 3 (head truncate blocks)
3077 * -----------------
3078 * 0 N
3079 *
3080 * During steps 2 and 3 page-outs to file offsets less
3081 * than or equal to N are suspended.
3082 *
3083 * During step 3 page-ins to the file get suspended.
3084 */
3085 __private_extern__
3086 int
3087 hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
3088 struct proc *p)
3089 {
3090 struct cnode *cp;
3091 struct filefork *fp;
3092 struct hfsmount *hfsmp;
3093 u_int32_t headblks;
3094 u_int32_t datablks;
3095 u_int32_t blksize;
3096 u_int32_t growsize;
3097 u_int32_t nextallocsave;
3098 daddr64_t sector_a, sector_b;
3099 int eflags;
3100 off_t newbytes;
3101 int retval;
3102 int lockflags = 0;
3103 int took_trunc_lock = 0;
3104 int started_tr = 0;
3105 enum vtype vnodetype;
3106
3107 vnodetype = vnode_vtype(vp);
3108 if (vnodetype != VREG && vnodetype != VLNK) {
3109 return (EPERM);
3110 }
3111
3112 hfsmp = VTOHFS(vp);
3113 if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
3114 return (ENOSPC);
3115 }
3116
3117 cp = VTOC(vp);
3118 fp = VTOF(vp);
3119 if (fp->ff_unallocblocks)
3120 return (EINVAL);
3121 blksize = hfsmp->blockSize;
3122 if (blockHint == 0)
3123 blockHint = hfsmp->nextAllocation;
3124
3125 if ((fp->ff_size > 0x7fffffff) ||
3126 ((fp->ff_size > blksize) && vnodetype == VLNK)) {
3127 return (EFBIG);
3128 }
3129
3130 //
3131 // We do not believe that this call to hfs_fsync() is
3132 // necessary and it causes a journal transaction
3133 // deadlock so we are removing it.
3134 //
3135 //if (vnodetype == VREG && !vnode_issystem(vp)) {
3136 // retval = hfs_fsync(vp, MNT_WAIT, 0, p);
3137 // if (retval)
3138 // return (retval);
3139 //}
3140
3141 if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
3142 hfs_unlock(cp);
3143 hfs_lock_truncate(cp, TRUE);
3144 /* Force lock since callers expect the lock to be held. */
3145 if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) {
3146 hfs_unlock_truncate(cp, TRUE);
3147 return (retval);
3148 }
3149 /* No need to continue if file was removed. */
3150 if (cp->c_flag & C_NOEXISTS) {
3151 hfs_unlock_truncate(cp, TRUE);
3152 return (ENOENT);
3153 }
3154 took_trunc_lock = 1;
3155 }
3156 headblks = fp->ff_blocks;
3157 datablks = howmany(fp->ff_size, blksize);
3158 growsize = datablks * blksize;
3159 eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
3160 if (blockHint >= hfsmp->hfs_metazone_start &&
3161 blockHint <= hfsmp->hfs_metazone_end)
3162 eflags |= kEFMetadataMask;
3163
3164 if (hfs_start_transaction(hfsmp) != 0) {
3165 if (took_trunc_lock)
3166 hfs_unlock_truncate(cp, TRUE);
3167 return (EINVAL);
3168 }
3169 started_tr = 1;
3170 /*
3171 * Protect the extents b-tree and the allocation bitmap
3172 * during MapFileBlockC and ExtendFileC operations.
3173 */
3174 lockflags = SFL_BITMAP;
3175 if (overflow_extents(fp))
3176 lockflags |= SFL_EXTENTS;
3177 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3178
3179 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
3180 if (retval) {
3181 retval = MacToVFSError(retval);
3182 goto out;
3183 }
3184
3185 /*
3186 * STEP 1 - acquire new allocation blocks.
3187 */
3188 nextallocsave = hfsmp->nextAllocation;
3189 retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
3190 if (eflags & kEFMetadataMask) {
3191 HFS_MOUNT_LOCK(hfsmp, TRUE);
3192 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
3193 MarkVCBDirty(hfsmp);
3194 HFS_MOUNT_UNLOCK(hfsmp, TRUE);
3195 }
3196
3197 retval = MacToVFSError(retval);
3198 if (retval == 0) {
3199 cp->c_flag |= C_MODIFIED;
3200 if (newbytes < growsize) {
3201 retval = ENOSPC;
3202 goto restore;
3203 } else if (fp->ff_blocks < (headblks + datablks)) {
3204 printf("hfs_relocate: allocation failed");
3205 retval = ENOSPC;
3206 goto restore;
3207 }
3208
3209 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
3210 if (retval) {
3211 retval = MacToVFSError(retval);
3212 } else if ((sector_a + 1) == sector_b) {
3213 retval = ENOSPC;
3214 goto restore;
3215 } else if ((eflags & kEFMetadataMask) &&
3216 ((((u_int64_t)sector_b * hfsmp->hfs_phys_block_size) / blksize) >
3217 hfsmp->hfs_metazone_end)) {
3218 const char * filestr;
3219 char emptystr = '\0';
3220
3221 if (cp->c_desc.cd_nameptr != NULL) {
3222 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
3223 } else if (vnode_name(vp) != NULL) {
3224 filestr = vnode_name(vp);
3225 } else {
3226 filestr = &emptystr;
3227 }
3228 printf("hfs_relocate: %s didn't move into MDZ (%d blks)\n", filestr, fp->ff_blocks);
3229 retval = ENOSPC;
3230 goto restore;
3231 }
3232 }
3233 /* Done with system locks and journal for now. */
3234 hfs_systemfile_unlock(hfsmp, lockflags);
3235 lockflags = 0;
3236 hfs_end_transaction(hfsmp);
3237 started_tr = 0;
3238
3239 if (retval) {
3240 /*
3241 * Check to see if failure is due to excessive fragmentation.
3242 */
3243 if ((retval == ENOSPC) &&
3244 (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
3245 hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
3246 }
3247 goto out;
3248 }
3249 /*
3250 * STEP 2 - clone file data into the new allocation blocks.
3251 */
3252
3253 if (vnodetype == VLNK)
3254 retval = hfs_clonelink(vp, blksize, cred, p);
3255 else if (vnode_issystem(vp))
3256 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
3257 else
3258 retval = hfs_clonefile(vp, headblks, datablks, blksize);
3259
3260 /* Start transaction for step 3 or for a restore. */
3261 if (hfs_start_transaction(hfsmp) != 0) {
3262 retval = EINVAL;
3263 goto out;
3264 }
3265 started_tr = 1;
3266 if (retval)
3267 goto restore;
3268
3269 /*
3270 * STEP 3 - switch to cloned data and remove old blocks.
3271 */
3272 lockflags = SFL_BITMAP;
3273 if (overflow_extents(fp))
3274 lockflags |= SFL_EXTENTS;
3275 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3276
3277 retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
3278
3279 hfs_systemfile_unlock(hfsmp, lockflags);
3280 lockflags = 0;
3281 if (retval)
3282 goto restore;
3283 out:
3284 if (took_trunc_lock)
3285 hfs_unlock_truncate(cp, TRUE);
3286
3287 if (lockflags) {
3288 hfs_systemfile_unlock(hfsmp, lockflags);
3289 lockflags = 0;
3290 }
3291
3292 /* Push cnode's new extent data to disk. */
3293 if (retval == 0) {
3294 (void) hfs_update(vp, MNT_WAIT);
3295 }
3296 if (hfsmp->jnl) {
3297 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
3298 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
3299 else
3300 (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
3301 }
3302 exit:
3303 if (started_tr)
3304 hfs_end_transaction(hfsmp);
3305
3306 return (retval);
3307
3308 restore:
3309 if (fp->ff_blocks == headblks) {
3310 if (took_trunc_lock)
3311 hfs_unlock_truncate(cp, TRUE);
3312 goto exit;
3313 }
3314 /*
3315 * Give back any newly allocated space.
3316 */
3317 if (lockflags == 0) {
3318 lockflags = SFL_BITMAP;
3319 if (overflow_extents(fp))
3320 lockflags |= SFL_EXTENTS;
3321 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3322 }
3323
3324 (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false);
3325
3326 hfs_systemfile_unlock(hfsmp, lockflags);
3327 lockflags = 0;
3328
3329 if (took_trunc_lock)
3330 hfs_unlock_truncate(cp, TRUE);
3331 goto exit;
3332 }
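/*
 * Illustrative worked example (not part of the original source): the
 * sector-to-allocation-block conversion used in the metadata zone check
 * inside hfs_relocate() above.  With 512-byte device blocks and 4 KB
 * allocation blocks, device sector 81920 corresponds to allocation block
 * (81920 * 512) / 4096 == 10240; a metadata relocation is rejected when
 * that index lands past hfs_metazone_end.
 */
#if 0	/* example only, not compiled */
static u_int64_t
example_sector_to_allocation_block(u_int64_t sector, u_int32_t phys_block_size,
	u_int32_t alloc_block_size)
{
	return ((sector * phys_block_size) / alloc_block_size);
}
#endif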
3333
3334
3335 /*
3336 * Clone a symlink.
3337 *
3338 */
3339 static int
3340 hfs_clonelink(struct vnode *vp, int blksize, kauth_cred_t cred, __unused struct proc *p)
3341 {
3342 struct buf *head_bp = NULL;
3343 struct buf *tail_bp = NULL;
3344 int error;
3345
3346
3347 error = (int)buf_meta_bread(vp, (daddr64_t)0, blksize, cred, &head_bp);
3348 if (error)
3349 goto out;
3350
3351 tail_bp = buf_getblk(vp, (daddr64_t)1, blksize, 0, 0, BLK_META);
3352 if (tail_bp == NULL) {
3353 error = EIO;
3354 goto out;
3355 }
3356 bcopy((char *)buf_dataptr(head_bp), (char *)buf_dataptr(tail_bp), blksize);
3357 error = (int)buf_bwrite(tail_bp);
3358 out:
3359 if (head_bp) {
3360 buf_markinvalid(head_bp);
3361 buf_brelse(head_bp);
3362 }
3363 (void) buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
3364
3365 return (error);
3366 }
3367
3368 /*
3369 * Clone a file's data within the file.
3370 *
3371 */
3372 static int
3373 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
3374 {
3375 caddr_t bufp;
3376 size_t writebase;
3377 size_t bufsize;
3378 size_t copysize;
3379 size_t iosize;
3380 off_t filesize;
3381 size_t offset;
3382 uio_t auio;
3383 int error = 0;
3384
3385 filesize = VTOF(vp)->ff_blocks * blksize; /* virtual file size */
3386 writebase = blkstart * blksize;
3387 copysize = blkcnt * blksize;
3388 iosize = bufsize = MIN(copysize, 128 * 1024);
3389 offset = 0;
3390
3391 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
3392 return (ENOMEM);
3393 }
3394 hfs_unlock(VTOC(vp));
3395
3396 auio = uio_create(1, 0, UIO_SYSSPACE32, UIO_READ);
3397
3398 while (offset < copysize) {
3399 iosize = MIN(copysize - offset, iosize);
3400
3401 uio_reset(auio, offset, UIO_SYSSPACE32, UIO_READ);
3402 uio_addiov(auio, (uintptr_t)bufp, iosize);
3403
3404 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
3405 if (error) {
3406 printf("hfs_clonefile: cluster_read failed - %d\n", error);
3407 break;
3408 }
3409 if (uio_resid(auio) != 0) {
3410 printf("clonedata: cluster_read: uio_resid = %lld\n", uio_resid(auio));
3411 error = EIO;
3412 break;
3413 }
3414
3415 uio_reset(auio, writebase + offset, UIO_SYSSPACE32, UIO_WRITE);
3416 uio_addiov(auio, (uintptr_t)bufp, iosize);
3417
3418 error = cluster_write(vp, auio, filesize + offset,
3419 filesize + offset + iosize,
3420 uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
3421 if (error) {
3422 printf("hfs_clonefile: cluster_write failed - %d\n", error);
3423 break;
3424 }
3425 if (uio_resid(auio) != 0) {
3426 printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
3427 error = EIO;
3428 break;
3429 }
3430 offset += iosize;
3431 }
3432 uio_free(auio);
3433
3434 /*
3435 * No need to call ubc_sync_range or hfs_invalbuf
3436 * since the file was copied using IO_NOCACHE.
3437 */
3438
3439 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
3440
3441 hfs_lock(VTOC(vp), HFS_FORCE_LOCK);
3442 return (error);
3443 }
3444
3445 /*
3446 * Clone a system (metadata) file.
3447 *
3448 */
3449 static int
3450 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
3451 kauth_cred_t cred, struct proc *p)
3452 {
3453 caddr_t bufp;
3454 char * offset;
3455 size_t bufsize;
3456 size_t iosize;
3457 struct buf *bp = NULL;
3458 daddr64_t blkno;
3459 daddr64_t blk;
3460 daddr64_t start_blk;
3461 daddr64_t last_blk;
3462 int breadcnt;
3463 int i;
3464 int error = 0;
3465
3466
3467 iosize = GetLogicalBlockSize(vp);
3468 bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
3469 breadcnt = bufsize / iosize;
3470
3471 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
3472 return (ENOMEM);
3473 }
3474 start_blk = ((daddr64_t)blkstart * blksize) / iosize;
3475 last_blk = ((daddr64_t)blkcnt * blksize) / iosize;
3476 blkno = 0;
3477
3478 while (blkno < last_blk) {
3479 /*
3480 * Read up to a megabyte
3481 */
3482 offset = bufp;
3483 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
3484 error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
3485 if (error) {
3486 printf("hfs_clonesysfile: meta_bread error %d\n", error);
3487 goto out;
3488 }
3489 if (buf_count(bp) != iosize) {
3490 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
3491 goto out;
3492 }
3493 bcopy((char *)buf_dataptr(bp), offset, iosize);
3494
3495 buf_markinvalid(bp);
3496 buf_brelse(bp);
3497 bp = NULL;
3498
3499 offset += iosize;
3500 }
3501
3502 /*
3503 * Write up to a megabyte
3504 */
3505 offset = bufp;
3506 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
3507 bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
3508 if (bp == NULL) {
3509 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
3510 error = EIO;
3511 goto out;
3512 }
3513 bcopy(offset, (char *)buf_dataptr(bp), iosize);
3514 error = (int)buf_bwrite(bp);
3515 bp = NULL;
3516 if (error)
3517 goto out;
3518 offset += iosize;
3519 }
3520 }
3521 out:
3522 if (bp) {
3523 buf_brelse(bp);
3524 }
3525
3526 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
3527
3528 error = hfs_fsync(vp, MNT_WAIT, 0, p);
3529
3530 return (error);
3531 }