1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* @(#)hfs_readwrite.c 1.0
29 *
30 * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved
31 *
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/resourcevar.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/filedesc.h>
42 #include <sys/stat.h>
43 #include <sys/buf.h>
44 #include <sys/buf_internal.h>
45 #include <sys/proc.h>
46 #include <sys/kauth.h>
47 #include <sys/vnode.h>
48 #include <sys/vnode_internal.h>
49 #include <sys/uio.h>
50 #include <sys/vfs_context.h>
51 #include <sys/fsevents.h>
52 #include <kern/kalloc.h>
53 #include <sys/disk.h>
54 #include <sys/sysctl.h>
55 #include <sys/fsctl.h>
56 #include <sys/mount_internal.h>
57 #include <sys/file_internal.h>
58
59 #include <miscfs/specfs/specdev.h>
60
61 #include <sys/ubc.h>
62 #include <sys/ubc_internal.h>
63
64 #include <vm/vm_pageout.h>
65 #include <vm/vm_kern.h>
66
67 #include <sys/kdebug.h>
68
69 #include "hfs.h"
70 #include "hfs_attrlist.h"
71 #include "hfs_endian.h"
72 #include "hfs_fsctl.h"
73 #include "hfs_quota.h"
74 #include "hfscommon/headers/FileMgrInternal.h"
75 #include "hfscommon/headers/BTreesInternal.h"
76 #include "hfs_cnode.h"
77 #include "hfs_dbg.h"
78
79 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
80
81 enum {
82 MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */
83 };
84
85 /* from bsd/hfs/hfs_vfsops.c */
86 extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
87
88 static int hfs_clonefile(struct vnode *, int, int, int);
89 static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
90 static int hfs_minorupdate(struct vnode *vp);
91 static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
92
93 /* from bsd/hfs/hfs_vnops.c */
94 extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp);
95
96
97
98 int flush_cache_on_write = 0;
99 SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
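/*
 * Illustrative only: because the sysctl above is registered under _kern with
 * OID_AUTO, it should surface in user space as "kern.flush_cache_on_write"
 * (the node name is inferred from the declaration, not taken from a header).
 * A minimal sketch of toggling it with sysctlbyname(3):
 */
#if 0	/* user-space sketch, not part of this file */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int on = 1;

	/* ask the kernel to flush the drive cache on writes to uncached files */
	if (sysctlbyname("kern.flush_cache_on_write", NULL, NULL, &on, sizeof(on)) != 0)
		perror("sysctlbyname");
	return 0;
}
#endif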
100
101 /*
102 * Read data from a file.
103 */
104 int
105 hfs_vnop_read(struct vnop_read_args *ap)
106 {
107 /*
108 struct vnop_read_args {
109 struct vnodeop_desc *a_desc;
110 vnode_t a_vp;
111 struct uio *a_uio;
112 int a_ioflag;
113 vfs_context_t a_context;
114 };
115 */
116
117 uio_t uio = ap->a_uio;
118 struct vnode *vp = ap->a_vp;
119 struct cnode *cp;
120 struct filefork *fp;
121 struct hfsmount *hfsmp;
122 off_t filesize;
123 off_t filebytes;
124 off_t start_resid = uio_resid(uio);
125 off_t offset = uio_offset(uio);
126 int retval = 0;
127 int took_truncate_lock = 0;
128 int io_throttle = 0;
129 int throttled_count = 0;
130
131 /* Preflight checks */
132 if (!vnode_isreg(vp)) {
133 /* can only read regular files */
134 if (vnode_isdir(vp))
135 return (EISDIR);
136 else
137 return (EPERM);
138 }
139 if (start_resid == 0)
140 return (0); /* Nothing left to do */
141 if (offset < 0)
142                 return (EINVAL);        /* can't read from a negative offset */
143
144 if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
145 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
146                 /* Don't allow unencrypted I/O requests from user space */
147 return EPERM;
148 }
149
150
151
152 #if HFS_COMPRESSION
153 if (VNODE_IS_RSRC(vp)) {
154 if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
155 return 0;
156 }
157 /* otherwise read the resource fork normally */
158 } else {
159 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
160 if (compressed) {
161 retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
162 if (compressed) {
163 if (retval == 0) {
164 /* successful read, update the access time */
165 VTOC(vp)->c_touch_acctime = TRUE;
166
167 /* compressed files are not hot file candidates */
168 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
169 VTOF(vp)->ff_bytesread = 0;
170 }
171 }
172 return retval;
173 }
174 /* otherwise the file was converted back to a regular file while we were reading it */
175 retval = 0;
176 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
177 int error;
178
179 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
180 if (error) {
181 return error;
182 }
183
184 }
185 }
186 #endif /* HFS_COMPRESSION */
187
188 cp = VTOC(vp);
189 fp = VTOF(vp);
190 hfsmp = VTOHFS(vp);
191
192 #if CONFIG_PROTECT
193 if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
194 goto exit;
195 }
196 #endif
197
198 /*
199 * If this read request originated from a syscall (as opposed to
200 * an in-kernel page fault or something), then set it up for
201 * throttle checks
202 */
203 if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
204 io_throttle = IO_RETURN_ON_THROTTLE;
205 }
206
207 read_again:
208
209 /* Protect against a size change. */
210 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
211 took_truncate_lock = 1;
212
213 filesize = fp->ff_size;
214 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
215
216 /*
217 * Check the file size. Note that per POSIX spec, we return 0 at
218 * file EOF, so attempting a read at an offset that is too big
219 * should just return 0 on HFS+. Since the return value was initialized
220 * to 0 above, we just jump to exit. HFS Standard has its own behavior.
221 */
222 if (offset > filesize) {
223 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
224 (offset > (off_t)MAXHFSFILESIZE)) {
225 retval = EFBIG;
226 }
227 goto exit;
228 }
229
230 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
231 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
232
233 retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle);
234
235 cp->c_touch_acctime = TRUE;
236
237 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
238 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
239
240 /*
241          * Keep track of blocks read
242 */
243 if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
244 int took_cnode_lock = 0;
245 off_t bytesread;
246
247 bytesread = start_resid - uio_resid(uio);
248
249 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
250 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
251 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
252 took_cnode_lock = 1;
253 }
254 /*
255 * If this file hasn't been seen since the start of
256 * the current sampling period then start over.
257 */
258 if (cp->c_atime < hfsmp->hfc_timebase) {
259 struct timeval tv;
260
261 fp->ff_bytesread = bytesread;
262 microtime(&tv);
263 cp->c_atime = tv.tv_sec;
264 } else {
265 fp->ff_bytesread += bytesread;
266 }
267 if (took_cnode_lock)
268 hfs_unlock(cp);
269 }
270 exit:
271 if (took_truncate_lock) {
272 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
273 }
274 if (retval == EAGAIN) {
275 throttle_lowpri_io(1);
276 throttled_count++;
277
278 retval = 0;
279 goto read_again;
280 }
281 if (throttled_count) {
282 throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
283 }
284 return (retval);
285 }
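/*
 * The EAGAIN handling above follows a pattern used throughout this file:
 * syscall-originated I/O is issued with IO_RETURN_ON_THROTTLE, and when the
 * cluster layer returns EAGAIN the thread sleeps in throttle_lowpri_io() and
 * retries, resetting the throttle window once it finally completes.  A
 * condensed sketch of that loop (a simplification of the code above, not a
 * separate implementation):
 */
#if 0	/* illustrative sketch only */
	int retval, throttled_count = 0;
retry:
	retval = cluster_read(vp, uio, filesize, ioflag | IO_RETURN_ON_THROTTLE);
	if (retval == EAGAIN) {
		throttle_lowpri_io(1);		/* wait until we may issue I/O again */
		throttled_count++;
		retval = 0;
		goto retry;			/* the uio resumes where it left off */
	}
	if (throttled_count)
		throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
#endif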
286
287 /*
288 * Write data to a file.
289 */
290 int
291 hfs_vnop_write(struct vnop_write_args *ap)
292 {
293 uio_t uio = ap->a_uio;
294 struct vnode *vp = ap->a_vp;
295 struct cnode *cp;
296 struct filefork *fp;
297 struct hfsmount *hfsmp;
298 kauth_cred_t cred = NULL;
299 off_t origFileSize;
300 off_t writelimit;
301 off_t bytesToAdd = 0;
302 off_t actualBytesAdded;
303 off_t filebytes;
304 off_t offset;
305 ssize_t resid;
306 int eflags;
307 int ioflag = ap->a_ioflag;
308 int retval = 0;
309 int lockflags;
310 int cnode_locked = 0;
311 int partialwrite = 0;
312 int do_snapshot = 1;
313 time_t orig_ctime=VTOC(vp)->c_ctime;
314 int took_truncate_lock = 0;
315 int io_return_on_throttle = 0;
316 int throttled_count = 0;
317 struct rl_entry *invalid_range;
318
319 #if HFS_COMPRESSION
320 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
321 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
322 switch(state) {
323 case FILE_IS_COMPRESSED:
324 return EACCES;
325 case FILE_IS_CONVERTING:
326 /* if FILE_IS_CONVERTING, we allow writes but do not
327 bother with snapshots or else we will deadlock.
328 */
329 do_snapshot = 0;
330 break;
331 default:
332 printf("invalid state %d for compressed file\n", state);
333 /* fall through */
334 }
335 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
336 int error;
337
338 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
339 if (error != 0) {
340 return error;
341 }
342 }
343
344 if (do_snapshot) {
345 check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
346 }
347
348 #endif
349
350 if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
351 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
352                 /* Don't allow unencrypted I/O requests from user space */
353 return EPERM;
354 }
355
356
357 resid = uio_resid(uio);
358 offset = uio_offset(uio);
359
360 if (offset < 0)
361 return (EINVAL);
362 if (resid == 0)
363 return (E_NONE);
364 if (!vnode_isreg(vp))
365 return (EPERM); /* Can only write regular files */
366
367 cp = VTOC(vp);
368 fp = VTOF(vp);
369 hfsmp = VTOHFS(vp);
370
371 #if CONFIG_PROTECT
372 if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
373 goto exit;
374 }
375 #endif
376
377 eflags = kEFDeferMask; /* defer file block allocations */
378 #if HFS_SPARSE_DEV
379 /*
380 * When the underlying device is sparse and space
381 * is low (< 8MB), stop doing delayed allocations
382 * and begin doing synchronous I/O.
383 */
384 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
385 (hfs_freeblks(hfsmp, 0) < 2048)) {
386 eflags &= ~kEFDeferMask;
387 ioflag |= IO_SYNC;
388 }
389 #endif /* HFS_SPARSE_DEV */
390
391 if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
392 (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
393 io_return_on_throttle = IO_RETURN_ON_THROTTLE;
394 }
395
396 again:
397 /*
398 * Protect against a size change.
399 *
400 * Note: If took_truncate_lock is true, then we previously got the lock shared
401 * but needed to upgrade to exclusive. So try getting it exclusive from the
402 * start.
403 */
404 if (ioflag & IO_APPEND || took_truncate_lock) {
405 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
406 }
407 else {
408 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
409 }
410 took_truncate_lock = 1;
411
412 /* Update UIO */
413 if (ioflag & IO_APPEND) {
414 uio_setoffset(uio, fp->ff_size);
415 offset = fp->ff_size;
416 }
417 if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
418 retval = EPERM;
419 goto exit;
420 }
421
422 origFileSize = fp->ff_size;
423 writelimit = offset + resid;
424 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
425
426 /*
427 * We may need an exclusive truncate lock for several reasons, all
428 * of which are because we may be writing to a (portion of a) block
429 * for the first time, and we need to make sure no readers see the
430 * prior, uninitialized contents of the block. The cases are:
431 *
432 * 1. We have unallocated (delayed allocation) blocks. We may be
433 * allocating new blocks to the file and writing to them.
434 * (A more precise check would be whether the range we're writing
435 * to contains delayed allocation blocks.)
436 * 2. We need to extend the file. The bytes between the old EOF
437 * and the new EOF are not yet initialized. This is important
438 * even if we're not allocating new blocks to the file. If the
439 * old EOF and new EOF are in the same block, we still need to
440 * protect that range of bytes until they are written for the
441 * first time.
442 * 3. The write overlaps some invalid ranges (delayed zero fill; that
443 * part of the file has been allocated, but not yet written).
444 *
445      * If we took a shared lock and any of the above cases apply, we need to try to upgrade
446 * to an exclusive lock. If the upgrade fails, we will lose the shared
447 * lock, and will need to take the truncate lock again; the took_truncate_lock
448 * flag will still be set, causing us to try for an exclusive lock next time.
449 *
450 * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode
451 * lock is held, since it protects the range lists.
452 */
453 if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
454 ((fp->ff_unallocblocks != 0) ||
455 (writelimit > origFileSize))) {
456 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
457 /*
458 * Lock upgrade failed and we lost our shared lock, try again.
459 * Note: we do not set took_truncate_lock=0 here. Leaving it
460 * set to 1 will cause us to try to get the lock exclusive.
461 */
462 goto again;
463 }
464 else {
465 /* Store the owner in the c_truncatelockowner field if we successfully upgrade */
466 cp->c_truncatelockowner = current_thread();
467 }
468 }
469
470 if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
471 goto exit;
472 }
473 cnode_locked = 1;
474
475 /*
476 * Now that we have the cnode lock, see if there are delayed zero fill ranges
477 * overlapping our write. If so, we need the truncate lock exclusive (see above).
478 */
479 if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
480 (rl_scan(&fp->ff_invalidranges, offset, writelimit-1, &invalid_range) != RL_NOOVERLAP)) {
481 /*
482 * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes
483 * a deadlock, rather than simply returning failure. (That is, it apparently does
484 * not behave like a "try_lock"). Since this condition is rare, just drop the
485 * cnode lock and try again. Since took_truncate_lock is set, we will
486 * automatically take the truncate lock exclusive.
487 */
488 hfs_unlock(cp);
489 cnode_locked = 0;
490 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
491 goto again;
492 }
493
494 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
495 (int)offset, uio_resid(uio), (int)fp->ff_size,
496 (int)filebytes, 0);
497
498 /* Check if we do not need to extend the file */
499 if (writelimit <= filebytes) {
500 goto sizeok;
501 }
502
503 cred = vfs_context_ucred(ap->a_context);
504 bytesToAdd = writelimit - filebytes;
505
506 #if QUOTA
507 retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
508 cred, 0);
509 if (retval)
510 goto exit;
511 #endif /* QUOTA */
512
513 if (hfs_start_transaction(hfsmp) != 0) {
514 retval = EINVAL;
515 goto exit;
516 }
517
518 while (writelimit > filebytes) {
519 bytesToAdd = writelimit - filebytes;
520 if (cred && suser(cred, NULL) != 0)
521 eflags |= kEFReserveMask;
522
523 /* Protect extents b-tree and allocation bitmap */
524 lockflags = SFL_BITMAP;
525 if (overflow_extents(fp))
526 lockflags |= SFL_EXTENTS;
527 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
528
529 /* Files that are changing size are not hot file candidates. */
530 if (hfsmp->hfc_stage == HFC_RECORDING) {
531 fp->ff_bytesread = 0;
532 }
533 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
534 0, eflags, &actualBytesAdded));
535
536 hfs_systemfile_unlock(hfsmp, lockflags);
537
538 if ((actualBytesAdded == 0) && (retval == E_NONE))
539 retval = ENOSPC;
540 if (retval != E_NONE)
541 break;
542 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
543 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
544 (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
545 }
546 (void) hfs_update(vp, TRUE);
547 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
548 (void) hfs_end_transaction(hfsmp);
549
550 /*
551 * If we didn't grow the file enough try a partial write.
552 * POSIX expects this behavior.
553 */
554 if ((retval == ENOSPC) && (filebytes > offset)) {
555 retval = 0;
556 partialwrite = 1;
557 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
558 resid -= bytesToAdd;
559 writelimit = filebytes;
560 }
561 sizeok:
562 if (retval == E_NONE) {
563 off_t filesize;
564 off_t zero_off;
565 off_t tail_off;
566 off_t inval_start;
567 off_t inval_end;
568 off_t io_start;
569 int lflag;
570
571 if (writelimit > fp->ff_size)
572 filesize = writelimit;
573 else
574 filesize = fp->ff_size;
575
576 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
577
578 if (offset <= fp->ff_size) {
579 zero_off = offset & ~PAGE_MASK_64;
580
581                         /* Check whether the area between the zero_offset and the start
582                            of the transfer is invalid and should be zero-filled
583                            as part of the transfer:
584 */
585 if (offset > zero_off) {
586 if (rl_scan(&fp->ff_invalidranges, zero_off, offset - 1, &invalid_range) != RL_NOOVERLAP)
587 lflag |= IO_HEADZEROFILL;
588 }
589 } else {
590 off_t eof_page_base = fp->ff_size & ~PAGE_MASK_64;
591
592 /* The bytes between fp->ff_size and uio->uio_offset must never be
593                            read without being zeroed. The current last block is filled with zeroes
594                            if it holds valid data, but in all cases we merely do a little bookkeeping
595                            to track the area from the end of the current last page to the start of
596                            the area actually written. For the same reason, only the bytes up to the
597                            start of the page where this write will start are invalidated; any remainder
598 before uio->uio_offset is explicitly zeroed as part of the cluster_write.
599
600 Note that inval_start, the start of the page after the current EOF,
601 may be past the start of the write, in which case the zeroing
602                            will be handled by the cluster_write of the actual data.
603 */
604 inval_start = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
605 inval_end = offset & ~PAGE_MASK_64;
606 zero_off = fp->ff_size;
607
608 if ((fp->ff_size & PAGE_MASK_64) &&
609 (rl_scan(&fp->ff_invalidranges,
610 eof_page_base,
611 fp->ff_size - 1,
612 &invalid_range) != RL_NOOVERLAP)) {
613 /* The page containing the EOF is not valid, so the
614 entire page must be made inaccessible now. If the write
615 starts on a page beyond the page containing the eof
616 (inval_end > eof_page_base), add the
617 whole page to the range to be invalidated. Otherwise
618 (i.e. if the write starts on the same page), zero-fill
619 the entire page explicitly now:
620 */
621 if (inval_end > eof_page_base) {
622 inval_start = eof_page_base;
623 } else {
624 zero_off = eof_page_base;
625 };
626 };
627
628 if (inval_start < inval_end) {
629 struct timeval tv;
630 /* There's some range of data that's going to be marked invalid */
631
632 if (zero_off < inval_start) {
633 /* The pages between inval_start and inval_end are going to be invalidated,
634 and the actual write will start on a page past inval_end. Now's the last
635 chance to zero-fill the page containing the EOF:
636 */
637 hfs_unlock(cp);
638 cnode_locked = 0;
639 retval = cluster_write(vp, (uio_t) 0,
640 fp->ff_size, inval_start,
641 zero_off, (off_t)0,
642 lflag | IO_HEADZEROFILL | IO_NOZERODIRTY);
643 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
644 cnode_locked = 1;
645 if (retval) goto ioerr_exit;
646 offset = uio_offset(uio);
647 };
648
649 /* Mark the remaining area of the newly allocated space as invalid: */
650 rl_add(inval_start, inval_end - 1 , &fp->ff_invalidranges);
651 microuptime(&tv);
652 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
653 zero_off = fp->ff_size = inval_end;
654 };
655
656 if (offset > zero_off) lflag |= IO_HEADZEROFILL;
657 };
658
659 /* Check to see whether the area between the end of the write and the end of
660 the page it falls in is invalid and should be zero-filled as part of the transfer:
661 */
662 tail_off = (writelimit + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
663 if (tail_off > filesize) tail_off = filesize;
664 if (tail_off > writelimit) {
665 if (rl_scan(&fp->ff_invalidranges, writelimit, tail_off - 1, &invalid_range) != RL_NOOVERLAP) {
666 lflag |= IO_TAILZEROFILL;
667 };
668 };
669
670 /*
671 * if the write starts beyond the current EOF (possibly advanced in the
672 * zeroing of the last block, above), then we'll zero fill from the current EOF
673 * to where the write begins:
674 *
675 * NOTE: If (and ONLY if) the portion of the file about to be written is
676 * before the current EOF it might be marked as invalid now and must be
677 * made readable (removed from the invalid ranges) before cluster_write
678 * tries to write it:
679 */
680 io_start = (lflag & IO_HEADZEROFILL) ? zero_off : offset;
681 if (io_start < fp->ff_size) {
682 off_t io_end;
683
684 io_end = (lflag & IO_TAILZEROFILL) ? tail_off : writelimit;
685 rl_remove(io_start, io_end - 1, &fp->ff_invalidranges);
686 };
687
688 hfs_unlock(cp);
689 cnode_locked = 0;
690
691 /*
692 * We need to tell UBC the fork's new size BEFORE calling
693 * cluster_write, in case any of the new pages need to be
694 * paged out before cluster_write completes (which does happen
695 * in embedded systems due to extreme memory pressure).
696 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
697 * will be, so that it can pass that on to cluster_pageout, and
698 * allow those pageouts.
699 *
700 * We don't update ff_size yet since we don't want pageins to
701 * be able to see uninitialized data between the old and new
702 * EOF, until cluster_write has completed and initialized that
703 * part of the file.
704 *
705 * The vnode pager relies on the file size last given to UBC via
706 * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or
707 * ff_size (whichever is larger). NOTE: ff_new_size is always
708 * zero, unless we are extending the file via write.
709 */
710 if (filesize > fp->ff_size) {
711 fp->ff_new_size = filesize;
712 ubc_setsize(vp, filesize);
713 }
714 retval = cluster_write(vp, uio, fp->ff_size, filesize, zero_off,
715 tail_off, lflag | IO_NOZERODIRTY | io_return_on_throttle);
716 if (retval) {
717 fp->ff_new_size = 0; /* no longer extending; use ff_size */
718
719 if (retval == EAGAIN) {
720 /*
721 * EAGAIN indicates that we still have I/O to do, but
722 * that we now need to be throttled
723 */
724 if (resid != uio_resid(uio)) {
725 /*
726 * did manage to do some I/O before returning EAGAIN
727 */
728 resid = uio_resid(uio);
729 offset = uio_offset(uio);
730
731 cp->c_touch_chgtime = TRUE;
732 cp->c_touch_modtime = TRUE;
733 hfs_incr_gencount(cp);
734 }
735 if (filesize > fp->ff_size) {
736 /*
737 * we called ubc_setsize before the call to
738 * cluster_write... since we only partially
739 * completed the I/O, we need to
740 * re-adjust our idea of the filesize based
741 * on our interim EOF
742 */
743 ubc_setsize(vp, offset);
744
745 fp->ff_size = offset;
746 }
747 goto exit;
748 }
749 if (filesize > origFileSize) {
750 ubc_setsize(vp, origFileSize);
751 }
752 goto ioerr_exit;
753 }
754
755 if (filesize > origFileSize) {
756 fp->ff_size = filesize;
757
758 /* Files that are changing size are not hot file candidates. */
759 if (hfsmp->hfc_stage == HFC_RECORDING) {
760 fp->ff_bytesread = 0;
761 }
762 }
763 fp->ff_new_size = 0; /* ff_size now has the correct size */
764 }
765 if (partialwrite) {
766 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
767 resid += bytesToAdd;
768 }
769
770 // XXXdbg - see radar 4871353 for more info
771 {
772 if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
773 VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL);
774 }
775 }
776
777 ioerr_exit:
778 if (resid > uio_resid(uio)) {
779 if (!cnode_locked) {
780 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
781 cnode_locked = 1;
782 }
783
784 cp->c_touch_chgtime = TRUE;
785 cp->c_touch_modtime = TRUE;
786 hfs_incr_gencount(cp);
787
788 /*
789 * If we successfully wrote any data, and we are not the superuser
790 * we clear the setuid and setgid bits as a precaution against
791 * tampering.
792 */
793 if (cp->c_mode & (S_ISUID | S_ISGID)) {
794 cred = vfs_context_ucred(ap->a_context);
795 if (cred && suser(cred, NULL)) {
796 cp->c_mode &= ~(S_ISUID | S_ISGID);
797 }
798 }
799 }
800 if (retval) {
801 if (ioflag & IO_UNIT) {
802 (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
803 0, ap->a_context);
804 uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
805 uio_setresid(uio, resid);
806 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
807 }
808 } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
809 retval = hfs_update(vp, TRUE);
810
811 /* Updating vcbWrCnt doesn't need to be atomic. */
812 hfsmp->vcbWrCnt++;
813
814 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
815 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
816 exit:
817 if (cnode_locked)
818 hfs_unlock(cp);
819
820 if (took_truncate_lock) {
821 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
822 }
823 if (retval == EAGAIN) {
824 throttle_lowpri_io(1);
825 throttled_count++;
826
827 retval = 0;
828 goto again;
829 }
830 if (throttled_count) {
831 throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
832 }
833 return (retval);
834 }
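/*
 * The truncate-lock handling in hfs_vnop_write above boils down to the
 * following pattern: take the lock shared, and if the write turns out to
 * need exclusive access (delayed allocations, extending the file, or
 * overlapping invalid ranges) attempt an upgrade; a failed upgrade also
 * drops the shared lock, so the code simply loops back and takes the lock
 * exclusive from the start.  A condensed sketch, where needs_exclusive() is
 * a hypothetical stand-in for the three conditions listed in the comments
 * above:
 */
#if 0	/* illustrative sketch only */
	int took_truncate_lock = 0;
again:
	hfs_lock_truncate(cp, took_truncate_lock ? HFS_EXCLUSIVE_LOCK : HFS_SHARED_LOCK,
			  HFS_LOCK_DEFAULT);
	took_truncate_lock = 1;

	if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) && needs_exclusive(cp, fp)) {
		if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE)
			goto again;	/* shared lock was lost; retake it exclusive */
		cp->c_truncatelockowner = current_thread();
	}
#endif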
835
836 /* support for the "bulk-access" fcntl */
837
838 #define CACHE_LEVELS 16
839 #define NUM_CACHE_ENTRIES (64*16)
840 #define PARENT_IDS_FLAG 0x100
841
842 struct access_cache {
843 int numcached;
844 int cachehits; /* these two for statistics gathering */
845 int lookups;
846 unsigned int *acache;
847 unsigned char *haveaccess;
848 };
849
850 struct access_t {
851 uid_t uid; /* IN: effective user id */
852 short flags; /* IN: access requested (i.e. R_OK) */
853 short num_groups; /* IN: number of groups user belongs to */
854 int num_files; /* IN: number of files to process */
855 int *file_ids; /* IN: array of file ids */
856 gid_t *groups; /* IN: array of groups */
857 short *access; /* OUT: access info for each file (0 for 'has access') */
858 } __attribute__((unavailable)); // this structure is for reference purposes only
859
860 struct user32_access_t {
861 uid_t uid; /* IN: effective user id */
862 short flags; /* IN: access requested (i.e. R_OK) */
863 short num_groups; /* IN: number of groups user belongs to */
864 int num_files; /* IN: number of files to process */
865 user32_addr_t file_ids; /* IN: array of file ids */
866 user32_addr_t groups; /* IN: array of groups */
867 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
868 };
869
870 struct user64_access_t {
871 uid_t uid; /* IN: effective user id */
872 short flags; /* IN: access requested (i.e. R_OK) */
873 short num_groups; /* IN: number of groups user belongs to */
874 int num_files; /* IN: number of files to process */
875 user64_addr_t file_ids; /* IN: array of file ids */
876 user64_addr_t groups; /* IN: array of groups */
877 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
878 };
879
880
881 // these are the "extended" versions of the above structures
882 // note that it is crucial that they be sized differently than
883 // the regular versions
884 struct ext_access_t {
885 uint32_t flags; /* IN: access requested (i.e. R_OK) */
886 uint32_t num_files; /* IN: number of files to process */
887 uint32_t map_size; /* IN: size of the bit map */
888 uint32_t *file_ids; /* IN: Array of file ids */
889 char *bitmap; /* OUT: hash-bitmap of interesting directory ids */
890 short *access; /* OUT: access info for each file (0 for 'has access') */
891 uint32_t num_parents; /* future use */
892 cnid_t *parents; /* future use */
893 } __attribute__((unavailable)); // this structure is for reference purposes only
894
895 struct user32_ext_access_t {
896 uint32_t flags; /* IN: access requested (i.e. R_OK) */
897 uint32_t num_files; /* IN: number of files to process */
898 uint32_t map_size; /* IN: size of the bit map */
899 user32_addr_t file_ids; /* IN: Array of file ids */
900 user32_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
901 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
902 uint32_t num_parents; /* future use */
903 user32_addr_t parents; /* future use */
904 };
905
906 struct user64_ext_access_t {
907 uint32_t flags; /* IN: access requested (i.e. R_OK) */
908 uint32_t num_files; /* IN: number of files to process */
909 uint32_t map_size; /* IN: size of the bit map */
910 user64_addr_t file_ids; /* IN: array of file ids */
911     user64_addr_t bitmap;     /* OUT: hash-bitmap of interesting directory ids */
912 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
913 uint32_t num_parents;/* future use */
914 user64_addr_t parents;/* future use */
915 };
916
917
918 /*
919 * Perform a binary search for the given parent_id. Return value is
920  * the index if there is a match. If no_match_indexp is non-NULL it
921  * will be assigned the index at which to insert the item (whether or
922  * not a match was found).
923 */
924 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
925 {
926 int index=-1;
927 unsigned int lo=0;
928
929 do {
930 unsigned int mid = ((hi - lo)/2) + lo;
931 unsigned int this_id = array[mid];
932
933 if (parent_id == this_id) {
934 hi = mid;
935 break;
936 }
937
938 if (parent_id < this_id) {
939 hi = mid;
940 continue;
941 }
942
943 if (parent_id > this_id) {
944 lo = mid + 1;
945 continue;
946 }
947 } while(lo < hi);
948
949 /* check if lo and hi converged on the match */
950 if (parent_id == array[hi]) {
951 index = hi;
952 }
953
954 if (no_match_indexp) {
955 *no_match_indexp = hi;
956 }
957
958 return index;
959 }
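/*
 * Usage sketch for the search above (illustrative only): callers keep the
 * array sorted, and when there is no match they use *no_match_indexp as the
 * insertion point; this is exactly how lookup_bucket() and add_node() below
 * use it (add_node() additionally bumps the index by one when the new id
 * sorts after the entry at that slot).
 */
#if 0	/* illustrative sketch only */
	cnid_t ids[] = { 2, 16, 19, 25 };	/* sorted parent ids */
	int idx, ins_at;

	idx = cache_binSearch(ids, 3 /* hi = last index */, 19, &ins_at);
	/* idx == 2: found at index 2 */

	idx = cache_binSearch(ids, 3 /* hi = last index */, 20, &ins_at);
	/* idx == -1: not found; ins_at == 3, i.e. 20 belongs just before 25 */
#endif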
960
961
962 static int
963 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
964 {
965 unsigned int hi;
966 int matches = 0;
967 int index, no_match_index;
968
969 if (cache->numcached == 0) {
970 *indexp = 0;
971 return 0; // table is empty, so insert at index=0 and report no match
972 }
973
974 if (cache->numcached > NUM_CACHE_ENTRIES) {
975 cache->numcached = NUM_CACHE_ENTRIES;
976 }
977
978 hi = cache->numcached - 1;
979
980 index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
981
982 /* if no existing entry found, find index for new one */
983 if (index == -1) {
984 index = no_match_index;
985 matches = 0;
986 } else {
987 matches = 1;
988 }
989
990 *indexp = index;
991 return matches;
992 }
993
994 /*
995 * Add a node to the access_cache at the given index (or do a lookup first
996 * to find the index if -1 is passed in). We currently do a replace rather
997 * than an insert if the cache is full.
998 */
999 static void
1000 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
1001 {
1002 int lookup_index = -1;
1003
1004 /* need to do a lookup first if -1 passed for index */
1005 if (index == -1) {
1006 if (lookup_bucket(cache, &lookup_index, nodeID)) {
1007 if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
1008 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
1009 cache->haveaccess[lookup_index] = access;
1010 }
1011
1012 /* mission accomplished */
1013 return;
1014 } else {
1015 index = lookup_index;
1016 }
1017
1018 }
1019
1020 /* if the cache is full, do a replace rather than an insert */
1021 if (cache->numcached >= NUM_CACHE_ENTRIES) {
1022 cache->numcached = NUM_CACHE_ENTRIES-1;
1023
1024 if (index > cache->numcached) {
1025 index = cache->numcached;
1026 }
1027 }
1028
1029 if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
1030 index++;
1031 }
1032
1033 if (index >= 0 && index < cache->numcached) {
1034 /* only do bcopy if we're inserting */
1035 bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
1036 bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
1037 }
1038
1039 cache->acache[index] = nodeID;
1040 cache->haveaccess[index] = access;
1041 cache->numcached++;
1042 }
1043
1044
1045 struct cinfo {
1046 uid_t uid;
1047 gid_t gid;
1048 mode_t mode;
1049 cnid_t parentcnid;
1050 u_int16_t recflags;
1051 };
1052
1053 static int
1054 snoop_callback(const cnode_t *cp, void *arg)
1055 {
1056 struct cinfo *cip = arg;
1057
1058 cip->uid = cp->c_uid;
1059 cip->gid = cp->c_gid;
1060 cip->mode = cp->c_mode;
1061 cip->parentcnid = cp->c_parentcnid;
1062 cip->recflags = cp->c_attr.ca_recflags;
1063
1064 return (0);
1065 }
1066
1067 /*
1068 * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
1069 * isn't incore, then go to the catalog.
1070 */
1071 static int
1072 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
1073 struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
1074 {
1075 int error = 0;
1076
1077 /* if this id matches the one the fsctl was called with, skip the lookup */
1078 if (cnid == skip_cp->c_cnid) {
1079 cnattrp->ca_uid = skip_cp->c_uid;
1080 cnattrp->ca_gid = skip_cp->c_gid;
1081 cnattrp->ca_mode = skip_cp->c_mode;
1082 cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
1083 keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
1084 } else {
1085 struct cinfo c_info;
1086
1087         /* otherwise, check the cnode hash in case the file/dir is incore */
1088 error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);
1089
1090 if (error == EACCES) {
1091 // File is deleted
1092 return ENOENT;
1093 } else if (!error) {
1094 cnattrp->ca_uid = c_info.uid;
1095 cnattrp->ca_gid = c_info.gid;
1096 cnattrp->ca_mode = c_info.mode;
1097 cnattrp->ca_recflags = c_info.recflags;
1098 keyp->hfsPlus.parentID = c_info.parentcnid;
1099 } else {
1100 int lockflags;
1101
1102 if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
1103 throttle_lowpri_io(1);
1104
1105 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
1106
1107 /* lookup this cnid in the catalog */
1108 error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
1109
1110 hfs_systemfile_unlock(hfsmp, lockflags);
1111
1112 cache->lookups++;
1113 }
1114 }
1115
1116 return (error);
1117 }
1118
1119
1120 /*
1121 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
1122 * up to CACHE_LEVELS as we progress towards the root.
1123 */
1124 static int
1125 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
1126 struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
1127 struct vfs_context *my_context,
1128 char *bitmap,
1129 uint32_t map_size,
1130 cnid_t* parents,
1131 uint32_t num_parents)
1132 {
1133 int myErr = 0;
1134 int myResult;
1135 HFSCatalogNodeID thisNodeID;
1136 unsigned int myPerms;
1137 struct cat_attr cnattr;
1138 int cache_index = -1, scope_index = -1, scope_idx_start = -1;
1139 CatalogKey catkey;
1140
1141 int i = 0, ids_to_cache = 0;
1142 int parent_ids[CACHE_LEVELS];
1143
1144 thisNodeID = nodeID;
1145 while (thisNodeID >= kRootDirID) {
1146 myResult = 0; /* default to "no access" */
1147
1148 /* check the cache before resorting to hitting the catalog */
1149
1150 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1151 * to look any further after hitting cached dir */
1152
1153 if (lookup_bucket(cache, &cache_index, thisNodeID)) {
1154 cache->cachehits++;
1155 myErr = cache->haveaccess[cache_index];
1156 if (scope_index != -1) {
1157 if (myErr == ESRCH) {
1158 myErr = 0;
1159 }
1160 } else {
1161 scope_index = 0; // so we'll just use the cache result
1162 scope_idx_start = ids_to_cache;
1163 }
1164 myResult = (myErr == 0) ? 1 : 0;
1165 goto ExitThisRoutine;
1166 }
1167
1168
1169 if (parents) {
1170 int tmp;
1171 tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
1172 if (scope_index == -1)
1173 scope_index = tmp;
1174 if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
1175 scope_idx_start = ids_to_cache;
1176 }
1177 }
1178
1179 /* remember which parents we want to cache */
1180 if (ids_to_cache < CACHE_LEVELS) {
1181 parent_ids[ids_to_cache] = thisNodeID;
1182 ids_to_cache++;
1183 }
1184 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
1185 if (bitmap && map_size) {
1186 bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
1187 }
1188
1189
1190 /* do the lookup (checks the cnode hash, then the catalog) */
1191 myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
1192 if (myErr) {
1193 goto ExitThisRoutine; /* no access */
1194 }
1195
1196 /* Root always gets access. */
1197 if (suser(myp_ucred, NULL) == 0) {
1198 thisNodeID = catkey.hfsPlus.parentID;
1199 myResult = 1;
1200 continue;
1201 }
1202
1203 // if the thing has acl's, do the full permission check
1204 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1205 struct vnode *vp;
1206
1207 /* get the vnode for this cnid */
1208 myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
1209 if ( myErr ) {
1210 myResult = 0;
1211 goto ExitThisRoutine;
1212 }
1213
1214 thisNodeID = VTOC(vp)->c_parentcnid;
1215
1216 hfs_unlock(VTOC(vp));
1217
1218 if (vnode_vtype(vp) == VDIR) {
1219 myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
1220 } else {
1221 myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
1222 }
1223
1224 vnode_put(vp);
1225 if (myErr) {
1226 myResult = 0;
1227 goto ExitThisRoutine;
1228 }
1229 } else {
1230 unsigned int flags;
1231 int mode = cnattr.ca_mode & S_IFMT;
1232 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);
1233
1234 if (mode == S_IFDIR) {
1235 flags = R_OK | X_OK;
1236 } else {
1237 flags = R_OK;
1238 }
1239 if ( (myPerms & flags) != flags) {
1240 myResult = 0;
1241 myErr = EACCES;
1242 goto ExitThisRoutine; /* no access */
1243 }
1244
1245 /* up the hierarchy we go */
1246 thisNodeID = catkey.hfsPlus.parentID;
1247 }
1248 }
1249
1250 /* if here, we have access to this node */
1251 myResult = 1;
1252
1253 ExitThisRoutine:
1254 if (parents && myErr == 0 && scope_index == -1) {
1255 myErr = ESRCH;
1256 }
1257
1258 if (myErr) {
1259 myResult = 0;
1260 }
1261 *err = myErr;
1262
1263 /* cache the parent directory(ies) */
1264 for (i = 0; i < ids_to_cache; i++) {
1265 if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
1266 add_node(cache, -1, parent_ids[i], ESRCH);
1267 } else {
1268 add_node(cache, -1, parent_ids[i], myErr);
1269 }
1270 }
1271
1272 return (myResult);
1273 }
1274
1275 static int
1276 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1277 struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1278 {
1279 boolean_t is64bit;
1280
1281 /*
1282 * NOTE: on entry, the vnode has an io_ref. In case this vnode
1283      * happens to be in our list of file_ids, we'll note it and
1284      * avoid calling hfs_chashget_nowait() on that id, as that
1285 * will cause a "locking against myself" panic.
1286 */
1287 Boolean check_leaf = true;
1288
1289 struct user64_ext_access_t *user_access_structp;
1290 struct user64_ext_access_t tmp_user_access;
1291 struct access_cache cache;
1292
1293 int error = 0, prev_parent_check_ok=1;
1294 unsigned int i;
1295
1296 short flags;
1297 unsigned int num_files = 0;
1298 int map_size = 0;
1299 int num_parents = 0;
1300 int *file_ids=NULL;
1301 short *access=NULL;
1302 char *bitmap=NULL;
1303 cnid_t *parents=NULL;
1304 int leaf_index;
1305
1306 cnid_t cnid;
1307 cnid_t prevParent_cnid = 0;
1308 unsigned int myPerms;
1309 short myaccess = 0;
1310 struct cat_attr cnattr;
1311 CatalogKey catkey;
1312 struct cnode *skip_cp = VTOC(vp);
1313 kauth_cred_t cred = vfs_context_ucred(context);
1314 proc_t p = vfs_context_proc(context);
1315
1316 is64bit = proc_is64bit(p);
1317
1318 /* initialize the local cache and buffers */
1319 cache.numcached = 0;
1320 cache.cachehits = 0;
1321 cache.lookups = 0;
1322 cache.acache = NULL;
1323 cache.haveaccess = NULL;
1324
1325 /* struct copyin done during dispatch... need to copy file_id array separately */
1326 if (ap->a_data == NULL) {
1327 error = EINVAL;
1328 goto err_exit_bulk_access;
1329 }
1330
1331 if (is64bit) {
1332 if (arg_size != sizeof(struct user64_ext_access_t)) {
1333 error = EINVAL;
1334 goto err_exit_bulk_access;
1335 }
1336
1337 user_access_structp = (struct user64_ext_access_t *)ap->a_data;
1338
1339 } else if (arg_size == sizeof(struct user32_access_t)) {
1340 struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
1341
1342 // convert an old style bulk-access struct to the new style
1343 tmp_user_access.flags = accessp->flags;
1344 tmp_user_access.num_files = accessp->num_files;
1345 tmp_user_access.map_size = 0;
1346 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1347 tmp_user_access.bitmap = USER_ADDR_NULL;
1348 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1349 tmp_user_access.num_parents = 0;
1350 user_access_structp = &tmp_user_access;
1351
1352 } else if (arg_size == sizeof(struct user32_ext_access_t)) {
1353 struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
1354
1355 // up-cast from a 32-bit version of the struct
1356 tmp_user_access.flags = accessp->flags;
1357 tmp_user_access.num_files = accessp->num_files;
1358 tmp_user_access.map_size = accessp->map_size;
1359 tmp_user_access.num_parents = accessp->num_parents;
1360
1361 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1362 tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
1363 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1364 tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);
1365
1366 user_access_structp = &tmp_user_access;
1367 } else {
1368 error = EINVAL;
1369 goto err_exit_bulk_access;
1370 }
1371
1372 map_size = user_access_structp->map_size;
1373
1374 num_files = user_access_structp->num_files;
1375
1376 num_parents= user_access_structp->num_parents;
1377
1378 if (num_files < 1) {
1379 goto err_exit_bulk_access;
1380 }
1381 if (num_files > 1024) {
1382 error = EINVAL;
1383 goto err_exit_bulk_access;
1384 }
1385
1386 if (num_parents > 1024) {
1387 error = EINVAL;
1388 goto err_exit_bulk_access;
1389 }
1390
1391 file_ids = (int *) kalloc(sizeof(int) * num_files);
1392 access = (short *) kalloc(sizeof(short) * num_files);
1393 if (map_size) {
1394 bitmap = (char *) kalloc(sizeof(char) * map_size);
1395 }
1396
1397 if (num_parents) {
1398 parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
1399 }
1400
1401 cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
1402 cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1403
1404 if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
1405 if (file_ids) {
1406 kfree(file_ids, sizeof(int) * num_files);
1407 }
1408 if (bitmap) {
1409 kfree(bitmap, sizeof(char) * map_size);
1410 }
1411 if (access) {
1412 kfree(access, sizeof(short) * num_files);
1413 }
1414 if (cache.acache) {
1415 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1416 }
1417 if (cache.haveaccess) {
1418 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1419 }
1420 if (parents) {
1421 kfree(parents, sizeof(cnid_t) * num_parents);
1422 }
1423 return ENOMEM;
1424 }
1425
1426     // make sure the bitmap is zeroed out...
1427 if (bitmap) {
1428 bzero(bitmap, (sizeof(char) * map_size));
1429 }
1430
1431 if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1432 num_files * sizeof(int)))) {
1433 goto err_exit_bulk_access;
1434 }
1435
1436 if (num_parents) {
1437 if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1438 num_parents * sizeof(cnid_t)))) {
1439 goto err_exit_bulk_access;
1440 }
1441 }
1442
1443 flags = user_access_structp->flags;
1444 if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1445 flags = R_OK;
1446 }
1447
1448 /* check if we've been passed leaf node ids or parent ids */
1449 if (flags & PARENT_IDS_FLAG) {
1450 check_leaf = false;
1451 }
1452
1453 /* Check access to each file_id passed in */
1454 for (i = 0; i < num_files; i++) {
1455 leaf_index=-1;
1456 cnid = (cnid_t) file_ids[i];
1457
1458 /* root always has access */
1459 if ((!parents) && (!suser(cred, NULL))) {
1460 access[i] = 0;
1461 continue;
1462 }
1463
1464 if (check_leaf) {
1465 /* do the lookup (checks the cnode hash, then the catalog) */
1466 error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
1467 if (error) {
1468 access[i] = (short) error;
1469 continue;
1470 }
1471
1472 if (parents) {
1473 // Check if the leaf matches one of the parent scopes
1474 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1475 if (leaf_index >= 0 && parents[leaf_index] == cnid)
1476 prev_parent_check_ok = 0;
1477 else if (leaf_index >= 0)
1478 prev_parent_check_ok = 1;
1479 }
1480
1481 // if the thing has acl's, do the full permission check
1482 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1483 struct vnode *cvp;
1484 int myErr = 0;
1485 /* get the vnode for this cnid */
1486 myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
1487 if ( myErr ) {
1488 access[i] = myErr;
1489 continue;
1490 }
1491
1492 hfs_unlock(VTOC(cvp));
1493
1494 if (vnode_vtype(cvp) == VDIR) {
1495 myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1496 } else {
1497 myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1498 }
1499
1500 vnode_put(cvp);
1501 if (myErr) {
1502 access[i] = myErr;
1503 continue;
1504 }
1505 } else {
1506 /* before calling CheckAccess(), check the target file for read access */
1507 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1508 cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1509
1510 /* fail fast if no access */
1511 if ((myPerms & flags) == 0) {
1512 access[i] = EACCES;
1513 continue;
1514 }
1515 }
1516 } else {
1517 /* we were passed an array of parent ids */
1518 catkey.hfsPlus.parentID = cnid;
1519 }
1520
1521 /* if the last guy had the same parent and had access, we're done */
1522 if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
1523 cache.cachehits++;
1524 access[i] = 0;
1525 continue;
1526 }
1527
1528 myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1529 skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
1530
1531 if (myaccess || (error == ESRCH && leaf_index != -1)) {
1532 access[i] = 0; // have access.. no errors to report
1533 } else {
1534 access[i] = (error != 0 ? (short) error : EACCES);
1535 }
1536
1537 prevParent_cnid = catkey.hfsPlus.parentID;
1538 }
1539
1540 /* copyout the access array */
1541 if ((error = copyout((caddr_t)access, user_access_structp->access,
1542 num_files * sizeof (short)))) {
1543 goto err_exit_bulk_access;
1544 }
1545 if (map_size && bitmap) {
1546 if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1547 map_size * sizeof (char)))) {
1548 goto err_exit_bulk_access;
1549 }
1550 }
1551
1552
1553 err_exit_bulk_access:
1554
1555 if (file_ids)
1556 kfree(file_ids, sizeof(int) * num_files);
1557 if (parents)
1558 kfree(parents, sizeof(cnid_t) * num_parents);
1559 if (bitmap)
1560 kfree(bitmap, sizeof(char) * map_size);
1561 if (access)
1562 kfree(access, sizeof(short) * num_files);
1563 if (cache.acache)
1564 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1565 if (cache.haveaccess)
1566 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1567
1568 return (error);
1569 }
1570
1571
1572 /* end "bulk-access" support */
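/*
 * A hedged user-space sketch of driving the bulk-access check above.  The
 * request layout mirrors the reference struct ext_access_t (which is
 * compiled out with __attribute__((unavailable))).  The fsctl(2) selector
 * name (HFS_EXT_BULKACCESS_FSCTL, assumed to come from <hfs/hfs_fsctl.h>)
 * and the mount point are assumptions, not taken from this file.
 */
#if 0	/* user-space sketch, not part of this file */
#include <sys/fsctl.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

struct my_ext_access {			/* same layout as ext_access_t above */
	uint32_t  flags;		/* access requested, e.g. R_OK */
	uint32_t  num_files;
	uint32_t  map_size;
	uint32_t *file_ids;
	char     *bitmap;
	short    *access;
	uint32_t  num_parents;
	uint32_t *parents;		/* cnid_t is a 32-bit id */
};

int
main(void)
{
	uint32_t ids[2] = { 1234, 5678 };	/* hypothetical catalog node ids */
	short result[2];
	struct my_ext_access req = {
		.flags = R_OK, .num_files = 2, .file_ids = ids, .access = result,
	};

	if (fsctl("/Volumes/MyHFSVolume", HFS_EXT_BULKACCESS_FSCTL, &req, 0) == -1)
		perror("fsctl");
	else
		printf("access[0]=%d access[1]=%d\n", result[0], result[1]);
	return 0;
}
#endif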
1573
1574
1575 /*
1576 * Control filesystem operating characteristics.
1577 */
1578 int
1579 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1580 vnode_t a_vp;
1581 long a_command;
1582 caddr_t a_data;
1583 int a_fflag;
1584 vfs_context_t a_context;
1585 } */ *ap)
1586 {
1587 struct vnode * vp = ap->a_vp;
1588 struct hfsmount *hfsmp = VTOHFS(vp);
1589 vfs_context_t context = ap->a_context;
1590 kauth_cred_t cred = vfs_context_ucred(context);
1591 proc_t p = vfs_context_proc(context);
1592 struct vfsstatfs *vfsp;
1593 boolean_t is64bit;
1594 off_t jnl_start, jnl_size;
1595 struct hfs_journal_info *jip;
1596 #if HFS_COMPRESSION
1597 int compressed = 0;
1598 off_t uncompressed_size = -1;
1599 int decmpfs_error = 0;
1600
1601 if (ap->a_command == F_RDADVISE) {
1602 /* we need to inspect the decmpfs state of the file as early as possible */
1603 compressed = hfs_file_is_compressed(VTOC(vp), 0);
1604 if (compressed) {
1605 if (VNODE_IS_RSRC(vp)) {
1606 /* if this is the resource fork, treat it as if it were empty */
1607 uncompressed_size = 0;
1608 } else {
1609 decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
1610 if (decmpfs_error != 0) {
1611 /* failed to get the uncompressed size, we'll check for this later */
1612 uncompressed_size = -1;
1613 }
1614 }
1615 }
1616 }
1617 #endif /* HFS_COMPRESSION */
1618
1619 is64bit = proc_is64bit(p);
1620
1621 #if CONFIG_PROTECT
1622 {
1623 int error = 0;
1624 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
1625 return error;
1626 }
1627 }
1628 #endif /* CONFIG_PROTECT */
1629
1630 switch (ap->a_command) {
1631
1632 case HFS_GETPATH:
1633 {
1634 struct vnode *file_vp;
1635 cnid_t cnid;
1636 int outlen;
1637 char *bufptr;
1638 int error;
1639 int flags = 0;
1640
1641 /* Caller must be owner of file system. */
1642 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1643 if (suser(cred, NULL) &&
1644 kauth_cred_getuid(cred) != vfsp->f_owner) {
1645 return (EACCES);
1646 }
1647 /* Target vnode must be file system's root. */
1648 if (!vnode_isvroot(vp)) {
1649 return (EINVAL);
1650 }
1651 bufptr = (char *)ap->a_data;
1652 cnid = strtoul(bufptr, NULL, 10);
1653 if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
1654 flags |= BUILDPATH_VOLUME_RELATIVE;
1655 }
1656
1657 /* We need to call hfs_vfs_vget to leverage the code that will
1658 * fix the origin list for us if needed, as opposed to calling
1659          * hfs_vget, since we will need the parent for the build_path call.
1660 */
1661
1662 if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1663 return (error);
1664 }
1665 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
1666 vnode_put(file_vp);
1667
1668 return (error);
1669 }
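	/*
	 * Hedged user-space sketch of the HFS_GETPATH call handled above: the
	 * caller targets the volume root, passes a buffer that initially holds
	 * the file id as a decimal string, and gets the path written back into
	 * the same buffer.  The selector is assumed to be exposed through
	 * <hfs/hfs_fsctl.h>, and whether fsctl(2)'s options argument reaches
	 * a_fflag (for HFS_GETPATH_VOLUME_RELATIVE) is an assumption.
	 */
#if 0	/* user-space sketch, not part of this file */
	char buf[MAXPATHLEN];
	unsigned int fileid = 1234;	/* hypothetical catalog node id */

	snprintf(buf, sizeof(buf), "%u", fileid);
	if (fsctl("/Volumes/MyHFSVolume", HFS_GETPATH, buf, 0) == 0)
		printf("cnid %u -> %s\n", fileid, buf);
#endif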
1670
1671 case HFS_TRANSFER_DOCUMENT_ID:
1672 {
1673 struct cnode *cp = NULL;
1674 int error;
1675 u_int32_t to_fd = *(u_int32_t *)ap->a_data;
1676 struct fileproc *to_fp;
1677 struct vnode *to_vp;
1678 struct cnode *to_cp;
1679
1680 cp = VTOC(vp);
1681
1682 if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
1683 //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
1684 return error;
1685 }
1686 if ( (error = vnode_getwithref(to_vp)) ) {
1687 file_drop(to_fd);
1688 return error;
1689 }
1690
1691 if (VTOHFS(to_vp) != hfsmp) {
1692 error = EXDEV;
1693 goto transfer_cleanup;
1694 }
1695
1696 int need_unlock = 1;
1697 to_cp = VTOC(to_vp);
1698 error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1699 if (error != 0) {
1700 //printf("could not lock the pair of cnodes (error %d)\n", error);
1701 goto transfer_cleanup;
1702 }
1703
1704 if (!(cp->c_bsdflags & UF_TRACKED)) {
1705 error = EINVAL;
1706 } else if (to_cp->c_bsdflags & UF_TRACKED) {
1707 //
1708 // if the destination is already tracked, return an error
1709 // as otherwise it's a silent deletion of the target's
1710 // document-id
1711 //
1712 error = EEXIST;
1713 } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
1714 //
1715 // we can use the FndrExtendedFileInfo because the doc-id is the first
1716 // thing in both it and the ExtendedDirInfo struct which is fixed in
1717                 // format and cannot change layout
1718 //
1719 struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
1720 struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
1721
1722 if (f_extinfo->document_id == 0) {
1723 uint32_t new_id;
1724
1725 hfs_unlockpair(cp, to_cp); // have to unlock to be able to get a new-id
1726
1727 if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
1728 //
1729 // re-lock the pair now that we have the document-id
1730 //
1731 hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1732 f_extinfo->document_id = new_id;
1733 } else {
1734 goto transfer_cleanup;
1735 }
1736 }
1737
1738 to_extinfo->document_id = f_extinfo->document_id;
1739 f_extinfo->document_id = 0;
1740 //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
1741
1742 // make sure the destination is also UF_TRACKED
1743 to_cp->c_bsdflags |= UF_TRACKED;
1744 cp->c_bsdflags &= ~UF_TRACKED;
1745
1746 // mark the cnodes dirty
1747 cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
1748 to_cp->c_flag |= C_MODIFIED | C_FORCEUPDATE;
1749
1750 int lockflags;
1751 if ((error = hfs_start_transaction(hfsmp)) == 0) {
1752
1753 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
1754
1755 (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
1756 (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
1757
1758 hfs_systemfile_unlock (hfsmp, lockflags);
1759 (void) hfs_end_transaction(hfsmp);
1760 }
1761
1762 #if CONFIG_FSE
1763 add_fsevent(FSE_DOCID_CHANGED, context,
1764 FSE_ARG_DEV, hfsmp->hfs_raw_dev,
1765 FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode #
1766 FSE_ARG_INO, (ino64_t)to_cp->c_fileid, // dst inode #
1767 FSE_ARG_INT32, to_extinfo->document_id,
1768 FSE_ARG_DONE);
1769
1770 hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents
1771 need_unlock = 0;
1772
1773 if (need_fsevent(FSE_STAT_CHANGED, vp)) {
1774 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
1775 }
1776 if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
1777 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
1778 }
1779 #else
1780 hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents
1781 need_unlock = 0;
1782 #endif
1783 }
1784
1785 if (need_unlock) {
1786 hfs_unlockpair(cp, to_cp);
1787 }
1788
1789 transfer_cleanup:
1790 vnode_put(to_vp);
1791 file_drop(to_fd);
1792
1793 return error;
1794 }
1795
1796
1797
1798 case HFS_PREV_LINK:
1799 case HFS_NEXT_LINK:
1800 {
1801 cnid_t linkfileid;
1802 cnid_t nextlinkid;
1803 cnid_t prevlinkid;
1804 int error;
1805
1806 /* Caller must be owner of file system. */
1807 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1808 if (suser(cred, NULL) &&
1809 kauth_cred_getuid(cred) != vfsp->f_owner) {
1810 return (EACCES);
1811 }
1812 /* Target vnode must be file system's root. */
1813 if (!vnode_isvroot(vp)) {
1814 return (EINVAL);
1815 }
1816 linkfileid = *(cnid_t *)ap->a_data;
1817 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1818 return (EINVAL);
1819 }
1820 if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1821 return (error);
1822 }
1823 if (ap->a_command == HFS_NEXT_LINK) {
1824 *(cnid_t *)ap->a_data = nextlinkid;
1825 } else {
1826 *(cnid_t *)ap->a_data = prevlinkid;
1827 }
1828 return (0);
1829 }
1830
1831 case HFS_RESIZE_PROGRESS: {
1832
1833 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1834 if (suser(cred, NULL) &&
1835 kauth_cred_getuid(cred) != vfsp->f_owner) {
1836 return (EACCES); /* must be owner of file system */
1837 }
1838 if (!vnode_isvroot(vp)) {
1839 return (EINVAL);
1840 }
1841 /* file system must not be mounted read-only */
1842 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1843 return (EROFS);
1844 }
1845
1846 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1847 }
1848
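	/*
	 * HFS_RESIZE_VOLUME:
	 * Grow or shrink the volume to the requested byte size.  The new
	 * size is compared against the current size (totalBlocks * blockSize)
	 * and we call hfs_extendfs() or hfs_truncatefs() accordingly; a
	 * matching size is a no-op.
	 */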
1849 case HFS_RESIZE_VOLUME: {
1850 u_int64_t newsize;
1851 u_int64_t cursize;
1852
1853 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1854 if (suser(cred, NULL) &&
1855 kauth_cred_getuid(cred) != vfsp->f_owner) {
1856 return (EACCES); /* must be owner of file system */
1857 }
1858 if (!vnode_isvroot(vp)) {
1859 return (EINVAL);
1860 }
1861
1862 /* filesystem must not be mounted read only */
1863 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1864 return (EROFS);
1865 }
1866 newsize = *(u_int64_t *)ap->a_data;
1867 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1868
1869 if (newsize > cursize) {
1870 return hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1871 } else if (newsize < cursize) {
1872 return hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1873 } else {
1874 return (0);
1875 }
1876 }
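	/*
	 * HFS_CHANGE_NEXT_ALLOCATION:
	 * Set the volume's next-allocation pointer and return the previous
	 * value in ap->a_data.  The magic value HFS_NO_UPDATE_NEXT_ALLOCATION
	 * pins nextAllocation past the metadata zone and prevents further
	 * updates to it.
	 */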
1877 case HFS_CHANGE_NEXT_ALLOCATION: {
1878 int error = 0; /* Assume success */
1879 u_int32_t location;
1880
1881 if (vnode_vfsisrdonly(vp)) {
1882 return (EROFS);
1883 }
1884 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1885 if (suser(cred, NULL) &&
1886 kauth_cred_getuid(cred) != vfsp->f_owner) {
1887 return (EACCES); /* must be owner of file system */
1888 }
1889 if (!vnode_isvroot(vp)) {
1890 return (EINVAL);
1891 }
1892 hfs_lock_mount(hfsmp);
1893 location = *(u_int32_t *)ap->a_data;
1894 if ((location >= hfsmp->allocLimit) &&
1895 (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1896 error = EINVAL;
1897 goto fail_change_next_allocation;
1898 }
1899 /* Return previous value. */
1900 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1901 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1902 /* On magic value for location, set nextAllocation to next block
1903 * after metadata zone and set flag in mount structure to indicate
1904 * that nextAllocation should not be updated again.
1905 */
1906 if (hfsmp->hfs_metazone_end != 0) {
1907 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1908 }
1909 hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1910 } else {
1911 hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1912 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1913 }
1914 MarkVCBDirty(hfsmp);
1915 fail_change_next_allocation:
1916 hfs_unlock_mount(hfsmp);
1917 return (error);
1918 }
1919
1920 #if HFS_SPARSE_DEV
1921 case HFS_SETBACKINGSTOREINFO: {
1922 struct vnode * bsfs_rootvp;
1923 struct vnode * di_vp;
1924 struct hfs_backingstoreinfo *bsdata;
1925 int error = 0;
1926
1927 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1928 return (EROFS);
1929 }
1930 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1931 return (EALREADY);
1932 }
1933 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1934 if (suser(cred, NULL) &&
1935 kauth_cred_getuid(cred) != vfsp->f_owner) {
1936 return (EACCES); /* must be owner of file system */
1937 }
1938 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1939 if (bsdata == NULL) {
1940 return (EINVAL);
1941 }
1942 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1943 return (error);
1944 }
1945 if ((error = vnode_getwithref(di_vp))) {
1946 file_drop(bsdata->backingfd);
1947 return(error);
1948 }
1949
1950 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1951 (void)vnode_put(di_vp);
1952 file_drop(bsdata->backingfd);
1953 return (EINVAL);
1954 }
1955
1956 /*
1957 * Obtain the backing fs root vnode and keep a reference
1958 * on it. This reference will be dropped in hfs_unmount.
1959 */
1960 error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
1961 if (error) {
1962 (void)vnode_put(di_vp);
1963 file_drop(bsdata->backingfd);
1964 return (error);
1965 }
1966 vnode_ref(bsfs_rootvp);
1967 vnode_put(bsfs_rootvp);
1968
1969 hfs_lock_mount(hfsmp);
1970 hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
1971 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1972 hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
1973 hfs_unlock_mount(hfsmp);
1974
1975 /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
1976
1977 /*
1978 * If the sparse image is on a sparse image file (as opposed to a sparse
1979 * bundle), then we may need to limit the free space to the maximum size
1980 * of a file on that volume. So we query (using pathconf), and if we get
1981 * a meaningful result, we cache the number of blocks for later use in
1982 * hfs_freeblks().
1983 */
1984 hfsmp->hfs_backingfs_maxblocks = 0;
1985 if (vnode_vtype(di_vp) == VREG) {
1986 int terr;
1987 int hostbits;
1988 terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
1989 if (terr == 0 && hostbits != 0 && hostbits < 64) {
1990 u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
1991
1992 hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
1993 }
1994 }
1995
1996 /* The free extent cache is managed differently for sparse devices.
1997 * There is a window between when the volume is mounted and when the
1998 * device is marked as sparse, so the free extent cache for this
1999 * volume is currently initialized as for a normal volume (sorted by block
2000 * count). Reset the cache so that it will be rebuilt
2001 * for a sparse device (sorted by start block).
2002 */
2003 ResetVCBFreeExtCache(hfsmp);
2004
2005 (void)vnode_put(di_vp);
2006 file_drop(bsdata->backingfd);
2007 return (0);
2008 }
2009 case HFS_CLRBACKINGSTOREINFO: {
2010 struct vnode * tmpvp;
2011
2012 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2013 if (suser(cred, NULL) &&
2014 kauth_cred_getuid(cred) != vfsp->f_owner) {
2015 return (EACCES); /* must be owner of file system */
2016 }
2017 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2018 return (EROFS);
2019 }
2020
2021 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
2022 hfsmp->hfs_backingfs_rootvp) {
2023
2024 hfs_lock_mount(hfsmp);
2025 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
2026 tmpvp = hfsmp->hfs_backingfs_rootvp;
2027 hfsmp->hfs_backingfs_rootvp = NULLVP;
2028 hfsmp->hfs_sparsebandblks = 0;
2029 hfs_unlock_mount(hfsmp);
2030
2031 vnode_rele(tmpvp);
2032 }
2033 return (0);
2034 }
2035 #endif /* HFS_SPARSE_DEV */
2036
2037 /* Change the next CNID stored in the VH */
2038 case HFS_CHANGE_NEXTCNID: {
2039 int error = 0; /* Assume success */
2040 u_int32_t fileid;
2041 int wraparound = 0;
2042 int lockflags = 0;
2043
2044 if (vnode_vfsisrdonly(vp)) {
2045 return (EROFS);
2046 }
2047 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2048 if (suser(cred, NULL) &&
2049 kauth_cred_getuid(cred) != vfsp->f_owner) {
2050 return (EACCES); /* must be owner of file system */
2051 }
2052
2053 fileid = *(u_int32_t *)ap->a_data;
2054
2055 /* Must have catalog lock excl. to advance the CNID pointer */
2056 lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
2057
2058 hfs_lock_mount(hfsmp);
2059
2060 /* If it is less than the current next CNID, force the wraparound bit to be set */
2061 if (fileid < hfsmp->vcbNxtCNID) {
2062 wraparound=1;
2063 }
2064
2065 /* Return previous value. */
2066 *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
2067
2068 hfsmp->vcbNxtCNID = fileid;
2069
2070 if (wraparound) {
2071 hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
2072 }
2073
2074 MarkVCBDirty(hfsmp);
2075 hfs_unlock_mount(hfsmp);
2076 hfs_systemfile_unlock (hfsmp, lockflags);
2077
2078 return (error);
2079 }
2080
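	/*
	 * F_FREEZE_FS / F_THAW_FS:
	 * Freeze or thaw the filesystem.  Freezing requires a journaled
	 * volume, and both operations are restricted to the filesystem
	 * owner or the superuser.
	 */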
2081 case F_FREEZE_FS: {
2082 struct mount *mp;
2083
2084 mp = vnode_mount(vp);
2085 hfsmp = VFSTOHFS(mp);
2086
2087 if (!(hfsmp->jnl))
2088 return (ENOTSUP);
2089
2090 vfsp = vfs_statfs(mp);
2091
2092 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2093 !kauth_cred_issuser(cred))
2094 return (EACCES);
2095
2096 return hfs_freeze(hfsmp);
2097 }
2098
2099 case F_THAW_FS: {
2100 vfsp = vfs_statfs(vnode_mount(vp));
2101 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2102 !kauth_cred_issuser(cred))
2103 return (EACCES);
2104
2105 return hfs_thaw(hfsmp, current_proc());
2106 }
2107
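	/*
	 * HFS_BULKACCESS_FSCTL / HFS_EXT_BULKACCESS_FSCTL:
	 * Perform a bulk access(2)-style permission check for a list of items.
	 * The structure layout passed from userland differs for 32-bit and
	 * 64-bit callers, so pick the matching size before handing off to
	 * do_bulk_access_check().  Not supported on HFS standard volumes.
	 */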
2108 case HFS_BULKACCESS_FSCTL: {
2109 int size;
2110
2111 if (hfsmp->hfs_flags & HFS_STANDARD) {
2112 return EINVAL;
2113 }
2114
2115 if (is64bit) {
2116 size = sizeof(struct user64_access_t);
2117 } else {
2118 size = sizeof(struct user32_access_t);
2119 }
2120
2121 return do_bulk_access_check(hfsmp, vp, ap, size, context);
2122 }
2123
2124 case HFS_EXT_BULKACCESS_FSCTL: {
2125 int size;
2126
2127 if (hfsmp->hfs_flags & HFS_STANDARD) {
2128 return EINVAL;
2129 }
2130
2131 if (is64bit) {
2132 size = sizeof(struct user64_ext_access_t);
2133 } else {
2134 size = sizeof(struct user32_ext_access_t);
2135 }
2136
2137 return do_bulk_access_check(hfsmp, vp, ap, size, context);
2138 }
2139
2140 case HFS_SET_XATTREXTENTS_STATE: {
2141 int state;
2142
2143 if (ap->a_data == NULL) {
2144 return (EINVAL);
2145 }
2146
2147 state = *(int *)ap->a_data;
2148
2149 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2150 return (EROFS);
2151 }
2152
2153 /* Super-user can enable or disable extent-based extended
2154 * attribute support on a volume.
2155 * Note: Starting with Mac OS X 10.7, extent-based extended attributes
2156 * are enabled by default, so any change is only transient and lasts
2157 * until the volume is remounted.
2158 */
2159 if (!kauth_cred_issuser(kauth_cred_get())) {
2160 return (EPERM);
2161 }
2162 if (state == 0 || state == 1)
2163 return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
2164 else
2165 return (EINVAL);
2166 }
2167
2168 case F_SETSTATICCONTENT: {
2169 int error;
2170 int enable_static = 0;
2171 struct cnode *cp = NULL;
2172 /*
2173 * lock the cnode, decorate the cnode flag, and bail out.
2174 * VFS should have already authenticated the caller for us.
2175 */
2176
2177 if (ap->a_data) {
2178 /*
2179 * Note that even though ap->a_data is of type caddr_t,
2180 * the fcntl layer at the syscall handler will pass in NULL
2181 * or 1 depending on what the argument supplied to the fcntl
2182 * was. So it is in fact correct to check the ap->a_data
2183 * argument for zero or non-zero value when deciding whether or not
2184 * to enable the static bit in the cnode.
2185 */
2186 enable_static = 1;
2187 }
2188 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2189 return EROFS;
2190 }
2191 cp = VTOC(vp);
2192
2193 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2194 if (error == 0) {
2195 if (enable_static) {
2196 cp->c_flag |= C_SSD_STATIC;
2197 }
2198 else {
2199 cp->c_flag &= ~C_SSD_STATIC;
2200 }
2201 hfs_unlock (cp);
2202 }
2203 return error;
2204 }
2205
2206 case F_SET_GREEDY_MODE: {
2207 int error;
2208 int enable_greedy_mode = 0;
2209 struct cnode *cp = NULL;
2210 /*
2211 * lock the cnode, decorate the cnode flag, and bail out.
2212 * VFS should have already authenticated the caller for us.
2213 */
2214
2215 if (ap->a_data) {
2216 /*
2217 * Note that even though ap->a_data is of type caddr_t,
2218 * the fcntl layer at the syscall handler will pass in NULL
2219 * or 1 depending on what the argument supplied to the fcntl
2220 * was. So it is in fact correct to check the ap->a_data
2221 * argument for zero or non-zero value when deciding whether or not
2222 * to enable the greedy mode bit in the cnode.
2223 */
2224 enable_greedy_mode = 1;
2225 }
2226 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2227 return EROFS;
2228 }
2229 cp = VTOC(vp);
2230
2231 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2232 if (error == 0) {
2233 if (enable_greedy_mode) {
2234 cp->c_flag |= C_SSD_GREEDY_MODE;
2235 }
2236 else {
2237 cp->c_flag &= ~C_SSD_GREEDY_MODE;
2238 }
2239 hfs_unlock (cp);
2240 }
2241 return error;
2242 }
2243
2244 case F_SETIOTYPE: {
2245 int error;
2246 uint32_t iotypeflag = 0;
2247
2248 struct cnode *cp = NULL;
2249 /*
2250 * lock the cnode, decorate the cnode flag, and bail out.
2251 * VFS should have already authenticated the caller for us.
2252 */
2253
2254 if (ap->a_data == NULL) {
2255 return EINVAL;
2256 }
2257
2258 /*
2259 * Note that even though ap->a_data is of type caddr_t, we
2260 * can only use 32 bits of flag values.
2261 */
2262 iotypeflag = (uint32_t) ap->a_data;
2263 switch (iotypeflag) {
2264 case F_IOTYPE_ISOCHRONOUS:
2265 break;
2266 default:
2267 return EINVAL;
2268 }
2269
2270
2271 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2272 return EROFS;
2273 }
2274 cp = VTOC(vp);
2275
2276 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2277 if (error == 0) {
2278 switch (iotypeflag) {
2279 case F_IOTYPE_ISOCHRONOUS:
2280 cp->c_flag |= C_IO_ISOCHRONOUS;
2281 break;
2282 default:
2283 break;
2284 }
2285 hfs_unlock (cp);
2286 }
2287 return error;
2288 }
2289
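	/*
	 * F_MAKECOMPRESSED:
	 * Mark the file as decmpfs-compressed.  The caller supplies a
	 * generation counter; if it still matches the cnode's current value,
	 * the UF_COMPRESSED flag is set, the data fork is truncated to zero
	 * (suppressing mtime updates), and the decmpfs state is reset.
	 * A stale generation counter returns ESTALE.
	 */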
2290 case F_MAKECOMPRESSED: {
2291 int error = 0;
2292 uint32_t gen_counter;
2293 struct cnode *cp = NULL;
2294 int reset_decmp = 0;
2295
2296 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2297 return EROFS;
2298 }
2299
2300 /*
2301 * acquire & lock the cnode.
2302 * VFS should have already authenticated the caller for us.
2303 */
2304
2305 if (ap->a_data) {
2306 /*
2307 * Cast the pointer into a uint32_t so we can extract the
2308 * supplied generation counter.
2309 */
2310 gen_counter = *((uint32_t*)ap->a_data);
2311 }
2312 else {
2313 return EINVAL;
2314 }
2315
2316 #if HFS_COMPRESSION
2317 cp = VTOC(vp);
2318 /* Grab truncate lock first; we may truncate the file */
2319 hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2320
2321 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2322 if (error) {
2323 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2324 return error;
2325 }
2326
2327 /* Are there any other usecounts/FDs? */
2328 if (vnode_isinuse(vp, 1)) {
2329 hfs_unlock(cp);
2330 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2331 return EBUSY;
2332 }
2333
2334 /* now we have the cnode locked down; Validate arguments */
2335 if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
2336 /* EINVAL if you are trying to manipulate an IMMUTABLE file */
2337 hfs_unlock(cp);
2338 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2339 return EINVAL;
2340 }
2341
2342 if ((hfs_get_gencount (cp)) == gen_counter) {
2343 /*
2344 * OK, the gen_counter matched. Go for it:
2345 * Toggle state bits, truncate file, and suppress mtime update
2346 */
2347 reset_decmp = 1;
2348 cp->c_bsdflags |= UF_COMPRESSED;
2349
2350 error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
2351 ap->a_context);
2352 }
2353 else {
2354 error = ESTALE;
2355 }
2356
2357 /* Unlock cnode before executing decmpfs; it may need to get an EA */
2358 hfs_unlock(cp);
2359
2360 /*
2361 * Reset the decmp state while still holding the truncate lock. We need to
2362 * serialize here against a listxattr on this node which may occur at any
2363 * time.
2364 *
2365 * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed,
2366 * that will still potentially require getting the com.apple.decmpfs EA. If the
2367 * EA is required, then we can't hold the cnode lock, because the getxattr call is
2368 * generic (through VFS), and can't pass along any info telling it that we're already
2369 * holding it (the lock). If we don't serialize, then we risk listxattr stopping
2370 * and trying to fill in the hfs_file_is_compressed info during the callback
2371 * operation, which will result in deadlock against the b-tree node.
2372 *
2373 * So, to serialize against listxattr (which will grab buf_t meta references on
2374 * the b-tree blocks), we hold the truncate lock as we're manipulating the
2375 * decmpfs payload.
2376 */
2377 if ((reset_decmp) && (error == 0)) {
2378 decmpfs_cnode *dp = VTOCMP (vp);
2379 if (dp != NULL) {
2380 decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
2381 }
2382
2383 /* Initialize the decmpfs node as needed */
2384 (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
2385 }
2386
2387 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2388
2389 #endif
2390 return error;
2391 }
2392
2393 case F_SETBACKINGSTORE: {
2394
2395 int error = 0;
2396
2397 /*
2398 * See comment in F_SETSTATICCONTENT re: using
2399 * a null check for a_data
2400 */
2401 if (ap->a_data) {
2402 error = hfs_set_backingstore (vp, 1);
2403 }
2404 else {
2405 error = hfs_set_backingstore (vp, 0);
2406 }
2407
2408 return error;
2409 }
2410
2411 case F_GETPATH_MTMINFO: {
2412 int error = 0;
2413
2414 int *data = (int*) ap->a_data;
2415
2416 /* Ask if this is a backingstore vnode */
2417 error = hfs_is_backingstore (vp, data);
2418
2419 return error;
2420 }
2421
2422 case F_FULLFSYNC: {
2423 int error;
2424
2425 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2426 return (EROFS);
2427 }
2428 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2429 if (error == 0) {
2430 error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
2431 hfs_unlock(VTOC(vp));
2432 }
2433
2434 return error;
2435 }
2436
2437 case F_CHKCLEAN: {
2438 register struct cnode *cp;
2439 int error;
2440
2441 if (!vnode_isreg(vp))
2442 return EINVAL;
2443
2444 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2445 if (error == 0) {
2446 cp = VTOC(vp);
2447 /*
2448 * Used by regression tests to determine whether
2449 * all the dirty pages (via write) have been cleaned
2450 * after a call to fsync().
2451 */
2452 error = is_file_clean(vp, VTOF(vp)->ff_size);
2453 hfs_unlock(cp);
2454 }
2455 return (error);
2456 }
2457
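	/*
	 * F_RDADVISE:
	 * Issue an advisory read (read-ahead) for the byte range described by
	 * the radvisory argument.  The request fails with EFBIG if the offset
	 * lies beyond the end of the file (or beyond the uncompressed size
	 * for a compressed file).
	 */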
2458 case F_RDADVISE: {
2459 register struct radvisory *ra;
2460 struct filefork *fp;
2461 int error;
2462
2463 if (!vnode_isreg(vp))
2464 return EINVAL;
2465
2466 ra = (struct radvisory *)(ap->a_data);
2467 fp = VTOF(vp);
2468
2469 /* Protect against a size change. */
2470 hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2471
2472 #if HFS_COMPRESSION
2473 if (compressed && (uncompressed_size == -1)) {
2474 /* fetching the uncompressed size failed above, so return the error */
2475 error = decmpfs_error;
2476 } else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
2477 (!compressed && (ra->ra_offset >= fp->ff_size))) {
2478 error = EFBIG;
2479 }
2480 #else /* HFS_COMPRESSION */
2481 if (ra->ra_offset >= fp->ff_size) {
2482 error = EFBIG;
2483 }
2484 #endif /* HFS_COMPRESSION */
2485 else {
2486 error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
2487 }
2488
2489 hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
2490 return (error);
2491 }
2492
2493 case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */
2494 {
2495 if (is64bit) {
2496 *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2497 }
2498 else {
2499 *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2500 }
2501 return 0;
2502 }
2503
2504 case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
2505 *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
2506 break;
2507
2508 case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
2509 *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
2510 break;
2511
2512 case HFS_FSCTL_GET_VERY_LOW_DISK:
2513 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
2514 break;
2515
2516 case HFS_FSCTL_SET_VERY_LOW_DISK:
2517 if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
2518 return EINVAL;
2519 }
2520
2521 hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
2522 break;
2523
2524 case HFS_FSCTL_GET_LOW_DISK:
2525 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
2526 break;
2527
2528 case HFS_FSCTL_SET_LOW_DISK:
2529 if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2530 || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
2531
2532 return EINVAL;
2533 }
2534
2535 hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
2536 break;
2537
2538 case HFS_FSCTL_GET_DESIRED_DISK:
2539 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
2540 break;
2541
2542 case HFS_FSCTL_SET_DESIRED_DISK:
2543 if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2544 return EINVAL;
2545 }
2546
2547 hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
2548 break;
2549
2550 case HFS_VOLUME_STATUS:
2551 *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
2552 break;
2553
2554 case HFS_SET_BOOT_INFO:
2555 if (!vnode_isvroot(vp))
2556 return(EINVAL);
2557 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
2558 return(EACCES); /* must be superuser or owner of filesystem */
2559 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2560 return (EROFS);
2561 }
2562 hfs_lock_mount (hfsmp);
2563 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
2564 hfs_unlock_mount (hfsmp);
2565 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
2566 break;
2567
2568 case HFS_GET_BOOT_INFO:
2569 if (!vnode_isvroot(vp))
2570 return(EINVAL);
2571 hfs_lock_mount (hfsmp);
2572 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
2573 hfs_unlock_mount(hfsmp);
2574 break;
2575
2576 case HFS_MARK_BOOT_CORRUPT:
2577 /* Mark the boot volume corrupt by setting
2578 * kHFSVolumeInconsistentBit in the volume header. This will
2579 * force fsck_hfs on next mount.
2580 */
2581 if (!kauth_cred_issuser(kauth_cred_get())) {
2582 return EACCES;
2583 }
2584
2585 /* Allowed only on the root vnode of the boot volume */
2586 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
2587 !vnode_isvroot(vp)) {
2588 return EINVAL;
2589 }
2590 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2591 return (EROFS);
2592 }
2593 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2594 hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
2595 break;
2596
2597 case HFS_FSCTL_GET_JOURNAL_INFO:
2598 jip = (struct hfs_journal_info*)ap->a_data;
2599
2600 if (vp == NULLVP)
2601 return EINVAL;
2602
2603 if (hfsmp->jnl == NULL) {
2604 jnl_start = 0;
2605 jnl_size = 0;
2606 } else {
2607 jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
2608 jnl_size = (off_t)hfsmp->jnl_size;
2609 }
2610
2611 jip->jstart = jnl_start;
2612 jip->jsize = jnl_size;
2613 break;
2614
2615 case HFS_SET_ALWAYS_ZEROFILL: {
2616 struct cnode *cp = VTOC(vp);
2617
2618 if (*(int *)ap->a_data) {
2619 cp->c_flag |= C_ALWAYS_ZEROFILL;
2620 } else {
2621 cp->c_flag &= ~C_ALWAYS_ZEROFILL;
2622 }
2623 break;
2624 }
2625
2626 case HFS_DISABLE_METAZONE: {
2627 /* Only root can disable metadata zone */
2628 if (!kauth_cred_issuser(kauth_cred_get())) {
2629 return EACCES;
2630 }
2631 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2632 return (EROFS);
2633 }
2634
2635 /* Disable metadata zone now */
2636 (void) hfs_metadatazone_init(hfsmp, true);
2637 printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
2638 break;
2639 }
2640
2641
2642 case HFS_FSINFO_METADATA_BLOCKS: {
2643 int error;
2644 struct hfsinfo_metadata *hinfo;
2645
2646 hinfo = (struct hfsinfo_metadata *)ap->a_data;
2647
2648 /* Get information about number of metadata blocks */
2649 error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
2650 if (error) {
2651 return error;
2652 }
2653
2654 break;
2655 }
2656
2657 case HFS_GET_FSINFO: {
2658 hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
2659
2660 /* Only root is allowed to get fsinfo */
2661 if (!kauth_cred_issuser(kauth_cred_get())) {
2662 return EACCES;
2663 }
2664
2665 /*
2666 * Make sure that the caller's version number matches with
2667 * the kernel's version number. This will make sure that
2668 * if the structures being read/written into are changed
2669 * by the kernel, the caller will not read incorrect data.
2670 *
2671 * The first three fields --- request_type, version and
2672 * flags are same for all the hfs_fsinfo structures, so
2673 * we can access the version number by assuming any
2674 * structure for now.
2675 */
2676 if (fsinfo->header.version != HFS_FSINFO_VERSION) {
2677 return ENOTSUP;
2678 }
2679
2680 /* Make sure that the current file system is not marked inconsistent */
2681 if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
2682 return EIO;
2683 }
2684
2685 return hfs_get_fsinfo(hfsmp, ap->a_data);
2686 }
2687
2688 case HFS_CS_FREESPACE_TRIM: {
2689 int error = 0;
2690 int lockflags = 0;
2691
2692 /* Only root allowed */
2693 if (!kauth_cred_issuser(kauth_cred_get())) {
2694 return EACCES;
2695 }
2696
2697 /*
2698 * This core functionality is similar to hfs_scan_blocks().
2699 * The main difference is that hfs_scan_blocks() is called
2700 * as part of mount where we are assured that the journal is
2701 * empty to start with. This fcntl() can be called on a
2702 * mounted volume, therefore it has to flush the content of
2703 * the journal as well as ensure the state of summary table.
2704 *
2705 * This fcntl scans over the entire allocation bitmap,
2706 * creates a list of all the free blocks, and issues TRIM
2707 * down to the underlying device. This can take a long time
2708 * as it can generate up to 512MB of read I/O.
2709 */
2710
2711 if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
2712 error = hfs_init_summary(hfsmp);
2713 if (error) {
2714 printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
2715 return error;
2716 }
2717 }
2718
2719 /*
2720 * The journal maintains list of recently deallocated blocks to
2721 * issue DKIOCUNMAPs when the corresponding journal transaction is
2722 * flushed to the disk. To avoid any race conditions, we only
2723 * want one active trim list and only one thread issuing DKIOCUNMAPs.
2724 * Therefore we make sure that the journal trim list is sync'ed,
2725 * empty, and not modifiable for the duration of our scan.
2726 *
2727 * Take the journal lock before flushing the journal to the disk.
2728 * We keep holding the journal lock until we acquire the
2729 * bitmap lock, to make sure that no new journal transactions can
2730 * start. This ensures that the journal trim list is not
2731 * modified after the journal flush and before we get the bitmap lock.
2732 * We can release the journal lock after we acquire the bitmap
2733 * lock as it will prevent any further block deallocations.
2734 */
2735 hfs_journal_lock(hfsmp);
2736
2737 /* Flush the journal and wait for all I/Os to finish up */
2738 error = hfs_journal_flush(hfsmp, TRUE);
2739 if (error) {
2740 hfs_journal_unlock(hfsmp);
2741 return error;
2742 }
2743
2744 /* Take bitmap lock to ensure it is not being modified */
2745 lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
2746
2747 /* Release the journal lock */
2748 hfs_journal_unlock(hfsmp);
2749
2750 /*
2751 * ScanUnmapBlocks reads the bitmap in large blocks
2752 * (up to 1MB), unlike the runtime which reads the bitmap
2753 * in 4K blocks. This can cause buf_t collisions
2754 * and potential data corruption. To avoid this, we
2755 * invalidate all the existing buffers associated with
2756 * the bitmap vnode before scanning it.
2757 *
2758 * Note: ScanUnmapBlocks() cleans up all the buffers
2759 * after itself, so there won't be any large buffers left
2760 * for us to clean up after it returns.
2761 */
2762 error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
2763 if (error) {
2764 hfs_systemfile_unlock(hfsmp, lockflags);
2765 return error;
2766 }
2767
2768 /* Traverse bitmap and issue DKIOCUNMAPs */
2769 error = ScanUnmapBlocks(hfsmp);
2770 hfs_systemfile_unlock(hfsmp, lockflags);
2771 if (error) {
2772 return error;
2773 }
2774
2775 break;
2776 }
2777
2778 default:
2779 return (ENOTTY);
2780 }
2781
2782 return 0;
2783 }
2784
2785 /*
2786 * select
2787 */
2788 int
2789 hfs_vnop_select(__unused struct vnop_select_args *ap)
2790 /*
2791 struct vnop_select_args {
2792 vnode_t a_vp;
2793 int a_which;
2794 int a_fflags;
2795 void *a_wql;
2796 vfs_context_t a_context;
2797 };
2798 */
2799 {
2800 /*
2801 * We should really check to see if I/O is possible.
2802 */
2803 return (1);
2804 }
2805
2806 /*
2807 * Converts a logical block number to a physical block, and optionally returns
2808 * the number of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
2809 * The physical block number is based on the device block size, currently 512 bytes.
2810 * The block run is returned in logical blocks, and is the REMAINING number of blocks in the run.
2811 */
2812 int
2813 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
2814 {
2815 struct filefork *fp = VTOF(vp);
2816 struct hfsmount *hfsmp = VTOHFS(vp);
2817 int retval = E_NONE;
2818 u_int32_t logBlockSize;
2819 size_t bytesContAvail = 0;
2820 off_t blockposition;
2821 int lockExtBtree;
2822 int lockflags = 0;
2823
2824 /*
2825 * Check for underlying vnode requests and ensure that logical
2826 * to physical mapping is requested.
2827 */
2828 if (vpp != NULL)
2829 *vpp = hfsmp->hfs_devvp;
2830 if (bnp == NULL)
2831 return (0);
2832
2833 logBlockSize = GetLogicalBlockSize(vp);
2834 blockposition = (off_t)bn * logBlockSize;
2835
2836 lockExtBtree = overflow_extents(fp);
2837
2838 if (lockExtBtree)
2839 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2840
2841 retval = MacToVFSError(
2842 MapFileBlockC (HFSTOVCB(hfsmp),
2843 (FCB*)fp,
2844 MAXPHYSIO,
2845 blockposition,
2846 bnp,
2847 &bytesContAvail));
2848
2849 if (lockExtBtree)
2850 hfs_systemfile_unlock(hfsmp, lockflags);
2851
2852 if (retval == E_NONE) {
2853 /* Figure out how many read ahead blocks there are */
2854 if (runp != NULL) {
2855 if (can_cluster(logBlockSize)) {
2856 /* Make sure this result never goes negative: */
2857 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
2858 } else {
2859 *runp = 0;
2860 }
2861 }
2862 }
2863 return (retval);
2864 }
2865
2866 /*
2867 * Convert logical block number to file offset.
2868 */
2869 int
2870 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
2871 /*
2872 struct vnop_blktooff_args {
2873 vnode_t a_vp;
2874 daddr64_t a_lblkno;
2875 off_t *a_offset;
2876 };
2877 */
2878 {
2879 if (ap->a_vp == NULL)
2880 return (EINVAL);
2881 *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
2882
2883 return(0);
2884 }
2885
2886 /*
2887 * Convert file offset to logical block number.
2888 */
2889 int
2890 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
2891 /*
2892 struct vnop_offtoblk_args {
2893 vnode_t a_vp;
2894 off_t a_offset;
2895 daddr64_t *a_lblkno;
2896 };
2897 */
2898 {
2899 if (ap->a_vp == NULL)
2900 return (EINVAL);
2901 *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
2902
2903 return(0);
2904 }
2905
2906 /*
2907 * Map file offset to physical block number.
2908 *
2909 * If this function is called for write operation, and if the file
2910 * had virtual blocks allocated (delayed allocation), real blocks
2911 * are allocated by calling ExtendFileC().
2912 *
2913 * If this function is called for read operation, and if the file
2914 * had virtual blocks allocated (delayed allocation), no change
2915 * to the size of file is done, and if required, rangelist is
2916 * searched for mapping.
2917 *
2918 * System file cnodes are expected to be locked (shared or exclusive).
2919 */
2920 int
2921 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
2922 /*
2923 struct vnop_blockmap_args {
2924 vnode_t a_vp;
2925 off_t a_foffset;
2926 size_t a_size;
2927 daddr64_t *a_bpn;
2928 size_t *a_run;
2929 void *a_poff;
2930 int a_flags;
2931 vfs_context_t a_context;
2932 };
2933 */
2934 {
2935 struct vnode *vp = ap->a_vp;
2936 struct cnode *cp;
2937 struct filefork *fp;
2938 struct hfsmount *hfsmp;
2939 size_t bytesContAvail = 0;
2940 int retval = E_NONE;
2941 int syslocks = 0;
2942 int lockflags = 0;
2943 struct rl_entry *invalid_range;
2944 enum rl_overlaptype overlaptype;
2945 int started_tr = 0;
2946 int tooklock = 0;
2947
2948 #if HFS_COMPRESSION
2949 if (VNODE_IS_RSRC(vp)) {
2950 /* allow blockmaps to the resource fork */
2951 } else {
2952 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
2953 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
2954 switch(state) {
2955 case FILE_IS_COMPRESSED:
2956 return ENOTSUP;
2957 case FILE_IS_CONVERTING:
2958 /* if FILE_IS_CONVERTING, we allow blockmap */
2959 break;
2960 default:
2961 printf("invalid state %d for compressed file\n", state);
2962 /* fall through */
2963 }
2964 }
2965 }
2966 #endif /* HFS_COMPRESSION */
2967
2968 /* Do not allow blockmap operation on a directory */
2969 if (vnode_isdir(vp)) {
2970 return (ENOTSUP);
2971 }
2972
2973 /*
2974 * Check for underlying vnode requests and ensure that logical
2975 * to physical mapping is requested.
2976 */
2977 if (ap->a_bpn == NULL)
2978 return (0);
2979
2980 if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
2981 if (VTOC(vp)->c_lockowner != current_thread()) {
2982 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
2983 tooklock = 1;
2984 }
2985 }
2986 hfsmp = VTOHFS(vp);
2987 cp = VTOC(vp);
2988 fp = VTOF(vp);
2989
2990 retry:
2991 /* Check virtual blocks only when performing write operation */
2992 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
2993 if (hfs_start_transaction(hfsmp) != 0) {
2994 retval = EINVAL;
2995 goto exit;
2996 } else {
2997 started_tr = 1;
2998 }
2999 syslocks = SFL_EXTENTS | SFL_BITMAP;
3000
3001 } else if (overflow_extents(fp)) {
3002 syslocks = SFL_EXTENTS;
3003 }
3004
3005 if (syslocks)
3006 lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
3007
3008 /*
3009 * Check for any delayed allocations.
3010 */
3011 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
3012 int64_t actbytes;
3013 u_int32_t loanedBlocks;
3014
3015 //
3016 // Make sure we have a transaction. It's possible
3017 // that we came in and fp->ff_unallocblocks was zero
3018 // but during the time we blocked acquiring the extents
3019 // btree, ff_unallocblocks became non-zero and so we
3020 // will need to start a transaction.
3021 //
3022 if (started_tr == 0) {
3023 if (syslocks) {
3024 hfs_systemfile_unlock(hfsmp, lockflags);
3025 syslocks = 0;
3026 }
3027 goto retry;
3028 }
3029
3030 /*
3031 * Note: ExtendFileC will release any blocks on loan and
3032 * acquire real blocks. So we ask to extend by zero bytes
3033 * since ExtendFileC will account for the virtual blocks.
3034 */
3035
3036 loanedBlocks = fp->ff_unallocblocks;
3037 retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
3038 kEFAllMask | kEFNoClumpMask, &actbytes);
3039
3040 if (retval) {
3041 fp->ff_unallocblocks = loanedBlocks;
3042 cp->c_blocks += loanedBlocks;
3043 fp->ff_blocks += loanedBlocks;
3044
3045 hfs_lock_mount (hfsmp);
3046 hfsmp->loanedBlocks += loanedBlocks;
3047 hfs_unlock_mount (hfsmp);
3048
3049 hfs_systemfile_unlock(hfsmp, lockflags);
3050 cp->c_flag |= C_MODIFIED;
3051 if (started_tr) {
3052 (void) hfs_update(vp, TRUE);
3053 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3054
3055 hfs_end_transaction(hfsmp);
3056 started_tr = 0;
3057 }
3058 goto exit;
3059 }
3060 }
3061
3062 retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
3063 ap->a_bpn, &bytesContAvail);
3064 if (syslocks) {
3065 hfs_systemfile_unlock(hfsmp, lockflags);
3066 syslocks = 0;
3067 }
3068
3069 if (started_tr) {
3070 (void) hfs_update(vp, TRUE);
3071 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3072 hfs_end_transaction(hfsmp);
3073 started_tr = 0;
3074 }
3075 if (retval) {
3076 /* On write, always return error because virtual blocks, if any,
3077 * should have been allocated in ExtendFileC(). We do not
3078 * allocate virtual blocks on read, therefore return error
3079 * only if no virtual blocks are allocated. Otherwise we search
3080 * the rangelist for zero-fills.
3081 */
3082 if ((MacToVFSError(retval) != ERANGE) ||
3083 (ap->a_flags & VNODE_WRITE) ||
3084 ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
3085 goto exit;
3086 }
3087
3088 /* Validate if the start offset is within logical file size */
3089 if (ap->a_foffset >= fp->ff_size) {
3090 goto exit;
3091 }
3092
3093 /*
3094 * At this point, we have encountered a failure during
3095 * MapFileBlockC that resulted in ERANGE, and we are not servicing
3096 * a write, and there are borrowed blocks.
3097 *
3098 * However, the cluster layer will not call blockmap for
3099 * blocks that are borrowed and in-cache. We have to assume that
3100 * because we observed ERANGE being emitted from MapFileBlockC, this
3101 * extent range is not valid on-disk. So we treat this as a
3102 * mapping that needs to be zero-filled prior to reading.
3103 *
3104 * Note that under certain circumstances (such as non-contiguous
3105 * userland VM mappings in the calling process), cluster_io
3106 * may be forced to split a large I/O driven by hfs_vnop_write
3107 * into multiple sub-I/Os that necessitate a RMW cycle. If this is
3108 * the case here, then we have already removed the invalid range list
3109 * mapping prior to getting to this blockmap call, so we should not
3110 * search the invalid rangelist for this byte range.
3111 */
3112
3113 bytesContAvail = fp->ff_size - ap->a_foffset;
3114 /*
3115 * Clip the contiguous available bytes to, at most, the allowable
3116 * maximum or the amount requested.
3117 */
3118
3119 if (bytesContAvail > ap->a_size) {
3120 bytesContAvail = ap->a_size;
3121 }
3122
3123 *ap->a_bpn = (daddr64_t) -1;
3124 retval = 0;
3125
3126 goto exit;
3127 }
3128
3129 /* MapFileBlockC() found a valid extent in the filefork. Search the
3130 * mapping information further for invalid file ranges
3131 */
3132 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
3133 ap->a_foffset + (off_t)bytesContAvail - 1,
3134 &invalid_range);
3135 if (overlaptype != RL_NOOVERLAP) {
3136 switch(overlaptype) {
3137 case RL_MATCHINGOVERLAP:
3138 case RL_OVERLAPCONTAINSRANGE:
3139 case RL_OVERLAPSTARTSBEFORE:
3140 /* There's no valid block for this byte offset */
3141 *ap->a_bpn = (daddr64_t)-1;
3142 /* There's no point limiting the amount to be returned
3143 * if the invalid range that was hit extends all the way
3144 * to the EOF (i.e. there's no valid bytes between the
3145 * end of this range and the file's EOF):
3146 */
3147 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3148 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3149 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3150 }
3151 break;
3152
3153 case RL_OVERLAPISCONTAINED:
3154 case RL_OVERLAPENDSAFTER:
3155 /* The range of interest hits an invalid block before the end: */
3156 if (invalid_range->rl_start == ap->a_foffset) {
3157 /* There's actually no valid information to be had starting here: */
3158 *ap->a_bpn = (daddr64_t)-1;
3159 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3160 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3161 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3162 }
3163 } else {
3164 bytesContAvail = invalid_range->rl_start - ap->a_foffset;
3165 }
3166 break;
3167
3168 case RL_NOOVERLAP:
3169 break;
3170 } /* end switch */
3171 if (bytesContAvail > ap->a_size)
3172 bytesContAvail = ap->a_size;
3173 }
3174
3175 exit:
3176 if (retval == 0) {
3177 if (ap->a_run)
3178 *ap->a_run = bytesContAvail;
3179
3180 if (ap->a_poff)
3181 *(int *)ap->a_poff = 0;
3182 }
3183
3184 if (tooklock)
3185 hfs_unlock(cp);
3186
3187 return (MacToVFSError(retval));
3188 }
3189
3190 /*
3191 * prepare and issue the I/O
3192 * buf_strategy knows how to deal
3193 * with requests that require
3194 * fragmented I/Os
3195 */
3196 int
3197 hfs_vnop_strategy(struct vnop_strategy_args *ap)
3198 {
3199 buf_t bp = ap->a_bp;
3200 vnode_t vp = buf_vnode(bp);
3201 int error = 0;
3202
3203 /* Mark buffer as containing static data if cnode flag set */
3204 if (VTOC(vp)->c_flag & C_SSD_STATIC) {
3205 buf_markstatic(bp);
3206 }
3207
3208 /* Mark buffer for greedy mode writes if cnode flag set */
3209 if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
3210 bufattr_markgreedymode(&bp->b_attr);
3211 }
3212
3213 /* mark buffer as containing burst mode data if cnode flag set */
3214 if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
3215 bufattr_markisochronous(&bp->b_attr);
3216 }
3217
3218 #if CONFIG_PROTECT
3219 cnode_t *cp = NULL;
3220
3221 if ((!bufattr_rawencrypted(&bp->b_attr)) &&
3222 ((cp = cp_get_protected_cnode(vp)) != NULL)) {
3223 /*
3224 * We rely upon the truncate lock to protect the
3225 * CP cache key from getting tossed prior to our IO finishing here.
3226 * Nearly all cluster io calls to manipulate file payload from HFS
3227 * take the truncate lock before calling into the cluster
3228 * layer to ensure the file size does not change, or that they
3229 * have exclusive right to change the EOF of the file.
3230 * That same guarantee protects us here since the code that
3231 * deals with CP lock events must now take the truncate lock
3232 * before doing anything.
3233 *
3234 * There is one exception here:
3235 * 1) VM swapfile IO: HFS will funnel the VNOP_PAGEOUT directly
3236 * into a cluster_pageout call for the swapfile code only, without
3237 * holding the truncate lock. This is because
3238 * individual swapfiles are maintained at fixed-length sizes by the VM code.
3239 * In non-swapfile IO we use PAGEOUT_V2 semantics which allow us to
3240 * create our own UPL and thus take the truncate lock before calling
3241 * into the cluster layer. In that case, however, we are not concerned
3242 * with the CP blob being wiped out in the middle of the IO
3243 * because there isn't anything to toss; the VM swapfile key stays
3244 * in-core as long as the file is open.
3245 */
3246
3247
3248 /*
3249 * Last chance: If this data protected I/O does not have unwrapped keys
3250 * present, then try to get them. We already know that it should, by this point.
3251 */
3252 if (cp->c_cpentry->cp_flags & (CP_KEY_FLUSHED | CP_NEEDS_KEYS)) {
3253 int io_op = ( (buf_flags(bp) & B_READ) ? CP_READ_ACCESS : CP_WRITE_ACCESS);
3254 if ((error = cp_handle_vnop(vp, io_op, 0)) != 0) {
3255 /*
3256 * We have to be careful here. By this point in the I/O path, VM or the cluster
3257 * engine has prepared a buf_t with the proper file offsets and all the rest,
3258 * so simply erroring out will result in us leaking this particular buf_t.
3259 * We need to properly decorate the buf_t just as buf_strategy would so as
3260 * to make it appear that the I/O errored out with the particular error code.
3261 */
3262 buf_seterror (bp, error);
3263 buf_biodone(bp);
3264 return error;
3265 }
3266 }
3267
3268 /*
3269 *NB:
3270 * For filesystem resize, we may not have access to the underlying
3271 * file's cache key for whatever reason (device may be locked). However,
3272 * we do not need it since we are going to use the temporary HFS-wide resize key
3273 * which is generated once we start relocating file content. If this file's I/O
3274 * should be done using the resize key, it will have been supplied already, so
3275 * do not attach the file's cp blob to the buffer.
3276 */
3277 if ((cp->c_cpentry->cp_flags & CP_RELOCATION_INFLIGHT) == 0) {
3278 buf_setcpaddr(bp, cp->c_cpentry);
3279 }
3280 }
3281 #endif /* CONFIG_PROTECT */
3282
3283 error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
3284
3285 return error;
3286 }
3287
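/*
 * Minor "update": clear the modified and timestamp-touch flags on the
 * cnode without writing anything to the catalog.  Used by truncate paths
 * that want to skip the normal hfs_update() bookkeeping.
 */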
3288 static int
3289 hfs_minorupdate(struct vnode *vp) {
3290 struct cnode *cp = VTOC(vp);
3291 cp->c_flag &= ~C_MODIFIED;
3292 cp->c_touch_acctime = 0;
3293 cp->c_touch_chgtime = 0;
3294 cp->c_touch_modtime = 0;
3295
3296 return 0;
3297 }
3298
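/*
 * Truncate (or extend) the given file fork to 'length' bytes, allocating
 * or releasing allocation blocks as needed.  'truncateflags' can request
 * that the catalog update be skipped (HFS_TRUNCATE_SKIPUPDATE) or that
 * timestamp updates be suppressed (HFS_TRUNCATE_SKIPTIMES).
 */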
3299 int
3300 do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
3301 {
3302 register struct cnode *cp = VTOC(vp);
3303 struct filefork *fp = VTOF(vp);
3304 kauth_cred_t cred = vfs_context_ucred(context);
3305 int retval;
3306 off_t bytesToAdd;
3307 off_t actualBytesAdded;
3308 off_t filebytes;
3309 u_int32_t fileblocks;
3310 int blksize;
3311 struct hfsmount *hfsmp;
3312 int lockflags;
3313 int skipupdate = (truncateflags & HFS_TRUNCATE_SKIPUPDATE);
3314 int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
3315
3316 blksize = VTOVCB(vp)->blockSize;
3317 fileblocks = fp->ff_blocks;
3318 filebytes = (off_t)fileblocks * (off_t)blksize;
3319
3320 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START,
3321 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3322
3323 if (length < 0)
3324 return (EINVAL);
3325
3326 /* This should only happen with a corrupt filesystem */
3327 if ((off_t)fp->ff_size < 0)
3328 return (EINVAL);
3329
3330 if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
3331 return (EFBIG);
3332
3333 hfsmp = VTOHFS(vp);
3334
3335 retval = E_NONE;
3336
3337 /* Files that are changing size are not hot file candidates. */
3338 if (hfsmp->hfc_stage == HFC_RECORDING) {
3339 fp->ff_bytesread = 0;
3340 }
3341
3342 /*
3343 * We cannot just check if fp->ff_size == length (as an optimization)
3344 * since there may be extra physical blocks that also need truncation.
3345 */
3346 #if QUOTA
3347 if ((retval = hfs_getinoquota(cp)))
3348 return(retval);
3349 #endif /* QUOTA */
3350
3351 /*
3352 * Lengthen the size of the file. We must ensure that the
3353 * last byte of the file is allocated. Since the smallest
3354 * value of ff_size is 0, length will be at least 1.
3355 */
3356 if (length > (off_t)fp->ff_size) {
3357 #if QUOTA
3358 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
3359 cred, 0);
3360 if (retval)
3361 goto Err_Exit;
3362 #endif /* QUOTA */
3363 /*
3364 * If we don't have enough physical space then
3365 * we need to extend the physical size.
3366 */
3367 if (length > filebytes) {
3368 int eflags;
3369 u_int32_t blockHint = 0;
3370
3371 /* All or nothing and don't round up to clumpsize. */
3372 eflags = kEFAllMask | kEFNoClumpMask;
3373
3374 if (cred && (suser(cred, NULL) != 0)) {
3375 eflags |= kEFReserveMask; /* keep a reserve */
3376 }
3377
3378 /*
3379 * Allocate Journal and Quota files in metadata zone.
3380 */
3381 if (filebytes == 0 &&
3382 hfsmp->hfs_flags & HFS_METADATA_ZONE &&
3383 hfs_virtualmetafile(cp)) {
3384 eflags |= kEFMetadataMask;
3385 blockHint = hfsmp->hfs_metazone_start;
3386 }
3387 if (hfs_start_transaction(hfsmp) != 0) {
3388 retval = EINVAL;
3389 goto Err_Exit;
3390 }
3391
3392 /* Protect extents b-tree and allocation bitmap */
3393 lockflags = SFL_BITMAP;
3394 if (overflow_extents(fp))
3395 lockflags |= SFL_EXTENTS;
3396 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3397
3398 /*
3399 * Keep growing the file as long as the current EOF is
3400 * less than the desired value.
3401 */
3402 while ((length > filebytes) && (retval == E_NONE)) {
3403 bytesToAdd = length - filebytes;
3404 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
3405 (FCB*)fp,
3406 bytesToAdd,
3407 blockHint,
3408 eflags,
3409 &actualBytesAdded));
3410
3411 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3412 if (actualBytesAdded == 0 && retval == E_NONE) {
3413 if (length > filebytes)
3414 length = filebytes;
3415 break;
3416 }
3417 } /* endwhile */
3418
3419 hfs_systemfile_unlock(hfsmp, lockflags);
3420
3421 if (hfsmp->jnl) {
3422 if (skipupdate) {
3423 (void) hfs_minorupdate(vp);
3424 }
3425 else {
3426 (void) hfs_update(vp, TRUE);
3427 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3428 }
3429 }
3430
3431 hfs_end_transaction(hfsmp);
3432
3433 if (retval)
3434 goto Err_Exit;
3435
3436 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3437 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3438 }
3439
3440 if (ISSET(flags, IO_NOZEROFILL)) {
3441 // An optimisation for the hibernation file
3442 if (vnode_isswap(vp))
3443 rl_remove_all(&fp->ff_invalidranges);
3444 } else {
3445 if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
3446 struct rl_entry *invalid_range;
3447 off_t zero_limit;
3448
3449 zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
3450 if (length < zero_limit) zero_limit = length;
3451
3452 if (length > (off_t)fp->ff_size) {
3453 struct timeval tv;
3454
3455 /* Extending the file: time to fill out the current last page w. zeroes? */
3456 if ((fp->ff_size & PAGE_MASK_64) &&
3457 (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
3458 fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {
3459
3460 /* There's some valid data at the start of the (current) last page
3461 of the file, so zero out the remainder of that page to ensure the
3462 entire page contains valid data. Since there is no invalid range
3463 possible past the (current) eof, there's no need to remove anything
3464 from the invalid range list before calling cluster_write(): */
3465 hfs_unlock(cp);
3466 retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
3467 fp->ff_size, (off_t)0,
3468 (flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
3469 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3470 if (retval) goto Err_Exit;
3471
3472 /* Merely invalidate the remaining area, if necessary: */
3473 if (length > zero_limit) {
3474 microuptime(&tv);
3475 rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
3476 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3477 }
3478 } else {
3479 /* The page containing the (current) eof is invalid: just add the
3480 remainder of the page to the invalid list, along with the area
3481 being newly allocated:
3482 */
3483 microuptime(&tv);
3484 rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
3485 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3486 };
3487 }
3488 } else {
3489 panic("hfs_truncate: invoked on non-UBC object?!");
3490 };
3491 }
3492 if (suppress_times == 0) {
3493 cp->c_touch_modtime = TRUE;
3494 }
3495 fp->ff_size = length;
3496
3497 } else { /* Shorten the size of the file */
3498
3499 // An optimisation for the hibernation file
3500 if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) {
3501 rl_remove_all(&fp->ff_invalidranges);
3502 } else if ((off_t)fp->ff_size > length) {
3503 /* Any space previously marked as invalid is now irrelevant: */
3504 rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
3505 }
3506
3507 /*
3508 * Account for any unmapped blocks. Note that the new
3509 * file length can still end up with unmapped blocks.
3510 */
3511 if (fp->ff_unallocblocks > 0) {
3512 u_int32_t finalblks;
3513 u_int32_t loanedBlocks;
3514
3515 hfs_lock_mount(hfsmp);
3516 loanedBlocks = fp->ff_unallocblocks;
3517 cp->c_blocks -= loanedBlocks;
3518 fp->ff_blocks -= loanedBlocks;
3519 fp->ff_unallocblocks = 0;
3520
3521 hfsmp->loanedBlocks -= loanedBlocks;
3522
3523 finalblks = (length + blksize - 1) / blksize;
3524 if (finalblks > fp->ff_blocks) {
3525 /* calculate required unmapped blocks */
3526 loanedBlocks = finalblks - fp->ff_blocks;
3527 hfsmp->loanedBlocks += loanedBlocks;
3528
3529 fp->ff_unallocblocks = loanedBlocks;
3530 cp->c_blocks += loanedBlocks;
3531 fp->ff_blocks += loanedBlocks;
3532 }
3533 hfs_unlock_mount (hfsmp);
3534 }
3535
3536 #if QUOTA
3537 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
3538 #endif /* QUOTA */
3539 if (hfs_start_transaction(hfsmp) != 0) {
3540 retval = EINVAL;
3541 goto Err_Exit;
3542 }
3543
3544 if (fp->ff_unallocblocks == 0) {
3545 /* Protect extents b-tree and allocation bitmap */
3546 lockflags = SFL_BITMAP;
3547 if (overflow_extents(fp))
3548 lockflags |= SFL_EXTENTS;
3549 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3550
3551 retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
3552 FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
3553
3554 hfs_systemfile_unlock(hfsmp, lockflags);
3555 }
3556 if (hfsmp->jnl) {
3557 if (retval == 0) {
3558 fp->ff_size = length;
3559 }
3560 if (skipupdate) {
3561 (void) hfs_minorupdate(vp);
3562 }
3563 else {
3564 (void) hfs_update(vp, TRUE);
3565 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3566 }
3567 }
3568 hfs_end_transaction(hfsmp);
3569
3570 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3571 if (retval)
3572 goto Err_Exit;
3573 #if QUOTA
3574 /* These are bytesreleased */
3575 (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
3576 #endif /* QUOTA */
3577
3578 /*
3579 * Only set update flag if the logical length changes & we aren't
3580 * suppressing modtime updates.
3581 */
3582 if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
3583 cp->c_touch_modtime = TRUE;
3584 }
3585 fp->ff_size = length;
3586 }
3587 if (cp->c_mode & (S_ISUID | S_ISGID)) {
3588 if (!vfs_context_issuser(context)) {
3589 cp->c_mode &= ~(S_ISUID | S_ISGID);
3590 skipupdate = 0;
3591 }
3592 }
3593 if (skipupdate) {
3594 retval = hfs_minorupdate(vp);
3595 }
3596 else {
3597 cp->c_touch_chgtime = TRUE; /* status changed */
3598 if (suppress_times == 0) {
3599 cp->c_touch_modtime = TRUE; /* file data was modified */
3600
3601 /*
3602 * If we are not suppressing the modtime update, then
3603 * update the gen count as well.
3604 */
3605 if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
3606 hfs_incr_gencount(cp);
3607 }
3608 }
3609
3610 retval = hfs_update(vp, MNT_WAIT);
3611 }
3612 if (retval) {
3613 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3614 -1, -1, -1, retval, 0);
3615 }
3616
3617 Err_Exit:
3618
3619 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END,
3620 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
3621
3622 return (retval);
3623 }
3624
3625 /*
3626 * Preparation which must be done prior to deleting the catalog record
3627 * of a file or directory. In order to make the on-disk state as safe as possible,
3628 * we remove the catalog entry before releasing the bitmap blocks and the
3629 * overflow extent records. However, some work must be done prior to deleting
3630 * the catalog record.
3631 *
3632 * When calling this function, the cnode must exist both in memory and on-disk.
3633 * If there are both resource fork and data fork vnodes, this function should
3634 * be called on both.
3635 */
3636
3637 int
3638 hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
3639
3640 struct filefork *fp = VTOF(vp);
3641 struct cnode *cp = VTOC(vp);
3642 #if QUOTA
3643 int retval = 0;
3644 #endif /* QUOTA */
3645
3646 /* Cannot truncate an HFS directory! */
3647 if (vnode_isdir(vp)) {
3648 return (EISDIR);
3649 }
3650
3651 /*
3652 * See the comment below in hfs_truncate for why we need to call
3653 * setsize here. Essentially we want to avoid pending IO if we
3654 * already know that the blocks are going to be released here.
3655 * This function is only called when totally removing all storage for a file, so
3656 * we can take a shortcut and immediately setsize (0);
3657 */
3658 ubc_setsize(vp, 0);
3659
3660 /* This should only happen with a corrupt filesystem */
3661 if ((off_t)fp->ff_size < 0)
3662 return (EINVAL);
3663
3664 /*
3665 * We cannot just check if fp->ff_size == length (as an optimization)
3666 * since there may be extra physical blocks that also need truncation.
3667 */
3668 #if QUOTA
3669 if ((retval = hfs_getinoquota(cp))) {
3670 return(retval);
3671 }
3672 #endif /* QUOTA */
3673
3674 /* Wipe out any invalid ranges which have yet to be backed by disk */
3675 rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
3676
3677 /*
3678 * Account for any unmapped blocks. Since we're deleting the
3679 * entire file, we don't have to worry about just shrinking
3680 * to a smaller number of borrowed blocks.
3681 */
3682 if (fp->ff_unallocblocks > 0) {
3683 u_int32_t loanedBlocks;
3684
3685 hfs_lock_mount (hfsmp);
3686 loanedBlocks = fp->ff_unallocblocks;
3687 cp->c_blocks -= loanedBlocks;
3688 fp->ff_blocks -= loanedBlocks;
3689 fp->ff_unallocblocks = 0;
3690
3691 hfsmp->loanedBlocks -= loanedBlocks;
3692
3693 hfs_unlock_mount (hfsmp);
3694 }
3695
3696 return 0;
3697 }
3698
3699
3700 /*
3701 * Special wrapper around calling TruncateFileC. This function is useable
3702 * even when the catalog record does not exist any longer, making it ideal
3703 * for use when deleting a file. The simplification here is that we know
3704 * that we are releasing all blocks.
3705 *
3706 * Note that this function may be called when there is no vnode backing
3707 * the file fork in question. We may call this from hfs_vnop_inactive
3708 * to clear out resource fork data (and may not want to clear out the data
3709 * fork yet). As a result, we pointer-check both sets of inputs before
3710 * doing anything with them.
3711 *
3712 * The caller is responsible for saving off a copy of the filefork(s)
3713 * embedded within the cnode prior to calling this function. The pointers
3714 * supplied as arguments must be valid even if the cnode is no longer valid.
3715 */
3716
3717 int
3718 hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
3719 struct filefork *rsrcfork, u_int32_t fileid) {
3720
3721 off_t filebytes;
3722 u_int32_t fileblocks;
3723 int blksize = 0;
3724 int error = 0;
3725 int lockflags;
3726
3727 blksize = hfsmp->blockSize;
3728
3729 /* Data Fork */
3730 if (datafork) {
3731 datafork->ff_size = 0;
3732
3733 fileblocks = datafork->ff_blocks;
3734 filebytes = (off_t)fileblocks * (off_t)blksize;
3735
3736 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3737
3738 while (filebytes > 0) {
3739 if (filebytes > HFS_BIGFILE_SIZE) {
3740 filebytes -= HFS_BIGFILE_SIZE;
3741 } else {
3742 filebytes = 0;
3743 }
3744
3745 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3746 if (hfs_start_transaction(hfsmp) != 0) {
3747 error = EINVAL;
3748 break;
3749 }
3750
3751 if (datafork->ff_unallocblocks == 0) {
3752 /* Protect extents b-tree and allocation bitmap */
3753 lockflags = SFL_BITMAP;
3754 if (overflow_extents(datafork))
3755 lockflags |= SFL_EXTENTS;
3756 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3757
3758 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
3759
3760 hfs_systemfile_unlock(hfsmp, lockflags);
3761 }
3762 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3763
3764 /* Finish the transaction and start over if necessary */
3765 hfs_end_transaction(hfsmp);
3766
3767 if (error) {
3768 break;
3769 }
3770 }
3771 }
3772
3773 /* Resource fork */
3774 if (error == 0 && rsrcfork) {
3775 rsrcfork->ff_size = 0;
3776
3777 fileblocks = rsrcfork->ff_blocks;
3778 filebytes = (off_t)fileblocks * (off_t)blksize;
3779
3780 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3781
3782 while (filebytes > 0) {
3783 if (filebytes > HFS_BIGFILE_SIZE) {
3784 filebytes -= HFS_BIGFILE_SIZE;
3785 } else {
3786 filebytes = 0;
3787 }
3788
3789 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3790 if (hfs_start_transaction(hfsmp) != 0) {
3791 error = EINVAL;
3792 break;
3793 }
3794
3795 if (rsrcfork->ff_unallocblocks == 0) {
3796 /* Protect extents b-tree and allocation bitmap */
3797 lockflags = SFL_BITMAP;
3798 if (overflow_extents(rsrcfork))
3799 lockflags |= SFL_EXTENTS;
3800 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3801
3802 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
3803
3804 hfs_systemfile_unlock(hfsmp, lockflags);
3805 }
3806 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3807
3808 /* Finish the transaction and start over if necessary */
3809 hfs_end_transaction(hfsmp);
3810
3811 if (error) {
3812 break;
3813 }
3814 }
3815 }
3816
3817 return error;
3818 }
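/*
 * A rough sketch of the call sequence the two routines above are designed
 * for (locking, error handling and the actual catalog delete are elided,
 * and the local names below are illustrative only):
 *
 *	struct filefork dfork;
 *
 *	hfs_prepare_release_storage(hfsmp, data_vp);	// and rsrc_vp, if present
 *	dfork = *VTOF(data_vp);				// save fork state; the cnode
 *							// may be gone by release time
 *	// ... remove the catalog record for fileid ...
 *	hfs_release_storage(hfsmp, &dfork, NULL, fileid);
 */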
3819
3820 errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
3821 {
3822 errno_t error;
3823
3824 /*
3825 * Call ubc_setsize to give the VM subsystem a chance to do
3826 * whatever it needs to with existing pages before we delete
3827 * blocks. Note that symlinks don't use the UBC so we'll
3828 * get back ENOENT in that case.
3829 */
3830 if (have_cnode_lock) {
3831 error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
3832 if (error == EAGAIN) {
3833 cnode_t *cp = VTOC(vp);
3834
3835 if (cp->c_truncatelockowner != current_thread()) {
3836 #if DEVELOPMENT || DEBUG
3837 panic("hfs: hfs_ubc_setsize called without exclusive truncate lock!");
3838 #else
3839 printf("hfs: hfs_ubc_setsize called without exclusive truncate lock!\n");
3840 #endif
3841 }
3842
3843 hfs_unlock(cp);
3844 error = ubc_setsize_ex(vp, len, 0);
3845 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
3846 }
3847 } else
3848 error = ubc_setsize_ex(vp, len, 0);
3849
3850 return error == ENOENT ? 0 : error;
3851 }
3852
3853 /*
3854 * Truncate a cnode to at most length size, freeing (or adding) the
3855 * disk blocks.
3856 */
3857 int
3858 hfs_truncate(struct vnode *vp, off_t length, int flags,
3859 int truncateflags, vfs_context_t context)
3860 {
3861 struct filefork *fp = VTOF(vp);
3862 off_t filebytes;
3863 u_int32_t fileblocks;
3864 int blksize;
3865 errno_t error = 0;
3866 struct cnode *cp = VTOC(vp);
3867
3868 /* Cannot truncate an HFS directory! */
3869 if (vnode_isdir(vp)) {
3870 return (EISDIR);
3871 }
3872 /* A swap file cannot change size. */
3873 if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) {
3874 return (EPERM);
3875 }
3876
3877 blksize = VTOVCB(vp)->blockSize;
3878 fileblocks = fp->ff_blocks;
3879 filebytes = (off_t)fileblocks * (off_t)blksize;
3880
3881 bool caller_has_cnode_lock = (cp->c_lockowner == current_thread());
3882
3883 error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock);
3884 if (error)
3885 return error;
3886
3887 if (!caller_has_cnode_lock) {
3888 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
3889 if (error)
3890 return error;
3891 }
3892
3893 // have to loop truncating or growing files that are
3894 // really big because otherwise transactions can get
3895 // enormous and consume too many kernel resources.
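// For example, shrinking a file that occupies five times HFS_BIGFILE_SIZE
// bytes down to zero takes five passes through the loop below, each pass
// releasing at most HFS_BIGFILE_SIZE bytes so that no single journal
// transaction grows unreasonably large.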
3896
3897 if (length < filebytes) {
3898 while (filebytes > length) {
3899 if ((filebytes - length) > HFS_BIGFILE_SIZE) {
3900 filebytes -= HFS_BIGFILE_SIZE;
3901 } else {
3902 filebytes = length;
3903 }
3904 cp->c_flag |= C_FORCEUPDATE;
3905 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3906 if (error)
3907 break;
3908 }
3909 } else if (length > filebytes) {
3910 while (filebytes < length) {
3911 if ((length - filebytes) > HFS_BIGFILE_SIZE) {
3912 filebytes += HFS_BIGFILE_SIZE;
3913 } else {
3914 filebytes = length;
3915 }
3916 cp->c_flag |= C_FORCEUPDATE;
3917 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3918 if (error)
3919 break;
3920 }
3921 } else /* Same logical size */ {
3922
3923 error = do_hfs_truncate(vp, length, flags, truncateflags, context);
3924 }
3925 /* Files that are changing size are not hot file candidates. */
3926 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
3927 fp->ff_bytesread = 0;
3928 }
3929
3930 if (!caller_has_cnode_lock)
3931 hfs_unlock(cp);
3932
3933 // Make sure UBC's size matches up (in case we didn't completely succeed)
3934 errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock);
3935 if (!error)
3936 error = err2;
3937
3938 return error;
3939 }
3940
3941
3942 /*
3943 * Preallocate file storage space.
3944 */
3945 int
3946 hfs_vnop_allocate(struct vnop_allocate_args /* {
3947 vnode_t a_vp;
3948 off_t a_length;
3949 u_int32_t a_flags;
3950 off_t *a_bytesallocated;
3951 off_t a_offset;
3952 vfs_context_t a_context;
3953 } */ *ap)
3954 {
3955 struct vnode *vp = ap->a_vp;
3956 struct cnode *cp;
3957 struct filefork *fp;
3958 ExtendedVCB *vcb;
3959 off_t length = ap->a_length;
3960 off_t startingPEOF;
3961 off_t moreBytesRequested;
3962 off_t actualBytesAdded;
3963 off_t filebytes;
3964 u_int32_t fileblocks;
3965 int retval, retval2;
3966 u_int32_t blockHint;
3967 u_int32_t extendFlags; /* For call to ExtendFileC */
3968 struct hfsmount *hfsmp;
3969 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
3970 int lockflags;
3971 time_t orig_ctime;
3972
3973 *(ap->a_bytesallocated) = 0;
3974
3975 if (!vnode_isreg(vp))
3976 return (EISDIR);
3977 if (length < (off_t)0)
3978 return (EINVAL);
3979
3980 cp = VTOC(vp);
3981
3982 orig_ctime = VTOC(vp)->c_ctime;
3983
3984 check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
3985
3986 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
3987
3988 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
3989 goto Err_Exit;
3990 }
3991
3992 fp = VTOF(vp);
3993 hfsmp = VTOHFS(vp);
3994 vcb = VTOVCB(vp);
3995
3996 fileblocks = fp->ff_blocks;
3997 filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
3998
3999 if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
4000 retval = EINVAL;
4001 goto Err_Exit;
4002 }
4003
4004 /* Fill in the flags word for the call to Extend the file */
4005
4006 extendFlags = kEFNoClumpMask;
4007 if (ap->a_flags & ALLOCATECONTIG)
4008 extendFlags |= kEFContigMask;
4009 if (ap->a_flags & ALLOCATEALL)
4010 extendFlags |= kEFAllMask;
4011 if (cred && suser(cred, NULL) != 0)
4012 extendFlags |= kEFReserveMask;
4013 if (hfs_virtualmetafile(cp))
4014 extendFlags |= kEFMetadataMask;
4015
4016 retval = E_NONE;
4017 blockHint = 0;
4018 startingPEOF = filebytes;
4019
4020 if (ap->a_flags & ALLOCATEFROMPEOF)
4021 length += filebytes;
4022 else if (ap->a_flags & ALLOCATEFROMVOL)
4023 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
4024
4025 /* If no changes are necessary, then we're done */
4026 if (filebytes == length)
4027 goto Std_Exit;
4028
4029 /*
4030 * Lengthen the size of the file. We must ensure that the
4031 * last byte of the file is allocated. Since the smallest
4032 * value of filebytes is 0, length will be at least 1.
4033 */
4034 if (length > filebytes) {
4035 off_t total_bytes_added = 0, orig_request_size;
4036
4037 orig_request_size = moreBytesRequested = length - filebytes;
4038
4039 #if QUOTA
4040 retval = hfs_chkdq(cp,
4041 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
4042 cred, 0);
4043 if (retval)
4044 goto Err_Exit;
4045
4046 #endif /* QUOTA */
4047 /*
4048 * Metadata zone checks.
4049 */
4050 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
4051 /*
4052 * Allocate Journal and Quota files in metadata zone.
4053 */
4054 if (hfs_virtualmetafile(cp)) {
4055 blockHint = hfsmp->hfs_metazone_start;
4056 } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
4057 (blockHint <= hfsmp->hfs_metazone_end)) {
4058 /*
4059 * Move blockHint outside metadata zone.
4060 */
4061 blockHint = hfsmp->hfs_metazone_end + 1;
4062 }
4063 }
4064
4065
4066 while ((length > filebytes) && (retval == E_NONE)) {
4067 off_t bytesRequested;
4068
4069 if (hfs_start_transaction(hfsmp) != 0) {
4070 retval = EINVAL;
4071 goto Err_Exit;
4072 }
4073
4074 /* Protect extents b-tree and allocation bitmap */
4075 lockflags = SFL_BITMAP;
4076 if (overflow_extents(fp))
4077 lockflags |= SFL_EXTENTS;
4078 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4079
4080 if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
4081 bytesRequested = HFS_BIGFILE_SIZE;
4082 } else {
4083 bytesRequested = moreBytesRequested;
4084 }
4085
4086 if (extendFlags & kEFContigMask) {
4087 // if we're on a sparse device, this will force it to do a
4088 // full scan to find the space needed.
4089 hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
4090 }
4091
4092 retval = MacToVFSError(ExtendFileC(vcb,
4093 (FCB*)fp,
4094 bytesRequested,
4095 blockHint,
4096 extendFlags,
4097 &actualBytesAdded));
4098
4099 if (retval == E_NONE) {
4100 *(ap->a_bytesallocated) += actualBytesAdded;
4101 total_bytes_added += actualBytesAdded;
4102 moreBytesRequested -= actualBytesAdded;
4103 if (blockHint != 0) {
4104 blockHint += actualBytesAdded / vcb->blockSize;
4105 }
4106 }
4107 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4108
4109 hfs_systemfile_unlock(hfsmp, lockflags);
4110
4111 if (hfsmp->jnl) {
4112 (void) hfs_update(vp, TRUE);
4113 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4114 }
4115
4116 hfs_end_transaction(hfsmp);
4117 }
4118
4119
4120 /*
4121 * if we get an error and no changes were made then exit
4122 * otherwise we must do the hfs_update to reflect the changes
4123 */
4124 if (retval && (startingPEOF == filebytes))
4125 goto Err_Exit;
4126
4127 /*
4128 * Adjust actualBytesAdded to be allocation block aligned, not
4129 * clump size aligned.
4130 * NOTE: what we report here does not affect reality
4131 * until the file is closed, when we truncate the file to allocation
4132 * block size.
4133 */
4134 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
4135 *(ap->a_bytesallocated) =
4136 roundup(orig_request_size, (off_t)vcb->blockSize);
4137
4138 } else { /* Shorten the size of the file */
4139
4140 /*
4141 * N.B. At present, this code is never called. If and when we
4142 * do start using it, it looks like there might be slightly
4143 * strange semantics with the file size: it's possible for the
4144 * file size to *increase* e.g. if current file size is 5,
4145 * length is 1024 and filebytes is 4096, the file size will
4146 * end up being 1024 bytes. This isn't necessarily a problem
4147 * but it's not consistent with the code above which doesn't
4148 * change the file size.
4149 */
4150
4151 retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
4152 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4153
4154 /*
4155 * if we get an error and no changes were made then exit
4156 * otherwise we must do the hfs_update to reflect the changes
4157 */
4158 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
4159 #if QUOTA
4160 /* These are bytesreleased */
4161 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
4162 #endif /* QUOTA */
4163
4164 if (fp->ff_size > filebytes) {
4165 fp->ff_size = filebytes;
4166
4167 hfs_ubc_setsize(vp, fp->ff_size, true);
4168 }
4169 }
4170
4171 Std_Exit:
4172 cp->c_touch_chgtime = TRUE;
4173 cp->c_touch_modtime = TRUE;
4174 retval2 = hfs_update(vp, MNT_WAIT);
4175
4176 if (retval == 0)
4177 retval = retval2;
4178 Err_Exit:
4179 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4180 hfs_unlock(cp);
4181 return (retval);
4182 }
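/*
 * For reference, the usual userland path into this vnop is fcntl(2) with
 * F_PREALLOCATE.  A minimal sketch (the field values are only an example):
 *
 *	fstore_t fst = {
 *		.fst_flags   = F_ALLOCATECONTIG | F_ALLOCATEALL,
 *		.fst_posmode = F_PEOFPOSMODE,	// allocate from the physical EOF
 *		.fst_offset  = 0,
 *		.fst_length  = 16 * 1024 * 1024,
 *	};
 *	if (fcntl(fd, F_PREALLOCATE, &fst) != -1)
 *		// fst.fst_bytesalloc reports what was actually reserved
 *
 * which reaches this function with ALLOCATECONTIG, ALLOCATEALL and
 * ALLOCATEFROMPEOF set in ap->a_flags.
 */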
4183
4184
4185 /*
4186 * Pagein for HFS filesystem
4187 */
4188 int
4189 hfs_vnop_pagein(struct vnop_pagein_args *ap)
4190 /*
4191 struct vnop_pagein_args {
4192 vnode_t a_vp,
4193 upl_t a_pl,
4194 vm_offset_t a_pl_offset,
4195 off_t a_f_offset,
4196 size_t a_size,
4197 int a_flags
4198 vfs_context_t a_context;
4199 };
4200 */
4201 {
4202 vnode_t vp;
4203 struct cnode *cp;
4204 struct filefork *fp;
4205 int error = 0;
4206 upl_t upl;
4207 upl_page_info_t *pl;
4208 off_t f_offset;
4209 off_t page_needed_f_offset;
4210 int offset;
4211 int isize;
4212 int upl_size;
4213 int pg_index;
4214 boolean_t truncate_lock_held = FALSE;
4215 boolean_t file_converted = FALSE;
4216 kern_return_t kret;
4217
4218 vp = ap->a_vp;
4219 cp = VTOC(vp);
4220 fp = VTOF(vp);
4221
4222 #if CONFIG_PROTECT
4223 if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
4224 /*
4225 * If we errored here, then this means that one of two things occurred:
4226 * 1. there was a problem with the decryption of the key.
4227 * 2. the device is locked and we are not allowed to access this particular file.
4228 *
4229 * Either way, this means that we need to shut down this upl now. As long as
4230 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves),
4231 * we create a UPL and immediately abort it.
4232 */
4233 if (ap->a_pl == NULL) {
4234 /* create the upl */
4235 ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl,
4236 UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4237 /* mark the range as needed so it doesn't immediately get discarded upon abort */
4238 ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
4239
4240 /* Abort the range */
4241 ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4242 }
4243
4244
4245 return error;
4246 }
4247 #endif /* CONFIG_PROTECT */
4248
4249 if (ap->a_pl != NULL) {
4250 /*
4251 * this can only happen for swap files now that
4252 * we're asking for V2 paging behavior...
4253 * so don't need to worry about decompression, or
4254 * keeping track of blocks read or taking the truncate lock
4255 */
4256 error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
4257 ap->a_size, (off_t)fp->ff_size, ap->a_flags);
4258 goto pagein_done;
4259 }
4260
4261 page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;
4262
4263 retry_pagein:
4264 /*
4265 * take truncate lock (shared/recursive) to guard against
4266 * zero-fill thru fsync interfering, but only for v2
4267 *
4268 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
4269 * lock shared and we are allowed to recurse 1 level if this thread already
4270 * owns the lock exclusively... this can legally occur
4271 * if we are doing a shrinking ftruncate against a file
4272 * that is mapped private, and the pages being truncated
4273 * do not currently exist in the cache... in that case
4274 * we will have to page-in the missing pages in order
4275 * to provide them to the private mapping... we must
4276 * also call hfs_unlock_truncate with a positive been_recursed
4277 * arg to indicate that if we have recursed, there is no need to drop
4278 * the lock. Allowing this simple recursion is necessary
4279 * in order to avoid a certain deadlock... since the ftruncate
4280 * already holds the truncate lock exclusively, if we try
4281 * to acquire it shared to protect the pagein path, we will
4282 * hang this thread
4283 *
4284 * NOTE: The if () block below is a workaround in order to prevent a
4285 * VM deadlock. See rdar://7853471.
4286 *
4287 * If we are in a forced unmount, then launchd will still have the
4288 * dyld_shared_cache file mapped as it is trying to reboot. If we
4289 * take the truncate lock here to service a page fault, then our
4290 * thread could deadlock with the forced-unmount. The forced unmount
4291 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
4292 * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount
4293 * thread will think it needs to copy all of the data out of the file
4294 * and into a VM copy object. If we hold the cnode lock here, then that
4295 * VM operation will not be able to proceed, because we'll set a busy page
4296 * before attempting to grab the lock. Note that this isn't as simple as "don't
4297 * call ubc_setsize" because doing that would just shift the problem to the
4298 * ubc_msync done before the vnode is reclaimed.
4299 *
4300 * So, if a forced unmount on this volume is in flight AND the cnode is
4301 * marked C_DELETED, then just go ahead and do the page in without taking
4302 * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file
4303 * that is not going to be available on the next mount, this seems like an
4304 * OK solution from a correctness point of view, even though it is hacky.
4305 */
4306 if (vfs_isforce(vp->v_mount)) {
4307 if (cp->c_flag & C_DELETED) {
4308 /* If we don't get it, then just go ahead and operate without the lock */
4309 truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4310 }
4311 }
4312 else {
4313 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4314 truncate_lock_held = TRUE;
4315 }
4316
4317 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4318
4319 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4320 error = EINVAL;
4321 goto pagein_done;
4322 }
4323 ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
4324
4325 upl_size = isize = ap->a_size;
4326
4327 /*
4328 * Scan from the back to find the last page in the UPL, so that we
4329 * aren't looking at a UPL that may have already been freed by the
4330 * preceding aborts/completions.
4331 */
4332 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4333 if (upl_page_present(pl, --pg_index))
4334 break;
4335 if (pg_index == 0) {
4336 /*
4337 * no absent pages were found in the range specified
4338 * just abort the UPL to get rid of it and then we're done
4339 */
4340 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4341 goto pagein_done;
4342 }
4343 }
4344 /*
4345 * initialize the offset variables before we touch the UPL.
4346 * f_offset is the position into the file, in bytes
4347 * offset is the position into the UPL, in bytes
4348 * pg_index is the pg# of the UPL we're operating on
4349 * isize is the offset into the UPL of the last page that is present.
4350 */
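/*
 * For example, if ap->a_size covers 8 pages and only slots 2 and 5 of the
 * UPL actually hold pages (the rest were already resident), the backwards
 * scan above leaves pg_index at 5, isize becomes 6 * PAGE_SIZE, and the
 * loop below skips the empty slots and issues a cluster_pagein (or
 * decmpfs pagein) for each run of present pages.
 */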
4351 isize = ((pg_index + 1) * PAGE_SIZE);
4352 pg_index = 0;
4353 offset = 0;
4354 f_offset = ap->a_f_offset;
4355
4356 while (isize) {
4357 int xsize;
4358 int num_of_pages;
4359
4360 if ( !upl_page_present(pl, pg_index)) {
4361 /*
4362 * we asked for RET_ONLY_ABSENT, so it's possible
4363 * to get back empty slots in the UPL.
4364 * just skip over them
4365 */
4366 f_offset += PAGE_SIZE;
4367 offset += PAGE_SIZE;
4368 isize -= PAGE_SIZE;
4369 pg_index++;
4370
4371 continue;
4372 }
4373 /*
4374 * We know that we have at least one absent page.
4375 * Now checking to see how many in a row we have
4376 */
4377 num_of_pages = 1;
4378 xsize = isize - PAGE_SIZE;
4379
4380 while (xsize) {
4381 if ( !upl_page_present(pl, pg_index + num_of_pages))
4382 break;
4383 num_of_pages++;
4384 xsize -= PAGE_SIZE;
4385 }
4386 xsize = num_of_pages * PAGE_SIZE;
4387
4388 #if HFS_COMPRESSION
4389 if (VNODE_IS_RSRC(vp)) {
4390 /* allow pageins of the resource fork */
4391 } else {
4392 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
4393
4394 if (compressed) {
4395
4396 if (truncate_lock_held) {
4397 /*
4398 * can't hold the truncate lock when calling into the decmpfs layer
4399 * since it calls back into this layer... even though we're only
4400 * holding the lock in shared mode, and the re-entrant path only
4401 * takes the lock shared, we can deadlock if some other thread
4402 * tries to grab the lock exclusively in between.
4403 */
4404 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4405 truncate_lock_held = FALSE;
4406 }
4407 ap->a_pl = upl;
4408 ap->a_pl_offset = offset;
4409 ap->a_f_offset = f_offset;
4410 ap->a_size = xsize;
4411
4412 error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
4413 /*
4414 * note that decmpfs_pagein_compressed can change the state of
4415 * 'compressed'... it will set it to 0 if the file is no longer
4416 * compressed once the compression lock is successfully taken
4417 * i.e. we would block on that lock while the file is being inflated
4418 */
4419 if (compressed) {
4420 if (error == 0) {
4421 /* successful page-in, update the access time */
4422 VTOC(vp)->c_touch_acctime = TRUE;
4423
4424 /* compressed files are not hot file candidates */
4425 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4426 fp->ff_bytesread = 0;
4427 }
4428 } else if (error == EAGAIN) {
4429 /*
4430 * EAGAIN indicates someone else already holds the compression lock...
4431 * to avoid deadlocking, we'll abort this range of pages with an
4432 * indication that the pagein needs to be redriven
4433 */
4434 ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
4435 } else if (error == ENOSPC) {
4436
4437 if (upl_size == PAGE_SIZE)
4438 panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");
4439
4440 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4441
4442 ap->a_size = PAGE_SIZE;
4443 ap->a_pl = NULL;
4444 ap->a_pl_offset = 0;
4445 ap->a_f_offset = page_needed_f_offset;
4446
4447 goto retry_pagein;
4448 }
4449 goto pagein_next_range;
4450 }
4451 else {
4452 /*
4453 * Set file_converted only if the file became decompressed while we were
4454 * paging in. If it were still compressed, we would re-start the loop using the goto
4455 * in the above block. This avoids overloading truncate_lock_held as our retry_pagein
4456 * condition below, since we could have avoided taking the truncate lock to prevent
4457 * a deadlock in the force unmount case.
4458 */
4459 file_converted = TRUE;
4460 }
4461 }
4462 if (file_converted == TRUE) {
4463 /*
4464 * the file was converted back to a regular file after we first saw it as compressed
4465 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
4466 * reset a_size so that we consider what remains of the original request
4467 * and null out a_upl and a_pl_offset.
4468 *
4469 * We should only be able to get into this block if the decmpfs_pagein_compressed
4470 * successfully decompressed the range in question for this file.
4471 */
4472 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4473
4474 ap->a_size = isize;
4475 ap->a_pl = NULL;
4476 ap->a_pl_offset = 0;
4477
4478 /* Reset file_converted back to false so that we don't infinite-loop. */
4479 file_converted = FALSE;
4480 goto retry_pagein;
4481 }
4482 }
4483 #endif
4484 error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
4485
4486 /*
4487 * Keep track of blocks read.
4488 */
4489 if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
4490 int bytesread;
4491 int took_cnode_lock = 0;
4492
4493 if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
4494 bytesread = fp->ff_size;
4495 else
4496 bytesread = xsize;
4497
4498 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
4499 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
4500 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4501 took_cnode_lock = 1;
4502 }
4503 /*
4504 * If this file hasn't been seen since the start of
4505 * the current sampling period then start over.
4506 */
4507 if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
4508 struct timeval tv;
4509
4510 fp->ff_bytesread = bytesread;
4511 microtime(&tv);
4512 cp->c_atime = tv.tv_sec;
4513 } else {
4514 fp->ff_bytesread += bytesread;
4515 }
4516 cp->c_touch_acctime = TRUE;
4517 if (took_cnode_lock)
4518 hfs_unlock(cp);
4519 }
4520 pagein_next_range:
4521 f_offset += xsize;
4522 offset += xsize;
4523 isize -= xsize;
4524 pg_index += num_of_pages;
4525
4526 error = 0;
4527 }
4528
4529 pagein_done:
4530 if (truncate_lock_held == TRUE) {
4531 /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
4532 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4533 }
4534
4535 return (error);
4536 }
4537
4538 /*
4539 * Pageout for HFS filesystem.
4540 */
4541 int
4542 hfs_vnop_pageout(struct vnop_pageout_args *ap)
4543 /*
4544 struct vnop_pageout_args {
4545 vnode_t a_vp,
4546 upl_t a_pl,
4547 vm_offset_t a_pl_offset,
4548 off_t a_f_offset,
4549 size_t a_size,
4550 int a_flags
4551 vfs_context_t a_context;
4552 };
4553 */
4554 {
4555 vnode_t vp = ap->a_vp;
4556 struct cnode *cp;
4557 struct filefork *fp;
4558 int retval = 0;
4559 off_t filesize;
4560 upl_t upl;
4561 upl_page_info_t* pl;
4562 vm_offset_t a_pl_offset;
4563 int a_flags;
4564 int is_pageoutv2 = 0;
4565 kern_return_t kret;
4566
4567 cp = VTOC(vp);
4568 fp = VTOF(vp);
4569
4570 /*
4571 * Figure out where the file ends, for pageout purposes. If
4572 * ff_new_size > ff_size, then we're in the middle of extending the
4573 * file via a write, so it is safe (and necessary) that we be able
4574 * to pageout up to that point.
4575 */
4576 filesize = fp->ff_size;
4577 if (fp->ff_new_size > filesize)
4578 filesize = fp->ff_new_size;
4579
4580 a_flags = ap->a_flags;
4581 a_pl_offset = ap->a_pl_offset;
4582
4583 /*
4584 * we can tell if we're getting the new or old behavior from the UPL
4585 */
4586 if ((upl = ap->a_pl) == NULL) {
4587 int request_flags;
4588
4589 is_pageoutv2 = 1;
4590 /*
4591 * we're in control of any UPL we commit
4592 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
4593 */
4594 a_flags &= ~UPL_NOCOMMIT;
4595 a_pl_offset = 0;
4596
4597 /*
4598 * For V2 semantics, we want to take the cnode truncate lock
4599 * shared to guard against the file size changing via zero-filling.
4600 *
4601 * However, we have to be careful because we may be invoked
4602 * via the ubc_msync path to write out dirty mmap'd pages
4603 * in response to a lock event on a content-protected
4604 * filesystem (e.g. to write out class A files).
4605 * As a result, we want to take the truncate lock 'SHARED' with
4606 * the mini-recursion locktype so that we don't deadlock/panic
4607 * because we may be already holding the truncate lock exclusive to force any other
4608 * IOs to have blocked behind us.
4609 */
4610 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4611
4612 if (a_flags & UPL_MSYNC) {
4613 request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
4614 }
4615 else {
4616 request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
4617 }
4618
4619 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
4620
4621 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4622 retval = EINVAL;
4623 goto pageout_done;
4624 }
4625 }
4626 /*
4627 * from this point forward upl points at the UPL we're working with;
4628 * it was either passed in or we successfully created it
4629 */
4630
4631 /*
4632 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
4633 * UPL instead of relying on the UPL passed into us. We go ahead and do that here,
4634 * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for
4635 * N dirty ranges in the UPL. Note that this is almost a direct copy of the
4636 * logic in vnode_pageout except that we need to do it after grabbing the truncate
4637 * lock in HFS so that we don't lock invert ourselves.
4638 *
4639 * Note that we can still get into this function on behalf of the default pager with
4640 * non-V2 behavior (swapfiles). However in that case, we did not grab locks above
4641 * since fsync and other writing threads will grab the locks, then mark the
4642 * relevant pages as busy. But the pageout codepath marks the pages as busy,
4643 * and THEN would attempt to grab the truncate lock, which would result in deadlock. So
4644 * we do not try to grab anything for the pre-V2 case, which should only be accessed
4645 * by the paging/VM system.
4646 */
4647
4648 if (is_pageoutv2) {
4649 off_t f_offset;
4650 int offset;
4651 int isize;
4652 int pg_index;
4653 int error;
4654 int error_ret = 0;
4655
4656 isize = ap->a_size;
4657 f_offset = ap->a_f_offset;
4658
4659 /*
4660 * Scan from the back to find the last page in the UPL, so that we
4661 * aren't looking at a UPL that may have already been freed by the
4662 * preceding aborts/completions.
4663 */
4664 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4665 if (upl_page_present(pl, --pg_index))
4666 break;
4667 if (pg_index == 0) {
4668 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4669 goto pageout_done;
4670 }
4671 }
4672
4673 /*
4674 * initialize the offset variables before we touch the UPL.
4675 * a_f_offset is the position into the file, in bytes
4676 * offset is the position into the UPL, in bytes
4677 * pg_index is the pg# of the UPL we're operating on.
4678 * isize is the offset into the UPL of the last non-clean page.
4679 */
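/*
 * For example, if the UPL spans 8 pages and only slots 2 and 5 hold dirty
 * pages, the backwards scan above leaves pg_index at 5, isize becomes
 * 6 * PAGE_SIZE, and the loop below ends up issuing two separate
 * cluster_pageout calls, one per run of dirty pages.
 */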
4680 isize = ((pg_index + 1) * PAGE_SIZE);
4681
4682 offset = 0;
4683 pg_index = 0;
4684
4685 while (isize) {
4686 int xsize;
4687 int num_of_pages;
4688
4689 if ( !upl_page_present(pl, pg_index)) {
4690 /*
4691 * we asked for RET_ONLY_DIRTY, so it's possible
4692 * to get back empty slots in the UPL.
4693 * just skip over them
4694 */
4695 f_offset += PAGE_SIZE;
4696 offset += PAGE_SIZE;
4697 isize -= PAGE_SIZE;
4698 pg_index++;
4699
4700 continue;
4701 }
4702 if ( !upl_dirty_page(pl, pg_index)) {
4703 panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
4704 }
4705
4706 /*
4707 * We know that we have at least one dirty page.
4708 * Now checking to see how many in a row we have
4709 */
4710 num_of_pages = 1;
4711 xsize = isize - PAGE_SIZE;
4712
4713 while (xsize) {
4714 if ( !upl_dirty_page(pl, pg_index + num_of_pages))
4715 break;
4716 num_of_pages++;
4717 xsize -= PAGE_SIZE;
4718 }
4719 xsize = num_of_pages * PAGE_SIZE;
4720
4721 if (!vnode_isswap(vp)) {
4722 off_t end_of_range;
4723 int tooklock;
4724
4725 tooklock = 0;
4726
4727 if (cp->c_lockowner != current_thread()) {
4728 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4729 /*
4730 * we're in the v2 path, so we are the
4731 * owner of the UPL... we may have already
4732 * processed some of the UPL, so abort it
4733 * from the current working offset to the
4734 * end of the UPL
4735 */
4736 ubc_upl_abort_range(upl,
4737 offset,
4738 ap->a_size - offset,
4739 UPL_ABORT_FREE_ON_EMPTY);
4740 goto pageout_done;
4741 }
4742 tooklock = 1;
4743 }
4744 end_of_range = f_offset + xsize - 1;
4745
4746 if (end_of_range >= filesize) {
4747 end_of_range = (off_t)(filesize - 1);
4748 }
4749 if (f_offset < filesize) {
4750 rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
4751 cp->c_flag |= C_MODIFIED; /* leof is dirty */
4752 }
4753 if (tooklock) {
4754 hfs_unlock(cp);
4755 }
4756 }
4757 if ((error = cluster_pageout(vp, upl, offset, f_offset,
4758 xsize, filesize, a_flags))) {
4759 if (error_ret == 0)
4760 error_ret = error;
4761 }
4762 f_offset += xsize;
4763 offset += xsize;
4764 isize -= xsize;
4765 pg_index += num_of_pages;
4766 }
4767 /* capture errnos bubbled out of cluster_pageout if they occurred */
4768 if (error_ret != 0) {
4769 retval = error_ret;
4770 }
4771 } /* end block for v2 pageout behavior */
4772 else {
4773 if (!vnode_isswap(vp)) {
4774 off_t end_of_range;
4775 int tooklock = 0;
4776
4777 if (cp->c_lockowner != current_thread()) {
4778 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4779 if (!(a_flags & UPL_NOCOMMIT)) {
4780 ubc_upl_abort_range(upl,
4781 a_pl_offset,
4782 ap->a_size,
4783 UPL_ABORT_FREE_ON_EMPTY);
4784 }
4785 goto pageout_done;
4786 }
4787 tooklock = 1;
4788 }
4789 end_of_range = ap->a_f_offset + ap->a_size - 1;
4790
4791 if (end_of_range >= filesize) {
4792 end_of_range = (off_t)(filesize - 1);
4793 }
4794 if (ap->a_f_offset < filesize) {
4795 rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
4796 cp->c_flag |= C_MODIFIED; /* leof is dirty */
4797 }
4798
4799 if (tooklock) {
4800 hfs_unlock(cp);
4801 }
4802 }
4803 /*
4804 * just call cluster_pageout for old pre-v2 behavior
4805 */
4806 retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
4807 ap->a_size, filesize, a_flags);
4808 }
4809
4810 /*
4811 * If data was written, update the modification time of the file
4812 * but only if it's mapped writable; we will have touched the
4813 * modification time for direct writes.
4814 */
4815 if (retval == 0 && (ubc_is_mapped_writable(vp)
4816 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
4817 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4818
4819 // Check again with lock
4820 bool mapped_writable = ubc_is_mapped_writable(vp);
4821 if (mapped_writable
4822 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
4823 cp->c_touch_modtime = TRUE;
4824 cp->c_touch_chgtime = TRUE;
4825
4826 /*
4827 * We only need to increment the generation counter if
4828 * it's currently mapped writable because we incremented
4829 * the counter in hfs_vnop_mnomap.
4830 */
4831 if (mapped_writable)
4832 hfs_incr_gencount(VTOC(vp));
4833
4834 /*
4835 * If setuid or setgid bits are set and this process is
4836 * not the superuser then clear the setuid and setgid bits
4837 * as a precaution against tampering.
4838 */
4839 if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
4840 (vfs_context_suser(ap->a_context) != 0)) {
4841 cp->c_mode &= ~(S_ISUID | S_ISGID);
4842 }
4843 }
4844
4845 hfs_unlock(cp);
4846 }
4847
4848 pageout_done:
4849 if (is_pageoutv2) {
4850 /*
4851 * Release the truncate lock. Note that because
4852 * we may have taken the lock recursively by
4853 * being invoked via ubc_msync due to lockdown,
4854 * we should release it recursively, too.
4855 */
4856 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4857 }
4858 return (retval);
4859 }
4860
4861 /*
4862 * Intercept B-Tree node writes to unswap them if necessary.
4863 */
4864 int
4865 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
4866 {
4867 int retval = 0;
4868 register struct buf *bp = ap->a_bp;
4869 register struct vnode *vp = buf_vnode(bp);
4870 BlockDescriptor block;
4871
4872 /* Trap B-Tree writes */
4873 if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
4874 (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
4875 (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
4876 (vp == VTOHFS(vp)->hfc_filevp)) {
4877
4878 /*
4879 * Swap and validate the node if it is in native byte order.
4880 * This is always true on big endian, so we always validate
4881 * before writing here. On little endian, the node typically has
4882 * been swapped and validated when it was written to the journal,
4883 * so we won't do anything here.
4884 */
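/*
 * The last two bytes of a B-tree node hold the offset of the node's first
 * record, which is always sizeof(BTNodeDescriptor), i.e. 14 (0x000e), so
 * seeing 0x000e there in host byte order identifies a node that is still
 * in native byte order.
 */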
4885 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
4886 /* Prepare the block pointer */
4887 block.blockHeader = bp;
4888 block.buffer = (char *)buf_dataptr(bp);
4889 block.blockNum = buf_lblkno(bp);
4890 /* not found in cache ==> came from disk */
4891 block.blockReadFromDisk = (buf_fromcache(bp) == 0);
4892 block.blockSize = buf_count(bp);
4893
4894 /* Endian un-swap B-Tree node */
4895 retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
4896 if (retval)
4897 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
4898 }
4899 }
4900
4901 /* This buffer shouldn't be locked anymore but if it is clear it */
4902 if ((buf_flags(bp) & B_LOCKED)) {
4903 // XXXdbg
4904 if (VTOHFS(vp)->jnl) {
4905 panic("hfs: CLEARING the lock bit on bp %p\n", bp);
4906 }
4907 buf_clearflags(bp, B_LOCKED);
4908 }
4909 retval = vn_bwrite (ap);
4910
4911 return (retval);
4912 }
4913
4914 /*
4915 * Relocate a file to a new location on disk
4916 * cnode must be locked on entry
4917 *
4918 * Relocation occurs by cloning the file's data from its
4919 * current set of blocks to a new set of blocks. During
4920 * the relocation all of the blocks (old and new) are
4921 * owned by the file.
4922 *
4923 * -----------------
4924 * |///////////////|
4925 * -----------------
4926 * 0 N (file offset)
4927 *
4928 * ----------------- -----------------
4929 * |///////////////| | | STEP 1 (acquire new blocks)
4930 * ----------------- -----------------
4931 * 0 N N+1 2N
4932 *
4933 * ----------------- -----------------
4934 * |///////////////| |///////////////| STEP 2 (clone data)
4935 * ----------------- -----------------
4936 * 0 N N+1 2N
4937 *
4938 * -----------------
4939 * |///////////////| STEP 3 (head truncate blocks)
4940 * -----------------
4941 * 0 N
4942 *
4943 * During steps 2 and 3 page-outs to file offsets less
4944 * than or equal to N are suspended.
4945 *
4946 * During step 3 page-ins to the file get suspended.
4947 */
4948 int
4949 hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
4950 struct proc *p)
4951 {
4952 struct cnode *cp;
4953 struct filefork *fp;
4954 struct hfsmount *hfsmp;
4955 u_int32_t headblks;
4956 u_int32_t datablks;
4957 u_int32_t blksize;
4958 u_int32_t growsize;
4959 u_int32_t nextallocsave;
4960 daddr64_t sector_a, sector_b;
4961 int eflags;
4962 off_t newbytes;
4963 int retval;
4964 int lockflags = 0;
4965 int took_trunc_lock = 0;
4966 int started_tr = 0;
4967 enum vtype vnodetype;
4968
4969 vnodetype = vnode_vtype(vp);
4970 if (vnodetype != VREG) {
4971 /* Not allowed to move symlinks. */
4972 return (EPERM);
4973 }
4974
4975 hfsmp = VTOHFS(vp);
4976 if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
4977 return (ENOSPC);
4978 }
4979
4980 cp = VTOC(vp);
4981 fp = VTOF(vp);
4982 if (fp->ff_unallocblocks)
4983 return (EINVAL);
4984
4985 #if CONFIG_PROTECT
4986 /*
4987 * <rdar://problem/9118426>
4988 * Disable HFS file relocation on content-protected filesystems
4989 */
4990 if (cp_fs_protected (hfsmp->hfs_mp)) {
4991 return EINVAL;
4992 }
4993 #endif
4994 /* If it's an SSD, also disable HFS relocation */
4995 if (hfsmp->hfs_flags & HFS_SSD) {
4996 return EINVAL;
4997 }
4998
4999
5000 blksize = hfsmp->blockSize;
5001 if (blockHint == 0)
5002 blockHint = hfsmp->nextAllocation;
5003
5004 if (fp->ff_size > 0x7fffffff) {
5005 return (EFBIG);
5006 }
5007
5008 //
5009 // We do not believe that this call to hfs_fsync() is
5010 // necessary and it causes a journal transaction
5011 // deadlock so we are removing it.
5012 //
5013 //if (vnodetype == VREG && !vnode_issystem(vp)) {
5014 // retval = hfs_fsync(vp, MNT_WAIT, 0, p);
5015 // if (retval)
5016 // return (retval);
5017 //}
5018
5019 if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
5020 hfs_unlock(cp);
5021 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
5022 /* Force lock since callers expect the lock to be held. */
5023 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
5024 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5025 return (retval);
5026 }
5027 /* No need to continue if file was removed. */
5028 if (cp->c_flag & C_NOEXISTS) {
5029 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5030 return (ENOENT);
5031 }
5032 took_trunc_lock = 1;
5033 }
5034 headblks = fp->ff_blocks;
5035 datablks = howmany(fp->ff_size, blksize);
5036 growsize = datablks * blksize;
5037 eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
5038 if (blockHint >= hfsmp->hfs_metazone_start &&
5039 blockHint <= hfsmp->hfs_metazone_end)
5040 eflags |= kEFMetadataMask;
5041
5042 if (hfs_start_transaction(hfsmp) != 0) {
5043 if (took_trunc_lock)
5044 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5045 return (EINVAL);
5046 }
5047 started_tr = 1;
5048 /*
5049 * Protect the extents b-tree and the allocation bitmap
5050 * during MapFileBlockC and ExtendFileC operations.
5051 */
5052 lockflags = SFL_BITMAP;
5053 if (overflow_extents(fp))
5054 lockflags |= SFL_EXTENTS;
5055 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5056
5057 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
5058 if (retval) {
5059 retval = MacToVFSError(retval);
5060 goto out;
5061 }
5062
5063 /*
5064 * STEP 1 - acquire new allocation blocks.
5065 */
5066 nextallocsave = hfsmp->nextAllocation;
5067 retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
5068 if (eflags & kEFMetadataMask) {
5069 hfs_lock_mount(hfsmp);
5070 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
5071 MarkVCBDirty(hfsmp);
5072 hfs_unlock_mount(hfsmp);
5073 }
5074
5075 retval = MacToVFSError(retval);
5076 if (retval == 0) {
5077 cp->c_flag |= C_MODIFIED;
5078 if (newbytes < growsize) {
5079 retval = ENOSPC;
5080 goto restore;
5081 } else if (fp->ff_blocks < (headblks + datablks)) {
5082 printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
5083 retval = ENOSPC;
5084 goto restore;
5085 }
5086
5087 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
5088 if (retval) {
5089 retval = MacToVFSError(retval);
5090 } else if ((sector_a + 1) == sector_b) {
5091 retval = ENOSPC;
5092 goto restore;
5093 } else if ((eflags & kEFMetadataMask) &&
5094 ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
5095 hfsmp->hfs_metazone_end)) {
5096 #if 0
5097 const char * filestr;
5098 char emptystr = '\0';
5099
5100 if (cp->c_desc.cd_nameptr != NULL) {
5101 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
5102 } else if (vnode_name(vp) != NULL) {
5103 filestr = vnode_name(vp);
5104 } else {
5105 filestr = &emptystr;
5106 }
5107 #endif
5108 retval = ENOSPC;
5109 goto restore;
5110 }
5111 }
5112 /* Done with system locks and journal for now. */
5113 hfs_systemfile_unlock(hfsmp, lockflags);
5114 lockflags = 0;
5115 hfs_end_transaction(hfsmp);
5116 started_tr = 0;
5117
5118 if (retval) {
5119 /*
5120 * Check to see if failure is due to excessive fragmentation.
5121 */
5122 if ((retval == ENOSPC) &&
5123 (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
5124 hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
5125 }
5126 goto out;
5127 }
5128 /*
5129 * STEP 2 - clone file data into the new allocation blocks.
5130 */
5131
5132 if (vnodetype == VLNK)
5133 retval = EPERM;
5134 else if (vnode_issystem(vp))
5135 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
5136 else
5137 retval = hfs_clonefile(vp, headblks, datablks, blksize);
5138
5139 /* Start transaction for step 3 or for a restore. */
5140 if (hfs_start_transaction(hfsmp) != 0) {
5141 retval = EINVAL;
5142 goto out;
5143 }
5144 started_tr = 1;
5145 if (retval)
5146 goto restore;
5147
5148 /*
5149 * STEP 3 - switch to cloned data and remove old blocks.
5150 */
5151 lockflags = SFL_BITMAP;
5152 if (overflow_extents(fp))
5153 lockflags |= SFL_EXTENTS;
5154 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5155
5156 retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
5157
5158 hfs_systemfile_unlock(hfsmp, lockflags);
5159 lockflags = 0;
5160 if (retval)
5161 goto restore;
5162 out:
5163 if (took_trunc_lock)
5164 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5165
5166 if (lockflags) {
5167 hfs_systemfile_unlock(hfsmp, lockflags);
5168 lockflags = 0;
5169 }
5170
5171 /* Push cnode's new extent data to disk. */
5172 if (retval == 0) {
5173 (void) hfs_update(vp, MNT_WAIT);
5174 }
5175 if (hfsmp->jnl) {
5176 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
5177 (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
5178 else
5179 (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
5180 }
5181 exit:
5182 if (started_tr)
5183 hfs_end_transaction(hfsmp);
5184
5185 return (retval);
5186
5187 restore:
5188 if (fp->ff_blocks == headblks) {
5189 if (took_trunc_lock)
5190 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5191 goto exit;
5192 }
5193 /*
5194 * Give back any newly allocated space.
5195 */
5196 if (lockflags == 0) {
5197 lockflags = SFL_BITMAP;
5198 if (overflow_extents(fp))
5199 lockflags |= SFL_EXTENTS;
5200 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5201 }
5202
5203 (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
5204 FTOC(fp)->c_fileid, false);
5205
5206 hfs_systemfile_unlock(hfsmp, lockflags);
5207 lockflags = 0;
5208
5209 if (took_trunc_lock)
5210 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5211 goto exit;
5212 }
5213
5214
5215 /*
5216 * Clone a file's data within the file.
5217 *
5218 */
5219 static int
5220 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
5221 {
5222 caddr_t bufp;
5223 size_t bufsize;
5224 size_t copysize;
5225 size_t iosize;
5226 size_t offset;
5227 off_t writebase;
5228 uio_t auio;
5229 int error = 0;
5230
5231 writebase = blkstart * blksize;
5232 copysize = blkcnt * blksize;
5233 iosize = bufsize = MIN(copysize, 128 * 1024);
5234 offset = 0;
5235
5236 hfs_unlock(VTOC(vp));
5237
5238 #if CONFIG_PROTECT
5239 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
5240 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5241 return (error);
5242 }
5243 #endif /* CONFIG_PROTECT */
5244
5245 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
5246 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5247 return (ENOMEM);
5248 }
5249
5250 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
5251
5252 while (offset < copysize) {
5253 iosize = MIN(copysize - offset, iosize);
5254
5255 uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
5256 uio_addiov(auio, (uintptr_t)bufp, iosize);
5257
5258 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
5259 if (error) {
5260 printf("hfs_clonefile: cluster_read failed - %d\n", error);
5261 break;
5262 }
5263 if (uio_resid(auio) != 0) {
5264 printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
5265 error = EIO;
5266 break;
5267 }
5268
5269 uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
5270 uio_addiov(auio, (uintptr_t)bufp, iosize);
5271
5272 error = cluster_write(vp, auio, writebase + offset,
5273 writebase + offset + iosize,
5274 uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
5275 if (error) {
5276 printf("hfs_clonefile: cluster_write failed - %d\n", error);
5277 break;
5278 }
5279 if (uio_resid(auio) != 0) {
5280 printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
5281 error = EIO;
5282 break;
5283 }
5284 offset += iosize;
5285 }
5286 uio_free(auio);
5287
5288 if ((blksize & PAGE_MASK)) {
5289 /*
5290 * since the copy may not have started on a PAGE
5291 * boundary (or may not have ended on one), we
5292 * may have pages left in the cache since NOCACHE
5293 * will let partially written pages linger...
5294 * let's just flush the entire range to make sure
5295 * we don't have any pages left that are beyond
5296 * (or intersect) the real LEOF of this file
5297 */
5298 ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
5299 } else {
5300 /*
5301 * No need to call ubc_msync or hfs_invalbuf
5302 * since the file was copied using IO_NOCACHE and
5303 * the copy was done starting and ending on a page
5304 * boundary in the file.
5305 */
5306 }
5307 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5308
5309 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5310 return (error);
5311 }
5312
5313 /*
5314 * Clone a system (metadata) file.
5315 *
5316 */
5317 static int
5318 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
5319 kauth_cred_t cred, struct proc *p)
5320 {
5321 caddr_t bufp;
5322 char * offset;
5323 size_t bufsize;
5324 size_t iosize;
5325 struct buf *bp = NULL;
5326 daddr64_t blkno;
5327 daddr64_t blk;
5328 daddr64_t start_blk;
5329 daddr64_t last_blk;
5330 int breadcnt;
5331 int i;
5332 int error = 0;
5333
5334
5335 iosize = GetLogicalBlockSize(vp);
5336 bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
5337 breadcnt = bufsize / iosize;
5338
5339 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
5340 return (ENOMEM);
5341 }
5342 start_blk = ((daddr64_t)blkstart * blksize) / iosize;
5343 last_blk = ((daddr64_t)blkcnt * blksize) / iosize;
5344 blkno = 0;
5345
5346 while (blkno < last_blk) {
5347 /*
5348 * Read up to a megabyte
5349 */
5350 offset = bufp;
5351 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
5352 error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
5353 if (error) {
5354 printf("hfs_clonesysfile: meta_bread error %d\n", error);
5355 goto out;
5356 }
5357 if (buf_count(bp) != iosize) {
5358 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
5359 goto out;
5360 }
5361 bcopy((char *)buf_dataptr(bp), offset, iosize);
5362
5363 buf_markinvalid(bp);
5364 buf_brelse(bp);
5365 bp = NULL;
5366
5367 offset += iosize;
5368 }
5369
5370 /*
5371 * Write up to a megabyte
5372 */
5373 offset = bufp;
5374 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
5375 bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
5376 if (bp == NULL) {
5377 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
5378 error = EIO;
5379 goto out;
5380 }
5381 bcopy(offset, (char *)buf_dataptr(bp), iosize);
5382 error = (int)buf_bwrite(bp);
5383 bp = NULL;
5384 if (error)
5385 goto out;
5386 offset += iosize;
5387 }
5388 }
5389 out:
5390 if (bp) {
5391 buf_brelse(bp);
5392 }
5393
5394 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5395
5396 error = hfs_fsync(vp, MNT_WAIT, 0, p);
5397
5398 return (error);
5399 }