2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* @(#)hfs_readwrite.c 1.0
30 * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/resourcevar.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/filedesc.h>
44 #include <sys/buf_internal.h>
46 #include <sys/kauth.h>
47 #include <sys/vnode.h>
48 #include <sys/vnode_internal.h>
50 #include <sys/vfs_context.h>
51 #include <sys/fsevents.h>
52 #include <kern/kalloc.h>
54 #include <sys/sysctl.h>
55 #include <sys/fsctl.h>
56 #include <sys/mount_internal.h>
57 #include <sys/file_internal.h>
59 #include <miscfs/specfs/specdev.h>
62 #include <sys/ubc_internal.h>
64 #include <vm/vm_pageout.h>
65 #include <vm/vm_kern.h>
67 #include <sys/kdebug.h>
70 #include "hfs_attrlist.h"
71 #include "hfs_endian.h"
72 #include "hfs_fsctl.h"
73 #include "hfs_quota.h"
74 #include "hfscommon/headers/FileMgrInternal.h"
75 #include "hfscommon/headers/BTreesInternal.h"
76 #include "hfs_cnode.h"
79 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
82 MAXHFSFILESIZE
= 0x7FFFFFFF /* this needs to go in the mount structure */
85 /* from bsd/hfs/hfs_vfsops.c */
86 extern int hfs_vfs_vget (struct mount
*mp
, ino64_t ino
, struct vnode
**vpp
, vfs_context_t context
);
88 static int hfs_clonefile(struct vnode
*, int, int, int);
89 static int hfs_clonesysfile(struct vnode
*, int, int, int, kauth_cred_t
, struct proc
*);
90 static int hfs_minorupdate(struct vnode
*vp
);
91 static int do_hfs_truncate(struct vnode
*vp
, off_t length
, int flags
, int skip
, vfs_context_t context
);
93 /* from bsd/hfs/hfs_vnops.c */
94 extern decmpfs_cnode
* hfs_lazy_init_decmpfs_cnode (struct cnode
*cp
);
98 int flush_cache_on_write
= 0;
99 SYSCTL_INT (_kern
, OID_AUTO
, flush_cache_on_write
, CTLFLAG_RW
| CTLFLAG_LOCKED
, &flush_cache_on_write
, 0, "always flush the drive cache on writes to uncached files");
102 * Read data from a file.
105 hfs_vnop_read(struct vnop_read_args
*ap
)
108 struct vnop_read_args {
109 struct vnodeop_desc *a_desc;
113 vfs_context_t a_context;
117 uio_t uio
= ap
->a_uio
;
118 struct vnode
*vp
= ap
->a_vp
;
121 struct hfsmount
*hfsmp
;
124 off_t start_resid
= uio_resid(uio
);
125 off_t offset
= uio_offset(uio
);
127 int took_truncate_lock
= 0;
130 /* Preflight checks */
131 if (!vnode_isreg(vp
)) {
132 /* can only read regular files */
138 if (start_resid
== 0)
139 return (0); /* Nothing left to do */
141 return (EINVAL
); /* cant read from a negative offset */
146 if (VNODE_IS_RSRC(vp
)) {
147 if (hfs_hides_rsrc(ap
->a_context
, VTOC(vp
), 1)) { /* 1 == don't take the cnode lock */
150 /* otherwise read the resource fork normally */
152 int compressed
= hfs_file_is_compressed(VTOC(vp
), 1); /* 1 == don't take the cnode lock */
154 retval
= decmpfs_read_compressed(ap
, &compressed
, VTOCMP(vp
));
157 /* successful read, update the access time */
158 VTOC(vp
)->c_touch_acctime
= TRUE
;
160 /* compressed files are not hot file candidates */
161 if (VTOHFS(vp
)->hfc_stage
== HFC_RECORDING
) {
162 VTOF(vp
)->ff_bytesread
= 0;
167 /* otherwise the file was converted back to a regular file while we were reading it */
169 } else if ((VTOC(vp
)->c_bsdflags
& UF_COMPRESSED
)) {
172 error
= check_for_dataless_file(vp
, NAMESPACE_HANDLER_READ_OP
);
179 #endif /* HFS_COMPRESSION */
186 if ((retval
= cp_handle_vnop (vp
, CP_READ_ACCESS
, ap
->a_ioflag
)) != 0) {
192 * If this read request originated from a syscall (as opposed to
193 * an in-kernel page fault or something), then set it up for
196 if (ap
->a_ioflag
& IO_SYSCALL_DISPATCH
) {
197 io_throttle
= IO_RETURN_ON_THROTTLE
;
202 /* Protect against a size change. */
203 hfs_lock_truncate(cp
, HFS_SHARED_LOCK
, HFS_LOCK_DEFAULT
);
204 took_truncate_lock
= 1;
206 filesize
= fp
->ff_size
;
207 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
208 if (offset
> filesize
) {
209 if ((hfsmp
->hfs_flags
& HFS_STANDARD
) &&
210 (offset
> (off_t
)MAXHFSFILESIZE
)) {
216 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 12)) | DBG_FUNC_START
,
217 (int)uio_offset(uio
), uio_resid(uio
), (int)filesize
, (int)filebytes
, 0);
219 retval
= cluster_read(vp
, uio
, filesize
, ap
->a_ioflag
|io_throttle
);
221 cp
->c_touch_acctime
= TRUE
;
223 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 12)) | DBG_FUNC_END
,
224 (int)uio_offset(uio
), uio_resid(uio
), (int)filesize
, (int)filebytes
, 0);
227 * Keep track blocks read
229 if (hfsmp
->hfc_stage
== HFC_RECORDING
&& retval
== 0) {
230 int took_cnode_lock
= 0;
233 bytesread
= start_resid
- uio_resid(uio
);
235 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
236 if ((fp
->ff_bytesread
+ bytesread
) > 0x00000000ffffffff) {
237 hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
241 * If this file hasn't been seen since the start of
242 * the current sampling period then start over.
244 if (cp
->c_atime
< hfsmp
->hfc_timebase
) {
247 fp
->ff_bytesread
= bytesread
;
249 cp
->c_atime
= tv
.tv_sec
;
251 fp
->ff_bytesread
+= bytesread
;
257 if (took_truncate_lock
) {
258 hfs_unlock_truncate(cp
, HFS_LOCK_DEFAULT
);
260 if (retval
== EAGAIN
) {
261 throttle_lowpri_io(1);
270 * Write data to a file.
273 hfs_vnop_write(struct vnop_write_args
*ap
)
275 uio_t uio
= ap
->a_uio
;
276 struct vnode
*vp
= ap
->a_vp
;
279 struct hfsmount
*hfsmp
;
280 kauth_cred_t cred
= NULL
;
283 off_t bytesToAdd
= 0;
284 off_t actualBytesAdded
;
289 int ioflag
= ap
->a_ioflag
;
292 int cnode_locked
= 0;
293 int partialwrite
= 0;
295 time_t orig_ctime
=VTOC(vp
)->c_ctime
;
296 int took_truncate_lock
= 0;
297 int io_return_on_throttle
= 0;
298 struct rl_entry
*invalid_range
;
301 if ( hfs_file_is_compressed(VTOC(vp
), 1) ) { /* 1 == don't take the cnode lock */
302 int state
= decmpfs_cnode_get_vnode_state(VTOCMP(vp
));
304 case FILE_IS_COMPRESSED
:
306 case FILE_IS_CONVERTING
:
307 /* if FILE_IS_CONVERTING, we allow writes but do not
308 bother with snapshots or else we will deadlock.
313 printf("invalid state %d for compressed file\n", state
);
316 } else if ((VTOC(vp
)->c_bsdflags
& UF_COMPRESSED
)) {
319 error
= check_for_dataless_file(vp
, NAMESPACE_HANDLER_WRITE_OP
);
326 check_for_tracked_file(vp
, orig_ctime
, NAMESPACE_HANDLER_WRITE_OP
, uio
);
331 resid
= uio_resid(uio
);
332 offset
= uio_offset(uio
);
338 if (!vnode_isreg(vp
))
339 return (EPERM
); /* Can only write regular files */
346 if ((retval
= cp_handle_vnop (vp
, CP_WRITE_ACCESS
, 0)) != 0) {
351 eflags
= kEFDeferMask
; /* defer file block allocations */
354 * When the underlying device is sparse and space
355 * is low (< 8MB), stop doing delayed allocations
356 * and begin doing synchronous I/O.
358 if ((hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) &&
359 (hfs_freeblks(hfsmp
, 0) < 2048)) {
360 eflags
&= ~kEFDeferMask
;
363 #endif /* HFS_SPARSE_DEV */
365 if ((ioflag
& (IO_SINGLE_WRITER
| IO_SYSCALL_DISPATCH
)) ==
366 (IO_SINGLE_WRITER
| IO_SYSCALL_DISPATCH
)) {
367 io_return_on_throttle
= IO_RETURN_ON_THROTTLE
;
371 /* Protect against a size change. */
373 * Protect against a size change.
375 * Note: If took_truncate_lock is true, then we previously got the lock shared
376 * but needed to upgrade to exclusive. So try getting it exclusive from the
379 if (ioflag
& IO_APPEND
|| took_truncate_lock
) {
380 hfs_lock_truncate(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
383 hfs_lock_truncate(cp
, HFS_SHARED_LOCK
, HFS_LOCK_DEFAULT
);
385 took_truncate_lock
= 1;
388 if (ioflag
& IO_APPEND
) {
389 uio_setoffset(uio
, fp
->ff_size
);
390 offset
= fp
->ff_size
;
392 if ((cp
->c_bsdflags
& APPEND
) && offset
!= fp
->ff_size
) {
397 origFileSize
= fp
->ff_size
;
398 writelimit
= offset
+ resid
;
399 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
402 * We may need an exclusive truncate lock for several reasons, all
403 * of which are because we may be writing to a (portion of a) block
404 * for the first time, and we need to make sure no readers see the
405 * prior, uninitialized contents of the block. The cases are:
407 * 1. We have unallocated (delayed allocation) blocks. We may be
408 * allocating new blocks to the file and writing to them.
409 * (A more precise check would be whether the range we're writing
410 * to contains delayed allocation blocks.)
411 * 2. We need to extend the file. The bytes between the old EOF
412 * and the new EOF are not yet initialized. This is important
413 * even if we're not allocating new blocks to the file. If the
414 * old EOF and new EOF are in the same block, we still need to
415 * protect that range of bytes until they are written for the
417 * 3. The write overlaps some invalid ranges (delayed zero fill; that
418 * part of the file has been allocated, but not yet written).
420 * If we had a shared lock with the above cases, we need to try to upgrade
421 * to an exclusive lock. If the upgrade fails, we will lose the shared
422 * lock, and will need to take the truncate lock again; the took_truncate_lock
423 * flag will still be set, causing us to try for an exclusive lock next time.
425 * NOTE: Testing for #3 (delayed zero fill) needs to be done while the cnode
426 * lock is held, since it protects the range lists.
428 if ((cp
->c_truncatelockowner
== HFS_SHARED_OWNER
) &&
429 ((fp
->ff_unallocblocks
!= 0) ||
430 (writelimit
> origFileSize
))) {
431 if (lck_rw_lock_shared_to_exclusive(&cp
->c_truncatelock
) == FALSE
) {
433 * Lock upgrade failed and we lost our shared lock, try again.
434 * Note: we do not set took_truncate_lock=0 here. Leaving it
435 * set to 1 will cause us to try to get the lock exclusive.
440 /* Store the owner in the c_truncatelockowner field if we successfully upgrade */
441 cp
->c_truncatelockowner
= current_thread();
445 if ( (retval
= hfs_lock(VTOC(vp
), HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
))) {
450 if (S_ISREG(cp
->c_attr
.ca_mode
) || S_ISLNK(cp
->c_attr
.ca_mode
)) {
451 hfs_incr_gencount (cp
);
455 * Now that we have the cnode lock, see if there are delayed zero fill ranges
456 * overlapping our write. If so, we need the truncate lock exclusive (see above).
458 if ((cp
->c_truncatelockowner
== HFS_SHARED_OWNER
) &&
459 (rl_scan(&fp
->ff_invalidranges
, offset
, writelimit
-1, &invalid_range
) != RL_NOOVERLAP
)) {
461 * When testing, it appeared that calling lck_rw_lock_shared_to_exclusive() causes
462 * a deadlock, rather than simply returning failure. (That is, it apparently does
463 * not behave like a "try_lock"). Since this condition is rare, just drop the
464 * cnode lock and try again. Since took_truncate_lock is set, we will
465 * automatically take the truncate lock exclusive.
469 hfs_unlock_truncate(cp
, HFS_LOCK_DEFAULT
);
473 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 0)) | DBG_FUNC_START
,
474 (int)offset
, uio_resid(uio
), (int)fp
->ff_size
,
477 /* Check if we do not need to extend the file */
478 if (writelimit
<= filebytes
) {
482 cred
= vfs_context_ucred(ap
->a_context
);
483 bytesToAdd
= writelimit
- filebytes
;
486 retval
= hfs_chkdq(cp
, (int64_t)(roundup(bytesToAdd
, hfsmp
->blockSize
)),
492 if (hfs_start_transaction(hfsmp
) != 0) {
497 while (writelimit
> filebytes
) {
498 bytesToAdd
= writelimit
- filebytes
;
499 if (cred
&& suser(cred
, NULL
) != 0)
500 eflags
|= kEFReserveMask
;
502 /* Protect extents b-tree and allocation bitmap */
503 lockflags
= SFL_BITMAP
;
504 if (overflow_extents(fp
))
505 lockflags
|= SFL_EXTENTS
;
506 lockflags
= hfs_systemfile_lock(hfsmp
, lockflags
, HFS_EXCLUSIVE_LOCK
);
508 /* Files that are changing size are not hot file candidates. */
509 if (hfsmp
->hfc_stage
== HFC_RECORDING
) {
510 fp
->ff_bytesread
= 0;
512 retval
= MacToVFSError(ExtendFileC (hfsmp
, (FCB
*)fp
, bytesToAdd
,
513 0, eflags
, &actualBytesAdded
));
515 hfs_systemfile_unlock(hfsmp
, lockflags
);
517 if ((actualBytesAdded
== 0) && (retval
== E_NONE
))
519 if (retval
!= E_NONE
)
521 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
522 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 0)) | DBG_FUNC_NONE
,
523 (int)offset
, uio_resid(uio
), (int)fp
->ff_size
, (int)filebytes
, 0);
525 (void) hfs_update(vp
, TRUE
);
526 (void) hfs_volupdate(hfsmp
, VOL_UPDATE
, 0);
527 (void) hfs_end_transaction(hfsmp
);
530 * If we didn't grow the file enough try a partial write.
531 * POSIX expects this behavior.
533 if ((retval
== ENOSPC
) && (filebytes
> offset
)) {
536 uio_setresid(uio
, (uio_resid(uio
) - bytesToAdd
));
538 writelimit
= filebytes
;
541 if (retval
== E_NONE
) {
550 if (writelimit
> fp
->ff_size
)
551 filesize
= writelimit
;
553 filesize
= fp
->ff_size
;
555 lflag
= ioflag
& ~(IO_TAILZEROFILL
| IO_HEADZEROFILL
| IO_NOZEROVALID
| IO_NOZERODIRTY
);
557 if (offset
<= fp
->ff_size
) {
558 zero_off
= offset
& ~PAGE_MASK_64
;
560 /* Check to see whether the area between the zero_offset and the start
561 of the transfer to see whether is invalid and should be zero-filled
562 as part of the transfer:
564 if (offset
> zero_off
) {
565 if (rl_scan(&fp
->ff_invalidranges
, zero_off
, offset
- 1, &invalid_range
) != RL_NOOVERLAP
)
566 lflag
|= IO_HEADZEROFILL
;
569 off_t eof_page_base
= fp
->ff_size
& ~PAGE_MASK_64
;
571 /* The bytes between fp->ff_size and uio->uio_offset must never be
572 read without being zeroed. The current last block is filled with zeroes
573 if it holds valid data but in all cases merely do a little bookkeeping
574 to track the area from the end of the current last page to the start of
575 the area actually written. For the same reason only the bytes up to the
576 start of the page where this write will start is invalidated; any remainder
577 before uio->uio_offset is explicitly zeroed as part of the cluster_write.
579 Note that inval_start, the start of the page after the current EOF,
580 may be past the start of the write, in which case the zeroing
581 will be handled by the cluser_write of the actual data.
583 inval_start
= (fp
->ff_size
+ (PAGE_SIZE_64
- 1)) & ~PAGE_MASK_64
;
584 inval_end
= offset
& ~PAGE_MASK_64
;
585 zero_off
= fp
->ff_size
;
587 if ((fp
->ff_size
& PAGE_MASK_64
) &&
588 (rl_scan(&fp
->ff_invalidranges
,
591 &invalid_range
) != RL_NOOVERLAP
)) {
592 /* The page containing the EOF is not valid, so the
593 entire page must be made inaccessible now. If the write
594 starts on a page beyond the page containing the eof
595 (inval_end > eof_page_base), add the
596 whole page to the range to be invalidated. Otherwise
597 (i.e. if the write starts on the same page), zero-fill
598 the entire page explicitly now:
600 if (inval_end
> eof_page_base
) {
601 inval_start
= eof_page_base
;
603 zero_off
= eof_page_base
;
607 if (inval_start
< inval_end
) {
609 /* There's some range of data that's going to be marked invalid */
611 if (zero_off
< inval_start
) {
612 /* The pages between inval_start and inval_end are going to be invalidated,
613 and the actual write will start on a page past inval_end. Now's the last
614 chance to zero-fill the page containing the EOF:
618 retval
= cluster_write(vp
, (uio_t
) 0,
619 fp
->ff_size
, inval_start
,
621 lflag
| IO_HEADZEROFILL
| IO_NOZERODIRTY
);
622 hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
624 if (retval
) goto ioerr_exit
;
625 offset
= uio_offset(uio
);
628 /* Mark the remaining area of the newly allocated space as invalid: */
629 rl_add(inval_start
, inval_end
- 1 , &fp
->ff_invalidranges
);
631 cp
->c_zftimeout
= tv
.tv_sec
+ ZFTIMELIMIT
;
632 zero_off
= fp
->ff_size
= inval_end
;
635 if (offset
> zero_off
) lflag
|= IO_HEADZEROFILL
;
638 /* Check to see whether the area between the end of the write and the end of
639 the page it falls in is invalid and should be zero-filled as part of the transfer:
641 tail_off
= (writelimit
+ (PAGE_SIZE_64
- 1)) & ~PAGE_MASK_64
;
642 if (tail_off
> filesize
) tail_off
= filesize
;
643 if (tail_off
> writelimit
) {
644 if (rl_scan(&fp
->ff_invalidranges
, writelimit
, tail_off
- 1, &invalid_range
) != RL_NOOVERLAP
) {
645 lflag
|= IO_TAILZEROFILL
;
650 * if the write starts beyond the current EOF (possibly advanced in the
651 * zeroing of the last block, above), then we'll zero fill from the current EOF
652 * to where the write begins:
654 * NOTE: If (and ONLY if) the portion of the file about to be written is
655 * before the current EOF it might be marked as invalid now and must be
656 * made readable (removed from the invalid ranges) before cluster_write
659 io_start
= (lflag
& IO_HEADZEROFILL
) ? zero_off
: offset
;
660 if (io_start
< fp
->ff_size
) {
663 io_end
= (lflag
& IO_TAILZEROFILL
) ? tail_off
: writelimit
;
664 rl_remove(io_start
, io_end
- 1, &fp
->ff_invalidranges
);
671 * We need to tell UBC the fork's new size BEFORE calling
672 * cluster_write, in case any of the new pages need to be
673 * paged out before cluster_write completes (which does happen
674 * in embedded systems due to extreme memory pressure).
675 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
676 * will be, so that it can pass that on to cluster_pageout, and
677 * allow those pageouts.
679 * We don't update ff_size yet since we don't want pageins to
680 * be able to see uninitialized data between the old and new
681 * EOF, until cluster_write has completed and initialized that
684 * The vnode pager relies on the file size last given to UBC via
685 * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or
686 * ff_size (whichever is larger). NOTE: ff_new_size is always
687 * zero, unless we are extending the file via write.
689 if (filesize
> fp
->ff_size
) {
690 fp
->ff_new_size
= filesize
;
691 ubc_setsize(vp
, filesize
);
693 retval
= cluster_write(vp
, uio
, fp
->ff_size
, filesize
, zero_off
,
694 tail_off
, lflag
| IO_NOZERODIRTY
| io_return_on_throttle
);
696 fp
->ff_new_size
= 0; /* no longer extending; use ff_size */
698 if (retval
== EAGAIN
) {
700 * EAGAIN indicates that we still have I/O to do, but
701 * that we now need to be throttled
703 if (resid
!= uio_resid(uio
)) {
705 * did manage to do some I/O before returning EAGAIN
707 resid
= uio_resid(uio
);
708 offset
= uio_offset(uio
);
710 cp
->c_touch_chgtime
= TRUE
;
711 cp
->c_touch_modtime
= TRUE
;
713 if (filesize
> fp
->ff_size
) {
715 * we called ubc_setsize before the call to
716 * cluster_write... since we only partially
717 * completed the I/O, we need to
718 * re-adjust our idea of the filesize based
721 ubc_setsize(vp
, offset
);
723 fp
->ff_size
= offset
;
727 if (filesize
> origFileSize
) {
728 ubc_setsize(vp
, origFileSize
);
733 if (filesize
> origFileSize
) {
734 fp
->ff_size
= filesize
;
736 /* Files that are changing size are not hot file candidates. */
737 if (hfsmp
->hfc_stage
== HFC_RECORDING
) {
738 fp
->ff_bytesread
= 0;
741 fp
->ff_new_size
= 0; /* ff_size now has the correct size */
743 /* If we wrote some bytes, then touch the change and mod times */
744 if (resid
> uio_resid(uio
)) {
745 cp
->c_touch_chgtime
= TRUE
;
746 cp
->c_touch_modtime
= TRUE
;
750 uio_setresid(uio
, (uio_resid(uio
) + bytesToAdd
));
754 // XXXdbg - see radar 4871353 for more info
756 if (flush_cache_on_write
&& ((ioflag
& IO_NOCACHE
) || vnode_isnocache(vp
))) {
757 VNOP_IOCTL(hfsmp
->hfs_devvp
, DKIOCSYNCHRONIZECACHE
, NULL
, FWRITE
, NULL
);
763 * If we successfully wrote any data, and we are not the superuser
764 * we clear the setuid and setgid bits as a precaution against
767 if (cp
->c_mode
& (S_ISUID
| S_ISGID
)) {
768 cred
= vfs_context_ucred(ap
->a_context
);
769 if (resid
> uio_resid(uio
) && cred
&& suser(cred
, NULL
)) {
771 hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
774 cp
->c_mode
&= ~(S_ISUID
| S_ISGID
);
778 if (ioflag
& IO_UNIT
) {
780 hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
783 (void)hfs_truncate(vp
, origFileSize
, ioflag
& IO_SYNC
,
784 0, 0, ap
->a_context
);
785 uio_setoffset(uio
, (uio_offset(uio
) - (resid
- uio_resid(uio
))));
786 uio_setresid(uio
, resid
);
787 filebytes
= (off_t
)fp
->ff_blocks
* (off_t
)hfsmp
->blockSize
;
789 } else if ((ioflag
& IO_SYNC
) && (resid
> uio_resid(uio
))) {
791 hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
);
794 retval
= hfs_update(vp
, TRUE
);
796 /* Updating vcbWrCnt doesn't need to be atomic. */
799 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 0)) | DBG_FUNC_END
,
800 (int)uio_offset(uio
), uio_resid(uio
), (int)fp
->ff_size
, (int)filebytes
, 0);
805 if (took_truncate_lock
) {
806 hfs_unlock_truncate(cp
, HFS_LOCK_DEFAULT
);
808 if (retval
== EAGAIN
) {
809 throttle_lowpri_io(1);
817 /* support for the "bulk-access" fcntl */
819 #define CACHE_LEVELS 16
820 #define NUM_CACHE_ENTRIES (64*16)
821 #define PARENT_IDS_FLAG 0x100
823 struct access_cache
{
825 int cachehits
; /* these two for statistics gathering */
827 unsigned int *acache
;
828 unsigned char *haveaccess
;
832 uid_t uid
; /* IN: effective user id */
833 short flags
; /* IN: access requested (i.e. R_OK) */
834 short num_groups
; /* IN: number of groups user belongs to */
835 int num_files
; /* IN: number of files to process */
836 int *file_ids
; /* IN: array of file ids */
837 gid_t
*groups
; /* IN: array of groups */
838 short *access
; /* OUT: access info for each file (0 for 'has access') */
839 } __attribute__((unavailable
)); // this structure is for reference purposes only
841 struct user32_access_t
{
842 uid_t uid
; /* IN: effective user id */
843 short flags
; /* IN: access requested (i.e. R_OK) */
844 short num_groups
; /* IN: number of groups user belongs to */
845 int num_files
; /* IN: number of files to process */
846 user32_addr_t file_ids
; /* IN: array of file ids */
847 user32_addr_t groups
; /* IN: array of groups */
848 user32_addr_t access
; /* OUT: access info for each file (0 for 'has access') */
851 struct user64_access_t
{
852 uid_t uid
; /* IN: effective user id */
853 short flags
; /* IN: access requested (i.e. R_OK) */
854 short num_groups
; /* IN: number of groups user belongs to */
855 int num_files
; /* IN: number of files to process */
856 user64_addr_t file_ids
; /* IN: array of file ids */
857 user64_addr_t groups
; /* IN: array of groups */
858 user64_addr_t access
; /* OUT: access info for each file (0 for 'has access') */
862 // these are the "extended" versions of the above structures
863 // note that it is crucial that they be different sized than
864 // the regular version
865 struct ext_access_t
{
866 uint32_t flags
; /* IN: access requested (i.e. R_OK) */
867 uint32_t num_files
; /* IN: number of files to process */
868 uint32_t map_size
; /* IN: size of the bit map */
869 uint32_t *file_ids
; /* IN: Array of file ids */
870 char *bitmap
; /* OUT: hash-bitmap of interesting directory ids */
871 short *access
; /* OUT: access info for each file (0 for 'has access') */
872 uint32_t num_parents
; /* future use */
873 cnid_t
*parents
; /* future use */
874 } __attribute__((unavailable
)); // this structure is for reference purposes only
876 struct user32_ext_access_t
{
877 uint32_t flags
; /* IN: access requested (i.e. R_OK) */
878 uint32_t num_files
; /* IN: number of files to process */
879 uint32_t map_size
; /* IN: size of the bit map */
880 user32_addr_t file_ids
; /* IN: Array of file ids */
881 user32_addr_t bitmap
; /* OUT: hash-bitmap of interesting directory ids */
882 user32_addr_t access
; /* OUT: access info for each file (0 for 'has access') */
883 uint32_t num_parents
; /* future use */
884 user32_addr_t parents
; /* future use */
887 struct user64_ext_access_t
{
888 uint32_t flags
; /* IN: access requested (i.e. R_OK) */
889 uint32_t num_files
; /* IN: number of files to process */
890 uint32_t map_size
; /* IN: size of the bit map */
891 user64_addr_t file_ids
; /* IN: array of file ids */
892 user64_addr_t bitmap
; /* IN: array of groups */
893 user64_addr_t access
; /* OUT: access info for each file (0 for 'has access') */
894 uint32_t num_parents
;/* future use */
895 user64_addr_t parents
;/* future use */
900 * Perform a binary search for the given parent_id. Return value is
901 * the index if there is a match. If no_match_indexp is non-NULL it
902 * will be assigned with the index to insert the item (even if it was
905 static int cache_binSearch(cnid_t
*array
, unsigned int hi
, cnid_t parent_id
, int *no_match_indexp
)
911 unsigned int mid
= ((hi
- lo
)/2) + lo
;
912 unsigned int this_id
= array
[mid
];
914 if (parent_id
== this_id
) {
919 if (parent_id
< this_id
) {
924 if (parent_id
> this_id
) {
930 /* check if lo and hi converged on the match */
931 if (parent_id
== array
[hi
]) {
935 if (no_match_indexp
) {
936 *no_match_indexp
= hi
;
944 lookup_bucket(struct access_cache
*cache
, int *indexp
, cnid_t parent_id
)
948 int index
, no_match_index
;
950 if (cache
->numcached
== 0) {
952 return 0; // table is empty, so insert at index=0 and report no match
955 if (cache
->numcached
> NUM_CACHE_ENTRIES
) {
956 cache
->numcached
= NUM_CACHE_ENTRIES
;
959 hi
= cache
->numcached
- 1;
961 index
= cache_binSearch(cache
->acache
, hi
, parent_id
, &no_match_index
);
963 /* if no existing entry found, find index for new one */
965 index
= no_match_index
;
976 * Add a node to the access_cache at the given index (or do a lookup first
977 * to find the index if -1 is passed in). We currently do a replace rather
978 * than an insert if the cache is full.
981 add_node(struct access_cache
*cache
, int index
, cnid_t nodeID
, int access
)
983 int lookup_index
= -1;
985 /* need to do a lookup first if -1 passed for index */
987 if (lookup_bucket(cache
, &lookup_index
, nodeID
)) {
988 if (cache
->haveaccess
[lookup_index
] != access
&& cache
->haveaccess
[lookup_index
] == ESRCH
) {
989 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
990 cache
->haveaccess
[lookup_index
] = access
;
993 /* mission accomplished */
996 index
= lookup_index
;
1001 /* if the cache is full, do a replace rather than an insert */
1002 if (cache
->numcached
>= NUM_CACHE_ENTRIES
) {
1003 cache
->numcached
= NUM_CACHE_ENTRIES
-1;
1005 if (index
> cache
->numcached
) {
1006 index
= cache
->numcached
;
1010 if (index
< cache
->numcached
&& index
< NUM_CACHE_ENTRIES
&& nodeID
> cache
->acache
[index
]) {
1014 if (index
>= 0 && index
< cache
->numcached
) {
1015 /* only do bcopy if we're inserting */
1016 bcopy( cache
->acache
+index
, cache
->acache
+(index
+1), (cache
->numcached
- index
)*sizeof(int) );
1017 bcopy( cache
->haveaccess
+index
, cache
->haveaccess
+(index
+1), (cache
->numcached
- index
)*sizeof(unsigned char) );
1020 cache
->acache
[index
] = nodeID
;
1021 cache
->haveaccess
[index
] = access
;
1035 snoop_callback(const struct cat_desc
*descp
, const struct cat_attr
*attrp
, void * arg
)
1037 struct cinfo
*cip
= (struct cinfo
*)arg
;
1039 cip
->uid
= attrp
->ca_uid
;
1040 cip
->gid
= attrp
->ca_gid
;
1041 cip
->mode
= attrp
->ca_mode
;
1042 cip
->parentcnid
= descp
->cd_parentcnid
;
1043 cip
->recflags
= attrp
->ca_recflags
;
1049 * Lookup the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
1050 * isn't incore, then go to the catalog.
1053 do_attr_lookup(struct hfsmount
*hfsmp
, struct access_cache
*cache
, cnid_t cnid
,
1054 struct cnode
*skip_cp
, CatalogKey
*keyp
, struct cat_attr
*cnattrp
)
1058 /* if this id matches the one the fsctl was called with, skip the lookup */
1059 if (cnid
== skip_cp
->c_cnid
) {
1060 cnattrp
->ca_uid
= skip_cp
->c_uid
;
1061 cnattrp
->ca_gid
= skip_cp
->c_gid
;
1062 cnattrp
->ca_mode
= skip_cp
->c_mode
;
1063 cnattrp
->ca_recflags
= skip_cp
->c_attr
.ca_recflags
;
1064 keyp
->hfsPlus
.parentID
= skip_cp
->c_parentcnid
;
1066 struct cinfo c_info
;
1068 /* otherwise, check the cnode hash incase the file/dir is incore */
1069 if (hfs_chash_snoop(hfsmp
, cnid
, 0, snoop_callback
, &c_info
) == 0) {
1070 cnattrp
->ca_uid
= c_info
.uid
;
1071 cnattrp
->ca_gid
= c_info
.gid
;
1072 cnattrp
->ca_mode
= c_info
.mode
;
1073 cnattrp
->ca_recflags
= c_info
.recflags
;
1074 keyp
->hfsPlus
.parentID
= c_info
.parentcnid
;
1078 if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp
)))
1079 throttle_lowpri_io(1);
1081 lockflags
= hfs_systemfile_lock(hfsmp
, SFL_CATALOG
, HFS_SHARED_LOCK
);
1083 /* lookup this cnid in the catalog */
1084 error
= cat_getkeyplusattr(hfsmp
, cnid
, keyp
, cnattrp
);
1086 hfs_systemfile_unlock(hfsmp
, lockflags
);
1097 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
1098 * up to CACHE_LEVELS as we progress towards the root.
1101 do_access_check(struct hfsmount
*hfsmp
, int *err
, struct access_cache
*cache
, HFSCatalogNodeID nodeID
,
1102 struct cnode
*skip_cp
, struct proc
*theProcPtr
, kauth_cred_t myp_ucred
,
1103 struct vfs_context
*my_context
,
1107 uint32_t num_parents
)
1111 HFSCatalogNodeID thisNodeID
;
1112 unsigned int myPerms
;
1113 struct cat_attr cnattr
;
1114 int cache_index
= -1, scope_index
= -1, scope_idx_start
= -1;
1117 int i
= 0, ids_to_cache
= 0;
1118 int parent_ids
[CACHE_LEVELS
];
1120 thisNodeID
= nodeID
;
1121 while (thisNodeID
>= kRootDirID
) {
1122 myResult
= 0; /* default to "no access" */
1124 /* check the cache before resorting to hitting the catalog */
1126 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1127 * to look any further after hitting cached dir */
1129 if (lookup_bucket(cache
, &cache_index
, thisNodeID
)) {
1131 myErr
= cache
->haveaccess
[cache_index
];
1132 if (scope_index
!= -1) {
1133 if (myErr
== ESRCH
) {
1137 scope_index
= 0; // so we'll just use the cache result
1138 scope_idx_start
= ids_to_cache
;
1140 myResult
= (myErr
== 0) ? 1 : 0;
1141 goto ExitThisRoutine
;
1147 tmp
= cache_binSearch(parents
, num_parents
-1, thisNodeID
, NULL
);
1148 if (scope_index
== -1)
1150 if (tmp
!= -1 && scope_idx_start
== -1 && ids_to_cache
< CACHE_LEVELS
) {
1151 scope_idx_start
= ids_to_cache
;
1155 /* remember which parents we want to cache */
1156 if (ids_to_cache
< CACHE_LEVELS
) {
1157 parent_ids
[ids_to_cache
] = thisNodeID
;
1160 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
1161 if (bitmap
&& map_size
) {
1162 bitmap
[(thisNodeID
/8)%(map_size
)]|=(1<<(thisNodeID
&7));
1166 /* do the lookup (checks the cnode hash, then the catalog) */
1167 myErr
= do_attr_lookup(hfsmp
, cache
, thisNodeID
, skip_cp
, &catkey
, &cnattr
);
1169 goto ExitThisRoutine
; /* no access */
1172 /* Root always gets access. */
1173 if (suser(myp_ucred
, NULL
) == 0) {
1174 thisNodeID
= catkey
.hfsPlus
.parentID
;
1179 // if the thing has acl's, do the full permission check
1180 if ((cnattr
.ca_recflags
& kHFSHasSecurityMask
) != 0) {
1183 /* get the vnode for this cnid */
1184 myErr
= hfs_vget(hfsmp
, thisNodeID
, &vp
, 0, 0);
1187 goto ExitThisRoutine
;
1190 thisNodeID
= VTOC(vp
)->c_parentcnid
;
1192 hfs_unlock(VTOC(vp
));
1194 if (vnode_vtype(vp
) == VDIR
) {
1195 myErr
= vnode_authorize(vp
, NULL
, (KAUTH_VNODE_SEARCH
| KAUTH_VNODE_LIST_DIRECTORY
), my_context
);
1197 myErr
= vnode_authorize(vp
, NULL
, KAUTH_VNODE_READ_DATA
, my_context
);
1203 goto ExitThisRoutine
;
1207 int mode
= cnattr
.ca_mode
& S_IFMT
;
1208 myPerms
= DerivePermissionSummary(cnattr
.ca_uid
, cnattr
.ca_gid
, cnattr
.ca_mode
, hfsmp
->hfs_mp
,myp_ucred
, theProcPtr
);
1210 if (mode
== S_IFDIR
) {
1211 flags
= R_OK
| X_OK
;
1215 if ( (myPerms
& flags
) != flags
) {
1218 goto ExitThisRoutine
; /* no access */
1221 /* up the hierarchy we go */
1222 thisNodeID
= catkey
.hfsPlus
.parentID
;
1226 /* if here, we have access to this node */
1230 if (parents
&& myErr
== 0 && scope_index
== -1) {
1239 /* cache the parent directory(ies) */
1240 for (i
= 0; i
< ids_to_cache
; i
++) {
1241 if (myErr
== 0 && parents
&& (scope_idx_start
== -1 || i
> scope_idx_start
)) {
1242 add_node(cache
, -1, parent_ids
[i
], ESRCH
);
1244 add_node(cache
, -1, parent_ids
[i
], myErr
);
1252 do_bulk_access_check(struct hfsmount
*hfsmp
, struct vnode
*vp
,
1253 struct vnop_ioctl_args
*ap
, int arg_size
, vfs_context_t context
)
1258 * NOTE: on entry, the vnode has an io_ref. In case this vnode
1259 * happens to be in our list of file_ids, we'll note it
1260 * avoid calling hfs_chashget_nowait() on that id as that
1261 * will cause a "locking against myself" panic.
1263 Boolean check_leaf
= true;
1265 struct user64_ext_access_t
*user_access_structp
;
1266 struct user64_ext_access_t tmp_user_access
;
1267 struct access_cache cache
;
1269 int error
= 0, prev_parent_check_ok
=1;
1273 unsigned int num_files
= 0;
1275 int num_parents
= 0;
1279 cnid_t
*parents
=NULL
;
1283 cnid_t prevParent_cnid
= 0;
1284 unsigned int myPerms
;
1286 struct cat_attr cnattr
;
1288 struct cnode
*skip_cp
= VTOC(vp
);
1289 kauth_cred_t cred
= vfs_context_ucred(context
);
1290 proc_t p
= vfs_context_proc(context
);
1292 is64bit
= proc_is64bit(p
);
1294 /* initialize the local cache and buffers */
1295 cache
.numcached
= 0;
1296 cache
.cachehits
= 0;
1298 cache
.acache
= NULL
;
1299 cache
.haveaccess
= NULL
;
1301 /* struct copyin done during dispatch... need to copy file_id array separately */
1302 if (ap
->a_data
== NULL
) {
1304 goto err_exit_bulk_access
;
1308 if (arg_size
!= sizeof(struct user64_ext_access_t
)) {
1310 goto err_exit_bulk_access
;
1313 user_access_structp
= (struct user64_ext_access_t
*)ap
->a_data
;
1315 } else if (arg_size
== sizeof(struct user32_access_t
)) {
1316 struct user32_access_t
*accessp
= (struct user32_access_t
*)ap
->a_data
;
1318 // convert an old style bulk-access struct to the new style
1319 tmp_user_access
.flags
= accessp
->flags
;
1320 tmp_user_access
.num_files
= accessp
->num_files
;
1321 tmp_user_access
.map_size
= 0;
1322 tmp_user_access
.file_ids
= CAST_USER_ADDR_T(accessp
->file_ids
);
1323 tmp_user_access
.bitmap
= USER_ADDR_NULL
;
1324 tmp_user_access
.access
= CAST_USER_ADDR_T(accessp
->access
);
1325 tmp_user_access
.num_parents
= 0;
1326 user_access_structp
= &tmp_user_access
;
1328 } else if (arg_size
== sizeof(struct user32_ext_access_t
)) {
1329 struct user32_ext_access_t
*accessp
= (struct user32_ext_access_t
*)ap
->a_data
;
1331 // up-cast from a 32-bit version of the struct
1332 tmp_user_access
.flags
= accessp
->flags
;
1333 tmp_user_access
.num_files
= accessp
->num_files
;
1334 tmp_user_access
.map_size
= accessp
->map_size
;
1335 tmp_user_access
.num_parents
= accessp
->num_parents
;
1337 tmp_user_access
.file_ids
= CAST_USER_ADDR_T(accessp
->file_ids
);
1338 tmp_user_access
.bitmap
= CAST_USER_ADDR_T(accessp
->bitmap
);
1339 tmp_user_access
.access
= CAST_USER_ADDR_T(accessp
->access
);
1340 tmp_user_access
.parents
= CAST_USER_ADDR_T(accessp
->parents
);
1342 user_access_structp
= &tmp_user_access
;
1345 goto err_exit_bulk_access
;
1348 map_size
= user_access_structp
->map_size
;
1350 num_files
= user_access_structp
->num_files
;
1352 num_parents
= user_access_structp
->num_parents
;
1354 if (num_files
< 1) {
1355 goto err_exit_bulk_access
;
1357 if (num_files
> 1024) {
1359 goto err_exit_bulk_access
;
1362 if (num_parents
> 1024) {
1364 goto err_exit_bulk_access
;
1367 file_ids
= (int *) kalloc(sizeof(int) * num_files
);
1368 access
= (short *) kalloc(sizeof(short) * num_files
);
1370 bitmap
= (char *) kalloc(sizeof(char) * map_size
);
1374 parents
= (cnid_t
*) kalloc(sizeof(cnid_t
) * num_parents
);
1377 cache
.acache
= (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES
);
1378 cache
.haveaccess
= (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1380 if (file_ids
== NULL
|| access
== NULL
|| (map_size
!= 0 && bitmap
== NULL
) || cache
.acache
== NULL
|| cache
.haveaccess
== NULL
) {
1382 kfree(file_ids
, sizeof(int) * num_files
);
1385 kfree(bitmap
, sizeof(char) * map_size
);
1388 kfree(access
, sizeof(short) * num_files
);
1391 kfree(cache
.acache
, sizeof(int) * NUM_CACHE_ENTRIES
);
1393 if (cache
.haveaccess
) {
1394 kfree(cache
.haveaccess
, sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1397 kfree(parents
, sizeof(cnid_t
) * num_parents
);
1402 // make sure the bitmap is zero'ed out...
1404 bzero(bitmap
, (sizeof(char) * map_size
));
1407 if ((error
= copyin(user_access_structp
->file_ids
, (caddr_t
)file_ids
,
1408 num_files
* sizeof(int)))) {
1409 goto err_exit_bulk_access
;
1413 if ((error
= copyin(user_access_structp
->parents
, (caddr_t
)parents
,
1414 num_parents
* sizeof(cnid_t
)))) {
1415 goto err_exit_bulk_access
;
1419 flags
= user_access_structp
->flags
;
1420 if ((flags
& (F_OK
| R_OK
| W_OK
| X_OK
)) == 0) {
1424 /* check if we've been passed leaf node ids or parent ids */
1425 if (flags
& PARENT_IDS_FLAG
) {
1429 /* Check access to each file_id passed in */
1430 for (i
= 0; i
< num_files
; i
++) {
1432 cnid
= (cnid_t
) file_ids
[i
];
1434 /* root always has access */
1435 if ((!parents
) && (!suser(cred
, NULL
))) {
1441 /* do the lookup (checks the cnode hash, then the catalog) */
1442 error
= do_attr_lookup(hfsmp
, &cache
, cnid
, skip_cp
, &catkey
, &cnattr
);
1444 access
[i
] = (short) error
;
1449 // Check if the leaf matches one of the parent scopes
1450 leaf_index
= cache_binSearch(parents
, num_parents
-1, cnid
, NULL
);
1451 if (leaf_index
>= 0 && parents
[leaf_index
] == cnid
)
1452 prev_parent_check_ok
= 0;
1453 else if (leaf_index
>= 0)
1454 prev_parent_check_ok
= 1;
1457 // if the thing has acl's, do the full permission check
1458 if ((cnattr
.ca_recflags
& kHFSHasSecurityMask
) != 0) {
1461 /* get the vnode for this cnid */
1462 myErr
= hfs_vget(hfsmp
, cnid
, &cvp
, 0, 0);
1468 hfs_unlock(VTOC(cvp
));
1470 if (vnode_vtype(cvp
) == VDIR
) {
1471 myErr
= vnode_authorize(cvp
, NULL
, (KAUTH_VNODE_SEARCH
| KAUTH_VNODE_LIST_DIRECTORY
), context
);
1473 myErr
= vnode_authorize(cvp
, NULL
, KAUTH_VNODE_READ_DATA
, context
);
1482 /* before calling CheckAccess(), check the target file for read access */
1483 myPerms
= DerivePermissionSummary(cnattr
.ca_uid
, cnattr
.ca_gid
,
1484 cnattr
.ca_mode
, hfsmp
->hfs_mp
, cred
, p
);
1486 /* fail fast if no access */
1487 if ((myPerms
& flags
) == 0) {
1493 /* we were passed an array of parent ids */
1494 catkey
.hfsPlus
.parentID
= cnid
;
1497 /* if the last guy had the same parent and had access, we're done */
1498 if (i
> 0 && catkey
.hfsPlus
.parentID
== prevParent_cnid
&& access
[i
-1] == 0 && prev_parent_check_ok
) {
1504 myaccess
= do_access_check(hfsmp
, &error
, &cache
, catkey
.hfsPlus
.parentID
,
1505 skip_cp
, p
, cred
, context
,bitmap
, map_size
, parents
, num_parents
);
1507 if (myaccess
|| (error
== ESRCH
&& leaf_index
!= -1)) {
1508 access
[i
] = 0; // have access.. no errors to report
1510 access
[i
] = (error
!= 0 ? (short) error
: EACCES
);
1513 prevParent_cnid
= catkey
.hfsPlus
.parentID
;
1516 /* copyout the access array */
1517 if ((error
= copyout((caddr_t
)access
, user_access_structp
->access
,
1518 num_files
* sizeof (short)))) {
1519 goto err_exit_bulk_access
;
1521 if (map_size
&& bitmap
) {
1522 if ((error
= copyout((caddr_t
)bitmap
, user_access_structp
->bitmap
,
1523 map_size
* sizeof (char)))) {
1524 goto err_exit_bulk_access
;
1529 err_exit_bulk_access
:
1532 kfree(file_ids
, sizeof(int) * num_files
);
1534 kfree(parents
, sizeof(cnid_t
) * num_parents
);
1536 kfree(bitmap
, sizeof(char) * map_size
);
1538 kfree(access
, sizeof(short) * num_files
);
1540 kfree(cache
.acache
, sizeof(int) * NUM_CACHE_ENTRIES
);
1541 if (cache
.haveaccess
)
1542 kfree(cache
.haveaccess
, sizeof(unsigned char) * NUM_CACHE_ENTRIES
);
1548 /* end "bulk-access" support */
1552 * Callback for use with freeze ioctl.
1555 hfs_freezewrite_callback(struct vnode
*vp
, __unused
void *cargs
)
1557 vnode_waitforwrites(vp
, 0, 0, 0, "hfs freeze");
1563 * Control filesystem operating characteristics.
1566 hfs_vnop_ioctl( struct vnop_ioctl_args
/* {
1571 vfs_context_t a_context;
1574 struct vnode
* vp
= ap
->a_vp
;
1575 struct hfsmount
*hfsmp
= VTOHFS(vp
);
1576 vfs_context_t context
= ap
->a_context
;
1577 kauth_cred_t cred
= vfs_context_ucred(context
);
1578 proc_t p
= vfs_context_proc(context
);
1579 struct vfsstatfs
*vfsp
;
1581 off_t jnl_start
, jnl_size
;
1582 struct hfs_journal_info
*jip
;
1585 off_t uncompressed_size
= -1;
1586 int decmpfs_error
= 0;
1588 if (ap
->a_command
== F_RDADVISE
) {
1589 /* we need to inspect the decmpfs state of the file as early as possible */
1590 compressed
= hfs_file_is_compressed(VTOC(vp
), 0);
1592 if (VNODE_IS_RSRC(vp
)) {
1593 /* if this is the resource fork, treat it as if it were empty */
1594 uncompressed_size
= 0;
1596 decmpfs_error
= hfs_uncompressed_size_of_compressed_file(NULL
, vp
, 0, &uncompressed_size
, 0);
1597 if (decmpfs_error
!= 0) {
1598 /* failed to get the uncompressed size, we'll check for this later */
1599 uncompressed_size
= -1;
1604 #endif /* HFS_COMPRESSION */
1606 is64bit
= proc_is64bit(p
);
1611 if ((error
= cp_handle_vnop(vp
, CP_WRITE_ACCESS
, 0)) != 0) {
1615 #endif /* CONFIG_PROTECT */
1617 switch (ap
->a_command
) {
1621 struct vnode
*file_vp
;
1628 /* Caller must be owner of file system. */
1629 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1630 if (suser(cred
, NULL
) &&
1631 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1634 /* Target vnode must be file system's root. */
1635 if (!vnode_isvroot(vp
)) {
1638 bufptr
= (char *)ap
->a_data
;
1639 cnid
= strtoul(bufptr
, NULL
, 10);
1640 if (ap
->a_fflag
& HFS_GETPATH_VOLUME_RELATIVE
) {
1641 flags
|= BUILDPATH_VOLUME_RELATIVE
;
1644 /* We need to call hfs_vfs_vget to leverage the code that will
1645 * fix the origin list for us if needed, as opposed to calling
1646 * hfs_vget, since we will need the parent for build_path call.
1649 if ((error
= hfs_vfs_vget(HFSTOVFS(hfsmp
), cnid
, &file_vp
, context
))) {
1652 error
= build_path(file_vp
, bufptr
, sizeof(pathname_t
), &outlen
, flags
, context
);
1658 case HFS_GET_WRITE_GEN_COUNTER
:
1660 struct cnode
*cp
= NULL
;
1662 u_int32_t
*counter
= (u_int32_t
*)ap
->a_data
;
1666 if (!vnode_isdir(vp
) && !(vnode_isreg(vp
)) &&
1667 !(vnode_islnk(vp
))) {
1673 error
= hfs_lock (cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
1675 struct ubc_info
*uip
;
1676 int is_mapped_writable
= 0;
1678 if (UBCINFOEXISTS(vp
)) {
1679 uip
= vp
->v_ubcinfo
;
1680 if ((uip
->ui_flags
& UI_ISMAPPED
) && (uip
->ui_flags
& UI_MAPPEDWRITE
)) {
1681 is_mapped_writable
= 1;
1686 if (S_ISREG(cp
->c_attr
.ca_mode
) || S_ISLNK(cp
->c_attr
.ca_mode
)) {
1687 uint32_t gcount
= hfs_get_gencount(cp
);
1689 // Even though we return EBUSY for files that are mmap'ed
1690 // we also want to bump the value so that the write-gen
1691 // counter will always be different once the file is unmapped
1692 // (since the file may be unmapped but the pageouts have not
1695 if (is_mapped_writable
) {
1696 hfs_incr_gencount (cp
);
1697 gcount
= hfs_get_gencount(cp
);
1701 } else if (S_ISDIR(cp
->c_attr
.ca_mode
)) {
1702 *counter
= hfs_get_gencount(cp
);
1704 /* not a file or dir? silently return */
1709 if (is_mapped_writable
) {
1717 case HFS_GET_DOCUMENT_ID
:
1719 struct cnode
*cp
= NULL
;
1721 u_int32_t
*document_id
= (u_int32_t
*)ap
->a_data
;
1725 if (cp
->c_desc
.cd_cnid
== kHFSRootFolderID
) {
1726 // the root-dir always has document id '2' (aka kHFSRootFolderID)
1727 *document_id
= kHFSRootFolderID
;
1729 } else if ((S_ISDIR(cp
->c_attr
.ca_mode
) || S_ISREG(cp
->c_attr
.ca_mode
) || S_ISLNK(cp
->c_attr
.ca_mode
))) {
1731 uint32_t tmp_doc_id
;
1734 // we can use the FndrExtendedFileInfo because the doc-id is the first
1735 // thing in both it and the FndrExtendedDirInfo struct which is fixed
1736 // in format and can not change layout
1738 struct FndrExtendedFileInfo
*extinfo
= (struct FndrExtendedFileInfo
*)((u_int8_t
*)cp
->c_finderinfo
+ 16);
1740 hfs_lock(cp
, HFS_SHARED_LOCK
, HFS_LOCK_DEFAULT
);
1743 // if the cnode isn't UF_TRACKED and the doc-id-allocate flag isn't set
1744 // then just return a zero for the doc-id
1746 if (!(cp
->c_bsdflags
& UF_TRACKED
) && !(ap
->a_fflag
& HFS_DOCUMENT_ID_ALLOCATE
)) {
1753 // if the cnode isn't UF_TRACKED and the doc-id-allocate flag IS set,
1754 // then set mark_it so we know to set the UF_TRACKED flag once the
1757 if (!(cp
->c_bsdflags
& UF_TRACKED
) && (ap
->a_fflag
& HFS_DOCUMENT_ID_ALLOCATE
)) {
1761 tmp_doc_id
= extinfo
->document_id
; // get a copy of this
1763 hfs_unlock(cp
); // in case we have to call hfs_generate_document_id()
1766 // If the document_id isn't set, get a new one and then set it.
1767 // Note: we first get the document id, then lock the cnode to
1768 // avoid any deadlock potential between cp and the root vnode.
1771 if (tmp_doc_id
== 0 && (error
= hfs_generate_document_id(hfsmp
, &new_id
)) == 0) {
1773 if ((error
= hfs_lock (cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
)) == 0) {
1774 extinfo
->document_id
= tmp_doc_id
= new_id
;
1775 //printf("ASSIGNING: doc-id %d to ino %d\n", extinfo->document_id, cp->c_fileid);
1778 cp
->c_bsdflags
|= UF_TRACKED
;
1781 // mark the cnode dirty
1782 cp
->c_flag
|= C_MODIFIED
| C_FORCEUPDATE
;
1785 if ((error
= hfs_start_transaction(hfsmp
)) == 0) {
1786 lockflags
= hfs_systemfile_lock(hfsmp
, SFL_CATALOG
, HFS_EXCLUSIVE_LOCK
);
1788 (void) cat_update(hfsmp
, &cp
->c_desc
, &cp
->c_attr
, NULL
, NULL
);
1790 hfs_systemfile_unlock (hfsmp
, lockflags
);
1791 (void) hfs_end_transaction(hfsmp
);
1795 add_fsevent(FSE_DOCID_CHANGED
, context
,
1796 FSE_ARG_DEV
, hfsmp
->hfs_raw_dev
,
1797 FSE_ARG_INO
, (ino64_t
)0, // src inode #
1798 FSE_ARG_INO
, (ino64_t
)cp
->c_fileid
, // dst inode #
1799 FSE_ARG_INT32
, extinfo
->document_id
,
1802 hfs_unlock (cp
); // so we can send the STAT_CHANGED event without deadlocking
1804 if (need_fsevent(FSE_STAT_CHANGED
, vp
)) {
1805 add_fsevent(FSE_STAT_CHANGED
, context
, FSE_ARG_VNODE
, vp
, FSE_ARG_DONE
);
1813 *document_id
= tmp_doc_id
;
1821 case HFS_TRANSFER_DOCUMENT_ID
:
1823 struct cnode
*cp
= NULL
;
1825 u_int32_t to_fd
= *(u_int32_t
*)ap
->a_data
;
1826 struct fileproc
*to_fp
;
1827 struct vnode
*to_vp
;
1828 struct cnode
*to_cp
;
1832 if ((error
= fp_getfvp(p
, to_fd
, &to_fp
, &to_vp
)) != 0) {
1833 //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
1836 if ( (error
= vnode_getwithref(to_vp
)) ) {
1841 if (VTOHFS(to_vp
) != hfsmp
) {
1843 goto transfer_cleanup
;
1846 int need_unlock
= 1;
1847 to_cp
= VTOC(to_vp
);
1848 error
= hfs_lockpair(cp
, to_cp
, HFS_EXCLUSIVE_LOCK
);
1850 //printf("could not lock the pair of cnodes (error %d)\n", error);
1851 goto transfer_cleanup
;
1854 if (!(cp
->c_bsdflags
& UF_TRACKED
)) {
1856 } else if (to_cp
->c_bsdflags
& UF_TRACKED
) {
1858 // if the destination is already tracked, return an error
1859 // as otherwise it's a silent deletion of the target's
1863 } else if (S_ISDIR(cp
->c_attr
.ca_mode
) || S_ISREG(cp
->c_attr
.ca_mode
) || S_ISLNK(cp
->c_attr
.ca_mode
)) {
1865 // we can use the FndrExtendedFileInfo because the doc-id is the first
1866 // thing in both it and the ExtendedDirInfo struct which is fixed in
1867 // format and can not change layout
1869 struct FndrExtendedFileInfo
*f_extinfo
= (struct FndrExtendedFileInfo
*)((u_int8_t
*)cp
->c_finderinfo
+ 16);
1870 struct FndrExtendedFileInfo
*to_extinfo
= (struct FndrExtendedFileInfo
*)((u_int8_t
*)to_cp
->c_finderinfo
+ 16);
1872 if (f_extinfo
->document_id
== 0) {
1875 hfs_unlockpair(cp
, to_cp
); // have to unlock to be able to get a new-id
1877 if ((error
= hfs_generate_document_id(hfsmp
, &new_id
)) == 0) {
1879 // re-lock the pair now that we have the document-id
1881 hfs_lockpair(cp
, to_cp
, HFS_EXCLUSIVE_LOCK
);
1882 f_extinfo
->document_id
= new_id
;
1884 goto transfer_cleanup
;
1888 to_extinfo
->document_id
= f_extinfo
->document_id
;
1889 f_extinfo
->document_id
= 0;
1890 //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
1892 // make sure the destination is also UF_TRACKED
1893 to_cp
->c_bsdflags
|= UF_TRACKED
;
1894 cp
->c_bsdflags
&= ~UF_TRACKED
;
1896 // mark the cnodes dirty
1897 cp
->c_flag
|= C_MODIFIED
| C_FORCEUPDATE
;
1898 to_cp
->c_flag
|= C_MODIFIED
| C_FORCEUPDATE
;
1901 if ((error
= hfs_start_transaction(hfsmp
)) == 0) {
1903 lockflags
= hfs_systemfile_lock(hfsmp
, SFL_CATALOG
, HFS_EXCLUSIVE_LOCK
);
1905 (void) cat_update(hfsmp
, &cp
->c_desc
, &cp
->c_attr
, NULL
, NULL
);
1906 (void) cat_update(hfsmp
, &to_cp
->c_desc
, &to_cp
->c_attr
, NULL
, NULL
);
1908 hfs_systemfile_unlock (hfsmp
, lockflags
);
1909 (void) hfs_end_transaction(hfsmp
);
1913 add_fsevent(FSE_DOCID_CHANGED
, context
,
1914 FSE_ARG_DEV
, hfsmp
->hfs_raw_dev
,
1915 FSE_ARG_INO
, (ino64_t
)cp
->c_fileid
, // src inode #
1916 FSE_ARG_INO
, (ino64_t
)to_cp
->c_fileid
, // dst inode #
1917 FSE_ARG_INT32
, to_extinfo
->document_id
,
1920 hfs_unlockpair(cp
, to_cp
); // unlock this so we can send the fsevents
1923 if (need_fsevent(FSE_STAT_CHANGED
, vp
)) {
1924 add_fsevent(FSE_STAT_CHANGED
, context
, FSE_ARG_VNODE
, vp
, FSE_ARG_DONE
);
1926 if (need_fsevent(FSE_STAT_CHANGED
, to_vp
)) {
1927 add_fsevent(FSE_STAT_CHANGED
, context
, FSE_ARG_VNODE
, to_vp
, FSE_ARG_DONE
);
1930 hfs_unlockpair(cp
, to_cp
); // unlock this so we can send the fsevents
1936 hfs_unlockpair(cp
, to_cp
);
1954 /* Caller must be owner of file system. */
1955 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1956 if (suser(cred
, NULL
) &&
1957 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1960 /* Target vnode must be file system's root. */
1961 if (!vnode_isvroot(vp
)) {
1964 linkfileid
= *(cnid_t
*)ap
->a_data
;
1965 if (linkfileid
< kHFSFirstUserCatalogNodeID
) {
1968 if ((error
= hfs_lookup_siblinglinks(hfsmp
, linkfileid
, &prevlinkid
, &nextlinkid
))) {
1971 if (ap
->a_command
== HFS_NEXT_LINK
) {
1972 *(cnid_t
*)ap
->a_data
= nextlinkid
;
1974 *(cnid_t
*)ap
->a_data
= prevlinkid
;
1979 case HFS_RESIZE_PROGRESS
: {
1981 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
1982 if (suser(cred
, NULL
) &&
1983 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
1984 return (EACCES
); /* must be owner of file system */
1986 if (!vnode_isvroot(vp
)) {
1989 /* file system must not be mounted read-only */
1990 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
1994 return hfs_resize_progress(hfsmp
, (u_int32_t
*)ap
->a_data
);
1997 case HFS_RESIZE_VOLUME
: {
2001 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
2002 if (suser(cred
, NULL
) &&
2003 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
2004 return (EACCES
); /* must be owner of file system */
2006 if (!vnode_isvroot(vp
)) {
2010 /* filesystem must not be mounted read only */
2011 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2014 newsize
= *(u_int64_t
*)ap
->a_data
;
2015 cursize
= (u_int64_t
)hfsmp
->totalBlocks
* (u_int64_t
)hfsmp
->blockSize
;
2017 if (newsize
> cursize
) {
2018 return hfs_extendfs(hfsmp
, *(u_int64_t
*)ap
->a_data
, context
);
2019 } else if (newsize
< cursize
) {
2020 return hfs_truncatefs(hfsmp
, *(u_int64_t
*)ap
->a_data
, context
);
2025 case HFS_CHANGE_NEXT_ALLOCATION
: {
2026 int error
= 0; /* Assume success */
2029 if (vnode_vfsisrdonly(vp
)) {
2032 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
2033 if (suser(cred
, NULL
) &&
2034 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
2035 return (EACCES
); /* must be owner of file system */
2037 if (!vnode_isvroot(vp
)) {
2040 hfs_lock_mount(hfsmp
);
2041 location
= *(u_int32_t
*)ap
->a_data
;
2042 if ((location
>= hfsmp
->allocLimit
) &&
2043 (location
!= HFS_NO_UPDATE_NEXT_ALLOCATION
)) {
2045 goto fail_change_next_allocation
;
2047 /* Return previous value. */
2048 *(u_int32_t
*)ap
->a_data
= hfsmp
->nextAllocation
;
2049 if (location
== HFS_NO_UPDATE_NEXT_ALLOCATION
) {
2050 /* On magic value for location, set nextAllocation to next block
2051 * after metadata zone and set flag in mount structure to indicate
2052 * that nextAllocation should not be updated again.
2054 if (hfsmp
->hfs_metazone_end
!= 0) {
2055 HFS_UPDATE_NEXT_ALLOCATION(hfsmp
, hfsmp
->hfs_metazone_end
+ 1);
2057 hfsmp
->hfs_flags
|= HFS_SKIP_UPDATE_NEXT_ALLOCATION
;
2059 hfsmp
->hfs_flags
&= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION
;
2060 HFS_UPDATE_NEXT_ALLOCATION(hfsmp
, location
);
2062 MarkVCBDirty(hfsmp
);
2063 fail_change_next_allocation
:
2064 hfs_unlock_mount(hfsmp
);
2069 case HFS_SETBACKINGSTOREINFO
: {
2070 struct vnode
* bsfs_rootvp
;
2071 struct vnode
* di_vp
;
2072 struct hfs_backingstoreinfo
*bsdata
;
2075 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2078 if (hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) {
2081 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
2082 if (suser(cred
, NULL
) &&
2083 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
2084 return (EACCES
); /* must be owner of file system */
2086 bsdata
= (struct hfs_backingstoreinfo
*)ap
->a_data
;
2087 if (bsdata
== NULL
) {
2090 if ((error
= file_vnode(bsdata
->backingfd
, &di_vp
))) {
2093 if ((error
= vnode_getwithref(di_vp
))) {
2094 file_drop(bsdata
->backingfd
);
2098 if (vnode_mount(vp
) == vnode_mount(di_vp
)) {
2099 (void)vnode_put(di_vp
);
2100 file_drop(bsdata
->backingfd
);
2105 * Obtain the backing fs root vnode and keep a reference
2106 * on it. This reference will be dropped in hfs_unmount.
2108 error
= VFS_ROOT(vnode_mount(di_vp
), &bsfs_rootvp
, NULL
); /* XXX use context! */
2110 (void)vnode_put(di_vp
);
2111 file_drop(bsdata
->backingfd
);
2114 vnode_ref(bsfs_rootvp
);
2115 vnode_put(bsfs_rootvp
);
2117 hfsmp
->hfs_backingfs_rootvp
= bsfs_rootvp
;
2119 hfsmp
->hfs_flags
|= HFS_HAS_SPARSE_DEVICE
;
2120 /* The free extent cache is managed differently for sparse devices.
2121 * There is a window between which the volume is mounted and the
2122 * device is marked as sparse, so the free extent cache for this
2123 * volume is currently initialized as normal volume (sorted by block
2124 * count). Reset the cache so that it will be rebuilt again
2125 * for sparse device (sorted by start block).
2127 ResetVCBFreeExtCache(hfsmp
);
2129 hfsmp
->hfs_sparsebandblks
= bsdata
->bandsize
/ HFSTOVCB(hfsmp
)->blockSize
;
2130 hfsmp
->hfs_sparsebandblks
*= 4;
2132 /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
2135 * If the sparse image is on a sparse image file (as opposed to a sparse
2136 * bundle), then we may need to limit the free space to the maximum size
2137 * of a file on that volume. So we query (using pathconf), and if we get
2138 * a meaningful result, we cache the number of blocks for later use in
2141 hfsmp
->hfs_backingfs_maxblocks
= 0;
2142 if (vnode_vtype(di_vp
) == VREG
) {
2145 terr
= vn_pathconf(di_vp
, _PC_FILESIZEBITS
, &hostbits
, context
);
2146 if (terr
== 0 && hostbits
!= 0 && hostbits
< 64) {
2147 u_int64_t hostfilesizemax
= ((u_int64_t
)1) << hostbits
;
2149 hfsmp
->hfs_backingfs_maxblocks
= hostfilesizemax
/ hfsmp
->blockSize
;
2153 (void)vnode_put(di_vp
);
2154 file_drop(bsdata
->backingfd
);
2157 case HFS_CLRBACKINGSTOREINFO
: {
2158 struct vnode
* tmpvp
;
2160 vfsp
= vfs_statfs(HFSTOVFS(hfsmp
));
2161 if (suser(cred
, NULL
) &&
2162 kauth_cred_getuid(cred
) != vfsp
->f_owner
) {
2163 return (EACCES
); /* must be owner of file system */
2165 if (hfsmp
->hfs_flags
& HFS_READ_ONLY
) {
2169 if ((hfsmp
->hfs_flags
& HFS_HAS_SPARSE_DEVICE
) &&
2170 hfsmp
->hfs_backingfs_rootvp
) {
2172 hfsmp
->hfs_flags
&= ~HFS_HAS_SPARSE_DEVICE
;
2173 tmpvp
= hfsmp
->hfs_backingfs_rootvp
;
2174 hfsmp
->hfs_backingfs_rootvp
= NULLVP
;
2175 hfsmp
->hfs_sparsebandblks
= 0;
2180 #endif /* HFS_SPARSE_DEV */
	/* Change the next CNID stored in the VH */
	case HFS_CHANGE_NEXTCNID: {
		int error = 0;		/* Assume success */

		if (vnode_vfsisrdonly(vp)) {
		vfsp = vfs_statfs(HFSTOVFS(hfsmp));
		if (suser(cred, NULL) &&
		    kauth_cred_getuid(cred) != vfsp->f_owner) {
			return (EACCES); /* must be owner of file system */

		fileid = *(u_int32_t *)ap->a_data;

		/* Must have catalog lock excl. to advance the CNID pointer */
		lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);

		hfs_lock_mount(hfsmp);

		/* If it is less than the current next CNID, force the wraparound bit to be set */
		if (fileid < hfsmp->vcbNxtCNID) {

		/* Return previous value. */
		*(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;

		hfsmp->vcbNxtCNID = fileid;

		hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;

		MarkVCBDirty(hfsmp);
		hfs_unlock_mount(hfsmp);
		hfs_systemfile_unlock (hfsmp, lockflags);
		mp = vnode_mount(vp);
		hfsmp = VFSTOHFS(mp);

		vfsp = vfs_statfs(mp);

		if (kauth_cred_getuid(cred) != vfsp->f_owner &&
		    !kauth_cred_issuser(cred))

		lck_rw_lock_exclusive(&hfsmp->hfs_insync);

		// flush things before we get started to try and prevent
		// dirty data from being paged out while we're frozen.
		// note: can't do this after taking the lock as it will
		// deadlock against ourselves.
		vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);
		hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);

		// DO NOT call hfs_journal_flush() because that takes a
		// shared lock on the global exclusive lock!
		journal_flush(hfsmp->jnl, TRUE);

		// don't need to iterate on all vnodes, we just need to
		// wait for writes to the system files and the device vnode
		//
		// Now that journal flush waits for all metadata blocks to
		// be written out, waiting for btree writes is probably no
		if (HFSTOVCB(hfsmp)->extentsRefNum)
			vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze");
		if (HFSTOVCB(hfsmp)->catalogRefNum)
			vnode_waitforwrites(HFSTOVCB(hfsmp)->catalogRefNum, 0, 0, 0, "hfs freeze");
		if (HFSTOVCB(hfsmp)->allocationsRefNum)
			vnode_waitforwrites(HFSTOVCB(hfsmp)->allocationsRefNum, 0, 0, 0, "hfs freeze");
		if (hfsmp->hfs_attribute_vp)
			vnode_waitforwrites(hfsmp->hfs_attribute_vp, 0, 0, 0, "hfs freeze");
		vnode_waitforwrites(hfsmp->hfs_devvp, 0, 0, 0, "hfs freeze");

		hfsmp->hfs_freezing_proc = current_proc();

		vfsp = vfs_statfs(vnode_mount(vp));
		if (kauth_cred_getuid(cred) != vfsp->f_owner &&
		    !kauth_cred_issuser(cred))

		// if we're not the one who froze the fs then we
		if (hfsmp->hfs_freezing_proc != current_proc()) {

		// NOTE: if you add code here, also go check the
		// code that "thaws" the fs in hfs_vnop_close()
		hfsmp->hfs_freezing_proc = NULL;
		hfs_unlock_global (hfsmp);
		lck_rw_unlock_exclusive(&hfsmp->hfs_insync);
	case HFS_BULKACCESS_FSCTL: {
		if (hfsmp->hfs_flags & HFS_STANDARD) {
			size = sizeof(struct user64_access_t);
			size = sizeof(struct user32_access_t);
		return do_bulk_access_check(hfsmp, vp, ap, size, context);

	case HFS_EXT_BULKACCESS_FSCTL: {
		if (hfsmp->hfs_flags & HFS_STANDARD) {
			size = sizeof(struct user64_ext_access_t);
			size = sizeof(struct user32_ext_access_t);
		return do_bulk_access_check(hfsmp, vp, ap, size, context);
	case HFS_SET_XATTREXTENTS_STATE: {
		if (ap->a_data == NULL) {

		state = *(int *)ap->a_data;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {

		/* Super-user can enable or disable extent-based extended
		 * attribute support on a volume
		 * Note: Starting Mac OS X 10.7, extent-based extended attributes
		 * are enabled by default, so any change will be transient only
		 * till the volume is remounted.
		 */
		if (!kauth_cred_issuser(kauth_cred_get())) {
		if (state == 0 || state == 1)
			return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
	case F_SETSTATICCONTENT: {
		int enable_static = 0;
		struct cnode *cp = NULL;
		/*
		 * lock the cnode, decorate the cnode flag, and bail out.
		 * VFS should have already authenticated the caller for us.
		 */

		/*
		 * Note that even though ap->a_data is of type caddr_t,
		 * the fcntl layer at the syscall handler will pass in NULL
		 * or 1 depending on what the argument supplied to the fcntl
		 * was.  So it is in fact correct to check the ap->a_data
		 * argument for zero or non-zero value when deciding whether or not
		 * to enable the static bit in the cnode.
		 */

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		if (enable_static) {
			cp->c_flag |= C_SSD_STATIC;
			cp->c_flag &= ~C_SSD_STATIC;

	case F_SET_GREEDY_MODE: {
		int enable_greedy_mode = 0;
		struct cnode *cp = NULL;
		/*
		 * lock the cnode, decorate the cnode flag, and bail out.
		 * VFS should have already authenticated the caller for us.
		 */

		/*
		 * Note that even though ap->a_data is of type caddr_t,
		 * the fcntl layer at the syscall handler will pass in NULL
		 * or 1 depending on what the argument supplied to the fcntl
		 * was.  So it is in fact correct to check the ap->a_data
		 * argument for zero or non-zero value when deciding whether or not
		 * to enable the greedy mode bit in the cnode.
		 */
		enable_greedy_mode = 1;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		if (enable_greedy_mode) {
			cp->c_flag |= C_SSD_GREEDY_MODE;
			cp->c_flag &= ~C_SSD_GREEDY_MODE;
	case F_MAKECOMPRESSED: {
		uint32_t gen_counter;
		struct cnode *cp = NULL;
		int reset_decmp = 0;

		if (hfsmp->hfs_flags & HFS_READ_ONLY) {

		/*
		 * acquire & lock the cnode.
		 * VFS should have already authenticated the caller for us.
		 */

		/*
		 * Cast the pointer into a uint32_t so we can extract the
		 * supplied generation counter.
		 */
		gen_counter = *((uint32_t*)ap->a_data);

		/* Grab truncate lock first; we may truncate the file */
		hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);

		error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);

		/* Are there any other usecounts/FDs? */
		if (vnode_isinuse(vp, 1)) {
			hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);

		/* now we have the cnode locked down; Validate arguments */
		if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
			/* EINVAL if you are trying to manipulate an IMMUTABLE file */
			hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);

		if ((hfs_get_gencount (cp)) == gen_counter) {
			/*
			 * OK, the gen_counter matched.  Go for it:
			 * Toggle state bits, truncate file, and suppress mtime update
			 */
			cp->c_bsdflags |= UF_COMPRESSED;

			error = hfs_truncate(vp, 0, IO_NDELAY, 0, (HFS_TRUNCATE_SKIPTIMES), ap->a_context);

		/* Unlock cnode before executing decmpfs ; they may need to get an EA */

		/*
		 * Reset the decmp state while still holding the truncate lock.  We need to
		 * serialize here against a listxattr on this node which may occur at any
		 *
		 * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed,
		 * that will still potentially require getting the com.apple.decmpfs EA.  If the
		 * EA is required, then we can't hold the cnode lock, because the getxattr call is
		 * generic (through VFS), and can't pass along any info telling it that we're already
		 * holding it (the lock).  If we don't serialize, then we risk listxattr stopping
		 * and trying to fill in the hfs_file_is_compressed info during the callback
		 * operation, which will result in deadlock against the b-tree node.
		 *
		 * So, to serialize against listxattr (which will grab buf_t meta references on
		 * the b-tree blocks), we hold the truncate lock as we're manipulating the
		 */
		if ((reset_decmp) && (error == 0)) {
			decmpfs_cnode *dp = VTOCMP (vp);
				decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);

			/* Initialize the decmpfs node as needed */
			(void) hfs_file_is_compressed (cp, 0); /* ok to take lock */

		hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
	case F_SETBACKINGSTORE: {
		/*
		 * See comment in F_SETSTATICCONTENT re: using
		 * a null check for a_data
		 */
			error = hfs_set_backingstore (vp, 1);
			error = hfs_set_backingstore (vp, 0);

	case F_GETPATH_MTMINFO: {
		int *data = (int*) ap->a_data;

		/* Ask if this is a backingstore vnode */
		error = hfs_is_backingstore (vp, data);
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {

		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		error = hfs_fsync(vp, MNT_WAIT, TRUE, p);
		hfs_unlock(VTOC(vp));

		register struct cnode *cp;

		if (!vnode_isreg(vp))

		error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
		/*
		 * used by regression test to determine if
		 * all the dirty pages (via write) have been cleaned
		 * after a call to 'fsync'.
		 */
		error = is_file_clean(vp, VTOF(vp)->ff_size);

		register struct radvisory *ra;
		struct filefork *fp;

		if (!vnode_isreg(vp))

		ra = (struct radvisory *)(ap->a_data);

		/* Protect against a size change. */
		hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);

		if (compressed && (uncompressed_size == -1)) {
			/* fetching the uncompressed size failed above, so return the error */
			error = decmpfs_error;
		} else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
			   (!compressed && (ra->ra_offset >= fp->ff_size))) {
#else /* HFS_COMPRESSION */
		if (ra->ra_offset >= fp->ff_size) {
#endif /* HFS_COMPRESSION */

		error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);

		hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
	case _IOC(IOC_OUT,'h', 4, 0):     /* Create date in local time */
			*(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
			*(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));

	case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
		*(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;

	case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
		*(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;

	case HFS_FSCTL_GET_VERY_LOW_DISK:
		*(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;

	case HFS_FSCTL_SET_VERY_LOW_DISK:
		if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
		hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;

	case HFS_FSCTL_GET_LOW_DISK:
		*(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;

	case HFS_FSCTL_SET_LOW_DISK:
		if (   *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
		    || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
		hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;

	case HFS_FSCTL_GET_DESIRED_DISK:
		*(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;

	case HFS_FSCTL_SET_DESIRED_DISK:
		if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
		hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;

	case HFS_VOLUME_STATUS:
		*(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;

	case HFS_SET_BOOT_INFO:
		if (!vnode_isvroot(vp))
		if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
			return(EACCES);	/* must be superuser or owner of filesystem */
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
		hfs_lock_mount (hfsmp);
		bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
		hfs_unlock_mount (hfsmp);
		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);

	case HFS_GET_BOOT_INFO:
		if (!vnode_isvroot(vp))
		hfs_lock_mount (hfsmp);
		bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
		hfs_unlock_mount(hfsmp);

	case HFS_MARK_BOOT_CORRUPT:
		/* Mark the boot volume corrupt by setting
		 * kHFSVolumeInconsistentBit in the volume header.  This will
		 * force fsck_hfs on next mount.
		 */
		if (!kauth_cred_issuser(kauth_cred_get())) {

		/* Allowed only on the root vnode of the boot volume */
		if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
		    !vnode_isvroot(vp)) {
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
		printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
		hfs_mark_volume_inconsistent(hfsmp);

	case HFS_FSCTL_GET_JOURNAL_INFO:
		jip = (struct hfs_journal_info*)ap->a_data;

		if (hfsmp->jnl == NULL) {
		jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
		jnl_size = (off_t)hfsmp->jnl_size;

		jip->jstart = jnl_start;
		jip->jsize = jnl_size;

	case HFS_SET_ALWAYS_ZEROFILL: {
		struct cnode *cp = VTOC(vp);

		if (*(int *)ap->a_data) {
			cp->c_flag |= C_ALWAYS_ZEROFILL;
			cp->c_flag &= ~C_ALWAYS_ZEROFILL;

	case HFS_DISABLE_METAZONE: {
		/* Only root can disable metadata zone */
		if (!kauth_cred_issuser(kauth_cred_get())) {
		if (hfsmp->hfs_flags & HFS_READ_ONLY) {

		/* Disable metadata zone now */
		(void) hfs_metadatazone_init(hfsmp, true);
		printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
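	/*
	 * Illustrative note (not from the original source): the
	 * HFS_FSCTL_SET_*_DISK cases above keep the three free-space
	 * notification thresholds ordered as
	 *     hfs_freespace_notify_dangerlimit
	 *         < hfs_freespace_notify_warninglimit
	 *         < hfs_freespace_notify_desiredlevel,
	 * so a request that would invert that ordering is rejected.
	 */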
hfs_vnop_select(__unused struct vnop_select_args *ap)
/*
	struct vnop_select_args {
		vfs_context_t a_context;
	};
*/
	/*
	 * We should really check to see if I/O is possible.
	 */
/*
 * Converts a logical block number to a physical block, and optionally returns
 * the amount of remaining blocks in a run.  The logical block is based on hfsNode.logBlockSize.
 * The physical block number is based on the device block size, currently it's 512.
 * The block run is returned in logical blocks, and is the REMAINING amount of blocks
 */
hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
	struct filefork *fp = VTOF(vp);
	struct hfsmount *hfsmp = VTOHFS(vp);
	int retval = E_NONE;
	u_int32_t logBlockSize;
	size_t bytesContAvail = 0;
	off_t blockposition;

	/*
	 * Check for underlying vnode requests and ensure that logical
	 * to physical mapping is requested.
	 */
		*vpp = hfsmp->hfs_devvp;

	logBlockSize = GetLogicalBlockSize(vp);
	blockposition = (off_t)bn * logBlockSize;

	lockExtBtree = overflow_extents(fp);
		lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);

	retval = MacToVFSError(
			MapFileBlockC (HFSTOVCB(hfsmp),
		hfs_systemfile_unlock(hfsmp, lockflags);

	if (retval == E_NONE) {
		/* Figure out how many read ahead blocks there are */
			if (can_cluster(logBlockSize)) {
				/* Make sure this result never goes negative: */
				*runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
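	/*
	 * Illustrative note (not from the original source): with a logical
	 * block size of 4096 bytes and bytesContAvail == 65536, the run
	 * reported to the caller above would be (65536 / 4096) - 1 = 15
	 * additional contiguous logical blocks beyond the one being mapped.
	 */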
/*
 * Convert logical block number to file offset.
 */
hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
/*
	struct vnop_blktooff_args {
	};
*/
	if (ap->a_vp == NULL)
	*ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);

/*
 * Convert file offset to logical block number.
 */
hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
/*
	struct vnop_offtoblk_args {
		daddr64_t *a_lblkno;
	};
*/
	if (ap->a_vp == NULL)
	*ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
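/*
 * Illustrative note (not from the original source): the two conversions
 * above are inverses with respect to the vnode's logical block size.  For
 * example, with GetLogicalBlockSize(vp) == 4096, logical block 3 maps to
 * byte offset 3 * 4096 = 12288, and offset 12288 maps back to block 3.
 */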
/*
 * Map file offset to physical block number.
 *
 * If this function is called for write operation, and if the file
 * had virtual blocks allocated (delayed allocation), real blocks
 * are allocated by calling ExtendFileC().
 *
 * If this function is called for read operation, and if the file
 * had virtual blocks allocated (delayed allocation), no change
 * to the size of file is done, and if required, rangelist is
 * searched for mapping.
 *
 * System file cnodes are expected to be locked (shared or exclusive).
 */
hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
/*
	struct vnop_blockmap_args {
		vfs_context_t a_context;
	};
*/
	struct vnode *vp = ap->a_vp;
	struct filefork *fp;
	struct hfsmount *hfsmp;
	size_t bytesContAvail = 0;
	int retval = E_NONE;
	struct rl_entry *invalid_range;
	enum rl_overlaptype overlaptype;

	if (VNODE_IS_RSRC(vp)) {
		/* allow blockmaps to the resource fork */
	if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
		int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
			case FILE_IS_COMPRESSED:
			case FILE_IS_CONVERTING:
				/* if FILE_IS_CONVERTING, we allow blockmap */
				printf("invalid state %d for compressed file\n", state);
#endif /* HFS_COMPRESSION */

	/* Do not allow blockmap operation on a directory */
	if (vnode_isdir(vp)) {

	/*
	 * Check for underlying vnode requests and ensure that logical
	 * to physical mapping is requested.
	 */
	if (ap->a_bpn == NULL)

	if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
		if (VTOC(vp)->c_lockowner != current_thread()) {
			hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);

	/* Check virtual blocks only when performing write operation */
	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
		if (hfs_start_transaction(hfsmp) != 0) {
		syslocks = SFL_EXTENTS | SFL_BITMAP;
	} else if (overflow_extents(fp)) {
		syslocks = SFL_EXTENTS;

		lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);

	/*
	 * Check for any delayed allocations.
	 */
	if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
		u_int32_t loanedBlocks;

		//
		// Make sure we have a transaction.  It's possible
		// that we came in and fp->ff_unallocblocks was zero
		// but during the time we blocked acquiring the extents
		// btree, ff_unallocblocks became non-zero and so we
		// will need to start a transaction.
		//
		if (started_tr == 0) {
				hfs_systemfile_unlock(hfsmp, lockflags);

		/*
		 * Note: ExtendFileC will Release any blocks on loan and
		 * acquire real blocks.  So we ask to extend by zero bytes
		 * since ExtendFileC will account for the virtual blocks.
		 */
		loanedBlocks = fp->ff_unallocblocks;
		retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
				     kEFAllMask | kEFNoClumpMask, &actbytes);

			fp->ff_unallocblocks = loanedBlocks;
			cp->c_blocks += loanedBlocks;
			fp->ff_blocks += loanedBlocks;

			hfs_lock_mount (hfsmp);
			hfsmp->loanedBlocks += loanedBlocks;
			hfs_unlock_mount (hfsmp);

			hfs_systemfile_unlock(hfsmp, lockflags);
			cp->c_flag |= C_MODIFIED;
				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

				hfs_end_transaction(hfsmp);

	retval = MapFileBlockC(hfsmp, (FCB *)fp, ap->a_size, ap->a_foffset,
			       ap->a_bpn, &bytesContAvail);

		hfs_systemfile_unlock(hfsmp, lockflags);

			(void) hfs_update(vp, TRUE);
			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
			hfs_end_transaction(hfsmp);

		/* On write, always return error because virtual blocks, if any,
		 * should have been allocated in ExtendFileC().  We do not
		 * allocate virtual blocks on read, therefore return error
		 * only if no virtual blocks are allocated.  Otherwise we search
		 * rangelist for zero-fills
		 */
		if ((MacToVFSError(retval) != ERANGE) ||
		    (ap->a_flags & VNODE_WRITE) ||
		    ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {

		/* Validate if the start offset is within logical file size */
		if (ap->a_foffset >= fp->ff_size) {

		/*
		 * At this point, we have encountered a failure during
		 * MapFileBlockC that resulted in ERANGE, and we are not servicing
		 * a write, and there are borrowed blocks.
		 *
		 * However, the cluster layer will not call blockmap for
		 * blocks that are borrowed and in-cache.  We have to assume that
		 * because we observed ERANGE being emitted from MapFileBlockC, this
		 * extent range is not valid on-disk.  So we treat this as a
		 * mapping that needs to be zero-filled prior to reading.
		 *
		 * Note that under certain circumstances (such as non-contiguous
		 * userland VM mappings in the calling process), cluster_io
		 * may be forced to split a large I/O driven by hfs_vnop_write
		 * into multiple sub-I/Os that necessitate a RMW cycle.  If this is
		 * the case here, then we have already removed the invalid range list
		 * mapping prior to getting to this blockmap call, so we should not
		 * search the invalid rangelist for this byte range.
		 */

		bytesContAvail = fp->ff_size - ap->a_foffset;
		/*
		 * Clip the contiguous available bytes to, at most, the allowable
		 * maximum or the amount requested.
		 */
		if (bytesContAvail > ap->a_size) {
			bytesContAvail = ap->a_size;

		*ap->a_bpn = (daddr64_t) -1;

	/* MapFileC() found a valid extent in the filefork.  Search the
	 * mapping information further for invalid file ranges
	 */
	overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
			      ap->a_foffset + (off_t)bytesContAvail - 1,
	if (overlaptype != RL_NOOVERLAP) {
		switch(overlaptype) {
		case RL_MATCHINGOVERLAP:
		case RL_OVERLAPCONTAINSRANGE:
		case RL_OVERLAPSTARTSBEFORE:
			/* There's no valid block for this byte offset */
			*ap->a_bpn = (daddr64_t)-1;
			/* There's no point limiting the amount to be returned
			 * if the invalid range that was hit extends all the way
			 * to the EOF (i.e. there's no valid bytes between the
			 * end of this range and the file's EOF):
			 */
			if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
			    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
				bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;

		case RL_OVERLAPISCONTAINED:
		case RL_OVERLAPENDSAFTER:
			/* The range of interest hits an invalid block before the end: */
			if (invalid_range->rl_start == ap->a_foffset) {
				/* There's actually no valid information to be had starting here: */
				*ap->a_bpn = (daddr64_t)-1;
				if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
				    ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
					bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
				bytesContAvail = invalid_range->rl_start - ap->a_foffset;

	if (bytesContAvail > ap->a_size)
		bytesContAvail = ap->a_size;

		*ap->a_run = bytesContAvail;
		*(int *)ap->a_poff = 0;

	return (MacToVFSError(retval));
/*
 * prepare and issue the I/O
 * buf_strategy knows how to deal
 * with requests that require
 */
hfs_vnop_strategy(struct vnop_strategy_args *ap)
	buf_t	bp = ap->a_bp;
	vnode_t	vp = buf_vnode(bp);

	/* Mark buffer as containing static data if cnode flag set */
	if (VTOC(vp)->c_flag & C_SSD_STATIC) {

	/* Mark buffer as containing greedy-mode data if cnode flag set */
	if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
		bufattr_markgreedymode((bufattr_t)(&bp->b_attr));

	if ((cp = cp_get_protected_cnode(vp)) != NULL) {
		/*
		 * We rely upon the truncate lock to protect the
		 * CP cache key from getting tossed prior to our IO finishing here.
		 * Nearly all cluster io calls to manipulate file payload from HFS
		 * take the truncate lock before calling into the cluster
		 * layer to ensure the file size does not change, or that they
		 * have exclusive right to change the EOF of the file.
		 * That same guarantee protects us here since the code that
		 * deals with CP lock events must now take the truncate lock
		 * before doing anything.
		 *
		 * There is 1 exception here:
		 * 1) One exception should be the VM swapfile IO, because HFS will
		 * funnel the VNOP_PAGEOUT directly into a cluster_pageout call for the
		 * swapfile code only without holding the truncate lock.  This is because
		 * individual swapfiles are maintained at fixed-length sizes by the VM code.
		 * In non-swapfile IO we use PAGEOUT_V2 semantics which allow us to
		 * create our own UPL and thus take the truncate lock before calling
		 * into the cluster layer.  In that case, however, we are not concerned
		 * with the CP blob being wiped out in the middle of the IO
		 * because there isn't anything to toss; the VM swapfile key stays
		 * in-core as long as the file is open.
		 *
		 * For filesystem resize, we may not have access to the underlying
		 * file's cache key for whatever reason (device may be locked).  However,
		 * we do not need it since we are going to use the temporary HFS-wide resize key
		 * which is generated once we start relocating file content.  If this file's I/O
		 * should be done using the resize key, it will have been supplied already, so
		 * do not attach the file's cp blob to the buffer.
		 */
		if ((cp->c_cpentry->cp_flags & CP_RELOCATION_INFLIGHT) == 0) {
			buf_setcpaddr(bp, cp->c_cpentry);
#endif /* CONFIG_PROTECT */

	error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
hfs_minorupdate(struct vnode *vp) {
	struct cnode *cp = VTOC(vp);
	cp->c_flag &= ~C_MODIFIED;
	cp->c_touch_acctime = 0;
	cp->c_touch_chgtime = 0;
	cp->c_touch_modtime = 0;
do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
	register struct cnode *cp = VTOC(vp);
	struct filefork *fp = VTOF(vp);
	struct proc *p = vfs_context_proc(context);
	kauth_cred_t cred = vfs_context_ucred(context);
	off_t actualBytesAdded;
	u_int32_t fileblocks;
	struct hfsmount *hfsmp;
	int skipupdate = (truncateflags & HFS_TRUNCATE_SKIPUPDATE);
	int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_START,
		     (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);

	/* This should only happen with a corrupt filesystem */
	if ((off_t)fp->ff_size < 0)

	if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))

	/* Files that are changing size are not hot file candidates. */
	if (hfsmp->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;

	/*
	 * We cannot just check if fp->ff_size == length (as an optimization)
	 * since there may be extra physical blocks that also need truncation.
	 */
	if ((retval = hfs_getinoquota(cp)))

	/*
	 * Lengthen the size of the file.  We must ensure that the
	 * last byte of the file is allocated.  Since the smallest
	 * value of ff_size is 0, length will be at least 1.
	 */
	if (length > (off_t)fp->ff_size) {
		retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),

		/*
		 * If we don't have enough physical space then
		 * we need to extend the physical size.
		 */
		if (length > filebytes) {
			u_int32_t blockHint = 0;

			/* All or nothing and don't round up to clumpsize. */
			eflags = kEFAllMask | kEFNoClumpMask;

			if (cred && suser(cred, NULL) != 0)
				eflags |= kEFReserveMask;  /* keep a reserve */

			/*
			 * Allocate Journal and Quota files in metadata zone.
			 */
			if (filebytes == 0 &&
			    hfsmp->hfs_flags & HFS_METADATA_ZONE &&
			    hfs_virtualmetafile(cp)) {
				eflags |= kEFMetadataMask;
				blockHint = hfsmp->hfs_metazone_start;
			if (hfs_start_transaction(hfsmp) != 0) {

			/* Protect extents b-tree and allocation bitmap */
			lockflags = SFL_BITMAP;
			if (overflow_extents(fp))
				lockflags |= SFL_EXTENTS;
			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

			while ((length > filebytes) && (retval == E_NONE)) {
				bytesToAdd = length - filebytes;
				retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
								   &actualBytesAdded));

				filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
				if (actualBytesAdded == 0 && retval == E_NONE) {
					if (length > filebytes)

			hfs_systemfile_unlock(hfsmp, lockflags);

				(void) hfs_minorupdate(vp);
				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			hfs_end_transaction(hfsmp);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
			     (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);

		if (!(flags & IO_NOZEROFILL)) {
			if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
				struct rl_entry *invalid_range;

				zero_limit = (fp->ff_size + (PAGE_SIZE_64 - 1)) & ~PAGE_MASK_64;
				if (length < zero_limit) zero_limit = length;

				if (length > (off_t)fp->ff_size) {
					/* Extending the file: time to fill out the current last page w. zeroes? */
					if ((fp->ff_size & PAGE_MASK_64) &&
					    (rl_scan(&fp->ff_invalidranges, fp->ff_size & ~PAGE_MASK_64,
						     fp->ff_size - 1, &invalid_range) == RL_NOOVERLAP)) {

						/* There's some valid data at the start of the (current) last page
						   of the file, so zero out the remainder of that page to ensure the
						   entire page contains valid data.  Since there is no invalid range
						   possible past the (current) eof, there's no need to remove anything
						   from the invalid range list before calling cluster_write(): */
						retval = cluster_write(vp, (struct uio *) 0, fp->ff_size, zero_limit,
								fp->ff_size, (off_t)0,
								(flags & IO_SYNC) | IO_HEADZEROFILL | IO_NOZERODIRTY);
						hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
						if (retval) goto Err_Exit;

						/* Merely invalidate the remaining area, if necessary: */
						if (length > zero_limit) {
							rl_add(zero_limit, length - 1, &fp->ff_invalidranges);
							cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
					} else {
						/* The page containing the (current) eof is invalid: just add the
						   remainder of the page to the invalid list, along with the area
						   being newly allocated:
						 */
						rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
						cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
			} else {
				panic("hfs_truncate: invoked on non-UBC object?!");

		if (suppress_times == 0) {
			cp->c_touch_modtime = TRUE;
		fp->ff_size = length;

	} else { /* Shorten the size of the file */

		if ((off_t)fp->ff_size > length) {
			/* Any space previously marked as invalid is now irrelevant: */
			rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);

		/*
		 * Account for any unmapped blocks.  Note that the new
		 * file length can still end up with unmapped blocks.
		 */
		if (fp->ff_unallocblocks > 0) {
			u_int32_t finalblks;
			u_int32_t loanedBlocks;

			hfs_lock_mount(hfsmp);
			loanedBlocks = fp->ff_unallocblocks;
			cp->c_blocks -= loanedBlocks;
			fp->ff_blocks -= loanedBlocks;
			fp->ff_unallocblocks = 0;

			hfsmp->loanedBlocks -= loanedBlocks;

			finalblks = (length + blksize - 1) / blksize;
			if (finalblks > fp->ff_blocks) {
				/* calculate required unmapped blocks */
				loanedBlocks = finalblks - fp->ff_blocks;
				hfsmp->loanedBlocks += loanedBlocks;

				fp->ff_unallocblocks = loanedBlocks;
				cp->c_blocks += loanedBlocks;
				fp->ff_blocks += loanedBlocks;
			hfs_unlock_mount (hfsmp);

		/*
		 * For a TBE process the deallocation of the file blocks is
		 * delayed until the file is closed.  And hfs_close calls
		 * truncate with the IO_NDELAY flag set.  So when IO_NDELAY
		 * isn't set, we make sure this isn't a TBE process.
		 */
		if ((flags & IO_NDELAY) || (proc_tbe(p) == 0)) {
			off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);

			if (hfs_start_transaction(hfsmp) != 0) {

			if (fp->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(fp))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
								     FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			fp->ff_size = length;

				(void) hfs_minorupdate(vp);
				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			hfs_end_transaction(hfsmp);

			filebytes = (off_t)fp->ff_blocks * (off_t)blksize;

			/* These are bytesreleased */
			(void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);

		/*
		 * Only set update flag if the logical length changes & we aren't
		 * suppressing modtime updates.
		 */
		if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
			cp->c_touch_modtime = TRUE;
		fp->ff_size = length;

	if (cp->c_mode & (S_ISUID | S_ISGID)) {
		if (!vfs_context_issuser(context)) {
			cp->c_mode &= ~(S_ISUID | S_ISGID);

		retval = hfs_minorupdate(vp);

		cp->c_touch_chgtime = TRUE;	/* status changed */
		if (suppress_times == 0) {
			cp->c_touch_modtime = TRUE;	/* file data was modified */

			/*
			 * If we are not suppressing the modtime update, then
			 * update the gen count as well.
			 */
			if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
				hfs_incr_gencount(cp);

	retval = hfs_update(vp, MNT_WAIT);
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_NONE,
			     -1, -1, -1, retval, 0);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 7)) | DBG_FUNC_END,
		     (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
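/*
 * Illustrative note (not from the original source): when shrinking a file
 * that still has loaned (delayed-allocation) blocks, the code above first
 * returns every loaned block to the mount, then re-borrows only what the new
 * length still needs.  For example, with blksize == 4096 and length == 10000,
 *     finalblks = (10000 + 4096 - 1) / 4096 = 3,
 * so if ff_blocks dropped to 2 after the give-back, exactly one block would
 * be re-loaned.  The values are examples only.
 */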
/*
 * Preparation which must be done prior to deleting the catalog record
 * of a file or directory.  In order to make the on-disk as safe as possible,
 * we remove the catalog entry before releasing the bitmap blocks and the
 * overflow extent records.  However, some work must be done prior to deleting
 * the catalog record.
 *
 * When calling this function, the cnode must exist both in memory and on-disk.
 * If there are both resource fork and data fork vnodes, this function should
 * be called on both.
 */
hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
	struct filefork *fp = VTOF(vp);
	struct cnode *cp = VTOC(vp);

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {

	/*
	 * See the comment below in hfs_truncate for why we need to call
	 * setsize here.  Essentially we want to avoid pending IO if we
	 * already know that the blocks are going to be released here.
	 * This function is only called when totally removing all storage for a file, so
	 * we can take a shortcut and immediately setsize (0);
	 */

	/* This should only happen with a corrupt filesystem */
	if ((off_t)fp->ff_size < 0)

	/*
	 * We cannot just check if fp->ff_size == length (as an optimization)
	 * since there may be extra physical blocks that also need truncation.
	 */
	if ((retval = hfs_getinoquota(cp))) {

	/* Wipe out any invalid ranges which have yet to be backed by disk */
	rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);

	/*
	 * Account for any unmapped blocks.  Since we're deleting the
	 * entire file, we don't have to worry about just shrinking
	 * to a smaller number of borrowed blocks.
	 */
	if (fp->ff_unallocblocks > 0) {
		u_int32_t loanedBlocks;

		hfs_lock_mount (hfsmp);
		loanedBlocks = fp->ff_unallocblocks;
		cp->c_blocks -= loanedBlocks;
		fp->ff_blocks -= loanedBlocks;
		fp->ff_unallocblocks = 0;

		hfsmp->loanedBlocks -= loanedBlocks;

		hfs_unlock_mount (hfsmp);
/*
 * Special wrapper around calling TruncateFileC.  This function is useable
 * even when the catalog record does not exist any longer, making it ideal
 * for use when deleting a file.  The simplification here is that we know
 * that we are releasing all blocks.
 *
 * Note that this function may be called when there is no vnode backing
 * the file fork in question.  We may call this from hfs_vnop_inactive
 * to clear out resource fork data (and may not want to clear out the data
 * fork yet).  As a result, we pointer-check both sets of inputs before
 * doing anything with them.
 *
 * The caller is responsible for saving off a copy of the filefork(s)
 * embedded within the cnode prior to calling this function.  The pointers
 * supplied as arguments must be valid even if the cnode is no longer valid.
 */
hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
		     struct filefork *rsrcfork, u_int32_t fileid) {
	u_int32_t fileblocks;

	blksize = hfsmp->blockSize;

	if ((datafork != NULL) && (datafork->ff_blocks > 0)) {
		fileblocks = datafork->ff_blocks;
		filebytes = (off_t)fileblocks * (off_t)blksize;

		/* We killed invalid ranges and loaned blocks before we removed the catalog entry */

		while (filebytes > 0) {
			if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(datafork)) {
				filebytes -= HFS_BIGFILE_SIZE;

			/* Start a transaction, and wipe out as many blocks as we can in this iteration */
			if (hfs_start_transaction(hfsmp) != 0) {

			if (datafork->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(datafork))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			datafork->ff_size = filebytes;

			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			/* Finish the transaction and start over if necessary */
			hfs_end_transaction(hfsmp);

	if (error == 0 && (rsrcfork != NULL) && rsrcfork->ff_blocks > 0) {
		fileblocks = rsrcfork->ff_blocks;
		filebytes = (off_t)fileblocks * (off_t)blksize;

		/* We killed invalid ranges and loaned blocks before we removed the catalog entry */

		while (filebytes > 0) {
			if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(rsrcfork)) {
				filebytes -= HFS_BIGFILE_SIZE;

			/* Start a transaction, and wipe out as many blocks as we can in this iteration */
			if (hfs_start_transaction(hfsmp) != 0) {

			if (rsrcfork->ff_unallocblocks == 0) {
				/* Protect extents b-tree and allocation bitmap */
				lockflags = SFL_BITMAP;
				if (overflow_extents(rsrcfork))
					lockflags |= SFL_EXTENTS;
				lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

				error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));

				hfs_systemfile_unlock(hfsmp, lockflags);
			rsrcfork->ff_size = filebytes;

			(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			/* Finish the transaction and start over if necessary */
			hfs_end_transaction(hfsmp);
/*
 * Truncate a cnode to at most length size, freeing (or adding) the
 */
hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize,
	     int truncateflags, vfs_context_t context)
	struct filefork *fp = VTOF(vp);
	u_int32_t fileblocks;
	int blksize, error = 0;
	struct cnode *cp = VTOC(vp);

	/* Cannot truncate an HFS directory! */
	if (vnode_isdir(vp)) {
	/* A swap file cannot change size. */
	if (vnode_isswap(vp) && (length != 0)) {

	blksize = VTOVCB(vp)->blockSize;
	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)blksize;

	//
	// Have to do this here so that we don't wind up with
	// i/o pending for blocks that are about to be released
	// if we truncate the file.
	//
	// If skipsetsize is set, then the caller is responsible
	// for the ubc_setsize.
	//
	// Even if skipsetsize is set, if the length is zero we
	// want to call ubc_setsize() because as of SnowLeopard
	// it will no longer cause any page-ins and it will drop
	// any dirty pages so that we don't do any i/o that we
	// don't have to.  This also prevents a race where i/o
	// for truncated blocks may overwrite later data if the
	// blocks get reallocated to a different file.
	//
	if (!skipsetsize || length == 0)
		ubc_setsize(vp, length);

	// have to loop truncating or growing files that are
	// really big because otherwise transactions can get
	// enormous and consume too many kernel resources.

	if (length < filebytes) {
		while (filebytes > length) {
			if ((filebytes - length) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes -= HFS_BIGFILE_SIZE;
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
	} else if (length > filebytes) {
		while (filebytes < length) {
			if ((length - filebytes) > HFS_BIGFILE_SIZE && overflow_extents(fp)) {
				filebytes += HFS_BIGFILE_SIZE;
			cp->c_flag |= C_FORCEUPDATE;
			error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
	} else /* Same logical size */ {

		error = do_hfs_truncate(vp, length, flags, truncateflags, context);
	/* Files that are changing size are not hot file candidates. */
	if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
		fp->ff_bytesread = 0;
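/*
 * Illustrative note (not from the original source): when the fork has
 * overflow extents, the loops above change the physical size in steps of at
 * most HFS_BIGFILE_SIZE so that no single journal transaction covers an
 * arbitrarily large extent change; each intermediate step is a separate
 * do_hfs_truncate() call in its own transaction.  Forks without overflow
 * extents are adjusted to the target size directly.
 */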
/*
 * Preallocate file storage space.
 */
hfs_vnop_allocate(struct vnop_allocate_args /* {
		off_t *a_bytesallocated;
		vfs_context_t a_context;
	} */
	struct vnode *vp = ap->a_vp;
	struct filefork *fp;
	off_t length = ap->a_length;
	off_t moreBytesRequested;
	off_t actualBytesAdded;
	u_int32_t fileblocks;
	int retval, retval2;
	u_int32_t blockHint;
	u_int32_t extendFlags;   /* For call to ExtendFileC */
	struct hfsmount *hfsmp;
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);

	*(ap->a_bytesallocated) = 0;

	if (!vnode_isreg(vp))
	if (length < (off_t)0)

	orig_ctime = VTOC(vp)->c_ctime;

	check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);

	hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);

	if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {

	fileblocks = fp->ff_blocks;
	filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;

	if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {

	/* Fill in the flags word for the call to Extend the file */

	extendFlags = kEFNoClumpMask;
	if (ap->a_flags & ALLOCATECONTIG)
		extendFlags |= kEFContigMask;
	if (ap->a_flags & ALLOCATEALL)
		extendFlags |= kEFAllMask;
	if (cred && suser(cred, NULL) != 0)
		extendFlags |= kEFReserveMask;
	if (hfs_virtualmetafile(cp))
		extendFlags |= kEFMetadataMask;

	startingPEOF = filebytes;

	if (ap->a_flags & ALLOCATEFROMPEOF)
		length += filebytes;
	else if (ap->a_flags & ALLOCATEFROMVOL)
		blockHint = ap->a_offset / VTOVCB(vp)->blockSize;

	/* If no changes are necessary, then we're done */
	if (filebytes == length)

	/*
	 * Lengthen the size of the file.  We must ensure that the
	 * last byte of the file is allocated.  Since the smallest
	 * value of filebytes is 0, length will be at least 1.
	 */
	if (length > filebytes) {
		off_t total_bytes_added = 0, orig_request_size;

		orig_request_size = moreBytesRequested = length - filebytes;

		retval = hfs_chkdq(cp,
				   (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),

		/*
		 * Metadata zone checks.
		 */
		if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
			/*
			 * Allocate Journal and Quota files in metadata zone.
			 */
			if (hfs_virtualmetafile(cp)) {
				blockHint = hfsmp->hfs_metazone_start;
			} else if ((blockHint >= hfsmp->hfs_metazone_start) &&
				   (blockHint <= hfsmp->hfs_metazone_end)) {
				/*
				 * Move blockHint outside metadata zone.
				 */
				blockHint = hfsmp->hfs_metazone_end + 1;

		while ((length > filebytes) && (retval == E_NONE)) {
			off_t bytesRequested;

			if (hfs_start_transaction(hfsmp) != 0) {

			/* Protect extents b-tree and allocation bitmap */
			lockflags = SFL_BITMAP;
			if (overflow_extents(fp))
				lockflags |= SFL_EXTENTS;
			lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

			if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
				bytesRequested = HFS_BIGFILE_SIZE;
				bytesRequested = moreBytesRequested;

			if (extendFlags & kEFContigMask) {
				// if we're on a sparse device, this will force it to do a
				// full scan to find the space needed.
				hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;

			retval = MacToVFSError(ExtendFileC(vcb,
							   &actualBytesAdded));

			if (retval == E_NONE) {
				*(ap->a_bytesallocated) += actualBytesAdded;
				total_bytes_added += actualBytesAdded;
				moreBytesRequested -= actualBytesAdded;
				if (blockHint != 0) {
					blockHint += actualBytesAdded / vcb->blockSize;
			filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;

			hfs_systemfile_unlock(hfsmp, lockflags);

				(void) hfs_update(vp, TRUE);
				(void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);

			hfs_end_transaction(hfsmp);

		/*
		 * if we get an error and no changes were made then exit
		 * otherwise we must do the hfs_update to reflect the changes
		 */
		if (retval && (startingPEOF == filebytes))

		/*
		 * Adjust actualBytesAdded to be allocation block aligned, not
		 * clump size aligned.
		 * NOTE: So what we are reporting does not affect reality
		 * until the file is closed, when we truncate the file to allocation
		 */
		if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
			*(ap->a_bytesallocated) =
				roundup(orig_request_size, (off_t)vcb->blockSize);

	} else { /* Shorten the size of the file */

		if (fp->ff_size > length) {
			/*
			 * Any buffers that are past the truncation point need to be
			 * invalidated (to maintain buffer cache consistency).
			 */

		retval = hfs_truncate(vp, length, 0, 0, 0, ap->a_context);
		filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;

		/*
		 * if we get an error and no changes were made then exit
		 * otherwise we must do the hfs_update to reflect the changes
		 */
		if (retval && (startingPEOF == filebytes)) goto Err_Exit;

		/* These are bytesreleased */
		(void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED, 0);

		if (fp->ff_size > filebytes) {
			fp->ff_size = filebytes;

			ubc_setsize(vp, fp->ff_size);
			hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);

	cp->c_touch_chgtime = TRUE;
	cp->c_touch_modtime = TRUE;
	retval2 = hfs_update(vp, MNT_WAIT);

	hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
/*
 * Pagein for HFS filesystem
 */
hfs_vnop_pagein(struct vnop_pagein_args *ap)
/*
	struct vnop_pagein_args {
		vm_offset_t a_pl_offset,
		vfs_context_t a_context;
	};
*/
	struct filefork *fp;
	upl_page_info_t *pl;
	boolean_t truncate_lock_held = FALSE;
	boolean_t file_converted = FALSE;

	if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
		/*
		 * If we errored here, then this means that one of two things occurred:
		 * 1. there was a problem with the decryption of the key.
		 * 2. the device is locked and we are not allowed to access this particular file.
		 *
		 * Either way, this means that we need to shut down this upl now.  As long as
		 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves)
		 * then we create a upl and immediately abort it.
		 */
		if (ap->a_pl == NULL) {
			/* create the upl */
			ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl,
					UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
			/* mark the range as needed so it doesn't immediately get discarded upon abort */
			ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);

			/* Abort the range */
			ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
#endif /* CONFIG_PROTECT */

	if (ap->a_pl != NULL) {
		/*
		 * this can only happen for swap files now that
		 * we're asking for V2 paging behavior...
		 * so don't need to worry about decompression, or
		 * keeping track of blocks read or taking the truncate lock
		 */
		error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
				       ap->a_size, (off_t)fp->ff_size, ap->a_flags);

	/*
	 * take truncate lock (shared/recursive) to guard against
	 * zero-fill thru fsync interfering, but only for v2
	 *
	 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
	 * lock shared and we are allowed to recurse 1 level if this thread already
	 * owns the lock exclusively... this can legally occur
	 * if we are doing a shrinking ftruncate against a file
	 * that is mapped private, and the pages being truncated
	 * do not currently exist in the cache... in that case
	 * we will have to page-in the missing pages in order
	 * to provide them to the private mapping... we must
	 * also call hfs_unlock_truncate with a positive been_recursed
	 * arg to indicate that if we have recursed, there is no need to drop
	 * the lock.  Allowing this simple recursion is necessary
	 * in order to avoid a certain deadlock... since the ftruncate
	 * already holds the truncate lock exclusively, if we try
	 * to acquire it shared to protect the pagein path, we will
	 *
	 * NOTE: The if () block below is a workaround in order to prevent a
	 * VM deadlock. See rdar://7853471.
	 *
	 * If we are in a forced unmount, then launchd will still have the
	 * dyld_shared_cache file mapped as it is trying to reboot.  If we
	 * take the truncate lock here to service a page fault, then our
	 * thread could deadlock with the forced-unmount.  The forced unmount
	 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
	 * marked C_DELETED, it will call ubc_setsize(0).  As a result, the unmount
	 * thread will think it needs to copy all of the data out of the file
	 * and into a VM copy object.  If we hold the cnode lock here, then that
	 * VM operation will not be able to proceed, because we'll set a busy page
	 * before attempting to grab the lock.  Note that this isn't as simple as "don't
	 * call ubc_setsize" because doing that would just shift the problem to the
	 * ubc_msync done before the vnode is reclaimed.
	 *
	 * So, if a forced unmount on this volume is in flight AND the cnode is
	 * marked C_DELETED, then just go ahead and do the page in without taking
	 * the lock (thus suspending pagein_v2 semantics temporarily).  Since it's on a file
	 * that is not going to be available on the next mount, this seems like an
	 * OK solution from a correctness point of view, even though it is hacky.
	 */
	if (vfs_isforce(vp->v_mount)) {
		if (cp->c_flag & C_DELETED) {
			/* If we don't get it, then just go ahead and operate without the lock */
			truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);

		hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
		truncate_lock_held = TRUE;

	kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);

	if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {

	ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);

	/*
	 * Scan from the back to find the last page in the UPL, so that we
	 * aren't looking at a UPL that may have already been freed by the
	 * preceding aborts/completions.
	 */
	for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
		if (upl_page_present(pl, --pg_index))
	if (pg_index == 0) {
		/*
		 * no absent pages were found in the range specified
		 * just abort the UPL to get rid of it and then we're done
		 */
		ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);

	/*
	 * initialize the offset variables before we touch the UPL.
	 * f_offset is the position into the file, in bytes
	 * offset is the position into the UPL, in bytes
	 * pg_index is the pg# of the UPL we're operating on
	 * isize is the offset into the UPL of the last page that is present.
	 */
	isize = ((pg_index + 1) * PAGE_SIZE);

	f_offset = ap->a_f_offset;

		if ( !upl_page_present(pl, pg_index)) {
			/*
			 * we asked for RET_ONLY_ABSENT, so it's possible
			 * to get back empty slots in the UPL.
			 * just skip over them
			 */
			f_offset += PAGE_SIZE;
			offset   += PAGE_SIZE;

		/*
		 * We know that we have at least one absent page.
		 * Now checking to see how many in a row we have
		 */
		xsize = isize - PAGE_SIZE;

			if ( !upl_page_present(pl, pg_index + num_of_pages))

		xsize = num_of_pages * PAGE_SIZE;

		if (VNODE_IS_RSRC(vp)) {
			/* allow pageins of the resource fork */
			int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */

				if (truncate_lock_held) {
					/*
					 * can't hold the truncate lock when calling into the decmpfs layer
					 * since it calls back into this layer... even though we're only
					 * holding the lock in shared mode, and the re-entrant path only
					 * takes the lock shared, we can deadlock if some other thread
					 * tries to grab the lock exclusively in between.
					 */
					hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
					truncate_lock_held = FALSE;

				ap->a_pl_offset = offset;
				ap->a_f_offset = f_offset;

				error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
				/*
				 * note that decmpfs_pagein_compressed can change the state of
				 * 'compressed'... it will set it to 0 if the file is no longer
				 * compressed once the compression lock is successfully taken
				 * i.e. we would block on that lock while the file is being inflated
				 */
					/* successful page-in, update the access time */
					VTOC(vp)->c_touch_acctime = TRUE;

					/* compressed files are not hot file candidates */
					if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
						fp->ff_bytesread = 0;
				} else if (error == EAGAIN) {
					/*
					 * EAGAIN indicates someone else already holds the compression lock...
					 * to avoid deadlocking, we'll abort this range of pages with an
					 * indication that the pagein needs to be redriven
					 */
					ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);

					goto pagein_next_range;

				/*
				 * Set file_converted only if the file became decompressed while we were
				 * paging in.  If it were still compressed, we would re-start the loop using the goto
				 * in the above block.  This avoids overloading truncate_lock_held as our retry_pagein
				 * condition below, since we could have avoided taking the truncate lock to prevent
				 * a deadlock in the force unmount case.
				 */
				file_converted = TRUE;

			if (file_converted == TRUE) {
				/*
				 * the file was converted back to a regular file after we first saw it as compressed
				 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
				 * reset a_size so that we consider what remains of the original request
				 * and null out a_upl and a_pl_offset.
				 *
				 * We should only be able to get into this block if the decmpfs_pagein_compressed
				 * successfully decompressed the range in question for this file.
				 */
				ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);

				ap->a_pl_offset = 0;

				/* Reset file_converted back to false so that we don't infinite-loop. */
				file_converted = FALSE;

		error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);

		/*
		 * Keep track of blocks read.
		 */
		if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
			int took_cnode_lock = 0;

			if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
				bytesread = fp->ff_size;

			/* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
			if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
				hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
				took_cnode_lock = 1;
			/*
			 * If this file hasn't been seen since the start of
			 * the current sampling period then start over.
			 */
			if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
				fp->ff_bytesread = bytesread;
				cp->c_atime = tv.tv_sec;
				fp->ff_bytesread += bytesread;
			cp->c_touch_acctime = TRUE;
			if (took_cnode_lock)

		pg_index += num_of_pages;

	if (truncate_lock_held == TRUE) {
		/* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
		hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
/*
 * Pageout for HFS filesystem.
 */
int
hfs_vnop_pageout(struct vnop_pageout_args *ap)
/*
    struct vnop_pageout_args {
       vnode_t       a_vp,
       upl_t         a_pl,
       vm_offset_t   a_pl_offset,
       off_t         a_f_offset,
       size_t        a_size,
       int           a_flags
       vfs_context_t a_context;
    };
*/
{
    vnode_t vp = ap->a_vp;
    struct cnode *cp;
    struct filefork *fp;
    int retval = 0;
    off_t filesize;
    upl_t upl;
    upl_page_info_t* pl;
    vm_offset_t a_pl_offset;
    int a_flags;
    int is_pageoutv2 = 0;
    kern_return_t kret;
    cp = VTOC(vp);
    fp = VTOF(vp);

    /*
     * Figure out where the file ends, for pageout purposes.  If
     * ff_new_size > ff_size, then we're in the middle of extending the
     * file via a write, so it is safe (and necessary) that we be able
     * to pageout up to that point.
     */
    filesize = fp->ff_size;
    if (fp->ff_new_size > filesize)
        filesize = fp->ff_new_size;

    a_flags = ap->a_flags;
    a_pl_offset = ap->a_pl_offset;

    if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
        hfs_incr_gencount (cp);
    }
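    /*
     * (A pageout here means dirty data is reaching the file through mmap or
     * the unified buffer cache rather than through hfs_vnop_write, which is
     * presumably why the generation count is bumped on this path as well.)
     */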
    /*
     * we can tell if we're getting the new or old behavior from the UPL
     */
    if ((upl = ap->a_pl) == NULL) {
        int request_flags;

        is_pageoutv2 = 1;
        /*
         * we're in control of any UPL we commit
         * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
         */
        a_flags &= ~UPL_NOCOMMIT;

        /*
         * For V2 semantics, we want to take the cnode truncate lock
         * shared to guard against the file size changing via zero-filling.
         *
         * However, we have to be careful because we may be invoked
         * via the ubc_msync path to write out dirty mmap'd pages
         * in response to a lock event on a content-protected
         * filesystem (e.g. to write out class A files).
         * As a result, we want to take the truncate lock 'SHARED' with
         * the mini-recursion locktype so that we don't deadlock/panic
         * because we may already be holding the truncate lock exclusive to force any other
         * IOs to have blocked behind us.
         */
        hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);

        if (a_flags & UPL_MSYNC) {
            request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
        } else {
            request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
        }

        kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);

        if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
            retval = EINVAL;
            goto pageout_done;
        }
    }
    /*
     * from this point forward upl points at the UPL we're working with
     * it was either passed in or we successfully created it
     */

    /*
     * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
     * UPL instead of relying on the UPL passed into us.  We go ahead and do that here,
     * scanning for dirty ranges.  We'll issue our own N cluster_pageout calls, for
     * N dirty ranges in the UPL.  Note that this is almost a direct copy of the
     * logic in vnode_pageout except that we need to do it after grabbing the truncate
     * lock in HFS so that we don't lock invert ourselves.
     *
     * Note that we can still get into this function on behalf of the default pager with
     * non-V2 behavior (swapfiles).  However in that case, we did not grab locks above
     * since fsync and other writing threads will grab the locks, then mark the
     * relevant pages as busy.  But the pageout codepath marks the pages as busy,
     * and THEN would attempt to grab the truncate lock, which would result in deadlock.  So
     * we do not try to grab anything for the pre-V2 case, which should only be accessed
     * by the paging/VM system.
     */
    if (is_pageoutv2) {
        off_t f_offset;
        int   offset;
        int   isize;
        int   pg_index;
        int   error;
        int   error_ret = 0;

        isize = ap->a_size;
        f_offset = ap->a_f_offset;

        /*
         * Scan from the back to find the last page in the UPL, so that we
         * aren't looking at a UPL that may have already been freed by the
         * preceding aborts/completions.
         */
        for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
            if (upl_page_present(pl, --pg_index))
                break;
            if (pg_index == 0) {
                ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
                goto pageout_done;
            }
        }

        /*
         * initialize the offset variables before we touch the UPL.
         * a_f_offset is the position into the file, in bytes
         * offset is the position into the UPL, in bytes
         * pg_index is the pg# of the UPL we're operating on.
         * isize is the offset into the UPL of the last non-clean page.
         */
        isize = ((pg_index + 1) * PAGE_SIZE);
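        /*
         * (For example, if the last present page found above was pg_index 3,
         * isize becomes 4 * PAGE_SIZE, so the walk below only examines
         * pages 0 through 3.)
         */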
        offset = 0;
        pg_index = 0;

        while (isize) {
            int  xsize;
            int  num_of_pages;

            if ( !upl_page_present(pl, pg_index)) {
                /*
                 * we asked for RET_ONLY_DIRTY, so it's possible
                 * to get back empty slots in the UPL.
                 * just skip over them
                 */
                f_offset += PAGE_SIZE;
                offset   += PAGE_SIZE;
                isize    -= PAGE_SIZE;
                pg_index++;

                continue;
            }
            if ( !upl_dirty_page(pl, pg_index)) {
                panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
            }

            /*
             * We know that we have at least one dirty page.
             * Now checking to see how many in a row we have
             */
            num_of_pages = 1;
            xsize = isize - PAGE_SIZE;

            while (xsize) {
                if ( !upl_dirty_page(pl, pg_index + num_of_pages))
                    break;
                num_of_pages++;
                xsize -= PAGE_SIZE;
            }
            xsize = num_of_pages * PAGE_SIZE;
            if (!vnode_isswap(vp)) {
                off_t end_of_range;
                int   tooklock = 0;

                if (cp->c_lockowner != current_thread()) {
                    if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
                        /*
                         * we're in the v2 path, so we are the
                         * owner of the UPL... we may have already
                         * processed some of the UPL, so abort it
                         * from the current working offset to the
                         * end of the UPL
                         */
                        ubc_upl_abort_range(upl,
                                            offset,
                                            ap->a_size - offset,
                                            UPL_ABORT_FREE_ON_EMPTY);
                        goto pageout_done;
                    }
                    tooklock = 1;
                }
                end_of_range = f_offset + xsize - 1;

                if (end_of_range >= filesize) {
                    end_of_range = (off_t)(filesize - 1);
                }
                if (f_offset < filesize) {
                    rl_remove(f_offset, end_of_range, &fp->ff_invalidranges);
                    cp->c_flag |= C_MODIFIED;  /* leof is dirty */
                }
                if (tooklock) {
                    hfs_unlock(cp);
                }
            }
            if ((error = cluster_pageout(vp, upl, offset, f_offset,
                                         xsize, filesize, a_flags))) {
                if (error_ret == 0)
                    error_ret = error;
            }
            f_offset += xsize;
            offset   += xsize;
            isize    -= xsize;
            pg_index += num_of_pages;
        }
        /* capture errnos bubbled out of cluster_pageout if they occurred */
        if (error_ret != 0) {
            retval = error_ret;
        }
    } /* end block for v2 pageout behavior */
    else {
        if (!vnode_isswap(vp)) {
            off_t end_of_range;
            int   tooklock = 0;

            if (cp->c_lockowner != current_thread()) {
                if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
                    if (!(a_flags & UPL_NOCOMMIT)) {
                        ubc_upl_abort_range(upl,
                                            a_pl_offset,
                                            ap->a_size,
                                            UPL_ABORT_FREE_ON_EMPTY);
                    }
                    goto pageout_done;
                }
                tooklock = 1;
            }
            end_of_range = ap->a_f_offset + ap->a_size - 1;

            if (end_of_range >= filesize) {
                end_of_range = (off_t)(filesize - 1);
            }
            if (ap->a_f_offset < filesize) {
                rl_remove(ap->a_f_offset, end_of_range, &fp->ff_invalidranges);
                cp->c_flag |= C_MODIFIED;  /* leof is dirty */
            }
            if (tooklock) {
                hfs_unlock(cp);
            }
        }
        /*
         * just call cluster_pageout for old pre-v2 behavior
         */
        retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
                                 ap->a_size, filesize, a_flags);
    }
    /*
     * If data was written, update the modification time of the file.
     * If setuid or setgid bits are set and this process is not the
     * superuser then clear the setuid and setgid bits as a precaution
     * against tampering.
     */
    if (retval == 0) {
        cp->c_touch_modtime = TRUE;
        cp->c_touch_chgtime = TRUE;
        if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
            (vfs_context_suser(ap->a_context) != 0)) {
            hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
            cp->c_mode &= ~(S_ISUID | S_ISGID);
            hfs_unlock(cp);
        }
    }

pageout_done:
    if (is_pageoutv2) {
        /*
         * Release the truncate lock.  Note that because
         * we may have taken the lock recursively by
         * being invoked via ubc_msync due to lockdown,
         * we should release it recursively, too.
         */
        hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
    }
    return (retval);
}
/*
 * Intercept B-Tree node writes to unswap them if necessary.
 */
int
hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
{
    int retval = 0;
    register struct buf *bp = ap->a_bp;
    register struct vnode *vp = buf_vnode(bp);
    BlockDescriptor block;

    /* Trap B-Tree writes */
    if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
        (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
        (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
        (vp == VTOHFS(vp)->hfc_filevp)) {
        /*
         * Swap and validate the node if it is in native byte order.
         * This is always true on big endian, so we always validate
         * before writing here.  On little endian, the node typically has
         * been swapped and validated when it was written to the journal,
         * so we won't do anything here.
         */
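        /*
         * (The check below looks at the last two bytes of the node, which hold
         * the offset of the node's first record.  That offset is always
         * sizeof(BTNodeDescriptor) == 14 == 0x000e, so reading 0x000e with the
         * host's byte order means the node is still in native order and must
         * be swapped to big endian before it goes to disk.)
         */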
        if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
            /* Prepare the block pointer */
            block.blockHeader = bp;
            block.buffer = (char *)buf_dataptr(bp);
            block.blockNum = buf_lblkno(bp);
            /* not found in cache ==> came from disk */
            block.blockReadFromDisk = (buf_fromcache(bp) == 0);
            block.blockSize = buf_count(bp);

            /* Endian un-swap B-Tree node */
            retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
            if (retval)
                panic("hfs_vnop_bwrite: about to write corrupt node!\n");
        }
    }

    /* This buffer shouldn't be locked anymore but if it is clear it */
    if ((buf_flags(bp) & B_LOCKED)) {
        if (VTOHFS(vp)->jnl) {
            panic("hfs: CLEARING the lock bit on bp %p\n", bp);
        }
        buf_clearflags(bp, B_LOCKED);
    }
    retval = vn_bwrite (ap);

    return (retval);
}
/*
 * Relocate a file to a new location on disk
 *  cnode must be locked on entry
 *
 * Relocation occurs by cloning the file's data from its
 * current set of blocks to a new set of blocks. During
 * the relocation all of the blocks (old and new) are
 * owned by the file.
 *
 * -----------------
 * |///////////////|
 * -----------------
 * 0               N (file offset)
 *
 * -----------------     -----------------
 * |///////////////|     |               |     STEP 1 (acquire new blocks)
 * -----------------     -----------------
 * 0               N     N+1              2N
 *
 * -----------------     -----------------
 * |///////////////|     |///////////////|     STEP 2 (clone data)
 * -----------------     -----------------
 * 0               N     N+1              2N
 *
 *                       -----------------
 *                       |///////////////|     STEP 3 (head truncate blocks)
 *                       -----------------
 *                       0                N
 *
 * During steps 2 and 3 page-outs to file offsets less
 * than or equal to N are suspended.
 *
 * During step 3 page-ins to the file get suspended.
 */
int
hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
             struct proc *p)
{
    struct cnode *cp;
    struct filefork *fp;
    struct hfsmount *hfsmp;
    u_int32_t headblks;
    u_int32_t datablks;
    u_int32_t blksize;
    u_int32_t growsize;
    u_int32_t nextallocsave;
    daddr64_t sector_a, sector_b;
    int eflags;
    off_t newbytes;
    int retval;
    int lockflags = 0;
    int took_trunc_lock = 0;
    enum vtype vnodetype;

    vnodetype = vnode_vtype(vp);
    if (vnodetype != VREG) {
        /* Not allowed to move symlinks. */
        return (EPERM);
    }

    hfsmp = VTOHFS(vp);
    if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
        return (ENOSPC);
    }

    cp = VTOC(vp);
    fp = VTOF(vp);
    if (fp->ff_unallocblocks)
        return (EINVAL);
4893 * Disable HFS file relocation on content-protected filesystems
4895 if (cp_fs_protected (hfsmp
->hfs_mp
)) {
4899 /* If it's an SSD, also disable HFS relocation */
4900 if (hfsmp
->hfs_flags
& HFS_SSD
) {
4905 blksize
= hfsmp
->blockSize
;
4907 blockHint
= hfsmp
->nextAllocation
;
4909 if (fp
->ff_size
> 0x7fffffff) {
4914 // We do not believe that this call to hfs_fsync() is
4915 // necessary and it causes a journal transaction
4916 // deadlock so we are removing it.
4918 //if (vnodetype == VREG && !vnode_issystem(vp)) {
4919 // retval = hfs_fsync(vp, MNT_WAIT, 0, p);
4924 if (!vnode_issystem(vp
) && (vnodetype
!= VLNK
)) {
4926 hfs_lock_truncate(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_DEFAULT
);
4927 /* Force lock since callers expects lock to be held. */
4928 if ((retval
= hfs_lock(cp
, HFS_EXCLUSIVE_LOCK
, HFS_LOCK_ALLOW_NOEXISTS
))) {
4929 hfs_unlock_truncate(cp
, HFS_LOCK_DEFAULT
);
4932 /* No need to continue if file was removed. */
4933 if (cp
->c_flag
& C_NOEXISTS
) {
4934 hfs_unlock_truncate(cp
, HFS_LOCK_DEFAULT
);
4937 took_trunc_lock
= 1;
4939 headblks
= fp
->ff_blocks
;
4940 datablks
= howmany(fp
->ff_size
, blksize
);
4941 growsize
= datablks
* blksize
;
4942 eflags
= kEFContigMask
| kEFAllMask
| kEFNoClumpMask
;
4943 if (blockHint
>= hfsmp
->hfs_metazone_start
&&
4944 blockHint
<= hfsmp
->hfs_metazone_end
)
4945 eflags
|= kEFMetadataMask
;
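    /*
     * (If the file currently lives in the metadata zone, the kEFMetadataMask
     * hint asks the allocator to keep the new blocks there as well; the saved
     * nextAllocation pointer is restored after the ExtendFileC call below so
     * that a metadata-zone allocation doesn't drag the volume's next-allocation
     * pointer along with it.)
     */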
    if (hfs_start_transaction(hfsmp) != 0) {
        if (took_trunc_lock)
            hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
        return (EINVAL);
    }

    /*
     * Protect the extents b-tree and the allocation bitmap
     * during MapFileBlockC and ExtendFileC operations.
     */
    lockflags = SFL_BITMAP;
    if (overflow_extents(fp))
        lockflags |= SFL_EXTENTS;
    lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

    retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
    if (retval) {
        retval = MacToVFSError(retval);
        goto out;
    }

    /*
     * STEP 1 - acquire new allocation blocks.
     */
    nextallocsave = hfsmp->nextAllocation;
    retval = ExtendFileC(hfsmp, (FCB *)fp, growsize, blockHint, eflags, &newbytes);
    if (eflags & kEFMetadataMask) {
        hfs_lock_mount(hfsmp);
        HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
        MarkVCBDirty(hfsmp);
        hfs_unlock_mount(hfsmp);
    }

    retval = MacToVFSError(retval);
    if (retval == 0) {
        cp->c_flag |= C_MODIFIED;
        if (newbytes < growsize) {
            retval = ENOSPC;
            goto restore;
        } else if (fp->ff_blocks < (headblks + datablks)) {
            printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
            retval = ENOSPC;
            goto restore;
        }

        retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
        if (retval) {
            retval = MacToVFSError(retval);
        } else if ((sector_a + 1) == sector_b) {
            retval = ENOSPC;
            goto restore;
        } else if ((eflags & kEFMetadataMask) &&
                   ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
                      hfsmp->hfs_metazone_end)) {
            const char * filestr;
            char emptystr = '\0';

            if (cp->c_desc.cd_nameptr != NULL) {
                filestr = (const char *)&cp->c_desc.cd_nameptr[0];
            } else if (vnode_name(vp) != NULL) {
                filestr = vnode_name(vp);
            } else {
                filestr = &emptystr;
            }
            retval = ENOSPC;
            goto restore;
        }
    }
    /* Done with system locks and journal for now. */
    hfs_systemfile_unlock(hfsmp, lockflags);
    lockflags = 0;
    hfs_end_transaction(hfsmp);

    if (retval) {
        /*
         * Check to see if failure is due to excessive fragmentation.
         */
        if ((retval == ENOSPC) &&
            (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
            hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
        }
        goto out;
    }
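    /*
     * (The heuristic above: getting ENOSPC from a contiguous request even
     * though the volume still has more than twice the needed free blocks
     * means the free space is too fragmented to relocate anything, so the
     * flag checked at the top of this function short-circuits further
     * relocation attempts.)
     */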
    /*
     * STEP 2 - clone file data into the new allocation blocks.
     */
    if (vnodetype == VLNK)
        retval = EPERM;
    else if (vnode_issystem(vp))
        retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
    else
        retval = hfs_clonefile(vp, headblks, datablks, blksize);

    /* Start transaction for step 3 or for a restore. */
    if (hfs_start_transaction(hfsmp) != 0) {
        retval = EINVAL;
        goto out;
    }
    if (retval)
        goto restore;

    /*
     * STEP 3 - switch to cloned data and remove old blocks.
     */
    lockflags = SFL_BITMAP;
    if (overflow_extents(fp))
        lockflags |= SFL_EXTENTS;
    lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);

    retval = HeadTruncateFile(hfsmp, (FCB *)fp, headblks);

    hfs_systemfile_unlock(hfsmp, lockflags);
    lockflags = 0;
    if (retval)
        goto restore;
out:
    if (took_trunc_lock)
        hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);

    if (lockflags) {
        hfs_systemfile_unlock(hfsmp, lockflags);
        lockflags = 0;
    }

    /* Push cnode's new extent data to disk. */
    if (retval == 0) {
        (void) hfs_update(vp, MNT_WAIT);
    }
    if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
        (void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
    else
        (void) hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
exit:
    hfs_end_transaction(hfsmp);

    return (retval);

restore:
    if (fp->ff_blocks == headblks) {
        if (took_trunc_lock)
            hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
        goto exit;
    }
    /*
     * Give back any newly allocated space.
     */
    if (lockflags == 0) {
        lockflags = SFL_BITMAP;
        if (overflow_extents(fp))
            lockflags |= SFL_EXTENTS;
        lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
    }

    (void) TruncateFileC(hfsmp, (FCB *)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
                         FTOC(fp)->c_fileid, false);

    hfs_systemfile_unlock(hfsmp, lockflags);
    lockflags = 0;

    if (took_trunc_lock)
        hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
    goto exit;
}
/*
 * Clone a file's data within the file.
 */
static int
hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
{
    caddr_t bufp;
    size_t  bufsize;
    size_t  copysize;
    size_t  iosize;
    size_t  offset = 0;
    off_t   writebase;
    uio_t   auio;
    int     error = 0;

    writebase = blkstart * blksize;
    copysize = blkcnt * blksize;
    iosize = bufsize = MIN(copysize, 128 * 1024);
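    /*
     * The clone is done through the regular cluster read/write path: data is
     * copied in chunks of at most 128 KB through a temporary kernel buffer,
     * reading the original blocks with IO_NOCACHE and writing them past the
     * original allocation (at 'writebase') with IO_NOCACHE | IO_SYNC.
     */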
    hfs_unlock(VTOC(vp));

#if CONFIG_PROTECT
    if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
        hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
        return (error);
    }
#endif /* CONFIG_PROTECT */

    if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
        hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
        return (ENOMEM);
    }

    auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);

    while (offset < copysize) {
        iosize = MIN(copysize - offset, iosize);

        uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
        uio_addiov(auio, (uintptr_t)bufp, iosize);

        error = cluster_read(vp, auio, copysize, IO_NOCACHE);
        if (error) {
            printf("hfs_clonefile: cluster_read failed - %d\n", error);
            break;
        }
        if (uio_resid(auio) != 0) {
            printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
            error = EIO;
            break;
        }

        uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
        uio_addiov(auio, (uintptr_t)bufp, iosize);

        error = cluster_write(vp, auio, writebase + offset,
                              writebase + offset + iosize,
                              uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
        if (error) {
            printf("hfs_clonefile: cluster_write failed - %d\n", error);
            break;
        }
        if (uio_resid(auio) != 0) {
            printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
            error = EIO;
            break;
        }
        offset += iosize;
    }
    uio_free(auio);

    if ((blksize & PAGE_MASK)) {
        /*
         * since the copy may not have started on a PAGE
         * boundary (or may not have ended on one), we
         * may have pages left in the cache since NOCACHE
         * will let partially written pages linger...
         * let's just flush the entire range to make sure
         * we don't have any pages left that are beyond
         * (or intersect) the real LEOF of this file
         */
        ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
    } else {
        /*
         * No need to call ubc_sync_range or hfs_invalbuf
         * since the file was copied using IO_NOCACHE and
         * the copy was done starting and ending on a page
         * boundary in the file.
         */
    }
    kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

    hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
    return (error);
}
/*
 * Clone a system (metadata) file.
 */
static int
hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
                 kauth_cred_t cred, struct proc *p)
{
    caddr_t bufp;
    char * offset;
    size_t bufsize;
    size_t iosize;
    struct buf *bp = NULL;
    daddr64_t blkno;
    daddr64_t blk;
    daddr64_t start_blk;
    daddr64_t last_blk;
    int breadcnt;
    int i;
    int error = 0;

    iosize = GetLogicalBlockSize(vp);
    bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
    breadcnt = bufsize / iosize;
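    /*
     * The copy buffer is at most 1 MB, rounded down to a multiple of the
     * logical block size; breadcnt is how many metadata blocks fit into
     * one read/write pass of the loop below.
     */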
    if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) {
        return (ENOMEM);
    }
    start_blk = ((daddr64_t)blkstart * blksize) / iosize;
    last_blk  = ((daddr64_t)blkcnt * blksize) / iosize;
    blkno = 0;

    while (blkno < last_blk) {
        /*
         * Read up to a megabyte
         */
        offset = bufp;
        for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
            error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
            if (error) {
                printf("hfs_clonesysfile: meta_bread error %d\n", error);
                goto out;
            }
            if (buf_count(bp) != iosize) {
                printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
                goto out;
            }
            bcopy((char *)buf_dataptr(bp), offset, iosize);

            buf_markinvalid(bp);
            buf_brelse(bp);
            bp = NULL;

            offset += iosize;
        }

        /*
         * Write up to a megabyte
         */
        offset = bufp;
        for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
            bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
            if (bp == NULL) {
                printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
                error = EIO;
                goto out;
            }
            bcopy(offset, (char *)buf_dataptr(bp), iosize);
            error = (int)buf_bwrite(bp);
            bp = NULL;
            if (error)
                goto out;
            offset += iosize;
        }
    }
out:
    if (bp) {
        buf_brelse(bp);
    }

    kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);

    error = hfs_fsync(vp, MNT_WAIT, 0, p);

    return (error);
}