1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* @(#)hfs_readwrite.c 1.0
29 *
30 * (c) 1998-2001 Apple Computer, Inc. All Rights Reserved
31 *
32 * hfs_readwrite.c -- vnode operations to deal with reading and writing files.
33 *
34 */
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/resourcevar.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/filedesc.h>
42 #include <sys/stat.h>
43 #include <sys/buf.h>
44 #include <sys/buf_internal.h>
45 #include <sys/proc.h>
46 #include <sys/kauth.h>
47 #include <sys/vnode.h>
48 #include <sys/vnode_internal.h>
49 #include <sys/uio.h>
50 #include <sys/vfs_context.h>
51 #include <sys/fsevents.h>
52 #include <kern/kalloc.h>
53 #include <sys/disk.h>
54 #include <sys/sysctl.h>
55 #include <sys/fsctl.h>
56 #include <sys/mount_internal.h>
57 #include <sys/file_internal.h>
58
59 #include <libkern/OSDebug.h>
60
61 #include <miscfs/specfs/specdev.h>
62
63 #include <sys/ubc.h>
64 #include <sys/ubc_internal.h>
65
66 #include <vm/vm_pageout.h>
67 #include <vm/vm_kern.h>
68
69 #include <IOKit/IOBSD.h>
70
71 #include <sys/kdebug.h>
72
73 #include "hfs.h"
74 #include "hfs_attrlist.h"
75 #include "hfs_endian.h"
76 #include "hfs_fsctl.h"
77 #include "hfs_quota.h"
78 #include "hfscommon/headers/FileMgrInternal.h"
79 #include "hfscommon/headers/BTreesInternal.h"
80 #include "hfs_cnode.h"
81 #include "hfs_dbg.h"
82
83
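/*
 * A size qualifies for cluster I/O only if it is a multiple of 4KB and no
 * larger than MAXPHYSIO/2.
 */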
84 #define can_cluster(size) ((((size & (4096-1))) == 0) && (size <= (MAXPHYSIO/2)))
85
86 enum {
87 MAXHFSFILESIZE = 0x7FFFFFFF /* this needs to go in the mount structure */
88 };
89
90 /* from bsd/hfs/hfs_vfsops.c */
91 extern int hfs_vfs_vget (struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
92
93 /* from hfs_hotfiles.c */
94 extern int hfs_pin_overflow_extents (struct hfsmount *hfsmp, uint32_t fileid,
95 uint8_t forktype, uint32_t *pinned);
96
97 static int hfs_clonefile(struct vnode *, int, int, int);
98 static int hfs_clonesysfile(struct vnode *, int, int, int, kauth_cred_t, struct proc *);
99 static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, vfs_context_t context);
100
101 /* from bsd/hfs/hfs_vnops.c */
102 extern decmpfs_cnode* hfs_lazy_init_decmpfs_cnode (struct cnode *cp);
103
104
105
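/*
 * The tunable below is published as kern.flush_cache_on_write (CTLFLAG_RW,
 * so it can be changed at runtime), e.g. from the shell:
 *
 *     sysctl -w kern.flush_cache_on_write=1
 */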
106 int flush_cache_on_write = 0;
107 SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files");
108
109 /*
110 * Read data from a file.
111 */
112 int
113 hfs_vnop_read(struct vnop_read_args *ap)
114 {
115 /*
116 struct vnop_read_args {
117 struct vnodeop_desc *a_desc;
118 vnode_t a_vp;
119 struct uio *a_uio;
120 int a_ioflag;
121 vfs_context_t a_context;
122 };
123 */
124
125 uio_t uio = ap->a_uio;
126 struct vnode *vp = ap->a_vp;
127 struct cnode *cp;
128 struct filefork *fp;
129 struct hfsmount *hfsmp;
130 off_t filesize;
131 off_t filebytes;
132 off_t start_resid = uio_resid(uio);
133 off_t offset = uio_offset(uio);
134 int retval = 0;
135 int took_truncate_lock = 0;
136 int io_throttle = 0;
137 int throttled_count = 0;
138
139 /* Preflight checks */
140 if (!vnode_isreg(vp)) {
141 /* can only read regular files */
142 if (vnode_isdir(vp))
143 return (EISDIR);
144 else
145 return (EPERM);
146 }
147 if (start_resid == 0)
148 return (0); /* Nothing left to do */
149 if (offset < 0)
150 return (EINVAL); /* can't read from a negative offset */
151
152 #if SECURE_KERNEL
153 if ((ap->a_ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
154 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
155 /* Don't allow unencrypted io request from user space */
156 return EPERM;
157 }
158 #endif
159
160 #if HFS_COMPRESSION
161 if (VNODE_IS_RSRC(vp)) {
162 if (hfs_hides_rsrc(ap->a_context, VTOC(vp), 1)) { /* 1 == don't take the cnode lock */
163 return 0;
164 }
165 /* otherwise read the resource fork normally */
166 } else {
167 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
168 if (compressed) {
169 retval = decmpfs_read_compressed(ap, &compressed, VTOCMP(vp));
170 if (retval == 0 && !(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
171 (void) hfs_addhotfile(vp);
172 }
173 if (compressed) {
174 if (retval == 0) {
175 /* successful read, update the access time */
176 VTOC(vp)->c_touch_acctime = TRUE;
177
178 //
179 // compressed files are not traditional hot file candidates
180 // but they may be for CF (which ignores the ff_bytesread
181 // field)
182 //
183 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
184 VTOF(vp)->ff_bytesread = 0;
185 }
186 }
187 return retval;
188 }
189 /* otherwise the file was converted back to a regular file while we were reading it */
190 retval = 0;
191 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
192 int error;
193
194 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP);
195 if (error) {
196 return error;
197 }
198
199 }
200 }
201 #endif /* HFS_COMPRESSION */
202
203 cp = VTOC(vp);
204 fp = VTOF(vp);
205 hfsmp = VTOHFS(vp);
206
207 #if CONFIG_PROTECT
208 if ((retval = cp_handle_vnop (vp, CP_READ_ACCESS, ap->a_ioflag)) != 0) {
209 goto exit;
210 }
211
212 #endif // CONFIG_PROTECT
213
214 /*
215 * If this read request originated from a syscall (as opposed to
216 * an in-kernel page fault or something), then set it up for
217 * throttle checks
218 */
219 if (ap->a_ioflag & IO_SYSCALL_DISPATCH) {
220 io_throttle = IO_RETURN_ON_THROTTLE;
221 }
222
223 read_again:
224
225 /* Protect against a size change. */
226 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
227 took_truncate_lock = 1;
228
229 filesize = fp->ff_size;
230 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
231
232 /*
233 * Check the file size. Note that per POSIX spec, we return 0 at
234 * file EOF, so attempting a read at an offset that is too big
235 * should just return 0 on HFS+. Since the return value was initialized
236 * to 0 above, we just jump to exit. HFS Standard has its own behavior.
237 */
238 if (offset > filesize) {
239 if ((hfsmp->hfs_flags & HFS_STANDARD) &&
240 (offset > (off_t)MAXHFSFILESIZE)) {
241 retval = EFBIG;
242 }
243 goto exit;
244 }
245
246 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_START,
247 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
248
249 retval = cluster_read(vp, uio, filesize, ap->a_ioflag |io_throttle);
250
251 cp->c_touch_acctime = TRUE;
252
253 KERNEL_DEBUG(HFSDBG_READ | DBG_FUNC_END,
254 (int)uio_offset(uio), uio_resid(uio), (int)filesize, (int)filebytes, 0);
255
256 /*
257 * Keep track of blocks read
258 */
259 if (hfsmp->hfc_stage == HFC_RECORDING && retval == 0) {
260 int took_cnode_lock = 0;
261 off_t bytesread;
262
263 bytesread = start_resid - uio_resid(uio);
264
265 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
266 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff) {
267 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
268 took_cnode_lock = 1;
269 }
270 /*
271 * If this file hasn't been seen since the start of
272 * the current sampling period then start over.
273 */
274 if (cp->c_atime < hfsmp->hfc_timebase) {
275 struct timeval tv;
276
277 fp->ff_bytesread = bytesread;
278 microtime(&tv);
279 cp->c_atime = tv.tv_sec;
280 } else {
281 fp->ff_bytesread += bytesread;
282 }
283
284 if (!(ap->a_ioflag & IO_EVTONLY) && vnode_isfastdevicecandidate(vp)) {
285 //
286 // We don't add hotfiles for processes doing IO_EVTONLY I/O
287 // on the assumption that they're system processes such as
288 // mdworker which scan everything in the system (and thus
289 // do not represent user-initiated access to files)
290 //
291 (void) hfs_addhotfile(vp);
292 }
293 if (took_cnode_lock)
294 hfs_unlock(cp);
295 }
296 exit:
297 if (took_truncate_lock) {
298 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
299 }
300 if (retval == EAGAIN) {
301 throttle_lowpri_io(1);
302 throttled_count++;
303
304 retval = 0;
305 goto read_again;
306 }
307 if (throttled_count) {
308 throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
309 }
310 return (retval);
311 }
312
313 /*
314 * Ideally, this wouldn't be necessary; the cluster code should be
315 * able to handle this on the read-side. See <rdar://20420068>.
316 */
317 static errno_t hfs_zero_eof_page(vnode_t vp, off_t zero_up_to)
318 {
319 assert(VTOC(vp)->c_lockowner != current_thread());
320 assert(VTOC(vp)->c_truncatelockowner == current_thread());
321
322 struct filefork *fp = VTOF(vp);
323
324 if (!(fp->ff_size & PAGE_MASK_64) || zero_up_to <= fp->ff_size) {
325 // Nothing to do
326 return 0;
327 }
328
329 zero_up_to = MIN(zero_up_to, (off_t)round_page_64(fp->ff_size));
330
331 /* N.B. At present, @zero_up_to is not important because the cluster
332 code will always zero up to the end of the page anyway. */
333 return cluster_write(vp, NULL, fp->ff_size, zero_up_to,
334 fp->ff_size, 0, IO_HEADZEROFILL);
335 }
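/*
 * Example (with 4KB pages): if ff_size is 0x1800 (EOF in the middle of a
 * page) and zero_up_to is 0x3000, the routine above clamps zero_up_to to
 * round_page_64(0x1800) == 0x2000 and asks the cluster layer to zero-fill
 * from 0x1800 up to 0x2000, i.e. the remainder of the EOF page.
 */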
336
337 /*
338 * Write data to a file.
339 */
340 int
341 hfs_vnop_write(struct vnop_write_args *ap)
342 {
343 uio_t uio = ap->a_uio;
344 struct vnode *vp = ap->a_vp;
345 struct cnode *cp;
346 struct filefork *fp;
347 struct hfsmount *hfsmp;
348 kauth_cred_t cred = NULL;
349 off_t origFileSize;
350 off_t writelimit;
351 off_t bytesToAdd = 0;
352 off_t actualBytesAdded;
353 off_t filebytes;
354 off_t offset;
355 ssize_t resid;
356 int eflags;
357 int ioflag = ap->a_ioflag;
358 int retval = 0;
359 int lockflags;
360 int cnode_locked = 0;
361 int partialwrite = 0;
362 int do_snapshot = 1;
363 time_t orig_ctime=VTOC(vp)->c_ctime;
364 int took_truncate_lock = 0;
365 int io_return_on_throttle = 0;
366 int throttled_count = 0;
367
368 #if HFS_COMPRESSION
369 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
370 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
371 switch(state) {
372 case FILE_IS_COMPRESSED:
373 return EACCES;
374 case FILE_IS_CONVERTING:
375 /* if FILE_IS_CONVERTING, we allow writes but do not
376 bother with snapshots or else we will deadlock.
377 */
378 do_snapshot = 0;
379 break;
380 default:
381 printf("invalid state %d for compressed file\n", state);
382 /* fall through */
383 }
384 } else if ((VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
385 int error;
386
387 error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP);
388 if (error != 0) {
389 return error;
390 }
391 }
392
393 if (do_snapshot) {
394 check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio);
395 }
396
397 #endif
398
399 #if SECURE_KERNEL
400 if ((ioflag & (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) ==
401 (IO_SKIP_ENCRYPTION|IO_SYSCALL_DISPATCH)) {
402 /* Don't allow unencrypted io request from user space */
403 return EPERM;
404 }
405 #endif
406
407 resid = uio_resid(uio);
408 offset = uio_offset(uio);
409
410 if (offset < 0)
411 return (EINVAL);
412 if (resid == 0)
413 return (E_NONE);
414 if (!vnode_isreg(vp))
415 return (EPERM); /* Can only write regular files */
416
417 cp = VTOC(vp);
418 fp = VTOF(vp);
419 hfsmp = VTOHFS(vp);
420
421 #if CONFIG_PROTECT
422 if ((retval = cp_handle_vnop (vp, CP_WRITE_ACCESS, 0)) != 0) {
423 goto exit;
424 }
425 #endif
426
427 eflags = kEFDeferMask; /* defer file block allocations */
428 #if HFS_SPARSE_DEV
429 /*
430 * When the underlying device is sparse and free space is low
431 * (< 2048 allocation blocks, roughly 8MB at a 4KB block size), stop doing delayed allocations
432 * and begin doing synchronous I/O.
433 */
434 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
435 (hfs_freeblks(hfsmp, 0) < 2048)) {
436 eflags &= ~kEFDeferMask;
437 ioflag |= IO_SYNC;
438 }
439 #endif /* HFS_SPARSE_DEV */
440
441 if ((ioflag & (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) ==
442 (IO_SINGLE_WRITER | IO_SYSCALL_DISPATCH)) {
443 io_return_on_throttle = IO_RETURN_ON_THROTTLE;
444 }
445
446 again:
447 /*
448 * Protect against a size change.
449 *
450 * Note: If took_truncate_lock is true, then we previously got the lock shared
451 * but needed to upgrade to exclusive. So try getting it exclusive from the
452 * start.
453 */
454 if (ioflag & IO_APPEND || took_truncate_lock) {
455 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
456 }
457 else {
458 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_DEFAULT);
459 }
460 took_truncate_lock = 1;
461
462 /* Update UIO */
463 if (ioflag & IO_APPEND) {
464 uio_setoffset(uio, fp->ff_size);
465 offset = fp->ff_size;
466 }
467 if ((cp->c_bsdflags & APPEND) && offset != fp->ff_size) {
468 retval = EPERM;
469 goto exit;
470 }
471
472 cred = vfs_context_ucred(ap->a_context);
473 if (cred && suser(cred, NULL) != 0)
474 eflags |= kEFReserveMask;
475
476 origFileSize = fp->ff_size;
477 writelimit = offset + resid;
478
479 /*
480 * We may need an exclusive truncate lock for several reasons, all
481 * of which are because we may be writing to a (portion of a) block
482 * for the first time, and we need to make sure no readers see the
483 * prior, uninitialized contents of the block. The cases are:
484 *
485 * 1. We have unallocated (delayed allocation) blocks. We may be
486 * allocating new blocks to the file and writing to them.
487 * (A more precise check would be whether the range we're writing
488 * to contains delayed allocation blocks.)
489 * 2. We need to extend the file. The bytes between the old EOF
490 * and the new EOF are not yet initialized. This is important
491 * even if we're not allocating new blocks to the file. If the
492 * old EOF and new EOF are in the same block, we still need to
493 * protect that range of bytes until they are written for the
494 * first time.
495 *
496 * If we had a shared lock with the above cases, we need to try to upgrade
497 * to an exclusive lock. If the upgrade fails, we will lose the shared
498 * lock, and will need to take the truncate lock again; the took_truncate_lock
499 * flag will still be set, causing us to try for an exclusive lock next time.
500 */
501 if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) &&
502 ((fp->ff_unallocblocks != 0) ||
503 (writelimit > origFileSize))) {
504 if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) {
505 /*
506 * Lock upgrade failed and we lost our shared lock, try again.
507 * Note: we do not set took_truncate_lock=0 here. Leaving it
508 * set to 1 will cause us to try to get the lock exclusive.
509 */
510 goto again;
511 }
512 else {
513 /* Store the owner in the c_truncatelockowner field if we successfully upgrade */
514 cp->c_truncatelockowner = current_thread();
515 }
516 }
517
518 if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
519 goto exit;
520 }
521 cnode_locked = 1;
522
523 filebytes = hfs_blk_to_bytes(fp->ff_blocks, hfsmp->blockSize);
524
525 if (offset > filebytes
526 && (hfs_blk_to_bytes(hfs_freeblks(hfsmp, ISSET(eflags, kEFReserveMask)),
527 hfsmp->blockSize) < offset - filebytes)) {
528 retval = ENOSPC;
529 goto exit;
530 }
531
532 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_START,
533 (int)offset, uio_resid(uio), (int)fp->ff_size,
534 (int)filebytes, 0);
535
536 /* Check if we do not need to extend the file */
537 if (writelimit <= filebytes) {
538 goto sizeok;
539 }
540
541 bytesToAdd = writelimit - filebytes;
542
543 #if QUOTA
544 retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, hfsmp->blockSize)),
545 cred, 0);
546 if (retval)
547 goto exit;
548 #endif /* QUOTA */
549
550 if (hfs_start_transaction(hfsmp) != 0) {
551 retval = EINVAL;
552 goto exit;
553 }
554
555 while (writelimit > filebytes) {
556 bytesToAdd = writelimit - filebytes;
557
558 /* Protect extents b-tree and allocation bitmap */
559 lockflags = SFL_BITMAP;
560 if (overflow_extents(fp))
561 lockflags |= SFL_EXTENTS;
562 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
563
564 /* Files that are changing size are not hot file candidates. */
565 if (hfsmp->hfc_stage == HFC_RECORDING) {
566 fp->ff_bytesread = 0;
567 }
568 retval = MacToVFSError(ExtendFileC (hfsmp, (FCB*)fp, bytesToAdd,
569 0, eflags, &actualBytesAdded));
570
571 hfs_systemfile_unlock(hfsmp, lockflags);
572
573 if ((actualBytesAdded == 0) && (retval == E_NONE))
574 retval = ENOSPC;
575 if (retval != E_NONE)
576 break;
577 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
578 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_NONE,
579 (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
580 }
581 (void) hfs_update(vp, 0);
582 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
583 (void) hfs_end_transaction(hfsmp);
584
585 /*
586 * If we didn't grow the file enough, try a partial write.
587 * POSIX expects this behavior.
588 */
589 if ((retval == ENOSPC) && (filebytes > offset)) {
590 retval = 0;
591 partialwrite = 1;
592 uio_setresid(uio, (uio_resid(uio) - bytesToAdd));
593 resid -= bytesToAdd;
594 writelimit = filebytes;
595 }
596 sizeok:
597 if (retval == E_NONE) {
598 off_t filesize;
599 off_t head_off;
600 int lflag;
601
602 if (writelimit > fp->ff_size) {
603 filesize = writelimit;
604 struct timeval tv;
605 rl_add(fp->ff_size, writelimit - 1 , &fp->ff_invalidranges);
606 microuptime(&tv);
607 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
608 } else
609 filesize = fp->ff_size;
610
611 lflag = ioflag & ~(IO_TAILZEROFILL | IO_HEADZEROFILL | IO_NOZEROVALID | IO_NOZERODIRTY);
612
613 /*
614 * We no longer use IO_HEADZEROFILL or IO_TAILZEROFILL (except
615 * for one case below). For the regions that lie before the
616 * beginning and after the end of this write that are in the
617 * same page, we let the cluster code handle zeroing that out
618 * if necessary. If those areas are not cached, the cluster
619 * code will try and read those areas in, and in the case
620 * where those regions have never been written to,
621 * hfs_vnop_blockmap will consult the invalid ranges and then
622 * indicate that. The cluster code will zero out those areas.
623 */
624
625 head_off = trunc_page_64(offset);
626
627 if (head_off < offset && head_off >= fp->ff_size) {
628 /*
629 * The first page is beyond current EOF, so as an
630 * optimisation, we can pass IO_HEADZEROFILL.
631 */
632 lflag |= IO_HEADZEROFILL;
633 }
634
635 hfs_unlock(cp);
636 cnode_locked = 0;
637
638 /*
639 * We need to tell UBC the fork's new size BEFORE calling
640 * cluster_write, in case any of the new pages need to be
641 * paged out before cluster_write completes (which does happen
642 * in embedded systems due to extreme memory pressure).
643 * Similarly, we need to tell hfs_vnop_pageout what the new EOF
644 * will be, so that it can pass that on to cluster_pageout, and
645 * allow those pageouts.
646 *
647 * We don't update ff_size yet since we don't want pageins to
648 * be able to see uninitialized data between the old and new
649 * EOF, until cluster_write has completed and initialized that
650 * part of the file.
651 *
652 * The vnode pager relies on the file size last given to UBC via
653 * ubc_setsize. hfs_vnop_pageout relies on fp->ff_new_size or
654 * ff_size (whichever is larger). NOTE: ff_new_size is always
655 * zero, unless we are extending the file via write.
656 */
657 if (filesize > fp->ff_size) {
658 retval = hfs_zero_eof_page(vp, offset);
659 if (retval)
660 goto exit;
661 fp->ff_new_size = filesize;
662 ubc_setsize(vp, filesize);
663 }
664 retval = cluster_write(vp, uio, fp->ff_size, filesize, head_off,
665 0, lflag | IO_NOZERODIRTY | io_return_on_throttle);
666 if (retval) {
667 fp->ff_new_size = 0; /* no longer extending; use ff_size */
668
669 if (retval == EAGAIN) {
670 /*
671 * EAGAIN indicates that we still have I/O to do, but
672 * that we now need to be throttled
673 */
674 if (resid != uio_resid(uio)) {
675 /*
676 * did manage to do some I/O before returning EAGAIN
677 */
678 resid = uio_resid(uio);
679 offset = uio_offset(uio);
680
681 cp->c_touch_chgtime = TRUE;
682 cp->c_touch_modtime = TRUE;
683 hfs_incr_gencount(cp);
684 }
685 if (filesize > fp->ff_size) {
686 /*
687 * we called ubc_setsize before the call to
688 * cluster_write... since we only partially
689 * completed the I/O, we need to
690 * re-adjust our idea of the filesize based
691 * on our interim EOF
692 */
693 ubc_setsize(vp, offset);
694
695 fp->ff_size = offset;
696 }
697 goto exit;
698 }
699 if (filesize > origFileSize) {
700 ubc_setsize(vp, origFileSize);
701 }
702 goto ioerr_exit;
703 }
704
705 if (filesize > origFileSize) {
706 fp->ff_size = filesize;
707
708 /* Files that are changing size are not hot file candidates. */
709 if (hfsmp->hfc_stage == HFC_RECORDING) {
710 fp->ff_bytesread = 0;
711 }
712 }
713 fp->ff_new_size = 0; /* ff_size now has the correct size */
714 }
715 if (partialwrite) {
716 uio_setresid(uio, (uio_resid(uio) + bytesToAdd));
717 resid += bytesToAdd;
718 }
719
720 // XXXdbg - see radar 4871353 for more info
721 {
722 if (flush_cache_on_write && ((ioflag & IO_NOCACHE) || vnode_isnocache(vp))) {
723 hfs_flush(hfsmp, HFS_FLUSH_CACHE);
724 }
725 }
726
727 ioerr_exit:
728 if (!cnode_locked) {
729 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
730 cnode_locked = 1;
731 }
732
733 if (resid > uio_resid(uio)) {
734 cp->c_touch_chgtime = TRUE;
735 cp->c_touch_modtime = TRUE;
736 hfs_incr_gencount(cp);
737
738 /*
739 * If we successfully wrote any data and we are not the superuser,
740 * we clear the setuid and setgid bits as a precaution against
741 * tampering.
742 */
743 if (cp->c_mode & (S_ISUID | S_ISGID)) {
744 cred = vfs_context_ucred(ap->a_context);
745 if (cred && suser(cred, NULL)) {
746 cp->c_mode &= ~(S_ISUID | S_ISGID);
747 }
748 }
749 }
750 if (retval) {
751 if (ioflag & IO_UNIT) {
752 (void)hfs_truncate(vp, origFileSize, ioflag & IO_SYNC,
753 0, ap->a_context);
754 uio_setoffset(uio, (uio_offset(uio) - (resid - uio_resid(uio))));
755 uio_setresid(uio, resid);
756 filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize;
757 }
758 } else if ((ioflag & IO_SYNC) && (resid > uio_resid(uio)))
759 retval = hfs_update(vp, 0);
760
761 /* Updating vcbWrCnt doesn't need to be atomic. */
762 hfsmp->vcbWrCnt++;
763
764 KERNEL_DEBUG(HFSDBG_WRITE | DBG_FUNC_END,
765 (int)uio_offset(uio), uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0);
766 exit:
767 if (retval && took_truncate_lock
768 && cp->c_truncatelockowner == current_thread()) {
769 fp->ff_new_size = 0;
770 rl_remove(fp->ff_size, RL_INFINITY, &fp->ff_invalidranges);
771 }
772
773 if (cnode_locked)
774 hfs_unlock(cp);
775
776 if (took_truncate_lock) {
777 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
778 }
779 if (retval == EAGAIN) {
780 throttle_lowpri_io(1);
781 throttled_count++;
782
783 retval = 0;
784 goto again;
785 }
786 if (throttled_count) {
787 throttle_info_reset_window((uthread_t)get_bsdthread_info(current_thread()));
788 }
789 return (retval);
790 }
791
792 /* support for the "bulk-access" fcntl */
793
794 #define CACHE_LEVELS 16
795 #define NUM_CACHE_ENTRIES (64*16)
796 #define PARENT_IDS_FLAG 0x100
797
798 struct access_cache {
799 int numcached;
800 int cachehits; /* these two for statistics gathering */
801 int lookups;
802 unsigned int *acache;
803 unsigned char *haveaccess;
804 };
805
806 struct access_t {
807 uid_t uid; /* IN: effective user id */
808 short flags; /* IN: access requested (i.e. R_OK) */
809 short num_groups; /* IN: number of groups user belongs to */
810 int num_files; /* IN: number of files to process */
811 int *file_ids; /* IN: array of file ids */
812 gid_t *groups; /* IN: array of groups */
813 short *access; /* OUT: access info for each file (0 for 'has access') */
814 } __attribute__((unavailable)); // this structure is for reference purposes only
815
816 struct user32_access_t {
817 uid_t uid; /* IN: effective user id */
818 short flags; /* IN: access requested (i.e. R_OK) */
819 short num_groups; /* IN: number of groups user belongs to */
820 int num_files; /* IN: number of files to process */
821 user32_addr_t file_ids; /* IN: array of file ids */
822 user32_addr_t groups; /* IN: array of groups */
823 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
824 };
825
826 struct user64_access_t {
827 uid_t uid; /* IN: effective user id */
828 short flags; /* IN: access requested (i.e. R_OK) */
829 short num_groups; /* IN: number of groups user belongs to */
830 int num_files; /* IN: number of files to process */
831 user64_addr_t file_ids; /* IN: array of file ids */
832 user64_addr_t groups; /* IN: array of groups */
833 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
834 };
835
836
837 // these are the "extended" versions of the above structures
838 // note that it is crucial that they be sized differently than
839 // the regular versions
840 struct ext_access_t {
841 uint32_t flags; /* IN: access requested (i.e. R_OK) */
842 uint32_t num_files; /* IN: number of files to process */
843 uint32_t map_size; /* IN: size of the bit map */
844 uint32_t *file_ids; /* IN: Array of file ids */
845 char *bitmap; /* OUT: hash-bitmap of interesting directory ids */
846 short *access; /* OUT: access info for each file (0 for 'has access') */
847 uint32_t num_parents; /* future use */
848 cnid_t *parents; /* future use */
849 } __attribute__((unavailable)); // this structure is for reference purposes only
850
851 struct user32_ext_access_t {
852 uint32_t flags; /* IN: access requested (i.e. R_OK) */
853 uint32_t num_files; /* IN: number of files to process */
854 uint32_t map_size; /* IN: size of the bit map */
855 user32_addr_t file_ids; /* IN: Array of file ids */
856 user32_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
857 user32_addr_t access; /* OUT: access info for each file (0 for 'has access') */
858 uint32_t num_parents; /* future use */
859 user32_addr_t parents; /* future use */
860 };
861
862 struct user64_ext_access_t {
863 uint32_t flags; /* IN: access requested (i.e. R_OK) */
864 uint32_t num_files; /* IN: number of files to process */
865 uint32_t map_size; /* IN: size of the bit map */
866 user64_addr_t file_ids; /* IN: array of file ids */
867 user64_addr_t bitmap; /* OUT: hash-bitmap of interesting directory ids */
868 user64_addr_t access; /* OUT: access info for each file (0 for 'has access') */
869 uint32_t num_parents;/* future use */
870 user64_addr_t parents;/* future use */
871 };
872
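/*
 * Rough userspace sketch (illustrative; assumes the user-visible ext_access_t
 * layout and the HFS_EXT_BULKACCESS selector exported by hfs_fsctl.h):
 *
 *     uint32_t ids[NFILES];      // catalog node ids to test
 *     short results[NFILES];     // 0 == caller has access
 *     struct ext_access_t args = {
 *         .flags = R_OK, .num_files = NFILES,
 *         .file_ids = ids, .access = results,
 *         .map_size = 0, .bitmap = NULL,
 *         .num_parents = 0, .parents = NULL,
 *     };
 *     fsctl("/Volumes/MyVolume", HFS_EXT_BULKACCESS, &args, 0);
 *
 * do_bulk_access_check() below copies in the file ids, walks each parent
 * chain with do_access_check(), and copies the per-file results back out.
 */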
873
874 /*
875 * Perform a binary search for the given parent_id. Return value is
876 * the index if there is a match. If no_match_indexp is non-NULL, it
877 * will be set to the index at which to insert the item (even if no
878 * match was found).
879 */
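/*
 * Example: for array = {2, 5, 9} and hi == 2, searching for 5 returns
 * index 1; searching for 7 returns -1 and sets *no_match_indexp to 2
 * (where 7 would be inserted).
 */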
880 static int cache_binSearch(cnid_t *array, unsigned int hi, cnid_t parent_id, int *no_match_indexp)
881 {
882 int index=-1;
883 unsigned int lo=0;
884
885 do {
886 unsigned int mid = ((hi - lo)/2) + lo;
887 unsigned int this_id = array[mid];
888
889 if (parent_id == this_id) {
890 hi = mid;
891 break;
892 }
893
894 if (parent_id < this_id) {
895 hi = mid;
896 continue;
897 }
898
899 if (parent_id > this_id) {
900 lo = mid + 1;
901 continue;
902 }
903 } while(lo < hi);
904
905 /* check if lo and hi converged on the match */
906 if (parent_id == array[hi]) {
907 index = hi;
908 }
909
910 if (no_match_indexp) {
911 *no_match_indexp = hi;
912 }
913
914 return index;
915 }
916
917
918 static int
919 lookup_bucket(struct access_cache *cache, int *indexp, cnid_t parent_id)
920 {
921 unsigned int hi;
922 int matches = 0;
923 int index, no_match_index;
924
925 if (cache->numcached == 0) {
926 *indexp = 0;
927 return 0; // table is empty, so insert at index=0 and report no match
928 }
929
930 if (cache->numcached > NUM_CACHE_ENTRIES) {
931 cache->numcached = NUM_CACHE_ENTRIES;
932 }
933
934 hi = cache->numcached - 1;
935
936 index = cache_binSearch(cache->acache, hi, parent_id, &no_match_index);
937
938 /* if no existing entry found, find index for new one */
939 if (index == -1) {
940 index = no_match_index;
941 matches = 0;
942 } else {
943 matches = 1;
944 }
945
946 *indexp = index;
947 return matches;
948 }
949
950 /*
951 * Add a node to the access_cache at the given index (or do a lookup first
952 * to find the index if -1 is passed in). We currently do a replace rather
953 * than an insert if the cache is full.
954 */
955 static void
956 add_node(struct access_cache *cache, int index, cnid_t nodeID, int access)
957 {
958 int lookup_index = -1;
959
960 /* need to do a lookup first if -1 passed for index */
961 if (index == -1) {
962 if (lookup_bucket(cache, &lookup_index, nodeID)) {
963 if (cache->haveaccess[lookup_index] != access && cache->haveaccess[lookup_index] == ESRCH) {
964 // only update an entry if the previous access was ESRCH (i.e. a scope checking error)
965 cache->haveaccess[lookup_index] = access;
966 }
967
968 /* mission accomplished */
969 return;
970 } else {
971 index = lookup_index;
972 }
973
974 }
975
976 /* if the cache is full, do a replace rather than an insert */
977 if (cache->numcached >= NUM_CACHE_ENTRIES) {
978 cache->numcached = NUM_CACHE_ENTRIES-1;
979
980 if (index > cache->numcached) {
981 index = cache->numcached;
982 }
983 }
984
985 if (index < cache->numcached && index < NUM_CACHE_ENTRIES && nodeID > cache->acache[index]) {
986 index++;
987 }
988
989 if (index >= 0 && index < cache->numcached) {
990 /* only do bcopy if we're inserting */
991 bcopy( cache->acache+index, cache->acache+(index+1), (cache->numcached - index)*sizeof(int) );
992 bcopy( cache->haveaccess+index, cache->haveaccess+(index+1), (cache->numcached - index)*sizeof(unsigned char) );
993 }
994
995 cache->acache[index] = nodeID;
996 cache->haveaccess[index] = access;
997 cache->numcached++;
998 }
999
1000
1001 struct cinfo {
1002 uid_t uid;
1003 gid_t gid;
1004 mode_t mode;
1005 cnid_t parentcnid;
1006 u_int16_t recflags;
1007 };
1008
1009 static int
1010 snoop_callback(const cnode_t *cp, void *arg)
1011 {
1012 struct cinfo *cip = arg;
1013
1014 cip->uid = cp->c_uid;
1015 cip->gid = cp->c_gid;
1016 cip->mode = cp->c_mode;
1017 cip->parentcnid = cp->c_parentcnid;
1018 cip->recflags = cp->c_attr.ca_recflags;
1019
1020 return (0);
1021 }
1022
1023 /*
1024 * Look up the cnid's attr info (uid, gid, and mode) as well as its parent id. If the item
1025 * isn't in core, then go to the catalog.
1026 */
1027 static int
1028 do_attr_lookup(struct hfsmount *hfsmp, struct access_cache *cache, cnid_t cnid,
1029 struct cnode *skip_cp, CatalogKey *keyp, struct cat_attr *cnattrp)
1030 {
1031 int error = 0;
1032
1033 /* if this id matches the one the fsctl was called with, skip the lookup */
1034 if (cnid == skip_cp->c_cnid) {
1035 cnattrp->ca_uid = skip_cp->c_uid;
1036 cnattrp->ca_gid = skip_cp->c_gid;
1037 cnattrp->ca_mode = skip_cp->c_mode;
1038 cnattrp->ca_recflags = skip_cp->c_attr.ca_recflags;
1039 keyp->hfsPlus.parentID = skip_cp->c_parentcnid;
1040 } else {
1041 struct cinfo c_info;
1042
1043 /* otherwise, check the cnode hash in case the file/dir is in core */
1044 error = hfs_chash_snoop(hfsmp, cnid, 0, snoop_callback, &c_info);
1045
1046 if (error == EACCES) {
1047 // File is deleted
1048 return ENOENT;
1049 } else if (!error) {
1050 cnattrp->ca_uid = c_info.uid;
1051 cnattrp->ca_gid = c_info.gid;
1052 cnattrp->ca_mode = c_info.mode;
1053 cnattrp->ca_recflags = c_info.recflags;
1054 keyp->hfsPlus.parentID = c_info.parentcnid;
1055 } else {
1056 int lockflags;
1057
1058 if (throttle_io_will_be_throttled(-1, HFSTOVFS(hfsmp)))
1059 throttle_lowpri_io(1);
1060
1061 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
1062
1063 /* lookup this cnid in the catalog */
1064 error = cat_getkeyplusattr(hfsmp, cnid, keyp, cnattrp);
1065
1066 hfs_systemfile_unlock(hfsmp, lockflags);
1067
1068 cache->lookups++;
1069 }
1070 }
1071
1072 return (error);
1073 }
1074
1075
1076 /*
1077 * Compute whether we have access to the given directory (nodeID) and all its parents. Cache
1078 * up to CACHE_LEVELS as we progress towards the root.
1079 */
1080 static int
1081 do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HFSCatalogNodeID nodeID,
1082 struct cnode *skip_cp, struct proc *theProcPtr, kauth_cred_t myp_ucred,
1083 struct vfs_context *my_context,
1084 char *bitmap,
1085 uint32_t map_size,
1086 cnid_t* parents,
1087 uint32_t num_parents)
1088 {
1089 int myErr = 0;
1090 int myResult;
1091 HFSCatalogNodeID thisNodeID;
1092 unsigned int myPerms;
1093 struct cat_attr cnattr;
1094 int cache_index = -1, scope_index = -1, scope_idx_start = -1;
1095 CatalogKey catkey;
1096
1097 int i = 0, ids_to_cache = 0;
1098 int parent_ids[CACHE_LEVELS];
1099
1100 thisNodeID = nodeID;
1101 while (thisNodeID >= kRootDirID) {
1102 myResult = 0; /* default to "no access" */
1103
1104 /* check the cache before resorting to hitting the catalog */
1105
1106 /* ASSUMPTION: access info of cached entries is "final"... i.e. no need
1107 * to look any further after hitting cached dir */
1108
1109 if (lookup_bucket(cache, &cache_index, thisNodeID)) {
1110 cache->cachehits++;
1111 myErr = cache->haveaccess[cache_index];
1112 if (scope_index != -1) {
1113 if (myErr == ESRCH) {
1114 myErr = 0;
1115 }
1116 } else {
1117 scope_index = 0; // so we'll just use the cache result
1118 scope_idx_start = ids_to_cache;
1119 }
1120 myResult = (myErr == 0) ? 1 : 0;
1121 goto ExitThisRoutine;
1122 }
1123
1124
1125 if (parents) {
1126 int tmp;
1127 tmp = cache_binSearch(parents, num_parents-1, thisNodeID, NULL);
1128 if (scope_index == -1)
1129 scope_index = tmp;
1130 if (tmp != -1 && scope_idx_start == -1 && ids_to_cache < CACHE_LEVELS) {
1131 scope_idx_start = ids_to_cache;
1132 }
1133 }
1134
1135 /* remember which parents we want to cache */
1136 if (ids_to_cache < CACHE_LEVELS) {
1137 parent_ids[ids_to_cache] = thisNodeID;
1138 ids_to_cache++;
1139 }
1140 // Inefficient (using modulo) and we might want to use a hash function, not rely on the node id to be "nice"...
1141 if (bitmap && map_size) {
1142 bitmap[(thisNodeID/8)%(map_size)]|=(1<<(thisNodeID&7));
1143 }
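/*
 * Example of the hashing above: with thisNodeID == 1234 and map_size == 100,
 * this sets bit (1234 & 7) == 2 in bitmap[(1234/8) % 100] == bitmap[54].
 */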
1144
1145
1146 /* do the lookup (checks the cnode hash, then the catalog) */
1147 myErr = do_attr_lookup(hfsmp, cache, thisNodeID, skip_cp, &catkey, &cnattr);
1148 if (myErr) {
1149 goto ExitThisRoutine; /* no access */
1150 }
1151
1152 /* Root always gets access. */
1153 if (suser(myp_ucred, NULL) == 0) {
1154 thisNodeID = catkey.hfsPlus.parentID;
1155 myResult = 1;
1156 continue;
1157 }
1158
1159 // if the thing has acl's, do the full permission check
1160 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1161 struct vnode *vp;
1162
1163 /* get the vnode for this cnid */
1164 myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0);
1165 if ( myErr ) {
1166 myResult = 0;
1167 goto ExitThisRoutine;
1168 }
1169
1170 thisNodeID = VTOC(vp)->c_parentcnid;
1171
1172 hfs_unlock(VTOC(vp));
1173
1174 if (vnode_vtype(vp) == VDIR) {
1175 myErr = vnode_authorize(vp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), my_context);
1176 } else {
1177 myErr = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, my_context);
1178 }
1179
1180 vnode_put(vp);
1181 if (myErr) {
1182 myResult = 0;
1183 goto ExitThisRoutine;
1184 }
1185 } else {
1186 unsigned int flags;
1187 int mode = cnattr.ca_mode & S_IFMT;
1188 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr);
1189
1190 if (mode == S_IFDIR) {
1191 flags = R_OK | X_OK;
1192 } else {
1193 flags = R_OK;
1194 }
1195 if ( (myPerms & flags) != flags) {
1196 myResult = 0;
1197 myErr = EACCES;
1198 goto ExitThisRoutine; /* no access */
1199 }
1200
1201 /* up the hierarchy we go */
1202 thisNodeID = catkey.hfsPlus.parentID;
1203 }
1204 }
1205
1206 /* if here, we have access to this node */
1207 myResult = 1;
1208
1209 ExitThisRoutine:
1210 if (parents && myErr == 0 && scope_index == -1) {
1211 myErr = ESRCH;
1212 }
1213
1214 if (myErr) {
1215 myResult = 0;
1216 }
1217 *err = myErr;
1218
1219 /* cache the parent directory(ies) */
1220 for (i = 0; i < ids_to_cache; i++) {
1221 if (myErr == 0 && parents && (scope_idx_start == -1 || i > scope_idx_start)) {
1222 add_node(cache, -1, parent_ids[i], ESRCH);
1223 } else {
1224 add_node(cache, -1, parent_ids[i], myErr);
1225 }
1226 }
1227
1228 return (myResult);
1229 }
1230
1231 static int
1232 do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp,
1233 struct vnop_ioctl_args *ap, int arg_size, vfs_context_t context)
1234 {
1235 boolean_t is64bit;
1236
1237 /*
1238 * NOTE: on entry, the vnode has an io_ref. In case this vnode
1239 * happens to be in our list of file_ids, we'll note it and
1240 * avoid calling hfs_chashget_nowait() on that id as that
1241 * will cause a "locking against myself" panic.
1242 */
1243 Boolean check_leaf = true;
1244
1245 struct user64_ext_access_t *user_access_structp;
1246 struct user64_ext_access_t tmp_user_access;
1247 struct access_cache cache;
1248
1249 int error = 0, prev_parent_check_ok=1;
1250 unsigned int i;
1251
1252 short flags;
1253 unsigned int num_files = 0;
1254 int map_size = 0;
1255 int num_parents = 0;
1256 int *file_ids=NULL;
1257 short *access=NULL;
1258 char *bitmap=NULL;
1259 cnid_t *parents=NULL;
1260 int leaf_index;
1261
1262 cnid_t cnid;
1263 cnid_t prevParent_cnid = 0;
1264 unsigned int myPerms;
1265 short myaccess = 0;
1266 struct cat_attr cnattr;
1267 CatalogKey catkey;
1268 struct cnode *skip_cp = VTOC(vp);
1269 kauth_cred_t cred = vfs_context_ucred(context);
1270 proc_t p = vfs_context_proc(context);
1271
1272 is64bit = proc_is64bit(p);
1273
1274 /* initialize the local cache and buffers */
1275 cache.numcached = 0;
1276 cache.cachehits = 0;
1277 cache.lookups = 0;
1278 cache.acache = NULL;
1279 cache.haveaccess = NULL;
1280
1281 /* struct copyin done during dispatch... need to copy file_id array separately */
1282 if (ap->a_data == NULL) {
1283 error = EINVAL;
1284 goto err_exit_bulk_access;
1285 }
1286
1287 if (is64bit) {
1288 if (arg_size != sizeof(struct user64_ext_access_t)) {
1289 error = EINVAL;
1290 goto err_exit_bulk_access;
1291 }
1292
1293 user_access_structp = (struct user64_ext_access_t *)ap->a_data;
1294
1295 } else if (arg_size == sizeof(struct user32_access_t)) {
1296 struct user32_access_t *accessp = (struct user32_access_t *)ap->a_data;
1297
1298 // convert an old style bulk-access struct to the new style
1299 tmp_user_access.flags = accessp->flags;
1300 tmp_user_access.num_files = accessp->num_files;
1301 tmp_user_access.map_size = 0;
1302 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1303 tmp_user_access.bitmap = USER_ADDR_NULL;
1304 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1305 tmp_user_access.num_parents = 0;
1306 user_access_structp = &tmp_user_access;
1307
1308 } else if (arg_size == sizeof(struct user32_ext_access_t)) {
1309 struct user32_ext_access_t *accessp = (struct user32_ext_access_t *)ap->a_data;
1310
1311 // up-cast from a 32-bit version of the struct
1312 tmp_user_access.flags = accessp->flags;
1313 tmp_user_access.num_files = accessp->num_files;
1314 tmp_user_access.map_size = accessp->map_size;
1315 tmp_user_access.num_parents = accessp->num_parents;
1316
1317 tmp_user_access.file_ids = CAST_USER_ADDR_T(accessp->file_ids);
1318 tmp_user_access.bitmap = CAST_USER_ADDR_T(accessp->bitmap);
1319 tmp_user_access.access = CAST_USER_ADDR_T(accessp->access);
1320 tmp_user_access.parents = CAST_USER_ADDR_T(accessp->parents);
1321
1322 user_access_structp = &tmp_user_access;
1323 } else {
1324 error = EINVAL;
1325 goto err_exit_bulk_access;
1326 }
1327
1328 map_size = user_access_structp->map_size;
1329
1330 num_files = user_access_structp->num_files;
1331
1332 num_parents= user_access_structp->num_parents;
1333
1334 if (num_files < 1) {
1335 goto err_exit_bulk_access;
1336 }
1337 if (num_files > 1024) {
1338 error = EINVAL;
1339 goto err_exit_bulk_access;
1340 }
1341
1342 if (num_parents > 1024) {
1343 error = EINVAL;
1344 goto err_exit_bulk_access;
1345 }
1346
1347 file_ids = (int *) kalloc(sizeof(int) * num_files);
1348 access = (short *) kalloc(sizeof(short) * num_files);
1349 if (map_size) {
1350 bitmap = (char *) kalloc(sizeof(char) * map_size);
1351 }
1352
1353 if (num_parents) {
1354 parents = (cnid_t *) kalloc(sizeof(cnid_t) * num_parents);
1355 }
1356
1357 cache.acache = (unsigned int *) kalloc(sizeof(int) * NUM_CACHE_ENTRIES);
1358 cache.haveaccess = (unsigned char *) kalloc(sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1359
1360 if (file_ids == NULL || access == NULL || (map_size != 0 && bitmap == NULL) || cache.acache == NULL || cache.haveaccess == NULL) {
1361 if (file_ids) {
1362 kfree(file_ids, sizeof(int) * num_files);
1363 }
1364 if (bitmap) {
1365 kfree(bitmap, sizeof(char) * map_size);
1366 }
1367 if (access) {
1368 kfree(access, sizeof(short) * num_files);
1369 }
1370 if (cache.acache) {
1371 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1372 }
1373 if (cache.haveaccess) {
1374 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1375 }
1376 if (parents) {
1377 kfree(parents, sizeof(cnid_t) * num_parents);
1378 }
1379 return ENOMEM;
1380 }
1381
1382 // make sure the bitmap is zeroed out...
1383 if (bitmap) {
1384 bzero(bitmap, (sizeof(char) * map_size));
1385 }
1386
1387 if ((error = copyin(user_access_structp->file_ids, (caddr_t)file_ids,
1388 num_files * sizeof(int)))) {
1389 goto err_exit_bulk_access;
1390 }
1391
1392 if (num_parents) {
1393 if ((error = copyin(user_access_structp->parents, (caddr_t)parents,
1394 num_parents * sizeof(cnid_t)))) {
1395 goto err_exit_bulk_access;
1396 }
1397 }
1398
1399 flags = user_access_structp->flags;
1400 if ((flags & (F_OK | R_OK | W_OK | X_OK)) == 0) {
1401 flags = R_OK;
1402 }
1403
1404 /* check if we've been passed leaf node ids or parent ids */
1405 if (flags & PARENT_IDS_FLAG) {
1406 check_leaf = false;
1407 }
1408
1409 /* Check access to each file_id passed in */
1410 for (i = 0; i < num_files; i++) {
1411 leaf_index=-1;
1412 cnid = (cnid_t) file_ids[i];
1413
1414 /* root always has access */
1415 if ((!parents) && (!suser(cred, NULL))) {
1416 access[i] = 0;
1417 continue;
1418 }
1419
1420 if (check_leaf) {
1421 /* do the lookup (checks the cnode hash, then the catalog) */
1422 error = do_attr_lookup(hfsmp, &cache, cnid, skip_cp, &catkey, &cnattr);
1423 if (error) {
1424 access[i] = (short) error;
1425 continue;
1426 }
1427
1428 if (parents) {
1429 // Check if the leaf matches one of the parent scopes
1430 leaf_index = cache_binSearch(parents, num_parents-1, cnid, NULL);
1431 if (leaf_index >= 0 && parents[leaf_index] == cnid)
1432 prev_parent_check_ok = 0;
1433 else if (leaf_index >= 0)
1434 prev_parent_check_ok = 1;
1435 }
1436
1437 // if the thing has acl's, do the full permission check
1438 if ((cnattr.ca_recflags & kHFSHasSecurityMask) != 0) {
1439 struct vnode *cvp;
1440 int myErr = 0;
1441 /* get the vnode for this cnid */
1442 myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0);
1443 if ( myErr ) {
1444 access[i] = myErr;
1445 continue;
1446 }
1447
1448 hfs_unlock(VTOC(cvp));
1449
1450 if (vnode_vtype(cvp) == VDIR) {
1451 myErr = vnode_authorize(cvp, NULL, (KAUTH_VNODE_SEARCH | KAUTH_VNODE_LIST_DIRECTORY), context);
1452 } else {
1453 myErr = vnode_authorize(cvp, NULL, KAUTH_VNODE_READ_DATA, context);
1454 }
1455
1456 vnode_put(cvp);
1457 if (myErr) {
1458 access[i] = myErr;
1459 continue;
1460 }
1461 } else {
1462 /* before calling CheckAccess(), check the target file for read access */
1463 myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid,
1464 cnattr.ca_mode, hfsmp->hfs_mp, cred, p);
1465
1466 /* fail fast if no access */
1467 if ((myPerms & flags) == 0) {
1468 access[i] = EACCES;
1469 continue;
1470 }
1471 }
1472 } else {
1473 /* we were passed an array of parent ids */
1474 catkey.hfsPlus.parentID = cnid;
1475 }
1476
1477 /* if the last guy had the same parent and had access, we're done */
1478 if (i > 0 && catkey.hfsPlus.parentID == prevParent_cnid && access[i-1] == 0 && prev_parent_check_ok) {
1479 cache.cachehits++;
1480 access[i] = 0;
1481 continue;
1482 }
1483
1484 myaccess = do_access_check(hfsmp, &error, &cache, catkey.hfsPlus.parentID,
1485 skip_cp, p, cred, context,bitmap, map_size, parents, num_parents);
1486
1487 if (myaccess || (error == ESRCH && leaf_index != -1)) {
1488 access[i] = 0; // have access.. no errors to report
1489 } else {
1490 access[i] = (error != 0 ? (short) error : EACCES);
1491 }
1492
1493 prevParent_cnid = catkey.hfsPlus.parentID;
1494 }
1495
1496 /* copyout the access array */
1497 if ((error = copyout((caddr_t)access, user_access_structp->access,
1498 num_files * sizeof (short)))) {
1499 goto err_exit_bulk_access;
1500 }
1501 if (map_size && bitmap) {
1502 if ((error = copyout((caddr_t)bitmap, user_access_structp->bitmap,
1503 map_size * sizeof (char)))) {
1504 goto err_exit_bulk_access;
1505 }
1506 }
1507
1508
1509 err_exit_bulk_access:
1510
1511 if (file_ids)
1512 kfree(file_ids, sizeof(int) * num_files);
1513 if (parents)
1514 kfree(parents, sizeof(cnid_t) * num_parents);
1515 if (bitmap)
1516 kfree(bitmap, sizeof(char) * map_size);
1517 if (access)
1518 kfree(access, sizeof(short) * num_files);
1519 if (cache.acache)
1520 kfree(cache.acache, sizeof(int) * NUM_CACHE_ENTRIES);
1521 if (cache.haveaccess)
1522 kfree(cache.haveaccess, sizeof(unsigned char) * NUM_CACHE_ENTRIES);
1523
1524 return (error);
1525 }
1526
1527
1528 /* end "bulk-access" support */
1529
1530
1531 /*
1532 * Control filesystem operating characteristics.
1533 */
1534 int
1535 hfs_vnop_ioctl( struct vnop_ioctl_args /* {
1536 vnode_t a_vp;
1537 long a_command;
1538 caddr_t a_data;
1539 int a_fflag;
1540 vfs_context_t a_context;
1541 } */ *ap)
1542 {
1543 struct vnode * vp = ap->a_vp;
1544 struct hfsmount *hfsmp = VTOHFS(vp);
1545 vfs_context_t context = ap->a_context;
1546 kauth_cred_t cred = vfs_context_ucred(context);
1547 proc_t p = vfs_context_proc(context);
1548 struct vfsstatfs *vfsp;
1549 boolean_t is64bit;
1550 off_t jnl_start, jnl_size;
1551 struct hfs_journal_info *jip;
1552 #if HFS_COMPRESSION
1553 int compressed = 0;
1554 off_t uncompressed_size = -1;
1555 int decmpfs_error = 0;
1556
1557 if (ap->a_command == F_RDADVISE) {
1558 /* we need to inspect the decmpfs state of the file as early as possible */
1559 compressed = hfs_file_is_compressed(VTOC(vp), 0);
1560 if (compressed) {
1561 if (VNODE_IS_RSRC(vp)) {
1562 /* if this is the resource fork, treat it as if it were empty */
1563 uncompressed_size = 0;
1564 } else {
1565 decmpfs_error = hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0);
1566 if (decmpfs_error != 0) {
1567 /* failed to get the uncompressed size, we'll check for this later */
1568 uncompressed_size = -1;
1569 }
1570 }
1571 }
1572 }
1573 #endif /* HFS_COMPRESSION */
1574
1575 is64bit = proc_is64bit(p);
1576
1577 #if CONFIG_PROTECT
1578 {
1579 int error = 0;
1580 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
1581 return error;
1582 }
1583 }
1584 #endif /* CONFIG_PROTECT */
1585
1586 switch (ap->a_command) {
1587
1588 case HFS_GETPATH:
1589 {
1590 struct vnode *file_vp;
1591 cnid_t cnid;
1592 int outlen;
1593 char *bufptr;
1594 int error;
1595 int flags = 0;
1596
1597 /* Caller must be owner of file system. */
1598 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1599 if (suser(cred, NULL) &&
1600 kauth_cred_getuid(cred) != vfsp->f_owner) {
1601 return (EACCES);
1602 }
1603 /* Target vnode must be file system's root. */
1604 if (!vnode_isvroot(vp)) {
1605 return (EINVAL);
1606 }
1607 bufptr = (char *)ap->a_data;
1608 cnid = strtoul(bufptr, NULL, 10);
1609 if (ap->a_fflag & HFS_GETPATH_VOLUME_RELATIVE) {
1610 flags |= BUILDPATH_VOLUME_RELATIVE;
1611 }
1612
1613 /* We need to call hfs_vfs_vget to leverage the code that will
1614 * fix the origin list for us if needed, as opposed to calling
1615 * hfs_vget, since we will need the parent for the build_path call.
1616 */
1617
1618 if ((error = hfs_vfs_vget(HFSTOVFS(hfsmp), cnid, &file_vp, context))) {
1619 return (error);
1620 }
1621 error = build_path(file_vp, bufptr, sizeof(pathname_t), &outlen, flags, context);
1622 vnode_put(file_vp);
1623
1624 return (error);
1625 }
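/*
 * Userspace sketch for the case above (illustrative; assumes the HFS_GETPATH
 * selector exported by hfs_fsctl.h): the caller passes the target cnid as a
 * decimal string in a MAXPATHLEN buffer and receives the path in the same
 * buffer:
 *
 *     char buf[MAXPATHLEN];
 *     snprintf(buf, sizeof(buf), "%u", cnid);
 *     fsctl("/Volumes/MyVolume", HFS_GETPATH, buf, 0);
 */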
1626
1627 case HFS_TRANSFER_DOCUMENT_ID:
1628 {
1629 struct cnode *cp = NULL;
1630 int error;
1631 u_int32_t to_fd = *(u_int32_t *)ap->a_data;
1632 struct fileproc *to_fp;
1633 struct vnode *to_vp;
1634 struct cnode *to_cp;
1635
1636 cp = VTOC(vp);
1637
1638 if ((error = fp_getfvp(p, to_fd, &to_fp, &to_vp)) != 0) {
1639 //printf("could not get the vnode for fd %d (err %d)\n", to_fd, error);
1640 return error;
1641 }
1642 if ( (error = vnode_getwithref(to_vp)) ) {
1643 file_drop(to_fd);
1644 return error;
1645 }
1646
1647 if (VTOHFS(to_vp) != hfsmp) {
1648 error = EXDEV;
1649 goto transfer_cleanup;
1650 }
1651
1652 int need_unlock = 1;
1653 to_cp = VTOC(to_vp);
1654 error = hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1655 if (error != 0) {
1656 //printf("could not lock the pair of cnodes (error %d)\n", error);
1657 goto transfer_cleanup;
1658 }
1659
1660 if (!(cp->c_bsdflags & UF_TRACKED)) {
1661 error = EINVAL;
1662 } else if (to_cp->c_bsdflags & UF_TRACKED) {
1663 //
1664 // if the destination is already tracked, return an error
1665 // as otherwise it's a silent deletion of the target's
1666 // document-id
1667 //
1668 error = EEXIST;
1669 } else if (S_ISDIR(cp->c_attr.ca_mode) || S_ISREG(cp->c_attr.ca_mode) || S_ISLNK(cp->c_attr.ca_mode)) {
1670 //
1671 // we can use the FndrExtendedFileInfo because the doc-id is the first
1672 // thing in both it and the ExtendedDirInfo struct which is fixed in
1673 // format and can not change layout
1674 //
1675 struct FndrExtendedFileInfo *f_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)cp->c_finderinfo + 16);
1676 struct FndrExtendedFileInfo *to_extinfo = (struct FndrExtendedFileInfo *)((u_int8_t*)to_cp->c_finderinfo + 16);
1677
1678 if (f_extinfo->document_id == 0) {
1679 uint32_t new_id;
1680
1681 hfs_unlockpair(cp, to_cp); // have to unlock to be able to get a new-id
1682
1683 if ((error = hfs_generate_document_id(hfsmp, &new_id)) == 0) {
1684 //
1685 // re-lock the pair now that we have the document-id
1686 //
1687 hfs_lockpair(cp, to_cp, HFS_EXCLUSIVE_LOCK);
1688 f_extinfo->document_id = new_id;
1689 } else {
1690 goto transfer_cleanup;
1691 }
1692 }
1693
1694 to_extinfo->document_id = f_extinfo->document_id;
1695 f_extinfo->document_id = 0;
1696 //printf("TRANSFERRING: doc-id %d from ino %d to ino %d\n", to_extinfo->document_id, cp->c_fileid, to_cp->c_fileid);
1697
1698 // make sure the destination is also UF_TRACKED
1699 to_cp->c_bsdflags |= UF_TRACKED;
1700 cp->c_bsdflags &= ~UF_TRACKED;
1701
1702 // mark the cnodes dirty
1703 cp->c_flag |= C_MODIFIED;
1704 to_cp->c_flag |= C_MODIFIED;
1705
1706 int lockflags;
1707 if ((error = hfs_start_transaction(hfsmp)) == 0) {
1708
1709 lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
1710
1711 (void) cat_update(hfsmp, &cp->c_desc, &cp->c_attr, NULL, NULL);
1712 (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, NULL, NULL);
1713
1714 hfs_systemfile_unlock (hfsmp, lockflags);
1715 (void) hfs_end_transaction(hfsmp);
1716 }
1717
1718 #if CONFIG_FSE
1719 add_fsevent(FSE_DOCID_CHANGED, context,
1720 FSE_ARG_DEV, hfsmp->hfs_raw_dev,
1721 FSE_ARG_INO, (ino64_t)cp->c_fileid, // src inode #
1722 FSE_ARG_INO, (ino64_t)to_cp->c_fileid, // dst inode #
1723 FSE_ARG_INT32, to_extinfo->document_id,
1724 FSE_ARG_DONE);
1725
1726 hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents
1727 need_unlock = 0;
1728
1729 if (need_fsevent(FSE_STAT_CHANGED, vp)) {
1730 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, vp, FSE_ARG_DONE);
1731 }
1732 if (need_fsevent(FSE_STAT_CHANGED, to_vp)) {
1733 add_fsevent(FSE_STAT_CHANGED, context, FSE_ARG_VNODE, to_vp, FSE_ARG_DONE);
1734 }
1735 #else
1736 hfs_unlockpair(cp, to_cp); // unlock this so we can send the fsevents
1737 need_unlock = 0;
1738 #endif
1739 }
1740
1741 if (need_unlock) {
1742 hfs_unlockpair(cp, to_cp);
1743 }
1744
1745 transfer_cleanup:
1746 vnode_put(to_vp);
1747 file_drop(to_fd);
1748
1749 return error;
1750 }
1751
1752
1753
1754 case HFS_PREV_LINK:
1755 case HFS_NEXT_LINK:
1756 {
1757 cnid_t linkfileid;
1758 cnid_t nextlinkid;
1759 cnid_t prevlinkid;
1760 int error;
1761
1762 /* Caller must be owner of file system. */
1763 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1764 if (suser(cred, NULL) &&
1765 kauth_cred_getuid(cred) != vfsp->f_owner) {
1766 return (EACCES);
1767 }
1768 /* Target vnode must be file system's root. */
1769 if (!vnode_isvroot(vp)) {
1770 return (EINVAL);
1771 }
1772 linkfileid = *(cnid_t *)ap->a_data;
1773 if (linkfileid < kHFSFirstUserCatalogNodeID) {
1774 return (EINVAL);
1775 }
1776 if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) {
1777 return (error);
1778 }
1779 if (ap->a_command == HFS_NEXT_LINK) {
1780 *(cnid_t *)ap->a_data = nextlinkid;
1781 } else {
1782 *(cnid_t *)ap->a_data = prevlinkid;
1783 }
1784 return (0);
1785 }
1786
1787 case HFS_RESIZE_PROGRESS: {
1788
1789 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1790 if (suser(cred, NULL) &&
1791 kauth_cred_getuid(cred) != vfsp->f_owner) {
1792 return (EACCES); /* must be owner of file system */
1793 }
1794 if (!vnode_isvroot(vp)) {
1795 return (EINVAL);
1796 }
1797 /* file system must not be mounted read-only */
1798 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1799 return (EROFS);
1800 }
1801
1802 return hfs_resize_progress(hfsmp, (u_int32_t *)ap->a_data);
1803 }
1804
1805 case HFS_RESIZE_VOLUME: {
1806 u_int64_t newsize;
1807 u_int64_t cursize;
1808 int ret;
1809
1810 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1811 if (suser(cred, NULL) &&
1812 kauth_cred_getuid(cred) != vfsp->f_owner) {
1813 return (EACCES); /* must be owner of file system */
1814 }
1815 if (!vnode_isvroot(vp)) {
1816 return (EINVAL);
1817 }
1818
1819 /* filesystem must not be mounted read only */
1820 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1821 return (EROFS);
1822 }
1823 newsize = *(u_int64_t *)ap->a_data;
1824 cursize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
1825
1826 if (newsize == cursize) {
1827 return (0);
1828 }
1829 IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeWillResize);
1830 if (newsize > cursize) {
1831 ret = hfs_extendfs(hfsmp, *(u_int64_t *)ap->a_data, context);
1832 } else {
1833 ret = hfs_truncatefs(hfsmp, *(u_int64_t *)ap->a_data, context);
1834 }
1835 IOBSDMountChange(hfsmp->hfs_mp, kIOMountChangeDidResize);
1836 return (ret);
1837 }
1838 case HFS_CHANGE_NEXT_ALLOCATION: {
1839 int error = 0; /* Assume success */
1840 u_int32_t location;
1841
1842 if (vnode_vfsisrdonly(vp)) {
1843 return (EROFS);
1844 }
1845 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1846 if (suser(cred, NULL) &&
1847 kauth_cred_getuid(cred) != vfsp->f_owner) {
1848 return (EACCES); /* must be owner of file system */
1849 }
1850 if (!vnode_isvroot(vp)) {
1851 return (EINVAL);
1852 }
1853 hfs_lock_mount(hfsmp);
1854 location = *(u_int32_t *)ap->a_data;
1855 if ((location >= hfsmp->allocLimit) &&
1856 (location != HFS_NO_UPDATE_NEXT_ALLOCATION)) {
1857 error = EINVAL;
1858 goto fail_change_next_allocation;
1859 }
1860 /* Return previous value. */
1861 *(u_int32_t *)ap->a_data = hfsmp->nextAllocation;
1862 if (location == HFS_NO_UPDATE_NEXT_ALLOCATION) {
1863 /* On magic value for location, set nextAllocation to next block
1864 * after metadata zone and set flag in mount structure to indicate
1865 * that nextAllocation should not be updated again.
1866 */
1867 if (hfsmp->hfs_metazone_end != 0) {
1868 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_end + 1);
1869 }
1870 hfsmp->hfs_flags |= HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1871 } else {
1872 hfsmp->hfs_flags &= ~HFS_SKIP_UPDATE_NEXT_ALLOCATION;
1873 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, location);
1874 }
1875 MarkVCBDirty(hfsmp);
1876 fail_change_next_allocation:
1877 hfs_unlock_mount(hfsmp);
1878 return (error);
1879 }
1880
1881 #if HFS_SPARSE_DEV
1882 case HFS_SETBACKINGSTOREINFO: {
1883 struct vnode * bsfs_rootvp;
1884 struct vnode * di_vp;
1885 struct hfs_backingstoreinfo *bsdata;
1886 int error = 0;
1887
1888 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1889 return (EROFS);
1890 }
1891 if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
1892 return (EALREADY);
1893 }
1894 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1895 if (suser(cred, NULL) &&
1896 kauth_cred_getuid(cred) != vfsp->f_owner) {
1897 return (EACCES); /* must be owner of file system */
1898 }
1899 bsdata = (struct hfs_backingstoreinfo *)ap->a_data;
1900 if (bsdata == NULL) {
1901 return (EINVAL);
1902 }
1903 if ((error = file_vnode(bsdata->backingfd, &di_vp))) {
1904 return (error);
1905 }
1906 if ((error = vnode_getwithref(di_vp))) {
1907 file_drop(bsdata->backingfd);
1908 return(error);
1909 }
1910
1911 if (vnode_mount(vp) == vnode_mount(di_vp)) {
1912 (void)vnode_put(di_vp);
1913 file_drop(bsdata->backingfd);
1914 return (EINVAL);
1915 }
1916
1917 /*
1918 * Obtain the backing fs root vnode and keep a reference
1919 * on it. This reference will be dropped in hfs_unmount.
1920 */
1921 error = VFS_ROOT(vnode_mount(di_vp), &bsfs_rootvp, NULL); /* XXX use context! */
1922 if (error) {
1923 (void)vnode_put(di_vp);
1924 file_drop(bsdata->backingfd);
1925 return (error);
1926 }
1927 vnode_ref(bsfs_rootvp);
1928 vnode_put(bsfs_rootvp);
1929
1930 hfs_lock_mount(hfsmp);
1931 hfsmp->hfs_backingfs_rootvp = bsfs_rootvp;
1932 hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
1933 hfsmp->hfs_sparsebandblks = bsdata->bandsize / hfsmp->blockSize * 4;
1934 hfs_unlock_mount(hfsmp);
1935
1936 /* We check the MNTK_VIRTUALDEV bit instead of marking the dependent process */
1937
1938 /*
1939 * If the sparse image is on a sparse image file (as opposed to a sparse
1940 * bundle), then we may need to limit the free space to the maximum size
1941 * of a file on that volume. So we query (using pathconf), and if we get
1942 * a meaningful result, we cache the number of blocks for later use in
1943 * hfs_freeblks().
1944 */
1945 hfsmp->hfs_backingfs_maxblocks = 0;
1946 if (vnode_vtype(di_vp) == VREG) {
1947 int terr;
1948 int hostbits;
1949 terr = vn_pathconf(di_vp, _PC_FILESIZEBITS, &hostbits, context);
1950 if (terr == 0 && hostbits != 0 && hostbits < 64) {
1951 u_int64_t hostfilesizemax = ((u_int64_t)1) << hostbits;
1952
1953 hfsmp->hfs_backingfs_maxblocks = hostfilesizemax / hfsmp->blockSize;
1954 }
1955 }
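		/*
		 * Worked example (illustrative): if the backing file system reports
		 * _PC_FILESIZEBITS == 31, then hostfilesizemax = 1 << 31 = 2 GiB; with a
		 * 4 KiB allocation block size this caches hfs_backingfs_maxblocks =
		 * 2 GiB / 4 KiB = 524288 blocks for later use in hfs_freeblks().
		 */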
1956
1957 /* The free extent cache is managed differently for sparse devices.
1958 * There is a window between when the volume is mounted and when the
1959 * device is marked as sparse, so the free extent cache for this
1960 * volume is currently initialized as a normal volume (sorted by block
1961 * count). Reset the cache so that it will be rebuilt again
1962 * for a sparse device (sorted by start block).
1963 */
1964 ResetVCBFreeExtCache(hfsmp);
1965
1966 (void)vnode_put(di_vp);
1967 file_drop(bsdata->backingfd);
1968 return (0);
1969 }
1970 case HFS_CLRBACKINGSTOREINFO: {
1971 struct vnode * tmpvp;
1972
1973 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
1974 if (suser(cred, NULL) &&
1975 kauth_cred_getuid(cred) != vfsp->f_owner) {
1976 return (EACCES); /* must be owner of file system */
1977 }
1978 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
1979 return (EROFS);
1980 }
1981
1982 if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) &&
1983 hfsmp->hfs_backingfs_rootvp) {
1984
1985 hfs_lock_mount(hfsmp);
1986 hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
1987 tmpvp = hfsmp->hfs_backingfs_rootvp;
1988 hfsmp->hfs_backingfs_rootvp = NULLVP;
1989 hfsmp->hfs_sparsebandblks = 0;
1990 hfs_unlock_mount(hfsmp);
1991
1992 vnode_rele(tmpvp);
1993 }
1994 return (0);
1995 }
1996 #endif /* HFS_SPARSE_DEV */
1997
1998 /* Change the next CNID stored in the VH */
1999 case HFS_CHANGE_NEXTCNID: {
2000 int error = 0; /* Assume success */
2001 u_int32_t fileid;
2002 int wraparound = 0;
2003 int lockflags = 0;
2004
2005 if (vnode_vfsisrdonly(vp)) {
2006 return (EROFS);
2007 }
2008 vfsp = vfs_statfs(HFSTOVFS(hfsmp));
2009 if (suser(cred, NULL) &&
2010 kauth_cred_getuid(cred) != vfsp->f_owner) {
2011 return (EACCES); /* must be owner of file system */
2012 }
2013
2014 fileid = *(u_int32_t *)ap->a_data;
2015
2016 /* Must have catalog lock excl. to advance the CNID pointer */
2017 lockflags = hfs_systemfile_lock (hfsmp, SFL_CATALOG , HFS_EXCLUSIVE_LOCK);
2018
2019 hfs_lock_mount(hfsmp);
2020
2021 /* If it is less than the current next CNID, force the wraparound bit to be set */
2022 if (fileid < hfsmp->vcbNxtCNID) {
2023 wraparound=1;
2024 }
2025
2026 /* Return previous value. */
2027 *(u_int32_t *)ap->a_data = hfsmp->vcbNxtCNID;
2028
2029 hfsmp->vcbNxtCNID = fileid;
2030
2031 if (wraparound) {
2032 hfsmp->vcbAtrb |= kHFSCatalogNodeIDsReusedMask;
2033 }
2034
2035 MarkVCBDirty(hfsmp);
2036 hfs_unlock_mount(hfsmp);
2037 hfs_systemfile_unlock (hfsmp, lockflags);
2038
2039 return (error);
2040 }
2041
2042 case F_FREEZE_FS: {
2043 struct mount *mp;
2044
2045 mp = vnode_mount(vp);
2046 hfsmp = VFSTOHFS(mp);
2047
2048 if (!(hfsmp->jnl))
2049 return (ENOTSUP);
2050
2051 vfsp = vfs_statfs(mp);
2052
2053 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2054 !kauth_cred_issuser(cred))
2055 return (EACCES);
2056
2057 return hfs_freeze(hfsmp);
2058 }
2059
2060 case F_THAW_FS: {
2061 vfsp = vfs_statfs(vnode_mount(vp));
2062 if (kauth_cred_getuid(cred) != vfsp->f_owner &&
2063 !kauth_cred_issuser(cred))
2064 return (EACCES);
2065
2066 return hfs_thaw(hfsmp, current_proc());
2067 }
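	/*
	 * Illustrative sketch (not part of the original source): the freeze/thaw
	 * pair above is driven from user space with fcntl(2) by the volume owner
	 * or the superuser, typically bracketing a block-level snapshot. The
	 * descriptor and error handling are assumptions for illustration only.
	 *
	 *	#include <fcntl.h>
	 *
	 *	if (fcntl(fd, F_FREEZE_FS, 0) == 0) {	// block new writes, flush the journal
	 *		// ... take the snapshot ...
	 *		(void) fcntl(fd, F_THAW_FS, 0);	// allow writes again
	 *	}
	 */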
2068
2069 case HFS_EXT_BULKACCESS_FSCTL: {
2070 int size;
2071
2072 if (hfsmp->hfs_flags & HFS_STANDARD) {
2073 return EINVAL;
2074 }
2075
2076 if (is64bit) {
2077 size = sizeof(struct user64_ext_access_t);
2078 } else {
2079 size = sizeof(struct user32_ext_access_t);
2080 }
2081
2082 return do_bulk_access_check(hfsmp, vp, ap, size, context);
2083 }
2084
2085 case HFS_SET_XATTREXTENTS_STATE: {
2086 int state;
2087
2088 if (ap->a_data == NULL) {
2089 return (EINVAL);
2090 }
2091
2092 state = *(int *)ap->a_data;
2093
2094 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2095 return (EROFS);
2096 }
2097
2098 /* Super-user can enable or disable extent-based extended
2099 * attribute support on a volume.
2100 * Note: Starting with Mac OS X 10.7, extent-based extended attributes
2101 * are enabled by default, so any change will be transient only
2102 * until the volume is remounted.
2103 */
2104 if (!kauth_cred_issuser(kauth_cred_get())) {
2105 return (EPERM);
2106 }
2107 if (state == 0 || state == 1)
2108 return hfs_set_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE, state);
2109 else
2110 return (EINVAL);
2111 }
2112
2113 case F_SETSTATICCONTENT: {
2114 int error;
2115 int enable_static = 0;
2116 struct cnode *cp = NULL;
2117 /*
2118 * lock the cnode, decorate the cnode flag, and bail out.
2119 * VFS should have already authenticated the caller for us.
2120 */
2121
2122 if (ap->a_data) {
2123 /*
2124 * Note that even though ap->a_data is of type caddr_t,
2125 * the fcntl layer at the syscall handler will pass in NULL
2126 * or 1 depending on what the argument supplied to the fcntl
2127 * was. So it is in fact correct to check the ap->a_data
2128 * argument for zero or non-zero value when deciding whether or not
2129 * to enable the static bit in the cnode.
2130 */
2131 enable_static = 1;
2132 }
2133 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2134 return EROFS;
2135 }
2136 cp = VTOC(vp);
2137
2138 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2139 if (error == 0) {
2140 if (enable_static) {
2141 cp->c_flag |= C_SSD_STATIC;
2142 }
2143 else {
2144 cp->c_flag &= ~C_SSD_STATIC;
2145 }
2146 hfs_unlock (cp);
2147 }
2148 return error;
2149 }
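	/*
	 * Illustrative sketch (not part of the original source): user space can
	 * toggle the static-content hint handled above with fcntl(2) on an open
	 * file; a nonzero third argument sets C_SSD_STATIC and zero clears it
	 * (F_SETSTATICCONTENT is a private command). The descriptor and error
	 * handling are assumptions for illustration only.
	 *
	 *	#include <fcntl.h>
	 *
	 *	if (fcntl(fd, F_SETSTATICCONTENT, 1) == -1)	// mark the file's data as static
	 *		perror("F_SETSTATICCONTENT");
	 */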
2150
2151 case F_SET_GREEDY_MODE: {
2152 int error;
2153 int enable_greedy_mode = 0;
2154 struct cnode *cp = NULL;
2155 /*
2156 * lock the cnode, decorate the cnode flag, and bail out.
2157 * VFS should have already authenticated the caller for us.
2158 */
2159
2160 if (ap->a_data) {
2161 /*
2162 * Note that even though ap->a_data is of type caddr_t,
2163 * the fcntl layer at the syscall handler will pass in NULL
2164 * or 1 depending on what the argument supplied to the fcntl
2165 * was. So it is in fact correct to check the ap->a_data
2166 * argument for zero or non-zero value when deciding whether or not
2167 * to enable the greedy mode bit in the cnode.
2168 */
2169 enable_greedy_mode = 1;
2170 }
2171 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2172 return EROFS;
2173 }
2174 cp = VTOC(vp);
2175
2176 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2177 if (error == 0) {
2178 if (enable_greedy_mode) {
2179 cp->c_flag |= C_SSD_GREEDY_MODE;
2180 }
2181 else {
2182 cp->c_flag &= ~C_SSD_GREEDY_MODE;
2183 }
2184 hfs_unlock (cp);
2185 }
2186 return error;
2187 }
2188
2189 case F_SETIOTYPE: {
2190 int error;
2191 uint32_t iotypeflag = 0;
2192
2193 struct cnode *cp = NULL;
2194 /*
2195 * lock the cnode, decorate the cnode flag, and bail out.
2196 * VFS should have already authenticated the caller for us.
2197 */
2198
2199 if (ap->a_data == NULL) {
2200 return EINVAL;
2201 }
2202
2203 /*
2204 * Note that even though ap->a_data is of type caddr_t, we
2205 * can only use 32 bits of flag values.
2206 */
2207 iotypeflag = (uint32_t) ap->a_data;
2208 switch (iotypeflag) {
2209 case F_IOTYPE_ISOCHRONOUS:
2210 break;
2211 default:
2212 return EINVAL;
2213 }
2214
2215
2216 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2217 return EROFS;
2218 }
2219 cp = VTOC(vp);
2220
2221 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2222 if (error == 0) {
2223 switch (iotypeflag) {
2224 case F_IOTYPE_ISOCHRONOUS:
2225 cp->c_flag |= C_IO_ISOCHRONOUS;
2226 break;
2227 default:
2228 break;
2229 }
2230 hfs_unlock (cp);
2231 }
2232 return error;
2233 }
2234
2235 case F_MAKECOMPRESSED: {
2236 int error = 0;
2237 uint32_t gen_counter;
2238 struct cnode *cp = NULL;
2239 int reset_decmp = 0;
2240
2241 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2242 return EROFS;
2243 }
2244
2245 /*
2246 * acquire & lock the cnode.
2247 * VFS should have already authenticated the caller for us.
2248 */
2249
2250 if (ap->a_data) {
2251 /*
2252 * Cast the pointer into a uint32_t so we can extract the
2253 * supplied generation counter.
2254 */
2255 gen_counter = *((uint32_t*)ap->a_data);
2256 }
2257 else {
2258 return EINVAL;
2259 }
2260
2261 #if HFS_COMPRESSION
2262 cp = VTOC(vp);
2263 /* Grab truncate lock first; we may truncate the file */
2264 hfs_lock_truncate (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2265
2266 error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2267 if (error) {
2268 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2269 return error;
2270 }
2271
2272 /* Are there any other usecounts/FDs? */
2273 if (vnode_isinuse(vp, 1)) {
2274 hfs_unlock(cp);
2275 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
2276 return EBUSY;
2277 }
2278
2279 /* now we have the cnode locked down; Validate arguments */
2280 if (cp->c_attr.ca_flags & (UF_IMMUTABLE | UF_COMPRESSED)) {
2281 /* EINVAL if you are trying to manipulate an IMMUTABLE file */
2282 hfs_unlock(cp);
2283 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2284 return EINVAL;
2285 }
2286
2287 if ((hfs_get_gencount (cp)) == gen_counter) {
2288 /*
2289 * OK, the gen_counter matched. Go for it:
2290 * Toggle state bits, truncate file, and suppress mtime update
2291 */
2292 reset_decmp = 1;
2293 cp->c_bsdflags |= UF_COMPRESSED;
2294
2295 error = hfs_truncate(vp, 0, IO_NDELAY, HFS_TRUNCATE_SKIPTIMES,
2296 ap->a_context);
2297 }
2298 else {
2299 error = ESTALE;
2300 }
2301
2302 /* Unlock cnode before calling into decmpfs; it may need to get an EA */
2303 hfs_unlock(cp);
2304
2305 /*
2306 * Reset the decmp state while still holding the truncate lock. We need to
2307 * serialize here against a listxattr on this node which may occur at any
2308 * time.
2309 *
2310 * Even if '0/skiplock' is passed in 2nd argument to hfs_file_is_compressed,
2311 * that will still potentially require getting the com.apple.decmpfs EA. If the
2312 * EA is required, then we can't hold the cnode lock, because the getxattr call is
2313 * generic(through VFS), and can't pass along any info telling it that we're already
2314 * holding it (the lock). If we don't serialize, then we risk listxattr stopping
2315 * and trying to fill in the hfs_file_is_compressed info during the callback
2316 * operation, which will result in deadlock against the b-tree node.
2317 *
2318 * So, to serialize against listxattr (which will grab buf_t meta references on
2319 * the b-tree blocks), we hold the truncate lock as we're manipulating the
2320 * decmpfs payload.
2321 */
2322 if ((reset_decmp) && (error == 0)) {
2323 decmpfs_cnode *dp = VTOCMP (vp);
2324 if (dp != NULL) {
2325 decmpfs_cnode_set_vnode_state(dp, FILE_TYPE_UNKNOWN, 0);
2326 }
2327
2328 /* Initialize the decmpfs node as needed */
2329 (void) hfs_file_is_compressed (cp, 0); /* ok to take lock */
2330 }
2331
2332 hfs_unlock_truncate (cp, HFS_LOCK_DEFAULT);
2333
2334 #endif
2335 return error;
2336 }
2337
2338 case F_SETBACKINGSTORE: {
2339
2340 int error = 0;
2341
2342 /*
2343 * See comment in F_SETSTATICCONTENT re: using
2344 * a null check for a_data
2345 */
2346 if (ap->a_data) {
2347 error = hfs_set_backingstore (vp, 1);
2348 }
2349 else {
2350 error = hfs_set_backingstore (vp, 0);
2351 }
2352
2353 return error;
2354 }
2355
2356 case F_GETPATH_MTMINFO: {
2357 int error = 0;
2358
2359 int *data = (int*) ap->a_data;
2360
2361 /* Ask if this is a backingstore vnode */
2362 error = hfs_is_backingstore (vp, data);
2363
2364 return error;
2365 }
2366
2367 case F_FULLFSYNC: {
2368 int error;
2369
2370 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2371 return (EROFS);
2372 }
2373 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2374 if (error == 0) {
2375 error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_FULL, p);
2376 hfs_unlock(VTOC(vp));
2377 }
2378
2379 return error;
2380 }
2381
2382 case F_BARRIERFSYNC: {
2383 int error;
2384
2385 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2386 return (EROFS);
2387 }
2388 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2389 if (error == 0) {
2390 error = hfs_fsync(vp, MNT_WAIT, HFS_FSYNC_BARRIER, p);
2391 hfs_unlock(VTOC(vp));
2392 }
2393
2394 return error;
2395 }
2396
2397 case F_CHKCLEAN: {
2398 register struct cnode *cp;
2399 int error;
2400
2401 if (!vnode_isreg(vp))
2402 return EINVAL;
2403
2404 error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2405 if (error == 0) {
2406 cp = VTOC(vp);
2407 /*
2408 * used by regression tests to determine if
2409 * all the dirty pages (via write) have been cleaned
2410 * after a call to 'fsync'.
2411 */
2412 error = is_file_clean(vp, VTOF(vp)->ff_size);
2413 hfs_unlock(cp);
2414 }
2415 return (error);
2416 }
2417
2418 case F_RDADVISE: {
2419 register struct radvisory *ra;
2420 struct filefork *fp;
2421 int error;
2422
2423 if (!vnode_isreg(vp))
2424 return EINVAL;
2425
2426 ra = (struct radvisory *)(ap->a_data);
2427 fp = VTOF(vp);
2428
2429 /* Protect against a size change. */
2430 hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2431
2432 #if HFS_COMPRESSION
2433 if (compressed && (uncompressed_size == -1)) {
2434 /* fetching the uncompressed size failed above, so return the error */
2435 error = decmpfs_error;
2436 } else if ((compressed && (ra->ra_offset >= uncompressed_size)) ||
2437 (!compressed && (ra->ra_offset >= fp->ff_size))) {
2438 error = EFBIG;
2439 }
2440 #else /* HFS_COMPRESSION */
2441 if (ra->ra_offset >= fp->ff_size) {
2442 error = EFBIG;
2443 }
2444 #endif /* HFS_COMPRESSION */
2445 else {
2446 error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count);
2447 }
2448
2449 hfs_unlock_truncate(VTOC(vp), HFS_LOCK_DEFAULT);
2450 return (error);
2451 }
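	/*
	 * Illustrative sketch (not part of the original source): F_RDADVISE is a
	 * public fcntl(2) command; a reader expecting to need a region of the file
	 * soon can ask for it to be read ahead, which lands in the case above.
	 * The chosen offset and count are assumptions for illustration only.
	 *
	 *	#include <fcntl.h>
	 *
	 *	struct radvisory ra = { .ra_offset = 0, .ra_count = 1024 * 1024 };
	 *	if (fcntl(fd, F_RDADVISE, &ra) == -1)	// hint: read the first 1 MiB ahead
	 *		perror("F_RDADVISE");
	 */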
2452
2453 case _IOC(IOC_OUT,'h', 4, 0): /* Create date in local time */
2454 {
2455 if (is64bit) {
2456 *(user_time_t *)(ap->a_data) = (user_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2457 }
2458 else {
2459 *(user32_time_t *)(ap->a_data) = (user32_time_t) (to_bsd_time(VTOVCB(vp)->localCreateDate));
2460 }
2461 return 0;
2462 }
2463
2464 case SPOTLIGHT_FSCTL_GET_MOUNT_TIME:
2465 *(uint32_t *)ap->a_data = hfsmp->hfs_mount_time;
2466 break;
2467
2468 case SPOTLIGHT_FSCTL_GET_LAST_MTIME:
2469 *(uint32_t *)ap->a_data = hfsmp->hfs_last_mounted_mtime;
2470 break;
2471
2472 case HFS_FSCTL_GET_VERY_LOW_DISK:
2473 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_dangerlimit;
2474 break;
2475
2476 case HFS_FSCTL_SET_VERY_LOW_DISK:
2477 if (*(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_warninglimit) {
2478 return EINVAL;
2479 }
2480
2481 hfsmp->hfs_freespace_notify_dangerlimit = *(uint32_t *)ap->a_data;
2482 break;
2483
2484 case HFS_FSCTL_GET_LOW_DISK:
2485 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_warninglimit;
2486 break;
2487
2488 case HFS_FSCTL_SET_LOW_DISK:
2489 if ( *(uint32_t *)ap->a_data >= hfsmp->hfs_freespace_notify_desiredlevel
2490 || *(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_dangerlimit) {
2491
2492 return EINVAL;
2493 }
2494
2495 hfsmp->hfs_freespace_notify_warninglimit = *(uint32_t *)ap->a_data;
2496 break;
2497
2498 case HFS_FSCTL_GET_DESIRED_DISK:
2499 *(uint32_t*)ap->a_data = hfsmp->hfs_freespace_notify_desiredlevel;
2500 break;
2501
2502 case HFS_FSCTL_SET_DESIRED_DISK:
2503 if (*(uint32_t *)ap->a_data <= hfsmp->hfs_freespace_notify_warninglimit) {
2504 return EINVAL;
2505 }
2506
2507 hfsmp->hfs_freespace_notify_desiredlevel = *(uint32_t *)ap->a_data;
2508 break;
2509
2510 case HFS_VOLUME_STATUS:
2511 *(uint32_t *)ap->a_data = hfsmp->hfs_notification_conditions;
2512 break;
2513
2514 case HFS_SET_BOOT_INFO:
2515 if (!vnode_isvroot(vp))
2516 return(EINVAL);
2517 if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(HFSTOVFS(hfsmp))->f_owner))
2518 return(EACCES); /* must be superuser or owner of filesystem */
2519 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2520 return (EROFS);
2521 }
2522 hfs_lock_mount (hfsmp);
2523 bcopy(ap->a_data, &hfsmp->vcbFndrInfo, sizeof(hfsmp->vcbFndrInfo));
2524 hfs_unlock_mount (hfsmp);
2525 (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT);
2526 break;
2527
2528 case HFS_GET_BOOT_INFO:
2529 if (!vnode_isvroot(vp))
2530 return(EINVAL);
2531 hfs_lock_mount (hfsmp);
2532 bcopy(&hfsmp->vcbFndrInfo, ap->a_data, sizeof(hfsmp->vcbFndrInfo));
2533 hfs_unlock_mount(hfsmp);
2534 break;
2535
2536 case HFS_MARK_BOOT_CORRUPT:
2537 /* Mark the boot volume corrupt by setting
2538 * kHFSVolumeInconsistentBit in the volume header. This will
2539 * force fsck_hfs on next mount.
2540 */
2541 if (!kauth_cred_issuser(kauth_cred_get())) {
2542 return EACCES;
2543 }
2544
2545 /* Allowed only on the root vnode of the boot volume */
2546 if (!(vfs_flags(HFSTOVFS(hfsmp)) & MNT_ROOTFS) ||
2547 !vnode_isvroot(vp)) {
2548 return EINVAL;
2549 }
2550 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2551 return (EROFS);
2552 }
2553 printf ("hfs_vnop_ioctl: Marking the boot volume corrupt.\n");
2554 hfs_mark_inconsistent(hfsmp, HFS_FSCK_FORCED);
2555 break;
2556
2557 case HFS_FSCTL_GET_JOURNAL_INFO:
2558 jip = (struct hfs_journal_info*)ap->a_data;
2559
2560 if (vp == NULLVP)
2561 return EINVAL;
2562
2563 if (hfsmp->jnl == NULL) {
2564 jnl_start = 0;
2565 jnl_size = 0;
2566 } else {
2567 jnl_start = hfs_blk_to_bytes(hfsmp->jnl_start, hfsmp->blockSize) + hfsmp->hfsPlusIOPosOffset;
2568 jnl_size = hfsmp->jnl_size;
2569 }
2570
2571 jip->jstart = jnl_start;
2572 jip->jsize = jnl_size;
2573 break;
2574
2575 case HFS_SET_ALWAYS_ZEROFILL: {
2576 struct cnode *cp = VTOC(vp);
2577
2578 if (*(int *)ap->a_data) {
2579 cp->c_flag |= C_ALWAYS_ZEROFILL;
2580 } else {
2581 cp->c_flag &= ~C_ALWAYS_ZEROFILL;
2582 }
2583 break;
2584 }
2585
2586 case HFS_DISABLE_METAZONE: {
2587 /* Only root can disable metadata zone */
2588 if (!kauth_cred_issuser(kauth_cred_get())) {
2589 return EACCES;
2590 }
2591 if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2592 return (EROFS);
2593 }
2594
2595 /* Disable metadata zone now */
2596 (void) hfs_metadatazone_init(hfsmp, true);
2597 printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN);
2598 break;
2599 }
2600
2601
2602 case HFS_FSINFO_METADATA_BLOCKS: {
2603 int error;
2604 struct hfsinfo_metadata *hinfo;
2605
2606 hinfo = (struct hfsinfo_metadata *)ap->a_data;
2607
2608 /* Get information about number of metadata blocks */
2609 error = hfs_getinfo_metadata_blocks(hfsmp, hinfo);
2610 if (error) {
2611 return error;
2612 }
2613
2614 break;
2615 }
2616
2617 case HFS_GET_FSINFO: {
2618 hfs_fsinfo *fsinfo = (hfs_fsinfo *)ap->a_data;
2619
2620 /* Only root is allowed to get fsinfo */
2621 if (!kauth_cred_issuser(kauth_cred_get())) {
2622 return EACCES;
2623 }
2624
2625 /*
2626 * Make sure that the caller's version number matches
2627 * the kernel's version number. This will make sure that
2628 * if the structures being read/written into are changed
2629 * by the kernel, the caller will not read incorrect data.
2630 *
2631 * The first three fields --- request_type, version and
2632 * flags are the same for all the hfs_fsinfo structures, so
2633 * we can access the version number by assuming any
2634 * structure for now.
2635 */
2636 if (fsinfo->header.version != HFS_FSINFO_VERSION) {
2637 return ENOTSUP;
2638 }
2639
2640 /* Make sure that the current file system is not marked inconsistent */
2641 if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
2642 return EIO;
2643 }
2644
2645 return hfs_get_fsinfo(hfsmp, ap->a_data);
2646 }
2647
2648 case HFS_CS_FREESPACE_TRIM: {
2649 int error = 0;
2650 int lockflags = 0;
2651
2652 /* Only root allowed */
2653 if (!kauth_cred_issuser(kauth_cred_get())) {
2654 return EACCES;
2655 }
2656
2657 /*
2658 * This core functionality is similar to hfs_scan_blocks().
2659 * The main difference is that hfs_scan_blocks() is called
2660 * as part of mount where we are assured that the journal is
2661 * empty to start with. This fcntl() can be called on a
2662 * mounted volume, therefore it has to flush the content of
2663 * the journal as well as ensure the state of the summary table.
2664 *
2665 * This fcntl scans over the entire allocation bitmap,
2666 * creates list of all the free blocks, and issues TRIM
2667 * down to the underlying device. This can take a long time
2668 * as it can generate up to 512MB of read I/O.
2669 */
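		/*
		 * Illustrative sketch (not part of the original source): a maintenance
		 * tool could trigger this scan with fsctl(2) on the mounted volume.
		 * This command ignores its input data, so a scratch word is passed
		 * only to satisfy the interface; the path and headers are assumptions.
		 *
		 *	#include <sys/fsctl.h>
		 *	#include <hfs/hfs_fsctl.h>
		 *
		 *	uint32_t unused = 0;
		 *	if (fsctl("/Volumes/MyVol", HFS_CS_FREESPACE_TRIM, &unused, 0) != 0)
		 *		perror("HFS_CS_FREESPACE_TRIM");
		 */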
2670
2671 if ((hfsmp->hfs_flags & HFS_SUMMARY_TABLE) == 0) {
2672 error = hfs_init_summary(hfsmp);
2673 if (error) {
2674 printf("hfs: fsctl() could not initialize summary table for %s\n", hfsmp->vcbVN);
2675 return error;
2676 }
2677 }
2678
2679 /*
2680 * The journal maintains a list of recently deallocated blocks to
2681 * issue DKIOCUNMAPs when the corresponding journal transaction is
2682 * flushed to the disk. To avoid any race conditions, we only
2683 * want one active trim list and only one thread issuing DKIOCUNMAPs.
2684 * Therefore we make sure that the journal trim list is sync'ed,
2685 * empty, and not modifiable for the duration of our scan.
2686 *
2687 * Take the journal lock before flushing the journal to the disk.
2688 * We will keep holding the journal lock until we get the
2689 * bitmap lock, to make sure that no new journal transactions can
2690 * start. This will make sure that the journal trim list is not
2691 * modified after the journal flush and before getting bitmap lock.
2692 * We can release the journal lock after we acquire the bitmap
2693 * lock as it will prevent any further block deallocations.
2694 */
2695 hfs_journal_lock(hfsmp);
2696
2697 /* Flush the journal and wait for all I/Os to finish up */
2698 error = hfs_flush(hfsmp, HFS_FLUSH_JOURNAL_META);
2699 if (error) {
2700 hfs_journal_unlock(hfsmp);
2701 return error;
2702 }
2703
2704 /* Take bitmap lock to ensure it is not being modified */
2705 lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
2706
2707 /* Release the journal lock */
2708 hfs_journal_unlock(hfsmp);
2709
2710 /*
2711 * ScanUnmapBlocks reads the bitmap in a large block size
2712 * (up to 1MB), unlike the runtime, which reads the bitmap
2713 * in 4K blocks. This can cause buf_t collisions
2714 * and potential data corruption. To avoid this, we
2715 * invalidate all the existing buffers associated with
2716 * the bitmap vnode before scanning it.
2717 *
2718 * Note: ScanUnmapBlocks() cleans up all the buffers
2719 * after itself, so there won't be any large buffers left
2720 * for us to clean up after it returns.
2721 */
2722 error = buf_invalidateblks(hfsmp->hfs_allocation_vp, 0, 0, 0);
2723 if (error) {
2724 hfs_systemfile_unlock(hfsmp, lockflags);
2725 return error;
2726 }
2727
2728 /* Traverse bitmap and issue DKIOCUNMAPs */
2729 error = ScanUnmapBlocks(hfsmp);
2730 hfs_systemfile_unlock(hfsmp, lockflags);
2731 if (error) {
2732 return error;
2733 }
2734
2735 break;
2736 }
2737
2738 case HFS_SET_HOTFILE_STATE: {
2739 int error;
2740 struct cnode *cp = VTOC(vp);
2741 uint32_t hf_state = *((uint32_t*)ap->a_data);
2742 uint32_t num_unpinned = 0;
2743
2744 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
2745 if (error) {
2746 return error;
2747 }
2748
2749 // printf("hfs: setting hotfile state %d on %s\n", hf_state, vp->v_name);
2750 if (hf_state == HFS_MARK_FASTDEVCANDIDATE) {
2751 vnode_setfastdevicecandidate(vp);
2752
2753 cp->c_attr.ca_recflags |= kHFSFastDevCandidateMask;
2754 cp->c_attr.ca_recflags &= ~kHFSDoNotFastDevPinMask;
2755 cp->c_flag |= C_MODIFIED;
2756 } else if (hf_state == HFS_UNMARK_FASTDEVCANDIDATE || hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
2757 vnode_clearfastdevicecandidate(vp);
2758 hfs_removehotfile(vp);
2759
2760 if (cp->c_attr.ca_recflags & kHFSFastDevPinnedMask) {
2761 hfs_pin_vnode(hfsmp, vp, HFS_UNPIN_IT, &num_unpinned, ap->a_context);
2762 }
2763
2764 if (hf_state == HFS_NEVER_FASTDEVCANDIDATE) {
2765 cp->c_attr.ca_recflags |= kHFSDoNotFastDevPinMask;
2766 }
2767 cp->c_attr.ca_recflags &= ~(kHFSFastDevCandidateMask|kHFSFastDevPinnedMask);
2768 cp->c_flag |= C_MODIFIED;
2769
2770 } else {
2771 error = EINVAL;
2772 }
2773
2774 if (num_unpinned != 0) {
2775 lck_mtx_lock(&hfsmp->hfc_mutex);
2776 hfsmp->hfs_hotfile_freeblks += num_unpinned;
2777 lck_mtx_unlock(&hfsmp->hfc_mutex);
2778 }
2779
2780 hfs_unlock(cp);
2781 return error;
2782 break;
2783 }
2784
2785 case HFS_REPIN_HOTFILE_STATE: {
2786 int error=0;
2787 uint32_t repin_what = *((uint32_t*)ap->a_data);
2788
2789 /* Only root allowed */
2790 if (!kauth_cred_issuser(kauth_cred_get())) {
2791 return EACCES;
2792 }
2793
2794 if (!(hfsmp->hfs_flags & (HFS_CS_METADATA_PIN | HFS_CS_HOTFILE_PIN))) {
2795 // this system is neither regular Fusion nor Cooperative Fusion
2796 // so this fsctl makes no sense.
2797 return EINVAL;
2798 }
2799
2800 //
2801 // After converting a CoreStorage volume to be encrypted, the
2802 // extents could have moved around underneath us. This call
2803 // allows corestoraged to re-pin everything that should be
2804 // pinned (it would happen on the next reboot too but that could
2805 // be a long time away).
2806 //
2807 if ((repin_what & HFS_REPIN_METADATA) && (hfsmp->hfs_flags & HFS_CS_METADATA_PIN)) {
2808 hfs_pin_fs_metadata(hfsmp);
2809 }
2810 if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_HOTFILE_PIN)) {
2811 hfs_repin_hotfiles(hfsmp);
2812 }
2813 if ((repin_what & HFS_REPIN_USERDATA) && (hfsmp->hfs_flags & HFS_CS_SWAPFILE_PIN)) {
2814 //XXX Swapfiles (marked SWAP_PINNED) may have moved too.
2815 //XXX Do we care? They have a more transient/dynamic nature/lifetime.
2816 }
2817
2818 return error;
2819 break;
2820 }
2821
2822
2823 default:
2824 return (ENOTTY);
2825 }
2826
2827 return 0;
2828 }
2829
2830 /*
2831 * select
2832 */
2833 int
2834 hfs_vnop_select(__unused struct vnop_select_args *ap)
2835 /*
2836 struct vnop_select_args {
2837 vnode_t a_vp;
2838 int a_which;
2839 int a_fflags;
2840 void *a_wql;
2841 vfs_context_t a_context;
2842 };
2843 */
2844 {
2845 /*
2846 * We should really check to see if I/O is possible.
2847 */
2848 return (1);
2849 }
2850
2851 /*
2852 * Converts a logical block number to a physical block, and optionally returns
2853 * the number of remaining blocks in a run. The logical block is based on hfsNode.logBlockSize.
2854 * The physical block number is based on the device block size, currently it's 512.
2855 * The block run is returned in logical blocks, and is the REMAINING number of blocks in the run.
2856 */
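/*
 * Worked example (illustrative): with a logical block size of 4096 bytes,
 * logical block 10 maps to byte offset 40960. If MapFileBlockC reports
 * 98304 contiguous bytes available at that position, the run returned in
 * *runp is (98304 / 4096) - 1 = 23 logical blocks beyond the one mapped.
 */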
2857 int
2858 hfs_bmap(struct vnode *vp, daddr_t bn, struct vnode **vpp, daddr64_t *bnp, unsigned int *runp)
2859 {
2860 struct filefork *fp = VTOF(vp);
2861 struct hfsmount *hfsmp = VTOHFS(vp);
2862 int retval = E_NONE;
2863 u_int32_t logBlockSize;
2864 size_t bytesContAvail = 0;
2865 off_t blockposition;
2866 int lockExtBtree;
2867 int lockflags = 0;
2868
2869 /*
2870 * Check for underlying vnode requests and ensure that logical
2871 * to physical mapping is requested.
2872 */
2873 if (vpp != NULL)
2874 *vpp = hfsmp->hfs_devvp;
2875 if (bnp == NULL)
2876 return (0);
2877
2878 logBlockSize = GetLogicalBlockSize(vp);
2879 blockposition = (off_t)bn * logBlockSize;
2880
2881 lockExtBtree = overflow_extents(fp);
2882
2883 if (lockExtBtree)
2884 lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2885
2886 retval = MacToVFSError(
2887 MapFileBlockC (HFSTOVCB(hfsmp),
2888 (FCB*)fp,
2889 MAXPHYSIO,
2890 blockposition,
2891 bnp,
2892 &bytesContAvail));
2893
2894 if (lockExtBtree)
2895 hfs_systemfile_unlock(hfsmp, lockflags);
2896
2897 if (retval == E_NONE) {
2898 /* Figure out how many read ahead blocks there are */
2899 if (runp != NULL) {
2900 if (can_cluster(logBlockSize)) {
2901 /* Make sure this result never goes negative: */
2902 *runp = (bytesContAvail < logBlockSize) ? 0 : (bytesContAvail / logBlockSize) - 1;
2903 } else {
2904 *runp = 0;
2905 }
2906 }
2907 }
2908 return (retval);
2909 }
2910
2911 /*
2912 * Convert logical block number to file offset.
2913 */
2914 int
2915 hfs_vnop_blktooff(struct vnop_blktooff_args *ap)
2916 /*
2917 struct vnop_blktooff_args {
2918 vnode_t a_vp;
2919 daddr64_t a_lblkno;
2920 off_t *a_offset;
2921 };
2922 */
2923 {
2924 if (ap->a_vp == NULL)
2925 return (EINVAL);
2926 *ap->a_offset = (off_t)ap->a_lblkno * (off_t)GetLogicalBlockSize(ap->a_vp);
2927
2928 return(0);
2929 }
2930
2931 /*
2932 * Convert file offset to logical block number.
2933 */
2934 int
2935 hfs_vnop_offtoblk(struct vnop_offtoblk_args *ap)
2936 /*
2937 struct vnop_offtoblk_args {
2938 vnode_t a_vp;
2939 off_t a_offset;
2940 daddr64_t *a_lblkno;
2941 };
2942 */
2943 {
2944 if (ap->a_vp == NULL)
2945 return (EINVAL);
2946 *ap->a_lblkno = (daddr64_t)(ap->a_offset / (off_t)GetLogicalBlockSize(ap->a_vp));
2947
2948 return(0);
2949 }
2950
2951 /*
2952 * Map file offset to physical block number.
2953 *
2954 * If this function is called for a write operation, and if the file
2955 * had virtual blocks allocated (delayed allocation), real blocks
2956 * are allocated by calling ExtendFileC().
2957 *
2958 * If this function is called for a read operation, and if the file
2959 * had virtual blocks allocated (delayed allocation), no change
2960 * to the size of the file is made, and if required, the rangelist is
2961 * searched for a mapping.
2962 *
2963 * System file cnodes are expected to be locked (shared or exclusive).
2964 *
2965 * -- INVALID RANGES --
2966 *
2967 * Invalid ranges are used to keep track of where we have extended a
2968 * file, but have not yet written that data to disk. In the past we
2969 * would clear up the invalid ranges as we wrote to those areas, but
2970 * before data was actually flushed to disk. The problem with that
2971 * approach is that the data can be left in the cache and is therefore
2972 * still not valid on disk. So now we clear up the ranges here, when
2973 * the flags field has VNODE_WRITE set, indicating a write is about to
2974 * occur. This isn't ideal (ideally we want to clear them up when
2975 * we know the data has been successfully written), but it's the best we
2976 * can do.
2977 *
2978 * For reads, we use the invalid ranges here in block map to indicate
2979 * to the caller that the data should be zeroed (a_bpn == -1). We
2980 * have to be careful about what ranges we return to the cluster code.
2981 * Currently the cluster code can only handle non-rounded values for
2982 * the EOF; it cannot handle funny sized ranges in the middle of the
2983 * file (the main problem is that it sends down odd sized I/Os to the
2984 * disk). Our code currently works because whilst the very first
2985 * offset and the last offset in the invalid ranges are not aligned,
2986 * gaps in the invalid ranges between the first and last have to be
2987 * aligned (because we always write page sized blocks). For example,
2988 * consider this arrangement:
2989 *
2990 * +-------------+-----+-------+------+
2991 * | |XXXXX| |XXXXXX|
2992 * +-------------+-----+-------+------+
2993 * a b c d
2994 *
2995 * This shows two invalid ranges <a, b> and <c, d>. Whilst a and d
2996 * are not necessarily aligned, b and c *must* be.
2997 *
2998 * Zero-filling occurs in a number of ways:
2999 *
3000 * 1. When a read occurs and we return with a_bpn == -1.
3001 *
3002 * 2. When hfs_fsync or hfs_filedone calls hfs_flush_invalid_ranges
3003 * which will cause us to iterate over the ranges bringing in
3004 * pages that are not present in the cache and zeroing them. Any
3005 * pages that are already in the cache are left untouched. Note
3006 * that hfs_fsync does not always flush invalid ranges.
3007 *
3008 * 3. When we extend a file we zero out from the old EOF to the end
3009 * of the page. It would be nice if we didn't have to do this if
3010 * the page wasn't present (and could defer it), but because of
3011 * the problem described above, we have to.
3012 *
3013 * The invalid ranges are also used to restrict the size that we write
3014 * out on disk: see hfs_prepare_fork_for_update.
3015 *
3016 * Note that invalid ranges are ignored when neither the VNODE_READ nor
3017 * the VNODE_WRITE flag is specified. This is useful for the
3018 * F_LOG2PHYS* fcntls which are not interested in invalid ranges: they
3019 * just want to know whether blocks are physically allocated or not.
3020 */
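/*
 * Illustrative sketch (not part of the original source): the F_LOG2PHYS
 * fcntl mentioned above is public; it maps the file's current offset to a
 * device offset through this blockmap path without consulting invalid
 * ranges. The descriptor and error handling are assumptions for illustration.
 *
 *	#include <stdio.h>
 *	#include <fcntl.h>
 *
 *	struct log2phys l2p = { 0 };
 *	if (fcntl(fd, F_LOG2PHYS, &l2p) == 0)	// uses the descriptor's current offset
 *		printf("device offset: %lld\n", (long long)l2p.l2p_devoffset);
 */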
3021 int
3022 hfs_vnop_blockmap(struct vnop_blockmap_args *ap)
3023 /*
3024 struct vnop_blockmap_args {
3025 vnode_t a_vp;
3026 off_t a_foffset;
3027 size_t a_size;
3028 daddr64_t *a_bpn;
3029 size_t *a_run;
3030 void *a_poff;
3031 int a_flags;
3032 vfs_context_t a_context;
3033 };
3034 */
3035 {
3036 struct vnode *vp = ap->a_vp;
3037 struct cnode *cp;
3038 struct filefork *fp;
3039 struct hfsmount *hfsmp;
3040 size_t bytesContAvail = ap->a_size;
3041 int retval = E_NONE;
3042 int syslocks = 0;
3043 int lockflags = 0;
3044 struct rl_entry *invalid_range;
3045 enum rl_overlaptype overlaptype;
3046 int started_tr = 0;
3047 int tooklock = 0;
3048
3049 #if HFS_COMPRESSION
3050 if (VNODE_IS_RSRC(vp)) {
3051 /* allow blockmaps to the resource fork */
3052 } else {
3053 if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */
3054 int state = decmpfs_cnode_get_vnode_state(VTOCMP(vp));
3055 switch(state) {
3056 case FILE_IS_COMPRESSED:
3057 return ENOTSUP;
3058 case FILE_IS_CONVERTING:
3059 /* if FILE_IS_CONVERTING, we allow blockmap */
3060 break;
3061 default:
3062 printf("invalid state %d for compressed file\n", state);
3063 /* fall through */
3064 }
3065 }
3066 }
3067 #endif /* HFS_COMPRESSION */
3068
3069 /* Do not allow blockmap operation on a directory */
3070 if (vnode_isdir(vp)) {
3071 return (ENOTSUP);
3072 }
3073
3074 /*
3075 * Check for underlying vnode requests and ensure that logical
3076 * to physical mapping is requested.
3077 */
3078 if (ap->a_bpn == NULL)
3079 return (0);
3080
3081 hfsmp = VTOHFS(vp);
3082 cp = VTOC(vp);
3083 fp = VTOF(vp);
3084
3085 if ( !vnode_issystem(vp) && !vnode_islnk(vp) && !vnode_isswap(vp)) {
3086 if (cp->c_lockowner != current_thread()) {
3087 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3088 tooklock = 1;
3089 }
3090
3091 // For reads, check the invalid ranges
3092 if (ISSET(ap->a_flags, VNODE_READ)) {
3093 if (ap->a_foffset >= fp->ff_size) {
3094 retval = ERANGE;
3095 goto exit;
3096 }
3097
3098 overlaptype = rl_scan(&fp->ff_invalidranges, ap->a_foffset,
3099 ap->a_foffset + (off_t)bytesContAvail - 1,
3100 &invalid_range);
3101 switch(overlaptype) {
3102 case RL_MATCHINGOVERLAP:
3103 case RL_OVERLAPCONTAINSRANGE:
3104 case RL_OVERLAPSTARTSBEFORE:
3105 /* There's no valid block for this byte offset */
3106 *ap->a_bpn = (daddr64_t)-1;
3107 /* There's no point limiting the amount to be returned
3108 * if the invalid range that was hit extends all the way
3109 * to the EOF (i.e. there are no valid bytes between the
3110 * end of this range and the file's EOF):
3111 */
3112 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3113 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3114 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3115 }
3116
3117 retval = 0;
3118 goto exit;
3119
3120 case RL_OVERLAPISCONTAINED:
3121 case RL_OVERLAPENDSAFTER:
3122 /* The range of interest hits an invalid block before the end: */
3123 if (invalid_range->rl_start == ap->a_foffset) {
3124 /* There's actually no valid information to be had starting here: */
3125 *ap->a_bpn = (daddr64_t)-1;
3126 if (((off_t)fp->ff_size > (invalid_range->rl_end + 1)) &&
3127 ((size_t)(invalid_range->rl_end + 1 - ap->a_foffset) < bytesContAvail)) {
3128 bytesContAvail = invalid_range->rl_end + 1 - ap->a_foffset;
3129 }
3130
3131 retval = 0;
3132 goto exit;
3133 } else {
3134 /*
3135 * Sadly, the lower layers don't like us to
3136 * return unaligned ranges, so we skip over
3137 * any invalid ranges here that are less than
3138 * a page: zeroing of those bits is not our
3139 * responsibility (it's dealt with elsewhere).
3140 */
3141 do {
3142 off_t rounded_start = round_page_64(invalid_range->rl_start);
3143 if ((off_t)bytesContAvail < rounded_start - ap->a_foffset)
3144 break;
3145 if (rounded_start < invalid_range->rl_end + 1) {
3146 bytesContAvail = rounded_start - ap->a_foffset;
3147 break;
3148 }
3149 } while ((invalid_range = TAILQ_NEXT(invalid_range,
3150 rl_link)));
3151 }
3152 break;
3153
3154 case RL_NOOVERLAP:
3155 break;
3156 } // switch
3157 }
3158 }
3159
3160 #if CONFIG_PROTECT
3161 if (cp->c_cpentry) {
3162 const int direction = (ISSET(ap->a_flags, VNODE_WRITE)
3163 ? VNODE_WRITE : VNODE_READ);
3164
3165 cp_io_params_t io_params;
3166 cp_io_params(hfsmp, cp->c_cpentry,
3167 off_rsrc_make(ap->a_foffset, VNODE_IS_RSRC(vp)),
3168 direction, &io_params);
3169
3170 if (io_params.max_len < (off_t)bytesContAvail)
3171 bytesContAvail = io_params.max_len;
3172
3173 if (io_params.phys_offset != -1) {
3174 *ap->a_bpn = ((io_params.phys_offset + hfsmp->hfsPlusIOPosOffset)
3175 / hfsmp->hfs_logical_block_size);
3176
3177 retval = 0;
3178 goto exit;
3179 }
3180 }
3181 #endif
3182
3183 retry:
3184
3185 /* Check virtual blocks only when performing write operation */
3186 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
3187 if (hfs_start_transaction(hfsmp) != 0) {
3188 retval = EINVAL;
3189 goto exit;
3190 } else {
3191 started_tr = 1;
3192 }
3193 syslocks = SFL_EXTENTS | SFL_BITMAP;
3194
3195 } else if (overflow_extents(fp)) {
3196 syslocks = SFL_EXTENTS;
3197 }
3198
3199 if (syslocks)
3200 lockflags = hfs_systemfile_lock(hfsmp, syslocks, HFS_EXCLUSIVE_LOCK);
3201
3202 /*
3203 * Check for any delayed allocations.
3204 */
3205 if ((ap->a_flags & VNODE_WRITE) && (fp->ff_unallocblocks != 0)) {
3206 int64_t actbytes;
3207 u_int32_t loanedBlocks;
3208
3209 //
3210 // Make sure we have a transaction. It's possible
3211 // that we came in and fp->ff_unallocblocks was zero
3212 // but during the time we blocked acquiring the extents
3213 // btree, ff_unallocblocks became non-zero and so we
3214 // will need to start a transaction.
3215 //
3216 if (started_tr == 0) {
3217 if (syslocks) {
3218 hfs_systemfile_unlock(hfsmp, lockflags);
3219 syslocks = 0;
3220 }
3221 goto retry;
3222 }
3223
3224 /*
3225 * Note: ExtendFileC will release any blocks on loan and
3226 * acquire real blocks. So we ask to extend by zero bytes
3227 * since ExtendFileC will account for the virtual blocks.
3228 */
3229
3230 loanedBlocks = fp->ff_unallocblocks;
3231 retval = ExtendFileC(hfsmp, (FCB*)fp, 0, 0,
3232 kEFAllMask | kEFNoClumpMask, &actbytes);
3233
3234 if (retval) {
3235 fp->ff_unallocblocks = loanedBlocks;
3236 cp->c_blocks += loanedBlocks;
3237 fp->ff_blocks += loanedBlocks;
3238
3239 hfs_lock_mount (hfsmp);
3240 hfsmp->loanedBlocks += loanedBlocks;
3241 hfs_unlock_mount (hfsmp);
3242
3243 hfs_systemfile_unlock(hfsmp, lockflags);
3244 cp->c_flag |= C_MODIFIED;
3245 if (started_tr) {
3246 (void) hfs_update(vp, 0);
3247 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3248
3249 hfs_end_transaction(hfsmp);
3250 started_tr = 0;
3251 }
3252 goto exit;
3253 }
3254 }
3255
3256 retval = MapFileBlockC(hfsmp, (FCB *)fp, bytesContAvail, ap->a_foffset,
3257 ap->a_bpn, &bytesContAvail);
3258 if (syslocks) {
3259 hfs_systemfile_unlock(hfsmp, lockflags);
3260 syslocks = 0;
3261 }
3262
3263 if (retval) {
3264 /* On write, always return an error because virtual blocks, if any,
3265 * should have been allocated in ExtendFileC(). We do not
3266 * allocate virtual blocks on read, therefore return an error
3267 * only if no virtual blocks are allocated. Otherwise we search
3268 * the rangelist for zero-fills.
3269 */
3270 if ((MacToVFSError(retval) != ERANGE) ||
3271 (ap->a_flags & VNODE_WRITE) ||
3272 ((ap->a_flags & VNODE_READ) && (fp->ff_unallocblocks == 0))) {
3273 goto exit;
3274 }
3275
3276 /* Validate if the start offset is within logical file size */
3277 if (ap->a_foffset >= fp->ff_size) {
3278 goto exit;
3279 }
3280
3281 /*
3282 * At this point, we have encountered a failure during
3283 * MapFileBlockC that resulted in ERANGE, and we are not
3284 * servicing a write, and there are borrowed blocks.
3285 *
3286 * However, the cluster layer will not call blockmap for
3287 * blocks that are borrowed and in-cache. We have to assume
3288 * that because we observed ERANGE being emitted from
3289 * MapFileBlockC, this extent range is not valid on-disk. So
3290 * we treat this as a mapping that needs to be zero-filled
3291 * prior to reading.
3292 */
3293
3294 if (fp->ff_size - ap->a_foffset < (off_t)bytesContAvail)
3295 bytesContAvail = fp->ff_size - ap->a_foffset;
3296
3297 *ap->a_bpn = (daddr64_t) -1;
3298 retval = 0;
3299
3300 goto exit;
3301 }
3302
3303 exit:
3304 if (retval == 0) {
3305 if (ISSET(ap->a_flags, VNODE_WRITE)) {
3306 struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
3307
3308 // See if we might be overlapping invalid ranges...
3309 if (r && (ap->a_foffset + (off_t)bytesContAvail) > r->rl_start) {
3310 /*
3311 * Mark the file as needing an update if we think the
3312 * on-disk EOF has changed.
3313 */
3314 if (ap->a_foffset <= r->rl_start)
3315 SET(cp->c_flag, C_MODIFIED);
3316
3317 /*
3318 * This isn't the ideal place to put this. Ideally, we
3319 * should do something *after* we have successfully
3320 * written to the range, but that's difficult to do
3321 * because we cannot take locks in the callback. At
3322 * present, the cluster code will call us with VNODE_WRITE
3323 * set just before it's about to write the data so we know
3324 * that data is about to be written. If we get an I/O
3325 * error at this point then chances are the metadata
3326 * update to follow will also have an I/O error so the
3327 * risk here is small.
3328 */
3329 rl_remove(ap->a_foffset, ap->a_foffset + bytesContAvail - 1,
3330 &fp->ff_invalidranges);
3331
3332 if (!TAILQ_FIRST(&fp->ff_invalidranges)) {
3333 cp->c_flag &= ~C_ZFWANTSYNC;
3334 cp->c_zftimeout = 0;
3335 }
3336 }
3337 }
3338
3339 if (ap->a_run)
3340 *ap->a_run = bytesContAvail;
3341
3342 if (ap->a_poff)
3343 *(int *)ap->a_poff = 0;
3344 }
3345
3346 if (started_tr) {
3347 hfs_update(vp, TRUE);
3348 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3349 hfs_end_transaction(hfsmp);
3350 started_tr = 0;
3351 }
3352
3353 if (tooklock)
3354 hfs_unlock(cp);
3355
3356 return (MacToVFSError(retval));
3357 }
3358
3359 /*
3360 * prepare and issue the I/O
3361 * buf_strategy knows how to deal
3362 * with requests that require
3363 * fragmented I/Os
3364 */
3365 int
3366 hfs_vnop_strategy(struct vnop_strategy_args *ap)
3367 {
3368 buf_t bp = ap->a_bp;
3369 vnode_t vp = buf_vnode(bp);
3370 int error = 0;
3371
3372 /* Mark buffer as containing static data if cnode flag set */
3373 if (VTOC(vp)->c_flag & C_SSD_STATIC) {
3374 buf_markstatic(bp);
3375 }
3376
3377 /* Mark buffer for greedy mode writes if cnode flag set */
3378 if (VTOC(vp)->c_flag & C_SSD_GREEDY_MODE) {
3379 bufattr_markgreedymode(&bp->b_attr);
3380 }
3381
3382 /* mark buffer as containing burst mode data if cnode flag set */
3383 if (VTOC(vp)->c_flag & C_IO_ISOCHRONOUS) {
3384 bufattr_markisochronous(&bp->b_attr);
3385 }
3386
3387 #if CONFIG_PROTECT
3388 error = cp_handle_strategy(bp);
3389
3390 if (error)
3391 return error;
3392 #endif
3393
3394 error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap);
3395
3396 return error;
3397 }
3398
3399 int
3400 do_hfs_truncate(struct vnode *vp, off_t length, int flags, int truncateflags, vfs_context_t context)
3401 {
3402 register struct cnode *cp = VTOC(vp);
3403 struct filefork *fp = VTOF(vp);
3404 kauth_cred_t cred = vfs_context_ucred(context);
3405 int retval;
3406 off_t bytesToAdd;
3407 off_t actualBytesAdded;
3408 off_t filebytes;
3409 u_int32_t fileblocks;
3410 int blksize;
3411 struct hfsmount *hfsmp;
3412 int lockflags;
3413 int suppress_times = (truncateflags & HFS_TRUNCATE_SKIPTIMES);
3414
3415 blksize = VTOVCB(vp)->blockSize;
3416 fileblocks = fp->ff_blocks;
3417 filebytes = (off_t)fileblocks * (off_t)blksize;
3418
3419 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_START,
3420 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3421
3422 if (length < 0)
3423 return (EINVAL);
3424
3425 /* This should only happen with a corrupt filesystem */
3426 if ((off_t)fp->ff_size < 0)
3427 return (EINVAL);
3428
3429 if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
3430 return (EFBIG);
3431
3432 hfsmp = VTOHFS(vp);
3433
3434 retval = E_NONE;
3435
3436 /* Files that are changing size are not hot file candidates. */
3437 if (hfsmp->hfc_stage == HFC_RECORDING) {
3438 fp->ff_bytesread = 0;
3439 }
3440
3441 /*
3442 * We cannot just check if fp->ff_size == length (as an optimization)
3443 * since there may be extra physical blocks that also need truncation.
3444 */
3445 #if QUOTA
3446 if ((retval = hfs_getinoquota(cp)))
3447 return(retval);
3448 #endif /* QUOTA */
3449
3450 /*
3451 * Lengthen the size of the file. We must ensure that the
3452 * last byte of the file is allocated. Since the smallest
3453 * value of ff_size is 0, length will be at least 1.
3454 */
3455 if (length > (off_t)fp->ff_size) {
3456 #if QUOTA
3457 retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
3458 cred, 0);
3459 if (retval)
3460 goto Err_Exit;
3461 #endif /* QUOTA */
3462 /*
3463 * If we don't have enough physical space then
3464 * we need to extend the physical size.
3465 */
3466 if (length > filebytes) {
3467 int eflags;
3468 u_int32_t blockHint = 0;
3469
3470 /* All or nothing and don't round up to clumpsize. */
3471 eflags = kEFAllMask | kEFNoClumpMask;
3472
3473 if (cred && (suser(cred, NULL) != 0)) {
3474 eflags |= kEFReserveMask; /* keep a reserve */
3475 }
3476
3477 /*
3478 * Allocate Journal and Quota files in metadata zone.
3479 */
3480 if (filebytes == 0 &&
3481 hfsmp->hfs_flags & HFS_METADATA_ZONE &&
3482 hfs_virtualmetafile(cp)) {
3483 eflags |= kEFMetadataMask;
3484 blockHint = hfsmp->hfs_metazone_start;
3485 }
3486 if (hfs_start_transaction(hfsmp) != 0) {
3487 retval = EINVAL;
3488 goto Err_Exit;
3489 }
3490
3491 /* Protect extents b-tree and allocation bitmap */
3492 lockflags = SFL_BITMAP;
3493 if (overflow_extents(fp))
3494 lockflags |= SFL_EXTENTS;
3495 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3496
3497 /*
3498 * Keep growing the file as long as the current EOF is
3499 * less than the desired value.
3500 */
3501 while ((length > filebytes) && (retval == E_NONE)) {
3502 bytesToAdd = length - filebytes;
3503 retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
3504 (FCB*)fp,
3505 bytesToAdd,
3506 blockHint,
3507 eflags,
3508 &actualBytesAdded));
3509
3510 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3511 if (actualBytesAdded == 0 && retval == E_NONE) {
3512 if (length > filebytes)
3513 length = filebytes;
3514 break;
3515 }
3516 } /* endwhile */
3517
3518 hfs_systemfile_unlock(hfsmp, lockflags);
3519
3520 if (hfsmp->jnl) {
3521 hfs_update(vp, 0);
3522 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3523 }
3524
3525 hfs_end_transaction(hfsmp);
3526
3527 if (retval)
3528 goto Err_Exit;
3529
3530 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3531 (int)length, (int)fp->ff_size, (int)filebytes, 0, 0);
3532 }
3533
3534 if (ISSET(flags, IO_NOZEROFILL)) {
3535 // An optimisation for the hibernation file
3536 if (vnode_isswap(vp))
3537 rl_remove_all(&fp->ff_invalidranges);
3538 } else {
3539 if (UBCINFOEXISTS(vp) && (vnode_issystem(vp) == 0) && retval == E_NONE) {
3540 if (length > (off_t)fp->ff_size) {
3541 struct timeval tv;
3542
3543 /* Extending the file: time to fill out the current last page w. zeroes? */
3544 if (fp->ff_size & PAGE_MASK_64) {
3545 /* There might be some valid data at the start of the (current) last page
3546 of the file, so zero out the remainder of that page to ensure the
3547 entire page contains valid data. */
3548 hfs_unlock(cp);
3549 retval = hfs_zero_eof_page(vp, length);
3550 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
3551 if (retval) goto Err_Exit;
3552 }
3553 microuptime(&tv);
3554 rl_add(fp->ff_size, length - 1, &fp->ff_invalidranges);
3555 cp->c_zftimeout = tv.tv_sec + ZFTIMELIMIT;
3556 }
3557 } else {
3558 panic("hfs_truncate: invoked on non-UBC object?!");
3559 };
3560 }
3561 if (suppress_times == 0) {
3562 cp->c_touch_modtime = TRUE;
3563 }
3564 fp->ff_size = length;
3565
3566 } else { /* Shorten the size of the file */
3567
3568 // An optimisation for the hibernation file
3569 if (ISSET(flags, IO_NOZEROFILL) && vnode_isswap(vp)) {
3570 rl_remove_all(&fp->ff_invalidranges);
3571 } else if ((off_t)fp->ff_size > length) {
3572 /* Any space previously marked as invalid is now irrelevant: */
3573 rl_remove(length, fp->ff_size - 1, &fp->ff_invalidranges);
3574 }
3575
3576 /*
3577 * Account for any unmapped blocks. Note that the new
3578 * file length can still end up with unmapped blocks.
3579 */
3580 if (fp->ff_unallocblocks > 0) {
3581 u_int32_t finalblks;
3582 u_int32_t loanedBlocks;
3583
3584 hfs_lock_mount(hfsmp);
3585 loanedBlocks = fp->ff_unallocblocks;
3586 cp->c_blocks -= loanedBlocks;
3587 fp->ff_blocks -= loanedBlocks;
3588 fp->ff_unallocblocks = 0;
3589
3590 hfsmp->loanedBlocks -= loanedBlocks;
3591
3592 finalblks = (length + blksize - 1) / blksize;
3593 if (finalblks > fp->ff_blocks) {
3594 /* calculate required unmapped blocks */
3595 loanedBlocks = finalblks - fp->ff_blocks;
3596 hfsmp->loanedBlocks += loanedBlocks;
3597
3598 fp->ff_unallocblocks = loanedBlocks;
3599 cp->c_blocks += loanedBlocks;
3600 fp->ff_blocks += loanedBlocks;
3601 }
3602 hfs_unlock_mount (hfsmp);
3603 }
3604
3605 off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
3606 if (hfs_start_transaction(hfsmp) != 0) {
3607 retval = EINVAL;
3608 goto Err_Exit;
3609 }
3610
3611 if (fp->ff_unallocblocks == 0) {
3612 /* Protect extents b-tree and allocation bitmap */
3613 lockflags = SFL_BITMAP;
3614 if (overflow_extents(fp))
3615 lockflags |= SFL_EXTENTS;
3616 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3617
3618 retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0,
3619 FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false));
3620
3621 hfs_systemfile_unlock(hfsmp, lockflags);
3622 }
3623 if (hfsmp->jnl) {
3624 if (retval == 0) {
3625 fp->ff_size = length;
3626 }
3627 hfs_update(vp, 0);
3628 hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3629 }
3630 hfs_end_transaction(hfsmp);
3631
3632 filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
3633 if (retval)
3634 goto Err_Exit;
3635 #if QUOTA
3636 /* These are bytesreleased */
3637 (void) hfs_chkdq(cp, (int64_t)-(savedbytes - filebytes), NOCRED, 0);
3638 #endif /* QUOTA */
3639
3640 //
3641 // Unlike when growing a file, we adjust the hotfile block count here
3642 // instead of deeper down in the block allocation code because we do
3643 // not necessarily have a vnode or "fcb" at the time we're deleting
3644 // the file and so we wouldn't know if it was hotfile cached or not
3645 //
3646 hfs_hotfile_adjust_blocks(vp, (int64_t)((savedbytes - filebytes) / blksize));
3647
3648
3649 /*
3650 * Only set update flag if the logical length changes & we aren't
3651 * suppressing modtime updates.
3652 */
3653 if (((off_t)fp->ff_size != length) && (suppress_times == 0)) {
3654 cp->c_touch_modtime = TRUE;
3655 }
3656 fp->ff_size = length;
3657 }
3658 if (cp->c_mode & (S_ISUID | S_ISGID)) {
3659 if (!vfs_context_issuser(context))
3660 cp->c_mode &= ~(S_ISUID | S_ISGID);
3661 }
3662 cp->c_flag |= C_MODIFIED;
3663 cp->c_touch_chgtime = TRUE; /* status changed */
3664 if (suppress_times == 0) {
3665 cp->c_touch_modtime = TRUE; /* file data was modified */
3666
3667 /*
3668 * If we are not suppressing the modtime update, then
3669 * update the gen count as well.
3670 */
3671 if (S_ISREG(cp->c_attr.ca_mode) || S_ISLNK (cp->c_attr.ca_mode)) {
3672 hfs_incr_gencount(cp);
3673 }
3674 }
3675
3676 retval = hfs_update(vp, 0);
3677 if (retval) {
3678 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_NONE,
3679 -1, -1, -1, retval, 0);
3680 }
3681
3682 Err_Exit:
3683
3684 KERNEL_DEBUG(HFSDBG_TRUNCATE | DBG_FUNC_END,
3685 (int)length, (int)fp->ff_size, (int)filebytes, retval, 0);
3686
3687 return (retval);
3688 }
3689
3690 /*
3691 * Preparation which must be done prior to deleting the catalog record
3692 * of a file or directory. In order to make the on-disk state as safe as possible,
3693 * we remove the catalog entry before releasing the bitmap blocks and the
3694 * overflow extent records. However, some work must be done prior to deleting
3695 * the catalog record.
3696 *
3697 * When calling this function, the cnode must exist both in memory and on-disk.
3698 * If there are both resource fork and data fork vnodes, this function should
3699 * be called on both.
3700 */
3701
3702 int
3703 hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) {
3704
3705 struct filefork *fp = VTOF(vp);
3706 struct cnode *cp = VTOC(vp);
3707 #if QUOTA
3708 int retval = 0;
3709 #endif /* QUOTA */
3710
3711 /* Cannot truncate an HFS directory! */
3712 if (vnode_isdir(vp)) {
3713 return (EISDIR);
3714 }
3715
3716 /*
3717 * See the comment below in hfs_truncate for why we need to call
3718 * setsize here. Essentially we want to avoid pending IO if we
3719 * already know that the blocks are going to be released here.
3720 * This function is only called when totally removing all storage for a file, so
3721 * we can take a shortcut and immediately setsize (0);
3722 */
3723 ubc_setsize(vp, 0);
3724
3725 /* This should only happen with a corrupt filesystem */
3726 if ((off_t)fp->ff_size < 0)
3727 return (EINVAL);
3728
3729 /*
3730 * We cannot just check if fp->ff_size == length (as an optimization)
3731 * since there may be extra physical blocks that also need truncation.
3732 */
3733 #if QUOTA
3734 if ((retval = hfs_getinoquota(cp))) {
3735 return(retval);
3736 }
3737 #endif /* QUOTA */
3738
3739 /* Wipe out any invalid ranges which have yet to be backed by disk */
3740 rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges);
3741
3742 /*
3743 * Account for any unmapped blocks. Since we're deleting the
3744 * entire file, we don't have to worry about just shrinking
3745 * to a smaller number of borrowed blocks.
3746 */
3747 if (fp->ff_unallocblocks > 0) {
3748 u_int32_t loanedBlocks;
3749
3750 hfs_lock_mount (hfsmp);
3751 loanedBlocks = fp->ff_unallocblocks;
3752 cp->c_blocks -= loanedBlocks;
3753 fp->ff_blocks -= loanedBlocks;
3754 fp->ff_unallocblocks = 0;
3755
3756 hfsmp->loanedBlocks -= loanedBlocks;
3757
3758 hfs_unlock_mount (hfsmp);
3759 }
3760
3761 return 0;
3762 }
3763
3764
3765 /*
3766 * Special wrapper around calling TruncateFileC. This function is usable
3767 * even when the catalog record does not exist any longer, making it ideal
3768 * for use when deleting a file. The simplification here is that we know
3769 * that we are releasing all blocks.
3770 *
3771 * Note that this function may be called when there is no vnode backing
3772 * the file fork in question. We may call this from hfs_vnop_inactive
3773 * to clear out resource fork data (and may not want to clear out the data
3774 * fork yet). As a result, we pointer-check both sets of inputs before
3775 * doing anything with them.
3776 *
3777 * The caller is responsible for saving off a copy of the filefork(s)
3778 * embedded within the cnode prior to calling this function. The pointers
3779 * supplied as arguments must be valid even if the cnode is no longer valid.
3780 */
3781
3782 int
3783 hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork,
3784 struct filefork *rsrcfork, u_int32_t fileid) {
3785
3786 off_t filebytes;
3787 u_int32_t fileblocks;
3788 int blksize = 0;
3789 int error = 0;
3790 int lockflags;
3791
3792 blksize = hfsmp->blockSize;
3793
3794 /* Data Fork */
3795 if (datafork) {
3796 off_t prev_filebytes;
3797 datafork->ff_size = 0;
3798
3799 fileblocks = datafork->ff_blocks;
3800 filebytes = (off_t)fileblocks * (off_t)blksize;
3801 prev_filebytes = filebytes;
3802
3803 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3804
3805 while (filebytes > 0) {
3806 if (filebytes > HFS_BIGFILE_SIZE) {
3807 filebytes -= HFS_BIGFILE_SIZE;
3808 } else {
3809 filebytes = 0;
3810 }
3811
3812 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3813 if (hfs_start_transaction(hfsmp) != 0) {
3814 error = EINVAL;
3815 break;
3816 }
3817
3818 if (datafork->ff_unallocblocks == 0) {
3819 /* Protect extents b-tree and allocation bitmap */
3820 lockflags = SFL_BITMAP;
3821 if (overflow_extents(datafork))
3822 lockflags |= SFL_EXTENTS;
3823 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3824
3825 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false));
3826
3827 hfs_systemfile_unlock(hfsmp, lockflags);
3828 }
3829 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3830
3831 struct cnode *cp = datafork ? FTOC(datafork) : NULL;
3832 struct vnode *vp;
3833 vp = cp ? CTOV(cp, 0) : NULL;
3834 hfs_hotfile_adjust_blocks(vp, (int64_t)((prev_filebytes - filebytes) / blksize));
3835 prev_filebytes = filebytes;
3836
3837 /* Finish the transaction and start over if necessary */
3838 hfs_end_transaction(hfsmp);
3839
3840 if (error) {
3841 break;
3842 }
3843 }
3844 }
3845
3846 /* Resource fork */
3847 if (error == 0 && rsrcfork) {
3848 rsrcfork->ff_size = 0;
3849
3850 fileblocks = rsrcfork->ff_blocks;
3851 filebytes = (off_t)fileblocks * (off_t)blksize;
3852
3853 /* We killed invalid ranges and loaned blocks before we removed the catalog entry */
3854
3855 while (filebytes > 0) {
3856 if (filebytes > HFS_BIGFILE_SIZE) {
3857 filebytes -= HFS_BIGFILE_SIZE;
3858 } else {
3859 filebytes = 0;
3860 }
3861
3862 /* Start a transaction, and wipe out as many blocks as we can in this iteration */
3863 if (hfs_start_transaction(hfsmp) != 0) {
3864 error = EINVAL;
3865 break;
3866 }
3867
3868 if (rsrcfork->ff_unallocblocks == 0) {
3869 /* Protect extents b-tree and allocation bitmap */
3870 lockflags = SFL_BITMAP;
3871 if (overflow_extents(rsrcfork))
3872 lockflags |= SFL_EXTENTS;
3873 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
3874
3875 error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false));
3876
3877 hfs_systemfile_unlock(hfsmp, lockflags);
3878 }
3879 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
3880
3881 /* Finish the transaction and start over if necessary */
3882 hfs_end_transaction(hfsmp);
3883
3884 if (error) {
3885 break;
3886 }
3887 }
3888 }
3889
3890 return error;
3891 }
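/*
 * Hypothetical sketch (illustrative only, not part of the build) of how a
 * delete path could pair hfs_prepare_release_storage() with
 * hfs_release_storage(), following the comments above: prepare each fork's
 * vnode, save copies of the fork structures, remove the catalog record,
 * then free the blocks. The variable names (dvp, rvp, cp) are made up for
 * the example.
 */
#if 0
	struct filefork dfork_copy, rfork_copy;
	u_int32_t fileid = cp->c_fileid;

	if (dvp)
		(void) hfs_prepare_release_storage(hfsmp, dvp);
	if (rvp)
		(void) hfs_prepare_release_storage(hfsmp, rvp);

	if (dvp)
		dfork_copy = *VTOF(dvp);	/* copies must outlive the cnode */
	if (rvp)
		rfork_copy = *VTOF(rvp);

	/* ... delete the catalog record here ... */

	(void) hfs_release_storage(hfsmp, dvp ? &dfork_copy : NULL,
	                           rvp ? &rfork_copy : NULL, fileid);
#endif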
3892
3893 errno_t hfs_ubc_setsize(vnode_t vp, off_t len, bool have_cnode_lock)
3894 {
3895 errno_t error;
3896
3897 /*
3898 * Call ubc_setsize to give the VM subsystem a chance to do
3899 * whatever it needs to with existing pages before we delete
3900 * blocks. Note that symlinks don't use the UBC so we'll
3901 * get back ENOENT in that case.
3902 */
3903 if (have_cnode_lock) {
3904 error = ubc_setsize_ex(vp, len, UBC_SETSIZE_NO_FS_REENTRY);
3905 if (error == EAGAIN) {
3906 cnode_t *cp = VTOC(vp);
3907
3908 if (cp->c_truncatelockowner != current_thread()) {
3909 #if DEVELOPMENT || DEBUG
3910 panic("hfs: hfs_ubc_setsize called without exclusive truncate lock!");
3911 #else
3912 printf("hfs: hfs_ubc_setsize called without exclusive truncate lock!\n");
3913 #endif
3914 }
3915
3916 hfs_unlock(cp);
3917 error = ubc_setsize_ex(vp, len, 0);
3918 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
3919 }
3920 } else
3921 error = ubc_setsize_ex(vp, len, 0);
3922
3923 return error == ENOENT ? 0 : error;
3924 }
3925
3926 /*
3927 * Truncate a cnode to at most length size, freeing (or adding) the
3928 * disk blocks.
3929 */
3930 int
3931 hfs_truncate(struct vnode *vp, off_t length, int flags,
3932 int truncateflags, vfs_context_t context)
3933 {
3934 struct filefork *fp = VTOF(vp);
3935 off_t filebytes;
3936 u_int32_t fileblocks;
3937 int blksize;
3938 errno_t error = 0;
3939 struct cnode *cp = VTOC(vp);
3940 hfsmount_t *hfsmp = VTOHFS(vp);
3941
3942 /* Cannot truncate an HFS directory! */
3943 if (vnode_isdir(vp)) {
3944 return (EISDIR);
3945 }
3946 /* A swap file cannot change size. */
3947 if (vnode_isswap(vp) && length && !ISSET(flags, IO_NOAUTH)) {
3948 return (EPERM);
3949 }
3950
3951 blksize = hfsmp->blockSize;
3952 fileblocks = fp->ff_blocks;
3953 filebytes = (off_t)fileblocks * (off_t)blksize;
3954
3955 bool caller_has_cnode_lock = (cp->c_lockowner == current_thread());
3956
3957 error = hfs_ubc_setsize(vp, length, caller_has_cnode_lock);
3958 if (error)
3959 return error;
3960
3961 if (!caller_has_cnode_lock) {
3962 error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
3963 if (error)
3964 return error;
3965 }
3966
3967 // have to loop truncating or growing files that are
3968 // really big because otherwise transactions can get
3969 // enormous and consume too many kernel resources.
3970
3971 if (length < filebytes) {
3972 while (filebytes > length) {
3973 if ((filebytes - length) > HFS_BIGFILE_SIZE) {
3974 filebytes -= HFS_BIGFILE_SIZE;
3975 } else {
3976 filebytes = length;
3977 }
3978 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3979 if (error)
3980 break;
3981 }
3982 } else if (length > filebytes) {
3983 kauth_cred_t cred = vfs_context_ucred(context);
3984 const bool keep_reserve = cred && suser(cred, NULL) != 0;
3985
3986 if (hfs_freeblks(hfsmp, keep_reserve)
3987 < howmany(length - filebytes, blksize)) {
3988 error = ENOSPC;
3989 } else {
3990 while (filebytes < length) {
3991 if ((length - filebytes) > HFS_BIGFILE_SIZE) {
3992 filebytes += HFS_BIGFILE_SIZE;
3993 } else {
3994 filebytes = length;
3995 }
3996 error = do_hfs_truncate(vp, filebytes, flags, truncateflags, context);
3997 if (error)
3998 break;
3999 }
4000 }
4001 } else /* Same logical size */ {
4002
4003 error = do_hfs_truncate(vp, length, flags, truncateflags, context);
4004 }
4005 /* Files that are changing size are not hot file candidates. */
4006 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4007 fp->ff_bytesread = 0;
4008 }
4009
4010
4011 if (!caller_has_cnode_lock)
4012 hfs_unlock(cp);
4013
4014 // Make sure UBC's size matches up (in case we didn't completely succeed)
4015 errno_t err2 = hfs_ubc_setsize(vp, fp->ff_size, caller_has_cnode_lock);
4016 if (!error)
4017 error = err2;
4018
4019 return error;
4020 }
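/*
 * Worked example (illustrative): with HFS_BIGFILE_SIZE-sized steps, shrinking
 * a file whose allocated size is a little over 2 * HFS_BIGFILE_SIZE down to 0
 * results in three do_hfs_truncate() calls -- two full-sized steps plus a
 * final step to the requested length -- each in its own transaction, so no
 * single journal transaction has to free an unbounded number of extents.
 */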
4021
4022
4023 /*
4024 * Preallocate file storage space.
4025 */
4026 int
4027 hfs_vnop_allocate(struct vnop_allocate_args /* {
4028 vnode_t a_vp;
4029 off_t a_length;
4030 u_int32_t a_flags;
4031 off_t *a_bytesallocated;
4032 off_t a_offset;
4033 vfs_context_t a_context;
4034 } */ *ap)
4035 {
4036 struct vnode *vp = ap->a_vp;
4037 struct cnode *cp;
4038 struct filefork *fp;
4039 ExtendedVCB *vcb;
4040 off_t length = ap->a_length;
4041 off_t startingPEOF;
4042 off_t moreBytesRequested;
4043 off_t actualBytesAdded;
4044 off_t filebytes;
4045 u_int32_t fileblocks;
4046 int retval, retval2;
4047 u_int32_t blockHint;
4048 u_int32_t extendFlags; /* For call to ExtendFileC */
4049 struct hfsmount *hfsmp;
4050 kauth_cred_t cred = vfs_context_ucred(ap->a_context);
4051 int lockflags;
4052 time_t orig_ctime;
4053
4054 *(ap->a_bytesallocated) = 0;
4055
4056 if (!vnode_isreg(vp))
4057 return (EISDIR);
4058 if (length < (off_t)0)
4059 return (EINVAL);
4060
4061 cp = VTOC(vp);
4062
4063 orig_ctime = VTOC(vp)->c_ctime;
4064
4065 check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL);
4066
4067 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
4068
4069 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT))) {
4070 goto Err_Exit;
4071 }
4072
4073 fp = VTOF(vp);
4074 hfsmp = VTOHFS(vp);
4075 vcb = VTOVCB(vp);
4076
4077 fileblocks = fp->ff_blocks;
4078 filebytes = (off_t)fileblocks * (off_t)vcb->blockSize;
4079
4080 if ((ap->a_flags & ALLOCATEFROMVOL) && (length < filebytes)) {
4081 retval = EINVAL;
4082 goto Err_Exit;
4083 }
4084
4085 /* Fill in the flags word for the call to Extend the file */
4086
4087 extendFlags = kEFNoClumpMask;
4088 if (ap->a_flags & ALLOCATECONTIG)
4089 extendFlags |= kEFContigMask;
4090 if (ap->a_flags & ALLOCATEALL)
4091 extendFlags |= kEFAllMask;
4092 if (cred && suser(cred, NULL) != 0)
4093 extendFlags |= kEFReserveMask;
4094 if (hfs_virtualmetafile(cp))
4095 extendFlags |= kEFMetadataMask;
4096
4097 retval = E_NONE;
4098 blockHint = 0;
4099 startingPEOF = filebytes;
4100
4101 if (ap->a_flags & ALLOCATEFROMPEOF)
4102 length += filebytes;
4103 else if (ap->a_flags & ALLOCATEFROMVOL)
4104 blockHint = ap->a_offset / VTOVCB(vp)->blockSize;
4105
4106 /* If no changes are necessary, then we're done */
4107 if (filebytes == length)
4108 goto Std_Exit;
4109
4110 /*
4111 * Lengthen the size of the file. We must ensure that the
4112 * last byte of the file is allocated. Since the smallest
4113 * value of filebytes is 0, length will be at least 1.
4114 */
4115 if (length > filebytes) {
4116 if (ISSET(extendFlags, kEFAllMask)
4117 && (hfs_freeblks(hfsmp, ISSET(extendFlags, kEFReserveMask))
4118 < howmany(length - filebytes, hfsmp->blockSize))) {
4119 retval = ENOSPC;
4120 goto Err_Exit;
4121 }
4122
4123 off_t total_bytes_added = 0, orig_request_size;
4124
4125 orig_request_size = moreBytesRequested = length - filebytes;
4126
4127 #if QUOTA
4128 retval = hfs_chkdq(cp,
4129 (int64_t)(roundup(moreBytesRequested, vcb->blockSize)),
4130 cred, 0);
4131 if (retval)
4132 goto Err_Exit;
4133
4134 #endif /* QUOTA */
4135 /*
4136 * Metadata zone checks.
4137 */
4138 if (hfsmp->hfs_flags & HFS_METADATA_ZONE) {
4139 /*
4140 * Allocate Journal and Quota files in metadata zone.
4141 */
4142 if (hfs_virtualmetafile(cp)) {
4143 blockHint = hfsmp->hfs_metazone_start;
4144 } else if ((blockHint >= hfsmp->hfs_metazone_start) &&
4145 (blockHint <= hfsmp->hfs_metazone_end)) {
4146 /*
4147 * Move blockHint outside metadata zone.
4148 */
4149 blockHint = hfsmp->hfs_metazone_end + 1;
4150 }
4151 }
4152
4153
4154 while ((length > filebytes) && (retval == E_NONE)) {
4155 off_t bytesRequested;
4156
4157 if (hfs_start_transaction(hfsmp) != 0) {
4158 retval = EINVAL;
4159 goto Err_Exit;
4160 }
4161
4162 /* Protect extents b-tree and allocation bitmap */
4163 lockflags = SFL_BITMAP;
4164 if (overflow_extents(fp))
4165 lockflags |= SFL_EXTENTS;
4166 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
4167
4168 if (moreBytesRequested >= HFS_BIGFILE_SIZE) {
4169 bytesRequested = HFS_BIGFILE_SIZE;
4170 } else {
4171 bytesRequested = moreBytesRequested;
4172 }
4173
4174 if (extendFlags & kEFContigMask) {
4175 // if we're on a sparse device, this will force it to do a
4176 // full scan to find the space needed.
4177 hfsmp->hfs_flags &= ~HFS_DID_CONTIG_SCAN;
4178 }
4179
4180 retval = MacToVFSError(ExtendFileC(vcb,
4181 (FCB*)fp,
4182 bytesRequested,
4183 blockHint,
4184 extendFlags,
4185 &actualBytesAdded));
4186
4187 if (retval == E_NONE) {
4188 *(ap->a_bytesallocated) += actualBytesAdded;
4189 total_bytes_added += actualBytesAdded;
4190 moreBytesRequested -= actualBytesAdded;
4191 if (blockHint != 0) {
4192 blockHint += actualBytesAdded / vcb->blockSize;
4193 }
4194 }
4195 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4196
4197 hfs_systemfile_unlock(hfsmp, lockflags);
4198
4199 if (hfsmp->jnl) {
4200 (void) hfs_update(vp, 0);
4201 (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0);
4202 }
4203
4204 hfs_end_transaction(hfsmp);
4205 }
4206
4207
4208 /*
4209 * if we get an error and no changes were made then exit
4210 * otherwise we must do the hfs_update to reflect the changes
4211 */
4212 if (retval && (startingPEOF == filebytes))
4213 goto Err_Exit;
4214
4215 /*
4216 * Adjust actualBytesAdded to be allocation block aligned, not
4217 * clump size aligned.
4218 * NOTE: So what we are reporting does not affect reality
4219 * until the file is closed, when we truncate the file to allocation
4220 * block size.
4221 */
4222 if (total_bytes_added != 0 && orig_request_size < total_bytes_added)
4223 *(ap->a_bytesallocated) =
4224 roundup(orig_request_size, (off_t)vcb->blockSize);
4225
4226 } else { /* Shorten the size of the file */
4227
4228 /*
4229 * N.B. At present, this code is never called. If and when we
4230 * do start using it, it looks like there might be slightly
4231 * strange semantics with the file size: it's possible for the
4232 * file size to *increase* e.g. if current file size is 5,
4233 * length is 1024 and filebytes is 4096, the file size will
4234 * end up being 1024 bytes. This isn't necessarily a problem
4235 * but it's not consistent with the code above which doesn't
4236 * change the file size.
4237 */
4238
4239 retval = hfs_truncate(vp, length, 0, 0, ap->a_context);
4240 filebytes = (off_t)fp->ff_blocks * (off_t)vcb->blockSize;
4241
4242 /*
4243 * if we get an error and no changes were made then exit
4244 * otherwise we must do the hfs_update to reflect the changes
4245 */
4246 if (retval && (startingPEOF == filebytes)) goto Err_Exit;
4247 #if QUOTA
4248 /* These are bytesreleased */
4249 (void) hfs_chkdq(cp, (int64_t)-((startingPEOF - filebytes)), NOCRED,0);
4250 #endif /* QUOTA */
4251
4252 if (fp->ff_size > filebytes) {
4253 fp->ff_size = filebytes;
4254
4255 hfs_ubc_setsize(vp, fp->ff_size, true);
4256 }
4257 }
4258
4259 Std_Exit:
4260 cp->c_flag |= C_MODIFIED;
4261 cp->c_touch_chgtime = TRUE;
4262 cp->c_touch_modtime = TRUE;
4263 retval2 = hfs_update(vp, 0);
4264
4265 if (retval == 0)
4266 retval = retval2;
4267 Err_Exit:
4268 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
4269 hfs_unlock(cp);
4270 return (retval);
4271 }
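/*
 * Illustrative userspace sketch (not part of this file): preallocation
 * requests typically reach hfs_vnop_allocate() via fcntl(F_PREALLOCATE),
 * whose fstore flags correspond to the ALLOCATECONTIG/ALLOCATEALL/
 * ALLOCATEFROMPEOF cases handled above. Treat this as an assumption-laden
 * example rather than a specification of the call path.
 */
#if 0
	fstore_t fst = {
		.fst_flags   = F_ALLOCATECONTIG | F_ALLOCATEALL,
		.fst_posmode = F_PEOFPOSMODE,		/* allocate from the physical EOF */
		.fst_offset  = 0,
		.fst_length  = 16 * 1024 * 1024,	/* grow by 16 MiB */
	};
	if (fcntl(fd, F_PREALLOCATE, &fst) == -1) {
		/* contiguous allocation failed; retry without the contiguity hint */
		fst.fst_flags = F_ALLOCATEALL;
		(void) fcntl(fd, F_PREALLOCATE, &fst);
	}
	/* fst.fst_bytesalloc now reports how many bytes were actually reserved */
#endif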
4272
4273
4274 /*
4275 * Pagein for HFS filesystem
4276 */
4277 int
4278 hfs_vnop_pagein(struct vnop_pagein_args *ap)
4279 /*
4280 struct vnop_pagein_args {
4281 vnode_t a_vp,
4282 upl_t a_pl,
4283 vm_offset_t a_pl_offset,
4284 off_t a_f_offset,
4285 size_t a_size,
4286 int a_flags
4287 vfs_context_t a_context;
4288 };
4289 */
4290 {
4291 vnode_t vp;
4292 struct cnode *cp;
4293 struct filefork *fp;
4294 int error = 0;
4295 upl_t upl;
4296 upl_page_info_t *pl;
4297 off_t f_offset;
4298 off_t page_needed_f_offset;
4299 int offset;
4300 int isize;
4301 int upl_size;
4302 int pg_index;
4303 boolean_t truncate_lock_held = FALSE;
4304 boolean_t file_converted = FALSE;
4305 kern_return_t kret;
4306
4307 vp = ap->a_vp;
4308 cp = VTOC(vp);
4309 fp = VTOF(vp);
4310
4311 #if CONFIG_PROTECT
4312 if ((error = cp_handle_vnop(vp, CP_READ_ACCESS | CP_WRITE_ACCESS, 0)) != 0) {
4313 /*
4314 * If we errored here, then this means that one of two things occurred:
4315 * 1. there was a problem with the decryption of the key.
4316 * 2. the device is locked and we are not allowed to access this particular file.
4317 *
4318 * Either way, this means that we need to shut down this UPL now. As long as
4319 * the pl pointer is NULL (meaning that we're supposed to create the UPL ourselves),
4320 * we create a UPL and immediately abort it.
4321 */
4322 if (ap->a_pl == NULL) {
4323 /* create the upl */
4324 ubc_create_upl (vp, ap->a_f_offset, ap->a_size, &upl, &pl,
4325 UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4326 /* mark the range as needed so it doesn't immediately get discarded upon abort */
4327 ubc_upl_range_needed (upl, ap->a_pl_offset / PAGE_SIZE, 1);
4328
4329 /* Abort the range */
4330 ubc_upl_abort_range (upl, 0, ap->a_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
4331 }
4332
4333
4334 return error;
4335 }
4336 #endif /* CONFIG_PROTECT */
4337
4338 if (ap->a_pl != NULL) {
4339 /*
4340 * this can only happen for swap files now that
4341 * we're asking for V2 paging behavior...
4342 * so we don't need to worry about decompression, or
4343 * keeping track of blocks read or taking the truncate lock
4344 */
4345 error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
4346 ap->a_size, (off_t)fp->ff_size, ap->a_flags);
4347 goto pagein_done;
4348 }
4349
4350 page_needed_f_offset = ap->a_f_offset + ap->a_pl_offset;
4351
4352 retry_pagein:
4353 /*
4354 * take truncate lock (shared/recursive) to guard against
4355 * zero-fill thru fsync interfering, but only for v2
4356 *
4357 * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the
4358 * lock shared and we are allowed to recurse 1 level if this thread already
4359 * owns the lock exclusively... this can legally occur
4360 * if we are doing a shrinking ftruncate against a file
4361 * that is mapped private, and the pages being truncated
4362 * do not currently exist in the cache... in that case
4363 * we will have to page-in the missing pages in order
4364 * to provide them to the private mapping... we must
4365 * also call hfs_unlock_truncate with a positive been_recursed
4366 * arg to indicate that if we have recursed, there is no need to drop
4367 * the lock. Allowing this simple recursion is necessary
4368 * in order to avoid a certain deadlock... since the ftruncate
4369 * already holds the truncate lock exclusively, if we try
4370 * to acquire it shared to protect the pagein path, we will
4371 * hang this thread
4372 *
4373 * NOTE: The if () block below is a workaround in order to prevent a
4374 * VM deadlock. See rdar://7853471.
4375 *
4376 * If we are in a forced unmount, then launchd will still have the
4377 * dyld_shared_cache file mapped as it is trying to reboot. If we
4378 * take the truncate lock here to service a page fault, then our
4379 * thread could deadlock with the forced-unmount. The forced unmount
4380 * thread will try to reclaim the dyld_shared_cache vnode, but since it's
4381 * marked C_DELETED, it will call ubc_setsize(0). As a result, the unmount
4382 * thread will think it needs to copy all of the data out of the file
4383 * and into a VM copy object. If we hold the cnode lock here, then that
4384 * VM operation will not be able to proceed, because we'll set a busy page
4385 * before attempting to grab the lock. Note that this isn't as simple as "don't
4386 * call ubc_setsize" because doing that would just shift the problem to the
4387 * ubc_msync done before the vnode is reclaimed.
4388 *
4389 * So, if a forced unmount on this volume is in flight AND the cnode is
4390 * marked C_DELETED, then just go ahead and do the page in without taking
4391 * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file
4392 * that is not going to be available on the next mount, this seems like an
4393 * OK solution from a correctness point of view, even though it is hacky.
4394 */
4395 if (vfs_isforce(vp->v_mount)) {
4396 if (cp->c_flag & C_DELETED) {
4397 /* If we don't get it, then just go ahead and operate without the lock */
4398 truncate_lock_held = hfs_try_trunclock(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4399 }
4400 }
4401 else {
4402 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4403 truncate_lock_held = TRUE;
4404 }
4405
4406 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT);
4407
4408 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4409 error = EINVAL;
4410 goto pagein_done;
4411 }
4412 ubc_upl_range_needed(upl, ap->a_pl_offset / PAGE_SIZE, 1);
4413
4414 upl_size = isize = ap->a_size;
4415
4416 /*
4417 * Scan from the back to find the last page in the UPL, so that we
4418 * aren't looking at a UPL that may have already been freed by the
4419 * preceding aborts/completions.
4420 */
4421 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4422 if (upl_page_present(pl, --pg_index))
4423 break;
4424 if (pg_index == 0) {
4425 /*
4426 * no absent pages were found in the range specified
4427 * just abort the UPL to get rid of it and then we're done
4428 */
4429 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4430 goto pagein_done;
4431 }
4432 }
4433 /*
4434 * initialize the offset variables before we touch the UPL.
4435 * f_offset is the position into the file, in bytes
4436 * offset is the position into the UPL, in bytes
4437 * pg_index is the pg# of the UPL we're operating on
4438 * isize is the offset into the UPL of the last page that is present.
4439 */
4440 isize = ((pg_index + 1) * PAGE_SIZE);
4441 pg_index = 0;
4442 offset = 0;
4443 f_offset = ap->a_f_offset;
4444
4445 while (isize) {
4446 int xsize;
4447 int num_of_pages;
4448
4449 if ( !upl_page_present(pl, pg_index)) {
4450 /*
4451 * we asked for RET_ONLY_ABSENT, so it's possible
4452 * to get back empty slots in the UPL.
4453 * just skip over them
4454 */
4455 f_offset += PAGE_SIZE;
4456 offset += PAGE_SIZE;
4457 isize -= PAGE_SIZE;
4458 pg_index++;
4459
4460 continue;
4461 }
4462 /*
4463 * We know that we have at least one absent page.
4464 * Now checking to see how many in a row we have
4465 */
4466 num_of_pages = 1;
4467 xsize = isize - PAGE_SIZE;
4468
4469 while (xsize) {
4470 if ( !upl_page_present(pl, pg_index + num_of_pages))
4471 break;
4472 num_of_pages++;
4473 xsize -= PAGE_SIZE;
4474 }
4475 xsize = num_of_pages * PAGE_SIZE;
4476
4477 #if HFS_COMPRESSION
4478 if (VNODE_IS_RSRC(vp)) {
4479 /* allow pageins of the resource fork */
4480 } else {
4481 int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */
4482
4483 if (compressed) {
4484
4485 if (truncate_lock_held) {
4486 /*
4487 * can't hold the truncate lock when calling into the decmpfs layer
4488 * since it calls back into this layer... even though we're only
4489 * holding the lock in shared mode, and the re-entrant path only
4490 * takes the lock shared, we can deadlock if some other thread
4491 * tries to grab the lock exclusively in between.
4492 */
4493 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4494 truncate_lock_held = FALSE;
4495 }
4496 ap->a_pl = upl;
4497 ap->a_pl_offset = offset;
4498 ap->a_f_offset = f_offset;
4499 ap->a_size = xsize;
4500
4501 error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp));
4502 /*
4503 * note that decmpfs_pagein_compressed can change the state of
4504 * 'compressed'... it will set it to 0 if the file is no longer
4505 * compressed once the compression lock is successfully taken
4506 * i.e. we would block on that lock while the file is being inflated
4507 */
4508 if (error == 0 && vnode_isfastdevicecandidate(vp)) {
4509 (void) hfs_addhotfile(vp);
4510 }
4511 if (compressed) {
4512 if (error == 0) {
4513 /* successful page-in, update the access time */
4514 VTOC(vp)->c_touch_acctime = TRUE;
4515
4516 //
4517 // compressed files are not traditional hot file candidates
4518 // but they may be for CF (which ignores the ff_bytesread
4519 // field)
4520 //
4521 if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) {
4522 fp->ff_bytesread = 0;
4523 }
4524 } else if (error == EAGAIN) {
4525 /*
4526 * EAGAIN indicates someone else already holds the compression lock...
4527 * to avoid deadlocking, we'll abort this range of pages with an
4528 * indication that the pagein needs to be redriven
4529 */
4530 ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
4531 } else if (error == ENOSPC) {
4532
4533 if (upl_size == PAGE_SIZE)
4534 panic("decmpfs_pagein_compressed: couldn't ubc_upl_map a single page\n");
4535
4536 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4537
4538 ap->a_size = PAGE_SIZE;
4539 ap->a_pl = NULL;
4540 ap->a_pl_offset = 0;
4541 ap->a_f_offset = page_needed_f_offset;
4542
4543 goto retry_pagein;
4544 }
4545 goto pagein_next_range;
4546 }
4547 else {
4548 /*
4549 * Set file_converted only if the file became decompressed while we were
4550 * paging in. If it were still compressed, we would re-start the loop using the goto
4551 * in the above block. This avoids overloading truncate_lock_held as our retry_pagein
4552 * condition below, since we could have avoided taking the truncate lock to prevent
4553 * a deadlock in the force unmount case.
4554 */
4555 file_converted = TRUE;
4556 }
4557 }
4558 if (file_converted == TRUE) {
4559 /*
4560 * the file was converted back to a regular file after we first saw it as compressed
4561 * we need to abort the upl, retake the truncate lock, recreate the UPL and start over
4562 * reset a_size so that we consider what remains of the original request
4563 * and null out a_upl and a_pl_offset.
4564 *
4565 * We should only be able to get into this block if the decmpfs_pagein_compressed
4566 * successfully decompressed the range in question for this file.
4567 */
4568 ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY);
4569
4570 ap->a_size = isize;
4571 ap->a_pl = NULL;
4572 ap->a_pl_offset = 0;
4573
4574 /* Reset file_converted back to false so that we don't infinite-loop. */
4575 file_converted = FALSE;
4576 goto retry_pagein;
4577 }
4578 }
4579 #endif
4580 error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags);
4581
4582 /*
4583 * Keep track of blocks read.
4584 */
4585 if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) {
4586 int bytesread;
4587 int took_cnode_lock = 0;
4588
4589 if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE)
4590 bytesread = fp->ff_size;
4591 else
4592 bytesread = xsize;
4593
4594 /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */
4595 if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) {
4596 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4597 took_cnode_lock = 1;
4598 }
4599 /*
4600 * If this file hasn't been seen since the start of
4601 * the current sampling period then start over.
4602 */
4603 if (cp->c_atime < VTOHFS(vp)->hfc_timebase) {
4604 struct timeval tv;
4605
4606 fp->ff_bytesread = bytesread;
4607 microtime(&tv);
4608 cp->c_atime = tv.tv_sec;
4609 } else {
4610 fp->ff_bytesread += bytesread;
4611 }
4612 cp->c_touch_acctime = TRUE;
4613
4614 if (vnode_isfastdevicecandidate(vp)) {
4615 (void) hfs_addhotfile(vp);
4616 }
4617 if (took_cnode_lock)
4618 hfs_unlock(cp);
4619 }
4620 pagein_next_range:
4621 f_offset += xsize;
4622 offset += xsize;
4623 isize -= xsize;
4624 pg_index += num_of_pages;
4625
4626 error = 0;
4627 }
4628
4629 pagein_done:
4630 if (truncate_lock_held == TRUE) {
4631 /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */
4632 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4633 }
4634
4635 return (error);
4636 }
4637
4638 /*
4639 * Pageout for HFS filesystem.
4640 */
4641 int
4642 hfs_vnop_pageout(struct vnop_pageout_args *ap)
4643 /*
4644 struct vnop_pageout_args {
4645 vnode_t a_vp,
4646 upl_t a_pl,
4647 vm_offset_t a_pl_offset,
4648 off_t a_f_offset,
4649 size_t a_size,
4650 int a_flags
4651 vfs_context_t a_context;
4652 };
4653 */
4654 {
4655 vnode_t vp = ap->a_vp;
4656 struct cnode *cp;
4657 struct filefork *fp;
4658 int retval = 0;
4659 off_t filesize;
4660 upl_t upl;
4661 upl_page_info_t* pl;
4662 vm_offset_t a_pl_offset;
4663 int a_flags;
4664 int is_pageoutv2 = 0;
4665 kern_return_t kret;
4666
4667 cp = VTOC(vp);
4668 fp = VTOF(vp);
4669
4670 a_flags = ap->a_flags;
4671 a_pl_offset = ap->a_pl_offset;
4672
4673 /*
4674 * we can tell if we're getting the new or old behavior from the UPL
4675 */
4676 if ((upl = ap->a_pl) == NULL) {
4677 int request_flags;
4678
4679 is_pageoutv2 = 1;
4680 /*
4681 * we're in control of any UPL we commit
4682 * make sure someone hasn't accidentally passed in UPL_NOCOMMIT
4683 */
4684 a_flags &= ~UPL_NOCOMMIT;
4685 a_pl_offset = 0;
4686
4687 /*
4688 * For V2 semantics, we want to take the cnode truncate lock
4689 * shared to guard against the file size changing via zero-filling.
4690 *
4691 * However, we have to be careful because we may be invoked
4692 * via the ubc_msync path to write out dirty mmap'd pages
4693 * in response to a lock event on a content-protected
4694 * filesystem (e.g. to write out class A files).
4695 * As a result, we want to take the truncate lock 'SHARED' with
4696 * the mini-recursion locktype so that we don't deadlock/panic
4697 * because we may be already holding the truncate lock exclusive to force any other
4698 * IOs to have blocked behind us.
4699 */
4700 hfs_lock_truncate(cp, HFS_SHARED_LOCK, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4701
4702 if (a_flags & UPL_MSYNC) {
4703 request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
4704 }
4705 else {
4706 request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
4707 }
4708
4709 kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
4710
4711 if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
4712 retval = EINVAL;
4713 goto pageout_done;
4714 }
4715 }
4716 /*
4717 * from this point forward upl points at the UPL we're working with
4718 * it was either passed in or we successfully created it
4719 */
4720
4721 /*
4722 * Figure out where the file ends, for pageout purposes. If
4723 * ff_new_size > ff_size, then we're in the middle of extending the
4724 * file via a write, so it is safe (and necessary) that we be able
4725 * to pageout up to that point.
4726 */
4727 filesize = fp->ff_size;
4728 if (fp->ff_new_size > filesize)
4729 filesize = fp->ff_new_size;
4730
4731 /*
4732 * Now that HFS is opting into VFC_VFSVNOP_PAGEOUTV2, we may need to operate on our own
4733 * UPL instead of relying on the UPL passed into us. We go ahead and do that here,
4734 * scanning for dirty ranges. We'll issue our own N cluster_pageout calls, for
4735 * N dirty ranges in the UPL. Note that this is almost a direct copy of the
4736 * logic in vnode_pageout except that we need to do it after grabbing the truncate
4737 * lock in HFS so that we don't lock invert ourselves.
4738 *
4739 * Note that we can still get into this function on behalf of the default pager with
4740 * non-V2 behavior (swapfiles). However in that case, we did not grab locks above
4741 * since fsync and other writing threads will grab the locks, then mark the
4742 * relevant pages as busy. But the pageout codepath marks the pages as busy,
4743 * and THEN would attempt to grab the truncate lock, which would result in deadlock. So
4744 * we do not try to grab anything for the pre-V2 case, which should only be accessed
4745 * by the paging/VM system.
4746 */
4747
4748 if (is_pageoutv2) {
4749 off_t f_offset;
4750 int offset;
4751 int isize;
4752 int pg_index;
4753 int error;
4754 int error_ret = 0;
4755
4756 isize = ap->a_size;
4757 f_offset = ap->a_f_offset;
4758
4759 /*
4760 * Scan from the back to find the last page in the UPL, so that we
4761 * aren't looking at a UPL that may have already been freed by the
4762 * preceding aborts/completions.
4763 */
4764 for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) {
4765 if (upl_page_present(pl, --pg_index))
4766 break;
4767 if (pg_index == 0) {
4768 ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY);
4769 goto pageout_done;
4770 }
4771 }
4772
4773 /*
4774 * initialize the offset variables before we touch the UPL.
4775 * a_f_offset is the position into the file, in bytes
4776 * offset is the position into the UPL, in bytes
4777 * pg_index is the pg# of the UPL we're operating on.
4778 * isize is the offset into the UPL of the last non-clean page.
4779 */
4780 isize = ((pg_index + 1) * PAGE_SIZE);
4781
4782 offset = 0;
4783 pg_index = 0;
4784
4785 while (isize) {
4786 int xsize;
4787 int num_of_pages;
4788
4789 if ( !upl_page_present(pl, pg_index)) {
4790 /*
4791 * we asked for RET_ONLY_DIRTY, so it's possible
4792 * to get back empty slots in the UPL.
4793 * just skip over them
4794 */
4795 f_offset += PAGE_SIZE;
4796 offset += PAGE_SIZE;
4797 isize -= PAGE_SIZE;
4798 pg_index++;
4799
4800 continue;
4801 }
4802 if ( !upl_dirty_page(pl, pg_index)) {
4803 panic ("hfs_vnop_pageout: unforeseen clean page @ index %d for UPL %p\n", pg_index, upl);
4804 }
4805
4806 /*
4807 * We know that we have at least one dirty page.
4808 * Now checking to see how many in a row we have
4809 */
4810 num_of_pages = 1;
4811 xsize = isize - PAGE_SIZE;
4812
4813 while (xsize) {
4814 if ( !upl_dirty_page(pl, pg_index + num_of_pages))
4815 break;
4816 num_of_pages++;
4817 xsize -= PAGE_SIZE;
4818 }
4819 xsize = num_of_pages * PAGE_SIZE;
4820
4821 if ((error = cluster_pageout(vp, upl, offset, f_offset,
4822 xsize, filesize, a_flags))) {
4823 if (error_ret == 0)
4824 error_ret = error;
4825 }
4826 f_offset += xsize;
4827 offset += xsize;
4828 isize -= xsize;
4829 pg_index += num_of_pages;
4830 }
4831 /* capture errnos bubbled out of cluster_pageout if they occurred */
4832 if (error_ret != 0) {
4833 retval = error_ret;
4834 }
4835 } /* end block for v2 pageout behavior */
4836 else {
4837 /*
4838 * just call cluster_pageout for old pre-v2 behavior
4839 */
4840 retval = cluster_pageout(vp, upl, a_pl_offset, ap->a_f_offset,
4841 ap->a_size, filesize, a_flags);
4842 }
4843
4844 /*
4845 * If data was written, update the modification time of the file
4846 * but only if it's mapped writable; we will have touched the
4847 * modification time for direct writes.
4848 */
4849 if (retval == 0 && (ubc_is_mapped_writable(vp)
4850 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING))) {
4851 hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
4852
4853 // Check again with lock
4854 bool mapped_writable = ubc_is_mapped_writable(vp);
4855 if (mapped_writable
4856 || ISSET(cp->c_flag, C_MIGHT_BE_DIRTY_FROM_MAPPING)) {
4857 cp->c_touch_modtime = TRUE;
4858 cp->c_touch_chgtime = TRUE;
4859
4860 /*
4861 * We only need to increment the generation counter if
4862 * it's currently mapped writable because we incremented
4863 * the counter in hfs_vnop_mnomap.
4864 */
4865 if (mapped_writable)
4866 hfs_incr_gencount(VTOC(vp));
4867
4868 /*
4869 * If setuid or setgid bits are set and this process is
4870 * not the superuser then clear the setuid and setgid bits
4871 * as a precaution against tampering.
4872 */
4873 if ((cp->c_mode & (S_ISUID | S_ISGID)) &&
4874 (vfs_context_suser(ap->a_context) != 0)) {
4875 cp->c_mode &= ~(S_ISUID | S_ISGID);
4876 }
4877 }
4878
4879 hfs_unlock(cp);
4880 }
4881
4882 pageout_done:
4883 if (is_pageoutv2) {
4884 /*
4885 * Release the truncate lock. Note that because
4886 * we may have taken the lock recursively by
4887 * being invoked via ubc_msync due to lockdown,
4888 * we should release it recursively, too.
4889 */
4890 hfs_unlock_truncate(cp, HFS_LOCK_SKIP_IF_EXCLUSIVE);
4891 }
4892 return (retval);
4893 }
4894
4895 /*
4896 * Intercept B-Tree node writes to unswap them if necessary.
4897 */
4898 int
4899 hfs_vnop_bwrite(struct vnop_bwrite_args *ap)
4900 {
4901 int retval = 0;
4902 register struct buf *bp = ap->a_bp;
4903 register struct vnode *vp = buf_vnode(bp);
4904 BlockDescriptor block;
4905
4906 /* Trap B-Tree writes */
4907 if ((VTOC(vp)->c_fileid == kHFSExtentsFileID) ||
4908 (VTOC(vp)->c_fileid == kHFSCatalogFileID) ||
4909 (VTOC(vp)->c_fileid == kHFSAttributesFileID) ||
4910 (vp == VTOHFS(vp)->hfc_filevp)) {
4911
4912 /*
4913 * Swap and validate the node if it is in native byte order.
4914 * This is always true on big endian, so we always validate
4915 * before writing here. On little endian, the node typically has
4916 * been swapped and validated when it was written to the journal,
4917 * so we won't do anything here.
4918 */
4919 if (((u_int16_t *)((char *)buf_dataptr(bp) + buf_count(bp) - 2))[0] == 0x000e) {
4920 /* Prepare the block pointer */
4921 block.blockHeader = bp;
4922 block.buffer = (char *)buf_dataptr(bp);
4923 block.blockNum = buf_lblkno(bp);
4924 /* not found in cache ==> came from disk */
4925 block.blockReadFromDisk = (buf_fromcache(bp) == 0);
4926 block.blockSize = buf_count(bp);
4927
4928 /* Endian un-swap B-Tree node */
4929 retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, false);
4930 if (retval)
4931 panic("hfs_vnop_bwrite: about to write corrupt node!\n");
4932 }
4933 }
4934
4935 /* This buffer shouldn't be locked anymore but if it is clear it */
4936 if ((buf_flags(bp) & B_LOCKED)) {
4937 // XXXdbg
4938 if (VTOHFS(vp)->jnl) {
4939 panic("hfs: CLEARING the lock bit on bp %p\n", bp);
4940 }
4941 buf_clearflags(bp, B_LOCKED);
4942 }
4943 retval = vn_bwrite (ap);
4944
4945 return (retval);
4946 }
4947
4948
4949 int
4950 hfs_pin_block_range(struct hfsmount *hfsmp, int pin_state, uint32_t start_block, uint32_t nblocks, vfs_context_t ctx)
4951 {
4952 _dk_cs_pin_t pin;
4953 unsigned ioc;
4954 int err;
4955
4956 memset(&pin, 0, sizeof(pin));
4957 pin.cp_extent.offset = ((uint64_t)start_block) * HFSTOVCB(hfsmp)->blockSize;
4958 pin.cp_extent.length = ((uint64_t)nblocks) * HFSTOVCB(hfsmp)->blockSize;
4959 switch (pin_state) {
4960 case HFS_PIN_IT:
4961 ioc = _DKIOCCSPINEXTENT;
4962 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA;
4963 break;
4964 case HFS_PIN_IT | HFS_TEMP_PIN:
4965 ioc = _DKIOCCSPINEXTENT;
4966 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSTEMPORARYPIN;
4967 break;
4968 case HFS_PIN_IT | HFS_DATALESS_PIN:
4969 ioc = _DKIOCCSPINEXTENT;
4970 pin.cp_flags = _DKIOCCSPINTOFASTMEDIA | _DKIOCCSPINFORSWAPFILE;
4971 break;
4972 case HFS_UNPIN_IT:
4973 ioc = _DKIOCCSUNPINEXTENT;
4974 pin.cp_flags = 0;
4975 break;
4976 case HFS_UNPIN_IT | HFS_EVICT_PIN:
4977 ioc = _DKIOCCSPINEXTENT;
4978 pin.cp_flags = _DKIOCCSPINTOSLOWMEDIA;
4979 break;
4980 default:
4981 return EINVAL;
4982 }
4983 err = VNOP_IOCTL(hfsmp->hfs_devvp, ioc, (caddr_t)&pin, 0, ctx);
4984 return err;
4985 }
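/*
 * Example (illustrative): with a 4 KiB allocation block size, pinning
 * start_block 100 for nblocks 25 issues a _DKIOCCSPINEXTENT ioctl for the
 * byte extent { offset = 409600, length = 102400 } on the underlying device.
 */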
4986
4987 //
4988 // The cnode lock should already be held on entry to this function
4989 //
4990 int
4991 hfs_pin_vnode(struct hfsmount *hfsmp, struct vnode *vp, int pin_state, uint32_t *num_blocks_pinned, vfs_context_t ctx)
4992 {
4993 struct filefork *fp = VTOF(vp);
4994 int i, err=0, need_put=0;
4995 struct vnode *rsrc_vp=NULL;
4996 uint32_t npinned = 0;
4997 off_t offset;
4998
4999 if (num_blocks_pinned) {
5000 *num_blocks_pinned = 0;
5001 }
5002
5003 if (vnode_vtype(vp) != VREG) {
5004 /* Not allowed to pin directories or symlinks */
5005 printf("hfs: can't pin vnode of type %d\n", vnode_vtype(vp));
5006 return (EPERM);
5007 }
5008
5009 if (fp->ff_unallocblocks) {
5010 printf("hfs: can't pin a vnode w/unalloced blocks (%d)\n", fp->ff_unallocblocks);
5011 return (EINVAL);
5012 }
5013
5014 /*
5015 * It is possible that if the caller unlocked/re-locked the cnode after checking
5016 * for C_NOEXISTS|C_DELETED that the file could have been deleted while the
5017 * cnode was unlocked. So check the condition again and return ENOENT so that
5018 * the caller knows why we failed to pin the vnode.
5019 */
5020 if (VTOC(vp)->c_flag & (C_NOEXISTS|C_DELETED)) {
5021 // makes no sense to pin something that's pending deletion
5022 return ENOENT;
5023 }
5024
5025 if (fp->ff_blocks == 0 && (VTOC(vp)->c_bsdflags & UF_COMPRESSED)) {
5026 if (!VNODE_IS_RSRC(vp) && hfs_vgetrsrc(hfsmp, vp, &rsrc_vp) == 0) {
5027 //printf("hfs: fileid %d resource fork nblocks: %d / size: %lld\n", VTOC(vp)->c_fileid,
5028 // VTOC(rsrc_vp)->c_rsrcfork->ff_blocks,VTOC(rsrc_vp)->c_rsrcfork->ff_size);
5029
5030 fp = VTOC(rsrc_vp)->c_rsrcfork;
5031 need_put = 1;
5032 }
5033 }
5034 if (fp->ff_blocks == 0) {
5035 if (need_put) {
5036 //
5037 // use a distinct error code for a compressed file that has no resource fork;
5038 // we return EALREADY to indicate that the data is already probably hot file
5039 // cached because it's in an EA and the attributes btree is on the ssd
5040 //
5041 err = EALREADY;
5042 } else {
5043 err = EINVAL;
5044 }
5045 goto out;
5046 }
5047
5048 offset = 0;
5049 for (i = 0; i < kHFSPlusExtentDensity; i++) {
5050 if (fp->ff_extents[i].startBlock == 0) {
5051 break;
5052 }
5053
5054 err = hfs_pin_block_range(hfsmp, pin_state, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, ctx);
5055 if (err) {
5056 break;
5057 } else {
5058 npinned += fp->ff_extents[i].blockCount;
5059 }
5060 }
5061
5062 if (err || npinned == 0) {
5063 goto out;
5064 }
5065
5066 if (fp->ff_extents[kHFSPlusExtentDensity-1].startBlock) {
5067 uint32_t pblocks;
5068 uint8_t forktype = 0;
5069
5070 if (fp == VTOC(vp)->c_rsrcfork) {
5071 forktype = 0xff;
5072 }
5073 /*
5074 * The file could have overflow extents, better pin them.
5075 *
5076 * We assume that since we are holding the cnode lock for this cnode,
5077 * the file's extents cannot be manipulated, but the tree could, so we
5078 * need to ensure that it doesn't change behind our back as we iterate it.
5079 */
5080 int lockflags = hfs_systemfile_lock (hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
5081 err = hfs_pin_overflow_extents(hfsmp, VTOC(vp)->c_fileid, forktype, &pblocks);
5082 hfs_systemfile_unlock (hfsmp, lockflags);
5083
5084 if (err) {
5085 goto out;
5086 }
5087 npinned += pblocks;
5088 }
5089
5090 out:
5091 if (num_blocks_pinned) {
5092 *num_blocks_pinned = npinned;
5093 }
5094
5095 if (need_put && rsrc_vp) {
5096 //
5097 // have to unlock the cnode since it's shared between the
5098 // resource fork vnode and the data fork vnode (and the
5099 // vnode_put() may need to re-acquire the cnode lock to
5100 // reclaim the resource fork vnode)
5101 //
5102 hfs_unlock(VTOC(vp));
5103 vnode_put(rsrc_vp);
5104 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5105 }
5106 return err;
5107 }
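/*
 * Hypothetical caller sketch (illustrative only): hfs_pin_vnode() expects
 * the cnode lock to be held on entry, as noted above. The variable names
 * here are made up for the example.
 */
#if 0
	uint32_t pinned = 0;
	int err;

	hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
	err = hfs_pin_vnode(hfsmp, vp, HFS_PIN_IT | HFS_TEMP_PIN, &pinned, ctx);
	hfs_unlock(VTOC(vp));
	if (err == 0)
		printf("hfs: pinned %u blocks\n", pinned);
#endif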
5108
5109
5110 /*
5111 * Relocate a file to a new location on disk
5112 * cnode must be locked on entry
5113 *
5114 * Relocation occurs by cloning the file's data from its
5115 * current set of blocks to a new set of blocks. During
5116 * the relocation all of the blocks (old and new) are
5117 * owned by the file.
5118 *
5119 * -----------------
5120 * |///////////////|
5121 * -----------------
5122 * 0 N (file offset)
5123 *
5124 * ----------------- -----------------
5125 * |///////////////| | | STEP 1 (acquire new blocks)
5126 * ----------------- -----------------
5127 * 0 N N+1 2N
5128 *
5129 * ----------------- -----------------
5130 * |///////////////| |///////////////| STEP 2 (clone data)
5131 * ----------------- -----------------
5132 * 0 N N+1 2N
5133 *
5134 * -----------------
5135 * |///////////////| STEP 3 (head truncate blocks)
5136 * -----------------
5137 * 0 N
5138 *
5139 * During steps 2 and 3 page-outs to file offsets less
5140 * than or equal to N are suspended.
5141 *
5142 * During step 3 page-ins to the file get suspended.
5143 */
5144 int
5145 hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred,
5146 struct proc *p)
5147 {
5148 struct cnode *cp;
5149 struct filefork *fp;
5150 struct hfsmount *hfsmp;
5151 u_int32_t headblks;
5152 u_int32_t datablks;
5153 u_int32_t blksize;
5154 u_int32_t growsize;
5155 u_int32_t nextallocsave;
5156 daddr64_t sector_a, sector_b;
5157 int eflags;
5158 off_t newbytes;
5159 int retval;
5160 int lockflags = 0;
5161 int took_trunc_lock = 0;
5162 int started_tr = 0;
5163 enum vtype vnodetype;
5164
5165 vnodetype = vnode_vtype(vp);
5166 if (vnodetype != VREG) {
5167 /* Not allowed to move symlinks. */
5168 return (EPERM);
5169 }
5170
5171 hfsmp = VTOHFS(vp);
5172 if (hfsmp->hfs_flags & HFS_FRAGMENTED_FREESPACE) {
5173 return (ENOSPC);
5174 }
5175
5176 cp = VTOC(vp);
5177 fp = VTOF(vp);
5178 if (fp->ff_unallocblocks)
5179 return (EINVAL);
5180
5181 #if CONFIG_PROTECT
5182 /*
5183 * <rdar://problem/9118426>
5184 * Disable HFS file relocation on content-protected filesystems
5185 */
5186 if (cp_fs_protected (hfsmp->hfs_mp)) {
5187 return EINVAL;
5188 }
5189 #endif
5190 /* If it's an SSD, also disable HFS relocation */
5191 if (hfsmp->hfs_flags & HFS_SSD) {
5192 return EINVAL;
5193 }
5194
5195
5196 blksize = hfsmp->blockSize;
5197 if (blockHint == 0)
5198 blockHint = hfsmp->nextAllocation;
5199
5200 if (fp->ff_size > 0x7fffffff) {
5201 return (EFBIG);
5202 }
5203
5204 //
5205 // We do not believe that this call to hfs_fsync() is
5206 // necessary and it causes a journal transaction
5207 // deadlock so we are removing it.
5208 //
5209 //if (vnodetype == VREG && !vnode_issystem(vp)) {
5210 // retval = hfs_fsync(vp, MNT_WAIT, 0, p);
5211 // if (retval)
5212 // return (retval);
5213 //}
5214
5215 if (!vnode_issystem(vp) && (vnodetype != VLNK)) {
5216 hfs_unlock(cp);
5217 hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_DEFAULT);
5218 /* Force lock since callers expect the lock to be held. */
5219 if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS))) {
5220 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5221 return (retval);
5222 }
5223 /* No need to continue if file was removed. */
5224 if (cp->c_flag & C_NOEXISTS) {
5225 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5226 return (ENOENT);
5227 }
5228 took_trunc_lock = 1;
5229 }
5230 headblks = fp->ff_blocks;
5231 datablks = howmany(fp->ff_size, blksize);
5232 growsize = datablks * blksize;
5233 eflags = kEFContigMask | kEFAllMask | kEFNoClumpMask;
5234 if (blockHint >= hfsmp->hfs_metazone_start &&
5235 blockHint <= hfsmp->hfs_metazone_end)
5236 eflags |= kEFMetadataMask;
5237
5238 if (hfs_start_transaction(hfsmp) != 0) {
5239 if (took_trunc_lock)
5240 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5241 return (EINVAL);
5242 }
5243 started_tr = 1;
5244 /*
5245 * Protect the extents b-tree and the allocation bitmap
5246 * during MapFileBlockC and ExtendFileC operations.
5247 */
5248 lockflags = SFL_BITMAP;
5249 if (overflow_extents(fp))
5250 lockflags |= SFL_EXTENTS;
5251 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5252
5253 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize - 1, &sector_a, NULL);
5254 if (retval) {
5255 retval = MacToVFSError(retval);
5256 goto out;
5257 }
5258
5259 /*
5260 * STEP 1 - acquire new allocation blocks.
5261 */
5262 nextallocsave = hfsmp->nextAllocation;
5263 retval = ExtendFileC(hfsmp, (FCB*)fp, growsize, blockHint, eflags, &newbytes);
5264 if (eflags & kEFMetadataMask) {
5265 hfs_lock_mount(hfsmp);
5266 HFS_UPDATE_NEXT_ALLOCATION(hfsmp, nextallocsave);
5267 MarkVCBDirty(hfsmp);
5268 hfs_unlock_mount(hfsmp);
5269 }
5270
5271 retval = MacToVFSError(retval);
5272 if (retval == 0) {
5273 cp->c_flag |= C_MODIFIED;
5274 if (newbytes < growsize) {
5275 retval = ENOSPC;
5276 goto restore;
5277 } else if (fp->ff_blocks < (headblks + datablks)) {
5278 printf("hfs_relocate: allocation failed id=%u, vol=%s\n", cp->c_cnid, hfsmp->vcbVN);
5279 retval = ENOSPC;
5280 goto restore;
5281 }
5282
5283 retval = MapFileBlockC(hfsmp, (FCB *)fp, 1, growsize, &sector_b, NULL);
5284 if (retval) {
5285 retval = MacToVFSError(retval);
5286 } else if ((sector_a + 1) == sector_b) {
5287 retval = ENOSPC;
5288 goto restore;
5289 } else if ((eflags & kEFMetadataMask) &&
5290 ((((u_int64_t)sector_b * hfsmp->hfs_logical_block_size) / blksize) >
5291 hfsmp->hfs_metazone_end)) {
5292 #if 0
5293 const char * filestr;
5294 char emptystr = '\0';
5295
5296 if (cp->c_desc.cd_nameptr != NULL) {
5297 filestr = (const char *)&cp->c_desc.cd_nameptr[0];
5298 } else if (vnode_name(vp) != NULL) {
5299 filestr = vnode_name(vp);
5300 } else {
5301 filestr = &emptystr;
5302 }
5303 #endif
5304 retval = ENOSPC;
5305 goto restore;
5306 }
5307 }
5308 /* Done with system locks and journal for now. */
5309 hfs_systemfile_unlock(hfsmp, lockflags);
5310 lockflags = 0;
5311 hfs_end_transaction(hfsmp);
5312 started_tr = 0;
5313
5314 if (retval) {
5315 /*
5316 * Check to see if failure is due to excessive fragmentation.
5317 */
5318 if ((retval == ENOSPC) &&
5319 (hfs_freeblks(hfsmp, 0) > (datablks * 2))) {
5320 hfsmp->hfs_flags |= HFS_FRAGMENTED_FREESPACE;
5321 }
5322 goto out;
5323 }
5324 /*
5325 * STEP 2 - clone file data into the new allocation blocks.
5326 */
5327
5328 if (vnodetype == VLNK)
5329 retval = EPERM;
5330 else if (vnode_issystem(vp))
5331 retval = hfs_clonesysfile(vp, headblks, datablks, blksize, cred, p);
5332 else
5333 retval = hfs_clonefile(vp, headblks, datablks, blksize);
5334
5335 /* Start transaction for step 3 or for a restore. */
5336 if (hfs_start_transaction(hfsmp) != 0) {
5337 retval = EINVAL;
5338 goto out;
5339 }
5340 started_tr = 1;
5341 if (retval)
5342 goto restore;
5343
5344 /*
5345 * STEP 3 - switch to cloned data and remove old blocks.
5346 */
5347 lockflags = SFL_BITMAP;
5348 if (overflow_extents(fp))
5349 lockflags |= SFL_EXTENTS;
5350 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5351
5352 retval = HeadTruncateFile(hfsmp, (FCB*)fp, headblks);
5353
5354 hfs_systemfile_unlock(hfsmp, lockflags);
5355 lockflags = 0;
5356 if (retval)
5357 goto restore;
5358 out:
5359 if (took_trunc_lock)
5360 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5361
5362 if (lockflags) {
5363 hfs_systemfile_unlock(hfsmp, lockflags);
5364 lockflags = 0;
5365 }
5366
5367 /* Push cnode's new extent data to disk. */
5368 if (retval == 0) {
5369 hfs_update(vp, 0);
5370 }
5371 if (hfsmp->jnl) {
5372 if (cp->c_cnid < kHFSFirstUserCatalogNodeID)
5373 (void) hfs_flushvolumeheader(hfsmp, HFS_FVH_WAIT | HFS_FVH_WRITE_ALT);
5374 else
5375 (void) hfs_flushvolumeheader(hfsmp, 0);
5376 }
5377 exit:
5378 if (started_tr)
5379 hfs_end_transaction(hfsmp);
5380
5381 return (retval);
5382
5383 restore:
5384 if (fp->ff_blocks == headblks) {
5385 if (took_trunc_lock)
5386 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5387 goto exit;
5388 }
5389 /*
5390 * Give back any newly allocated space.
5391 */
5392 if (lockflags == 0) {
5393 lockflags = SFL_BITMAP;
5394 if (overflow_extents(fp))
5395 lockflags |= SFL_EXTENTS;
5396 lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
5397 }
5398
5399 (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp),
5400 FTOC(fp)->c_fileid, false);
5401
5402 hfs_systemfile_unlock(hfsmp, lockflags);
5403 lockflags = 0;
5404
5405 if (took_trunc_lock)
5406 hfs_unlock_truncate(cp, HFS_LOCK_DEFAULT);
5407 goto exit;
5408 }
5409
5410
5411 /*
5412 * Clone a file's data within the file.
5413 *
5414 */
5415 static int
5416 hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize)
5417 {
5418 caddr_t bufp;
5419 size_t bufsize;
5420 size_t copysize;
5421 size_t iosize;
5422 size_t offset;
5423 off_t writebase;
5424 uio_t auio;
5425 int error = 0;
5426
5427 writebase = blkstart * blksize;
5428 copysize = blkcnt * blksize;
5429 iosize = bufsize = MIN(copysize, 128 * 1024);
5430 offset = 0;
5431
5432 hfs_unlock(VTOC(vp));
5433
5434 #if CONFIG_PROTECT
5435 if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
5436 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5437 return (error);
5438 }
5439 #endif /* CONFIG_PROTECT */
5440
5441 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize, VM_KERN_MEMORY_FILE)) {
5442 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5443 return (ENOMEM);
5444 }
5445
5446 auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
5447
5448 while (offset < copysize) {
5449 iosize = MIN(copysize - offset, iosize);
5450
5451 uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ);
5452 uio_addiov(auio, (uintptr_t)bufp, iosize);
5453
5454 error = cluster_read(vp, auio, copysize, IO_NOCACHE);
5455 if (error) {
5456 printf("hfs_clonefile: cluster_read failed - %d\n", error);
5457 break;
5458 }
5459 if (uio_resid(auio) != 0) {
5460 printf("hfs_clonefile: cluster_read: uio_resid = %lld\n", (int64_t)uio_resid(auio));
5461 error = EIO;
5462 break;
5463 }
5464
5465 uio_reset(auio, writebase + offset, UIO_SYSSPACE, UIO_WRITE);
5466 uio_addiov(auio, (uintptr_t)bufp, iosize);
5467
5468 error = cluster_write(vp, auio, writebase + offset,
5469 writebase + offset + iosize,
5470 uio_offset(auio), 0, IO_NOCACHE | IO_SYNC);
5471 if (error) {
5472 printf("hfs_clonefile: cluster_write failed - %d\n", error);
5473 break;
5474 }
5475 if (uio_resid(auio) != 0) {
5476 printf("hfs_clonefile: cluster_write failed - uio_resid not zero\n");
5477 error = EIO;
5478 break;
5479 }
5480 offset += iosize;
5481 }
5482 uio_free(auio);
5483
5484 if ((blksize & PAGE_MASK)) {
5485 /*
5486 * since the copy may not have started on a PAGE
5487 * boundary (or may not have ended on one), we
5488 * may have pages left in the cache since NOCACHE
5489 * will let partially written pages linger...
5490 * let's just flush the entire range to make sure
5491 * we don't have any pages left that are beyond
5492 * (or intersect) the real LEOF of this file
5493 */
5494 ubc_msync(vp, writebase, writebase + offset, NULL, UBC_INVALIDATE | UBC_PUSHDIRTY);
5495 } else {
5496 /*
5497 * No need to call ubc_msync or hfs_invalbuf
5498 * since the file was copied using IO_NOCACHE and
5499 * the copy was done starting and ending on a page
5500 * boundary in the file.
5501 */
5502 }
5503 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5504
5505 hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK, HFS_LOCK_ALLOW_NOEXISTS);
5506 return (error);
5507 }
5508
5509 /*
5510 * Clone a system (metadata) file.
5511 *
5512 */
5513 static int
5514 hfs_clonesysfile(struct vnode *vp, int blkstart, int blkcnt, int blksize,
5515 kauth_cred_t cred, struct proc *p)
5516 {
5517 caddr_t bufp;
5518 char * offset;
5519 size_t bufsize;
5520 size_t iosize;
5521 struct buf *bp = NULL;
5522 daddr64_t blkno;
5523 daddr64_t blk;
5524 daddr64_t start_blk;
5525 daddr64_t last_blk;
5526 int breadcnt;
5527 int i;
5528 int error = 0;
5529
5530
5531 iosize = GetLogicalBlockSize(vp);
5532 bufsize = MIN(blkcnt * blksize, 1024 * 1024) & ~(iosize - 1);
5533 breadcnt = bufsize / iosize;
5534
5535 if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize, VM_KERN_MEMORY_FILE)) {
5536 return (ENOMEM);
5537 }
5538 start_blk = ((daddr64_t)blkstart * blksize) / iosize;
5539 last_blk = ((daddr64_t)blkcnt * blksize) / iosize;
5540 blkno = 0;
5541
5542 while (blkno < last_blk) {
5543 /*
5544 * Read up to a megabyte
5545 */
5546 offset = bufp;
5547 for (i = 0, blk = blkno; (i < breadcnt) && (blk < last_blk); ++i, ++blk) {
5548 error = (int)buf_meta_bread(vp, blk, iosize, cred, &bp);
5549 if (error) {
5550 printf("hfs_clonesysfile: meta_bread error %d\n", error);
5551 goto out;
5552 }
5553 if (buf_count(bp) != iosize) {
5554 printf("hfs_clonesysfile: b_bcount is only %d\n", buf_count(bp));
5555 goto out;
5556 }
5557 bcopy((char *)buf_dataptr(bp), offset, iosize);
5558
5559 buf_markinvalid(bp);
5560 buf_brelse(bp);
5561 bp = NULL;
5562
5563 offset += iosize;
5564 }
5565
5566 /*
5567 * Write up to a megabyte
5568 */
5569 offset = bufp;
5570 for (i = 0; (i < breadcnt) && (blkno < last_blk); ++i, ++blkno) {
5571 bp = buf_getblk(vp, start_blk + blkno, iosize, 0, 0, BLK_META);
5572 if (bp == NULL) {
5573 printf("hfs_clonesysfile: getblk failed on blk %qd\n", start_blk + blkno);
5574 error = EIO;
5575 goto out;
5576 }
5577 bcopy(offset, (char *)buf_dataptr(bp), iosize);
5578 error = (int)buf_bwrite(bp);
5579 bp = NULL;
5580 if (error)
5581 goto out;
5582 offset += iosize;
5583 }
5584 }
5585 out:
5586 if (bp) {
5587 buf_brelse(bp);
5588 }
5589
5590 kmem_free(kernel_map, (vm_offset_t)bufp, bufsize);
5591
5592 error = hfs_fsync(vp, MNT_WAIT, 0, p);
5593
5594 return (error);
5595 }
5596
5597 errno_t hfs_flush_invalid_ranges(vnode_t vp)
5598 {
5599 cnode_t *cp = VTOC(vp);
5600
5601 assert(cp->c_lockowner == current_thread());
5602 assert(cp->c_truncatelockowner == current_thread());
5603
5604 if (!ISSET(cp->c_flag, C_ZFWANTSYNC) && !cp->c_zftimeout)
5605 return 0;
5606
5607 filefork_t *fp = VTOF(vp);
5608
5609 /*
5610 * We can't hold the cnode lock whilst we call cluster_write so we
5611 * need to copy the extents into a local buffer.
5612 */
5613 int max_exts = 16;
5614 struct ext {
5615 off_t start, end;
5616 } exts_buf[max_exts]; // 256 bytes
5617 struct ext *exts = exts_buf;
5618 int ext_count = 0;
5619 errno_t ret;
5620
5621 struct rl_entry *r = TAILQ_FIRST(&fp->ff_invalidranges);
5622
5623 while (r) {
5624 /* If we have more than can fit in our stack buffer, switch
5625 to a heap buffer. */
5626 if (exts == exts_buf && ext_count == max_exts) {
5627 max_exts = 256;
5628 MALLOC(exts, struct ext *, sizeof(struct ext) * max_exts,
5629 M_TEMP, M_WAITOK);
5630 memcpy(exts, exts_buf, ext_count * sizeof(struct ext));
5631 }
5632
5633 struct rl_entry *next = TAILQ_NEXT(r, rl_link);
5634
5635 exts[ext_count++] = (struct ext){ r->rl_start, r->rl_end };
5636
5637 if (!next || (ext_count == max_exts && exts != exts_buf)) {
5638 hfs_unlock(cp);
5639 for (int i = 0; i < ext_count; ++i) {
5640 ret = cluster_write(vp, NULL, fp->ff_size, exts[i].end + 1,
5641 exts[i].start, 0,
5642 IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE);
5643 if (ret) {
5644 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5645 goto exit;
5646 }
5647 }
5648
5649 if (!next) {
5650 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5651 break;
5652 }
5653
5654 /* Push any existing clusters which should clean up our invalid
5655 ranges as they go through hfs_vnop_blockmap. */
5656 cluster_push(vp, 0);
5657
5658 hfs_lock_always(cp, HFS_EXCLUSIVE_LOCK);
5659
5660 /*
5661 * Get back to where we were (given we dropped the lock).
5662 * This shouldn't be many because we pushed above.
5663 */
5664 TAILQ_FOREACH(r, &fp->ff_invalidranges, rl_link) {
5665 if (r->rl_end > exts[ext_count - 1].end)
5666 break;
5667 }
5668
5669 ext_count = 0;
5670 } else
5671 r = next;
5672 }
5673
5674 ret = 0;
5675
5676 exit:
5677
5678 if (exts != exts_buf)
5679 FREE(exts, M_TEMP);
5680
5681 return ret;
5682 }